README

Bryan Pyle
CPSC 4770
Assignment 5

Assumptions: Job events with a timestamp of 0 or MAXINT (2^63 - 1) fall outside the bounds of the trace, so they are ignored completely when calculating the top 20 times (end timestamp minus start timestamp). However, the job IDs of those events are still included in the count of unique job_ids.

Contribution Statement: I worked on this assignment alone, using only course materials. I also posted and answered several questions on the Canvas Discussion boards. 

In [None]:
%%writefile asg4-scalability.pbs
#PBS -N asg4-scalability
#PBS -l select=8:ncpus=8:mpiprocs=8:mem=6gb,walltime=24:00:00
#PBS -j oe

# Scalability study: run the same trace-processing script on 4, 8, 16, 32 and
# 64 MPI ranks and report the wall-clock time of each run.

# PBS does not start the job in the directory it was submitted from; switch
# there so the relative script path below resolves. Falls back to the current
# directory when the script is run outside of PBS.
cd "${PBS_O_WORKDIR:-.}" || exit 1

module load gcc/5.3.0 openmpi/1.10.3 anaconda3/4.2.0
chmod 755 asg4-process-gtrace.py

first_run=yes
for ranks in 4 8 16 32 64; do
    # Blank separator line between runs, but not before the first one.
    if [ "${first_run}" = yes ]; then
        first_run=no
    else
        echo ""
    fi
    echo "${ranks} Cores:"
    time mpirun -np "${ranks}" --mca mpi_cuda_support 0 python3 asg4-process-gtrace.py
done

In [None]:
%%writefile asg4-process-gtrace.py
#!/usr/bin/env python
# asg4-process-gtrace.py
from operator import itemgetter
import os
import bisect
import gzip
from mpi4py import MPI
import csv
from itertools import islice

# Largest signed 64-bit integer; events carrying this timestamp (or 0) are
# treated as lying outside the trace window.
MAXTIMESTAMP = 2 ** 63 - 1

def take(n, iterable):
    """Collect at most the first ``n`` items of ``iterable`` into a new list.

    Fewer than ``n`` items come back when the iterable runs out early.
    """
    head = islice(iterable, n)
    return [item for item in head]

def chunks(l, n):
    """Yield consecutive slices of ``l``, each holding at most ``n`` items.

    The final slice is shorter when ``len(l)`` is not a multiple of ``n``.
    """
    starts = range(0, len(l), n)
    for start in starts:
        stop = start + n
        yield l[start:stop]
        
def addIDsDict(d, filepath):
    """Load one gzip-compressed CSV file of job events into ``d``.

    Every row adds a ``(timestamp, event_type)`` tuple -- columns 0 and 3
    converted to ``int`` -- to the sorted list kept under the row's job ID
    (column 2, left as a string).

    Returns the number of rows read from the file.
    """
    rows_read = 0
    with gzip.open(filepath, "rt", encoding="ascii") as handle:
        records = csv.reader(handle, delimiter=',')
        for rows_read, record in enumerate(records, start=1):
            job_id = record[2]
            event = (int(record[0]), int(record[3]))
            insertOrderDic(d, job_id, event)
    return rows_read
    
def extendDic(main, extension):
    """Merge ``extension`` into ``main`` in place.

    Both arguments map a key to a list.  Each value from ``extension`` is
    placed into the matching list of ``main`` with ``bisect.insort``; keys
    missing from ``main`` are created first, even when they bring no values.
    """
    for key in extension:
        bucket = main.setdefault(key, [])
        for entry in extension[key]:
            bisect.insort(bucket, entry)

def countDicEntries(dic):
    """Return the combined length of every value stored in ``dic``."""
    return sum(len(entries) for entries in dic.values())

def checkTupleOrder(dic):
    """Print how many entries break ascending order within their own list.

    For every list stored in ``dic``, an entry is counted when its first
    element is smaller than the first element of the entry just before it.
    The header line and the total count are written to standard output.
    """
    print("Checking Tuples:")
    out_of_order = 0
    for entries in dic.values():
        last_seen = -1
        for entry in entries:
            stamp = entry[0]
            if stamp < last_seen:
                out_of_order += 1
            last_seen = stamp
    print(out_of_order)
    
def insertOrderDic(dic, key, val):
    """Insert ``val`` into the list at ``dic[key]`` via ``bisect.insort``.

    The list is created on first use of ``key``.
    """
    bucket = dic.setdefault(key, [])
    bisect.insort(bucket, val)

def insortLists(main, extension):
    """Insert every item of ``extension`` into ``main`` with ``bisect.insort``.

    ``main`` is modified in place; nothing is returned.
    """
    place = bisect.insort
    for entry in extension:
        place(main, entry)
    
def getTimes(dic):
    """Pair each job's start event with its next terminating event.

    ``dic`` maps a job ID to a sorted list of ``(timestamp, event_type)``
    tuples.  Events whose timestamp is not strictly between 0 and
    ``MAXTIMESTAMP`` are skipped.  Event type 0 opens a run (later type-0
    events are ignored until that run is closed), event types 2 to 5 close
    it, and every other event type is ignored.

    Returns a ``(top_times, bad_dic)`` tuple:

    * ``top_times`` -- the slice ``getTop20List`` selects from the ascending
      list of ``(duration, job_id, end_type)`` tuples, i.e. the largest ones.
    * ``bad_dic`` -- job ID -> sorted list of events that could not be paired
      here: terminating events seen while no run was open, plus the start
      event of a run that was still open after the job's last event.
    """
    time_list = []
    bad_dic = {}

    for key, values in dic.items():
        # Start event of the run currently open for this job, if any.
        start_tuple = None

        for v in values:
            timestamp = v[0]
            if not 0 < timestamp < MAXTIMESTAMP:
                continue  # outside the trace window

            event_type = v[1]
            if event_type == 0:
                # Only the first start counts; a repeated start before any
                # terminating event does not move the start time.
                if start_tuple is None:
                    start_tuple = v
            elif 2 <= event_type <= 5:
                if start_tuple is not None:
                    time_list.append((timestamp - start_tuple[0], key, event_type))
                    start_tuple = None
                else:
                    # Terminating event with no start seen in this input.
                    insertOrderDic(bad_dic, key, v)

        if start_tuple is not None:
            # Run was started here but never closed in this input.
            insertOrderDic(bad_dic, key, start_tuple)

    # One sort at the end yields the same ascending list as inserting each
    # tuple in order, without shifting the list on every insertion.
    time_list.sort()
    return getTop20List(time_list), bad_dic

def getTop20List(l):
    """Return the last 20 items of ``l`` (all of them if there are fewer)."""
    first_kept = max(len(l) - 20, 0)
    return l[first_kept:]


# ---------------------------------------------------------------------------
# MPI driver.
# Rank 0 splits the trace files across all ranks.  Every rank pairs start and
# stop events inside its own files.  Rank 0 then pairs the leftover events
# across ranks and prints the number of unique job IDs and the top 20 times.
# ---------------------------------------------------------------------------
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()

# Labels printed for the event types that can close a job's run.
EVENT_LABELS = {2: 'EVICT', 3: 'FAIL', 4: 'FINISH', 5: 'KILL'}

trace_dir = "/scratch3/bpyle/gtrace/job_events/"

if rank == 0:
    # os.listdir returns names in arbitrary order; sorting makes the
    # file-to-rank assignment reproducible from run to run.
    all_files = sorted(os.listdir(trace_dir))
    # Ceiling division, but never 0: chunks() cannot step by 0, which is what
    # an empty directory would otherwise ask for.
    num_files = max(1, (len(all_files) + size - 1) // size)
    chunked_files = list(chunks(all_files, num_files))
    # scatter needs exactly one entry per rank; ranks without files get [].
    while len(chunked_files) < size:
        chunked_files.append([])
else:
    chunked_files = None
local_files = comm.scatter(chunked_files, root=0)

# Read this rank's share of the trace and pair events locally.
local_dic = {}
for file_name in local_files:
    addIDsDict(local_dic, os.path.join(trace_dir, file_name))

local_times, local_bad_dic = getTimes(local_dic)

# The root only needs the job IDs to count unique jobs, so send the key set
# rather than every event of every job.
local_ids_list = comm.gather(set(local_dic), root=0)
local_times_list = comm.gather(local_times, root=0)
local_bad_dic_list = comm.gather(local_bad_dic, root=0)

if rank == 0:
    # A job seen by several ranks must be counted once.
    unique_ids = set().union(*local_ids_list)
    print("Unique Job IDs:", len(unique_ids))

    # Events a rank could not pair locally may pair with another rank's
    # leftovers, so merge them all and pair once more.
    big_bad_dic = {}
    for bad_dic in local_bad_dic_list:
        extendDic(big_bad_dic, bad_dic)
    bad_times_list, _ = getTimes(big_bad_dic)

    # Merge every rank's local top times with the cross-rank ones.
    big_times_list = []
    for times in local_times_list:
        insortLists(big_times_list, times)
    insortLists(big_times_list, bad_times_list)

    print("Top 20:")
    # The merged list is ascending; print the longest time first.
    for duration, job_id, end_type in reversed(getTop20List(big_times_list)):
        event_type = EVENT_LABELS.get(end_type, 'OTHER')
        print("%s\t%d\t%s" % (job_id, duration, event_type))



In [None]:
# Interactive 64-rank run. The script needs Python 3 (it opens gzip files in
# text mode with an encoding), so load the same modules as the PBS job and
# invoke python3 explicitly instead of relying on the script's shebang.
!module load gcc/5.3.0 openmpi/1.10.3 anaconda3/4.2.0; \
chmod 755 asg4-process-gtrace.py; \
time mpirun -np 64 --mca mpi_cuda_support 0 python3 asg4-process-gtrace.py;

4 Cores:
Unique Job IDs: 672074
Top 20:
6272373604	2219071470839	FAIL
6262216777	2179583847857	KILL
6254863670	2075738599153	KILL
6254863987	2075736980619	KILL
6254863748	2075736980616	KILL
6256219395	2057085980979	KILL
6273334224	1997920913533	KILL
6273473735	1915434479654	KILL
6283813709	1884643427720	KILL
6275743443	1881727884505	KILL
6276433166	1875768873954	KILL
6276523719	1874887733830	KILL
6276685724	1873363338733	KILL
6283245463	1843085463781	FAIL
6271227810	1842057638079	KILL
6288829356	1810226800987	KILL
6288842611	1809984463546	KILL
6295425701	1796937871697	KILL
6295425806	1796894803228	KILL
6256516525	1787831764245	KILL

real	0m34.974s
user	1m44.372s
sys	0m2.099s

8 Cores:
Unique Job IDs: 672074
Top 20:
6272373604	2219071470839	FAIL
6262216777	2179583847857	KILL
6254863670	2075738599153	KILL
6254863987	2075736980619	KILL
6254863748	2075736980616	KILL
6256219395	2057085980979	KILL
6273334224	1997920913533	KILL
6273473735	1915434479654	KILL
6283813709	1884643427720	KILL
6275743443	1881727884505	KILL
6276433166	1875768873954	KILL
6276523719	1874887733830	KILL
6276685724	1873363338733	KILL
6283245463	1843085463781	FAIL
6271227810	1842057638079	KILL
6288829356	1810226800987	KILL
6288842611	1809984463546	KILL
6295425701	1796937871697	KILL
6295425806	1796894803228	KILL
6256516525	1787831764245	KILL

real	0m23.779s
user	2m5.253s
sys	0m2.468s

16 Cores:
Unique Job IDs: 672074
Top 20:
6272373604	2219071470839	FAIL
6262216777	2179583847857	KILL
6254863670	2075738599153	KILL
6254863987	2075736980619	KILL
6254863748	2075736980616	KILL
6256219395	2057085980979	KILL
6273334224	1997920913533	KILL
6273473735	1915434479654	KILL
6283813709	1884643427720	KILL
6275743443	1881727884505	KILL
6276433166	1875768873954	KILL
6276523719	1874887733830	KILL
6276685724	1873363338733	KILL
6283245463	1843085463781	FAIL
6271227810	1842057638079	KILL
6288829356	1810226800987	KILL
6288842611	1809984463546	KILL
6295425701	1796937871697	KILL
6295425806	1796894803228	KILL
6256516525	1787831764245	KILL

real	0m20.305s
user	1m26.478s
sys	0m3.938s

32 Cores:
Unique Job IDs: 672074
Top 20:
6272373604	2219071470839	FAIL
6262216777	2179583847857	KILL
6254863670	2075738599153	KILL
6254863987	2075736980619	KILL
6254863748	2075736980616	KILL
6256219395	2057085980979	KILL
6273334224	1997920913533	KILL
6273473735	1915434479654	KILL
6283813709	1884643427720	KILL
6275743443	1881727884505	KILL
6276433166	1875768873954	KILL
6276523719	1874887733830	KILL
6276685724	1873363338733	KILL
6283245463	1843085463781	FAIL
6271227810	1842057638079	KILL
6288829356	1810226800987	KILL
6288842611	1809984463546	KILL
6295425701	1796937871697	KILL
6295425806	1796894803228	KILL
6256516525	1787831764245	KILL

real	0m18.194s
user	1m6.778s
sys	0m6.591s

64 Cores:
Unique Job IDs: 672074
Top 20:
6272373604	2219071470839	FAIL
6262216777	2179583847857	KILL
6254863670	2075738599153	KILL
6254863987	2075736980619	KILL
6254863748	2075736980616	KILL
6256219395	2057085980979	KILL
6273334224	1997920913533	KILL
6273473735	1915434479654	KILL
6283813709	1884643427720	KILL
6275743443	1881727884505	KILL
6276433166	1875768873954	KILL
6276523719	1874887733830	KILL
6276685724	1873363338733	KILL
6283245463	1843085463781	FAIL
6271227810	1842057638079	KILL
6288829356	1810226800987	KILL
6288842611	1809984463546	KILL
6295425701	1796937871697	KILL
6295425806	1796894803228	KILL
6256516525	1787831764245	KILL

real	0m17.391s
user	1m1.140s
sys	0m8.802s
 
 
+------------------------------------------+ 
| PALMETTO CLUSTER PBS RESOURCES REQUESTED | 
+------------------------------------------+ 
 
mem=48gb,ncpus=64,walltime=24:00:00
 
 
+-------------------------------------+ 
| PALMETTO CLUSTER PBS RESOURCES USED | 
+-------------------------------------+ 
 
cpupercent=411,cput=00:07:48,mem=342984kb,ncpus=64,vmem=5448452kb,walltime=00:01:55
 
 