In [1]:
# import libraries
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt

# read in data, taking 10% sample
p = .1
required_cols = ['JobID','Partition', 'Account', 'Group', 'GID',
       'User', 'Submit', 'Eligible', 'Start', 'End', 'Elapsed',
       'ExitCode', 'State', 'NNodes', 'NCPUS', 'ReqCPUS', 'ReqMem', 'ReqGRES',
       'ReqTRES', 'Timelimit', 'NodeList', 'JobName']
accre = pd.read_csv('data/accre_data.txt', header=0, sep='|', usecols=required_cols, skiprows=lambda i: i>0 and random.random() > p)

In [2]:
accre.head()

Unnamed: 0,JobID,Partition,Account,Group,GID,User,Submit,Eligible,Start,End,...,State,NNodes,NCPUS,ReqCPUS,ReqMem,ReqGRES,ReqTRES,Timelimit,NodeList,JobName
0,15811631,production,h_vuiis,h_vuiis,36052,vuiiscci,2017-07-01T05:01:23,2017-07-01T05:01:23,2017-07-01T06:03:05,2017-07-05T15:47:51,...,COMPLETED,1,1,1,10Gn,,"cpu=1,mem=10G,node=1",5-08:00:00,vmp1091,NDW_ROCKLAND-x-A00055122-x-A00055122_V2R-x-MGC...
1,15811641,production,h_vuiis,h_vuiis,36052,vuiiscci,2017-07-01T05:01:25,2017-07-01T05:01:25,2017-07-01T10:48:05,2017-07-06T03:18:28,...,COMPLETED,1,1,1,10Gn,,"cpu=1,mem=10G,node=1",5-08:00:00,vmp1088,NDW_ROCKLAND-x-A00043677-x-A00043677_V2-x-MGC_...
2,15811644,production,h_vuiis,h_vuiis,36052,vuiiscci,2017-07-01T05:01:25,2017-07-01T05:01:25,2017-07-01T12:37:01,2017-07-06T20:37:22,...,TIMEOUT,1,1,1,10Gn,,"cpu=1,mem=10G,node=1",5-08:00:00,vmp1087,NDW_ROCKLAND-x-A00052502-x-A00052502_V2-x-MGC_...
3,15811657_31,production,p_iglab_csb,p_iglab,20440,nagarr1,2017-07-01T05:02:05,2017-07-01T05:02:05,2017-07-01T05:02:11,2017-07-01T05:02:21,...,COMPLETED,1,1,1,15Gn,,"cpu=1,mem=15G,node=1",1-00:00:00,vmp1261,fp_single.slurm
4,15811657_73,production,p_iglab_csb,p_iglab,20440,nagarr1,2017-07-01T05:02:05,2017-07-01T05:02:05,2017-07-01T05:02:11,2017-07-01T05:02:42,...,COMPLETED,1,1,1,15Gn,,"cpu=1,mem=15G,node=1",1-00:00:00,vmp1284,fp_single.slurm


In [3]:
# look at the shape of the dataset
accre.shape

(1251534, 22)

In [4]:
# identify the column names of the dataset
accre.columns

Index(['JobID', 'Partition', 'Account', 'Group', 'GID', 'User', 'Submit',
       'Eligible', 'Start', 'End', 'Elapsed', 'ExitCode', 'State', 'NNodes',
       'NCPUS', 'ReqCPUS', 'ReqMem', 'ReqGRES', 'ReqTRES', 'Timelimit',
       'NodeList', 'JobName'],
      dtype='object')

In [5]:
# convert Submit and Start to datetime
accre['Submit'] = pd.to_datetime(accre['Submit'], format='%Y/%m/%d')
accre['Start'] = pd.to_datetime(accre['Start'], format='%Y/%m/%d')

In [6]:
# check head of Submit
accre.Submit.head()

0   2017-07-01 05:01:23
1   2017-07-01 05:01:25
2   2017-07-01 05:01:25
3   2017-07-01 05:02:05
4   2017-07-01 05:02:05
Name: Submit, dtype: datetime64[ns]

In [7]:
# check head of Start
accre.Start.head()

0   2017-07-01 06:03:05
1   2017-07-01 10:48:05
2   2017-07-01 12:37:01
3   2017-07-01 05:02:11
4   2017-07-01 05:02:11
Name: Start, dtype: datetime64[ns]

In [8]:
# create Queue_Time column = Start - Submit
accre['Queue_Time'] = accre['Start'] - accre['Submit']
accre.Queue_Time.head()

0   01:01:42
1   05:46:40
2   07:35:36
3   00:00:06
4   00:00:06
Name: Queue_Time, dtype: timedelta64[ns]

In [9]:
# check data types
accre.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1251534 entries, 0 to 1251533
Data columns (total 23 columns):
JobID         1251534 non-null object
Partition     1240594 non-null object
Account       1251534 non-null object
Group         1251534 non-null object
GID           1251534 non-null int64
User          1251534 non-null object
Submit        1251534 non-null datetime64[ns]
Eligible      1251534 non-null object
Start         1251534 non-null datetime64[ns]
End           1251534 non-null object
Elapsed       1251534 non-null object
ExitCode      1251534 non-null object
State         1251534 non-null object
NNodes        1251534 non-null int64
NCPUS         1251534 non-null int64
ReqCPUS       1251534 non-null int64
ReqMem        1251534 non-null object
ReqGRES       8456 non-null object
ReqTRES       1240594 non-null object
Timelimit     1240594 non-null object
NodeList      1251534 non-null object
JobName       1251534 non-null object
Queue_Time    1251534 non-null timedelta64

In [10]:
for column in list(accre.columns):
    print(accre[column].value_counts(normalize=False,ascending=False).head(10))
    print('')
    print('')
    

26764369_57    2
26864480_29    2
19224428_94    2
26747804_28    2
26764369_90    2
23753079_72    2
26828657_47    2
23753079_44    2
26828657_37    2
21880373       1
Name: JobID, dtype: int64


production    1231546
pascal           5785
maxwell          3171
debug              44
bigbox             31
mic                15
fermi               2
Name: Partition, dtype: int64


cms                  353904
beam_lab             145194
cms_lowprio           93042
p_csb_meiler          76419
cms_stage2            66430
mip_eecs              57621
h_biostat_student     44718
lola                  44395
p_meiler              42716
math                  37192
Name: Account, dtype: int64


cms                  506584
beam_lab             150262
p_csb_meiler          73793
mip_eecs              57791
cms_stage1            45794
p_meiler              45634
lola                  45129
h_biostat_student     44681
math                  37192
rokaslab              30298
Name: Group, dtype: int64


In [11]:
# node list counted by count of Job ID
accre['JobID'].groupby(accre['NodeList']).agg('count')

NodeList
None assigned                                                                  347630
amn0008                                                                            14
amn0009                                                                            14
gpu0001                                                                           606
gpu0002                                                                           368
gpu0003                                                                           427
gpu0004                                                                           305
gpu0005                                                                           229
gpu0006                                                                           202
gpu0007                                                                           122
gpu0008                                                                           175
gpu0009                                      

In [12]:
accre_timeout = accre.loc[accre['State'] == 'TIMEOUT']

# node list counted by count of Job ID
# accre['State' == 'TIMEOUT'].groupby(accre['NodeList']).agg('count')

In [13]:
# node list by count of Job ID where 
node_timeout_totals = accre_timeout['JobID'].groupby(accre['NodeList']).agg('count').sort_values()

In [23]:
node_timeout_totals = node_timeout_totals.to_frame()

AttributeError: 'DataFrame' object has no attribute 'to_frame'

In [None]:
node_timeout_totals = node_timeout_totals.rename({"JobID":"Number of Timeouts"},axis=1)

In [31]:
node_timeout_totals.sort_values('Number of Timeouts',ascending=False)

Unnamed: 0_level_0,Number of Timeouts
NodeList,Unnamed: 1_level_1
vmp239,82
vmp1092,55
vmp1093,53
vmp429,51
vmp1117,49
vmp467,48
vmp1087,47
vmp1086,47
vmp447,46
vmp1073,46


In [41]:
node_attempt_totals = accre['JobID'].groupby(accre['NodeList']).agg('count').sort_values().to_frame()
node_attempt_totals = node_attempt_totals.rename({"JobID":"Number of Attempts"},axis=1)
print(node_attempt_totals)

                                                    Number of Attempts
NodeList                                                              
vmp[1303,1333,1368]                                                  1
vmp[205,410-414,450-451,490-491,1093-1094,1133-...                   1
vmp[205,228,345,412,472-474,477-479,482-488,493...                   1
vmp[205,225,301,303-305,307-308,312,324-325,327...                   1
vmp[205,211]                                                         1
vmp[205,209]                                                         1
vmp[205,1037]                                                        1
vmp[204-205,212,214]                                                 1
vmp[204,216]                                                         1
vmp[204,215-216,1308-1310,1312-1313,1369-1378]                       1
vmp[204,210,219,232,238-239,416,1055,1083,1300,...                   1
vmp[203-206,208-210,1049-1050,1107-1108,1114-1117]                   1
vmp[20

In [42]:
timeout_percentage = pd.merge(node_timeout_totals,node_attempt_totals,on="NodeList",how="left")

In [49]:
timeout_percentage['Timeout Percentage'] = timeout_percentage['Number of Timeouts']/timeout_percentage['Number of Attempts']

In [53]:
timeout_percentage = timeout_percentage.sort_values(by="Timeout Percentage",ascending=False)
timeout_percentage

Unnamed: 0_level_0,Number of Timeouts,Number of Attempts,Timeout Percentage
NodeList,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"vmp[221,227]",1,1,1.000000
"vmp[212,226,233,302-304,314-316,334,337,356-358,364-365,384,396,403,408-409,419,437,448,453-455,457,459,470,476-479,494,1054,1073,1078,1091,1096,1099,1109,1120,1201,1203,1233,1273-1274,1276-1277,1285,1287-1288,1295,1297,1299,1342,1345,1366,1374,1384,1389,1414-1415]",1,1,1.000000
vmp[1383-1390],1,1,1.000000
"vmp[304,352,423,484-491,1202,1207,1212,1232,1259,1262,1272,1357-1361,1364-1365]",1,1,1.000000
"vmp[1361,1382]",1,1,1.000000
vmp[681-682],1,1,1.000000
"vmp[1306,1334,1359,1402]",1,1,1.000000
"gpu[0015,0026]",1,1,1.000000
"vmp[201-202,208-211,213,1003,1007,1011-1016,1031,1045,1053-1054,1088-1091,1098,1102-1103,1105-1106,1129-1130,1132-1134]",1,1,1.000000
"vmp[1296,1355]",1,1,1.000000
