In [1]:
# import libraries
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt

# read in data, taking 10% sample
p = .1
required_cols = ['JobID','Partition', 'Account', 'Group', 'GID',
       'User', 'Submit', 'Eligible', 'Start', 'End', 'Elapsed',
       'ExitCode', 'State', 'NNodes', 'NCPUS', 'ReqCPUS', 'ReqMem', 'ReqGRES',
       'ReqTRES', 'Timelimit', 'NodeList', 'JobName']
accre = pd.read_csv('data/accre_data.txt', header=0, sep='|', usecols=required_cols, skiprows=lambda i: i>0 and random.random() > p)

In [2]:
accre.head()

Unnamed: 0,JobID,Partition,Account,Group,GID,User,Submit,Eligible,Start,End,...,State,NNodes,NCPUS,ReqCPUS,ReqMem,ReqGRES,ReqTRES,Timelimit,NodeList,JobName
0,15811617,production,cms_samtest,cms_samtest,59297,uscms010,2017-07-01T05:00:00,2017-07-01T05:00:00,2017-07-01T05:00:01,2017-07-01T05:00:11,...,COMPLETED,1,1,1,4000Mn,,"cpu=1,mem=4000M,node=1",2-00:00:00,vmp1242,bl_aa7c22e04f21
1,15811619,production,cms_stage2,cms,31000,tuos,2017-07-01T05:00:02,2017-07-01T05:00:02,2017-07-01T05:00:04,2017-07-01T05:02:07,...,COMPLETED,1,1,1,2Gc,,"cpu=1,mem=2G,node=1",12:00:00,vmp1062,skim_test
2,15811624,production,cms_stage2,cms_stage1,59298,autocms,2017-07-01T05:01:01,2017-07-01T05:01:01,2017-07-01T05:01:02,2017-07-01T05:05:49,...,COMPLETED,1,1,1,2Gc,,"cpu=1,mem=2G,node=1",12:00:00,vmp670,skim_test
3,15811625,production,cms_stage2,cms,31000,tuos,2017-07-01T05:01:02,2017-07-01T05:01:02,2017-07-01T05:01:05,2017-07-01T05:04:17,...,COMPLETED,1,1,1,2Gc,,"cpu=1,mem=2G,node=1",12:00:00,vmp1078,skim_test
4,15811657_21,production,p_iglab_csb,p_iglab,20440,nagarr1,2017-07-01T05:02:05,2017-07-01T05:02:05,2017-07-01T05:02:11,2017-07-01T05:02:23,...,COMPLETED,1,1,1,15Gn,,"cpu=1,mem=15G,node=1",1-00:00:00,vmp1215,fp_single.slurm


In [3]:
# look at the shape of the dataset
accre.shape

(1250493, 22)

In [4]:
# identify the column names of the dataset
accre.dtypes

JobID        object
Partition    object
Account      object
Group        object
GID           int64
User         object
Submit       object
Eligible     object
Start        object
End          object
Elapsed      object
ExitCode     object
State        object
NNodes        int64
NCPUS         int64
ReqCPUS       int64
ReqMem       object
ReqGRES      object
ReqTRES      object
Timelimit    object
NodeList     object
JobName      object
dtype: object

In [5]:
# convert partition & account to category
accre.Partition = accre.Partition.astype('category')
accre.Account = accre.Account.astype('category')
accre.dtypes

JobID          object
Partition    category
Account      category
Group          object
GID             int64
User           object
Submit         object
Eligible       object
Start          object
End            object
Elapsed        object
ExitCode       object
State          object
NNodes          int64
NCPUS           int64
ReqCPUS         int64
ReqMem         object
ReqGRES        object
ReqTRES        object
Timelimit      object
NodeList       object
JobName        object
dtype: object

In [6]:
# convert Submit and Start to datetime
accre['Submit'] = pd.to_datetime(accre['Submit'], format='%Y/%m/%d')
accre['Start'] = pd.to_datetime(accre['Start'], format='%Y/%m/%d')

In [9]:
# create Queue_Time column = Start - Submit
accre['Queue_Time'] = accre['Start'] - accre['Submit']
accre.Queue_Time.head()

0   00:00:01
1   00:00:02
2   00:00:01
3   00:00:03
4   00:00:06
Name: Queue_Time, dtype: timedelta64[ns]

In [10]:
# check data types
accre.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1250493 entries, 0 to 1250492
Data columns (total 23 columns):
JobID         1250493 non-null object
Partition     1239604 non-null category
Account       1250493 non-null category
Group         1250493 non-null object
GID           1250493 non-null int64
User          1250493 non-null object
Submit        1250493 non-null datetime64[ns]
Eligible      1250493 non-null object
Start         1250493 non-null datetime64[ns]
End           1250493 non-null object
Elapsed       1250493 non-null object
ExitCode      1250493 non-null object
State         1250493 non-null object
NNodes        1250493 non-null int64
NCPUS         1250493 non-null int64
ReqCPUS       1250493 non-null int64
ReqMem        1250493 non-null object
ReqGRES       8422 non-null object
ReqTRES       1239604 non-null object
Timelimit     1239604 non-null object
NodeList      1250493 non-null object
JobName       1250492 non-null object
Queue_Time    1250493 non-null timedel

In [11]:
# head data exploration for each column
for column in list(accre.columns):
    print(accre[column].value_counts(normalize=False,ascending=False).head(10))
    print('')
    print('')
    

26747804_10       2
26749408_19       2
26749408_4        2
26827147_91       2
26764369_8        2
24406196          1
24710485_1880     1
27695635_555      1
17487023_109      1
24787841_10191    1
Name: JobID, dtype: int64


production    1230572
pascal           5808
maxwell          3125
debug              51
bigbox             28
mic                20
Name: Partition, dtype: int64


cms                  354733
beam_lab             144685
cms_lowprio           93042
p_csb_meiler          76139
cms_stage2            66021
mip_eecs              58000
h_biostat_student     44527
lola                  44271
p_meiler              42766
math                  37364
Name: Account, dtype: int64


cms                  507745
beam_lab             149731
p_csb_meiler          73579
mip_eecs              58161
p_meiler              45697
cms_stage1            45315
lola                  45017
h_biostat_student     44488
math                  37365
rokaslab              29819
Name: Group, dtype

In [126]:
#get list of state values ###WORKING HERE

state_messages = accre.State.value_counts().to_frame().reset_index()
state_messages = state_messages.rename({"index":"Status","State":"Count"},axis=1)
condition1 = state_messages['Status' != 'COMPLETED']
state_messages = state_messages[condition1]

KeyError: True

In [52]:
# jobs run per node (count of JobID grouped by NodeList)
jobs_per_node = accre.JobID.groupby(accre['NodeList']).agg('count')
jobs_per_node = jobs_per_node.to_frame()

pandas.core.frame.DataFrame

In [53]:
#jobs failed per node (count of JobID grouped by NodeList from )
state_failed = ['TIMEOUT','FAILED']
state_timeout = accre.loc[accre['State'].isin(state_failed)]
fails_per_node = state_timeout.JobID.groupby(state_timeout['NodeList']).agg('count')
fails_per_node = fails_per_node.to_frame()
type(fails_per_node)

pandas.core.frame.DataFrame

In [54]:
node_fails_table = pd.merge(jobs_per_node,fails_per_node,on='NodeList',how='left')
node_fails_table.head()

Unnamed: 0_level_0,JobID_x,JobID_y
NodeList,Unnamed: 1_level_1,Unnamed: 2_level_1
None assigned,347977,175.0
amn0008,13,2.0
amn0009,10,
gpu0001,557,124.0
gpu0002,442,68.0


In [58]:
node_fails_table = node_fails_table.rename({'JobID_x':'JobRuns','JobID_y':'JobFails'},axis=1)

In [62]:
#fill NA of JobFails with 0
node_fails_table['JobFails'] = node_fails_table.JobFails.fillna(0)

Unnamed: 0_level_0,JobRuns,JobFails
NodeList,Unnamed: 1_level_1,Unnamed: 2_level_1
"vmp[1311,1415]",1,0.0
"vmp[1286,1345]",1,1.0
"vmp[1286,1288]",1,0.0
"vmp[221-226,232-235,1111]",1,0.0
"vmp[1284,1424]",1,0.0
"vmp[1284,1388]",1,0.0
"vmp[1284,1330]",1,0.0
"vmp[1284,1289]",1,0.0
"vmp[1283,1379-1380,1382]",1,0.0
"vmp[1283,1360]",1,1.0


In [63]:
node_fails_table['Fail%'] = node_fails_table.JobFails/node_fails_table.JobRuns

In [86]:
node_fails_table.sort_values(by='JobRuns', ascending=False)

Unnamed: 0_level_0,JobRuns,JobFails,Fail%
NodeList,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
None assigned,347977,175.0,0.000503
vmp1409,9353,133.0,0.014220
vmp1242,7940,552.0,0.069521
vmp1257,7403,127.0,0.017155
vmp1001,6606,557.0,0.084317
vmp1298,4734,160.0,0.033798
vmp1003,3959,404.0,0.102046
vmp1010,3672,314.0,0.085512
vmp1061,3639,498.0,0.136851
vmp1002,3583,347.0,0.096846


In [91]:
node_fails_table = node_fails_table.drop(['None assigned'])
node_fails_table.sort_values(by='JobRuns', ascending=False)

Unnamed: 0_level_0,JobRuns,JobFails,Fail%
NodeList,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
vmp1409,9353,133.0,0.014220
vmp1242,7940,552.0,0.069521
vmp1257,7403,127.0,0.017155
vmp1001,6606,557.0,0.084317
vmp1298,4734,160.0,0.033798
vmp1003,3959,404.0,0.102046
vmp1010,3672,314.0,0.085512
vmp1061,3639,498.0,0.136851
vmp1002,3583,347.0,0.096846
vmp1013,3515,335.0,0.095306


In [102]:
condition1 = node_fails_table.JobFails != 0
condition2 = node_fails_table.JobRuns > 100

node_fails_table_new = node_fails_table[condition1&condition2]

In [105]:
node_fails_table_new.sort_values(by='Fail%',ascending=False)

Unnamed: 0_level_0,JobRuns,JobFails,Fail%
NodeList,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
vmp430,142,99.0,0.697183
vmp398,1395,724.0,0.518996
vmp397,1458,721.0,0.494513
vmp1256,1535,663.0,0.431922
vmp598,112,35.0,0.312500
gpu0009,150,46.0,0.306667
vmp695,112,32.0,0.285714
gpu0006,202,56.0,0.277228
vmp592,109,30.0,0.275229
vmp692,122,32.0,0.262295


# Repeating process to include ExitCode

In [52]:
# jobs run per node (count of JobID grouped by NodeList)
jobs_per_node = accre.JobID.groupby(accre['NodeList','']).agg('count')
jobs_per_node = jobs_per_node.to_frame()

pandas.core.frame.DataFrame

In [53]:
#jobs failed per node (count of JobID grouped by NodeList from )
state_failed = ['TIMEOUT','FAILED']
state_timeout = accre.loc[accre['State'].isin(state_failed)]
fails_per_node = state_timeout.JobID.groupby(state_timeout['NodeList']).agg('count')
fails_per_node = fails_per_node.to_frame()
type(fails_per_node)

pandas.core.frame.DataFrame

In [54]:
node_fails_table = pd.merge(jobs_per_node,fails_per_node,on='NodeList',how='left')
node_fails_table.head()

Unnamed: 0_level_0,JobID_x,JobID_y
NodeList,Unnamed: 1_level_1,Unnamed: 2_level_1
None assigned,347977,175.0
amn0008,13,2.0
amn0009,10,
gpu0001,557,124.0
gpu0002,442,68.0


In [58]:
node_fails_table = node_fails_table.rename({'JobID_x':'JobRuns','JobID_y':'JobFails'},axis=1)

In [62]:
#fill NA of JobFails with 0
node_fails_table['JobFails'] = node_fails_table.JobFails.fillna(0)

Unnamed: 0_level_0,JobRuns,JobFails
NodeList,Unnamed: 1_level_1,Unnamed: 2_level_1
"vmp[1311,1415]",1,0.0
"vmp[1286,1345]",1,1.0
"vmp[1286,1288]",1,0.0
"vmp[221-226,232-235,1111]",1,0.0
"vmp[1284,1424]",1,0.0
"vmp[1284,1388]",1,0.0
"vmp[1284,1330]",1,0.0
"vmp[1284,1289]",1,0.0
"vmp[1283,1379-1380,1382]",1,0.0
"vmp[1283,1360]",1,1.0


In [63]:
node_fails_table['Fail%'] = node_fails_table.JobFails/node_fails_table.JobRuns

In [86]:
node_fails_table.sort_values(by='JobRuns', ascending=False)

Unnamed: 0_level_0,JobRuns,JobFails,Fail%
NodeList,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
None assigned,347977,175.0,0.000503
vmp1409,9353,133.0,0.014220
vmp1242,7940,552.0,0.069521
vmp1257,7403,127.0,0.017155
vmp1001,6606,557.0,0.084317
vmp1298,4734,160.0,0.033798
vmp1003,3959,404.0,0.102046
vmp1010,3672,314.0,0.085512
vmp1061,3639,498.0,0.136851
vmp1002,3583,347.0,0.096846


In [91]:
node_fails_table = node_fails_table.drop(['None assigned'])
node_fails_table.sort_values(by='JobRuns', ascending=False)

Unnamed: 0_level_0,JobRuns,JobFails,Fail%
NodeList,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
vmp1409,9353,133.0,0.014220
vmp1242,7940,552.0,0.069521
vmp1257,7403,127.0,0.017155
vmp1001,6606,557.0,0.084317
vmp1298,4734,160.0,0.033798
vmp1003,3959,404.0,0.102046
vmp1010,3672,314.0,0.085512
vmp1061,3639,498.0,0.136851
vmp1002,3583,347.0,0.096846
vmp1013,3515,335.0,0.095306


In [102]:
condition1 = node_fails_table.JobFails != 0
condition2 = node_fails_table.JobRuns > 100

node_fails_table_new = node_fails_table[condition1&condition2]

In [105]:
node_fails_table_new.sort_values(by='Fail%',ascending=False)

Unnamed: 0_level_0,JobRuns,JobFails,Fail%
NodeList,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
vmp430,142,99.0,0.697183
vmp398,1395,724.0,0.518996
vmp397,1458,721.0,0.494513
vmp1256,1535,663.0,0.431922
vmp598,112,35.0,0.312500
gpu0009,150,46.0,0.306667
vmp695,112,32.0,0.285714
gpu0006,202,56.0,0.277228
vmp592,109,30.0,0.275229
vmp692,122,32.0,0.262295


In [110]:
accre.ExitCode.value_counts()

0:0      1143297
1:0        89110
127:0      11277
2:0         1437
4:0         1041
3:0          974
6:0          877
92:0         751
9:0          505
126:0        416
11:0         286
25:0         155
7:0          133
0:9           64
65:0          35
8:0           26
116:0         14
5:0           12
122:0          9
102:0          9
104:0          7
28:0           7
105:0          5
100:0          5
15:39          5
15:36          5
29:0           5
13:0           4
85:0           3
120:0          3
103:0          2
15:0           2
56:0           2
39:0           1
123:0          1
16:0           1
21:0           1
125:0          1
0:15           1
46:0           1
109:0          1
63:0           1
119:0          1
Name: ExitCode, dtype: int64

In [111]:
accre['ExitCodeShort'] = accre.ExitCode.str[-1]