In [2]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd

## Read in dataset

In [3]:
month1 = pd.read_csv("aug13_decoded", sep='\t')
month2 = pd.read_csv("sep13_decoded", sep='\t')

# merge datasets into df
df = pd.concat([month1, month2])

## Let's inspect the dataset

In [4]:
df.head()

Unnamed: 0,NodeID,Date Time,Complete Node,Cabinet,Chassis,Slot,Node,Node Type,Processor,Time,...,Cache way in error,Syndrome,Core,Errorcode,Ext_errorcode,Error Type,Addr,Addr Desc,Errorcode Type,Misc
0,70,2013-08-01 00:01:24,c19-7c1s7n1,19-7,1.0,7.0,1.0,xe,2:600f12,1375333000.0,...,,'110000100,0.0,'100001011,'11100,"L3 Cache Data Error, Single bit Error, Cache W...",0d1c4,Physical,mem,c00a000001000000
1,70,2013-08-01 00:05:46,c17-0c1s7n0,17-0,1.0,7.0,0.0,xk,2:600f12,1375334000.0,...,,'100101110,,'100000010011,'1000,ECC Error,44bd7910,Physical,bus,c00a002f01000000
2,70,2013-08-01 00:06:24,c12-7c2s6n1,12-7,2.0,6.0,1.0,xe,2:600f12,1375334000.0,...,,'1110110100100000,,'101000010011,'1000,ECC Error,483b3e350,Physical,bus,c00a000101000000
3,70,2013-08-01 00:06:24,c10-9c0s7n3,10-9,0.0,7.0,3.0,xe,2:600f12,1375334000.0,...,,'10,7.0,'100011011,'11100,"L3 Cache Data Error, Single bit Error, Cache W...",411bf0e44,Physical,mem,c00a000001000000
4,70,2013-08-01 00:06:27,c19-5c2s3n3,19-5,2.0,3.0,3.0,xe,2:600f12,1375334000.0,...,,'1001110100000001,,'101000010011,'1000,ECC Error,4854e0c40,Physical,bus,c00a000101000000


In [5]:
df.keys()

Index(['NodeID', 'Date Time', 'Complete Node', 'Cabinet', 'Chassis', 'Slot',
       'Node', 'Node Type', 'Processor', 'Time', 'Socket', 'Apic', 'Bank',
       'Err Val', 'OV', 'UC', 'PCC', 'CECC', 'UECC', 'DEF', 'POISON',
       'L3 Subcache', 'Sub Link', 'LDT Link', 'Scrub', 'Link',
       'Cache way in error', 'Syndrome', 'Core', 'Errorcode', 'Ext_errorcode',
       'Error Type', 'Addr', 'Addr Desc', 'Errorcode Type', 'Misc'],
      dtype='object')

In [6]:
# J: how do these columns look?
# df['UECC'].value_counts()
print(df['Syndrome'].value_counts())


'100000101           20799
'111100001           20753
'111001010           20692
'10111001            20590
10000010000000       20074
'10000101            19452
'111000110           17401
'1000000100000       11077
'1000101100010000     6391
'10001100111          5948
'101011100            5846
'10110000             5784
'110001100000001      5771
'1000000000100        5251
'1000000000100111     4956
'10000011             4891
'1000000010101011     4880
'100000000010         4858
'100000010000000      4628
'10010001011          4053
'10000                3547
'10000010000000       3496
'110001100000010      3320
'1000000001000000     3285
'110011101            3278
'10011000110          3239
'10000001000000       3203
'10000000000101       3199
'100100010            3140
'10000010000          2802
                     ...  
'1001011100000000        1
'111011001111111         1
'10110100011001          1
'1100101                 1
'1011101111110           1
'11110011011111          1
'

## Looks like we have some timestamps in the 'Time' column.
## Let's convert these to human readable timestamps

In [7]:
df['datetime'] = pd.to_datetime(df['Time'], unit='s')
df['datetime'] = df['datetime'].dt.tz_localize('America/Chicago', ambiguous=True)


In [8]:
print(min(df['datetime']))
print(max(df['datetime']))

1970-01-01 00:00:01-06:00
2013-10-01 04:58:06-05:00


## Hmm, we shouldn't have timestamps from 1970. This is probably bad data.
## Let's filter out these bad entries

In [9]:
# J: checking data values
print(df['Time'])

0         1.375333e+09
1         1.375334e+09
2         1.375334e+09
3         1.375334e+09
4         1.375334e+09
5         1.375334e+09
6         1.375334e+09
7         1.375334e+09
8         1.375334e+09
9         1.375335e+09
10        1.375331e+09
11        1.375331e+09
12        1.375331e+09
13        1.375331e+09
14        1.375331e+09
15        1.375331e+09
16        1.375331e+09
17        1.375331e+09
18        1.375331e+09
19        1.375331e+09
20        1.375331e+09
21        1.375331e+09
22        1.375331e+09
23        1.375331e+09
24        1.375331e+09
25        1.375331e+09
26        1.375331e+09
27        1.375331e+09
28        1.375331e+09
29        1.375331e+09
              ...     
184598    1.380603e+09
184599    1.380603e+09
184600    1.380603e+09
184601    1.380603e+09
184602    1.380599e+09
184603    1.380599e+09
184604    1.380599e+09
184605    1.380599e+09
184606    1.380599e+09
184607    1.380599e+09
184608    1.380599e+09
184609    1.380599e+09
184610    1

In [10]:
# keep entries between July 31st and Oct 2nd
df = df[(1375272000 < df['Time']) & (df['Time'] < 1380715200)]

# If you open up the datasets in a text editor, you'll see that 
# Node Type entries with 'service_not_present' have incomplete information
df = df[df['Node Type'] != 'service_not_present']

In [11]:
print(df['Node Type'])

0              xe
1              xk
2              xe
3              xe
4              xe
5              xk
6              xk
7              xk
8              xk
9              xk
13        service
14        service
16        service
17        service
19        service
20        service
22        service
23        service
25        service
26        service
28        service
29        service
31        service
32        service
33        service
34        service
35             xe
36             xk
37             xk
38             xe
           ...   
184598         xe
184599         xe
184600         xe
184601         xe
184602         xe
184603         xe
184604         xe
184605         xe
184606         xe
184607         xe
184608         xe
184609         xe
184610         xe
184611         xe
184612         xe
184613         xe
184614         xe
184615         xe
184616         xe
184617         xe
184618         xe
184619         xe
184620         xe
184621         xe
184622    

In [12]:
print(min(df['datetime']))
print(max(df['datetime']))

2013-08-01 04:26:05-05:00
2013-10-01 04:58:06-05:00


## Much better

## Task 0 - Let's count some columns


In [13]:
# total number of entries
len(df)

389624

In [14]:
# number of nodes
len(df['Complete Node'].unique())

6222

In [15]:
# number of days
# normalize() sets the time to 00:00:00 giving us only the date portion of the timestamp/
len(df['datetime'].dt.normalize().unique())

60

In [16]:
# node types
df['Node Type'].unique()

array(['xe', 'xk', 'service'], dtype=object)

In [17]:
df['Complete Node'].value_counts()

# don't forget to represent this data as a box plot

c13-4c0s2n0     20738
c18-7c1s2n3     20671
c21-5c0s3n0     20601
c0-11c2s3n1     20570
c1-3c2s7n1      19398
c16-9c2s2n0     14425
c8-0c1s7n3      10451
c13-7c0s6n1      9188
c16-11c2s3n3     9115
c22-10c1s3n2     7147
c5-4c2s3n3       6619
c8-6c2s7n3       6489
c13-9c0s1n0      5987
c17-3c2s2n3      5745
c22-8c2s5n0      5708
c3-8c0s4n3       5189
c9-1c1s1n0       5034
c3-7c0s5n3       4813
c7-8c1s0n1       4012
c19-5c2s7n1      3589
c5-3c0s5n1       3168
c7-10c2s1n1      2984
c6-11c2s0n2      2965
c5-5c1s0n2       2883
c13-11c2s3n1     2758
c7-11c1s2n1      2598
c19-9c2s7n0      2520
c3-0c2s7n3       2449
c7-8c1s4n2       2342
c15-3c2s7n1      2270
                ...  
c16-11c1s7n2        1
c18-2c0s7n2         1
c15-11c0s5n3        1
c4-0c0s4n0          1
c0-11c2s2n3         1
c7-1c0s4n2          1
c8-0c1s5n0          1
c10-5c2s2n0         1
c8-9c0s1n2          1
c19-9c2s1n2         1
c3-5c1s2n3          1
c12-10c1s1n3        1
c4-5c2s7n1          1
c23-7c0s5n0         1
c5-8c1s5n0

In [18]:
#BEFORE

timevals = df['Time'].values

# this finds the difference between consecutive values in timevals
diffs = timevals[1:] - timevals[:-1]
diffs.mean(), diffs.std()

(13.526413995067026, 8039.182077463628)

In [19]:
df.sort_values(by=['Time']);
print("DONE")

DONE


In [20]:
# AFTER sorted by time
timevals = df['Time'].values

# this finds the difference between consecutive values in timevals
diffs = timevals[1:] - timevals[:-1]
diffs.mean(), diffs.std()

(13.526413995067026, 8039.182077463628)

In [21]:
for nt in ['service', 'xe', 'xk']:
    timevals = df[df['Node Type'] == nt]['Time'].values
    timevals.sort()
    diffs = timevals[1:] - timevals[:-1]
    print (nt, diffs.mean(), diffs.std())

service 476.487976858 3359.92168417
xe 14.355954498 406.110647721
xk 463.563434806 2802.28529411
