In [1]:
import pandas as pd
import numpy as np

### Reading in the data, checking for nulls or unexpected data types.

In [2]:
# Load the raw device telemetry export and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="../data/device_failure_data_scientist.csv")
df.head(n=5)

Unnamed: 0,date,device,attribute1,attribute2,attribute3,attribute4,attribute5,attribute6,attribute7,attribute8,attribute9,failure
0,15001,S1F01085,215630672,56,0,52,6,407438,0,0,7,0
1,15001,S1F0166B,61370680,0,3,0,6,403174,0,0,0,0
2,15001,S1F01E6Y,173295968,0,0,0,12,237394,0,0,0,0
3,15001,S1F01JE0,79694024,0,0,0,6,410186,0,0,0,0
4,15001,S1F01R2B,135970480,0,0,0,15,313173,0,0,3,0


In [3]:
# Preview the last five rows to see where the date range ends.
df.tail(n=5)

Unnamed: 0,date,device,attribute1,attribute2,attribute3,attribute4,attribute5,attribute6,attribute7,attribute8,attribute9,failure
124489,15306,Z1F0MA1S,18310224,0,0,0,10,353705,8,8,0,0
124490,15306,Z1F0Q8RT,172556680,96,107,4,11,332792,0,0,13,0
124491,15306,Z1F0QK05,19029120,4832,0,0,11,350410,0,0,0,0
124492,15306,Z1F0QL3N,226953408,0,0,0,12,358980,0,0,0,0
124493,15306,Z1F0QLC1,17572840,0,0,0,10,351431,0,0,0,0


Checking for null values

In [4]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

date          0
device        0
attribute1    0
attribute2    0
attribute3    0
attribute4    0
attribute5    0
attribute6    0
attribute7    0
attribute8    0
attribute9    0
failure       0
dtype: int64

In [17]:
# Eyeball check that every row has a real device ID rather than an empty string:
# empty or whitespace-only strings sort before alphanumeric IDs, so in descending
# order they would surface at the bottom of the displayed Series.
df['device'].sort_values(ascending=False)

45203    Z1F2PBHX
43187    Z1F2PBHX
54650    Z1F2PBHX
28591    Z1F2PBHX
30726    Z1F2PBHX
           ...   
2326     S1F01085
4651     S1F01085
5812     S1F01085
3489     S1F01085
0        S1F01085
Name: device, Length: 124494, dtype: object

In [5]:
# (number of rows, number of columns) in the raw frame.
df.shape

(124494, 12)

In [6]:
# Confirm column types: `device` is the only non-numeric (object) column.
df.dtypes

date           int64
device        object
attribute1     int64
attribute2     int64
attribute3     int64
attribute4     int64
attribute5     int64
attribute6     int64
attribute7     int64
attribute8     int64
attribute9     int64
failure        int64
dtype: object

### Looking at the two classes
Baseline accuracy (always predicting "no failure") $= 124388 / 124494 \approx 0.999$, which reflects a very large class imbalance.

In [43]:
# Row counts per class (0 = no failure that day, 1 = failure) to gauge the imbalance.
df['failure'].value_counts()

0    124388
1       106
Name: failure, dtype: int64

In [51]:
# Distinct device IDs, in order of first appearance.
df['device'].unique()

array(['S1F01085', 'S1F0166B', 'S1F01E6Y', ..., 'S1F02W1L', 'S1F02XLX',
       'S1F03499'], dtype=object)

In [53]:
# Number of telemetry rows recorded per device; the counts vary widely between devices.
df['device'].value_counts()

Z1F0MA1S    304
Z1F0QL3N    304
W1F0FZPA    304
Z1F0QK05    304
W1F0JY02    304
           ... 
Z1F0L4J2      5
S1F0CVRM      5
S1F0BN0S      5
S1F04KSC      4
W1F0WJFT      3
Name: device, Length: 1168, dtype: int64

`failed_devices` is a list of every device that recorded a failure.  A failed device's telemetry stops on the day it
fails, while devices that never failed (unsurprisingly) keep transmitting.  I can use this list to filter the data
frame down to devices that failed or to devices that didn't fail, the latter by removing every row that belongs to a
failed device.  Comparing the telemetry of the two groups should make it easier to figure out which attributes are relevant.

In [76]:
# Collect the device ID from every row flagged as a failure.
failed_devices = df.loc[df['failure'] == 1, 'device'].tolist()
failed_devices


['S1F0RRB1',
 'S1F0CTDN',
 'W1F0PNA5',
 'W1F13SRV',
 'W1F1230J',
 'W1F0T034',
 'S1F0GG8X',
 'S1F023H2',
 'S1F0QY11',
 'S1F0S2WJ',
 'W1F0Z1W9',
 'W1F15S4D',
 'Z1F0LVPW',
 'Z1F0NVZA',
 'Z1F1FCH5',
 'S1F0P3G2',
 'W1F0F6BN',
 'W1F0P114',
 'W1F0X4FC',
 'S1F0LCTV',
 'W1F03DP4',
 'W1F0FW0S',
 'S1F10E6M',
 'S1F11MB0',
 'W1F0SGHR',
 'W1F0VDH2',
 'W1F0TA59',
 'Z1F0LVGY',
 'Z1F0MCCA',
 'Z1F0P5D9',
 'W1F0NZZZ',
 'W1F0T074',
 'S1F0DSTY',
 'S1F0TQCV',
 'Z1F04GCH',
 'W1F08EDA',
 'W1F1C9TE',
 'S1F0S4CA',
 'W1F19BPT',
 'Z1F130LH',
 'S1F0GJW3',
 'S1F0LD2C',
 'W1F0Q8FH',
 'Z1F0FSBY',
 'W1F0Z4EA',
 'Z1F0QH0C',
 'S1F0S4T6',
 'W1F1CDDP',
 'S1F0S57T',
 'S1F0JD7P',
 'S1F13H80',
 'Z1F148T1',
 'S1F0RSZP',
 'S1F0GKFX',
 'S1F0LCVC',
 'W1F1BZTM',
 'Z1F1RJFA',
 'S1F13589',
 'S1F136J0',
 'S1F0F4EB',
 'W1F1C9WG',
 'S1F0RR35',
 'Z1F1653X',
 'Z1F1AG5N',
 'W1F0KCP2',
 'W1F0M35B',
 'Z1F1901P',
 'S1F0GKL6',
 'Z1F0K451',
 'W1F03D4L',
 'W1F0FKWW',
 'S1F0PJJW',
 'W1F0X5GW',
 'S1F0L0DW',
 'W1F0WBTM',
 'S1F0GSD9',
 'S1F0QF3R',

### Separating the failed devices into a separate data frame for easier viewing.

In [9]:
# Keep only the rows on which a failure was recorded (one row per failure event).
failed_df = df.query("failure == 1")

In [10]:
# Re-derive the failure rows and renumber them 0..n-1, keeping the original row
# label in an `index` column.  Rebuilding from `df` instead of mutating in place
# keeps this cell idempotent: re-running an in-place reset_index would add a
# `level_0` column on the second run and raise ValueError on the third.
failed_df = df.query("failure == 1").reset_index()


In [18]:
# Every count is 1: no device failed more than once, so failures are not
# concentrated on particular devices.
failed_df['device'].value_counts()

W1F0VDH2    1
Z1F0P16F    1
S1F0QY11    1
W1F19BPT    1
W1F0FKWW    1
           ..
Z1F0P5D9    1
W1F0WBTM    1
S1F0RSZP    1
S1F0PJJW    1
S1F023H2    1
Name: device, Length: 106, dtype: int64

### Looking more closely at telemetry data.

In [13]:
# Column means over all rows.  numeric_only=True skips the string `device` column
# explicitly; aggregating with np.mean relied on pandas silently dropping it,
# which raises TypeError on pandas 2.x.
df.mean(numeric_only=True)

date          1.510622e+04
attribute1    1.223868e+08
attribute2    1.594848e+02
attribute3    9.940455e+00
attribute4    1.741120e+00
attribute5    1.422269e+01
attribute6    2.601729e+05
attribute7    2.925282e-01
attribute8    2.925282e-01
attribute9    1.245152e+01
failure       8.514467e-04
dtype: float64

In [14]:
# Column means over failure rows only.  numeric_only=True skips the string
# `device` column explicitly; aggregating with np.mean relied on pandas silently
# dropping it, which raises TypeError on pandas 2.x.
df.query('failure == 1').mean(numeric_only=True)

date          1.510791e+04
attribute1    1.271755e+08
attribute2    4.109434e+03
attribute3    3.905660e+00
attribute4    5.463208e+01
attribute5    1.546226e+01
attribute6    2.583035e+05
attribute7    3.062264e+01
attribute8    3.062264e+01
attribute9    2.308491e+01
failure       1.000000e+00
dtype: float64

In [15]:
# Column means over non-failure rows only.  numeric_only=True skips the string
# `device` column explicitly; aggregating with np.mean relied on pandas silently
# dropping it, which raises TypeError on pandas 2.x.
df.query('failure==0').mean(numeric_only=True)

date          1.510622e+04
attribute1    1.223827e+08
attribute2    1.561187e+02
attribute3    9.945598e+00
attribute4    1.696048e+00
attribute5    1.422164e+01
attribute6    2.601745e+05
attribute7    2.666817e-01
attribute8    2.666817e-01
attribute9    1.244246e+01
failure       0.000000e+00
dtype: float64

In [96]:
# Full telemetry history of one device that appears in `failed_devices`.
df.loc[df['device'] == 'S1F0RRB1']

Unnamed: 0,date,device,attribute1,attribute2,attribute3,attribute4,attribute5,attribute6,attribute7,attribute8,attribute9,failure
235,15001,S1F0RRB1,5230888,2288,0,37,8,39267,24,24,1,0
1398,15002,S1F0RRB1,13307628,64776,0,49,8,39267,56,56,1,0
2561,15003,S1F0RRB1,26258330,64776,0,135,8,39267,56,56,1,0
3724,15004,S1F0RRB1,37985862,64776,0,763,8,39267,56,56,1,0
4885,15005,S1F0RRB1,48467332,64776,0,841,8,39267,56,56,1,1


In [98]:
# Telemetry history of a second device that appears in `failed_devices`.
df.loc[df['device'] == 'W1F0F6BN']

Unnamed: 0,date,device,attribute1,attribute2,attribute3,attribute4,attribute5,attribute6,attribute7,attribute8,attribute9,failure
557,15001,W1F0F6BN,113361680,0,0,0,12,225194,0,0,0,0
1720,15002,W1F0F6BN,132121976,0,0,0,12,226495,0,0,0,0
2883,15003,W1F0F6BN,153745136,0,0,0,12,227728,0,0,0,0
4046,15004,W1F0F6BN,176403712,0,0,0,12,228862,0,0,0,0
5207,15005,W1F0F6BN,200875200,0,0,0,12,230151,0,0,0,0
6321,15006,W1F0F6BN,221711128,0,0,0,12,231468,0,0,0,0
7245,15007,W1F0F6BN,1509976,0,0,0,12,232777,0,0,0,0
8009,15008,W1F0F6BN,22376168,0,0,0,12,234010,0,0,0,0
8765,15009,W1F0F6BN,49165752,0,0,0,12,235082,0,0,0,0
9521,15010,W1F0F6BN,70592240,0,0,0,12,236340,0,0,0,0


In [97]:
# Telemetry history of another single device, for side-by-side comparison.
df.loc[df['device'] == 'S1F0T2LA']

Unnamed: 0,date,device,attribute1,attribute2,attribute3,attribute4,attribute5,attribute6,attribute7,attribute8,attribute9,failure
338,15001,S1F0T2LA,160992856,424,0,1,10,247498,0,0,0,0
1501,15002,S1F0T2LA,183767064,424,0,1,10,248860,0,0,0,0
2664,15003,S1F0T2LA,205069592,424,0,1,10,250270,0,0,0,0
3827,15004,S1F0T2LA,226287408,424,0,1,10,251628,0,0,0,0
4988,15005,S1F0T2LA,6992384,424,0,1,10,252990,0,0,0,0
6112,15006,S1F0T2LA,28195072,424,0,1,10,254346,0,0,0,0
7077,15007,S1F0T2LA,48014624,424,0,1,10,255756,0,0,0,0
7855,15008,S1F0T2LA,68808352,424,0,1,10,257110,0,0,0,0
8611,15009,S1F0T2LA,91780304,424,0,1,10,258508,0,0,0,0
9367,15010,S1F0T2LA,110733408,424,0,1,10,259877,0,0,0,0


Telemetry data ends when a device fails, and some attributes seem to be constant for each device.


In [101]:
# Average `attribute5` per device; a whole-number mean suggests the value never
# changed for that device.
df['attribute5'].groupby(df['device']).mean()

device
S1F01085     6.000000
S1F013BB     5.000000
S1F0166B     6.000000
S1F01E6Y    12.000000
S1F01JE0     6.000000
              ...    
Z1F1VMZB     5.000000
Z1F1VQFY     6.328000
Z1F26YZB     1.000000
Z1F282ZV     1.000000
Z1F2PBHX     4.927711
Name: attribute5, Length: 1168, dtype: float64

In [111]:
# Inspect a device whose mean `attribute5` is not a whole number to see how the
# value changes over time.
df.loc[df['device'] == "Z1F2PBHX"]

Unnamed: 0,date,device,attribute1,attribute2,attribute3,attribute4,attribute5,attribute6,attribute7,attribute8,attribute9,failure
1162,15001,Z1F2PBHX,129475464,0,0,0,4,148008,0,0,0,0
2325,15002,Z1F2PBHX,148229136,0,0,0,4,148008,0,0,0,0
3488,15003,Z1F2PBHX,188059992,0,0,0,4,148008,0,0,0,0
4650,15004,Z1F2PBHX,167365376,0,0,0,4,148745,0,0,0,0
5811,15005,Z1F2PBHX,205455544,0,0,0,4,149672,0,0,0,0
6865,15006,Z1F2PBHX,243630240,0,0,0,4,150588,0,0,0,0
17196,15020,Z1F2PBHX,171318632,0,0,0,5,157075,0,0,0,0
17908,15021,Z1F2PBHX,178270360,0,0,0,5,157122,0,0,0,0
18620,15022,Z1F2PBHX,133112960,0,0,0,5,157403,0,0,0,0
19332,15023,Z1F2PBHX,234895416,0,0,0,5,157404,0,0,0,0


Per-device mean of `attribute5` (sorted unique values) for devices that failed.

In [121]:
# Sorted, de-duplicated per-device means of `attribute5`, restricted to devices
# listed in `failed_devices`.
(
    df.loc[df['device'].isin(failed_devices)]
    .groupby('device')['attribute5']
    .mean()
    .sort_values()
    .unique()
)

array([ 2.90829694,  4.        ,  5.73076923,  5.85714286,  6.        ,
        6.328     ,  6.33333333,  6.4       ,  6.53846154,  6.89447236,
        7.        ,  7.01526718,  7.04964539,  7.5       ,  7.57142857,
        7.74698795,  7.85714286,  7.89119171,  7.90232558,  7.92446043,
        8.        ,  8.17283951,  8.72368421,  8.89756098,  8.90869565,
        9.        ,  9.83464567,  9.87573964, 10.        , 10.28813559,
       10.45454545, 11.        , 11.19230769, 11.53333333, 11.85810811,
       12.        , 12.13793103, 12.72368421, 13.        , 13.01666667,
       13.93814433, 14.        , 14.65263158, 14.66666667, 14.78672986,
       15.71428571, 16.        , 16.856     , 19.        , 23.        ,
       24.        , 25.        , 30.        , 30.5       , 33.775     ,
       35.325     , 35.60869565, 35.72727273, 36.74285714, 40.1025641 ,
       58.        , 64.        , 65.        , 90.        , 91.        ])

Per-device mean of `attribute5` (sorted unique values) for devices that didn't fail.

In [120]:
# Sorted, de-duplicated per-device means of `attribute5`, restricted to devices
# that are NOT listed in `failed_devices`.
(
    df.loc[~df['device'].isin(failed_devices)]
    .groupby('device')['attribute5']
    .mean()
    .sort_values()
    .unique()
)

array([ 1.        ,  2.        ,  2.97087379,  3.        ,  3.03289474,
        3.38135593,  4.        ,  4.38135593,  4.4516129 ,  4.5       ,
        4.81443299,  4.92771084,  5.        ,  5.22222222,  5.38135593,
        5.5       ,  5.6       ,  5.7635468 ,  5.92579505,  5.9384058 ,
        6.        ,  6.03289474,  6.11111111,  6.11458333,  6.218107  ,
        6.40707965,  6.5       ,  6.8452381 ,  6.90721649,  6.91864407,
        6.93617021,  6.95510204,  6.97029703,  6.97627119,  7.        ,
        7.03289474,  7.40707965,  7.41818182,  7.43654822,  7.5       ,
        7.66666667,  7.69230769,  7.89166667,  7.90721649,  7.91428571,
        7.92250923,  7.92307692,  7.92579505,  7.97087379,  7.97849462,
        8.        ,  8.03289474,  8.05357143,  8.05555556,  8.10714286,
        8.17777778,  8.20205479,  8.31147541,  8.40707965,  8.42718447,
        8.49411765,  8.5       ,  8.51612903,  8.75342466,  8.85365854,
        8.9047619 ,  8.92070485,  8.92579505,  8.99056604,  9.  

`attribute5` is mostly constant over time within each device, and the per-device means show no meaningful
difference between failed and not-failed devices.