In [1]:
import pandas as pd
import numpy as np
import re
import pickle
from sklearn import preprocessing 
from datetime import datetime

In [2]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import plotly.io as pio
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.graph_objs import *

In [3]:
# Load the cleaned dataset (gzip-compressed CSV) and report its dimensions.
# Header row, comma separator and double-quote quoting are the read_csv defaults.
df = pd.read_csv('data/cleaned2.gz', compression='gzip')
print(df.shape)

(122402, 12)


In [4]:
# The 'date' column comes out of the CSV as plain strings (object dtype).
print(df['date'].dtype)
# Convert it to datetime64 so that date arithmetic works in later cells.
# `infer_datetime_format` is deprecated as of pandas 2.0 (a consistent format is
# inferred by default there), so the argument is no longer passed.
df['date'] = pd.to_datetime(df['date'])
df['date'].dtype

object


dtype('<M8[ns]')

In [5]:
# How many distinct devices are there? Collapse to one row per device
# (column-wise maximum) and check that the row count equals the unique count.
devices = df.groupby('device').max().reset_index()
print(devices['device'].nunique())
print(len(devices))

851
851


In [6]:
# Preview the first five rows of the loaded data.
df.head()

Unnamed: 0,date,device,failure,attribute1,attribute2,attribute3,attribute4,attribute5,attribute6,attribute7,attribute9,prefix
0,2015-01-01,S1F01E6Y,0,183303542.0,0,0,0,12,310016.5,0,0,S1F0
1,2015-01-02,S1F01E6Y,0,183303542.0,0,0,0,12,310016.5,0,0,S1F0
2,2015-01-03,S1F01E6Y,0,183303542.0,0,0,0,12,310016.5,0,0,S1F0
3,2015-01-04,S1F01E6Y,0,183303542.0,0,0,0,12,310016.5,0,0,S1F0
4,2015-01-05,S1F01E6Y,0,61294060.0,0,0,0,12,310016.5,0,0,S1F0


## Correlation of attributes

In [7]:
# Pairwise correlations among the listed attributes and the failure flag;
# this matrix is the input to the heatmap.
corr_cols = ['attribute2', 'attribute3', 'attribute4', 'attribute5',
             'attribute6', 'attribute7', 'attribute9', 'failure']
corrs = df[corr_cols].corr()
corrs

Unnamed: 0,attribute2,attribute3,attribute4,attribute5,attribute6,attribute7,attribute9,failure
attribute2,1.0,-0.005345,0.13206,-0.013898,-0.055937,0.139956,-0.005814,0.053854
attribute3,-0.005345,1.0,-0.003204,-0.005562,0.035669,-0.002524,0.695875,-0.001276
attribute4,0.13206,-0.003204,1.0,-0.004219,0.009107,0.036771,-0.004338,0.082168
attribute5,-0.013898,-0.005562,-0.004219,1.0,0.13915,-0.008205,0.015737,0.002093
attribute6,-0.055937,0.035669,0.009107,0.13915,1.0,-0.020771,0.045679,0.001212
attribute7,0.139956,-0.002524,0.036771,-0.008205,-0.020771,1.0,0.011447,0.124205
attribute9,-0.005814,0.695875,-0.004338,0.015737,0.045679,0.011447,1.0,0.002795
failure,0.053854,-0.001276,0.082168,0.002093,0.001212,0.124205,0.002795,1.0


In [8]:
# Draw the correlation matrix as a heatmap and save it as a static PNG.
# The matrix is symmetric with identical row and column labels, so the two
# axes are interchangeable.
heatmap = go.Heatmap(
    z=corrs.values.tolist(),
    x=corrs.index.tolist(),
    y=corrs.columns.tolist(),
    colorscale='Viridis',
)
data = [heatmap]
layout = go.Layout(title="Heatmap of attributes and failure")
fig = go.Figure(data=data, layout=layout)
# (iplot(fig) would render the figure interactively in the notebook instead.)
pio.write_image(fig, 'images/heatmap2.png')

![heatmap](images/heatmap2.png)

In [9]:
# attribute3 and attribute9 are strongly correlated with each other (right around
# the conventional 0.7 cutoff), so only one of the pair is kept: drop attribute9.
pair_corr = df['attribute9'].corr(df['attribute3'])
print('correlation of 3 & 9:', round(pair_corr, 3))
df = df.drop(columns=['attribute9'])
df.shape

correlation of 3 & 9: 0.696


(122402, 11)

## Attributes 1 and 6: Numeric Variables

In [10]:
# The two continuous attributes. Per the earlier cleaning step these were
# restricted to 1.5 times the IQR.
names = ['attribute1', 'attribute6']
df.loc[:, names].describe()

Unnamed: 0,attribute1,attribute6
count,122402.0,122402.0
mean,152787700.0,288392.528782
std,52839640.0,38052.979504
min,61294060.0,221429.0
25%,61294060.0,310016.5
50%,183303500.0,310016.5
75%,183303500.0,310016.5
max,183303500.0,310016.5


In [11]:
# Z-score each continuous attribute in place (mean 0, std dev 1) and print the
# resulting mean / std as a sanity check.
for col in names:
    scaled = preprocessing.scale(df[col])
    df[col] = scaled
    print(col)
    for label, stat in (('mean', df[col].mean()), ('std', df[col].std())):
        print(label, round(stat, 3))
    print('\n')

attribute1
mean -0.0
std 1.0


attribute6
mean -0.0
std 1.0





Numerical issues were encountered when centering the data and might not be solved. Dataset may contain too large values. You may need to prescale your features.



## Attributes 2, 3, 4, and 7: Sparse data

Overall: for four of the attributes (2, 3, 4, and 7), nearly all of the values are 0.

In [12]:
# Summary statistics for the four sparse attributes.
df[['attribute2','attribute3','attribute4','attribute7']].describe()

Unnamed: 0,attribute2,attribute3,attribute4,attribute7
count,122402.0,122402.0,122402.0,122402.0
mean,154.850411,9.160961,1.468742,0.273639
std,2161.872846,121.260407,19.048398,7.193717
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0
max,64968.0,2693.0,1666.0,832.0


In [13]:
# what percentage of each is a zero? very high.
def lotsa_zeroes(col, frame=None):
    """Print the share of zero values in one column, rounded to two decimals.

    Parameters
    ----------
    col : str
        Name of the column to inspect.
    frame : pandas.DataFrame, optional
        Table that holds ``col``. When omitted, the notebook-level ``df`` is
        used, which is what the original one-argument call did.

    Prints ``"<col>: <ratio>"``. For a column with no rows the ratio is
    reported as ``nan`` instead of raising ``ZeroDivisionError``.
    """
    if frame is None:
        frame = df  # fall back to the notebook's working DataFrame
    denominator = len(frame[col])
    numerator = int((frame[col] == 0).sum())
    ratio = round(numerator / denominator, 2) if denominator else float('nan')
    print(col + ':', ratio)
# Report the zero share for each of the sparse attributes.
for attr_num in (2, 3, 4, 7):
    lotsa_zeroes(f'attribute{attr_num}')

attribute2: 0.95
attribute3: 0.93
attribute4: 0.93
attribute7: 0.99


In [14]:
# Collapse each sparse attribute to a 0/1 indicator: any non-zero reading becomes 1.
# The value counts are printed per column to show the resulting split.
sparse_cols = ['attribute2', 'attribute3', 'attribute4', 'attribute7']
for col in sparse_cols:
    nonzero = df[col] != 0
    df.loc[nonzero, col] = 1
    print(col)
    print(df[col].value_counts())
    print('\n')

attribute2
0    116301
1      6101
Name: attribute2, dtype: int64


attribute3
0    113477
1      8925
Name: attribute3, dtype: int64


attribute4
0    113531
1      8871
Name: attribute4, dtype: int64


attribute7
0    121012
1      1390
Name: attribute7, dtype: int64




## Attribute 5

In [15]:
# attribute5 takes 60 distinct integer codes (possibly error codes; not confirmed).
# Show the sorted codes, how many there are, and the five most frequent.
att5_counts = df['attribute5'].value_counts()
print(att5_counts.index.sort_values())
print(att5_counts.index.nunique())
att5_counts.head()

Int64Index([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
            18, 19, 20, 21, 22, 23, 24, 25, 29, 30, 31, 32, 33, 34, 35, 36, 37,
            38, 39, 40, 41, 42, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
            70, 78, 89, 90, 91, 92, 94, 95, 98],
           dtype='int64')
60


8     21698
9     13428
11    12739
10    11415
7     10995
Name: attribute5, dtype: int64

In [16]:
# One-hot encode attribute5: one `att5_<code>` indicator column per distinct code.
# The dtype is pinned to uint8 so the indicators are 0/1 integers on every pandas
# version; pandas >= 2.0 would otherwise produce booleans, which do not shift and
# zero-fill cleanly when the lag features are built.
df = pd.get_dummies(df, columns=['attribute5'], prefix='att5', dtype='uint8')
df.head()

Unnamed: 0,date,device,failure,attribute1,attribute2,attribute3,attribute4,attribute6,attribute7,prefix,...,att5_68,att5_70,att5_78,att5_89,att5_90,att5_91,att5_92,att5_94,att5_95,att5_98
0,2015-01-01,S1F01E6Y,0,0.57752,0,0,0,0.568262,0,S1F0,...,0,0,0,0,0,0,0,0,0,0
1,2015-01-02,S1F01E6Y,0,0.57752,0,0,0,0.568262,0,S1F0,...,0,0,0,0,0,0,0,0,0,0
2,2015-01-03,S1F01E6Y,0,0.57752,0,0,0,0.568262,0,S1F0,...,0,0,0,0,0,0,0,0,0,0
3,2015-01-04,S1F01E6Y,0,0.57752,0,0,0,0.568262,0,S1F0,...,0,0,0,0,0,0,0,0,0,0
4,2015-01-05,S1F01E6Y,0,-1.731542,0,0,0,0.568262,0,S1F0,...,0,0,0,0,0,0,0,0,0,0


## Gaps in Date

In [17]:
# For roughly 20% of the devices, the timeline is not continuous.

In [18]:
# Worked example for a single device: compare the calendar span covered by its
# observations with the number of rows it has.
example_rows = df['device'] == 'S1F01E6Y'
dateslist = df.loc[example_rows, 'date'].tolist()
first_day, last_day = dateslist[0], dateslist[-1]
span_days = (last_day - first_day).days + 1  # +1 so both endpoints are counted
print('number of days:', span_days)
print('number of observations:', len(dateslist))

number of days: 48
number of observations: 48


In [19]:
# Attach to every row the number of dated observations recorded for its device.
obscount = (
    df.groupby('device')['date']
      .count()
      .rename('obs_counter')
      .reset_index()
)
df = df.merge(obscount, on='device', how='left')

In [20]:
# Add `ndays`: elapsed calendar days from 1 January 2015 through each device's
# last observed date (inclusive, hence the +1).
# NOTE(review): the start is fixed at Jan 1 for every device, so a device whose
# first observation is later than Jan 1 ends up with ndays > its observation count.
jan1 = datetime.strptime('2015-01-01', '%Y-%m-%d')
maxdate = (
    df.groupby('device')['date']
      .max()
      .reset_index()
      .rename(columns={'date': 'max_date'})
)
maxdate['min_date'] = pd.to_datetime('2015-01-01')
elapsed = maxdate['max_date'] - maxdate['min_date']
maxdate['ndays'] = elapsed.dt.days + 1
# Only the per-device day count is carried back onto the main frame.
maxdate = maxdate[['device', 'ndays']]
df = df.merge(maxdate, on='device', how='left')

In [21]:
# date_gap: observation count minus elapsed days; any_gap: True when they differ.
df['date_gap'] = df['obs_counter'].sub(df['ndays'])
df['any_gap'] = df['date_gap'].ne(0)

In [22]:
# Per-device summary: was the device flagged with a date gap, and did it ever fail?
# About 20% of devices have a gap in their date stream.
# The columns are selected with a list: tuple-style selection on a groupby
# (`['any_gap', 'failure']`) is deprecated since pandas 1.0 and raises in 2.0.
devices = df.groupby('device')[['any_gap', 'failure']].max().reset_index()
print('Total number of devices', devices['device'].nunique())
print(devices['any_gap'].value_counts(normalize=True))

Total number of devices 851
False    0.797885
True     0.202115
Name: any_gap, dtype: float64


In [23]:
# Row-normalised cross-tab: for each failure status (and overall), the share of
# devices without / with a date gap.
pd.crosstab(index=devices['failure'], columns=devices['any_gap'],
            margins=True, normalize='index')

any_gap,False,True
failure,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.789262,0.210738
1,0.858491,0.141509
All,0.797885,0.202115


In [24]:
# Identifiers of the devices whose date stream was flagged as having a gap.
has_gap = devices['any_gap'] == True
bad_devices = devices.loc[has_gap, 'device']

In [25]:
# Gapped devices could be imputed or removed; here they are removed.
# Shapes are printed before and after to show how much was dropped.
print(df.shape)
print(devices.shape)
keep_rows = ~df['device'].isin(bad_devices)
keep_devices = ~devices['device'].isin(bad_devices)
df = df.loc[keep_rows]
devices = devices.loc[keep_devices]
print(df.shape)
print(devices.shape)

(122402, 74)
(851, 3)
(93701, 74)
(679, 3)


## Create the rolling lag variable for time-series

In [26]:
# Preview the data ordered by device, then chronologically within each device.
test = df.sort_values(['device', 'date'])
test.head()

Unnamed: 0,date,device,failure,attribute1,attribute2,attribute3,attribute4,attribute6,attribute7,prefix,...,att5_90,att5_91,att5_92,att5_94,att5_95,att5_98,obs_counter,ndays,date_gap,any_gap
0,2015-01-01,S1F01E6Y,0,0.57752,0,0,0,0.568262,0,S1F0,...,0,0,0,0,0,0,48,48,0,False
1,2015-01-02,S1F01E6Y,0,0.57752,0,0,0,0.568262,0,S1F0,...,0,0,0,0,0,0,48,48,0,False
2,2015-01-03,S1F01E6Y,0,0.57752,0,0,0,0.568262,0,S1F0,...,0,0,0,0,0,0,48,48,0,False
3,2015-01-04,S1F01E6Y,0,0.57752,0,0,0,0.568262,0,S1F0,...,0,0,0,0,0,0,48,48,0,False
4,2015-01-05,S1F01E6Y,0,-1.731542,0,0,0,0.568262,0,S1F0,...,0,0,0,0,0,0,48,48,0,False


In [27]:
# Columns to lag: every column except the ones listed here.
non_feature_cols = ['device', 'date', 'failure', 'prefix',
                    'obs_counter', 'ndays', 'date_gap', 'any_gap']
cols_2_lag = df.columns.drop(non_feature_cols)

In [28]:
# Show the columns selected for lagging.
cols_2_lag

Index(['attribute1', 'attribute2', 'attribute3', 'attribute4', 'attribute6',
       'attribute7', 'att5_1', 'att5_2', 'att5_3', 'att5_4', 'att5_5',
       'att5_6', 'att5_7', 'att5_8', 'att5_9', 'att5_10', 'att5_11', 'att5_12',
       'att5_13', 'att5_14', 'att5_15', 'att5_16', 'att5_17', 'att5_18',
       'att5_19', 'att5_20', 'att5_21', 'att5_22', 'att5_23', 'att5_24',
       'att5_25', 'att5_29', 'att5_30', 'att5_31', 'att5_32', 'att5_33',
       'att5_34', 'att5_35', 'att5_36', 'att5_37', 'att5_38', 'att5_39',
       'att5_40', 'att5_41', 'att5_42', 'att5_57', 'att5_58', 'att5_59',
       'att5_60', 'att5_61', 'att5_62', 'att5_63', 'att5_64', 'att5_65',
       'att5_66', 'att5_67', 'att5_68', 'att5_70', 'att5_78', 'att5_89',
       'att5_90', 'att5_91', 'att5_92', 'att5_94', 'att5_95', 'att5_98'],
      dtype='object')

In [29]:
# Build 1- to 4-step lagged copies of every feature, per device, named
# `<feature>_lag01` ... `<feature>_lag04`. Rows with no earlier observation get 0.
#
# The shift runs on a (device, date)-sorted view so that "previous row" is the
# previous date even if `df` itself is not in chronological order; the result
# keeps the original index labels, so it lines up with `df` again when joined.
lag_steps = range(1, 5)
by_device = df.sort_values(by=['device', 'date']).groupby('device')[list(cols_2_lag)]

lag_frames = []
for i in lag_steps:
    shifted = by_device.shift(i).fillna(0).astype('float64')
    shifted.columns = [f'{col}_lag0{i}' for col in cols_2_lag]
    lag_frames.append(shifted)

# Keep the four lags of each feature next to each other.
lag_order = [f'{col}_lag0{i}' for col in cols_2_lag for i in lag_steps]
lagged = pd.concat(lag_frames, axis=1)[lag_order]

# Attach everything in one step (avoids inserting hundreds of columns one at a
# time); dropping stale lag columns first keeps the cell safe to re-run.
df = df.drop(columns=lag_order, errors='ignore').join(lagged)

## Zip the data file

In [30]:
# Persist the engineered dataset as a gzip-compressed CSV (row index omitted),
# then report its final dimensions.
df.to_csv('data/cleaned_eda.gz', index=False, compression='gzip')
print(df.shape)

(93701, 338)


In [31]:
# Read the saved file back in to check the round trip.
# NOTE(review): no parse_dates is passed, so 'date' comes back as strings here.
df = pd.read_csv('data/cleaned_eda.gz', compression='gzip')

In [32]:
# Preview the first five rows of the reloaded file.
df.head()

Unnamed: 0,date,device,failure,attribute1,attribute2,attribute3,attribute4,attribute6,attribute7,prefix,...,att5_94_lag03,att5_94_lag04,att5_95_lag01,att5_95_lag02,att5_95_lag03,att5_95_lag04,att5_98_lag01,att5_98_lag02,att5_98_lag03,att5_98_lag04
0,2015-01-01,S1F01E6Y,0,0.57752,0,0,0,0.568262,0,S1F0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2015-01-02,S1F01E6Y,0,0.57752,0,0,0,0.568262,0,S1F0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2015-01-03,S1F01E6Y,0,0.57752,0,0,0,0.568262,0,S1F0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2015-01-04,S1F01E6Y,0,0.57752,0,0,0,0.568262,0,S1F0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2015-01-05,S1F01E6Y,0,-1.731542,0,0,0,0.568262,0,S1F0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
# Raise the sequence-repr limit so the full column index prints without elision.
pd.set_option('display.max_seq_items', 2000)
df.columns

Index(['date', 'device', 'failure', 'attribute1', 'attribute2', 'attribute3',
       'attribute4', 'attribute6', 'attribute7', 'prefix', 'att5_1', 'att5_2',
       'att5_3', 'att5_4', 'att5_5', 'att5_6', 'att5_7', 'att5_8', 'att5_9',
       'att5_10', 'att5_11', 'att5_12', 'att5_13', 'att5_14', 'att5_15',
       'att5_16', 'att5_17', 'att5_18', 'att5_19', 'att5_20', 'att5_21',
       'att5_22', 'att5_23', 'att5_24', 'att5_25', 'att5_29', 'att5_30',
       'att5_31', 'att5_32', 'att5_33', 'att5_34', 'att5_35', 'att5_36',
       'att5_37', 'att5_38', 'att5_39', 'att5_40', 'att5_41', 'att5_42',
       'att5_57', 'att5_58', 'att5_59', 'att5_60', 'att5_61', 'att5_62',
       'att5_63', 'att5_64', 'att5_65', 'att5_66', 'att5_67', 'att5_68',
       'att5_70', 'att5_78', 'att5_89', 'att5_90', 'att5_91', 'att5_92',
       'att5_94', 'att5_95', 'att5_98', 'obs_counter', 'ndays', 'date_gap',
       'any_gap', 'attribute1_lag01', 'attribute1_lag02', 'attribute1_lag03',
       'attribute1_lag04', '