In [1]:
import pandas as pd
import numpy as np
import re
import pickle
from sklearn import preprocessing 
from datetime import datetime

In [2]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import plotly.io as pio
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.graph_objs import *

In [3]:
# Colour palette: 17 hex stops running from dark purple to yellow.
# Entries are picked by index when colouring the plot traces further down.
Viridis = [
    '#440154', '#48186a', '#472d7b', '#424086',
    '#3b528b', '#33638d', '#2c728e', '#26828e',
    '#21918c', '#1fa088', '#28ae80', '#3fbc73',
    '#5ec962', '#84d44b', '#addc30', '#d8e219',
    '#fde725',
]

In [4]:
# Load the cleaned data file (gzip-compressed CSV, header on the first row)
# and print its (rows, columns) shape.
df = pd.read_csv(
    '../data/dataset4.gz',
    sep=',',
    quotechar='"',
    header=0,
    compression='gzip',
)
print(df.shape)

(93701, 11)


In [5]:
# List the column names of the freshly loaded frame.
df.columns

Index(['date', 'device', 'failure', 'prefix', 'ndays', 'attribute2',
       'attribute3', 'attribute4', 'att5', 'att6', 'attribute7'],
      dtype='object')

## Convert date to datetime

In [6]:
# Print the dtype of 'date' before conversion (the recorded output shows `object`).
print(df['date'].dtype)
# Parse 'date' into pandas datetimes so date arithmetic works in later cells.
# The original author left the note "Replace this." on this step.
# NOTE(review): `infer_datetime_format` is deprecated in recent pandas releases —
# confirm against the pandas version this notebook is pinned to.
df['date']=pd.to_datetime(df['date'],infer_datetime_format=True)
# Display the dtype after conversion (the recorded output shows datetime64[ns]).
df['date'].dtype

object


dtype('<M8[ns]')

## Days since launch

In [7]:
# 'ndays' = whole days elapsed between 2015-01-01 and each row's date.
# The reference date is subtracted as a scalar, so no helper column is needed.
reference_date = pd.to_datetime('2015-01-01')
elapsed = df['date'] - reference_date
df['ndays'] = elapsed.dt.days

## Convert prefix to dummy (one-hot encoding)

Prefix is a categorical variable. Our predictive algorithm can only interpret numbers, not categories. We create one dummy variable for each of the six categories.

In [8]:
# One-hot encode 'prefix'. With an empty prefix and separator, each new
# indicator column is named by the bare category value (e.g. S1F0, W1F1).
df = pd.get_dummies(
    df,
    columns=['prefix'],
    prefix='',
    prefix_sep='',
)
df.head()

Unnamed: 0,date,device,failure,ndays,attribute2,attribute3,attribute4,att5,att6,attribute7,S1F0,S1F1,W1F0,W1F1,Z1F0,Z1F1
0,2015-01-01,S1F01E6Y,0,0,0,0,0,12.0,237394.0,0,1,0,0,0,0,0
1,2015-01-02,S1F01E6Y,0,1,0,0,0,12.0,238718.0,0,1,0,0,0,0,0
2,2015-01-03,S1F01E6Y,0,2,0,0,0,12.0,240021.0,0,1,0,0,0,0,0
3,2015-01-04,S1F01E6Y,0,3,0,0,0,12.0,241264.0,0,1,0,0,0,0,0
4,2015-01-05,S1F01E6Y,0,4,0,0,0,12.0,242553.0,0,1,0,0,0,0,0


## Create the rolling lag variable for time-series

Our data is a time series -- that is, daily measurements for each device over a series of months. At each point in time, the measurements from the preceding days can be powerful predictors of present status (fail/not fail). For each daily observation, then, we create 4 new variables for each of the features in the dataset, representing the observations of that feature over the 4 previous days.

In [9]:
# List the columns after one-hot encoding, to choose which ones to lag.
df.columns

Index(['date', 'device', 'failure', 'ndays', 'attribute2', 'attribute3',
       'attribute4', 'att5', 'att6', 'attribute7', 'S1F0', 'S1F1', 'W1F0',
       'W1F1', 'Z1F0', 'Z1F1'],
      dtype='object')

In [10]:
# Columns excluded from lagging: the device id, the failure flag, the day
# counter and the one-hot prefix indicators. Everything else gets lagged.
# NOTE(review): 'date' is not excluded, so date_lag01..date_lag04 are created
# as well — confirm that is intended.
not_lagged = [
    'device', 'failure', 'ndays',
    'S1F0', 'S1F1', 'W1F0', 'W1F1', 'Z1F0', 'Z1F1',
]
lagged_columns = df.columns.drop(not_lagged)
print(lagged_columns)

Index(['date', 'attribute2', 'attribute3', 'attribute4', 'att5', 'att6',
       'attribute7'],
      dtype='object')


In [11]:
# For each lagged column add <col>_lag01 .. <col>_lag04: the value found
# 1..4 rows earlier within the same device. Rows without an earlier value
# (the first rows of each device) are set to 0.
for column in lagged_columns:
    for lag in range(1, 5):
        lag_name = f'{column}_lag0{lag}'
        shifted = df.groupby('device')[column].shift(lag)
        df[lag_name] = shifted
        df.loc[df[lag_name].isnull(), lag_name] = 0

In [12]:
# Sanity check: lagging adds columns only, so the row count must be unchanged.
df.shape

(93701, 44)

In [13]:
# List the columns again to see the newly added *_lag0N features.
df.columns

Index(['date', 'device', 'failure', 'ndays', 'attribute2', 'attribute3',
       'attribute4', 'att5', 'att6', 'attribute7', 'S1F0', 'S1F1', 'W1F0',
       'W1F1', 'Z1F0', 'Z1F1', 'date_lag01', 'date_lag02', 'date_lag03',
       'date_lag04', 'attribute2_lag01', 'attribute2_lag02',
       'attribute2_lag03', 'attribute2_lag04', 'attribute3_lag01',
       'attribute3_lag02', 'attribute3_lag03', 'attribute3_lag04',
       'attribute4_lag01', 'attribute4_lag02', 'attribute4_lag03',
       'attribute4_lag04', 'att5_lag01', 'att5_lag02', 'att5_lag03',
       'att5_lag04', 'att6_lag01', 'att6_lag02', 'att6_lag03', 'att6_lag04',
       'attribute7_lag01', 'attribute7_lag02', 'attribute7_lag03',
       'attribute7_lag04'],
      dtype='object')

In [14]:
# Spot-check one device: each att6_lag0N column should trail att6 by N rows,
# with zeros where no earlier row exists.
device_rows = df['device'] == 'S1F01E6Y'
columns_to_show = ['att6', 'ndays', 'att6_lag01', 'att6_lag02', 'att6_lag03', 'att6_lag04']
df.loc[device_rows, columns_to_show].head(6)

Unnamed: 0,att6,ndays,att6_lag01,att6_lag02,att6_lag03,att6_lag04
0,237394.0,0,0.0,0.0,0.0,0.0
1,238718.0,1,237394.0,0.0,0.0,0.0
2,240021.0,2,238718.0,237394.0,0.0,0.0
3,241264.0,3,240021.0,238718.0,237394.0,0.0
4,242553.0,4,241264.0,240021.0,238718.0,237394.0
5,243875.0,5,242553.0,241264.0,240021.0,238718.0


## Interaction terms

In [15]:
from patsy import dmatrix  # no longer used in this cell; kept so any other reference still resolves

# Interaction terms: ndays multiplied by each attribute, stored as float
# columns daysX2 .. daysX7.
# The products were previously passed through patsy's dmatrix(), which for an
# already-computed array only wraps it as a float matrix. Plain multiplication
# with an explicit float cast gives the same column values without the detour.
interaction_pairs = [
    ('daysX2', 'attribute2'),
    ('daysX3', 'attribute3'),
    ('daysX4', 'attribute4'),
    ('daysX5', 'att5'),
    ('daysX6', 'att6'),
    ('daysX7', 'attribute7'),
]
for interaction_col, attribute_col in interaction_pairs:
    df[interaction_col] = (df['ndays'] * df[attribute_col]).astype(float)

In [16]:
# Preview the first rows, including the new daysX* interaction columns.
df.head()

Unnamed: 0,date,device,failure,ndays,attribute2,attribute3,attribute4,att5,att6,attribute7,...,attribute7_lag01,attribute7_lag02,attribute7_lag03,attribute7_lag04,daysX2,daysX3,daysX4,daysX5,daysX6,daysX7
0,2015-01-01,S1F01E6Y,0,0,0,0,0,12.0,237394.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2015-01-02,S1F01E6Y,0,1,0,0,0,12.0,238718.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,238718.0,0.0
2,2015-01-03,S1F01E6Y,0,2,0,0,0,12.0,240021.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.0,480042.0,0.0
3,2015-01-04,S1F01E6Y,0,3,0,0,0,12.0,241264.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,36.0,723792.0,0.0
4,2015-01-05,S1F01E6Y,0,4,0,0,0,12.0,242553.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,48.0,970212.0,0.0


In [17]:
# The interaction terms did not improve the model, so remove them again.
interaction_columns = [f'daysX{n}' for n in range(2, 8)]
df = df.drop(columns=interaction_columns)

## Normalize attributes 5 and 6

[Normalization](https://machinelearningmastery.com/rescaling-data-for-machine-learning-in-python-with-scikit-learn/) rescales each variable so that it ranges from 0 to 1. This makes it easier to compare coefficients between variables, and also ensures that variables measured on a large scale don't outweigh those measured on a smaller scale in their influence on the predictions.

In [18]:
# Rescale attributes 5 and 6 to the 0-1 range (min-max scaling), writing the
# results to att5_norm and att6_norm.
# preprocessing.normalize() was used here before, but it scales each *row* to
# unit norm. On a single-column array that turns every non-zero value into 1.0
# rather than spreading the column over 0-1. minmax_scale() rescales per column.
for source_col in ('att5', 'att6'):
    column_2d = df[source_col].values.reshape(-1, 1)
    df[f'{source_col}_norm'] = preprocessing.minmax_scale(column_2d).ravel()

## Visualize time series for a single device

In [19]:
# What's the longest-lived device that failed?
# Select rows that recorded a failure on day 298 and list the distinct devices.
failed_on_day_298 = (df['failure'] == 1) & (df['ndays'] == 298)
df.loc[failed_on_day_298, 'device'].unique()

array(['W1F0T0B1'], dtype=object)

In [20]:
# Keep only the rows belonging to device W1F0T0B1 and show the subset's shape.
is_w1f0t0b1 = df['device'] == 'W1F0T0B1'
T0B1 = df.loc[is_w1f0t0b1]
T0B1.shape

(299, 46)

In [21]:
# Trend over time for a single device: one stacked subplot per attribute,
# plus the failure flag in the last row.
from plotly import tools

# One entry per subplot row: (column in T0B1, legend name, Viridis index).
# A single loop replaces seven near-identical trace definitions.
trace_specs = [
    ('attribute2', 'attribute 2', 0),
    ('attribute3', 'attribute 3', 3),
    ('attribute4', 'attribute 4', 6),
    ('att5', 'attribute 5', 9),
    ('att6', 'attribute 6', 12),
    ('attribute7', 'attribute 7', 15),
    ('failure', 'failure', 16),
]

# Subplot titles are the legend names with a leading capital
# ('attribute 2' -> 'Attribute 2', 'failure' -> 'Failure').
subplot_titles = tuple(label.capitalize() for _, label, _ in trace_specs)
fig = tools.make_subplots(rows=len(trace_specs), cols=1, subplot_titles=subplot_titles)

for row, (column, label, color_index) in enumerate(trace_specs, start=1):
    trace = go.Scatter(x=T0B1['date'],
                       y=T0B1[column],
                       mode='lines',
                       name=label,
                       marker=dict(color=Viridis[color_index]))
    fig.append_trace(trace, row, 1)

fig['layout'].update(height=1400, width=1000, title='Trend in Attributes for Device W1F0T0B1 ')
iplot(fig)

This is the format of your plot grid:
[ (1,1) x1,y1 ]
[ (2,1) x2,y2 ]
[ (3,1) x3,y3 ]
[ (4,1) x4,y4 ]
[ (5,1) x5,y5 ]
[ (6,1) x6,y6 ]
[ (7,1) x7,y7 ]



## Zip the data file

In [22]:
# Write the engineered features to a gzip-compressed CSV (row index omitted)
# and print the final (rows, columns) shape.
df.to_csv(
    '../data/dataset5.gz',
    index=False,
    compression='gzip',
)
print(df.shape)

(93701, 46)


In [23]:
# List the columns of the frame that was just saved.
df.columns

Index(['date', 'device', 'failure', 'ndays', 'attribute2', 'attribute3',
       'attribute4', 'att5', 'att6', 'attribute7', 'S1F0', 'S1F1', 'W1F0',
       'W1F1', 'Z1F0', 'Z1F1', 'date_lag01', 'date_lag02', 'date_lag03',
       'date_lag04', 'attribute2_lag01', 'attribute2_lag02',
       'attribute2_lag03', 'attribute2_lag04', 'attribute3_lag01',
       'attribute3_lag02', 'attribute3_lag03', 'attribute3_lag04',
       'attribute4_lag01', 'attribute4_lag02', 'attribute4_lag03',
       'attribute4_lag04', 'att5_lag01', 'att5_lag02', 'att5_lag03',
       'att5_lag04', 'att6_lag01', 'att6_lag02', 'att6_lag03', 'att6_lag04',
       'attribute7_lag01', 'attribute7_lag02', 'attribute7_lag03',
       'attribute7_lag04', 'att5_norm', 'att6_norm'],
      dtype='object')