In [1]:
import pandas as pd
import numpy as np
import re
import pickle
from sklearn import preprocessing 
from datetime import datetime

In [2]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import plotly.io as pio
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.graph_objs import *

In [3]:
from sklearn.utils import resample

In [4]:
# read clean datafile
df = pd.read_csv('data/dataset4.gz', compression='gzip', header=0, sep=',', quotechar='"')
print(df.shape)

(93701, 11)


In [5]:
df.columns

Index(['date', 'device', 'failure', 'prefix', 'ndays', 'attribute2',
       'attribute3', 'attribute4', 'att5', 'att6', 'attribute7'],
      dtype='object')

## Convert date to an ordinal

In [6]:
print(df['date'].dtype)
# Replace this.
df['date']=pd.to_datetime(df['date'],infer_datetime_format=True)
df['date'].dtype

object


dtype('<M8[ns]')

## Days since launch

In [7]:
# calculate the elapsed days from Jan 1st until current date
df['min_date']=pd.to_datetime('2015-01-01')
df['ndays']=((df['date']-df['min_date']).dt.days)
df=df.drop(['min_date', 'date'], axis=1)

## Convert prefix to dummy (one-hot encoding)

In [8]:
df = pd.get_dummies(df, prefix='', prefix_sep='', columns=['prefix'])
df.head()

Unnamed: 0,device,failure,ndays,attribute2,attribute3,attribute4,att5,att6,attribute7,S1F0,S1F1,W1F0,W1F1,Z1F0,Z1F1
0,S1F01E6Y,0,0,0,0,0,12.0,237394.0,0,1,0,0,0,0,0
1,S1F01E6Y,0,1,0,0,0,12.0,238718.0,0,1,0,0,0,0,0
2,S1F01E6Y,0,2,0,0,0,12.0,240021.0,0,1,0,0,0,0,0
3,S1F01E6Y,0,3,0,0,0,12.0,241264.0,0,1,0,0,0,0,0
4,S1F01E6Y,0,4,0,0,0,12.0,242553.0,0,1,0,0,0,0,0


## Create the rolling lag variable for time-series

In [9]:
df.columns

Index(['device', 'failure', 'ndays', 'attribute2', 'attribute3', 'attribute4',
       'att5', 'att6', 'attribute7', 'S1F0', 'S1F1', 'W1F0', 'W1F1', 'Z1F0',
       'Z1F1'],
      dtype='object')

In [10]:
# drop columms that never change across date (these don't need to be lagged)
lagged_columns=df.columns.drop(['device', 'failure', 'ndays', 'S1F0', 'S1F1',
       'W1F0', 'W1F1', 'Z1F0', 'Z1F1'])
print(lagged_columns)

Index(['attribute2', 'attribute3', 'attribute4', 'att5', 'att6', 'attribute7'], dtype='object')


In [11]:
for col in lagged_columns:
    for i in range(1,5):
        df[f'{col}_lag0{i}'] = df.groupby('device')[col].shift(i)
        df.loc[df[f'{col}_lag0{i}'].isnull(), f'{col}_lag0{i}']=0

In [12]:
df.shape

(93701, 39)

In [13]:
df.columns

Index(['device', 'failure', 'ndays', 'attribute2', 'attribute3', 'attribute4',
       'att5', 'att6', 'attribute7', 'S1F0', 'S1F1', 'W1F0', 'W1F1', 'Z1F0',
       'Z1F1', 'attribute2_lag01', 'attribute2_lag02', 'attribute2_lag03',
       'attribute2_lag04', 'attribute3_lag01', 'attribute3_lag02',
       'attribute3_lag03', 'attribute3_lag04', 'attribute4_lag01',
       'attribute4_lag02', 'attribute4_lag03', 'attribute4_lag04',
       'att5_lag01', 'att5_lag02', 'att5_lag03', 'att5_lag04', 'att6_lag01',
       'att6_lag02', 'att6_lag03', 'att6_lag04', 'attribute7_lag01',
       'attribute7_lag02', 'attribute7_lag03', 'attribute7_lag04'],
      dtype='object')

In [14]:
# Confirm that did what we wanted it do
df.loc[df['device']=='S1F01E6Y'][['att6', 'ndays','att6_lag01','att6_lag02', 'att6_lag03', 'att6_lag04']].head(6)

Unnamed: 0,att6,ndays,att6_lag01,att6_lag02,att6_lag03,att6_lag04
0,237394.0,0,0.0,0.0,0.0,0.0
1,238718.0,1,237394.0,0.0,0.0,0.0
2,240021.0,2,238718.0,237394.0,0.0,0.0
3,241264.0,3,240021.0,238718.0,237394.0,0.0
4,242553.0,4,241264.0,240021.0,238718.0,237394.0
5,243875.0,5,242553.0,241264.0,240021.0,238718.0


## Upsampling to deal with unbalanced classes

In [16]:
# Separate majority and minority classes
df_majority = df[df['failure']==0]
df_minority = df[df['failure']==1]
 
# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=len(df_majority),    # to match majority class
                                 random_state=42) # reproducible results
 
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
 
# Display new class counts
df_upsampled['failure'].value_counts()

1    93610
0    93610
Name: failure, dtype: int64

## Zip the data file

In [17]:
df_upsampled.to_csv('data/dataset5.gz', compression='gzip', index=False)
print(df.shape)

(93701, 39)
