# Data Exploration - Device Level

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import re
import pickle

In [2]:
# Plotly setup: offline notebook helpers, static image export (pio) and graph objects (go).
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# Initialise plotly's notebook mode before any figure is built.
init_notebook_mode(connected=True)
import plotly.io as pio
# NOTE(review): `py` is not referenced in the visible cells, and `plotly.plotly` is believed to
# have moved to the separate `chart_studio` package in plotly >= 4 — confirm the pinned version.
import plotly.plotly as py
import plotly.graph_objs as go
# NOTE(review): wildcard import duplicates what the `go` alias exposes; visible cells only use `go.*`.
from plotly.graph_objs import *

In [3]:
# Location of the cleaned dataset, resolved against the current working directory.
data_path = Path.cwd() / 'data' / 'clean_df.pkl'

In [4]:
# Read the clean datafile from data_path and report its (rows, columns).
# NOTE(review): unpickling can execute arbitrary code — only load pickles this project produced.
df=pd.read_pickle(data_path)
print(df.shape)

(124164, 11)


## Group on device ID, keep max value

In [5]:
# Collapse the log to one row per device (124K rows -> 1163 rows), keeping only
# each device's latest date and its maximum failure value.
dfmax = (
    df.groupby('device')[['date', 'failure']]
    .max()
    .reset_index()
)
print(dfmax.shape)

(1163, 3)


In [6]:
# IDs of the devices whose maximum failure value is 1, i.e. devices that eventually failed.
failed_mask = dfmax['failure'] == 1
fails = dfmax.loc[failed_mask, 'device'].tolist()
fails[:4]

['S1F023H2', 'S1F03YZM', 'S1F09DZQ', 'S1F0CTDN']

## Create a variable for Device ID types

In [7]:
# Preview the per-device summary (device, last date, failure flag) before deriving ID-based features.
dfmax.head()

Unnamed: 0,device,date,failure
0,S1F01085,2015-01-06,0
1,S1F0166B,2015-01-06,0
2,S1F01E6Y,2015-02-17,0
3,S1F01JE0,2015-01-06,0
4,S1F01R2B,2015-08-24,0


In [8]:
# Devices fall into 6 categories given by the first four characters of the device ID;
# the 'Z1F2' prefix is folded into 'Z1F1'.
dfmax['prefix'] = dfmax['device'].str[:4].replace('Z1F2', 'Z1F1')
dfmax['prefix'].value_counts()

S1F0    387
W1F0    281
Z1F0    149
S1F1    139
W1F1    137
Z1F1     70
Name: prefix, dtype: int64

In [9]:
# Device -> prefix lookup table, used to carry the prefix into the primary dataset.
df_prefix = dfmax.loc[:, ['device', 'prefix']]

In [10]:
# Copy the prefix variable into the primary dataset (left join on device keeps every log row).
print(df.shape)
# Drop any 'prefix' column that is already present — the final cell of this notebook
# overwrites the input pickle, so on a re-run the loaded frame already carries it and
# the merge would otherwise produce 'prefix_x' / 'prefix_y' instead of 'prefix'.
df = pd.merge(
    df.drop(columns='prefix', errors='ignore'),
    df_prefix,
    on='device',
    how='left',
)
print(df.shape)

(124164, 11)
(124164, 12)


## Devices removed early

In [11]:
# Over 300 devices are taken from the log in the first 5-6 days. These are outliers and should be removed.
# The cutoff (7 January 2015) is written as an ISO date so it cannot be misread as 1 July.
dfmax.loc[dfmax['date'] < '2015-01-07'].shape[0]

313

In [12]:
# Looks like January 7th was a watershed moment.
# Number of devices per last-seen date, in chronological order (first 10 dates).
dfmax['date'].value_counts().sort_index().head(10)

2015-01-03      1
2015-01-04      1
2015-01-05    106
2015-01-06    205
2015-01-07     42
2015-01-09      1
2015-01-13     23
2015-01-14      1
2015-01-17      1
2015-01-18      1
Name: date, dtype: int64

In [13]:
# Did any of these early losers actually fail? Yes, one.
# The cutoff (7 January 2015) is written as an ISO date so it cannot be misread as 1 July.
dfmax.loc[dfmax['date'] < '2015-01-07', 'failure'].sum()

1

In [14]:
# Looks like S1F0RRB1 was an early failure.
# The cutoff (7 January 2015) is written as an ISO date so it cannot be misread as 1 July.
print(dfmax.loc[(dfmax['date'] < '2015-01-07') & (dfmax['failure'] == 1)])
# Looks like this actually had some good data going for it. Let's not delete it.
df.loc[df['device'] == 'S1F0RRB1']

       device       date  failure prefix
235  S1F0RRB1 2015-01-05        1   S1F0


Unnamed: 0,date,device,failure,attribute1,attribute2,attribute3,attribute4,attribute5,attribute6,attribute7,attribute9,prefix
18392,2015-01-01,S1F0RRB1,0,0.001076,2288,0,37,8,0.002164,24,1,S1F0
18393,2015-01-02,S1F0RRB1,0,0.001076,64776,0,49,8,0.002164,56,1,S1F0
18394,2015-01-03,S1F0RRB1,0,0.001076,64776,0,135,8,0.002164,56,1,S1F0
18395,2015-01-04,S1F0RRB1,0,0.001076,64776,0,763,8,0.002164,56,1,S1F0
18396,2015-01-05,S1F0RRB1,1,0.001076,64776,0,841,8,0.002164,56,1,S1F0


In [15]:
# Make a list of all the other early losers (last seen before 7 January 2015 and never
# failed), to be removed from the primary dataset.
# The cutoff is written as an ISO date so it cannot be misread as 1 July.
early_mask = (dfmax['date'] < '2015-01-07') & (dfmax['failure'] == 0)
early_losers = dfmax.loc[early_mask, 'device'].tolist()
len(early_losers)

312

In [16]:
# Remove every log row that belongs to an early loser; row counts are printed before and after.
print(len(df))
is_early_loser = df['device'].isin(early_losers)
df = df[~is_early_loser]
print(len(df))

124164
122402


## Features with very little variance

In [17]:
# Per-device standard deviation of every attribute.
# The attribute columns are selected *before* aggregating so that non-numeric columns
# (date, prefix) never reach .std(); newer pandas raises a TypeError on those instead
# of silently dropping them.
attribute_cols = ['attribute1', 'attribute2', 'attribute3', 'attribute4',
                  'attribute5', 'attribute6', 'attribute7', 'attribute9']
# drop=True leaves a default 0..n-1 index with one row per device, in sorted device order.
dfsd = df.groupby('device')[attribute_cols].std().reset_index(drop=True)

In [18]:
# One row per remaining device with its maximum failure value (1 if it ever failed, else 0).
dffailed = df.groupby('device', as_index=False)['failure'].max()
dffailed.shape

(851, 2)

In [19]:
# Put the per-device failure flag next to the per-device attribute std devs.
# concat(axis=1) aligns on the index: both frames were built by grouping df on 'device'
# and resetting the index, so row i refers to the same device in each.
dfsmall=pd.concat([dffailed, dfsd], axis=1)

In [20]:
# Preview: device, failure flag, and the std dev of each attribute for that device.
dfsmall.head()

Unnamed: 0,device,failure,attribute1,attribute2,attribute3,attribute4,attribute5,attribute6,attribute7,attribute9
0,S1F01E6Y,0,0.000807,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,S1F01R2B,0,0.000936,0.0,0.0,0.0,0.892202,0.0,0.0,0.0
2,S1F01XDJ,0,0.000959,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,S1F023H2,1,0.000969,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,S1F02A0J,0,0.000896,0.0,0.0,0.0,0.410241,0.0,0.0,0.0


In [21]:
# Flag, per device, whether each attribute has zero variance:
# novar_<attribute> is 1 when the device's std dev for that attribute equals 0, else 0.
for attr in ('attribute1', 'attribute2', 'attribute3', 'attribute4',
             'attribute5', 'attribute6', 'attribute7', 'attribute9'):
    dfsmall['novar_' + attr] = dfsmall[attr].eq(0).astype('int64')

In [22]:
# Correlation matrix between the zero-variance flags (attributes 2-7 and 9) and failure.
novar_cols = ['novar_attribute' + str(num) for num in (2, 3, 4, 5, 6, 7, 9)]
corrs = dfsmall[novar_cols + ['failure']].corr()
# There's neg correlation between failure and zero variance in attributes 2, 4, and 7.
# Devices with non-zero variance were more likely to fail.

In [23]:
# Display the correlation matrix. It is already a DataFrame (the result of .corr()),
# so no re-wrapping in pd.DataFrame is needed.
corrs

Unnamed: 0,novar_attribute2,novar_attribute3,novar_attribute4,novar_attribute5,novar_attribute6,novar_attribute7,novar_attribute9,failure
novar_attribute2,1.0,0.046097,0.392979,0.002456,-0.035057,0.30304,0.039278,-0.40662
novar_attribute3,0.046097,1.0,0.044756,0.085749,0.009815,0.064203,0.241613,0.00421
novar_attribute4,0.392979,0.044756,1.0,0.095409,-0.022144,0.473711,0.018529,-0.508201
novar_attribute5,0.002456,0.085749,0.095409,1.0,-0.058147,0.08313,0.114314,-0.052461
novar_attribute6,-0.035057,0.009815,-0.022144,-0.058147,1.0,-0.005635,0.057488,0.068729
novar_attribute7,0.30304,0.064203,0.473711,0.08313,-0.005635,1.0,0.047457,-0.439856
novar_attribute9,0.039278,0.241613,0.018529,0.114314,0.057488,0.047457,1.0,-0.045483
failure,-0.40662,0.00421,-0.508201,-0.052461,0.068729,-0.439856,-0.045483,1.0


In [24]:
# Heatmap of the correlation matrix between the zero-variance flags and failure.
data = [go.Heatmap(z=corrs.values.tolist(),
                   x=corrs.columns.tolist(),
                   y=corrs.index.tolist(),
                   colorscale='Viridis')]
layout = go.Layout(
        title="Heatmap of attributes and failure")
fig = go.Figure(data=data, layout=layout)
# Make sure the output folder exists; writing the image fails on a fresh checkout otherwise.
Path('images').mkdir(parents=True, exist_ok=True)
# The figure is exported as a static PNG (shown via markdown below) rather than via iplot(fig).
pio.write_image(fig, 'images/heatmap.png')

![heatmap](images/heatmap.png)

In [25]:
# For attributes 2, 4 and 7: bar chart of the mean failure rate of devices with
# variance (novar = 0) versus without variance (novar = 1), exported as static PNGs.
# Make sure the output folder exists; writing the images fails on a fresh checkout otherwise.
Path('images').mkdir(parents=True, exist_ok=True)
for i in [2,4,7]:
    results=dfsmall.groupby('novar_attribute'+str(i))['failure'].mean()
    mydata = [go.Bar(
        x=results.index,
        y=results,
        # One colour per bar. Channel values must lie in 0-255 (the blue channel was 500).
        marker=dict(color=['rgba(100,100,255,1)', 'rgba(222,45,38,0.8)'])
    )]

    mylayout = go.Layout(
        title='Attribute '+str(i),
        xaxis=dict(title = 'No variance'),
        yaxis=dict(title = 'Mean failure'),
        width=500,
        height=300,
    )
    fig = go.Figure(data=mydata, layout=mylayout)
    # Exported as a static PNG (shown via the HTML table below) rather than via iplot(fig).
    pio.write_image(fig, 'images/attribute'+str(i)+'.png')

<table><tr>
<td> <img src="images/attribute2.png"  style="width: 500px;"/> </td>
<td> <img src="images/attribute4.png"  style="width: 500px;"/> </td>
<td> <img src="images/attribute7.png"  style="width: 500px;"/> </td>
</tr></table>

In [26]:
# We'll want to create a measure of variability (e.g. standard deviation) for each feature, and include it in the "rolling" lag.

## Time to failure

In [27]:
# Group on device and count the dated log rows each device has.
dfcount = df.groupby('device', as_index=False)['date'].count()
dfcount.shape

(851, 2)

In [28]:
# Restrict the per-device counts to devices that appear in the list of failures.
did_fail = dfcount['device'].isin(fails)
dfcount = dfcount.loc[did_fail]

In [29]:
# Failures peak at 19 and 125 days
# Histogram (5-wide bins) of the number of log rows recorded per failed device.
data = [go.Histogram(x=dfcount['date'],
                     xbins=dict(size=5)
                    )]
layout = go.Layout(
    title = 'Device Failure by Length of Life', # Graph title
    xaxis = dict(title = 'Days since launch'), # x-axis label
    yaxis = dict(title = 'Number of failures'), # y-axis label
)
fig = go.Figure(data=data, layout=layout)
# Make sure the output folder exists; writing the image fails on a fresh checkout otherwise.
Path('images').mkdir(parents=True, exist_ok=True)
# The figure is exported as a static PNG (shown via markdown below) rather than via iplot(fig).
pio.write_image(fig, 'images/histogram.png')

![hist](images/histogram.png)

In [30]:
# Be sure to include a measure of time since launch as a predictor.

## Overwrite the dataset

In [31]:
# Write the filtered dataset (early losers removed, prefix column added) back to data_path.
# NOTE: this overwrites the same pickle that was loaded at the top of the notebook.
df.to_pickle(data_path)
print(df.shape)

(122402, 12)
