# Data Exploration - Device Level

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import re
import pickle

In [2]:
# Plotly setup: offline helpers (iplot renders figures inline) and graph objects.
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# NOTE(review): connected=True presumably loads plotly.js from the network rather
# than embedding it in the notebook - confirm against the installed plotly version.
init_notebook_mode(connected=True)
import plotly.io as pio
# NOTE(review): `plotly.plotly` appears to have moved to the separate `chart_studio`
# package in plotly 4+; this import may fail on newer installs - confirm.
# `py` and `pio` are not referenced in the visible cells.
import plotly.plotly as py
import plotly.graph_objs as go
# NOTE(review): the star import is not needed by the visible cells, which always
# go through the `go.` prefix.
from plotly.graph_objs import *

In [3]:
# Colour palette: 17 hex stops running from dark purple ('#440154') to yellow ('#fde725').
Viridis = [
    '#440154', '#48186a', '#472d7b', '#424086',
    '#3b528b', '#33638d', '#2c728e', '#26828e',
    '#21918c', '#1fa088', '#28ae80', '#3fbc73',
    '#5ec962', '#84d44b', '#addc30', '#d8e219',
    '#fde725',
]

In [4]:
# Load the cleaned device log: a gzip-compressed, comma-separated file whose
# first row holds the column names.
df = pd.read_csv(
    '../data/dataset1.gz',
    sep=',',
    quotechar='"',
    header=0,
    compression='gzip',
)
print(df.shape)

(124164, 11)


In [5]:
# The dates are read in as strings (object dtype).
print(df['date'].dtype)
# Parse them into datetime64 values. `infer_datetime_format` is deprecated in
# pandas 2.0 (a consistent format is inferred by default), so it is not passed.
df['date'] = pd.to_datetime(df['date'])
df['date'].dtype

object


dtype('<M8[ns]')

## Group on device ID, keep max value

In [6]:
# Collapse the log to one row per device by taking the column-wise maximum
# (reduces the data from ~124K rows to 1163), then keep only the device id,
# its last logged date and whether it ever failed.
dfmax = df.groupby('device').max().reset_index()[['device', 'date', 'failure']]
print(dfmax.shape)

(1163, 3)


In [7]:
# Device ids whose maximum `failure` flag is 1, i.e. devices that eventually failed.
ever_failed = dfmax['failure'] == 1
fails = dfmax.loc[ever_failed, 'device'].tolist()
fails[:4]

['S1F023H2', 'S1F03YZM', 'S1F09DZQ', 'S1F0CTDN']

## Create a variable for device prefix types

In [8]:
# Preview the first five rows of the per-device summary.
dfmax.head(5)

Unnamed: 0,device,date,failure
0,S1F01085,2015-01-06,0
1,S1F0166B,2015-01-06,0
2,S1F01E6Y,2015-02-17,0
3,S1F01JE0,2015-01-06,0
4,S1F01R2B,2015-08-24,0


In [9]:
# We can group devices into 6 categories based on the first four characters
# of their device ID. The vectorised `.str` accessor replaces a per-element lambda.
dfmax['prefix'] = dfmax['device'].str[:4]
# Fold the 'Z1F2' prefix into 'Z1F1' so that six categories remain.
dfmax['prefix'] = dfmax['prefix'].replace('Z1F2', 'Z1F1')
dfmax['prefix'].value_counts()

S1F0    387
W1F0    281
Z1F0    149
S1F1    139
W1F1    137
Z1F1     70
Name: prefix, dtype: int64

In [10]:
# Device -> prefix lookup table (one row per device), used to label the full log.
df_prefix = dfmax.loc[:, ['device', 'prefix']]

In [11]:
# Copy the prefix variable onto every row of the primary dataset.
print(df.shape)
# `df_prefix` has one row per device, so this is a many-to-one left join;
# `validate` makes pandas raise if a duplicated device id on the right side
# would otherwise silently multiply rows. The row count must stay unchanged.
df = pd.merge(df, df_prefix, on='device', how='left', validate='many_to_one')
print(df.shape)

(124164, 11)
(124164, 12)


## Devices removed early

In [12]:
# Over 300 devices are taken from the log in the first 5-6 days. These are outliers and should be removed.
# The cutoff (7 January 2015) is written in ISO form: '01-07-2015' only means
# 7 January under month-first parsing and reads as 1 July to many readers.
dfmax.loc[dfmax['date'] < '2015-01-07'].shape[0]

313

In [13]:
# Number of devices per last-logged date, earliest dates first.
# Looks like January 7th was a watershed moment.
last_seen_counts = dfmax['date'].value_counts().sort_index()
last_seen_counts.head(10)

2015-01-03      1
2015-01-04      1
2015-01-05    106
2015-01-06    205
2015-01-07     42
2015-01-09      1
2015-01-13     23
2015-01-14      1
2015-01-17      1
2015-01-18      1
Name: date, dtype: int64

In [14]:
# Did any of these early losers actually fail? Yes, one.
# ISO date literal (7 January 2015) avoids the month/day ambiguity of '01-07-2015';
# a single .loc[rows, column] replaces the chained indexing.
dfmax.loc[dfmax['date'] < '2015-01-07', 'failure'].sum()

1

In [15]:
# Looks like S1F0RRB1 was an early failure.
# ISO date literal (7 January 2015) avoids the month/day ambiguity of '01-07-2015'.
print(dfmax.loc[(dfmax['date'] < '2015-01-07') & (dfmax['failure'] == 1)])
# Looks like this actually had some good data going for it. Let's not delete it.
df.loc[df['device']=='S1F0RRB1']

       device       date  failure prefix
235  S1F0RRB1 2015-01-05        1   S1F0


Unnamed: 0,date,device,failure,attribute1,attribute2,attribute3,attribute4,attribute5,attribute6,attribute7,attribute9,prefix
18392,2015-01-01,S1F0RRB1,0,5230888,2288,0,37,8,39267,24,1,S1F0
18393,2015-01-02,S1F0RRB1,0,13307628,64776,0,49,8,39267,56,1,S1F0
18394,2015-01-03,S1F0RRB1,0,26258330,64776,0,135,8,39267,56,1,S1F0
18395,2015-01-04,S1F0RRB1,0,37985862,64776,0,763,8,39267,56,1,S1F0
18396,2015-01-05,S1F0RRB1,1,48467332,64776,0,841,8,39267,56,1,S1F0


In [16]:
# Make a list of all the other early losers (last seen before 7 January 2015
# and never failed) so they can be removed from the primary dataset.
# ISO date literal avoids the month/day ambiguity of '01-07-2015'.
left_early = dfmax['date'] < '2015-01-07'
never_failed = dfmax['failure'] == 0
early_losers = dfmax.loc[left_early & never_failed, 'device'].tolist()
len(early_losers)

312

In [17]:
# Drop every log row belonging to an early loser; print the row count before and after.
print(len(df))
is_early_loser = df['device'].isin(early_losers)
df = df.loc[~is_early_loser]
print(len(df))

124164
122402


In [18]:
# Count the devices that remain after the removal (down from the original 1163).
devices = df.groupby('device').max().reset_index()
devices.shape[0]

851

## Features with very little variance

In [19]:
# Per-device standard deviation of every attribute.
# Only the attribute columns are aggregated: calling `std` on the whole frame
# also touches the non-numeric `date` and `prefix` columns, which newer pandas
# rejects with a TypeError instead of silently dropping them.
attribute_cols = ['attribute1', 'attribute2', 'attribute3', 'attribute4',
                  'attribute5', 'attribute6', 'attribute7', 'attribute9']
# drop=True discards the device index, leaving a 0..n-1 index in device order.
dfsd = df.groupby('device')[attribute_cols].std().reset_index(drop=True)

In [20]:
# One row per device with its maximum `failure` flag (1 if the device ever failed).
dffailed = (
    df.groupby('device')['failure']
      .max()
      .reset_index()
)
dffailed.shape

(851, 2)

In [21]:
# Place the failure flags and the per-device standard deviations side by side.
# The two frames are joined on their row index, not on the device id.
dfsmall = pd.concat([dffailed, dfsd], axis='columns')

In [22]:
# Preview the first five rows of the combined per-device table.
dfsmall.head(5)

Unnamed: 0,device,failure,attribute1,attribute2,attribute3,attribute4,attribute5,attribute6,attribute7,attribute9
0,S1F01E6Y,0,68185030.0,0.0,0.0,0.0,0.0,6645.807521,0.0,0.0
1,S1F01R2B,0,68017840.0,0.0,0.0,0.0,0.892202,4734.057185,0.0,0.0
2,S1F01XDJ,0,76390780.0,0.0,0.0,0.0,0.0,4716.175787,0.0,0.0
3,S1F023H2,1,76708660.0,0.0,0.0,0.0,0.0,6423.425129,0.0,0.0
4,S1F02A0J,0,73312960.0,0.0,0.0,0.0,0.410241,8647.460767,0.0,0.0


In [23]:
# Flag, per device, whether each attribute never varied: novar_<attribute> is 1
# when the device's standard deviation for that attribute is exactly 0, else 0.
for col in ('attribute1', 'attribute2', 'attribute3', 'attribute4',
            'attribute5', 'attribute6', 'attribute7', 'attribute9'):
    dfsmall['novar_' + col] = (dfsmall[col] == 0).astype('int64')

In [24]:
# Correlation matrix between the zero-variance flags (attributes 2-9) and failure.
corr_columns = ['novar_attribute2', 'novar_attribute3', 'novar_attribute4',
                'novar_attribute5', 'novar_attribute6', 'novar_attribute7',
                'novar_attribute9', 'failure']
corrs = dfsmall[corr_columns].corr()
corrs
# Failure correlates negatively with zero variance in attributes 2, 4, and 7:
# devices whose readings did vary were more likely to fail.

Unnamed: 0,novar_attribute2,novar_attribute3,novar_attribute4,novar_attribute5,novar_attribute6,novar_attribute7,novar_attribute9,failure
novar_attribute2,1.0,0.046097,0.392979,0.002456,-0.021911,0.30304,0.039278,-0.40662
novar_attribute3,0.046097,1.0,0.044756,0.085749,-0.03995,0.064203,0.241613,0.00421
novar_attribute4,0.392979,0.044756,1.0,0.095409,-0.020084,0.473711,0.018529,-0.508201
novar_attribute5,0.002456,0.085749,0.095409,1.0,0.153448,0.08313,0.114314,-0.052461
novar_attribute6,-0.021911,-0.03995,-0.020084,0.153448,1.0,0.003542,-0.040514,0.000264
novar_attribute7,0.30304,0.064203,0.473711,0.08313,0.003542,1.0,0.047457,-0.439856
novar_attribute9,0.039278,0.241613,0.018529,0.114314,-0.040514,0.047457,1.0,-0.045483
failure,-0.40662,0.00421,-0.508201,-0.052461,0.000264,-0.439856,-0.045483,1.0


In [25]:
# Draw the correlation matrix as a heatmap, labelling both axes with the variable names.
heat_trace = go.Heatmap(
    z=corrs.values.tolist(),
    x=list(corrs.columns),
    y=list(corrs.index),
    colorscale='Viridis',
)
data = [heat_trace]
layout = go.Layout(title="Heatmap of attributes' variance and failure")
fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [26]:
# For attributes 2, 4 and 7, draw one bar chart each comparing the mean failure
# rate of devices grouped by their zero-variance flag.
for i in (2, 4, 7):
    results = dfsmall.groupby('novar_attribute{}'.format(i))['failure'].mean()

    # Two bars, coloured with the first and last palette entries.
    bar_trace = go.Bar(
        x=results.index,
        y=results,
        marker={'color': [Viridis[0], Viridis[16]]},
    )
    mydata = [bar_trace]

    mylayout = go.Layout(
        title='Attribute {}'.format(i),
        xaxis={'title': 'No variance'},
        yaxis={'title': 'Mean failure'},
        width=500,
        height=300,
    )

    fig = go.Figure(data=mydata, layout=mylayout)
    iplot(fig)

## Time to failure

In [27]:
# Number of logged dates per device; the `date` column of the result holds that count.
dfcount = (
    df.groupby('device')['date']
      .count()
      .reset_index()
)
dfcount.shape

(851, 2)

In [28]:
# Restrict the per-device counts to devices that eventually failed.
in_fails = dfcount['device'].isin(fails)
dfcount = dfcount.loc[in_fails]

In [29]:
# Histogram of the number of logged days per failed device, in bins of width 20.
# Author's observation: failures peak at 19 and 125 days.
life_trace = go.Histogram(
    x=dfcount['date'],
    xbins={'size': 20},
    marker={'color': list(Viridis)},
)
data = [life_trace]
layout = go.Layout(
    title='Device Failure by Length of Life',  # graph title
    xaxis={'title': 'Days since launch'},  # x-axis label
    yaxis={'title': 'Number of failing devices'},  # y-axis label
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)

## Zip the dataset

In [30]:
# Show the columns of the filtered dataset (now including `prefix`) before it is saved.
df.columns

Index(['date', 'device', 'failure', 'attribute1', 'attribute2', 'attribute3',
       'attribute4', 'attribute5', 'attribute6', 'attribute7', 'attribute9',
       'prefix'],
      dtype='object')

In [32]:
# Save the filtered dataset (early losers removed, `prefix` added) as a
# gzip-compressed CSV without the row index, then report its shape.
df.to_csv(
    '../data/dataset2.gz',
    index=False,
    compression='gzip',
)
print(df.shape)

(122402, 12)
