In [1]:
import pandas as pd
import numpy as np
import re
import pickle
from sklearn import preprocessing 
from datetime import datetime

In [2]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import plotly.io as pio
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.graph_objs import *

In [3]:
# Load the cleaned dataset: a gzip-compressed, comma-separated file with a header row.
df = pd.read_csv(
    'data/cleaned2.gz',
    compression='gzip',
    header=0,
    sep=',',
    quotechar='"',
)
print(df.shape)

(122402, 12)


In [4]:
# Colour palette: 17 hex colour codes, indexed 0-16 by the plots below.
Viridis = [
    '#440154', '#48186a', '#472d7b', '#424086',
    '#3b528b', '#33638d', '#2c728e', '#26828e',
    '#21918c', '#1fa088', '#28ae80', '#3fbc73',
    '#5ec962', '#84d44b', '#addc30', '#d8e219',
    '#fde725',
]

## Date time

In [5]:
# The 'date' column is loaded as strings; show its dtype before converting.
print(df['date'].dtype)
# Parse the strings into datetimes, then echo the new dtype as the cell result.
df['date'] = df['date'].pipe(pd.to_datetime, infer_datetime_format=True)
df['date'].dtype

object


dtype('<M8[ns]')

## Number of devices

In [6]:
# Collapse to one row per device (column-wise maximum of every other field).
devices = df.groupby('device').max().reset_index()
# Print the distinct device ids and the row count; they should agree.
for device_total in (devices['device'].nunique(), len(devices)):
    print(device_total)

851
851


## Correlation of Features

In [7]:
# Pairwise correlation matrix of the selected attributes and the failure flag
# (this is the input for the heatmap).
corrs = df[[
    'attribute2',
    'attribute3',
    'attribute4',
    'attribute5',
    'attribute6',
    'attribute7',
    'attribute9',
    'failure',
]].corr()
corrs

Unnamed: 0,attribute2,attribute3,attribute4,attribute5,attribute6,attribute7,attribute9,failure
attribute2,1.0,-0.005345,0.13206,-0.013898,-0.027578,0.139956,-0.005814,0.053854
attribute3,-0.005345,1.0,-0.003204,-0.005562,0.007921,-0.002524,0.695875,-0.001276
attribute4,0.13206,-0.003204,1.0,-0.004219,0.020685,0.036771,-0.004338,0.082168
attribute5,-0.013898,-0.005562,-0.004219,1.0,-0.015059,-0.008205,0.015737,0.002093
attribute6,-0.027578,0.007921,0.020685,-0.015059,1.0,-0.014939,0.02745,-0.000478
attribute7,0.139956,-0.002524,0.036771,-0.008205,-0.014939,1.0,0.011447,0.124205
attribute9,-0.005814,0.695875,-0.004338,0.015737,0.02745,0.011447,1.0,0.002795
failure,0.053854,-0.001276,0.082168,0.002093,-0.000478,0.124205,0.002795,1.0


In [8]:
# Draw the correlation matrix as a heatmap. The rows of z and the y labels are
# reversed together so that they stay aligned with each other.
heatmap_trace = go.Heatmap(
    x=corrs.index.tolist(),
    y=corrs.columns.tolist()[::-1],
    z=corrs.values.tolist()[::-1],
    colorscale='Viridis',
)
data = [heatmap_trace]
layout = go.Layout(title="Heatmap of attributes and failure")
fig = go.Figure(data=data, layout=layout)
iplot(fig)
# Also save a static copy of the figure to disk.
pio.write_image(fig, 'images/heatmap2.png')

![image](images/heatmap2.png)

In [9]:
# attribute3 and attribute9 are far more correlated with each other than any other pair.
# Their correlation sits right at the typical .7 cutoff, so one of the two is dropped.
pair_corr = df['attribute9'].corr(df['attribute3'])
print('correlation of 3 & 9:', round(pair_corr, 3))
df = df.drop(columns='attribute9')
df.shape

correlation of 3 & 9: 0.696


(122402, 11)

## Attributes 1 and 6

In [10]:
# Attribute 1: compare its distribution on rows where failure is 0 vs. rows where it is 1.
# Each trace is named and the figure is titled so the plot can be read on its own.
trace0 = go.Box(
    y=df.loc[df['failure'] == 0, 'attribute1'],
    name='Did not fail',
    marker=dict(color=Viridis[0]),
)
trace1 = go.Box(
    y=df.loc[df['failure'] == 1, 'attribute1'],
    name='Failed',
    marker=dict(color=Viridis[10]),
)
data = [trace0, trace1]
iplot(go.Figure(
    data=data,
    layout=go.Layout(
        title='Attribute 1 by failure',
        yaxis=dict(title='attribute1'),
    ),
))

In [11]:
# Attribute 6: compare its distribution on rows where failure is 0 vs. rows where it is 1.
# Each trace is named and the figure is titled so the plot can be read on its own.
trace0 = go.Box(
    y=df.loc[df['failure'] == 0, 'attribute6'],
    name='Did not fail',
    marker=dict(color=Viridis[5]),
)
trace1 = go.Box(
    y=df.loc[df['failure'] == 1, 'attribute6'],
    name='Failed',
    marker=dict(color=Viridis[16]),
)
data = [trace0, trace1]
iplot(go.Figure(
    data=data,
    layout=go.Layout(
        title='Attribute 6 by failure',
        yaxis=dict(title='attribute6'),
    ),
))

## Attributes 2, 3, 4, and 7: Sparse data

Overall: for 4 of the 9 attributes (2, 3, 4, and 7), nearly all of the values are 0.

In [12]:
# Summary statistics for the four sparse attributes.
sparse_cols = ['attribute2', 'attribute3', 'attribute4', 'attribute7']
df[sparse_cols].describe()

Unnamed: 0,attribute2,attribute3,attribute4,attribute7
count,122402.0,122402.0,122402.0,122402.0
mean,154.850411,9.160961,1.468742,0.273639
std,2161.872846,121.260407,19.048398,7.193717
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0
max,64968.0,2693.0,1666.0,832.0


In [13]:
# what percentage of each is a zero? very high.
def lotsa_zeroes(col, frame=None):
    """Print and return the fraction of rows in which ``col`` equals zero.

    Parameters
    ----------
    col : str
        Name of the column to inspect.
    frame : pandas.DataFrame, optional
        Frame to inspect. When omitted, the notebook-level ``df`` is used, so
        existing one-argument calls behave as before.

    Returns
    -------
    float
        Unrounded share of zero values in ``col``; NaN when the column has no
        rows (instead of raising ZeroDivisionError).
    """
    if frame is None:
        frame = df  # fall back to the notebook's working frame
    denominator = len(frame[col])
    if denominator == 0:
        share = float('nan')
    else:
        numerator = int((frame[col] == 0).sum())
        share = numerator / denominator
    # The printed value is rounded to two decimals; the returned one is not.
    print(col+':', round(share, 2))
    return share
# Report the share of zeros for each of the four sparse attributes.
for attr_num in (2, 3, 4, 7):
    lotsa_zeroes('attribute{}'.format(attr_num))

attribute2: 0.95
attribute3: 0.93
attribute4: 0.93
attribute7: 0.99


In [14]:
# Collapse each sparse attribute to a 0/1 indicator: every non-zero value becomes 1.
for sparse_col in ('attribute2', 'attribute3', 'attribute4', 'attribute7'):
    nonzero_rows = df[sparse_col] != 0
    df.loc[nonzero_rows, sparse_col] = 1
    # Show the resulting 0/1 split for this column.
    print(sparse_col)
    print(df[sparse_col].value_counts())
    print('\n')

attribute2
0    116301
1      6101
Name: attribute2, dtype: int64


attribute3
0    113477
1      8925
Name: attribute3, dtype: int64


attribute4
0    113531
1      8871
Name: attribute4, dtype: int64


attribute7
0    121012
1      1390
Name: attribute7, dtype: int64




## Attribute 5

In [15]:
# attribute5 holds 60 distinct integer codes (probably error codes).
code_counts = df['attribute5'].value_counts()
# Print the sorted list of codes, then how many distinct codes there are.
print(code_counts.index.sort_values())
print(code_counts.index.nunique())
# Show the five most frequent codes as the cell result.
code_counts.head()

Int64Index([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
            18, 19, 20, 21, 22, 23, 24, 25, 29, 30, 31, 32, 33, 34, 35, 36, 37,
            38, 39, 40, 41, 42, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
            70, 78, 89, 90, 91, 92, 94, 95, 98],
           dtype='int64')
60


8     21698
9     13428
11    12739
10    11415
7     10995
Name: attribute5, dtype: int64

## Gaps in Date

For roughly 20% of the devices, the timeline is not continuous.

In [16]:
# Worked example for one device: compare the calendar span of its dates with its row count.
dateslist = df.loc[df['device'] == 'S1F01E6Y', 'date'].tolist()
first_seen, last_seen = dateslist[0], dateslist[-1]
span_days = (last_seen - first_seen).days + 1  # the span is inclusive, hence the +1
print('number of days:', span_days)
print('number of observations:', len(dateslist))

number of days: 48
number of observations: 48


In [17]:
# Count how many dated rows each device has and attach that count to every row of df.
obscount = (
    df.groupby('device')['date']
    .count()
    .rename('obs_counter')
    .reset_index()
)
df = df.merge(obscount, on='device', how='left')

In [18]:
# Make a column counting the days from January 1st up to each device's last observed date.
# The baseline date is defined once and reused, instead of re-parsing the literal further down.
jan1 = datetime.strptime('2015-01-01', '%Y-%m-%d')
# NOTE(review): the span is measured from this fixed date rather than from each device's
# first observation, so a device whose records begin later would show up as having a
# gap -- confirm this is intended.
# find the maximum date for each device
maxdate = df.groupby('device')['date'].max().reset_index(drop=False)
maxdate = maxdate.rename(columns={'date': 'max_date'})
# calculate the elapsed days from Jan 1st until max date (inclusive, hence the +1)
maxdate['min_date'] = pd.Timestamp(jan1)
maxdate['ndays'] = ((maxdate['max_date'] - maxdate['min_date']).dt.days) + 1
# merge this back into the dataframe
maxdate = maxdate[['device', 'ndays']]
df = pd.merge(df, maxdate, on='device', how='left')

In [19]:
# date_gap: rows observed minus calendar days elapsed; any_gap: True wherever the two differ.
df['date_gap'] = df['obs_counter'].sub(df['ndays'])
df['any_gap'] = df['date_gap'].ne(0)

In [20]:
# About 20% of devices have a gap in their date stream.
# Select the two columns with a list: tuple-style selection on a groupby
# (`[...]['any_gap', 'failure']`) is deprecated and rejected by newer pandas.
devices = df.groupby('device')[['any_gap', 'failure']].max().reset_index(drop=False)
print('Total number of devices', devices['device'].nunique())
print(devices['any_gap'].value_counts(normalize=True))
# Compute the counts once so that `values` and `labels` are guaranteed to share one ordering.
gap_counts = devices['any_gap'].value_counts()
values = gap_counts.values.tolist()
labels = gap_counts.index.tolist()
colors = ['#48186a', '#d8e219']

Total number of devices 851
False    0.797885
True     0.202115
Name: any_gap, dtype: float64


In [21]:
# Donut chart of how many devices do / do not have a gap in their date stream.
gap_donut = go.Pie(
    labels=labels,
    values=values,
    hoverinfo='label+percent',
    textinfo='value',
    hole=.4,
    marker=dict(colors=colors),
)
data = [gap_donut]
layout = go.Layout(
    title="Devices with gap in date stream",
    width=500,
    height=500,
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
# Also save a static copy of the figure to disk.
pio.write_image(fig, 'images/donut_dategap1.png')

In [22]:
# Row-normalised crosstab: within each failure group, the share of devices without / with a date gap.
# Devices that failed show a somewhat smaller share with a gap (about 14%) than those that did not (about 21%).
pd.crosstab(
    index=devices['failure'],
    columns=devices['any_gap'],
    margins=True,
    normalize='index',
)

any_gap,False,True
failure,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.789262,0.210738
1,0.858491,0.141509
All,0.797885,0.202115


In [23]:
# Device counts per any_gap value, split by failure status (0 -> values0, 1 -> values1).
# value_counts() orders by frequency, which only matches `labels` by coincidence.
# Reindexing by `labels` pins each count to its label and fills absent categories with 0.
values0 = (
    devices.loc[devices['failure'] == 0, 'any_gap']
    .value_counts()
    .reindex(labels, fill_value=0)
    .tolist()
)
values1 = (
    devices.loc[devices['failure'] == 1, 'any_gap']
    .value_counts()
    .reindex(labels, fill_value=0)
    .tolist()
)

In [24]:
from plotly import tools

# Two donuts side by side: the left one uses values0 (failure == 0) and the
# right one uses values1 (failure == 1). Everything except the values and the
# horizontal domain is shared.
shared_style = dict(
    labels=labels,
    hoverinfo='label+percent',
    textinfo='value',
    hole=.4,
    marker=dict(colors=colors),
)
trace0 = go.Pie(values=values0, domain={"x": [0, .5]}, **shared_style)
trace1 = go.Pie(values=values1, domain={"x": [.5, 1]}, **shared_style)

data = [trace0, trace1]
layout = go.Layout(
    title="Devices with gap in date stream",
    width=1000,
    height=500,
)
fig = go.Figure(data=data, layout=layout)

iplot(fig)
# pio.write_image(fig, 'images/donut_dategap1.png')

In [25]:
# Number of devices for every (any_gap, failure) combination.
results = devices.groupby(['any_gap', 'failure']).count()
results

Unnamed: 0_level_0,Unnamed: 1_level_0,device
any_gap,failure,Unnamed: 2_level_1
False,0,588
False,1,91
True,0,157
True,1,15


In [26]:
# Partial lookup on the first index level (any_gap): key 0 matches the any_gap == False rows,
# so the result is the device count per failure value among devices without a gap.
results.loc[0]['device']

failure
0    588
1     91
Name: device, dtype: int64

In [33]:
# Let's display that with plotly.
# `results` is indexed by (any_gap, failure). Slice on the `failure` level so that each
# trace is one failure group and the x-axis carries the gap flag. Slicing with
# results.loc[0] / results.loc[1] would instead split by any_gap, contradicting the
# trace names and the axis title.
not_failed = results.xs(0, level='failure')['device']
failed = results.xs(1, level='failure')['device']

mydata1 = go.Bar(
    x=[str(flag) for flag in not_failed.index],  # 'False' / 'True' gap flag as category labels
    y=not_failed.tolist(),
    name='Did not fail',
    marker=dict(color=Viridis[0])
)
mydata2 = go.Bar(
    x=[str(flag) for flag in failed.index],
    y=failed.tolist(),
    name='Failed',
    marker=dict(color=Viridis[10])
)

mylayout = go.Layout(
    title='Date gap by failure',
    xaxis=dict(title='Has a gap in recorded date range'),  # x-axis label
    yaxis=dict(title='Number of Devices'),  # y-axis label
    width=500,
    height=500,
)
fig = go.Figure(data=[mydata1, mydata2], layout=mylayout)
iplot(fig)

In [28]:
# Collect the ids of the devices whose date stream has a gap.
has_gap = devices['any_gap'].eq(True)
bad_devices = devices.loc[has_gap, 'device']

In [29]:
# Devices with gaps could either be imputed or removed; here they are removed.
# Shapes are printed before and after so the effect of the filter is visible.
for frame in (df, devices):
    print(frame.shape)
# Keep only rows whose device is not in the gap list.
keep_rows = ~df['device'].isin(bad_devices)
df = df.loc[keep_rows]
keep_devices = ~devices['device'].isin(bad_devices)
devices = devices.loc[keep_devices]
for frame in (df, devices):
    print(frame.shape)

(122402, 15)
(851, 3)
(93701, 15)
(679, 3)


## Zip the data file

In [30]:
# Raise the limit on how many sequence items pandas prints, then list the columns.
pd.set_option('display.max_seq_items', 2000)
df.columns

Index(['date', 'device', 'failure', 'attribute1', 'attribute2', 'attribute3',
       'attribute4', 'attribute5', 'attribute6', 'attribute7', 'prefix',
       'obs_counter', 'ndays', 'date_gap', 'any_gap'],
      dtype='object')

In [31]:
# Write the filtered frame to a gzip-compressed CSV (without the index) and report its shape.
df.to_csv(
    path_or_buf='data/cleaned_eda.gz',
    index=False,
    compression='gzip',
)
print(df.shape)

(93701, 15)


In [32]:
# Read the file written above back into df, replacing the in-memory frame.
# NOTE(review): no date parsing is requested here, so 'date' presumably comes back as
# strings, as it did on the first load -- confirm before relying on datetime operations.
df = pd.read_csv('data/cleaned_eda.gz', compression='gzip', header=0, sep=',', quotechar='"')