# Data cleaning

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import plotly as py
import plotly.graph_objs as go

### Hypotheses about the data:

* one reading every day, per device
* no more than one reading per day, per device
* all devices start on the same day (Jan 1st)
* there are no devices with two failure records (each device can only fail once)
* once a failure happens, device is removed from service

In [2]:
# Define the color palette.
Viridis= ['#440154', '#48186a', '#472d7b', '#424086', '#3b528b', '#33638d', '#2c728e', '#26828e', '#21918c', '#1fa088',
          '#28ae80', '#3fbc73', '#5ec962', '#84d44b', '#addc30','#d8e219', '#fde725']   

### Basic descriptives

In [3]:
data_path=Path.joinpath(Path.cwd().parent, 'data', 'device_failure.csv')
df=pd.read_csv(data_path, encoding = "ISO-8859-1")
df.shape

(124494, 12)

In [4]:
# first few rows
df.head(2).T

Unnamed: 0,0,1
date,2015-01-01,2015-01-01
device,S1F01085,S1F0166B
failure,0,0
attribute1,215630672,61370680
attribute2,56,0
attribute3,0,3
attribute4,52,0
attribute5,6,6
attribute6,407438,403174
attribute7,0,0


In [5]:
# except for the first two columns, all attributes are integers.
df.dtypes

date          object
device        object
failure        int64
attribute1     int64
attribute2     int64
attribute3     int64
attribute4     int64
attribute5     int64
attribute6     int64
attribute7     int64
attribute8     int64
attribute9     int64
dtype: object

### Remove Duplicates

In [6]:
# Attributes 7 and 8 are identical. Drop one of them.
print('correlation:', df['attribute7'].corr(df['attribute8']))
print ('Are there any rows where 7!=8?', False in list(df['attribute7']==df['attribute8']))
df=df.drop('attribute8', axis=1)
print(df.shape)

correlation: 0.9999999999999999
Are there any rows where 7!=8? False
(124494, 11)


In [7]:
# How many unique devices are there?
df['device'].nunique()

1169

In [8]:
# There is only one instance of a duplicated device-date pair.
print('How many devices have 2 measurements in one day?', df[['device', 'date']].duplicated().sum())
# what is that row?
print(df[df[['device', 'date']].duplicated()==True][['device', 'date']])
df.loc[(df['device']=='S1F0R4Q8') & (df['date']=='2015-07-10')]

How many devices have 2 measurements in one day? 1
          device        date
101335  S1F0R4Q8  2015-07-10


Unnamed: 0,date,device,failure,attribute1,attribute2,attribute3,attribute4,attribute5,attribute6,attribute7,attribute9
101334,2015-07-10,S1F0R4Q8,0,192721392,0,0,0,8,213700,0,0
101335,2015-07-10,S1F0R4Q8,0,192721392,0,0,0,8,213700,0,0


In [9]:
df.shape

(124494, 11)

In [10]:
# we should remove one of these duplicate rows.
df=df.drop(df.index[101335], axis=0)
df.reset_index(drop=True)
df.shape

(124493, 11)

## Missing data

In [11]:
# There is no obvious missing data in any variable.
df.isnull().sum()

date          0
device        0
failure       0
attribute1    0
attribute2    0
attribute3    0
attribute4    0
attribute5    0
attribute6    0
attribute7    0
attribute9    0
dtype: int64

## Recode datetime variable type

In [12]:
# The dates are coded as strings.
df['date'].dtype

dtype('O')

In [13]:
# Replace this.
df['date']=pd.to_datetime(df['date'],infer_datetime_format=True)
df['date'].dtype

dtype('<M8[ns]')

## Total number of devices

In [14]:
df['device'].nunique()

1169

In [15]:
# What's the first date in the series?
df['date'].min()

Timestamp('2015-01-01 00:00:00')

In [16]:
# Do all devices start on this same date?
bydevice=df.groupby('device')['date', 'failure'].min().reset_index(drop=False)
bydevice['date'].value_counts()

  bydevice=df.groupby('device')['date', 'failure'].min().reset_index(drop=False)


2015-01-01    1163
2015-05-06       4
2015-06-13       1
2015-01-27       1
Name: date, dtype: int64

In [17]:
# None of these devices ever experienced failure, so they won't add much value to our analysis.
bydevice[bydevice['date']!='2015-01-01']

Unnamed: 0,device,date,failure
1,S1F013BB,2015-05-06,0
16,S1F02W1L,2015-05-06,0
18,S1F02XLX,2015-05-06,0
22,S1F03499,2015-05-06,0
549,W1F0976M,2015-01-27,0
925,W1F1DA5ÿ,2015-06-13,0


In [18]:
# Remove the six devices which don't start on January 1st.
late_ones=bydevice[bydevice['date']!='2015-01-01']['device']

## Remove last dates of zombie devices

For the most part, when a device fails it is removed. A few devices (5) continue after they're dead.

In [19]:
# confirm that there are no devices with two failure records (each device only fails once)
dffailed=df.loc[df['failure']==1]
assert dffailed['device'].nunique()==dffailed.shape[0]

In [20]:
# restrict to the date on which a device failed.
deaddevice=df[df['failure']==1][['device','date']]

In [21]:
# merge the deathdates back into the regular dataset.
deaddevice = deaddevice.rename(columns={'date': 'deathdate'})
df=pd.merge(df, deaddevice, on='device', how='outer')
df.shape

(124493, 12)

In [22]:
df[df['deathdate'].notnull()].head()

Unnamed: 0,date,device,failure,attribute1,attribute2,attribute3,attribute4,attribute5,attribute6,attribute7,attribute9,deathdate
401,2015-01-01,S1F023H2,0,141503600,0,0,1,19,494462,16,3,2015-01-19
402,2015-01-02,S1F023H2,0,161679800,0,0,1,19,495730,16,3,2015-01-19
403,2015-01-03,S1F023H2,0,182358672,0,0,1,19,496974,16,3,2015-01-19
404,2015-01-04,S1F023H2,0,204752808,0,0,1,19,497559,16,3,2015-01-19
405,2015-01-05,S1F023H2,0,226982888,0,0,1,19,498753,16,3,2015-01-19


In [23]:
# confirm that, for failed devices, there are no entries later than the failure date.
devices=df.groupby('device').max().reset_index(drop=False)
dead_devices=devices.loc[devices['failure']==1]
dead_devices=dead_devices.rename(columns={'date': 'maxdate'})
dead_devices=dead_devices[['device', 'maxdate', 'deathdate']]
dead_devices.loc[dead_devices['maxdate']!=dead_devices['deathdate']]

Unnamed: 0,device,maxdate,deathdate
101,S1F0GPFZ,2015-07-24,2015-07-12
504,S1F136J0,2015-05-06,2015-05-05
599,W1F0KCP2,2015-05-11,2015-05-09
625,W1F0M35B,2015-05-11,2015-05-09
819,W1F11ZG9,2015-08-17,2015-07-18


In [24]:
# For those 5 devices, remove any dates that occur after the failure.
zombies=dead_devices.loc[dead_devices['maxdate']!=dead_devices['deathdate']]['device']
print('Number of rows removed:', df.loc[(df['device'].isin(zombies)) & (df['date']>df['deathdate'])].shape[0])
print(df.shape[0])
df=df.loc[~((df['device'].isin(zombies)) & (df['date']>df['deathdate']))]
print(df.shape[0])
df=df.drop(['deathdate'], axis=1)

Number of rows removed: 47
124493
124446


In [25]:
# Attribute 1 appears to be an entirely random signal consisting of 9 digits
df['attribute1'].describe()
df=df.drop('attribute1', axis=1)

## Min and Max dates

In [26]:
# creates new dataframe: one row for each device (reduces dimensions from 124K to 1163)
dfmin=df.groupby('device').min().reset_index(drop=False)
dfmin.head(2)

Unnamed: 0,device,date,failure,attribute2,attribute3,attribute4,attribute5,attribute6,attribute7,attribute9
0,S1F01085,2015-01-01,0,56,0,52,6,407438,0,7
1,S1F013BB,2015-05-06,0,0,0,0,5,688952,0,0


In [27]:
# same, but for max values
dfmax=df.groupby('device').max().reset_index(drop=False)
dfmax.head(2)

Unnamed: 0,device,date,failure,attribute2,attribute3,attribute4,attribute5,attribute6,attribute7,attribute9
0,S1F01085,2015-01-06,0,56,0,52,6,409404,0,7
1,S1F013BB,2015-05-11,0,0,0,0,5,689161,0,0


In [33]:
# which attributes tend to change within a device? all of them.
dfcompare=pd.merge(dfmax, dfmin, on='device', how='left')
dfcompare['attribute2_z']=dfcompare['attribute2_x']-dfcompare['attribute2_y']
dfcompare['attribute3_z']=dfcompare['attribute3_x']-dfcompare['attribute3_y']
dfcompare['attribute4_z']=dfcompare['attribute4_x']-dfcompare['attribute4_y']
dfcompare['attribute5_z']=dfcompare['attribute5_x']-dfcompare['attribute5_y']
dfcompare['attribute6_z']=dfcompare['attribute6_x']-dfcompare['attribute6_y']
dfcompare['attribute7_z']=dfcompare['attribute7_x']-dfcompare['attribute7_y']
dfcompare['attribute9_z']=dfcompare['attribute9_x']-dfcompare['attribute9_y']
for var in ['attribute2_z','attribute3_z','attribute4_z','attribute5_z','attribute6_z','attribute7_z','attribute9_z']:
    print(str(var), dfcompare[var].mean())
# this suggests that attributes change, and are an indicator of health.

attribute2_z 633.0675791274593
attribute3_z 2.223267750213858
attribute4_z 6.098374679213003
attribute5_z 0.4679213002566296
attribute6_z 31298.350727117195
attribute7_z 4.076988879384089
attribute9_z 0.1787852865697177


In [34]:
# just focus on the date for now.
dfmin=dfmin[['device', 'date']]
dfmin.rename(columns={'date':'mindate'},inplace=True)
print(dfmin.shape)
dfmin.head()

(1169, 2)


Unnamed: 0,device,mindate
0,S1F01085,2015-01-01
1,S1F013BB,2015-05-06
2,S1F0166B,2015-01-01
3,S1F01E6Y,2015-01-01
4,S1F01JE0,2015-01-01


In [35]:
# We can group devices into 7 categories based on their device ID code
dfmin['prefix']=dfmin['device'].apply(lambda row: row[:4])
dfmin['prefix'].value_counts()

S1F0    391
W1F0    282
Z1F0    149
S1F1    139
W1F1    138
Z1F1     67
Z1F2      3
Name: prefix, dtype: int64

In [36]:
# merge back into primary dataset
print(df.shape)
df=pd.merge(df, dfmin, on='device', how='left')
df.head()

(124446, 10)


Unnamed: 0,date,device,failure,attribute2,attribute3,attribute4,attribute5,attribute6,attribute7,attribute9,mindate,prefix
0,2015-01-01,S1F01085,0,56,0,52,6,407438,0,7,2015-01-01,S1F0
1,2015-01-02,S1F01085,0,56,0,52,6,407438,0,7,2015-01-01,S1F0
2,2015-01-03,S1F01085,0,56,0,52,6,407438,0,7,2015-01-01,S1F0
3,2015-01-04,S1F01085,0,56,0,52,6,407439,0,7,2015-01-01,S1F0
4,2015-01-05,S1F01085,0,56,0,52,6,408114,0,7,2015-01-01,S1F0


In [37]:
# change dates to elapsed time.
df['timelapse']=(df['date']-df['mindate']).dt.days
df['timelapse'].value_counts()

0      1169
1      1168
2      1168
3      1167
4      1166
       ... 
302      31
303      31
301      31
305      31
299      31
Name: timelapse, Length: 304, dtype: int64

## Cox regression analysis

In [45]:
coxdf = df[['device', 'failure', 'prefix', 'timelapse']]
coxdf = coxdf.groupby('device').max().reset_index()
coxdf.tail()

Unnamed: 0,device,failure,prefix,timelapse
1164,Z1F1VMZB,0,Z1F1,291
1165,Z1F1VQFY,1,Z1F1,180
1166,Z1F26YZB,0,Z1F2,83
1167,Z1F282ZV,0,Z1F2,83
1168,Z1F2PBHX,0,Z1F2,95
