# Data cleaning

In [1]:
import pandas as pd
import numpy as np
import pickle
from pathlib import Path

In [2]:
# Plotly imports plus notebook-mode initialisation (called with connected=True).
# NOTE(review): none of these plotly names are used in the cells visible here — confirm they are
# still needed. `plotly.plotly` was moved to the separate `chart_studio` package in plotly 4, so
# that import may fail on newer installs. The wildcard import at the end also hides where names
# come from.
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import plotly.io as pio
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.graph_objs import *

### Hypotheses about the data:

* one reading every day, per device
* no more than one reading per day, per device
* all devices start on the same day (Jan 1st)
* there are no devices with two failure records (each device can only fail once)
* once a failure happens, the device is removed from service

In [3]:
# Colour palette: 17 hex codes running from dark purple (#440154) to yellow (#fde725).
Viridis = [
    '#440154', '#48186a', '#472d7b', '#424086',
    '#3b528b', '#33638d', '#2c728e', '#26828e',
    '#21918c', '#1fa088', '#28ae80', '#3fbc73',
    '#5ec962', '#84d44b', '#addc30', '#d8e219',
    '#fde725',
]

### Basic descriptives

In [4]:
# Raw CSV location: <parent of the current working directory>/data/device_failure.csv
data_path = Path.cwd().parent / 'data' / 'device_failure.csv'

In [5]:
# Load the raw readings; the file is decoded as ISO-8859-1.
df = pd.read_csv(
    data_path,
    encoding="ISO-8859-1",
)

In [6]:
# Preview the first rows to see the column layout (date, device, failure flag, attributes).
df.head()

Unnamed: 0,date,device,failure,attribute1,attribute2,attribute3,attribute4,attribute5,attribute6,attribute7,attribute8,attribute9
0,2015-01-01,S1F01085,0,215630672,56,0,52,6,407438,0,0,7
1,2015-01-01,S1F0166B,0,61370680,0,3,0,6,403174,0,0,0
2,2015-01-01,S1F01E6Y,0,173295968,0,0,0,12,237394,0,0,0
3,2015-01-01,S1F01JE0,0,79694024,0,0,0,6,410186,0,0,0
4,2015-01-01,S1F01R2B,0,135970480,0,0,0,15,313173,0,0,3


In [7]:
# Row and column counts of the raw frame: about 125K observations.
df.shape

(124494, 12)

In [8]:
# Column dtypes: `date` and `device` are stored as strings (object);
# `failure` and every attribute column are int64.
df.dtypes

date          object
device        object
failure        int64
attribute1     int64
attribute2     int64
attribute3     int64
attribute4     int64
attribute5     int64
attribute6     int64
attribute7     int64
attribute8     int64
attribute9     int64
dtype: object

### Remove Duplicates

In [9]:
# Attributes 7 and 8 look identical; verify that, then drop one of them.
# The guard makes the cell safe to re-run: once attribute8 is gone, the check and the drop are
# skipped instead of raising KeyError.
if 'attribute8' in df.columns:
    print('correlation:', df['attribute7'].corr(df['attribute8']))
    # Vectorised mismatch test (no Python list built from the whole column).
    any_mismatch = bool(df['attribute7'].ne(df['attribute8']).any())
    print('Are there any rows where 7!=8?', any_mismatch)
    # Only discard the column when it really is redundant.
    assert not any_mismatch, 'attribute7 and attribute8 differ; not dropping attribute8'
    df = df.drop('attribute8', axis=1)
print(df.shape)

correlation: 0.9999999999999999
Are there any rows where 7!=8? False
(124494, 11)


In [10]:
# How many unique devices are there? (Counts distinct values in the `device` column.)
df['device'].nunique()

1169

In [11]:
# Check the "no more than one reading per day, per device" hypothesis.
# In the recorded run there is only one repeated device-date pair.
key_cols = ['device', 'date']
repeat_mask = df.duplicated(subset=key_cols)
print('How many devices have 2 measurements in one day?', repeat_mask.sum())
# what is that row?
print(df.loc[repeat_mask, key_cols])
# Show every row that takes part in a repeated device-date pair (all copies), so the lookup
# does not depend on a hard-coded device ID and date.
df.loc[df.duplicated(subset=key_cols, keep=False)]

How many devices have 2 measurements in one day? 1
          device        date
101335  S1F0R4Q8  2015-07-10


Unnamed: 0,date,device,failure,attribute1,attribute2,attribute3,attribute4,attribute5,attribute6,attribute7,attribute9
101334,2015-07-10,S1F0R4Q8,0,192721392,0,0,0,8,213700,0,0
101335,2015-07-10,S1F0R4Q8,0,192721392,0,0,0,8,213700,0,0


In [12]:
# Shape before the duplicate reading is removed, for comparison with the next cell.
df.shape

(124494, 11)

In [13]:
# Remove the repeated device-date reading, keeping the first copy.
# drop_duplicates replaces the hard-coded row position, and the reset_index result is assigned
# back: reset_index is not in-place, so calling it without assignment leaves the index unchanged.
df = df.drop_duplicates(subset=['device', 'date'], keep='first').reset_index(drop=True)
df.shape

(124493, 11)

## Missing data

In [14]:
# Count null values per column. There is no obvious missing data in any variable
# (this only detects NaN/None, not placeholder values such as 0).
df.isnull().sum()

date          0
device        0
failure       0
attribute1    0
attribute2    0
attribute3    0
attribute4    0
attribute5    0
attribute6    0
attribute7    0
attribute9    0
dtype: int64

## Recode datetime variable type

In [15]:
# The dates are coded as strings: the column dtype is object ('O').
df['date'].dtype

dtype('O')

In [16]:
# Convert the date strings to datetime64.
# The dates shown in the preview are ISO formatted (e.g. '2015-01-01'), so the format is given
# explicitly rather than through the deprecated `infer_datetime_format` flag; a value that does
# not match the format now raises instead of being guessed at.
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
df['date'].dtype

dtype('<M8[ns]')

## Remove late-starting devices

In [17]:
# What's the first date in the series? (Earliest value in the `date` column.)
df['date'].min()

Timestamp('2015-01-01 00:00:00')

In [18]:
# Do all devices start on this same date?
# One row per device: its first reading date and the maximum of its failure flag, i.e. whether
# any of its readings has failure set. The minimum of the flag is 0 as soon as a device has a
# single non-failure reading, so it cannot answer "did this device ever fail?".
# Columns are selected through a dict aggregation; indexing a groupby with a bare
# 'date', 'failure' tuple is no longer supported by recent pandas.
bydevice = (
    df.groupby('device')
    .agg({'date': 'min', 'failure': 'max'})
    .reset_index(drop=False)
)
bydevice['date'].value_counts()

2015-01-01    1163
2015-05-06       4
2015-01-27       1
2015-06-13       1
Name: date, dtype: int64

In [19]:
# List the devices whose first reading is not on 2015-01-01.
# The `failure` column shown is the per-device aggregate stored in `bydevice`; it tells us
# whether a device ever failed only if that aggregate is the maximum of the flag.
starts_late = bydevice['date'] != '2015-01-01'
bydevice[starts_late]

Unnamed: 0,device,date,failure
1,S1F013BB,2015-05-06,0
16,S1F02W1L,2015-05-06,0
18,S1F02XLX,2015-05-06,0
22,S1F03499,2015-05-06,0
549,W1F0976M,2015-01-27,0
925,W1F1DA5ÿ,2015-06-13,0


In [20]:
# Drop every reading from the devices whose first date is not January 1st
# (six devices in the recorded run).
not_from_day_one = bydevice['date'] != '2015-01-01'
late_ones = bydevice.loc[not_from_day_one, 'device']
from_late_device = df['device'].isin(late_ones)
df = df.loc[~from_late_device]
df.shape

(124211, 11)

## Remove last dates of zombie devices

For the most part, a device's readings stop on the day it fails. A few devices (5), however, keep reporting after their failure date; those later readings are removed below.

In [21]:
# Sanity check: among the failure rows every device appears exactly once,
# i.e. no device has two failure records.
is_failure_row = df['failure'] == 1
dffailed = df.loc[is_failure_row]
n_failed_devices = dffailed['device'].nunique()
assert n_failed_devices == dffailed.shape[0]

In [22]:
# Keep only the failure rows, and from those only the device ID and the date of failure.
deaddevice = df.loc[df['failure'] == 1, ['device', 'date']]

In [23]:
# Attach each device's failure date to all of its readings as `deathdate`.
# Devices with no failure row get a missing deathdate.
deaddevice = deaddevice.rename(columns={'date': 'deathdate'})
df = df.merge(deaddevice, how='outer', on='device')
df.shape

(124211, 12)

In [24]:
# For failed devices, compare the last reading date with the failure date.
# Any row displayed here is a device that kept reporting after it failed.
devices = df.groupby('device').max().reset_index(drop=False)
dead_devices = (
    devices.loc[devices['failure'] == 1]
    .rename(columns={'date': 'maxdate'})
    [['device', 'maxdate', 'deathdate']]
)
outlived_failure = dead_devices['maxdate'] != dead_devices['deathdate']
dead_devices.loc[outlived_failure]

Unnamed: 0,device,maxdate,deathdate
97,S1F0GPFZ,2015-07-24,2015-07-12
500,S1F136J0,2015-05-06,2015-05-05
594,W1F0KCP2,2015-05-11,2015-05-09
620,W1F0M35B,2015-05-11,2015-05-09
814,W1F11ZG9,2015-08-17,2015-07-18


In [25]:
# For the devices that kept reporting after failing (5 in the recorded run),
# drop every reading dated after the failure.
outlived_failure = dead_devices['maxdate'] != dead_devices['deathdate']
zombies = dead_devices.loc[outlived_failure]['device']
# Build the row filter once and reuse it for both the count and the removal.
after_death = (df['device'].isin(zombies)) & (df['date'] > df['deathdate'])
print('Number of rows removed:', df.loc[after_death].shape[0])
print(df.shape[0])
df = df.loc[~after_death]
df.shape[0]

Number of rows removed: 47
124211


124164

In [26]:
# The helper `deathdate` column has served its purpose; drop it again.
df = df.drop(labels='deathdate', axis='columns')

Note: although every attribute is stored as an integer, only two of the variables (attributes 1 and 6) appear to be genuinely numeric measurements.

## Zip the dataset for further use

In [27]:
# Write the cleaned frame as a gzip-compressed CSV (without the index), then report its shape.
df.to_csv(
    '../data/dataset1.gz',
    index=False,
    compression='gzip',
)
print(df.shape)

(124164, 11)
