In [51]:
import pandas as pd
import numpy as np
import re
import pickle

In [52]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import plotly.io as pio
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.graph_objs import *

In [53]:
# Define the color pallette (17 colors).
Viridis= ['#440154', '#48186a', '#472d7b', '#424086', '#3b528b', '#33638d', '#2c728e', '#26828e', '#21918c', '#1fa088',
          '#28ae80', '#3fbc73', '#5ec962', '#84d44b', '#addc30','#d8e219', '#fde725']   

In [54]:
# read clean datafile
df = pd.read_csv('data/cleaned_eda.gz', compression='gzip', header=0, sep=',', quotechar='"')
print(df.shape)

(93701, 15)


In [55]:
df.columns

Index(['date', 'device', 'failure', 'attribute1', 'attribute2', 'attribute3',
       'attribute4', 'attribute5', 'attribute6', 'attribute7', 'prefix',
       'obs_counter', 'ndays', 'date_gap', 'any_gap'],
      dtype='object')

## Total number of devices still active

In [56]:
colx='date'
coly='device'
aggdf=df.groupby(colx)[coly].count().reset_index(drop=False)
data = [go.Scatter(x=aggdf[colx], 
                   y=aggdf[coly],
                    mode = 'lines',
                   marker=dict(color=Viridis[0])
)]
layout = go.Layout(
    title = f'Distribution of {colx} by {coly}', 
    xaxis = dict(title = colx), 
    yaxis = dict(title = coly), 
    hovermode ='closest' 
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
pio.write_image(fig, 'images/device_trend.png')

![image](images/device_trend.png)

## Failures

In [57]:
# Distribution by device
coly='device'
colx='failure'
aggdf=df.groupby(coly)[colx].sum().reset_index(drop=False)
counts=aggdf[colx].value_counts().values.tolist()[::-1]

data = [go.Bar(y=counts, 
               x=['Failed', "Did not fail"],
               marker=dict(color=[Viridis[0], Viridis[16]]),
                   )]

layout = go.Layout(
    title = f'Distribution of {coly} by {colx}', 
    xaxis = dict(title = colx), 
    yaxis = dict(title = 'Number of devices'), 
    width=500,
    height=400, 
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
pio.write_image(fig, f'images/{coly}X{colx}.png')

![image](images/failureXdevice.png)

In [58]:
# Trend over time
colx='date'
coly='failure'
aggdf=df.groupby(colx)[coly].sum().reset_index(drop=False)

data = [go.Scatter(x=aggdf[colx], 
                   y=aggdf[coly],
                    mode = 'lines',
                   marker=dict(color=Viridis[0])
)]
layout = go.Layout(
    title = f'Distribution of {coly} by {colx}', 
    xaxis = dict(title = colx), 
    yaxis = dict(title = coly), 
    hovermode ='closest' 
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
pio.write_image(fig, f'images/{coly}X{colx}.png')

![image](images/failureXdate.png)

## Prefix

In [59]:
# Distribution by device
coly='device'
colx='prefix'
aggdf=df.groupby(coly)[colx].max().reset_index(drop=False)
counts=aggdf[colx].value_counts().sort_index().values.tolist()
labels=aggdf[colx].value_counts().sort_index().index.tolist()

data = [go.Bar(y=counts, 
               x=labels,
               marker=dict(color=Viridis[::3]),
                   )]

layout = go.Layout(
    title = f'Distribution of {coly} by {colx}', 
    xaxis = dict(title = colx), 
    yaxis = dict(title = coly+' count'), 
    width=500,
    height=400, 
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
pio.write_image(fig, f'images/{coly}X{colx}.png')

![image](images/deviceXprefix.png)

In [60]:
# Mean failure by prefix and device
coly='device'
colx='prefix'
colz='failure'
aggdf=df.groupby(coly).max().reset_index(drop=False)
aggdf.groupby(colx)[colz].mean().sort_index()
labels=aggdf.groupby(colx)[colz].mean().sort_index().index.tolist()
means=aggdf.groupby(colx)[colz].mean().sort_index().values.tolist()
means = [100*elem for elem in means]
rounded_means = ['%.2f' % elem for elem in means]

data = [go.Bar(y=rounded_means, 
               x=labels,
               marker=dict(color=Viridis[::3]),
                   )]

layout = go.Layout(
    title = f'Mean percent {colz} by {colx} and {coly}', 
    xaxis = dict(title = colx), 
    yaxis = dict(title = colz+' rate'), 
    width=500,
    height=400, 
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
pio.write_image(fig, f'images/{colx}X{colz}.png')

![image](images/prefixXfailure.png)

## Number of days in device life

In [61]:
df['ndays'].describe()

count    93701.000000
mean       199.587656
std         82.263541
min          5.000000
25%         97.000000
50%        226.000000
75%        245.000000
max        299.000000
Name: ndays, dtype: float64

In [62]:
# create labels
age_ranges = ["{0}-{1}".format(age, age + 24) for age in range(0, 300, 25)]
age_ranges

['0-24',
 '25-49',
 '50-74',
 '75-99',
 '100-124',
 '125-149',
 '150-174',
 '175-199',
 '200-224',
 '225-249',
 '250-274',
 '275-299']

In [63]:
# Distribution by device
coly='device'
colx='ndays'
colz='device life'
aggdf=df.groupby(coly)[colx].mean().reset_index(drop=False)
aggdf[colz] = pd.cut(x=aggdf[colx], bins=12, labels=age_ranges)
counts=aggdf[colz].value_counts().sort_index().values.tolist()
labels=aggdf[colz].value_counts().sort_index().index.tolist()

data = [go.Bar(y=counts, 
               x=labels,
               marker=dict(color=Viridis[::1]),
                   )]

layout = go.Layout(
    title = f'Distribution of {colz} in days', 
    xaxis = dict(title = 'Days that device is active'), 
    yaxis = dict(title = coly+' count'), 
#     width=500,
#     height=400, 
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
pio.write_image(fig, f'images/{colx}X{coly}.png')

In [64]:
# Distribution by device
colx='device'
coly='ndays'
colz='failure'
aggdf=df.groupby(colx)[[coly,colz]].mean().reset_index(drop=False)
aggdf['failed']=0
aggdf.loc[aggdf['failure']>0, 'failed']=1
means=aggdf.groupby('failed')['ndays'].mean().values.tolist()
rounded_means= ['%.2f' % elem for elem in means]
labels=['Did not fail', 'Failed']

data = [go.Bar(y=rounded_means, 
               x=labels,
               marker=dict(color=Viridis[::10]),
                   )]

layout = go.Layout(
    title = f'Average device life by {colx} and {colz}', 
    xaxis = dict(title = colz), 
    yaxis = dict(title = 'Days that device is active'), 
    width=500,
    height=400, 
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
pio.write_image(fig, f'images/{coly}X{colx}.png')

## Attribute 1

In [67]:
df['attribute1'].describe()

count    9.370100e+04
mean     1.222246e+08
std      7.047940e+07
min      0.000000e+00
25%      6.113995e+07
50%      1.225107e+08
75%      1.832279e+08
max      2.441386e+08
Name: attribute1, dtype: float64

In [13]:
data = [go.Histogram(x=transformed1,
                    xbins=dict(size=2)
                   )]
layout = go.Layout(
    title = 'Attribute1', # Graph title
    
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)

NameError: name 'transformed1' is not defined