## loading data/libs

In [1]:
import pandas as pd
import numpy as np
import calendar

from bokeh.charts import output_notebook, Scatter, Bar, show, output_file, Line, BoxPlot, Scatter
from bokeh.plotting import figure
from bokeh.layouts import row, column, gridplot

from ipywidgets import interactive
from IPython.display import display
from IPython.utils.py3compat import annotate

output_notebook() 

In [2]:
INPUT="data/device_failure.csv" 
dataset = pd.read_csv(INPUT,index_col=[0,1],parse_dates=[0])

 ## features
 
 Per features:
 
 ### Statistical distribution:
 - Distribution
 - Distribution over failures
 - Distribution over devices
 - Distribution over failing devices
 
###  Temporal distribution
 - Average value over time
 - Average value over time for failing devices
 - Value before failure   
 
###  Frequency distribution
 - DFT / device
 - DFT / failing device

In [3]:
@annotate(attribute=list(dataset.columns[1:]))
def pick_attribute(attribute):
    return "current attribute=%s" % s
s = interactive(pick_attribute)
display(s)

'current attribute=<ipywidgets.widgets.widget_box.Box object at 0x7f3a59bda650>'

## building data objects

In [211]:
attribute = s.children[0].value
feature_dset = dataset[[attribute,"failure"]]
failing_points = feature_dset[feature_dset["failure"]>0]

In [212]:
def failure_date(failure):
    data = feature_dset.ix[failure.index]
    dates =data[data["failure"]>0]
    if not dates.empty:
        return dates.iloc[0].name[0]
    else:
        return None
    
devices = feature_dset.groupby(level=1).agg(
    {
        "failure":{
            "failure":np.sum, 
            "failure_date":failure_date},
        attribute : {
            "min_att":np.min,
            "max_att":np.max,
            "mean_att":np.mean,
            "std_att":np.std
        }})
devices.columns = devices.columns.droplevel()

failing_devices = devices[devices["failure"]>0]
working_devices = devices[devices["failure"]==0]


working_devices_t = pd.DataFrame({attribute:feature_dset[attribute].unstack().filter(items=working_devices.index).unstack()}).dropna()
failing_devices_t = pd.DataFrame({attribute:feature_dset[attribute].unstack().filter(items=failing_devices.index).unstack()}).dropna()

In [213]:
def build_hist(df, column,label = None,w = 450,h=300,bins=20,color="lightblue"):
    mu = df[column].mean()
    std = df[column].std()
    label = label if label else column
    title = u"%s (μ=%.2e, σ=%.2e)" % (label, mu,std)
    f = figure(title=title,width=w,height=h) 
    hist, edges = np.histogram(df[column], density=True, bins=bins)
    f.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],fill_color=color, line_color="grey")
    return f

p1 = build_hist(feature_dset,attribute)
p2 = build_hist(failing_points,attribute,u"%s for failing points" % attribute,color="green")
h = row(p1,p2)
show(h)

{'max_att', 'mean_att', 'min_att', 'std_att'}

In [228]:
def per_car(car):
    print car
    p1 = build_hist(working_devices,car,label="%s on working devices"% car)
    p2 = build_hist(failing_devices,car,label="%s on failing devices"% car)
    return [p1,p2]


show(gridplot([ per_car(c) for c in set(c for c in devices.columns if "att" in c ) ]))

mean_att
std_att
max_att
min_att


## Average Value over Time.

In [256]:
def dev_std(device_df):
    return device_df.rolling(window=20,center=False).std()

def roll_std(df):
    return df[attribute].groupby(level="device").transform(dev_std)

    
failing_devices_t["rolling_std"]  = roll_std(failing_devices_t)
working_devices_t["rolling_std"]  = roll_std(working_devices_t)

time = pd.DataFrame({
    attribute : feature_dset[attribute].groupby(level=0).mean(),
    "%s for failing devices" %attribute : failing_devices_t[attribute].groupby(level=1).mean(),
    "%s for working devices" %attribute : working_devices_t[attribute].groupby(level=1).mean()
    #"rolling std(%s) for failing devices" % attribute : failing_devices_t["rolling_std"].groupby(level=1).mean(),
    #"rolling std(%s) for working devices" % attribute : working_devices_t["rolling_std"].groupby(level=1).mean()
})
show(Line(time))

attribute1 : 

- ~~Hyp 1 : amplitude gets higher when the device start failing => disproved~~
- ~~Hyp 2 : amplitude is always higher for more fragile devices => disproved~~
- ~~Hyp 2 : wider amplitude for SOME signals~~
- ~~Hyp 3 : failing devices somehow synchronize, resonnance effect (unlikely. plus what would it mean ?)~~
- Hyp 4 : too few devices to average out see graph "n_devices" [here][1]
[1]: data_exploration.ipynb

In [231]:
# relative time building
def to_relative_time(df,device_to_endtime,rel_time_threshold=-100):
    temp = df.reset_index()
    temp["failure_date"]=temp["device"].map(device_to_endtime)
    temp["dt_from_fail"]= (temp["date"]-temp["failure_date"] )/ np.timedelta64(1,'D')
    relative_time = temp.set_index(["device","dt_from_fail"])
    
    # filter relative values
    relative_time = relative_time[relative_time.index.get_level_values("dt_from_fail") >= rel_time_threshold]
    relative_time = relative_time[relative_time.index.get_level_values("dt_from_fail") <0 ]

    return relative_time


In [248]:
rel_time_threshold = -50
n_samples = 10

# relative time is relative to the failure date for negatives
fail_end_dates= devices["failure_date"].dropna().to_dict()
fail_relative_time = to_relative_time(failing_devices_t,fail_end_dates,rel_time_threshold)

# for working ones, we use the last value (beware, could lead to weird effects, if the attribute changes over time)
work_end_dates= working_devices_t.reset_index(level="date")["date"].groupby(level=0).max().to_dict()
work_relative_time = to_relative_time(working_devices_t,work_end_dates,rel_time_threshold)

fail_rel_sampled = fail_relative_time[attribute].unstack(level="dt_from_fail").sample(n=n_samples).stack()
work_rel_sampled = work_relative_time[attribute].unstack(level="dt_from_fail").sample(n=n_samples).stack()

show(row(
    Line(
        fail_rel_sampled.unstack(level="device"),
        width=450,
        height=400,
        title ="%s before failure for a sample failing devices" % attribute,
        legend=None),
    Line(
        work_rel_sampled.unstack(level="device"),
        width=450,
        height=400,
        title ="%s before end for a sample working devices" % attribute,
        legend=None)
))

In [251]:
n_samples=20
sampled_devices = working_devices_t[attribute].unstack(level="date").sample(n=n_samples).stack()
sampled_failing_devices = failing_devices_t[attribute].unstack(level="date").sample(n=n_samples).stack()

In [254]:
l0 = Line(
    sampled_failing_devices.unstack(level="device"),
    width=450,
    height=400,
    title='%s for sampled failing devices' % attribute)
l1 = Line(
    sampled_devices.unstack(level="device"),
    width=450,
    height=400,
    title = '%s for sampled failing devices' % attribute
)

show(row(l0,l1))

### FFT

In [261]:
from scipy.fftpack import fft
fft_df = feature_dset[[attribute]].copy()

def to_fft(df):
    resampled =  df.resample("1D",level="date").mean().fillna(method='pad')
    n = len(resampled)
    return np.abs(fft(resampled))[n//2:]
fft_per_device = fft_df[attribute].groupby(level="device",sort=True).transform(to_fft)


In [270]:
fft_df["df"] = fft_per_device
fft_plot = fft_df.groupby(level="device").apply(lambda x: x.reset_index(drop=True))["df"]

fft_working_devices = fft_plot.unstack(level=0).filter(items=working_devices.index).stack()
fft_failing_devices = fft_plot.unstack(level=0).filter(items=failing_devices.index).stack()

n_samples = 50
to_plot_working = fft_working_devices.unstack(level=1).sample(n=10).unstack().dropna()
to_plot_failing = fft_failing_devices.unstack(level=0).sample(n=10).unstack().dropna()
show(row(
    Line(to_plot_failing.unstack("device"),
        width=450,
        height=400,
        title = "dft of %s for sampled failing devices" % attribute,
        legend=None),
    Line(to_plot.unstack("device"),
        width=450,
        height=400,
        title = "dft of %s for sampled working device" % attribute,
        legend=None),
))