In [376]:
import pandas as pd
import numpy as np
import calendar
from bokeh.charts import output_notebook, Scatter, Bar, show, output_file, Line, BoxPlot, Scatter
from bokeh.plotting import figure
from bokeh.layouts import column, row
from bokeh.io import hplot
output_notebook() 

In [377]:
INPUT="data/device_failure.csv" 
dataset = pd.read_csv(INPUT,index_col=[0,1],parse_dates=[0])

## per device model

 - Set up first model
 - Precision/recall, ROC
 - Calibration
 - PCA ?
 - feature engineering
 - data cleaning
 - Test other models

## Build Training set

In [384]:
# feature preprocessing
def pre_filter(df):
    res = df.copy()
    del res["attribute1"]
    return res
def post_filter(df):
    res = df.copy()
    for col in res.columns:
        if "min" in col:
            del res[col]
        if "std" in col:
            del res[col]
    return res

In [379]:
features = [f for f in dataset.columns if "att" in f]
def f_to_dict(feature):
    return {
            "min_%s" % feature:np.min,
            "max_%s" % feature:np.max,
            "mean_%s" % feature :np.mean,
            "std_%s" % feature:np.std
        }

agg_dict = dict( (f,f_to_dict(f)) for f in features )

feature_set = pre_filter(dataset)
feature_set = dataset.groupby(level="device").agg(agg_dict)
feature_set.columns = feature_set.columns.droplevel()
feature_set = post_filter(feature_set)
label_set =  dataset[["failure"]].groupby(level="device").sum()
feature_mat = feature_set.to_sparse().as_matrix()
label_mat = label_set.as_matrix().ravel()

## Run model

In [380]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_curve, auc

#model = GradientBoostingClassifier()
model = RandomForestClassifier()
pipeline= Pipeline([("model",model)])
try:
    # use best parameters if available
    #
    pipeline.set_params(**grid_result.best_params_)
    print "model set up"
except:
    print "no optim result, or bad ones: let's keep the default ones"
    pass
scores = cross_val_score(pipeline, feature_mat, label_mat,cv=3,verbose=1,scoring="accuracy",n_jobs=6)                          
print "accurracy: %g, std(%g))" % (scores.mean(), scores.std())

model set up
accurracy: 0.912682, std(0.00902879))


[Parallel(n_jobs=6)]: Done   3 out of   3 | elapsed:    0.3s finished


In [347]:
pipeline.get_params()

{'model': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
             max_depth=5, max_features='auto', max_leaf_nodes=None,
             min_impurity_split=1e-07, min_samples_leaf=1,
             min_samples_split=10, min_weight_fraction_leaf=0.0,
             n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
             verbose=0, warm_start=False),
 'model__bootstrap': True,
 'model__class_weight': None,
 'model__criterion': 'entropy',
 'model__max_depth': 5,
 'model__max_features': 'auto',
 'model__max_leaf_nodes': None,
 'model__min_impurity_split': 1e-07,
 'model__min_samples_leaf': 1,
 'model__min_samples_split': 10,
 'model__min_weight_fraction_leaf': 0.0,
 'model__n_estimators': 10,
 'model__n_jobs': 1,
 'model__oob_score': False,
 'model__random_state': None,
 'model__verbose': 0,
 'model__warm_start': False,
 'steps': [('model',
   RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
               max_d

### Eval Model

In [381]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, accuracy_score,precision_recall_curve, auc

X_train, X_test, Y_train, Y_test = train_test_split(feature_mat,label_mat,test_size=0.3)
# calculate the fpr and tpr for all thresholds of the classification

fitted = pipeline.fit(X_train,Y_train)
probs = fitted.predict_proba(X_test)
preds = probs[:,1]
preds_train = fitted.predict_proba(X_train)[:,1]
fpr, tpr, threshold = roc_curve(Y_test, preds)
fpr_train, tpr_train, threshold_train = roc_curve(Y_train, preds_train)
roc_auc = auc(fpr, tpr)
roc_auc_train = auc(fpr_train, tpr_train)
precision, recall, ths = precision_recall_curve(Y_test, preds)
precision_train, recall_train, ths_train = precision_recall_curve(Y_train, preds_train)

In [385]:
from bokeh.models.ranges import Range1d
#print "auc: %.2g, on train: %.2g" %(roc_auc, roc_auc_train)
roc_df = pd.DataFrame({"fpr":fpr,"tpr":tpr}).set_index("fpr")
pr_df = pd.DataFrame({"precision": precision, "recall":recall}).set_index("recall")
roc_df["diag"] = roc_df.index
pr_df["random"] = pr_df.precision.iloc[0]

# roc curve
roc_f = figure(width=400,height=400,title="roc, auc: %.2g, on train: %.2g"  %(roc_auc, roc_auc_train) )
roc_f.xaxis.axis_label = "tpr"
auc_range= Range1d(0,1)
roc_f.x_range = auc_range 
roc_f.y_range = auc_range 
roc_f.yaxis.axis_label = "fpr"
roc_f.cross(fpr,tpr,size=5)
roc_f.line(fpr,tpr,legend="roc")
roc_f.circle(fpr_train,tpr_train,size=5,color="red", line_width=1)
roc_f.line(fpr_train,tpr_train,color="red",legend="roc on train")
roc_f.line([0,1],[0,1], color="grey")

# pr curve
pr_f = figure(width=400,height=400,title="PR curve")
pr_f.xaxis.axis_label = "recall"
pr_f.yaxis.axis_label = "precision"
pr_f.cross(recall,precision,size=5)
pr_f.line(recall,precision,legend="PR")
pr_f.circle(recall_train,precision_train,size=5,color="red", line_width=1)
pr_f.line(recall_train,precision_train,color="red",legend="PR on train")

show(row(
    pr_f,
    roc_f
))

### Feature Importance

In [357]:
feature_imp = pd.DataFrame({"importance":model.feature_importances_}).set_index(feature_set.columns)
feature_imp.sort_values(by="importance",ascending=False)

Unnamed: 0,importance
max_attribute7,0.184157
max_attribute8,0.17556
mean_attribute7,0.163564
mean_attribute8,0.117988
max_attribute4,0.111517
max_attribute2,0.057986
mean_attribute2,0.047947
mean_attribute4,0.0344
mean_attribute5,0.022434
max_attribute1,0.020665


### Hyperparameter optimisation




In [358]:
# model : GradientBoostingClassifier, parameters: 
#loss : {‘deviance’, ‘exponential’},
#learning_rate : float, optional (default=0.1)
#n_estimators : int (default=100)
#max_depth : integer, optional (default=3)
#min_samples_split : int, float, optional (default=2)
grids=dict()
XDB_param_grid = {
    #"model__loss":  ["deviance", 'exponential'],
    "model__learning_rate" : [1e-3,0.01, 0.1],
    "model__n_estimators" : [10, 50,75, 100, 250],
    "model__max_depth" : [2,3,5,10],
    "model__min_samples_split" : [2,5,10,20]
}
grids[GradientBoostingClassifier] = XDB_param_grid

In [373]:
# model : RandomForestClassifier, parameters: 
# n_estimators : int (default=100)
# criterion : "gini","entropy"
# max_features : auto , fraction
# max_depth : integer, optional (default=3)
# min_samples_split : int, float, optional (default=2)

RF_param_grid = {
    "model__criterion":  ["gini", "entropy"],
    "model__n_estimators" : [100,250],
    #"model__max_features" : ["auto",0.5,0.25,0.1],
    "model__max_depth" : [2,5,10],
    "model__min_samples_split" : [2,10,20]
}
grids[RandomForestClassifier] = RF_param_grid

In [374]:
m  = type(dict(pipeline.steps)["model"])
param_grid=grids[m]

In [383]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

kfold = StratifiedKFold(n_splits=6, shuffle=True)
grid_search = GridSearchCV(pipeline, param_grid, scoring="accuracy", n_jobs=-1, verbose=1,cv=kfold)
grid_result = grid_search.fit(feature_mat,label_mat)


# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
#for mean, stdev, param in zip(means, stds, params):
#    print("%f (%f) with: %r" % (mean, stdev, param))

Fitting 6 folds for each of 36 candidates, totalling 216 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   33.2s
[Parallel(n_jobs=-1)]: Done 216 out of 216 | elapsed:   40.9s finished


Best: 0.925514 using {'model__criterion': 'gini', 'model__max_depth': 5, 'model__min_samples_split': 10, 'model__n_estimators': 100}
