In [1]:
import pandas as pd
import numpy as np
import calendar

from bokeh.charts import output_notebook, Scatter, Bar, show, output_file, Line, BoxPlot, Scatter
from bokeh.plotting import figure
from bokeh.layouts import row, column, gridplot

In [2]:
import pandas as pd
import numpy as np
import calendar
from bokeh.charts import output_notebook, Scatter, Bar, show, output_file, Line, BoxPlot, Scatter
from bokeh.plotting import figure
from bokeh.layouts import column, row
from bokeh.io import hplot
output_notebook() 

In [3]:
INPUT="data/device_failure.csv" 
dataset = pd.read_csv(INPUT,index_col=[0,1],parse_dates=[0])

## per device model

 - Set up first model
 - Precision/recall, ROC
 - Calibration
 - PCA ?
 - feature engineering
 - data cleaning
 - Test other models

## Build Training set

In [4]:
# fitering methods
suspicious_positives = set(["S1F0GPFZ", "S1F136J0", "W1F0KCP2", "W1F0M35B", "W1F11ZG9"])
def filter_devices(df):
    return df.filter(axis="index",items=(set(df.index) - suspicious_positives)) 

In [5]:
# feature preprocessing
def build_deriv(df,c,n=1):
    def per_device(per_device):
        clean_index = per_device.reset_index(level=1,drop=True)
        resampled=clean_index.resample('1D',"pad")
        
        raw_diff = np.diff(resampled,n=n)
        #fill the series start with zeros
        while (len(raw_diff) < len(resampled)):
            raw_diff = np.insert(raw_diff,0,0)
        d = pd.Series(data=raw_diff,index = resampled.index )
        #print d
        return d.dropna()[d>0]
    some_d= df[c].groupby(level="device").apply(per_device)
    return some_d.swaplevel()

In [6]:
from scipy.fftpack import fft
#fft_df = feature_dset[[attribute]].copy()

def to_fft(df):
    resampled =  df.resample("1D",level="date").mean().fillna(method='pad')
    n = len(resampled)
    return np.abs(fft(resampled))[n//2:]
def fft_line(df):
    return df.groupby(level="device",sort=True).transform(to_fft)

def peaks(line):
    sorted_by_used = sorted(enumerate(line),key = lambda  t: t[1], reverse=True )
    boundaries = set()
    peaks = []
    for i,value in sorted_by_used:
        if i not in boundaries:
            peaks.append((i,value))
        # in any case, i neighbors cannot be peaks now.
        boundaries.add(i+1)
        boundaries.add(i-1)
    return peaks
            
    
def fft_peak(df,p=0,index_no_value=True):
    fft = fft_line(df)
    all_peaks = peaks(fft.tolist())
    if (len(all_peaks) > p):
        return all_peaks[p][0 if index_no_value else 1]
    else:
        return 0

In [7]:
def pre_filter(df):
    res = df.copy()
    del res["attribute1"]
    del res["attribute3"]
    #del res["attribute5"]
    dt_list = ["attribute2"]#,"attribute8"]
    for c in dt_list:
        deriv = build_deriv(res,c)
        res["dt_%s" % c] = deriv
        res["dt2_%s" % c] =  build_deriv(res,c,2)
    return res.fillna(0)

def post_filter(df):
    res = df.copy()
    res = filter_devices(res)
    for col in res.columns:
        if "min" in col:
            del res[col]
        if "std" in col:
            del res[col]
    return res

In [8]:
pre_dataset = pre_filter(dataset)
#print feature_set.columns

features = [f for f in pre_dataset.columns if "att" in f]
def f_to_dict(feature):
    d = {
            "min_%s" % feature:np.min,
            "max_%s" % feature:np.max,
            "mean_%s" % feature :np.mean,
            "std_%s" % feature:np.std
        }
    dft_list = ["attribute4","attribute5", "attribute6","attribute7","attribute9"]
    if feature in dft_list:
        d["dft_p0_ind%s" % feature] = lambda r : fft_peak(r,p=0,index_no_value=True)
        d["dft_p0_val%s" % feature] = lambda r : fft_peak(r,p=0,index_no_value=False)
        #d["dft_p1_ind%s" % feature] = lambda r : fft_peak(r,p=1,index_no_value=True)
        #d["dft_p1_val%s" % feature] = lambda r : fft_peak(r,p=1,index_no_value=False)
    return d

agg_dict = dict( (f,f_to_dict(f)) for f in features )
#print agg_dict

the new syntax is .resample(...).pad()


In [9]:
feature_set = pre_dataset.groupby(level="device").agg(agg_dict)
feature_set.columns = feature_set.columns.droplevel()
feature_set = post_filter(feature_set)

# feature filtering
# feature filtering
#try : 
#    filtered = feature_set.filter(items=feature_imp.index)
#    print "filtering devices"
#    feature_set = filtered
#except:
#    print "no feature filtering"

label_set =  dataset[["failure"]].groupby(level="device").sum()
label_set = filter_devices(label_set)
feature_mat = feature_set.to_sparse().as_matrix()
label_mat = label_set.as_matrix().ravel()

## Run model

In [11]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import Normalizer
from sklearn.metrics import roc_curve, auc
from sklearn.svm import SVC, NuSVC

#model = GradientBoostingClassifier()
pca = PCA()#n_components="mle",svd_solver="full")
norm = Normalizer()
#model=GradientBoostingClassifier()
model = RandomForestClassifier()
#model = SVC(probability=True)
pipeline= Pipeline([('normalize', norm),('reduce_dim', pca),("model",model)])
try:
    # use best parameters if available
    #
    #pipeline.set_params(**grid_result.best_params_)
    print "using last optimized model"
except:
    print "no optim result, or bad ones: let's keep the default ones"
    pass
scores = cross_val_score(pipeline, feature_mat, label_mat,cv=3,verbose=1,scoring="accuracy",n_jobs=6)                          
print "accurracy: %g, std(%g))" % (scores.mean(), scores.std())

using last optimized model
accurracy: 0.926039, std(0.0137847))


[Parallel(n_jobs=6)]: Done   3 out of   3 | elapsed:    0.1s finished


In [12]:
pipeline.get_params()

{'model': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
             max_depth=None, max_features='auto', max_leaf_nodes=None,
             min_impurity_split=1e-07, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
             verbose=0, warm_start=False),
 'model__bootstrap': True,
 'model__class_weight': None,
 'model__criterion': 'gini',
 'model__max_depth': None,
 'model__max_features': 'auto',
 'model__max_leaf_nodes': None,
 'model__min_impurity_split': 1e-07,
 'model__min_samples_leaf': 1,
 'model__min_samples_split': 2,
 'model__min_weight_fraction_leaf': 0.0,
 'model__n_estimators': 10,
 'model__n_jobs': 1,
 'model__oob_score': False,
 'model__random_state': None,
 'model__verbose': 0,
 'model__warm_start': False,
 'normalize': Normalizer(copy=True, norm='l2'),
 'normalize__copy': True,
 'normalize__norm': 'l2',
 'reduce_dim': PCA(copy=T

### Eval Model

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, accuracy_score,precision_recall_curve, auc

X_train, X_test, Y_train, Y_test = train_test_split(feature_mat,label_mat,test_size=0.3)
# calculate the fpr and tpr for all thresholds of the classification

fitted = pipeline.fit(X_train,Y_train)
probs = fitted.predict_proba(X_test)
preds = probs[:,1]
preds_train = fitted.predict_proba(X_train)[:,1]
fpr, tpr, threshold = roc_curve(Y_test, preds)
fpr_train, tpr_train, threshold_train = roc_curve(Y_train, preds_train)
roc_auc = auc(fpr, tpr)
roc_auc_train = auc(fpr_train, tpr_train)
precision, recall, ths = precision_recall_curve(Y_test, preds)
precision_train, recall_train, ths_train = precision_recall_curve(Y_train, preds_train)

In [14]:
from bokeh.models.ranges import Range1d
#print "auc: %.2g, on train: %.2g" %(roc_auc, roc_auc_train)
roc_df = pd.DataFrame({"fpr":fpr,"tpr":tpr}).set_index("fpr")
pr_df = pd.DataFrame({"precision": precision, "recall":recall}).set_index("recall")
roc_df["diag"] = roc_df.index
pr_df["random"] = pr_df.precision.iloc[0]

# roc curve
roc_f = figure(width=400,height=400,title="roc, auc: %.2g, on train: %.2g"  %(roc_auc, roc_auc_train) )
roc_f.xaxis.axis_label = "tpr"
auc_range= Range1d(0,1)
roc_f.x_range = auc_range 
roc_f.y_range = auc_range 
roc_f.yaxis.axis_label = "fpr"
roc_f.cross(fpr,tpr,size=5)
roc_f.line(fpr,tpr,legend="roc")
roc_f.circle(fpr_train,tpr_train,size=5,color="red", line_width=1)
roc_f.line(fpr_train,tpr_train,color="red",legend="roc on train")
roc_f.line([0,1],[0,1], color="grey")

# pr curve
pr_f = figure(width=400,height=400,title="PR curve")
pr_f.xaxis.axis_label = "recall"
pr_f.yaxis.axis_label = "precision"
pr_f.cross(recall,precision,size=5)
pr_f.line(recall,precision,legend="PR")
pr_f.circle(recall_train,precision_train,size=5,color="red", line_width=1)
pr_f.line(recall_train,precision_train,color="red",legend="PR on train")

show(row(
    pr_f,
    roc_f
))

### Feature Importance

In [15]:
feature_imp = pd.DataFrame({"importance":model.feature_importances_}).set_index(feature_set.columns)
feature_imp.sort_values(by="importance",ascending=False)

Unnamed: 0,importance
mean_attribute4,0.123525
dft_p0_indattribute4,0.088753
max_attribute7,0.065422
max_attribute4,0.061774
mean_attribute7,0.060553
dft_p0_indattribute9,0.060123
dft_p0_valattribute6,0.04908
max_attribute6,0.045222
max_dt2_attribute2,0.04472
dft_p0_valattribute7,0.035299


### Hyperparameter optimisation




In [16]:
# model : GradientBoostingClassifier, parameters: 
#loss : {‘deviance’, ‘exponential’},
#learning_rate : float, optional (default=0.1)
#n_estimators : int (default=100)
#max_depth : integer, optional (default=3)
#min_samples_split : int, float, optional (default=2)
grids=dict()
XDB_param_grid = {
    #"model__loss":  ["deviance", 'exponential'],
    "model__learning_rate" : [1e-3,0.01, 0.1],
    "model__n_estimators" : [10, 50, 100, 150],
    "model__max_depth" : [5,10,15],
    "model__min_samples_split" : [5,10,20]
}
grids[GradientBoostingClassifier] = XDB_param_grid

In [17]:
# model : RandomForestClassifier, parameters: 
# n_estimators : int (default=100)
# criterion : "gini","entropy"
# max_features : auto , fraction
# max_depth : integer, optional (default=3)
# min_samples_split : int, float, optional (default=2)

RF_param_grid = {
    #"model__criterion":  ["gini", "entropy"],
    "model__n_estimators" : [75,100,150,200],
    #"model__max_features" : ["auto",0.5,0.25,0.1],
    "model__max_depth" : [2,5,10,20],
    "model__min_samples_split" : [5,10,20]
}
grids[RandomForestClassifier] = RF_param_grid

In [18]:
# C :penalty
# kernel : ‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’ 
SVC_param_grid = {
   'model__C': [1e-7,1e-6,1e-5,0.1],
    "model__kernel": ["rbf","linear"],
    #"model_degree" : [1,3,5], # polynomial degrees
    "model__gamma" : ["auto"], # kernel coef (rbf)
    #"coef0" # for poly, signmoid
    "model__tol" : [1e-7,1e-6,1e-5, 1e-4,1e-3]
}
grids[SVC] = SVC_param_grid

In [19]:
m  = type(dict(pipeline.steps)["model"])
param_grid=grids[m]

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

kfold = StratifiedKFold(n_splits=6, shuffle=True)
grid_search = GridSearchCV(pipeline, param_grid, scoring="accuracy", n_jobs=-1, verbose=1,cv=kfold)
grid_result = grid_search.fit(feature_mat,label_mat)


# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
#for mean, stdev, param in zip(means, stds, params):
#    print("%f (%f) with: %r" % (mean, stdev, param))

Fitting 6 folds for each of 48 candidates, totalling 288 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   33.1s
[Parallel(n_jobs=-1)]: Done 288 out of 288 | elapsed:   55.4s finished


Best: 0.941531 using {'model__min_samples_split': 5, 'model__max_depth': 10, 'model__n_estimators': 100}


In [None]:
pipeline.get_params()

In [28]:
from tpot import TPOTClassifier
pipeline_optimizer = TPOTClassifier(
    generations=20, # the more generatiion, the more optimized you get
    population_size=100,
    num_cv_folds=4,
    scoring="accuracy",
    random_state=42,
    verbosity=3)

pipeline_optimizer.fit(feature_mat, label_mat)
print pipeline_optimizer.score(feature_mat, label_mat)
pipeline_optimizer.export('tpot_longrun_exported_pipeline.py')

          n Progress:   5%|▍         | 100/2100 [03:17<57:09,  1.71s/pipeline]n Progress:   0%|          | 3/2100 [00:06<1:13:39,  2.11s/pipeline]                    Optimization Progress:   5%|▍         | 100/2100 [03:17<57:09,  1.71s/pipeline]                    Optimization Progress:   5%|▍         | 100/2100 [03:17<57:09,  1.71s/pipeline]

Generation 1 - Current Pareto front scores:
1	0.956155658578	XGBClassifier(input_matrix, 23, 4, 12.0, 0.93000000000000005)
2	0.960460098824	GradientBoostingClassifier(MultinomialNB(input_matrix, 0.029999999999999999), 0.29999999999999999, 0.76000000000000001)



          n Progress:   9%|▉         | 190/2100 [08:43<2:33:29,  4.82s/pipeline]imization Progress:   5%|▍         | 102/2100 [03:20<52:40,  1.58s/pipeline]                    Optimization Progress:   9%|▉         | 190/2100 [08:43<2:33:29,  4.82s/pipeline]                    Optimization Progress:   9%|▉         | 190/2100 [08:43<2:33:29,  4.82s/pipeline]

Generation 2 - Current Pareto front scores:
1	0.957867987345	XGBClassifier(input_matrix, 23, 4, 0.16, 0.93000000000000005)
2	0.960460098824	GradientBoostingClassifier(MultinomialNB(input_matrix, 0.029999999999999999), 0.29999999999999999, 0.76000000000000001)



          n Progress:  14%|█▍        | 297/2100 [11:07<42:03,  1.40s/pipeline] Optimization Progress:  10%|▉         | 202/2100 [08:44<1:22:10,  2.60s/pipeline]                    Optimization Progress:  14%|█▍        | 297/2100 [11:07<42:03,  1.40s/pipeline]                    Optimization Progress:  14%|█▍        | 297/2100 [11:07<42:03,  1.40s/pipeline]

Generation 3 - Current Pareto front scores:
1	0.95960393444	XGBClassifier(input_matrix, 23, 4, 0.33000000000000002, 0.93000000000000005)
2	0.960460098824	GradientBoostingClassifier(MultinomialNB(input_matrix, 0.029999999999999999), 0.29999999999999999, 0.76000000000000001)



Optimization Progress:  19%|█▉        | 394/2100 [12:36<26:28,  1.07pipeline/s]timization Progress:  14%|█▍        | 302/2100 [11:09<28:34,  1.05pipeline/s]                    Optimization Progress:  19%|█▉        | 394/2100 [12:36<26:28,  1.07pipeline/s]                    Optimization Progress:  19%|█▉        | 394/2100 [12:36<26:28,  1.07pipeline/s]          

Generation 4 - Current Pareto front scores:
1	0.95960393444	XGBClassifier(input_matrix, 23, 4, 0.33000000000000002, 0.93000000000000005)
2	0.960460098824	GradientBoostingClassifier(MultinomialNB(input_matrix, 0.029999999999999999), 0.29999999999999999, 0.76000000000000001)
3	0.960468965842	GradientBoostingClassifier(XGBClassifier(GaussianNB(input_matrix), 39, 0, 0.65000000000000002, 47.0), 0.58999999999999997, 0.37)



          n Progress:  23%|██▎       | 487/2100 [13:40<15:45,  1.71pipeline/s] timization Progress:  19%|█▉        | 403/2100 [12:37<18:01,  1.57pipeline/s]                    Optimization Progress:  23%|██▎       | 487/2100 [13:40<15:45,  1.71pipeline/s]                    Optimization Progress:  23%|██▎       | 487/2100 [13:40<15:45,  1.71pipeline/s]

Generation 5 - Current Pareto front scores:
1	0.95960393444	XGBClassifier(input_matrix, 23, 4, 0.33000000000000002, 0.93000000000000005)
2	0.96303153412	GradientBoostingClassifier(XGBClassifier(input_matrix, 39, 0, 0.65000000000000002, 47.0), 0.58999999999999997, 0.37)



          n Progress:  28%|██▊       | 593/2100 [15:25<19:04,  1.32pipeline/s] timization Progress:  24%|██▍       | 502/2100 [13:41<12:14,  2.18pipeline/s]                    Optimization Progress:  28%|██▊       | 593/2100 [15:25<19:04,  1.32pipeline/s]                    Optimization Progress:  28%|██▊       | 593/2100 [15:25<19:04,  1.32pipeline/s]

Generation 6 - Current Pareto front scores:
1	0.959606876586	XGBClassifier(input_matrix, 23, 4, 0.62, 0.93000000000000005)
2	0.96303153412	GradientBoostingClassifier(XGBClassifier(input_matrix, 39, 0, 0.65000000000000002, 47.0), 0.58999999999999997, 0.37)



          n Progress:  33%|███▎      | 690/2100 [16:38<16:15,  1.45pipeline/s] timization Progress:  29%|██▊       | 602/2100 [15:26<14:13,  1.76pipeline/s]                    Optimization Progress:  33%|███▎      | 690/2100 [16:38<16:15,  1.45pipeline/s]                    Optimization Progress:  33%|███▎      | 690/2100 [16:38<16:15,  1.45pipeline/s]

Generation 7 - Current Pareto front scores:
1	0.963028571684	XGBClassifier(input_matrix, 23, 4, 0.94000000000000006, 0.93000000000000005)
2	0.96303153412	GradientBoostingClassifier(XGBClassifier(input_matrix, 39, 0, 0.65000000000000002, 47.0), 0.58999999999999997, 0.37)



          n Progress:  38%|███▊      | 790/2100 [18:19<21:02,  1.04pipeline/s] timization Progress:  33%|███▎      | 702/2100 [16:39<14:25,  1.62pipeline/s]                    Optimization Progress:  38%|███▊      | 790/2100 [18:19<21:02,  1.04pipeline/s]                    Optimization Progress:  38%|███▊      | 790/2100 [18:19<21:02,  1.04pipeline/s]

Generation 8 - Current Pareto front scores:
1	0.963028571684	XGBClassifier(input_matrix, 23, 4, 0.94000000000000006, 0.93000000000000005)
2	0.96303153412	GradientBoostingClassifier(XGBClassifier(input_matrix, 39, 0, 0.65000000000000002, 47.0), 0.58999999999999997, 0.37)



          n Progress:  42%|████▏     | 890/2100 [19:59<1:01:35,  3.05s/pipeline]imization Progress:  38%|███▊      | 802/2100 [18:21<16:08,  1.34pipeline/s]                    Optimization Progress:  42%|████▏     | 890/2100 [19:59<1:01:35,  3.05s/pipeline]                    Optimization Progress:  42%|████▏     | 890/2100 [19:59<1:01:35,  3.05s/pipeline]

Generation 9 - Current Pareto front scores:
1	0.963028571684	XGBClassifier(input_matrix, 23, 4, 0.94000000000000006, 0.93000000000000005)
2	0.963043343284	GradientBoostingClassifier(XGBClassifier(input_matrix, 39, 0, 0.65000000000000002, 0.91000000000000003), 0.80000000000000004, 0.070000000000000007)



          n Progress:  47%|████▋     | 991/2100 [21:03<09:17,  1.99pipeline/s] Optimization Progress:  43%|████▎     | 902/2100 [20:01<32:44,  1.64s/pipeline]                    Optimization Progress:  47%|████▋     | 991/2100 [21:03<09:17,  1.99pipeline/s]                    Optimization Progress:  47%|████▋     | 991/2100 [21:03<09:17,  1.99pipeline/s]

Generation 10 - Current Pareto front scores:
1	0.963028571684	XGBClassifier(input_matrix, 23, 4, 0.94000000000000006, 0.93000000000000005)
2	0.963043343284	GradientBoostingClassifier(XGBClassifier(input_matrix, 39, 0, 0.65000000000000002, 0.91000000000000003), 0.80000000000000004, 0.070000000000000007)



          n Progress:  52%|█████▏    | 1092/2100 [22:10<07:04,  2.37pipeline/s] timization Progress:  48%|████▊     | 1002/2100 [21:05<07:12,  2.54pipeline/s]                    Optimization Progress:  52%|█████▏    | 1092/2100 [22:10<07:04,  2.37pipeline/s]                    Optimization Progress:  52%|█████▏    | 1092/2100 [22:10<07:04,  2.37pipeline/s]

Generation 11 - Current Pareto front scores:
1	0.963028571684	XGBClassifier(input_matrix, 23, 4, 0.94000000000000006, 0.93000000000000005)
2	0.963881793922	GradientBoostingClassifier(XGBClassifier(input_matrix, 39, 0, 0.65000000000000002, 0.87), 0.80000000000000004, 0.070000000000000007)



Optimization Progress:  57%|█████▋    | 1187/2100 [23:30<09:19,  1.63pipeline/s]timization Progress:  52%|█████▏    | 1102/2100 [22:13<12:46,  1.30pipeline/s]                    Optimization Progress:  57%|█████▋    | 1187/2100 [23:30<09:19,  1.63pipeline/s]                    Optimization Progress:  57%|█████▋    | 1187/2100 [23:30<09:19,  1.63pipeline/s]          

Generation 12 - Current Pareto front scores:
1	0.963028571684	XGBClassifier(input_matrix, 23, 4, 0.94000000000000006, 0.93000000000000005)
2	0.963881793922	GradientBoostingClassifier(XGBClassifier(input_matrix, 39, 0, 0.65000000000000002, 0.87), 0.80000000000000004, 0.070000000000000007)
3	0.964740920741	XGBClassifier(CombineDFs(VarianceThreshold(input_matrix, 100.0), LinearSVC(input_matrix, 0.78000000000000003, 8, True)), 23, 4, 0.94000000000000006, 0.93000000000000005)



Optimization Progress:  61%|██████▏   | 1289/2100 [24:37<05:40,  2.38pipeline/s]timization Progress:  57%|█████▋    | 1202/2100 [23:31<08:15,  1.81pipeline/s]                    Optimization Progress:  61%|██████▏   | 1289/2100 [24:37<05:40,  2.38pipeline/s]                    Optimization Progress:  61%|██████▏   | 1289/2100 [24:37<05:40,  2.38pipeline/s]          

Generation 13 - Current Pareto front scores:
1	0.963028571684	XGBClassifier(input_matrix, 23, 4, 0.94000000000000006, 0.93000000000000005)
2	0.963881793922	GradientBoostingClassifier(XGBClassifier(input_matrix, 39, 0, 0.65000000000000002, 0.87), 0.80000000000000004, 0.070000000000000007)
3	0.964740920741	XGBClassifier(CombineDFs(VarianceThreshold(input_matrix, 100.0), LinearSVC(input_matrix, 0.78000000000000003, 8, True)), 23, 4, 0.94000000000000006, 0.93000000000000005)



Optimization Progress:  66%|██████▋   | 1395/2100 [26:03<07:48,  1.51pipeline/s]timization Progress:  62%|██████▏   | 1302/2100 [24:38<04:52,  2.73pipeline/s]                    Optimization Progress:  66%|██████▋   | 1395/2100 [26:03<07:48,  1.51pipeline/s]                    Optimization Progress:  66%|██████▋   | 1395/2100 [26:03<07:48,  1.51pipeline/s]          

Generation 14 - Current Pareto front scores:
1	0.963028571684	XGBClassifier(input_matrix, 23, 4, 0.94000000000000006, 0.93000000000000005)
2	0.963881814212	XGBClassifier(CombineDFs(input_matrix, LinearSVC(input_matrix, 0.78000000000000003, 8, True)), 23, 4, 0.94000000000000006, 0.93000000000000005)
3	0.964740920741	XGBClassifier(CombineDFs(VarianceThreshold(input_matrix, 100.0), LinearSVC(input_matrix, 0.78000000000000003, 8, True)), 23, 4, 0.94000000000000006, 0.93000000000000005)



Optimization Progress:  71%|███████   | 1489/2100 [27:49<04:53,  2.08pipeline/s]timization Progress:  67%|██████▋   | 1402/2100 [26:04<05:49,  1.99pipeline/s]                    Optimization Progress:  71%|███████   | 1489/2100 [27:49<04:53,  2.08pipeline/s]                    Optimization Progress:  71%|███████   | 1489/2100 [27:49<04:53,  2.08pipeline/s]          

Generation 15 - Current Pareto front scores:
1	0.963028571684	XGBClassifier(input_matrix, 23, 4, 0.94000000000000006, 0.93000000000000005)
2	0.963881814212	XGBClassifier(CombineDFs(input_matrix, LinearSVC(input_matrix, 0.78000000000000003, 8, True)), 23, 4, 0.94000000000000006, 0.93000000000000005)
3	0.964740920741	XGBClassifier(CombineDFs(VarianceThreshold(input_matrix, 100.0), LinearSVC(input_matrix, 0.78000000000000003, 8, True)), 23, 4, 0.94000000000000006, 0.93000000000000005)



Optimization Progress:  75%|███████▌  | 1585/2100 [29:07<03:57,  2.17pipeline/s]timization Progress:  72%|███████▏  | 1502/2100 [27:53<07:46,  1.28pipeline/s]                    Optimization Progress:  75%|███████▌  | 1585/2100 [29:07<03:57,  2.17pipeline/s]                    Optimization Progress:  75%|███████▌  | 1585/2100 [29:07<03:57,  2.17pipeline/s]          

Generation 16 - Current Pareto front scores:
1	0.963028571684	XGBClassifier(input_matrix, 23, 4, 0.94000000000000006, 0.93000000000000005)
2	0.963881814212	XGBClassifier(CombineDFs(input_matrix, LinearSVC(input_matrix, 0.78000000000000003, 8, True)), 23, 4, 0.94000000000000006, 0.93000000000000005)
3	0.964740920741	XGBClassifier(CombineDFs(VarianceThreshold(input_matrix, 100.0), LinearSVC(input_matrix, 0.78000000000000003, 8, True)), 23, 4, 0.94000000000000006, 0.93000000000000005)



Optimization Progress:  81%|████████  | 1694/2100 [31:03<06:11,  1.09pipeline/s]timization Progress:  76%|███████▋  | 1602/2100 [29:09<03:10,  2.61pipeline/s]                    Optimization Progress:  81%|████████  | 1694/2100 [31:03<06:11,  1.09pipeline/s]                    Optimization Progress:  81%|████████  | 1694/2100 [31:03<06:11,  1.09pipeline/s]          

Generation 17 - Current Pareto front scores:
1	0.963028571684	XGBClassifier(input_matrix, 23, 4, 0.94000000000000006, 0.93000000000000005)
2	0.963881814212	XGBClassifier(CombineDFs(input_matrix, LinearSVC(input_matrix, 0.78000000000000003, 8, True)), 23, 4, 0.94000000000000006, 0.93000000000000005)
3	0.964740920741	XGBClassifier(CombineDFs(VarianceThreshold(input_matrix, 100.0), LinearSVC(input_matrix, 0.78000000000000003, 8, True)), 23, 4, 0.94000000000000006, 0.93000000000000005)



          n Progress:  85%|████████▌ | 1795/2100 [33:37<06:33,  1.29s/pipeline]ptimization Progress:  81%|████████  | 1702/2100 [31:11<16:19,  2.46s/pipeline]                    Optimization Progress:  85%|████████▌ | 1795/2100 [33:37<06:33,  1.29s/pipeline]                    Optimization Progress:  85%|████████▌ | 1795/2100 [33:37<06:33,  1.29s/pipeline]

Generation 18 - Current Pareto front scores:
1	0.963028571684	XGBClassifier(input_matrix, 23, 4, 0.94000000000000006, 0.93000000000000005)
2	0.964740920741	XGBClassifier(CombineDFs(input_matrix, LinearSVC(input_matrix, 0.17999999999999999, 8, True)), 23, 4, 0.94000000000000006, 0.93000000000000005)



          n Progress:  90%|████████▉ | 1889/2100 [36:02<04:04,  1.16s/pipeline]ptimization Progress:  86%|████████▌ | 1802/2100 [33:40<04:39,  1.07pipeline/s]                    Optimization Progress:  90%|████████▉ | 1889/2100 [36:02<04:04,  1.16s/pipeline]                    Optimization Progress:  90%|████████▉ | 1889/2100 [36:02<04:04,  1.16s/pipeline]

Generation 19 - Current Pareto front scores:
1	0.963028571684	XGBClassifier(input_matrix, 23, 4, 0.94000000000000006, 0.93000000000000005)
2	0.964740920741	XGBClassifier(CombineDFs(input_matrix, LinearSVC(input_matrix, 0.17999999999999999, 8, True)), 23, 4, 0.94000000000000006, 0.93000000000000005)



          n Progress:  95%|█████████▍| 1991/2100 [37:03<00:44,  2.45pipeline/s]ptimization Progress:  91%|█████████ | 1902/2100 [36:04<03:53,  1.18s/pipeline]                    Optimization Progress:  95%|█████████▍| 1991/2100 [37:03<00:44,  2.45pipeline/s]                    Optimization Progress:  95%|█████████▍| 1991/2100 [37:03<00:44,  2.45pipeline/s]

Generation 20 - Current Pareto front scores:
1	0.963028571684	XGBClassifier(input_matrix, 23, 4, 0.94000000000000006, 0.93000000000000005)
2	0.964740920741	XGBClassifier(CombineDFs(input_matrix, LinearSVC(input_matrix, 0.17999999999999999, 8, True)), 23, 4, 0.94000000000000006, 0.93000000000000005)



                                                                                Optimization Progress:  95%|█████████▌| 2002/2100 [37:05<00:57,  1.70pipeline/s]

1.0


In [26]:
pipeline_optimizer.get_params

<bound method TPOTClassifier.get_params of TPOTClassifier(crossover_rate=0.05, disable_update_check=False, generations=1,
        max_eval_time_mins=5, max_time_mins=None, mutation_rate=0.9,
        num_cv_folds=4, population_size=20, random_state=42, scoring=None,
        verbosity=2)>