In [2]:
import pandas as pd

In [3]:
from ipywidgets import widgets
from IPython.display import display, clear_output
from jupyterthemes import get_themes
from jupyterthemes.stylefx import set_nb_theme

def update_theme(change):
    """ display chosen theme from dropdown menu """
    name = change['new']
    html_css = set_nb_theme(name)
    display(html_css)

themes = get_themes()
theme_sel = widgets.Dropdown(options=themes, description='Select a Theme:')
theme_sel.observe(update_theme, names=['selected_label'], type='change')

In [5]:
# start with first theme in themes list
display(theme_sel)
set_nb_theme(themes[1])

Widget Javascript not detected.  It may not be installed properly. Did you enable the widgetsnbextension? If not, then run "jupyter nbextension enable --py --sys-prefix widgetsnbextension"


## Download PIMA Indians Diabetes dataset

1. Number of times pregnant 
2. Plasma glucose concentration a 2 hours in an oral glucose tolerance test 
3. Diastolic blood pressure (mm Hg) 
4. Triceps skin fold thickness (mm) 
5. 2-Hour serum insulin (mu U/ml) 
6. Body mass index (weight in kg/(height in m)^2) 
7. Diabetes pedigree function 
8. Age (years) 
9. Class variable (0 or 1) 

In [2]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataset = pd.read_csv(url, names=names)

## Summarize Data

In [3]:
print(dataset.shape)
print(dataset.dtypes)

(768, 9)
preg       int64
plas       int64
pres       int64
skin       int64
test       int64
mass     float64
pedi     float64
age        int64
class      int64
dtype: object


In [4]:
print(dataset.head(20))

    preg  plas  pres  skin  test  mass   pedi  age  class
0      6   148    72    35     0  33.6  0.627   50      1
1      1    85    66    29     0  26.6  0.351   31      0
2      8   183    64     0     0  23.3  0.672   32      1
3      1    89    66    23    94  28.1  0.167   21      0
4      0   137    40    35   168  43.1  2.288   33      1
5      5   116    74     0     0  25.6  0.201   30      0
6      3    78    50    32    88  31.0  0.248   26      1
7     10   115     0     0     0  35.3  0.134   29      0
8      2   197    70    45   543  30.5  0.158   53      1
9      8   125    96     0     0   0.0  0.232   54      1
10     4   110    92     0     0  37.6  0.191   30      0
11    10   168    74     0     0  38.0  0.537   34      1
12    10   139    80     0     0  27.1  1.441   57      0
13     1   189    60    23   846  30.1  0.398   59      1
14     5   166    72    19   175  25.8  0.587   51      1
15     7   100     0     0     0  30.0  0.484   32      1
16     0   118

## Prepare Data

In [5]:
from sklearn import cross_validation



In [6]:
array = dataset.values
X = array[:,0:8]
Y = array[:,8]

In [7]:
validation_size = 0.20
seed = 7
X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X, Y, test_size=validation_size, random_state=seed)

## Tune Models

In [8]:
import numpy as np
np.random.seed(7)
import random
random.seed(7)

from sklearn.grid_search import GridSearchCV



###  Decision Tree 

In [9]:
from sklearn.tree import DecisionTreeClassifier

In [10]:
max_depth = np.arange(2,6)
min_samples_leaf = np.arange(1,10)
min_samples_split = np.arange(2,10,2)
min_impurity_split = 10.0**-np.arange(5,10)

param_grid = dict(max_depth=max_depth,min_samples_leaf=min_samples_leaf,min_samples_split=min_samples_split,min_impurity_split=min_impurity_split)
model = DecisionTreeClassifier(random_state=seed)
scoring = 'roc_auc' 
num_folds = 4
grid = GridSearchCV(cv=num_folds, estimator=model, param_grid=param_grid,scoring=scoring,n_jobs=-1)
grid.fit(X_train, Y_train)

GridSearchCV(cv=4, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=7, splitter='best'),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'min_samples_split': array([2, 4, 6, 8]), 'min_impurity_split': array([  1.00000e-05,   1.00000e-06,   1.00000e-07,   1.00000e-08,
         1.00000e-09]), 'max_depth': array([2, 3, 4, 5]), 'min_samples_leaf': array([1, 2, 3, 4, 5, 6, 7, 8, 9])},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=0)

In [11]:
print(grid.best_score_)
print(grid.best_estimator_.max_depth, grid.best_estimator_.min_samples_leaf, grid.best_estimator_.min_samples_split, grid.best_estimator_.min_impurity_split)
dtTunedMaxDepth = grid.best_estimator_.max_depth
dtTunedMinSamplesLeaf = grid.best_estimator_.min_samples_leaf
dtTunedMinSamplesSplit = grid.best_estimator_.min_samples_split
dtTunedMinImpuritySplit = grid.best_estimator_.min_impurity_split

0.788322105318
(5, 9, 2, 1.0000000000000001e-05)


## Bagging

###  Bagged Decision Tree

In [12]:
from sklearn.ensemble import BaggingClassifier

In [13]:
n_estimators  = np.arange(100,500,100)
max_samples = np.arange(1.0,0.7,-0.1)

param_grid = dict(n_estimators =n_estimators ,max_samples=max_samples)
bdt_dt = DecisionTreeClassifier(random_state=seed, max_depth=dtTunedMaxDepth,min_samples_leaf=dtTunedMinSamplesLeaf,min_samples_split=dtTunedMinSamplesSplit,min_impurity_split=dtTunedMinImpuritySplit)
model = BaggingClassifier(base_estimator=bdt_dt, random_state=seed)
scoring = 'roc_auc' 
num_folds = 4
grid = GridSearchCV(cv=num_folds, estimator=model, param_grid=param_grid,scoring=scoring,n_jobs=-1)
grid.fit(X_train, Y_train)

  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)


GridSearchCV(cv=4, error_score='raise',
       estimator=BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1.0000000000000001e-05, min_samples_leaf=9,
            min_samples_split=2, min_weight_fraction_leaf...0, n_estimators=10, n_jobs=1, oob_score=False,
         random_state=7, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'n_estimators': array([100, 200, 300, 400]), 'max_samples': array([ 1. ,  0.9,  0.8,  0.7])},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=0)

In [14]:
print(grid.best_score_)
print(grid.best_estimator_.n_estimators, grid.best_estimator_.max_samples)
bdtTunedNEstimators = grid.best_estimator_.n_estimators
bdtTunedMaxSamples = grid.best_estimator_.max_samples

0.822039297929
(400, 0.70000000000000007)


### Random Forest

In [15]:
from sklearn.ensemble import RandomForestClassifier

In [16]:
n_estimators  = np.arange(50,150,25)
max_features = np.arange(0.8,0.6,-0.1)
max_depth = np.arange(2,5)
min_samples_leaf = np.arange(1,6,2)
min_impurity_split = 10.0**-np.arange(7,9)

param_grid = dict(n_estimators=n_estimators,
                  max_features=max_features, 
                  max_depth=max_depth, 
                  min_samples_leaf=min_samples_leaf,
                  min_impurity_split=min_impurity_split)
model = RandomForestClassifier(random_state=seed)
scoring = 'roc_auc' 
num_folds = 4
grid = GridSearchCV(cv=num_folds, estimator=model, param_grid=param_grid,scoring=scoring,n_jobs=-1)
grid.fit(X_train, Y_train)

GridSearchCV(cv=4, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=7,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'max_features': array([ 0.8,  0.7,  0.6]), 'min_impurity_split': array([  1.00000e-07,   1.00000e-08]), 'n_estimators': array([ 50,  75, 100, 125]), 'max_depth': array([2, 3, 4]), 'min_samples_leaf': array([1, 3, 5])},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=0)

In [17]:
print(grid.best_score_)
print(grid.best_estimator_.n_estimators, 
      grid.best_estimator_.max_features,
      grid.best_estimator_.max_depth,
      grid.best_estimator_.min_samples_leaf,
      grid.best_estimator_.min_impurity_split)
rfTunedNEstimators = grid.best_estimator_.n_estimators
rfTunedMaxFeatures = grid.best_estimator_.max_features
rfTunedMaxDepth = grid.best_estimator_.max_depth
rfTunedMinSamplesLeaf = grid.best_estimator_.min_samples_leaf
rfTunedMinImpuritySplit = grid.best_estimator_.min_impurity_split

0.827291561066
(125, 0.70000000000000007, 4, 5, 9.9999999999999995e-08)


### Extra Trees

In [18]:
from sklearn.ensemble import ExtraTreesClassifier

In [19]:
n_estimators  = np.arange(50,150,25)
max_features = np.arange(0.8,0.6,-0.1)
max_depth = np.arange(2,5)
min_samples_leaf = np.arange(1,6,2)
min_impurity_split = 10.0**-np.arange(7,9)

param_grid = dict(n_estimators=n_estimators,
                  max_features=max_features, 
                  max_depth=max_depth, 
                  min_samples_leaf=min_samples_leaf,
                  min_impurity_split=min_impurity_split)
model = ExtraTreesClassifier(random_state=seed)
scoring = 'roc_auc' 
num_folds = 4
grid = GridSearchCV(cv=num_folds, estimator=model, param_grid=param_grid,scoring=scoring,n_jobs=-1)
grid.fit(X_train, Y_train)

GridSearchCV(cv=4, error_score='raise',
       estimator=ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=7,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'max_features': array([ 0.8,  0.7,  0.6]), 'min_impurity_split': array([  1.00000e-07,   1.00000e-08]), 'n_estimators': array([ 50,  75, 100, 125]), 'max_depth': array([2, 3, 4]), 'min_samples_leaf': array([1, 3, 5])},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=0)

In [20]:
print(grid.best_score_)
print(grid.best_estimator_.n_estimators, 
      grid.best_estimator_.max_features,
      grid.best_estimator_.max_depth,
      grid.best_estimator_.min_samples_leaf,
      grid.best_estimator_.min_impurity_split)
etTunedNEstimators = grid.best_estimator_.n_estimators
etTunedMaxFeatures = grid.best_estimator_.max_features
etTunedMaxDepth = grid.best_estimator_.max_depth
etTunedMinSamplesLeaf = grid.best_estimator_.min_samples_leaf
etTunedMinImpuritySplit = grid.best_estimator_.min_impurity_split

0.829275263608
(100, 0.70000000000000007, 3, 3, 9.9999999999999995e-08)


## Boosting

### AdaBoost

In [21]:
from sklearn.ensemble import AdaBoostClassifier

In [22]:
n_estimators  = np.arange(50,150,25)
learning_rate = np.arange(1,2,0.25)
base_estimator__max_depth = np.arange(2,5)
base_estimator__min_samples_leaf = np.arange(1,6,1)
base_estimator__min_impurity_split = 10.0**-np.arange(7,9)

param_grid = dict(n_estimators =n_estimators,
                  learning_rate=learning_rate,
                  base_estimator__max_depth=base_estimator__max_depth,
                  base_estimator__min_samples_leaf=base_estimator__min_samples_leaf,
                  base_estimator__min_impurity_split=base_estimator__min_impurity_split)
adaboost_dt = DecisionTreeClassifier(random_state=seed)
model = AdaBoostClassifier(base_estimator=adaboost_dt, random_state=seed)
scoring = 'roc_auc' 
num_folds = 4
grid = GridSearchCV(cv=num_folds, estimator=model, param_grid=param_grid,scoring=scoring,n_jobs=-1)
grid.fit(X_train, Y_train)

GridSearchCV(cv=4, error_score='raise',
       estimator=AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=7, splitter='best'),
          learning_rate=1.0, n_estimators=50, random_state=7),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'n_estimators': array([ 50,  75, 100, 125]), 'learning_rate': array([ 1.  ,  1.25,  1.5 ,  1.75]), 'base_estimator__max_depth': array([2, 3, 4]), 'base_estimator__min_samples_leaf': array([1, 2, 3, 4, 5]), 'base_estimator__min_impurity_split': array([  1.00000e-07,   1.00000e-08])},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=0)

In [23]:
print(grid.best_score_)
print(grid.best_estimator_.n_estimators, 
      grid.best_estimator_.learning_rate,
      grid.best_estimator_.base_estimator.max_depth,
      grid.best_estimator_.base_estimator.min_samples_leaf,
      grid.best_estimator_.base_estimator.min_impurity_split)
adaboostTunedNEstimators = grid.best_estimator_.n_estimators
adaboostTunedLearningRate = grid.best_estimator_.learning_rate
adaboostTunedMaxDepth = grid.best_estimator_.base_estimator.max_depth
adaboostTunedMinSamplesLeaf = grid.best_estimator_.base_estimator.min_samples_leaf
adaboostTunedMinImpuritySplit = grid.best_estimator_.base_estimator.min_impurity_split

0.803902152719
(125, 1.25, 4, 1, 9.9999999999999995e-08)


### Stochastic Gradient Boosting

In [24]:
from sklearn.ensemble import GradientBoostingClassifier

In [25]:
n_estimators  = np.arange(100,500,200)
learning_rate = 10.0**-np.arange(1,4)
max_depth = np.arange(2,5)
min_samples_leaf = np.arange(1,6,1)
min_impurity_split = 10.0**-np.arange(7,9)

param_grid = dict(n_estimators =n_estimators,
                  learning_rate=learning_rate,
                  max_depth=max_depth,
                  min_samples_leaf=min_samples_leaf,
                  min_impurity_split=min_impurity_split)
model = GradientBoostingClassifier(random_state=seed)
scoring = 'roc_auc' 
num_folds = 4
grid = GridSearchCV(cv=num_folds, estimator=model, param_grid=param_grid,scoring=scoring,n_jobs=-1)
grid.fit(X_train, Y_train)

GridSearchCV(cv=4, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=7,
              subsample=1.0, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'n_estimators': array([100, 300]), 'min_impurity_split': array([  1.00000e-07,   1.00000e-08]), 'learning_rate': array([ 0.1  ,  0.01 ,  0.001]), 'max_depth': array([2, 3, 4]), 'min_samples_leaf': array([1, 2, 3, 4, 5])},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=0)

In [26]:
print(grid.best_score_)
print(grid.best_estimator_.n_estimators, 
      grid.best_estimator_.learning_rate,
      grid.best_estimator_.max_depth,
      grid.best_estimator_.min_samples_leaf,
      grid.best_estimator_.min_impurity_split)
sgbTunedNEstimators = grid.best_estimator_.n_estimators
sgbTunedLearningRate = grid.best_estimator_.learning_rate
sgbTunedMaxDepth = grid.best_estimator_.max_depth
sgbTunedMinSamplesLeaf = grid.best_estimator_.min_samples_leaf
sgbTunedMinImpuritySplit = grid.best_estimator_.min_impurity_split

0.818444742544
(300, 0.01, 3, 1, 9.9999999999999995e-08)


### Another Implementation of Stochastic Gradient Boosting: XGBoost

In [27]:
from xgboost.sklearn import XGBClassifier

In [28]:
n_estimators  = np.arange(100,250,10)
learning_rate = np.arange(0.03,0.06,0.005)
max_depth = np.arange(2,5)
subsample = np.arange(1.0,0.75,-0.05)
colsample_bytree = np.arange(1.0,0.85,-0.05)

param_grid = dict(n_estimators =n_estimators,
                  learning_rate=learning_rate,
                  max_depth=max_depth,
                  subsample=subsample,
                  colsample_bytree=colsample_bytree)
model = XGBClassifier(seed=seed,
                      objective = "binary:logistic",
                      nthread = 1,
                      silent = True)
scoring = 'roc_auc' 
num_folds = 3
grid = GridSearchCV(cv=num_folds, estimator=model, scoring=scoring, param_grid=param_grid,n_jobs=-1)
grid.fit(X_train, Y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=7, silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'n_estimators': array([100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220,
       230, 240]), 'subsample': array([ 1.  ,  0.95,  0.9 ,  0.85,  0.8 ]), 'learning_rate': array([ 0.03 ,  0.035,  0.04 ,  0.045,  0.05 ,  0.055]), 'colsample_bytree': array([ 1.  ,  0.95,  0.9 ,  0.85]), 'max_depth': array([2, 3, 4])},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=0)

In [29]:
print(grid.best_score_)
print(grid.best_estimator_.n_estimators, 
      grid.best_estimator_.learning_rate,
      grid.best_estimator_.max_depth,
      grid.best_estimator_.subsample,
      grid.best_estimator_.colsample_bytree)
xgbTunedNEstimators = grid.best_estimator_.n_estimators
xgbTunedLearningRate = grid.best_estimator_.learning_rate
xgbTunedMaxDepth = grid.best_estimator_.max_depth
xgbTunedSubsample = grid.best_estimator_.subsample
xgbTunedColsampleBytree = grid.best_estimator_.colsample_bytree

0.837077432608
(100, 0.054999999999999986, 2, 0.79999999999999982, 0.94999999999999996)


## Voting

In [30]:
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score

In [31]:
estimators = []

model1 = BaggingClassifier(base_estimator=bdt_dt, random_state=seed, n_estimators=bdtTunedNEstimators, max_samples=bdtTunedMaxSamples)
estimators.append(('bdt',model1))
model2 = RandomForestClassifier(random_state=seed, n_estimators=rfTunedNEstimators, max_features=rfTunedMaxFeatures, max_depth=rfTunedMaxDepth, min_samples_leaf=rfTunedMinSamplesLeaf, min_impurity_split=rfTunedMinImpuritySplit)
estimators.append(('rf',model2))
#model3 = XGBClassifier(seed=seed,objective = "binary:logistic",nthread = 1,silent = True,n_estimators=xgbTunedNEstimators,learning_rate=xgbTunedLearningRate,max_depth=xgbTunedMaxDepth,subsample=xgbTunedSubsample,colsample_bytree=xgbTunedColsampleBytree)
#estimators.append(('et',model3))

voting = VotingClassifier(estimators,voting='soft')
num_folds = 4
results = cross_val_score(voting, X_train, Y_train, cv=num_folds, scoring=scoring)
print(results.mean())

0.82539015685


## Evaluate Performance

In [32]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import auc
from sklearn.metrics import roc_curve

In [33]:
seed = 7

adaboost_dt = DecisionTreeClassifier(random_state=seed,
                                     max_depth=adaboostTunedMaxDepth,
                                     min_samples_leaf=adaboostTunedMinSamplesLeaf,
                                     min_impurity_split=adaboostTunedMinImpuritySplit)

pipelines = []
pipelines.append(('DT', Pipeline([('DT', DecisionTreeClassifier(random_state=seed, max_depth=dtTunedMaxDepth,min_samples_leaf=dtTunedMinSamplesLeaf,min_samples_split=dtTunedMinSamplesSplit,min_impurity_split=dtTunedMinImpuritySplit))])))
pipelines.append(('BDT', Pipeline([('BDT', BaggingClassifier(base_estimator=bdt_dt, random_state=seed, n_estimators=bdtTunedNEstimators, max_samples=bdtTunedMaxSamples))])))
pipelines.append(('RF', Pipeline([('RF', RandomForestClassifier(random_state=seed, n_estimators=rfTunedNEstimators, max_features=rfTunedMaxFeatures, max_depth=rfTunedMaxDepth, min_samples_leaf=rfTunedMinSamplesLeaf, min_impurity_split=rfTunedMinImpuritySplit))])))
pipelines.append(('ET', Pipeline([('ET', ExtraTreesClassifier(random_state=seed, n_estimators=etTunedNEstimators, max_features=etTunedMaxFeatures, max_depth=etTunedMaxDepth, min_samples_leaf=etTunedMinSamplesLeaf, min_impurity_split=etTunedMinImpuritySplit))])))
pipelines.append(('AdaBoost', Pipeline([('AdaBoost', AdaBoostClassifier(base_estimator=adaboost_dt, random_state=seed, n_estimators=adaboostTunedNEstimators, learning_rate=adaboostTunedLearningRate))])))
pipelines.append(('SGB', Pipeline([('SGB', GradientBoostingClassifier(random_state=seed, n_estimators=sgbTunedNEstimators, learning_rate=sgbTunedLearningRate, max_depth=sgbTunedMaxDepth, min_samples_leaf=sgbTunedMinSamplesLeaf, min_impurity_split=sgbTunedMinImpuritySplit))])))
pipelines.append(('XGB', Pipeline([('XGB', XGBClassifier(seed=seed,objective = "binary:logistic",nthread = 1,silent = True,n_estimators=xgbTunedNEstimators,learning_rate=xgbTunedLearningRate,max_depth=xgbTunedMaxDepth,subsample=xgbTunedSubsample,colsample_bytree=xgbTunedColsampleBytree))])))
pipelines.append(('Voting', Pipeline([('Voting', voting)])))

In [34]:
# Make predictions on test dataset
results = []
names = []
for name, model in pipelines:
    model.fit(X_train,Y_train)
    fpr, tpr, thresholds = roc_curve(Y_test,model.predict_proba(X_test)[:,1])
    result = auc(fpr,tpr)
    results.append(result)
    names.append(name)
    msg = "%s: %f" % (name, result)
    print(msg)

DT: 0.798426
BDT: 0.858383
RF: 0.857298
ET: 0.872671
AdaBoost: 0.808826
SGB: 0.843371
XGB: 0.853681
Voting: 0.859107
