In [1]:
## Import Modules
import os
import sys
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from patsy import dmatrices
from sklearn.metrics import confusion_matrix
import sklearn
from sklearn import datasets
#import seaborn as sns

In [2]:
## Set default figure size to be larger 
## this may only work in matplotlib 2.0+!
matplotlib.rcParams['figure.figsize'] = [10.0,6.0]
## Enable multiple outputs from jupyter cells
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Import data

In [3]:
data = pd.read_csv('wdbc.data',header=None,names = ['id_number', 'diagnosis', 'radius_mean', 
         'texture_mean', 'perimeter_mean', 'area_mean', 
         'smoothness_mean', 'compactness_mean', 
         'concavity_mean','concave_points_mean', 
         'symmetry_mean', 'fractal_dimension_mean',
         'radius_se', 'texture_se', 'perimeter_se', 
         'area_se', 'smoothness_se', 'compactness_se', 
         'concavity_se', 'concave_points_se', 
         'symmetry_se', 'fractal_dimension_se', 
         'radius_worst', 'texture_worst', 
         'perimeter_worst', 'area_worst', 
         'smoothness_worst', 'compactness_worst', 
         'concavity_worst', 'concave_points_worst', 
         'symmetry_worst', 'fractal_dimension_worst'])

In [4]:
data.head()

Unnamed: 0,id_number,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave_points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [5]:
data['diagnosis'].replace("M", 0, inplace=True)
data['diagnosis'].replace("B", 1, inplace=True)

In [6]:
data.head()

Unnamed: 0,id_number,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave_points_worst,symmetry_worst,fractal_dimension_worst
0,842302,0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,0,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,0,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,0,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,0,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


# Create model matrices

In [7]:
vars = ['radius_mean','texture_mean','perimeter_mean',
               'area_mean','smoothness_mean','compactness_mean','concavity_mean','concave_points_mean','symmetry_mean','fractal_dimension_mean']
formula = "diagnosis ~ "  + " + ".join(vars)
formula

'diagnosis ~ radius_mean + texture_mean + perimeter_mean + area_mean + smoothness_mean + compactness_mean + concavity_mean + concave_points_mean + symmetry_mean + fractal_dimension_mean'

In [8]:
## use Patsy to create model matrices
Y,X = dmatrices(formula,
                data)

In [9]:
Y

DesignMatrix with shape (569, 1)
  diagnosis
          0
          0
          0
          0
          0
          0
          0
          0
          0
          0
          0
          0
          0
          0
          0
          0
          0
          0
          0
          1
          1
          1
          0
          0
          0
          0
          0
          0
          0
          0
  [539 rows omitted]
  Terms:
    'diagnosis' (column 0)
  (to view full data, use np.asarray(this_obj))

In [10]:
X

DesignMatrix with shape (569, 11)
  Columns:
    ['Intercept',
     'radius_mean',
     'texture_mean',
     'perimeter_mean',
     'area_mean',
     'smoothness_mean',
     'compactness_mean',
     'concavity_mean',
     'concave_points_mean',
     'symmetry_mean',
     'fractal_dimension_mean']
  Terms:
    'Intercept' (column 0)
    'radius_mean' (column 1)
    'texture_mean' (column 2)
    'perimeter_mean' (column 3)
    'area_mean' (column 4)
    'smoothness_mean' (column 5)
    'compactness_mean' (column 6)
    'concavity_mean' (column 7)
    'concave_points_mean' (column 8)
    'symmetry_mean' (column 9)
    'fractal_dimension_mean' (column 10)
  (to view full data, use np.asarray(this_obj))

# Establish 20% test sample

In [11]:
## Split Data into 80% training and 20% test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = \
    train_test_split(X,
                     np.ravel(Y), # prevents dimensionality error later! change dimention
                     test_size=0.2,
                     random_state=0)

In [12]:
X_test.shape
#114 observations in test data

(114, 11)

# Logistic regression with no regularization

In [13]:
## import linear model
from sklearn import linear_model
## Define model parameters
## can implement penalties, but check docs for appropriate solver
clf = linear_model.LogisticRegression(fit_intercept=True, # already have the intercept
                                      solver='liblinear') # could change to lbfgs!
## fit model using data with .fit
clf.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

In [14]:
clf.coef_
clf.coef_.shape

array([[ 0.70576824,  4.06963388, -0.17472315, -0.43201519, -0.02369288,
        -0.41293936, -1.04197055, -1.43770159, -0.76568274, -0.60092706,
        -0.08808622]])

(1, 11)

In [15]:
## get mean accuracy
clf.score(X_train,y_train) 

array(0.90769231)

In [16]:
## get confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_train,
                 clf.predict(X_train))

array([[135,  30],
       [ 12, 278]])

In [17]:
# Get kappa
sklearn.metrics.cohen_kappa_score(y_train,
                                  clf.predict(X_train))

0.7955056179775281

In [18]:
## Create dict to store all these results:
result_scores = {}
## Score the Model on Training and Testing Set
result_scores['Logistic'] = \
            (sklearn.metrics.accuracy_score(y_train,clf.predict(X_train)),
             sklearn.metrics.accuracy_score(y_test,clf.predict(X_test)))

In [19]:
## Create Function to Print Results
def get_results(x1):
    print("\n{0:20}   {1:4}    {2:4}".format('Model','Train','Test'))
    print('-------------------------------------------')
    for i in x1.keys():
        print("{0:20}   {1:<6.4}   {2:<6.4}".format(i,x1[i][0],x1[i][1]))

In [20]:
get_results(result_scores)


Model                  Train    Test
-------------------------------------------
Logistic               0.9077   0.9211


# Null Model

In [21]:
## Dummy classifier
from sklearn.dummy import DummyClassifier
clf = DummyClassifier(strategy='most_frequent',
                      random_state=0)
clf.fit(X_train, y_train)
clf.score(X_train, y_train)  

DummyClassifier(constant=None, random_state=0, strategy='most_frequent')

array(0.63736264)

In [22]:
## Score the Model on Training and Testing Set
result_scores['Null'] = \
            (sklearn.metrics.accuracy_score(y_train,clf.predict(X_train)),
             sklearn.metrics.accuracy_score(y_test,clf.predict(X_test)))

In [23]:
get_results(result_scores)


Model                  Train    Test
-------------------------------------------
Logistic               0.9077   0.9211
Null                   0.6374   0.5877


# LASSO

In [24]:
## Logistic Regression with l1 penalty
## Specify penalty directly as C = 1
clf = linear_model.LogisticRegression(penalty='l1',
                                      C=1, solver = 'liblinear') # specify penalty
clf.fit(X_train,y_train)
## get confusion matrix
confusion_matrix(y_train,clf.predict(X_train))



LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l1', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

array([[137,  28],
       [ 11, 279]])

In [25]:
## Score the Model on Training and Testing Set
result_scores['Logistic_L1_C_1'] = \
            (sklearn.metrics.accuracy_score(y_train,clf.predict(X_train)),
             sklearn.metrics.accuracy_score(y_test,clf.predict(X_test)))
get_results(result_scores)


Model                  Train    Test
-------------------------------------------
Logistic               0.9077   0.9211
Null                   0.6374   0.5877
Logistic_L1_C_1        0.9143   0.9211


# Ridge

In [26]:
## Logistic Regression with l1 penalty
## Specify penalty directly as C = 1
clf = linear_model.LogisticRegression(penalty='l2',
                                      C=1, solver = 'liblinear') # specify penalty
clf.fit(X_train,y_train)
## get confusion matrix
confusion_matrix(y_train,clf.predict(X_train))

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

array([[135,  30],
       [ 12, 278]])

In [27]:
clf.score(X_train,y_train) 
clf.score(X_test,y_test) 

array(0.90769231)

array(0.92105263)

In [28]:
## Score the Model on Training and Testing Set
result_scores['Logistic_L2_C_1'] = \
            (sklearn.metrics.accuracy_score(y_train,clf.predict(X_train)),
             sklearn.metrics.accuracy_score(y_test,clf.predict(X_test)))
get_results(result_scores)


Model                  Train    Test
-------------------------------------------
Logistic               0.9077   0.9211
Null                   0.6374   0.5877
Logistic_L1_C_1        0.9143   0.9211
Logistic_L2_C_1        0.9077   0.9211


# Elastic net penalty logistic regression,

In [29]:
clf= sklearn.linear_model.ElasticNet(alpha=1.0, l1_ratio=0.5, random_state=32)
clf.fit(X_train,y_train)

ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=32, selection='cyclic', tol=0.0001, warm_start=False)

In [30]:
result_scores['ElasticNet'] = \
            (clf.score(X_train,y_train),
             clf.score(X_test,y_test) )
get_results(result_scores)


Model                  Train    Test
-------------------------------------------
Logistic               0.9077   0.9211
Null                   0.6374   0.5877
Logistic_L1_C_1        0.9143   0.9211
Logistic_L2_C_1        0.9077   0.9211
ElasticNet             0.5094   0.5578


# Random Forest

In [31]:
from sklearn import ensemble
clf = ensemble.RandomForestClassifier(n_estimators=100, 
                                      max_features=10,
                                      random_state=42)
clf.fit(X_train,y_train)
## get confusion matrix
confusion_matrix(y_train,clf.predict(X_train))

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=10, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

array([[165,   0],
       [  0, 290]])

In [32]:
## Score the Model on Training and Testing Set
result_scores['RandomForest_noCV'] = \
            (sklearn.metrics.accuracy_score(y_train,clf.predict(X_train)),
             sklearn.metrics.accuracy_score(y_test,clf.predict(X_test)))
get_results(result_scores)


Model                  Train    Test
-------------------------------------------
Logistic               0.9077   0.9211
Null                   0.6374   0.5877
Logistic_L1_C_1        0.9143   0.9211
Logistic_L2_C_1        0.9077   0.9211
ElasticNet             0.5094   0.5578
RandomForest_noCV      1.0      0.9561


# Gradient tree boosting: Classification

In [33]:
from sklearn import ensemble
clf= sklearn.ensemble.GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=100)
clf.fit(X_train,y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [34]:
## Score the Model on Training and Testing Set
result_scores['GradientTree'] = \
            (sklearn.metrics.accuracy_score(y_train,clf.predict(X_train)),
             sklearn.metrics.accuracy_score(y_test,clf.predict(X_test)))
get_results(result_scores)


Model                  Train    Test
-------------------------------------------
Logistic               0.9077   0.9211
Null                   0.6374   0.5877
Logistic_L1_C_1        0.9143   0.9211
Logistic_L2_C_1        0.9077   0.9211
ElasticNet             0.5094   0.5578
RandomForest_noCV      1.0      0.9561
GradientTree           1.0      0.9474


# Hyperprameters

For the logistic regression withour regularization, no hyperparameter.<br>
For LASSO, and Ridge, hyperparameters are alpha, which can affect the regularization effects. <br>
For Elastic net penelty regression, there are hyperparameter α accounts for the relative importance of the LASSO and Ridge regularizations. There is another hyperparameter λ, that accounts for the amount of regularization used in the model.<br>
For random forest, the hyperparameters are number of trees, and the max number of variables to consider for each tree, depth of the trees.<br>
For grandient tree boosting, hyperparameters are learning rate, the number of boosting stages to perform, subsample.

## K-fold cross validation for LASSO

In [35]:
## Select the alpha through cross validation (k-folds leave one out)
# auto generate 20 values between 1e-4 and 1e4 on log scale
clf = linear_model.LogisticRegressionCV(cv=5,
                                        Cs=20, ## takes awhile to fit 20 models!
                                        penalty='l1',  #lasso
                                        solver='liblinear') 
clf.fit(X_train,y_train)



LogisticRegressionCV(Cs=20, class_weight=None, cv=5, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='warn', n_jobs=None, penalty='l1',
           random_state=None, refit=True, scoring=None, solver='liblinear',
           tol=0.0001, verbose=0)

In [36]:
## Score the Model on Training and Testing Set
result_scores['Logistic_L1_C_auto'] = \
            (sklearn.metrics.accuracy_score(y_train,clf.predict(X_train)),
             sklearn.metrics.accuracy_score(y_test,clf.predict(X_test)))
get_results(result_scores)


Model                  Train    Test
-------------------------------------------
Logistic               0.9077   0.9211
Null                   0.6374   0.5877
Logistic_L1_C_1        0.9143   0.9211
Logistic_L2_C_1        0.9077   0.9211
ElasticNet             0.5094   0.5578
RandomForest_noCV      1.0      0.9561
GradientTree           1.0      0.9474
Logistic_L1_C_auto     0.9516   0.9474


In [37]:
## 20 C's were fit
clf.Cs
## The values of C's 
clf.Cs_
## The best fit C 
clf.C_

20

array([1.00000000e-04, 2.63665090e-04, 6.95192796e-04, 1.83298071e-03,
       4.83293024e-03, 1.27427499e-02, 3.35981829e-02, 8.85866790e-02,
       2.33572147e-01, 6.15848211e-01, 1.62377674e+00, 4.28133240e+00,
       1.12883789e+01, 2.97635144e+01, 7.84759970e+01, 2.06913808e+02,
       5.45559478e+02, 1.43844989e+03, 3.79269019e+03, 1.00000000e+04])

array([545.55947812])

## K fold validation for Rigde

In [38]:
## Select the alpha through cross validation (k-folds leave one out)
# auto generate 20 values between 1e-4 and 1e4 on log scale
clf = linear_model.LogisticRegressionCV(cv=5,
                                        Cs=20, ## takes awhile to fit 20 models!
                                        penalty='l2',  #ridge
                                        solver='liblinear') 
clf.fit(X_train,y_train)

LogisticRegressionCV(Cs=20, class_weight=None, cv=5, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='warn', n_jobs=None, penalty='l2',
           random_state=None, refit=True, scoring=None, solver='liblinear',
           tol=0.0001, verbose=0)

In [39]:
## Score the Model on Training and Testing Set
result_scores['Logistic_L2_C_auto'] = \
            (sklearn.metrics.accuracy_score(y_train,clf.predict(X_train)),
             sklearn.metrics.accuracy_score(y_test,clf.predict(X_test)))
get_results(result_scores)


Model                  Train    Test
-------------------------------------------
Logistic               0.9077   0.9211
Null                   0.6374   0.5877
Logistic_L1_C_1        0.9143   0.9211
Logistic_L2_C_1        0.9077   0.9211
ElasticNet             0.5094   0.5578
RandomForest_noCV      1.0      0.9561
GradientTree           1.0      0.9474
Logistic_L1_C_auto     0.9516   0.9474
Logistic_L2_C_auto     0.9473   0.9386


In [40]:
## 20 C's were fit
clf.Cs
## The values of C's 
clf.Cs_
## The best fit C 
clf.C_

20

array([1.00000000e-04, 2.63665090e-04, 6.95192796e-04, 1.83298071e-03,
       4.83293024e-03, 1.27427499e-02, 3.35981829e-02, 8.85866790e-02,
       2.33572147e-01, 6.15848211e-01, 1.62377674e+00, 4.28133240e+00,
       1.12883789e+01, 2.97635144e+01, 7.84759970e+01, 2.06913808e+02,
       5.45559478e+02, 1.43844989e+03, 3.79269019e+03, 1.00000000e+04])

array([10000.])

## K fold validation for elastic net

In [41]:
clf = linear_model.ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1], cv=5, eps=0.001, n_alphas=100, alphas=None, 
                                fit_intercept=True, precompute='auto', max_iter=1000, tol=0.0001,  verbose=0, positive=False, random_state=32)
clf.fit(X_train,y_train)

ElasticNetCV(alphas=None, copy_X=True, cv=5, eps=0.001, fit_intercept=True,
       l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1], max_iter=1000,
       n_alphas=100, n_jobs=None, normalize=False, positive=False,
       precompute='auto', random_state=32, selection='cyclic', tol=0.0001,
       verbose=0)

In [42]:
clf.alpha_
clf.l1_ratio_

0.12031987924163748

1.0

In [43]:
clf.score(X_train,y_train) 
clf.score(X_test,y_test) 

0.5951907017239848

0.6098683896120006

In [44]:
result_scores['ElasticNetCV'] = \
            (clf.score(X_train,y_train),
             clf.score(X_test,y_test) )
get_results(result_scores)


Model                  Train    Test
-------------------------------------------
Logistic               0.9077   0.9211
Null                   0.6374   0.5877
Logistic_L1_C_1        0.9143   0.9211
Logistic_L2_C_1        0.9077   0.9211
ElasticNet             0.5094   0.5578
RandomForest_noCV      1.0      0.9561
GradientTree           1.0      0.9474
Logistic_L1_C_auto     0.9516   0.9474
Logistic_L2_C_auto     0.9473   0.9386
ElasticNetCV           0.5952   0.6099


# Grid search, random forest CV

In [45]:
from sklearn.model_selection import GridSearchCV
## specify grid
parameters = {'n_estimators':(50,100,200,300),
              'max_features':(2,4,6,8,10)}
## specify model without hyperparameters
rf_model = ensemble.RandomForestClassifier(random_state=32)
## specify search with model
clf = GridSearchCV(rf_model,
                   parameters,
                   cv=5,
                   return_train_score=True)
clf.fit(X_train,y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=32, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': (50, 100, 200, 300), 'max_features': (2, 4, 6, 8, 10)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [47]:
## explore best hyperparameters
clf.best_params_

{'max_features': 4, 'n_estimators': 50}

In [48]:
## add model score
## Score the Model on Training and Testing Set
result_scores['RandomForest_CV'] = \
            (sklearn.metrics.accuracy_score(y_train,clf.predict(X_train)),
             sklearn.metrics.accuracy_score(y_test,clf.predict(X_test)))
get_results(result_scores)


Model                  Train    Test
-------------------------------------------
Logistic               0.9077   0.9211
Null                   0.6374   0.5877
Logistic_L1_C_1        0.9143   0.9211
Logistic_L2_C_1        0.9077   0.9211
ElasticNet             0.5094   0.5578
RandomForest_noCV      1.0      0.9561
GradientTree           1.0      0.9474
Logistic_L1_C_auto     0.9516   0.9474
Logistic_L2_C_auto     0.9473   0.9386
ElasticNetCV           0.5952   0.6099
RandomForest_CV        0.9978   0.9386


In [49]:
# depth of the tree
from sklearn.model_selection import GridSearchCV
## specify grid
parameters2 = {'max_depth':(2,5,7,10,20)}
## specify model without hyperparameters
rf_model = ensemble.RandomForestClassifier(max_features=4,
                                           n_estimators=50,
                                           random_state=32)
## specify search with model
clf = GridSearchCV(rf_model,
                   parameters2,
                   cv=5,
                   return_train_score=True)
clf.fit(X_train,y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=4, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=None,
            oob_score=False, random_state=32, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': (2, 5, 7, 10, 20)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [50]:
## explore best hyperparameters
clf.best_params_

{'max_depth': 7}

In [51]:
## add model score
## Score the Model on Training and Testing Set
result_scores['RandomForest_CV2'] = \
            (sklearn.metrics.accuracy_score(y_train,clf.predict(X_train)),
             sklearn.metrics.accuracy_score(y_test,clf.predict(X_test)))
get_results(result_scores)


Model                  Train    Test
-------------------------------------------
Logistic               0.9077   0.9211
Null                   0.6374   0.5877
Logistic_L1_C_1        0.9143   0.9211
Logistic_L2_C_1        0.9077   0.9211
ElasticNet             0.5094   0.5578
RandomForest_noCV      1.0      0.9561
GradientTree           1.0      0.9474
Logistic_L1_C_auto     0.9516   0.9474
Logistic_L2_C_auto     0.9473   0.9386
ElasticNetCV           0.5952   0.6099
RandomForest_CV        0.9978   0.9386
RandomForest_CV2       0.9978   0.9298


In [52]:
# It seems that the previous random forest model is better

## Tuning Gradient boost classification

In [53]:
## specify grid
parameters = {'learning_rate':(0.1,0.3,0.5,0.7),
              'n_estimators':(50,100,150,200)}
## specify model without hyperparameters
rf_model = ensemble.GradientBoostingClassifier(random_state=32)
## specify search with model
clf = GridSearchCV(rf_model,
                   parameters,
                   cv=5,
                   return_train_score=True)
clf.fit(X_train,y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sampl...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'learning_rate': (0.1, 0.3, 0.5, 0.7), 'n_estimators': (50, 100, 150, 200)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [54]:
## explore best hyperparameters
clf.best_params_

{'learning_rate': 0.5, 'n_estimators': 100}

In [55]:
## add model score
## Score the Model on Training and Testing Set
result_scores['GBC_cv'] = \
            (sklearn.metrics.accuracy_score(y_train,clf.predict(X_train)),
             sklearn.metrics.accuracy_score(y_test,clf.predict(X_test)))
get_results(result_scores)


Model                  Train    Test
-------------------------------------------
Logistic               0.9077   0.9211
Null                   0.6374   0.5877
Logistic_L1_C_1        0.9143   0.9211
Logistic_L2_C_1        0.9077   0.9211
ElasticNet             0.5094   0.5578
RandomForest_noCV      1.0      0.9561
GradientTree           1.0      0.9474
Logistic_L1_C_auto     0.9516   0.9474
Logistic_L2_C_auto     0.9473   0.9386
ElasticNetCV           0.5952   0.6099
RandomForest_CV        0.9978   0.9386
RandomForest_CV2       0.9978   0.9298
GBC_cv                 1.0      0.9386
