# Model selection and parameter tuning

## Prepare notebook

Import libraries and functions

In [27]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import re
import seaborn as sns
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE, RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, make_scorer
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import sys
import os
sys.path.append('../src/models')
from predict_model import clint_scorer, trump_scorer, avg_scorer
#warnings.filterwarnings("ignore")

Load predictors and target variable

In [13]:
with open('../data/processed/predictors.pickle', 'rb') as file:
    X = pickle.load(file)
with open('../data/processed/target.pickle', 'rb') as file:
    y = pickle.load(file)

## Split data into train and test sets

Split data into a train set and a test set 

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

## Define function to score model performance 

Create a metric to pass to cross_val_score when evaluating models

In [15]:
clinton_metric = make_scorer(clint_scorer)
trump_metric = make_scorer(trump_scorer)
comb_metric = make_scorer(avg_scorer)

## Define models 

Logistic regression

In [16]:
lr = LogisticRegression(
    C=2, penalty='l2', solver='lbfgs', multi_class='auto', max_iter=1000)

Random forest classifier

In [17]:
rf = RandomForestClassifier(n_estimators=100)

K-nearest neighbors

In [18]:
knn = KNeighborsClassifier()

Support vector machine

In [19]:
svm = SVC(kernel='linear')

##  Evaluate training performance

Create a list of model names and model variables 

In [20]:
name_list = ['Logistic regression', 'Random forest', 'K-nearest neighbors', 'Support vector machine']
model_list = [lr, rf, knn, svm]

Cross validate all of the models and print scores and score mean for each for  Clinton 

In [21]:
for idx, model in enumerate(model_list): 
    clint_scores = cross_val_score(
    model, X_train, y_train, cv=5, scoring=clinton_metric, n_jobs=-1)
    name = name_list[idx]
    print(name,":", clint_scores, round(clint_scores.mean(), 5))

Logistic regression : [89.6 88.5 88.1 90.5 89.1] 89.16
Random forest : [88.5 90.7 88.3 89.4 85. ] 88.38
K-nearest neighbors : [86.1 89.3 86.9 91.1 87.6] 88.2
Support vector machine : [89.3 90.4 89.4 91.7 89.4] 90.04


Cross validate all of the models and print scores and score mean for each for Trump

In [24]:
for idx, model in enumerate(model_list): 
    trump_scores = cross_val_score(
    model, X_train, y_train, cv=5, scoring=trump_metric, n_jobs=-1)
    name = name_list[idx]
    print(name, ":", trump_scores, 'Mean:', round(trump_scores.mean(), 5))

Logistic regression : [85.1 85.7 87.  86.6 88.4] Mean: 86.56
Random forest : [88.2 87.6 88.8 88.  88.2] Mean: 88.16
K-nearest neighbors : [80.4 83.3 83.7 81.8 82. ] Mean: 82.24
Support vector machine : [85.3 85.3 88.2 87.  87.2] Mean: 86.6


Print cross-validated scores for the average of Trump and Clinton scores 

In [25]:
for idx, model in enumerate(model_list): 
    comb_scores = cross_val_score(
    model, X_train, y_train, cv=5, scoring=comb_metric, n_jobs=-1)
    name = name_list[idx]
    print(name, comb_scores, comb_scores.mean())

Logistic regression [87.35 87.1  87.55 88.55 88.75] 87.86
Random forest [88.05 88.8  88.65 88.6  87.7 ] 88.36
K-nearest neighbors [83.25 86.3  85.3  86.45 84.8 ] 85.22
Support vector machine [87.3  87.85 88.8  89.35 88.3 ] 88.32


Use recursive feature elimination

In [32]:
selector = RFECV(lr, step=1, min_features_to_select=1,  scoring=comb_metric, cv=5)
selector.fit(X_train, y_train)

RFECV(cv=5,
      estimator=LogisticRegression(C=2, class_weight=None, dual=False,
                                   fit_intercept=True, intercept_scaling=1,
                                   l1_ratio=None, max_iter=1000,
                                   multi_class='auto', n_jobs=None,
                                   penalty='l2', random_state=None,
                                   solver='lbfgs', tol=0.0001, verbose=0,
                                   warm_start=False),
      min_features_to_select=1, n_jobs=None, scoring=make_scorer(avg_scorer),
      step=1, verbose=0)

Number of features found in recursive elimination and grid scores

In [55]:
selector.n_features_
selector.grid_scores_

array([70.69, 80.12, 82.47, 82.49, 82.65, 82.63, 82.92, 82.77, 83.46,
       84.57, 86.12, 86.52, 87.1 , 87.04, 87.08, 87.07, 87.01, 87.02,
       86.98, 86.86, 86.86, 86.83, 86.98, 87.01, 87.12, 87.14, 87.72,
       87.74, 87.75, 88.04, 88.18, 88.13, 88.09, 87.97, 87.98, 88.1 ,
       88.31, 88.29, 88.6 , 88.6 , 88.67, 88.71, 88.62, 88.6 , 88.64,
       88.62, 88.48, 88.42, 88.54, 88.37, 88.45, 88.34, 88.34, 88.36,
       88.39, 88.35, 88.37, 88.46, 88.14, 88.12, 88.17, 88.16, 88.12,
       88.04, 88.08, 88.18, 88.17, 88.14, 88.16, 88.14, 88.15, 87.99,
       87.99, 87.97, 88.01, 87.9 , 87.84, 87.78, 87.8 , 87.86, 87.92,
       88.  , 88.01, 87.97, 87.95, 88.01, 87.98, 87.99, 88.02, 87.92,
       87.9 , 87.86])

Print findings from RFE 

In [87]:
selector.support_
print(selector.ranking_)
rank_list = selector.ranking_
feature_ranks = []
no = []
yes = []
for idx, rank in enumerate(rank_list):
    feature_ranks.append([rank, X_train.columns[idx]])
for list in feature_ranks:
    if list[0] != 1:
        #print('no', list)
        no.append(list)
    if list[0] == 1:
        #print('yes', list)
        yes.append(list)
print(no)
print(yes)

[38 23  3  1 25  1  1 20  1  1  1 28 12 11 40 46 34  1  1  1  1  1  1 21
 18  9  1  1 36 35  1  1 30 31 17  1 13  1 16 19 32  2 22  1  1  1  1  1
 37  1 44 33 43 24  1  1 14 29 15 45  1  1 42  1  7  5  6 47 41  1  1  1
  8  4  1 39 27 26 10  1  1  1  1  1  1  1  1  1  1  1  1  1]
[[38, 'imiss_a_2016_2.0'], [23, 'imiss_a_2016_3.0'], [3, 'imiss_a_2016_4.0'], [25, 'imiss_b_2016_2.0'], [20, 'imiss_b_2016_8.0'], [28, 'imiss_c_2016_8.0'], [12, 'imiss_d_2016_2.0'], [11, 'imiss_d_2016_3.0'], [40, 'imiss_d_2016_4.0'], [46, 'imiss_d_2016_8.0'], [34, 'imiss_e_2016_2.0'], [21, 'imiss_f_2016_8.0'], [18, 'imiss_g_2016_2.0'], [9, 'imiss_g_2016_3.0'], [36, 'imiss_h_2016_2.0'], [35, 'imiss_h_2016_3.0'], [30, 'imiss_i_2016_2.0'], [31, 'imiss_i_2016_3.0'], [17, 'imiss_i_2016_4.0'], [13, 'imiss_j_2016_2.0'], [16, 'imiss_j_2016_4.0'], [19, 'imiss_j_2016_8.0'], [32, 'imiss_k_2016_2.0'], [2, 'imiss_k_2016_3.0'], [22, 'imiss_k_2016_4.0'], [37, 'imiss_m_2016_2.0'], [44, 'imiss_m_2016_4.0'], [33, 'imiss_m_2016_

## Parameter tuning

Define parameter grid for random forest

In [59]:
param_grid = {'n_estimators': [100], 'max_depth': [
    1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'oob_score': [True], 'max_features': [None, 'sqrt']}

Find the best parameters and then crossvalidate with five folds

In [60]:
grid = GridSearchCV(rf, param_grid, iid=True, cv=5, n_jobs=-1,
                    scoring=comb_metric, verbose=1, return_train_score=True)

Fit training data to grid

In [61]:
grid.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  3.4min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid=Tr

Print best parameters and best score

In [None]:
print(grid.best_params_)
print(round(grid.best_score_, 3))

### Feature importance

Check feature importance 

In [None]:
feats = pd.DataFrame(grid.best_estimator_.feature_importances_,
                                   index = X_train.columns,
                                    columns=['importance']).sort_values('importance',ascending=False)
feats.sort_values(by='importance')[80:92]

## Evaluate test performance

In [62]:
scores = cross_validate(grid, X_test, y_test, scoring=comb_metric, cv=5)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   16.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   40.7s finished


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.0min finished


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   18.4s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   55.1s finished


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   13.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   41.6s finished


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   19.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   47.6s finished


In [63]:
scores['test_score'].mean()

90.26