# Model selection and parameter tuning

## Prepare notebook

Import libraries and functions

In [120]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import re
import seaborn as sns
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, make_scorer
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import sys
import os
sys.path.append('../src/models')
from predict_model import clint_scorer, trump_scorer
#warnings.filterwarnings("ignore")

Load predictors and target variable

In [3]:
with open('../data/processed/predictors.pickle', 'rb') as file:
    X = pickle.load(file)
with open('../data/processed/target.pickle', 'rb') as file:
    y = pickle.load(file)

## Split data into train and test sets

Split data into a train set and a test set 

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

## Define function to score model performance 

Create a metric to pass to cross_val_score when evaluating models

In [5]:
clinton_metric = make_scorer(clint_scorer)
trump_metric = make_scorer(trump_scorer)

## Define models 

Logistic regression

In [9]:
lr = LogisticRegression(
    C=2, penalty='l2', solver='lbfgs', multi_class='auto', max_iter=1000)

Random forest classifier

In [10]:
rf = RandomForestClassifier(n_estimators=100)

K-nearest neighbors

In [11]:
knn = KNeighborsClassifier()

Support vector machine

In [12]:
svm = SVC(kernel='linear')

##  Check model performance

Create a list of model names and model variables 

In [19]:
name_list = ['Logistic regression', 'Random forest', 'K-nearest neighbors', 'Support vector machine']
model_list = [lr, rf, knn, svm]

Cross validate all of the models and print scores and score mean for each for  Clinton 

In [24]:
for idx, model in enumerate(model_list): 
    clint_scores = cross_val_score(
    model, X_train, y_train, cv=5, scoring=clinton_metric, n_jobs=-1)
    name = name_list[idx]
    print(name,":", clint_scores, 'Mean:', round(clint_scores.mean(), 5))

Logistic regression : [89.3 87.8 89.8 91.4 88.7] Mean: 89.4
Random forest : [89.3 87.1 87.2 91.2 90.4] Mean: 89.04
K-nearest neighbors : [88.7 88.7 88.  89.5 88.2] Mean: 88.62
Support vector machine : [90.4 89.1 91.4 91.5 89.1] Mean: 90.3


Cross validate all of the models and print scores and score mean for each for Trump

In [25]:
for idx, model in enumerate(model_list): 
    trump_scores = cross_val_score(
    model, X_train, y_train, cv=5, scoring=trump_metric, n_jobs=-1)
    name = name_list[idx]
    print(name, ":", trump_scores, 'Mean:', round(trump_scores.mean(), 5))

Logistic regression : [86.4 88.5 86.9 87.5 89.4] Mean: 87.74
Random forest : [87.3 91.2 86.4 88.3 91.2] Mean: 88.88
K-nearest neighbors : [80.6 83.7 79.7 81.8 81.9] Mean: 81.54
Support vector machine : [86.8 87.7 85.4 86.4 89. ] Mean: 87.06


In [147]:
predictors = X_train
selector = RFE(lr, n_features_to_select=1)
selector = selector.fit(predictors, y_train)

In [148]:
len(X_train)

6000

In [171]:
rank_list = selector.ranking_
feature_ranks = []
for idx, rank in enumerate(rank_list):
    print(rank, X_train.columns[idx])

89 imiss_a_2016_2.0
45 imiss_a_2016_3.0
84 imiss_a_2016_4.0
18 imiss_a_2016_8.0
83 imiss_b_2016_2.0
88 imiss_b_2016_3.0
8 imiss_b_2016_4.0
44 imiss_b_2016_8.0
24 imiss_c_2016_2.0
22 imiss_c_2016_3.0
23 imiss_c_2016_4.0
60 imiss_c_2016_8.0
54 imiss_d_2016_2.0
55 imiss_d_2016_3.0
72 imiss_d_2016_4.0
92 imiss_d_2016_8.0
53 imiss_e_2016_2.0
48 imiss_e_2016_3.0
52 imiss_e_2016_4.0
51 imiss_e_2016_8.0
10 imiss_f_2016_2.0
4 imiss_f_2016_3.0
5 imiss_f_2016_4.0
56 imiss_f_2016_8.0
78 imiss_g_2016_2.0
68 imiss_g_2016_3.0
27 imiss_g_2016_4.0
21 imiss_g_2016_8.0
75 imiss_h_2016_2.0
63 imiss_h_2016_3.0
38 imiss_h_2016_4.0
58 imiss_h_2016_8.0
70 imiss_i_2016_2.0
69 imiss_i_2016_3.0
76 imiss_i_2016_4.0
17 imiss_i_2016_8.0
49 imiss_j_2016_2.0
28 imiss_j_2016_3.0
40 imiss_j_2016_4.0
85 imiss_j_2016_8.0
80 imiss_k_2016_2.0
79 imiss_k_2016_3.0
42 imiss_k_2016_4.0
36 imiss_k_2016_8.0
9 imiss_l_2016_2.0
2 imiss_l_2016_3.0
1 imiss_l_2016_4.0
16 imiss_l_2016_8.0
86 imiss_m_2016_2.0
33 imiss_m_2016_3.0
87 imi

Looking at the metrics, the random forest model performs the best for this business problem 

## Parameter tuning

Define parameter grid for random forest

In [108]:

param_grid = {'n_estimators': [100], 'max_depth': [1,2,3,4,5,6], 'oob_score': [True], 'max_features': [None, 'sqrt']}



Find the best parameters and then crossvalidate with five folds

In [109]:
grid = GridSearchCV(rf, param_grid, iid=True, cv=5,
                    n_jobs=-1, scoring=clinton_metric, verbose=1, return_train_score=True)

Fit training data to grid

In [110]:
grid.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   21.1s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   29.4s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid=Tr

Print best parameters and best score

In [111]:
print(grid.best_params_)
print(round(grid.best_score_, 3))

{'max_depth': 1, 'max_features': None, 'n_estimators': 100, 'oob_score': True}
98.72


# Test 

Crossvalidate with 5 folds on the test data and check performance metrics

In [114]:
grid.best_estimator_.feature_importances_

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0.])

In [116]:
feats = pd.DataFrame(grid.best_estimator_.feature_importances_,
                                   index = X_train.columns,
                                    columns=['importance']).sort_values('importance',ascending=False)

In [117]:
feats

Unnamed: 0,importance
imiss_l_2016_4.0,1.0
imiss_o_2016_4.0,0.0
imiss_q_2016_8.0,0.0
imiss_q_2016_4.0,0.0
imiss_q_2016_3.0,0.0
imiss_q_2016_2.0,0.0
imiss_p_2016_8.0,0.0
imiss_p_2016_4.0,0.0
imiss_p_2016_3.0,0.0
imiss_p_2016_2.0,0.0
