# Model selection and parameter tuning

## Prepare notebook

Import libraries and functions

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import re
import seaborn as sns
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, make_scorer
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import sys
import os
sys.path.append('../src/models')
from predict_model import clint_scorer, trump_scorer, avg_scorer
#warnings.filterwarnings("ignore")

Load predictors and target variable

In [2]:
with open('../data/processed/predictors.pickle', 'rb') as file:
    X = pickle.load(file)
with open('../data/processed/target.pickle', 'rb') as file:
    y = pickle.load(file)

## Split data into train and test sets

Split data into a train set and a test set 

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

## Define function to score model performance 

Create a metric to pass to cross_val_score when evaluating models

In [4]:
clinton_metric = make_scorer(clint_scorer)
trump_metric = make_scorer(trump_scorer)
comb_metric = make_scorer(avg_scorer)

## Define models 

Logistic regression

In [5]:
lr = LogisticRegression(
    C=2, penalty='l2', solver='lbfgs', multi_class='auto', max_iter=1000)

Random forest classifier

In [6]:
rf = RandomForestClassifier(n_estimators=100)

K-nearest neighbors

In [7]:
knn = KNeighborsClassifier()

Support vector machine

In [8]:
svm = SVC(kernel='linear')

##  Check training performance

Create a list of model names and model variables 

In [None]:
name_list = ['Logistic regression', 'Random forest', 'K-nearest neighbors', 'Support vector machine']
model_list = [lr, rf, knn, svm]

Cross validate all of the models and print scores and score mean for each for  Clinton 

In [None]:
for idx, model in enumerate(model_list): 
    clint_scores = cross_val_score(
    model, X_train, y_train, cv=5, scoring=[None], n_jobs=-1)
    name = name_list[idx]
    print(name,":", clint_scores, 'Mean:', round(clint_scores.mean(), 5))

Cross validate all of the models and print scores and score mean for each for Trump

In [None]:
for idx, model in enumerate(model_list): 
    trump_scores = cross_val_score(
    model, X_train, y_train, cv=5, scoring=trump_metric, n_jobs=-1)
    name = name_list[idx]
    print(name, ":", trump_scores, 'Mean:', round(trump_scores.mean(), 5))

Use recursive feature elimination 

In [None]:
selector = RFE(lr, 10)
selector = selector.fit(X_train, y_train)

Use RFE selections and crossvalidate with clinton and trump metrics

In [None]:
scores = cross_validate(selector, X_train, y_train, scoring={
                        'clint_scores': clinton_metric, 'trump_scores': trump_metric}, cv=5)

In [None]:
scores

Print the rank by selector 

In [None]:
selector.support_
print(selector.ranking_)

In [None]:
rank_list = selector.ranking_
feature_ranks = []
for idx, rank in enumerate(rank_list):
    feature_ranks.append([rank, X_train.columns[idx]])
feature_ranks[0:5]

Looking at the metrics, the random forest model performs the best for this business problem 

## Parameter tuning

Define parameter grid for random forest

In [9]:
param_grid = {'n_estimators': [100], 'max_depth': [
    1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'oob_score': [True], 'max_features': [None, 'sqrt']}

Find the best parameters and then crossvalidate with five folds

In [10]:
grid = GridSearchCV(rf, param_grid, iid=True, cv=5, n_jobs=-1,
                    scoring=comb_metric, verbose=1, return_train_score=True)

Fit training data to grid

In [11]:
grid.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   20.4s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   30.1s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid=Tr

Print best parameters and best score

In [12]:
print(grid.best_params_)
print(round(grid.best_score_, 3))

{'max_depth': 6, 'max_features': 'sqrt', 'n_estimators': 100, 'oob_score': True}
87.77


### Feature importance

Check feature importance 

In [32]:
feats = pd.DataFrame(grid.best_estimator_.feature_importances_,
                                   index = X_train.columns,
                                    columns=['importance']).sort_values('importance',ascending=False)
feats.sort_values(by='importance')[70:120]

Unnamed: 0,importance
imiss_q_2016_3.0,0.010818
imiss_u_2016_2.0,0.01219
imiss_c_2016_2.0,0.012292
imiss_i_2016_3.0,0.013021
imiss_p_2016_2.0,0.014846
imiss_r_2016_3.0,0.015523
imiss_u_2016_4.0,0.01555
imiss_c_2016_3.0,0.015934
imiss_g_2016_3.0,0.018393
imiss_f_2016_3.0,0.020859


## Check metrics on test data

In [14]:
scores = cross_validate(grid, X_test, y_test, scoring=comb_metric, cv=5)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


KeyboardInterrupt: 

In [19]:
scores['test_score'].mean()

86.88000000000001