# Model selection and parameter tuning

## Prepare notebook

Import libraries and functions

In [26]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import re
import seaborn as sns
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, make_scorer
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import sys
import os
sys.path.append('../src/models')
from predict_model import clint_scorer, trump_scorer
#warnings.filterwarnings("ignore")

Load predictors and target variable

In [3]:
with open('../data/processed/predictors.pickle', 'rb') as file:
    X = pickle.load(file)
with open('../data/processed/target.pickle', 'rb') as file:
    y = pickle.load(file)

## Split data into train and test sets

Split data into a train set and a test set 

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

## Define function to score model performance 

Create a metric to pass to cross_val_score when evaluating models

In [5]:
clinton_metric = make_scorer(clint_scorer)
trump_metric = make_scorer(trump_scorer)

## Define models 

Logistic regression

In [9]:
lr = LogisticRegression(
    C=2, penalty='l2', solver='lbfgs', multi_class='auto', max_iter=1000)

Random forest classifier

In [10]:
rf = RandomForestClassifier(n_estimators=100)

K-nearest neighbors

In [11]:
knn = KNeighborsClassifier()

Support vector machine

In [12]:
svm = SVC(kernel='linear')

##  Check model performance

Create a list of model names and model variables 

In [19]:
name_list = ['Logistic regression', 'Random forest', 'K-nearest neighbors', 'Support vector machine']
model_list = [lr, rf, knn, svm]

Cross validate all of the models and print scores and score mean for each for  Clinton 

In [24]:
for idx, model in enumerate(model_list): 
    clint_scores = cross_val_score(
    model, X_train, y_train, cv=5, scoring=clinton_metric, n_jobs=-1)
    name = name_list[idx]
    print(name,":", clint_scores, 'Mean:', round(clint_scores.mean(), 5))

Logistic regression : [89.3 87.8 89.8 91.4 88.7] Mean: 89.4
Random forest : [89.3 87.1 87.2 91.2 90.4] Mean: 89.04
K-nearest neighbors : [88.7 88.7 88.  89.5 88.2] Mean: 88.62
Support vector machine : [90.4 89.1 91.4 91.5 89.1] Mean: 90.3


Cross validate all of the models and print scores and score mean for each for Trump

In [25]:
for idx, model in enumerate(model_list): 
    trump_scores = cross_val_score(
    model, X_train, y_train, cv=5, scoring=trump_metric, n_jobs=-1)
    name = name_list[idx]
    print(name, ":", trump_scores, 'Mean:', round(trump_scores.mean(), 5))

Logistic regression : [86.4 88.5 86.9 87.5 89.4] Mean: 87.74
Random forest : [87.3 91.2 86.4 88.3 91.2] Mean: 88.88
K-nearest neighbors : [80.6 83.7 79.7 81.8 81.9] Mean: 81.54
Support vector machine : [86.8 87.7 85.4 86.4 89. ] Mean: 87.06


## Parameter tuning

Define parameter grid for random forest

In [40]:
param_grid = {'n_estimators': [10, 100, 250], 'max_depth': [
    2, 3, 5, 7], 'oob_score': [True], 'max_features': [None]}

In [41]:
grid = GridSearchCV(rf, param_grid, iid=True, cv=5, 
                    n_jobs=-1, scoring=[clinton_metric, trump_metric], verbose=1, return_train_score=True)

In [43]:
grid.cv_results_

AttributeError: 'GridSearchCV' object has no attribute 'cv_results_'

# Test 

Which model does better with test data? 
Stick with one metric
Whatever model is best, variation between train and test,etc. 

In [None]:
d