## This Notebook Illustrates Examples of Model Optimization

In [5]:
# Import libraries.
import pandas as pd
from sklearn import linear_model
from sklearn import svm
from sklearn import naive_bayes
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV

# Load the glass dataset from the local data directory.
data = pd.read_csv('./data/glass.csv')

# Every column except the 'Type' label is used as a predictor.
features = data.columns.tolist()
features.remove('Type')

# Split the frame into the label series and the predictor frame.
data_y = data['Type']
data_x = data[features]

# Hold out 30% of the rows as a test set; the seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    data_x,
    data_y,
    test_size=0.3,
    random_state=4,
)

## 1. Basic Cross-Validation with SVM

In [6]:
# Support-vector classifier reused by the cross-validation examples below.
mod = svm.SVC(C=2.5)

# Illustrate the 3 major CV approaches, starting with shuffled 5-fold CV.
# Accuracy is the scoring criterion here; F1 macro is the alternative.
k_fold = KFold(
    n_splits=5,
    shuffle=True,
    random_state=4,
)
k_fold_scores = cross_val_score(
    mod,
    data_x,
    data_y,
    scoring='accuracy',
    cv=k_fold,
)
print(f'CV Scores (K-Fold): {k_fold_scores!s}')

CV Scores (K-Fold): [0.81395349 0.6744186  0.69767442 0.60465116 0.73809524]




In [9]:
# Leave-one-out CV example. No `scoring` argument is passed, so
# cross_val_score falls back to the estimator's own default scorer.
loo = LeaveOneOut()
loo_scores = cross_val_score(
    mod,
    data_x,
    data_y,
    cv=loo,
)
# Only the mean of the per-split scores is reported.
print(f'CV Scores (Leave-One-Out) \n{loo_scores.mean()!s}')











CV Scores (Leave-One-Out) 
0.7102803738317757




In [11]:
# Shuffle-split CV: 10 random partitions, each using 80% of the rows for
# training and 20% for testing.
# random_state is pinned to the same seed used by train_test_split and KFold in
# this notebook; without it every run draws different partitions and the
# printed scores cannot be reproduced after a kernel restart.
shuffle_split = ShuffleSplit(test_size=0.2, train_size=0.8, n_splits=10, random_state=4)
ss_scores = cross_val_score(mod, data_x, data_y, scoring='accuracy', cv=shuffle_split)
print('CV Scores (Shuffle Split): '+str(ss_scores))

CV Scores (Shuffle Split): [0.74418605 0.72093023 0.69767442 0.72093023 0.74418605 0.69767442
 0.72093023 0.69767442 0.69767442 0.74418605]




In [None]:
# GroupKFold is another CV splitter: it keeps all samples that share a group label in the same fold.

## 2. Grid Search + Cross Validation With Random Forest

In [13]:
# Step 1: Specify the param grid (4 forest sizes x 3 depth limits = 12 candidates).
param_grid = {'n_estimators':[5, 10, 50, 100], 'max_depth':[3, 6, None]}

# Step 2: Construct the searching object.
# The forest is seeded so that the bootstrap samples and feature draws are the
# same on every run; otherwise the winning parameters and the reported test
# score change between executions of this cell.
optimized_rf = GridSearchCV(ensemble.RandomForestClassifier(random_state=4), param_grid, cv=5)

# Step 3: Fit to find the best model (each candidate is scored with 5-fold CV
# on the training split only).
optimized_rf.fit(x_train, y_train)

# Step 4: Evaluate the search result on the held-out test split.
print('Grid Search Test Score (Random Forest Alg.): '+str(optimized_rf.score(x_test, y_test)))

Grid Search Test Score (Random Forest Alg.): 0.8769230769230769


