# Model Optimization
In this document we optimize model parameters and obtain a single, best regressor.

In [2]:
%matplotlib inline

from collections import Counter
from pprint import pprint
import pickle
import datetime
import traceback

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.linalg as la
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, StratifiedKFold, KFold
from sklearn.metrics import f1_score, make_scorer, confusion_matrix, roc_auc_score
from sklearn.utils import shuffle
from sklearn.model_selection import KFold, train_test_split
import itertools
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import StandardScaler


from models.Utils import plot_confusion_matrix, plot_hbar_nameval

In [3]:
def describe_dataset(y_val):
    """Print a small table of how often each value occurs in ``y_val``.

    The table is headed 'Pitch index counts'. Rows are printed in ascending
    key order and every key is displayed after conversion to ``int``.
    """
    title = 'Pitch index counts'
    underline = '-' * len(title)
    print(title + '\n' + underline)

    occurrences = Counter(y_val)
    for pitch_index, count in sorted(occurrences.items()):
        row = '{}\t: {}'.format(int(pitch_index), count)
        print(row)

## Get the Training Set

In [4]:
# Load the complete data set from a CSV file in the current working directory.
all_data = pd.read_csv('data_all.csv')

### Column Names

In [5]:
# Every column except the trailing four is treated as an input feature.
feature_names = all_data.columns[:-4].tolist()

#### *i. Remove null events*

In [6]:
# Keep only the rows whose pitch index is non-zero (0 marks a null event).
non_null_mask = all_data['pitch_index'] != 0
all_data = all_data[non_null_mask]

In [7]:
# Show how many of the remaining samples fall under each pitch index.
describe_dataset(all_data['pitch_index'])

Pitch index counts
------------------
1	: 19797
2	: 47201
3	: 18229
4	: 57806
5	: 169396
6	: 58463
7	: 21156
8	: 52785
9	: 24812


#### *ii. Scale data*

In [8]:
# Feature matrix: every column except the last four.
X = all_data.values[:, 0:-4]
# Regression target: the third column from the end.
# NOTE(review): presumably this is the 'pitch_index' column -- confirm against
# the layout of data_all.csv.
y = all_data.values[:, -3]

# Shuffle the rows of X and y together. The shuffle is seeded so that a
# restarted kernel reproduces the same row order; without it, the fixed seed
# of any later split would not give a reproducible partition.
X, y = shuffle(X, y, random_state=42)

# Standardize each feature to zero mean and unit variance. StandardScaler
# does NOT bound the values to the [-1, 1] interval.
# NOTE(review): the scaler is fitted on all rows, i.e. before any train/test
# split, so held-out statistics leak into the scaling. Consider fitting it on
# the training portion only.
scaler = StandardScaler()
X = scaler.fit_transform(X)

#### *iii. Split data*

In [9]:
# Hold out 25% of the samples for testing; the seed fixes the partition for a
# given X and y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

## Regressor Description
Parameters of the baseline regressor: library defaults, except for `n_estimators=128`. We will use these parameters as a starting point.

In [10]:
# Baseline regressor: only the number of trees is set explicitly.
clf = RandomForestRegressor(n_estimators=128)

# Print the regressor's current parameters as an aligned two-column table.
params = clf.get_params()
max_key_len = max(map(len, params))
max_val_len = max(len(str(value)) for value in params.values())
header = '{:<{key_width}} : {:<{val_width}}'.format(
    'Parameters', 'Current values',
    key_width=max_key_len, val_width=max_val_len,
)
rule = '-' * len(header)

print(header)
print(rule)
for key, val in params.items():
    # Left-justify the parameter name so that the separators line up.
    print('{} : {}'.format(key.ljust(max_key_len), val))

Parameters               : Current values
-----------------------------------------
max_depth                : None
min_weight_fraction_leaf : 0.0
n_jobs                   : 1
n_estimators             : 128
oob_score                : False
max_features             : auto
random_state             : None
verbose                  : 0
bootstrap                : True
warm_start               : False
min_samples_split        : 2
max_leaf_nodes           : None
min_impurity_decrease    : 0.0
min_samples_leaf         : 1
criterion                : mse
min_impurity_split       : None


## Parameters
Search space. Every possible combination will be tested by GridSearch.

In [11]:
# Search grid: one dictionary mapping each tuned parameter to the candidate
# values that the grid search will combine exhaustively.
param_grid = [
    dict(
        criterion=['mse', 'mae'],
        max_features=['sqrt', 'log2'],
        min_samples_split=[10, 25, 50, 75, 100],
    )
]

In [12]:
# Size of the search space: the product of the number of candidate values
# listed for each parameter of the (single) grid dictionary.
prod = 1
for candidates in param_grid[0].values():
    prod *= len(candidates)

print('Total number of combinations: {}'.format(prod))

Total number of combinations: 40


## Searcher

In [13]:
# Two-fold cross-validation over shuffled samples. The keyword is `shuffle`
# (the former spelling `suffle` makes KFold raise a TypeError); the seed makes
# the fold assignment reproducible across runs.
cv = KFold(n_splits=2, shuffle=True, random_state=42)

# Exhaustive search over `param_grid`, scored by negated mean squared error
# (higher is better) and run on all available cores.
grid_search_cv = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    cv=cv,
    verbose=1
)

## Search Parameters

In [None]:
# Pieces of the report that is written to disk at the end of this cell.
train_result_msg = ''
failure_trace = ''
status = 'FAILED! '
fmt = 'Training has ended in {} minutes and {} seconds'

# Start of the measured interval (wall-clock time).
t_beg = datetime.datetime.now()
try:
    # Run the search and persist the fitted searcher.
    grid_search_cv = grid_search_cv.fit(X_train, y_train)
    with open('grid_search_cv.pkl', 'wb') as f:
        pickle.dump(grid_search_cv, f)

    status = 'SUCCESS! '

    # Append every entry of cv_results_ under its own underlined heading.
    result_dic = grid_search_cv.cv_results_
    for header, content in sorted(result_dic.items()):
        train_result_msg += '{}\n{}\n'.format(header, '-'*len(header))
        train_result_msg += str(content)
        train_result_msg += '\n\n\n'
except Exception:
    # Best effort: the failure is recorded in the report instead of being
    # raised. Only Exception is caught, so KeyboardInterrupt/SystemExit still
    # stop the cell.
    status = 'FAILED! '
    failure_trace = traceback.format_exc()
finally:
    # Elapsed time. total_seconds() is used because timedelta.seconds omits
    # whole days and would under-report a run longer than 24 hours.
    t = datetime.datetime.now() - t_beg
    minutes, seconds = divmod(int(t.total_seconds()), 60)

    # The traceback is appended AFTER the template has been formatted: it may
    # contain '{' or '}' and must never pass through str.format itself.
    msg = (
        status
        + fmt.format(minutes, seconds)
        + '\n\n'
        + failure_trace
        + train_result_msg
    )

# write the message to a file to be sent via email
with open('grid_search_results.txt', 'w') as f:
    f.write(msg + '\n')

Fitting 1 folds for each of 40 candidates, totalling 40 fits
