In [18]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
from hyperopt.pyll.stochastic import sample
from sklearn.model_selection import cross_val_score

In [3]:
#read in full data set
full_ = pd.read_csv('all_features.csv.gz', compression = 'gzip')
#drop merge keys and non-numeric columns
full_ = full_.drop(columns = ['date', 'team1', 'team2', 'home_starter', 'road_starter',
                             'is_doubleheader', 'is_tripleheader', 'score1', 'score2'])
X, y = full_.drop(columns = ['home_loss']), full_['home_loss']

# BASELINE TESTING

In [25]:
#split into training and test sets
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)
#train and test baseline logistic regression model
lr = LogisticRegression(solver = 'lbfgs', max_iter = 1000)
lr.fit(x_train, y_train)
lr_preds = lr.predict(x_test)
lr_accuracy = accuracy_score(y_test, lr_preds)
#train and test baseline SGD Classifier
sgd = SGDClassifier()
sgd.fit(x_train, y_train)
sgd_preds = sgd.predict(x_test)
sgd_accuracy = accuracy_score(y_test, sgd_preds)
print('Baseline logistic regression accuracy: {}'.format(lr_accuracy))
print()
print('Baseline SGD Classifier accuracy {}'.format(sgd_accuracy))

Baseline logistic regression accuracy: 0.5736283947718557

Baseline SGD Classifier accuracy 0.4848947327228614


# TREE BASED MODELS

In [4]:
from recursive_selection import FeatureSelector

In [12]:
#generate dictionary of algorithms to select features based on feature importance and specify generic base
#parameters to be passed to FeatureSelector object
tree_based = {'RandomForestClassifier': (RandomForestClassifier(), {'n_estimators' : 100,
                                                                   'max_depth' : 7}),
             'LGBMClassifier' : (lgb.LGBMClassifier(), {'num_leaves' : 70,
                                                       'max_depth' : 6}),
             'XGBClassifier' : (xgb.XGBClassifier(), {'max_depth' : 5,
                                                     'n_estimators' : 100})}
#function to initialize results dictionary for each algorithm to be called to compare results
def initialize_results_dict():
    return( {'Algorithm' : None,
            'best_eval_full' : None,
            'best_subset_full' : None,
            'best_eval_drop_corr' : None,
            'best_subset_drop_corr' : None})
#empty list to store result dictionaries- can be easily made into dataframe using pd.concat
results_list = []

#iterate through algorithms, use recursive_selection method of FeatureSelector object, first using full feature set,
#then eliminating correlated features (with given tolerance)
for algo in tree_based:
    result = initialize_results_dict()
    result['Algorithm'] = algo
    selector = FeatureSelector(X, y, algorithm = tree_based[algo][0], params = tree_based[algo][1], drop_size = 10)
    selector.recursive_selection()
    result['best_eval_full'] = selector.best_eval
    result['best_subset_full'] = selector.best_subset
    selector = FeatureSelector(X, y, algorithm = tree_based[algo][0], drop_corr = True,
                               params = tree_based[algo][1], drop_size = 10, correlation_tolerance = 0.6)
    selector.recursive_selection()
    result['best_eval_drop_corr'] = selector.best_eval
    result['best_subset_drop_corr'] = selector.best_subset
    results_list.append(result)

5 features have been dropped, moving to next iteration
10 features have been dropped, moving to next iteration
10 features have been dropped, moving to next iteration
10 features have been dropped, moving to next iteration
10 features have been dropped, moving to next iteration
10 features have been dropped, moving to next iteration
10 features have been dropped, moving to next iteration
10 features have been dropped, moving to next iteration
10 features have been dropped, moving to next iteration
5 features have been dropped, moving to next iteration
10 features have been dropped, moving to next iteration
10 features have been dropped, moving to next iteration
10 features have been dropped, moving to next iteration
10 features have been dropped, moving to next iteration
5 features have been dropped, moving to next iteration
10 features have been dropped, moving to next iteration
10 features have been dropped, moving to next iteration
10 features have been dropped, moving to next itera

In [13]:
#create dataframe to evaluate results
results_df = pd.DataFrame(results_list)

In [14]:
#review dataframe 
results_df

Unnamed: 0,Algorithm,best_eval_full,best_subset_full,best_eval_drop_corr,best_subset_drop_corr
0,RandomForestClassifier,0.573724,"Index(['elo1_pre', 'elo_prob1', 'elo_prob2', '...",0.573653,"Index(['road_OBPS', 'road_career_ERA', 'elo1_p..."
1,LGBMClassifier,0.571153,"Index(['home_OBPS', 'home_AVG_RUNS', 'home_AVG...",0.571029,"Index(['home_OBPS', 'elo1_pre', 'elo2_pre', 'p..."
2,XGBClassifier,0.57336,"Index(['home_OBPS', 'home_AVG_RUNS', 'home_tot...",0.572139,"Index(['home_OBPS', 'home_BULLPEN_ERA', 'home_..."


In [15]:
#new result list
results_list = []

#iterate through algorithms, use recursive_selection method of FeatureSelector object, first using full feature set,
#then eliminating correlated features (with given tolerance), this time with scaled features
for algo in tree_based:
    result = initialize_results_dict()
    result['Algorithm'] = algo
    selector = FeatureSelector(X, y, algorithm = tree_based[algo][0], scale = 'standard', 
                               params = tree_based[algo][1], drop_size = 10)
    selector.recursive_selection()
    result['best_eval_full'] = selector.best_eval
    result['best_subset_full'] = selector.best_subset
    selector = FeatureSelector(X, y, algorithm = tree_based[algo][0], scale = 'standard', drop_corr = True,
                               params = tree_based[algo][1], drop_size = 10, correlation_tolerance = 0.6)
    selector.recursive_selection()
    result['best_eval_drop_corr'] = selector.best_eval
    result['best_subset_drop_corr'] = selector.best_subset
    results_list.append(result)

5 features have been dropped, moving to next iteration
10 features have been dropped, moving to next iteration
10 features have been dropped, moving to next iteration
10 features have been dropped, moving to next iteration
10 features have been dropped, moving to next iteration
10 features have been dropped, moving to next iteration
10 features have been dropped, moving to next iteration
10 features have been dropped, moving to next iteration
10 features have been dropped, moving to next iteration
5 features have been dropped, moving to next iteration
10 features have been dropped, moving to next iteration
10 features have been dropped, moving to next iteration
10 features have been dropped, moving to next iteration
10 features have been dropped, moving to next iteration
5 features have been dropped, moving to next iteration
10 features have been dropped, moving to next iteration
10 features have been dropped, moving to next iteration
10 features have been dropped, moving to next itera

In [16]:
#generate DataFrame of results for scaled data
scaled_results_df = pd.DataFrame(results_list)
scaled_results_df

Unnamed: 0,Algorithm,best_eval_full,best_subset_full,best_eval_drop_corr,best_subset_drop_corr
0,RandomForestClassifier,0.573958,"Index(['elo1_pre', 'elo_prob1', 'elo_prob2', '...",0.573653,"Index(['road_OBPS', 'road_career_ERA', 'elo1_p..."
1,LGBMClassifier,0.571147,"Index(['home_OBPS', 'home_AVG_RUNS', 'home_AVG...",0.571065,"Index(['home_OBPS', 'elo1_pre', 'elo2_pre', 'p..."
2,XGBClassifier,0.573201,"Index(['home_OBPS', 'home_AVG_RUNS', 'home_AVG...",0.572978,"Index(['home_OBPS', 'home_BULLPEN_ERA', 'home_..."


In [25]:
#return highest performing subset
best_ = scaled_results_df.iloc[scaled_results_df.best_eval_full.idxmax()]['best_subset_full']
#view highest performing subset
best_

Index(['elo1_pre', 'elo_prob1', 'elo_prob2', 'rating1_pre', 'rating2_pre',
       'pitcher1_rgs', 'pitcher2_rgs', 'rating_prob1', 'rating_prob2'],
      dtype='object')

# Random Forest

In [8]:
#parameter tuning with hyperopt, generate parameter space to search over
hyperopt_space = {'criterion' : hp.choice('criterion', ['gini', 'entropy']),
                  'n_estimators' : scope.int(hp.quniform('n_estimators', 10, 200, 10)),
                  'max_depth' : scope.int(hp.quniform('max_depth', 3, 10, 1)),
                  'max_features' : hp.choice('max_features', ['auto', None]),
                  'bootstrap' : hp.choice('bootstrap', [True, False]),
                  'n_jobs' : -1
                 }
#optimizer function to call fmin function
def rfc_optimizer(param_space, x_train, y_train, num_eval):
    #objective function to minimize 
    def objective(params):
        rfc = RandomForestClassifier(**params)
        score = cross_val_score(rfc, x_train, y_train, cv = 10).mean()
        return({'loss' : -score, 'status' : STATUS_OK})
    #initialize trials objects to record performance
    trials = Trials()
    
    best_params = fmin(objective, param_space, algo = tpe.suggest, max_evals = num_eval, trials = trials,
                      rstate = np.random.RandomState(32))
    
    return(best_params)
    

In [10]:
#use best subset generated by feature selection process
X_best = X[best_]

#call search function
best_params = rfc_optimizer(hyperopt_space, X_best, y, 50)

100%|██████████| 50/50 [1:00:13<00:00, 72.28s/it, best loss: -0.5741168825141509] 


In [11]:
#check best parameters
best_params

{'bootstrap': 0,
 'criterion': 0,
 'max_depth': 6.0,
 'max_features': 0,
 'n_estimators': 30.0}

In [13]:
#fit final feature set and final parameter set and test performance
rfc_optimized = RandomForestClassifier(n_estimators = 30, criterion = 'gini', max_depth = 6, max_features = 'auto',
                                      bootstrap = True, n_jobs = -1)

#split full dataset into training and test data
x_train, x_test, y_train, y_test = train_test_split(X_best, y, test_size = 0.3, random_state = 42)

#fit on training data
rfc_optimized.fit(x_train, y_train)
#predict on test data
y_pred = rfc_optimized.predict(x_test)
#score on validation set
accuracy_rfc = accuracy_score(y_test, y_pred)
print('Accuracy of optimized RandomForestClassifier {}'.format(accuracy_rfc))


Accuracy of optimized RandomForestClassifier 0.5731392345621038


In [14]:
#check for colinearity in final feature frame
best_corr = X_best.corr().abs()
all_correlated = best_corr.where(np.triu(np.ones(best_corr.shape), k = 1)\
                              .astype(np.bool)).stack().sort_values(ascending = False)
#view colinearity
all_correlated

elo_prob1     elo_prob2       1.000000
rating_prob1  rating_prob2    1.000000
elo1_pre      rating1_pre     0.995811
elo_prob1     rating_prob2    0.970041
              rating_prob1    0.970041
elo_prob2     rating_prob2    0.970041
              rating_prob1    0.970041
elo1_pre      elo_prob2       0.742245
              elo_prob1       0.742245
elo_prob1     rating1_pre     0.739824
elo_prob2     rating1_pre     0.739824
elo_prob1     rating2_pre     0.736012
elo_prob2     rating2_pre     0.736012
rating1_pre   rating_prob2    0.722678
              rating_prob1    0.722678
elo1_pre      rating_prob2    0.719878
              rating_prob1    0.719878
rating2_pre   rating_prob2    0.719219
              rating_prob1    0.719219
elo1_pre      pitcher1_rgs    0.354917
rating2_pre   pitcher2_rgs    0.350723
rating1_pre   pitcher1_rgs    0.349292
pitcher2_rgs  rating_prob2    0.288479
              rating_prob1    0.288479
pitcher1_rgs  rating_prob2    0.285707
              rating_prob

In [16]:
#drop highly correlated features
X_best = X_best.drop(columns = ['elo_prob2', 'rating_prob2', 'rating1_pre', 'elo_prob1'])

In [21]:
#fit final feature set and final parameter set and test performance
rfc_optimized = RandomForestClassifier(n_estimators = 30, criterion = 'gini', max_depth = 6, max_features = 'auto',
                                      bootstrap = True, n_jobs = -1)

#split full dataset into training and test data
x_train, x_test, y_train, y_test = train_test_split(X_best, y, test_size = 0.3, random_state = 42)

#fit on training data
rfc_optimized.fit(x_train, y_train)
#predict on test data
y_pred = rfc_optimized.predict(x_test)
#score on validation set
accuracy_rfc = accuracy_score(y_test, y_pred)
print('Accuracy of optimized RandomForestClassifier {}'.format(accuracy_rfc))

Accuracy of optimized RandomForestClassifier 0.5730609689285435


In [23]:
#check if scaling features will have any effect on final accuracy 
ss = StandardScaler()
#fit training data
ss.fit(x_train)

#scale training and test data
x_train_s, x_test_s = ss.transform(x_train), ss.transform(x_test)
#reinitialize optimized RFC
rfc_optimized = RandomForestClassifier(n_estimators = 30, criterion = 'gini', max_depth = 6, max_features = 'auto',
                                      bootstrap = True, n_jobs = -1)
#fit on training data
rfc_optimized.fit(x_train_s, y_train)
#predict on test data
y_pred = rfc_optimized.predict(x_test_s)
#score on validation set
accuracy_rfc = accuracy_score(y_test, y_pred)

print('Accuracy of optimized RandomForestClassifier with scaled features is {}'.format(accuracy_rfc))

Accuracy of optimized RandomForestClassifier with scaled features is 0.5735501291382954
