In [1]:
import pandas as pd
import sys
import os
import logging
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
import pickle

In [2]:
sys.path.append('{}/mmml'.format(os.path.dirname(os.getcwd())))
from mmml.config import data_folder
from mmml.game_results import *
from mmml.utils import *

In [3]:
base_dev = pd.read_pickle('{}/Data/Processed/base_dev.pkl'.format(os.path.dirname(os.getcwd())))
base_dev.head(3)

Unnamed: 0,HTeamID,ATeamID,Season,DayNum,HWin,HScore,AScore,GameRound,Seed_H,Seed_A,GameSlot
1,1112,1436,2003,136,1,80,51,1,Z01,Z16,R1Z1
86,1246,1197,2004,137,1,96,76,1,Z01,Z16a,R1Z1
212,1163,1107,2006,137,1,72,59,1,Z01,Z16,R1Z1


In [4]:
scaled_x_features_dev = pd.read_pickle('{}/Data/Processed/scaled_x_features_dev.pkl'.format(os.path.dirname(os.getcwd())))
scaled_x_features_dev.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Ast,Blk,DR,FGA,FGA3,FGM,FGM3,FTA,FTM,NLoc,...,possessions,o_eff,d_eff,net_eff,elo,last_elo,MOR,POM,SAG,Avg_Rank
TeamID,Season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1171,2012,278,70,667,1457,447,571,148,526,373,2,...,1839.825,90.389032,100.661748,-10.272716,"[1500.0, 1491.8493246116866, 1480.0, 1488.4805...",1351.987274,286.0,294.0,300.0,0.835238
1402,2006,287,93,515,1311,406,542,125,422,289,0,...,1630.65,91.865207,110.385429,-18.520222,"[1500.0, 1491.8493246116866, 1483.36879746356,...",1351.248141,316.0,324.0,327.0,0.918095
1328,2013,384,87,770,1789,472,780,154,646,491,5,...,2090.2375,105.490405,98.170662,7.319742,"[1500.0, 1508.1506753883134, 1520.102609012492...",1560.789937,49.0,51.0,43.0,0.133333


### Train Classifier

In [5]:
## READ FEATURE DICT
columns_key = getFeatureDict(pd.read_csv('{}/mmml/mmml/feature_list2.csv'.format(os.path.dirname(os.getcwd()))))

In [6]:
### HELPER TO REVERSE BASE DF
def reverse_base(base):
    reverse_base = base.copy()
    reverse_base = reverse_base.rename(columns={'HTeamID': 'ATeamID_2',
                                                'ATeamID':'HTeamID_2',
                                                'HScore':'AScore_2',
                                                'AScore':'HScore_2',
                                                'Seed_H':'Seed_A_2',
                                                'Seed_A':'Seed_H_2'})

    reverse_base = reverse_base.rename(columns={'ATeamID_2': 'ATeamID',
                                                'HTeamID_2':'HTeamID',
                                                'AScore_2':'AScore',
                                                'HScore_2':'HScore',
                                                'Seed_A_2':'Seed_A',
                                                'Seed_H_2':'Seed_H'})
    reverse_base['HWin'] = 1 - reverse_base['HWin']

    return reverse_base[base.columns]

### `fnTrain` Work

In [7]:
## Merge Base of Tournament Games w/ X-Features
model_data = createModelData(base_dev, scaled_x_features_dev, columns_key)
print(model_data.shape)
model_data.head(3)

(914, 14)


Unnamed: 0,HTeamID,ATeamID,Season,GameRound,Seed_H,Seed_A,GameSlot,HScore_diff,possessions_diff,o_eff_diff,d_eff_diff,net_eff_diff,last_elo_diff,Avg_Rank_diff
1,1112,1436,2003,1,Z01,Z16,R1Z1,29,239.35,8.0943,-4.293362,12.387662,91.313784,-0.457143
33,1112,1211,2003,2,Z01,Z09,R2Z1,1,-8.8875,0.315553,-6.414116,6.729669,57.057837,-0.119048
57,1112,1242,2003,4,Z01,Z02,R4Z1,-3,-62.175,0.666609,1.474276,-0.807667,29.976656,-0.000952


In [8]:
# Reverse H/A teams in base and create modeling dataset
base_reverse = reverse_base(base_dev)
model_data_reverse = createModelData(base_reverse, scaled_x_features_dev, columns_key)
model_data_reverse.shape

(914, 14)

In [9]:
model_data = model_data.append(model_data_reverse)
model_data.shape

(1828, 14)

In [10]:
### Define Grid Search Parameters
seed = 96

parameters = {'max_depth': [3, 4, 5],
'learning_rate':[0.1],
'n_estimators': [10, 100, 1000], #number of trees, change it to 1000 for better results
'gamma':[0, 0.05, 0.1],
'min_child_weight':[0, 2, 4],
'seed': [seed]} # binary:logistic

xgb_model = xgb.XGBRegressor()

clf = GridSearchCV(xgb_model, parameters, n_jobs=5, cv=5, verbose=0, refit=True)

In [11]:
# Define features to use
included_features = columns_key['features']
print(included_features)
target = columns_key['target']
print(target)

['possessions_diff', 'o_eff_diff', 'd_eff_diff', 'net_eff_diff', 'last_elo_diff', 'Avg_Rank_diff']
['HScore_diff']


In [12]:
# Fit Model
clf.fit(model_data[included_features], model_data[target])

# Best Estimator 
print(clf.best_estimator_)

# Feature Importances
print(pd.DataFrame(included_features, columns=['feature'])\
.merge(pd.DataFrame(clf.best_estimator_.feature_importances_), left_index=True, right_index=True)\
.sort_values(0, ascending=False))

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bytree=1, gamma=0.05, learning_rate=0.1,
             max_delta_step=0, max_depth=3, min_child_weight=0, missing=None,
             n_estimators=100, n_jobs=1, nthread=None, objective='reg:linear',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=96, silent=True, subsample=1)
            feature         0
5     Avg_Rank_diff  0.252585
0  possessions_diff  0.193501
1        o_eff_diff  0.163959
4     last_elo_diff  0.143279
3      net_eff_diff  0.131462
2        d_eff_diff  0.115214


In [13]:
## Create dictionary to store output necessary to get predictions & probabilities
mean = model_data[target].mean()
std = model_data[target].std()

model = {}
model['clf'] = clf
model['mean'] = mean
model['std'] = std

model

{'clf': GridSearchCV(cv=5, error_score=nan,
              estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bytree=1,
                                     gamma=0, learning_rate=0.1,
                                     max_delta_step=0, max_depth=3,
                                     min_child_weight=1, missing=None,
                                     n_estimators=100, n_jobs=1, nthread=None,
                                     objective='reg:linear', random_state=0,
                                     reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=None, silent=True,
                                     subsample=1),
              iid='deprecated', n_jobs=5,
              param_grid={'gamma': [0, 0.05, 0.1], 'learning_rate': [0.1],
                          'max_depth': [3, 4, 5], 'min_child_weight': [0, 2, 4],
                          'n_estimators': [10, 100, 1