In [1]:
import pandas as pd
import numpy as np
import pickle
import os
from sklearn import linear_model, svm, tree, ensemble
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, make_scorer
import xgboost as xgb

In [2]:
##Pull in data and seperate the features and labels
full_data = pd.read_csv('historical_cbb_data.csv')
full_data['neutral'] = np.where(full_data['neutral'] == 1, 1, 0)
train = full_data[full_data['year'] != 2019]
test = full_data[full_data['year'] == 2019]
train_labels = np.array(train['result'])
test_labels = np.array(test['result'])
train_odds = np.array(train['close'])
test_odds = np.array(test['close'])

train = train.drop(['result', 'year', 'close'], axis=1)
test = test.drop(['result', 'year', 'close'], axis=1)

feature_list = list(train.columns)
train_features = np.array(train)
test_features = np.array(test)

In [3]:
def truncated_loss_function(y_true, y_pred):
    diff = np.abs(y_true - y_pred)
    ##If the difference is greater than 2 scores, set it to 16
    diff = np.where(diff > 15, 15, diff)
    return np.mean(diff)

score = make_scorer(truncated_loss_function, greater_is_better=False)

In [12]:
##Closing vegas lines
mean_absolute_error(test_odds, test_labels)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [13]:
##Baseline only using average
mean_absolute_error(np.repeat(np.array([np.mean(train_labels)]), len(test_labels)), test_labels)

11.172240249080273

In [14]:
##Baseline linear regression with only neutral and f+
subset_data = full_data[['neutral', 'team_rankings_rating', 'year', 'result']]
base_train = subset_data[subset_data['year'] != 2019]
base_test = subset_data[subset_data['year'] == 2019]
base_train_labels = np.array(base_train['result'])
base_test_labels = np.array(base_test['result'])

base_train = base_train.drop(['result', 'year'], axis=1)
base_test = base_test.drop(['result', 'year'], axis=1)

subset_feature_list = list(base_train.columns)
base_train_features = np.array(base_train)
base_test_features = np.array(base_test)

base_linear_regr = linear_model.LinearRegression().fit(base_train_features, base_train_labels)
base_ols_coefficients = pd.concat([pd.DataFrame(subset_feature_list),pd.DataFrame(np.transpose(base_linear_regr.coef_))], axis = 1)
print(base_linear_regr.score(base_train_features, base_train_labels))
mean_absolute_error(base_linear_regr.predict(base_test_features), base_test_labels)

0.49112559691786806


8.204796029965557

In [15]:
##Linear Regression
linear_regr = linear_model.LinearRegression().fit(train_features, train_labels)
ols_coefficients = pd.concat([pd.DataFrame(feature_list),pd.DataFrame(np.transpose(linear_regr.coef_))], axis=1)
print(linear_regr.score(train_features, train_labels))
mean_absolute_error(linear_regr.predict(test_features), test_labels)

0.4943427029062513


8.180812849521653

In [18]:
##Lasso Regression
lasso_model = linear_model.Lasso()
parameters = {'alpha':[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3, 4, 5, 10]}
lasso_regr = GridSearchCV(lasso_model, parameters, cv=5, scoring=score).fit(train_features, train_labels)
print(lasso_regr.best_params_)
print(lasso_regr.score(train_features, train_labels))
mean_absolute_error(lasso_regr.predict(test_features), test_labels)

{'alpha': 0.1}
-7.300720090766176


8.182770885243073

In [8]:
##Fit best Lasso using parameter calculated above
lasso_model = linear_model.Lasso(alpha=0.1)
lasso_regr = lasso_model.fit(train_features, train_labels)
print(lasso_regr.score(train_features, train_labels))
mean_absolute_error(lasso_regr.predict(test_features), test_labels)

0.4933435464108724


8.182770885243073

In [9]:
##Ridge Regression
ridge_regr = linear_model.RidgeCV(cv=5).fit(train_features, train_labels)
ridge_coefficients = pd.concat([pd.DataFrame(feature_list),pd.DataFrame(np.transpose(ridge_regr.coef_))], axis=1)
print(ridge_regr.score(train_features, train_labels))
mean_absolute_error(ridge_regr.predict(test_features), test_labels)

0.4781316113595314


8.223154760184627

In [10]:
##Elastic Net
en_regr = linear_model.ElasticNetCV(cv=5, random_state=0).fit(train_features, train_labels)
en_coefficients = pd.concat([pd.DataFrame(feature_list),pd.DataFrame(np.transpose(en_regr.coef_))], axis=1)
print(en_regr.score(train_features, train_labels))
mean_absolute_error(en_regr.predict(test_features), test_labels)

0.47796207772411303


8.22322378916986

In [None]:
##SVR using default values
svr_regr = svm.SVR(gamma='scale').fit(train_features, train_labels)
print(svr_regr.score(train_features, train_labels))
mean_absolute_error(svr_regr.predict(test_features), test_labels)

In [20]:
##SVR with grid search
parameters = {'C': [0.1, 1, 2, 5, 10, 15, 25, 50, 100], 'epsilon': [0, 0.5, 1, 2, 3, 4, 5, 8, 10, 20], 'kernel': ['linear', 'rbf', 'poly']}
svr = svm.SVR(gamma='scale')
svr_regr = RandomizedSearchCV(svr, parameters, cv=5, n_iter=100, scoring=score).fit(train_features, train_labels)
print(svr_regr.best_params_)
print(svr_regr.score(train_features, train_labels))
mean_absolute_error(svr_regr.predict(test_features), test_labels)
##Optimal hyper parameters
#C:0.1, epsilon: 1 , kernel:linear

{'kernel': 'linear', 'epsilon': 1, 'C': 0.1}
-7.28635848694344


8.17723495637139

In [7]:
##SVR with the optimal hyperparameters from above
svr_model = svm.SVR(gamma='scale', C=0.1, epsilon=1, kernel='linear')
svr_regr = svr_model.fit(train_features, train_labels)
print(svr_regr.score(train_features, train_labels))
mean_absolute_error(svr_regr.predict(test_features), test_labels)

0.4933209462579541


8.17723495637139

In [None]:
##Decision Tree No pruning
decision_tree_regr = tree.DecisionTreeRegressor(criterion='mae', random_state=0).fit(train_features, train_labels)
print(decision_tree_regr.score(train_features, train_labels))
mean_absolute_error(decision_tree_regr.predict(test_features), test_labels)

In [None]:
##Decision Tree CV
parameters = {'max_depth':list(range(1,33)), 'min_samples_split': [0.001, 0.005, 0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.90, 0.95, 0.99], 'min_samples_leaf':([0.001, 0.005, .01, .05] + list(np.linspace(.1, .5, 9)))}
decision_tree = tree.DecisionTreeRegressor(criterion='mae', random_state=0)
decision_tree_regr = RandomizedSearchCV(decision_tree, parameters, cv=5, scoring=score, n_iter=100).fit(train_features, train_labels)
print(decision_tree_regr.score(train_features, train_labels))
mean_absolute_error(decision_tree_regr.predict(test_features), test_labels)
##Optimal hyper parameters
##Full Model: max_depth 5, min_samples_leaf = .01, min_samples_split=.001, random_state=0

In [None]:
##Decision tree with optimal hyper parameters
decision_tree_regr = tree.DecisionTreeRegressor(criterion='mae', random_state=0, max_depth=5, min_samples_leaf=.01, min_samples_split=.001).fit(train_features, train_labels)
print(decision_tree_regr.score(train_features, train_labels))
mean_absolute_error(decision_tree_regr.predict(test_features), test_labels)

In [None]:
##Adaboost tree using the optimal parameters from above
##Don't use adaboost because very sensitive to outliers
##max_depth 4, min_samples_leaf = .1, min_samples_split=.1, random_state=0
decision_tree = tree.DecisionTreeRegressor(criterion='mae', random_state=0, max_depth=4, min_samples_leaf=.1, min_samples_split=.1)
parameters ={'learning_rate': list(np.linspace(.1, 1, 10)), 'n_estimators': list(range(10, 110, 10))}
adaboost_tree = ensemble.AdaBoostRegressor(base_estimator=decision_tree, random_state=0)
adaboost_tree_regr = GridSearchCV(adaboost_tree, parameters, cv=5, scoring=score).fit(train_features, train_labels)
print(adaboost_tree_regr.score(train_features, train_labels))
mean_absolute_error(adaboost_tree_regr.predict(test_features), test_labels)

In [5]:
##Random Forest CV
parameters = {'max_depth':list(range(1,20)), 'min_samples_split': [0.001, 0.005, 0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.90, 0.95, 0.99],\
              'min_samples_leaf':([0.001, 0.005, .01, .05] + list(np.linspace(.1, .25, .5, .9))), 'n_estimators': [10, 20, 50, 75, 100, 150, 200]}
random_forest = ensemble.RandomForestRegressor(criterion='mae', random_state=0, max_features=1/3)
random_forest_regr = RandomizedSearchCV(random_forest, parameters, cv=5, n_iter=100, scoring=score).fit(train_features, train_labels)
print(random_forest_regr.best_params_)
print(random_forest_regr.score(train_features, train_labels))
mean_absolute_error(random_forest_regr.predict(test_features), test_labels)
##Best parameters max depth 18, min_sample_leafs .001, min_samples_split .01, n_estimators 50

  This is separate from the ipykernel package so we can avoid doing imports until


{'n_estimators': 50, 'min_samples_split': 0.01, 'min_samples_leaf': 0.001, 'max_depth': 18}
-6.9954166477246895


8.206103896103896

In [6]:
##Fit random forest with best parameters from above
random_forest_model = ensemble.RandomForestRegressor(criterion='mae', random_state=0, max_features=1/3, max_depth=18, min_samples_leaf=.001, min_samples_split=.01, n_estimators=50)
random_forest_regr = random_forest_model.fit(train_features, train_labels)
print(random_forest_regr.score(train_features, train_labels))
mean_absolute_error(random_forest_regr.predict(test_features), test_labels)

0.5184256918204613


8.206103896103896

In [7]:
##xgboost CV
dtrain = xgb.DMatrix(train_features, label=train_labels)
dtest = xgb.DMatrix(test_features, label=test_labels)
parameters = {'max_depth':list(range(1,20)),  'learning_rate':[0.01, .02, .05]+list(np.linspace(.1, 1, 10)), 'colsample_bytree':[0.01, .02, .05]+list(np.linspace(.1, 1, 10)), 'colsample_bylevel':[0.01, .02, .05]+list(np.linspace(.1, 1, 10)), 'reg_alpha':[0.01, .1, .5] + list(np.linspace(1,10,10)), 'reg_lambda':[0.01, .1, .5] + list(np.linspace(1,10,10)), 'booster':['gbtree', 'gblinear'], 'gamma':[0.01, .02, .05]+list(np.linspace(.1, 1, 10))}
xgboost_model = xgb.XGBRegressor(eval_metric='mae', seed=0, objective='reg:squarederror')
xgboost_regr = RandomizedSearchCV(xgboost_model, parameters, cv=5, n_iter=100, scoring=score).fit(train_features, train_labels)
print(xgboost_regr.best_params_)
print(xgboost_regr.score(train_features, train_labels))
mean_absolute_error(xgboost_regr.predict(test_features), test_labels)
#xgboost best params colsample_bylevel .01, colsample_bytree .9, learning_rate=.05, max_depth 18, reg_alpha 6.0, reg lambda 8.0, gamma 0.8, booster gbtree

{'reg_lambda': 8.0, 'reg_alpha': 6.0, 'max_depth': 18, 'learning_rate': 0.05, 'gamma': 0.8, 'colsample_bytree': 0.9, 'colsample_bylevel': 0.01, 'booster': 'gbtree'}
-6.733532278041718


8.232774008253608

In [5]:
##xgboost with best parameters above 
dtrain = xgb.DMatrix(train_features, label=train_labels)
dtest = xgb.DMatrix(test_features, label=test_labels)
xgboost_model = xgb.XGBRegressor(eval_metric='mae', seed=0, objective='reg:squarederror', colsample_bylevel=.01, colsample_bytree=.9, learning_rate=.05, max_depth=18, reg_alpha=6, reg_lambda=8, gamma=0.8, booster='gbtree')
xgboost_regr = xgboost_model.fit(train_features, train_labels)
print(xgboost_regr.score(train_features, train_labels))
mean_absolute_error(xgboost_regr.predict(test_features), test_labels)

0.5783228122126856


8.232774008253608

In [9]:
##ensemble method using xgboost, svr, random forest, and lasso regression models with cross validation for weights
ensemble_model = ensemble.VotingRegressor([('xgb', xgboost_model), ('svr', svr_model), ('rf', random_forest_model), ('lasso', lasso_model)])
parameters = {'weights': [[1,1,1,1], [1.5, 1, 1, 1], [1,1.5,1,1], [1,1,1.5,1], [1,1,1,1.5], [1.5,1.5,1,1], [1.5,1,1.5,1], [1.5,1,1,1.5], [1,1.5,1.5,1],[1,1.5,1,1.5], [1,1,1.5,1.5]]}
ensemble_regr = GridSearchCV(ensemble_model, parameters, cv=5, scoring=score).fit(train_features, train_labels)
print(ensemble_regr.best_params_)
print(ensemble_regr.score(train_features, train_labels))
mean_absolute_error(ensemble_regr.predict(test_features), test_labels)

{'weights': [1, 1, 1.5, 1]}
-7.057076103165457


8.177787686675773

In [10]:
##ensemble method using xgboost, svr, random forest, and lasso regression models
ensemble_model = ensemble.VotingRegressor([('xgb', xgboost_model), ('svr', svr_model), ('rf', random_forest_model), ('lasso', lasso_model)], weights=[1,1,1.5,1])
ensemble_regr = ensemble_model.fit(train_features, train_labels)
print(ensemble_regr.score(train_features, train_labels))
mean_absolute_error(ensemble_regr.predict(test_features), test_labels)

0.5238904179771711


8.177787686675773

In [None]:
results = pd.DataFrame({'odds':test_odds, 'prediction':ensemble_regr.predict(test_features), 'result':test_labels})
results.to_csv('results.csv', index=False)

In [24]:
ensemble_regr.best_params_

{'weights': [1.5, 1, 1, 1]}

In [17]:
with open('cbb_full_model.pickle', 'wb') as cbb_file:
    pickle.dump(ensemble_model, cbb_file)

In [18]:
with open('cbb_model.pickle', 'wb') as cbb_file:
    pickle.dump(ensemble_model, cbb_file)