In [2]:
import CBB_Funcs as cbb_fun
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, classification_report
import requests
from bs4 import BeautifulSoup
import statsmodels.api as sm
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from playsound import playsound
import pickle

In [4]:
############# Loading in Data ################
## Scraping every season from the web and running 'combine_data' can take almost an hour.
##
## load = True:  read the per-team dataframes that have already been processed
##               (all 4 years in about 15 seconds).
## load = False: rebuild the processed dataframes with 'combine_data'. The unprocessed team data
##               is either scraped from the web (scrape_data = True) or loaded from a file
##               (scrape_data = False) before being processed.

# Get team names and urls
teams_source = pd.read_excel('Team_Source.xlsx', header=None, names=['2016', '2016_Teams', '2016 Conf', '2017', '2017_Teams', '2018', '2018_Teams',
                                                                     '2019', '2019_Teams', '2020', '2020_Teams', '2021', '2021_Teams'])
# select years to be processed
years = ['2016', '2017', '2018', '2019']
#years = ['2016']

# Set data-gathering parameters
load = True
scrape_data = True

# One concatenated dataframe per year; joined once after the loop so the growing
# result is not re-copied on every iteration.
yearly_frames = []

# loop through each year to be looked at
for yr in years:
    if load:
        sources, teams, team_dict = cbb_fun.get_logistics(teams_source, yr) # urls and team names for that year

        # Load the processed dataframe of every team from its pickle file.
        # NOTE: pickle.load can execute arbitrary code - only load files you generated yourself.
        combined_df = []
        for team in teams:
            with open('./Team_Dataframes/' + yr + '/Team_Combined/' + team + '.pickle', 'rb') as f:
                combined_df.append(pickle.load(f))
    else:
        teams_df, teams, team_dict = cbb_fun.get_team_data(teams_source, yr, scrape_data=scrape_data) # grab data for each team during the 'yr'
        combined_df, next_g = cbb_fun.combine_data(teams_df, team_dict, teams, year=yr) # process the data adding information about opponent and previous games

    yearly_frames.append(pd.concat(combined_df)) # all teams of this year in one dataframe

# concatenate all years into the final output dataframe (empty if no years were selected)
cbb_df = pd.concat(yearly_frames) if yearly_frames else pd.DataFrame()

print('DONE LOADING ALL DATA')
playsound('mixkit-intro-transition-1146.wav')

DONE LOADING ALL DATA


In [3]:
############# Sending Data to Predictive Models and Looking at Statistics################

# determine features to be sent into models
features = ['Avg_Result', 'Avg_Result2', 'Home', 'Away', 'SOS', 'Opp_SOS', 'Prev_SOS', 'Opp_Prev_SOS', 'SOS_NC', 'Opp_SOS_NC', 'Opp_Avg_FG_Pct', 'Opp_Avg_FG_Pct2', 'Pct_Margin', 'Pct_Margin2', 'Scoring_Pace_Diff']

# number of features
num_features = len(features)

#data = cbb_df.dropna(subset=features)
# Keep rows with Game_Number > 9 that have a value for every feature.
# .copy() gives an independent frame so the column assignments made on `data`
# never target a slice of cbb_df (avoids SettingWithCopyWarning).
data = cbb_df[cbb_df['Game_Number'] > 9].dropna(subset=features).copy()

# Same quantity with the sign flipped, used in the opponent-view feature list
data['Opp_Scoring_Pace_Diff'] = data['Scoring_Pace_Diff']*-1

# Same length as `features`; position i holds the column used in place of features[i]
# when a game is described from the other side
opp_features = ['Avg_Result2', 'Avg_Result', 'Home2', 'Away2', 'Opp_SOS', 'SOS', 'Opp_Prev_SOS', 'Prev_SOS', 'Opp_SOS_NC', 'SOS_NC', 'Opp_Avg_FG_Pct2', 'Opp_Avg_FG_Pct', 'Pct_Margin2', 'Pct_Margin', 'Opp_Scoring_Pace_Diff']

# Union of both lists, de-duplicated while keeping a stable column order
# (a plain set() would reorder the columns from run to run)
X = data[list(dict.fromkeys(features + opp_features))]
y = data['result'] # points spread
y2 = data['Win'] # Win (1) or Loss (0) result

# Save Results
# with open('y.pickle', 'wb') as f:
#         pickle.dump(y, f)
# with open('y2.pickle', 'wb') as f:
#         pickle.dump(y2, f)

# Null statistics if winner was chosen at random and points spread was predicted to be 0
# (the MSE below is computed against an all-zero prediction)
print('NULL STATISTICS')
print('Standard Error:', np.std(y) / np.sqrt(len(y)))
print('Standard Deviation:', np.std(y))
print('Mean-squared Error:', mean_squared_error(y, [0]*len(y)))
print('Percent Win/Loss Prediction: 50 %')

# Winner always chosen to be home team - 50/50 for neutral-site games
# NOTE(review): this baseline is computed from the full cbb_df, whereas y/y2 come from the
# filtered `data` frame - confirm the two are meant to cover different sets of games.
print('\n\nHOME NULL CORRECT PREDICTIONS: ', cbb_fun.home_null(cbb_df[['Away', 'Win']]), '%')

# Send data to Linear Regression model and get statistics (increase n_runs for stability in results - ~20 sec per 100 runs)
SE, STD, MSE, CV, model, pct_acc, avg_lin_coefs, lin_lr =  cbb_fun.get_linear_stats(X, y, features, opp_features, n_runs=500)

# Save Average Linear MSE
# with open('lin_mse.pickle', 'wb') as f:
#         pickle.dump(MSE, f)

print('\n\nLINEAR STATISTICS')
print('Standard Error:', SE)
print('Standard Deviation:', STD)
print('Mean-squared Error:', MSE)
print('Percent Win/Loss Prediction: ', pct_acc, '%')
print('Cross-validation score:', CV)
print(model.summary())

# Apply the averaged coefficients to every row twice: once to the `features` columns and once,
# with the sign flipped, to the `opp_features` columns; the final fit is the mean of the two.
best_fits1 = X[features].dot(avg_lin_coefs)
best_fits2 = -1 * (X[opp_features].dot(avg_lin_coefs))
best_fits = (best_fits1 + best_fits2) / 2

# Save Best fits
# with open('best_fits.pickle', 'wb') as f:
#         pickle.dump(best_fits, f)

# Difference between the best-fit predictions and the actual results for all rows of `data`
resid = best_fits - y

# Print best-fit statistics (the standard deviation uses ddof=num_features)
print('Best fit SE:', np.std(resid, ddof=num_features) / np.sqrt(np.size(resid)))
print('Best fit STD:', np.std(resid, ddof=num_features))
print('Best fit MSE:', mean_squared_error(y, best_fits))

# Linear Best Fits
data['Lin_ypred'] = best_fits

# Send data to Logistic Regression model and get statistics (increase n_runs for stability in results - ~30 sec per 100 runs)
score, avg_log_coefs, log_lr, sc = cbb_fun.get_logistic_stats(X, y2, features, opp_features, n_runs=100)

print('\n\nLOGISITC STATISTICS')
print('Score: ', score, '%')
print('Average Coefficients:', avg_log_coefs)
# NOTE(review): X is the same data that was handed to get_logistic_stats above, so this report
# is presumably not an out-of-sample estimate - confirm against the helper's implementation.
logs = log_lr.predict(sc.transform(X[features]))
print('Classification Report\n', classification_report(y2, logs))

# # Grab logistic regression coefficients
# with open('log_reg_coef.pickle', 'rb') as f:
#     log_reg_coef = pickle.load(f)

# Log Best Fits
log_best_fit_preds, log_best_fit_probs = cbb_fun.log_best_fit(X[features], X[opp_features], sc, avg_log_coefs)
# log_best_fits1 = X[features].dot(avg_lin_coefs)
# log_best_fits2 = -1 * (X[opp_features].dot(avg_lin_coefs))
# log_best_fits = (best_fits1 + best_fits2) / 2

# Add Logistic Regression Predictions to Dataframe
data['Log_y2pred'] = log_best_fit_preds
data['Log_y2prob'] = log_best_fit_probs

# Send Data to RandomForest models and get statistics
# This does Regression for the points spread and Classification for the Win/Loss result
# Default hyperparameters were selected using results from GridSearchCV tuning 
rf_mse, rf_acc, rf_reg, rf_class, rf_y_pred, rf_y2_pred = cbb_fun.get_rf_stats(X, y, y2, features, opp_features)

# Save RF Values
# with open('rf_mse.pickle', 'wb') as f:
#         pickle.dump(rf_mse, f)
# with open('rf_class.pickle', 'wb') as f:
#         pickle.dump(rf_class, f)
# with open('rf_reg.pickle', 'wb') as f:
#         pickle.dump(rf_reg, f)

# Add Random Forest Predictions to Dataframe
data['RF_ypred'] = rf_y_pred
data['RF_y2pred'] = rf_y2_pred

print('\n\nRANDOM FOREST STATISTICS')
print('Points Spread MSE (Regressor):', rf_mse)
print('Percent Win/Loss Prediction (Classifier): ', rf_acc, '%')
# The classifier is given the unscaled feature columns here (no sc.transform, unlike the logistic model)
print('Classification Report\n', classification_report(y2, rf_class.predict(X[features])))


# # Combine Models
# mse_av, acc_av, y_tests, y2_tests, y_preds, y2_preds, y2_probs, log_y2_prbs, rf_y2_probs = cbb_fun.combine_models(X, y, y2, features, opp_features, MSE, rf_mse, n_runs=100)

# # Save Average MSE and Accuracy
# with open('av_mse.pickle', 'wb') as f:
#     pickle.dump(mse_av, f)
# with open('acc_av.pickle', 'wb') as f:
#     pickle.dump(acc_av, f)

# print('\n\nModel Combination MSE:', mse_av)
# print('Model Combination Accuracy:', acc_av)


# ## Create Confidence Distributions for Given Scores (ie. percent confidence above below a value)
# val_range = np.linspace(-25, 25, 51)
# prob_dists = cbb_fun.create_pts_spread_odds(y_preds, y_tests, val_range)
# dist_models, xs = cbb_fun.create_models(val_range, prob_dists)

# with open('prob_dists.pickle', 'wb') as f:
#     pickle.dump(prob_dists, f)
# with open('val_range.pickle', 'wb') as f:
#     pickle.dump(val_range, f)
# with open('dist_models.pickle', 'wb') as f:
#     pickle.dump(dist_models, f)

        

# game_nums = list(range(10, 32))

# SEs = []
# STDs = []
# MSEs = []
# CVs = []
# pct_accs = []
# avg_lin_coefss = []




# for num in game_nums:
    
#     if num < 31:
#         X = data[data['Game_Number'] == num][features]
#         y = data[data['Game_Number'] == num]['result']
#     else:
#         X = data[data['Game_Number'] >= num][features]
#         y = data[data['Game_Number'] >= num]['result']
    
#     SE, STD, MSE, CV, model, pct_acc, avg_lin_coefs, lr = cbb_fun.get_linear_stats(X, y)
    
#     #print(num)
#     #print(model.summary())
    
#     SEs.append(SE)
#     STDs.append(STD)
#     MSEs.append(MSE)
#     CVs.append(CV)
#     pct_accs.append(pct_acc)
#     avg_lin_coefss.append(avg_lin_coefs)
    
# cbb_fun.plot_params(SEs, STDs, MSEs, CVs, pct_accs, avg_lin_coefss)
    
# Play a sound file after all statements above in this cell have run
playsound('mixkit-intro-transition-1146.wav')

NULL STATISTICS
Standard Error: 0.07949575963209486
Standard Deviation: 14.160959687437149
Mean-squared Error: 200.55760746249842
Percent Win/Loss Prediction: 50 %


HOME NULL CORRECT PREDICTIONS:  63.378092191231474 %


LINEAR STATISTICS
Standard Error: 0.12452919810409246
Standard Deviation: 11.091490680900218
Mean-squared Error: 122.8075188149776
Percent Win/Loss Prediction:  71.66398588175973 %
Cross-validation score: 0.38727317803426964
                                 OLS Regression Results                                
Dep. Variable:                 result   R-squared (uncentered):                   0.388
Model:                            OLS   Adj. R-squared (uncentered):              0.388
Method:                 Least Squares   F-statistic:                              1006.
Date:                Thu, 04 Mar 2021   Prob (F-statistic):                        0.00
Time:                        16:00:34   Log-Likelihood:                         -91027.
No. Observations:         

In [28]:
################ Make Predictions ##################
year = '2019'
teams_df, teams, team_dict = cbb_fun.get_team_data(teams_source, year, scrape_data=False) # grab data for each team during the 'year'

# Team names are kept in their own variables so `team1`/`team2` only ever hold dataframes
team1_name = 'Duke'
team2_name = 'Michigan State'

#team_feat = ['Avg_Result', 'Avg_Result2', 'Home', 'Away', 'SOS', 'Matchup_Comp', 'Opp_Matchup_Comp', 'Avg_FG_Pct', 'Opp_Avg_FG_Pct', 'Avg_FG_Pct2', 'Opp_Avg_FG_Pct2']
team_feat = ['Avg_Result', 'Home', 'Away', 'SOS', 'Matchup_Comp', 'Opp_Matchup_Comp', 'Avg_FG_Pct', 'Opp_Avg_FG_Pct', 'Avg_FG_Pct2', 'Opp_Avg_FG_Pct2']
#opp_features = ['Avg_Result2', 'Avg_Result', 'Home2', 'Away2', 'Opp_SOS', 'Opp_Matchup_Comp', 'Matchup_Comp', 'Avg_FG_Pct2', 'Opp_Avg_FG_Pct2', 'Avg_FG_Pct', 'Opp_Avg_FG_Pct']

# Dataframe of the first team, looked up through its position in team_dict
team1 = teams_df[team_dict[team1_name]]

# Feature values taken from the row labelled len(team1) - 1
# (the last row when the index is the default 0..n-1)
Team1 = team1.loc[len(team1) - 1][features]
#Team1_opp = team1.loc[len(team1)- 1][opp_features]

# Models to use for predictions
regr_models = [lin_lr]#, rf_reg]
class_models = [log_lr]#, rf_class]

print(team1_name.upper())
# Send data to models
reg1 = cbb_fun.regr_predict(regr_models, [Team1])
class1 = cbb_fun.class_predict(class_models, [Team1])

print()


## While most games have strong correlation amongst models,
## the game below is an example where there are large discrepancies

print('EXAMPLE OF WEAK AGREEMENT AMONGST MODELS')

team2 = pd.read_csv('./Team_Dataframes/' + year + '/Team_Only/' + team2_name + '.csv')
# NOTE(review): this takes the row labelled len(team2) - 2, one row earlier than the
# one used for Team1 - confirm that the offset is intentional.
Team2 = team2.loc[len(team2) - 2][features]

# Send data to models
reg2 = cbb_fun.regr_predict(regr_models, [Team2])
class2 = cbb_fun.class_predict(class_models, [Team2])

DUKE
LinearRegression(fit_intercept=False) model points spread: 3.62

LinearRegression(fit_intercept=False) model points spread: -0.99

LogisticRegression(max_iter=10000) model prediction: Win (89.0578% probability)

LogisticRegression(max_iter=10000) model prediction: Loss (91.6961% probability)


EXAMPLE OF WEAK AGREEMENT AMONGST MODELS
LinearRegression(fit_intercept=False) model points spread: -0.99

LinearRegression(fit_intercept=False) model points spread: 3.62

LogisticRegression(max_iter=10000) model prediction: Loss (91.6961% probability)

LogisticRegression(max_iter=10000) model prediction: Win (89.0578% probability)



In [47]:
# NOTE(review): unlike the `features` list in the modelling cell earlier in this notebook, this
# list has no 'Pct_Margin' / 'Pct_Margin2' entries - confirm the models used below were fitted
# on exactly these columns.
features = ['Avg_Result', 'Avg_Result2', 'Home', 'Away', 'SOS', 'Opp_SOS', 'Prev_SOS', 'Opp_Prev_SOS', 'SOS_NC', 'Opp_SOS_NC', 'Opp_Avg_FG_Pct', 'Opp_Avg_FG_Pct2', 'Scoring_Pace_Diff']

# Adds the sign-flipped pace column directly to cbb_df
cbb_df['Opp_Scoring_Pace_Diff'] = cbb_df['Scoring_Pace_Diff']*-1

opp_features = ['Avg_Result2', 'Avg_Result', 'Home2', 'Away2', 'Opp_SOS', 'SOS', 'Opp_Prev_SOS', 'Prev_SOS', 'Opp_SOS_NC', 'SOS_NC', 'Opp_Avg_FG_Pct2', 'Opp_Avg_FG_Pct', 'Opp_Scoring_Pace_Diff']

# Last row of cbb_df (by position)
team1 = cbb_df.iloc[-1]

# The same row seen through the `features` columns and through the `opp_features` columns
Team1 = team1[features]
Team1_opp = team1[opp_features]
# Models to use for predictions
regr_models = [lin_lr, rf_reg]
class_models = [log_lr, rf_class]

# Send data to models
# NOTE(review): regr_predict / class_predict are called without the cbb_fun prefix; the
# definitions further down this notebook take (models, games, matchups) and
# (models, games, sc, matchups) and index every game as game[0] / game[1]. These calls do not
# match those signatures - confirm which version of the helpers this cell is meant to use.
reg1 = regr_predict(regr_models, [Team1], [Team1_opp])
class1 = class_predict(class_models, [Team1], [Team1_opp])


# X = data[features]
# y = data['result'] # points spread
# y2 = data['Win'] # Win (1) or Loss (0) result

# rf_regr = RandomForestRegressor()
# params = {'n_estimators': range(100, 251, 50), 'max_depth': range(3, X.shape[1]), 'max_features': range(3, X.shape[1])}
# cvtree = GridSearchCV(rf_regr, params)
# cvtree.fit(X, y)
# best_params = cvtree.best_params_
# print(best_params)
# playsound('mixkit-intro-transition-1146.wav')



LinearRegression(fit_intercept=False) model points spread: -7.98

RandomForestRegressor(max_depth=8, max_features=8, n_estimators=120) model points spread: -6.12

LogisticRegression(max_iter=10000) model prediction: Loss (99.7339% probability)

RandomForestClassifier(max_depth=7, max_features=9, n_estimators=134) model prediction: Loss (72.883% probability)



In [322]:
def regr_predict(models, games, matchups):
    """
        Print a points-spread prediction for every game from every regression model.

        models   -- regression models exposing predict()
        games    -- one entry per game; entry[0] is the feature row for the first team's view
                    and entry[1] the feature row for the opposing view
        matchups -- one entry per game holding the two labels used in the printed line

        The printed spread is the mean of the first-view prediction and the negated
        opposing-view prediction. Nothing is returned.
    """
    for game_idx, game in enumerate(games):
        # every model gets its own line for this game
        for model in models:
            team_view_pred = model.predict([game[0]])
            # flip the sign so both predictions are expressed from the first team's side
            opp_view_pred = -1 * model.predict([game[1]])
            spread = (team_view_pred + opp_view_pred) / 2

            first_label = matchups[game_idx][0]
            second_label = matchups[game_idx][1]
            line = (f"{model} model points spread: {first_label} {round(spread[0], 2)} "
                    f"to {second_label} ({team_view_pred, opp_view_pred})")
            print(line)
        # blank line between games
        print()
        
def class_predict(models, games, sc, matchups):
    """
        Print a win/loss prediction for every game from every classification model.

        models   -- classifiers exposing predict() and predict_proba()
        games    -- one entry per game; entry[0] is the feature row for the first team's view
                    and entry[1] the feature row for the opposing view
        sc       -- fitted scaler; its transform() is applied to both rows before predicting
        matchups -- one entry per game; entry[0] is the label printed for the first team

        A winner is reported only when the two views sum to exactly one predicted win
        (assumes 0/1 class labels - TODO confirm). The printed probability is the mean of the
        matching probabilities from the two views. Otherwise an undetermined-winner line with
        the averaged win probability is printed. Nothing is returned.
    """
    for ii, game in enumerate(games):
        # Scaling does not depend on the model, so do it once per game
        team_view = sc.transform([game[0]])
        opp_view = sc.transform([game[1]])

        # loop through each of the models passed
        for model in models:

            # predicted class and class probabilities from both views
            prediction1 = model.predict(team_view)[0]
            prediction2 = model.predict(opp_view)[0]
            probability1 = model.predict_proba(team_view)[0]
            probability2 = model.predict_proba(opp_view)[0]

            # print model and prediction
            if prediction1 + prediction2 == 1:
                if prediction1 == 1:
                    # first view says win, opposing view says loss
                    probability = (probability1[1] + probability2[0]) / 2
                    print(f"{model} model prediction: {matchups[ii][0]} Win ({round(probability * 100, 4)}% probability)\n")
                else:
                    # first view says loss, opposing view says win
                    probability = (probability1[0] + probability2[1]) / 2
                    print(f"{model} model prediction: {matchups[ii][0]} Loss ({round(probability * 100, 4)}% probability)\n")
            else:
                # the two views contradict each other
                print(f"Undeterminded Winner: (Win probability: {(probability1[1] + probability2[0]) / 2})")
        print()
            
def get_linear_stats(X, y, features, opp_features, n_runs=10, n_folds=10):
    """
        Repeatedly fit a no-intercept linear regression and return averaged statistics.

        X            -- dataframe containing every column named in features and opp_features
        y            -- target values aligned with X
        features     -- columns the model is fitted on
        opp_features -- columns of the same length/order as features, used in their place to
                        predict the same game from the opposing view
        n_runs       -- number of random train/test splits; more runs give more stable averages
        n_folds      -- number of folds in the cross-validation run of each iteration

        For every split the test prediction is the mean of the prediction on `features` and
        the negated prediction on `opp_features`.

        Returns (in this order): mean standard error, mean standard deviation and mean
        mean-squared error of the test residuals, mean cross-validation score, statsmodels
        OLS results, mean percent of test rows whose predicted sign matches the actual sign,
        the coefficients averaged over all runs, and the LinearRegression model.
        The OLS results and the returned model are fitted on the training part of the
        final split only.

        Raises ValueError if n_runs is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1")

    # Initialized Linear Regression model
    lr = LinearRegression(fit_intercept=False)

    # number of features in model (used as ddof for the residual standard deviation)
    num_features = len(features)

    # The model is fitted with the `features` column names; the opponent-view frame is renamed
    # position by position so newer scikit-learn versions accept it in predict()
    opp_to_team = dict(zip(opp_features, features))

    # statistics that will be averaged over all runs
    MSEs = []
    SE = []
    stds = []
    pct_acc = []
    cv = []
    coefs = []

    # Number of training runs
    # Increase in runs stabilizes results
    for _ in range(n_runs):

        X_train, X_test, y_train, y_test = train_test_split(X, y)

        # Fit on the team-view columns
        X_train_tm = X_train[features]
        lr.fit(X_train_tm, y_train)

        y_pred1 = lr.predict(X_test[features])

        # Same games from the opposing view; sign flipped to express the result for the team
        y_pred2 = -1 * lr.predict(X_test[opp_features].rename(columns=opp_to_team))

        # Difference between predicted results and test data
        y_pred = np.mean([y_pred1, y_pred2], axis=0)
        resid = y_pred - y_test

        # Append statistics
        SE.append(np.std(resid, ddof=num_features) / np.sqrt(np.size(resid)))
        stds.append(np.std(resid, ddof=num_features))
        MSEs.append(mean_squared_error(y_test, y_pred))

        # Fraction of test rows where prediction and actual result have the same sign
        pct_acc.append(np.mean(y_pred * np.asarray(y_test) > 0))

        coefs.append(lr.coef_)

        # Cross-validate the same kind of model that is reported (no intercept)
        cv.append(cross_val_score(LinearRegression(fit_intercept=False), X[features], y, cv=KFold(n_folds, shuffle=True)).mean())

    # Develop OLS model on the training data of the final split
    model = sm.OLS(y_train, X_train_tm, missing='drop') # sm.add_constant(X)
    results = model.fit()

    return np.mean(SE), np.mean(stds), np.mean(MSEs), np.mean(cv), results, np.mean(pct_acc)*100, np.mean(coefs, axis=0), lr

def get_logistic_stats(X, y, features, opp_features, n_runs=10, n_folds=10):
    """
        Repeatedly fit logistic regressions on scaled data and return averaged statistics.

        X            -- dataframe containing every column named in features and opp_features
        y            -- class labels aligned with X
        features     -- columns the primary model is fitted on
        opp_features -- columns of the same length/order as features, used in their place to
                        describe the same game from the opposing view
        n_runs       -- number of random train/test splits; more runs give more stable averages
        n_folds      -- number of folds in the cross-validation run of each iteration

        In every run a scaler and a model are fitted on the training `features`; a second model
        is fitted on the training `opp_features` scaled with the same scaler. The run's
        coefficients are the mean of the first model's and the negated second model's.

        Returns (in this order): mean cross-validated accuracy in percent, the coefficients
        averaged over all runs, the model fitted on `features`, and the scaler. Model and
        scaler come from the final split.

        Raises ValueError if n_runs is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1")

    # Imported here because the notebook's import cell does not provide it
    from sklearn.pipeline import make_pipeline

    # Initialized Logistic Regression models (team view / opposing view)
    lr1 = LogisticRegression(max_iter=10000)
    lr2 = LogisticRegression(max_iter=10000)

    # The scaler is fitted with the `features` column names; the opponent-view frame is renamed
    # position by position so newer scikit-learn versions accept it in transform()
    opp_to_team = dict(zip(opp_features, features))

    # statistics that will be averaged over all runs
    cv = []
    coefs = []

    # Number of training runs
    # Increase in runs stabilizes results
    for _ in range(n_runs):

        # Only the training part is used for fitting; the split keeps the same proportions
        X_train, _, y_train, _ = train_test_split(X, y)

        sc = StandardScaler()
        X_train_tm = sc.fit_transform(X_train[features])
        lr1.fit(X_train_tm, y_train)

        # Uses only opp_features, scaled with the scaler fitted on `features`
        X_train_opp = sc.transform(X_train[opp_features].rename(columns=opp_to_team))
        lr2.fit(X_train_opp, y_train)

        coefs.append(np.mean([lr1.coef_, -1 * lr2.coef_], axis=0))

        # Cross-validate on the same columns as the returned model (as get_linear_stats does);
        # the pipeline re-fits the scaler inside every fold instead of on the full data
        cv_model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=10000))
        cv.append(cross_val_score(cv_model, X[features], y, cv=KFold(n_folds, shuffle=True), scoring='accuracy').mean())

    return np.mean(cv)*100, np.mean(coefs, axis=0), lr1, sc

In [124]:
# determine features to be sent into models
features = ['Avg_Result', 'Avg_Result2', 'Home', 'Away', 'SOS', 'Opp_SOS', 'Prev_SOS', 'Opp_Prev_SOS', 'SOS_NC', 'Opp_SOS_NC', 'Opp_Avg_FG_Pct', 'Opp_Avg_FG_Pct2', 'Scoring_Pace_Diff']

# number of features
num_features = len(features)

#data = cbb_df.dropna(subset=features)
data = cbb_df[cbb_df['Game_Number'] > 9].dropna(subset=features)

data['Opp_Scoring_Pace_Diff'] = data['Scoring_Pace_Diff']*-1

opp_features = ['Avg_Result2', 'Avg_Result', 'Home2', 'Away2', 'Opp_SOS', 'SOS', 'Opp_Prev_SOS', 'Prev_SOS', 'Opp_SOS_NC', 'SOS_NC', 'Opp_Avg_FG_Pct2', 'Opp_Avg_FG_Pct', 'Opp_Scoring_Pace_Diff']

X = data[list(set(features + opp_features))]
y = data['result'] # points spread
y2 = data['Win'] # Win (1) or Loss (0) result

# Send data to Logistic Regression model and get statistics (increase n_runs for stability in results - ~30 sec per 100 runs)
score, avg_log_coefs, log_lr, sc = get_logistic_stats(X, y2, features, opp_features, n_runs=1)

print('\n\nLOGISITC STATISTICS')
print('Score: ', score, '%')
print('Average Coefficients:', avg_log_coefs)
logs = log_lr.predict(sc.transform(X[features]))
print('Classification Report\n', classification_report(y2, logs))



LOGISITC STATISTICS
Score:  71.55549909036768 %
Average Coefficients: [[ 1.10419436 -1.10159036  0.26531197 -0.26593291  0.25394037 -0.2550779
   0.2370644  -0.23890528  0.11642414 -0.11718938  0.08882335 -0.08930684
  -0.07394545]]
Classification Report
               precision    recall  f1-score   support

           0       0.71      0.71      0.71     15734
           1       0.72      0.72      0.72     15998

    accuracy                           0.72     31732
   macro avg       0.72      0.72      0.72     31732
weighted avg       0.72      0.72      0.72     31732



In [24]:
# Hyperparameter search for the random-forest points-spread regressor
features = ['Avg_Result', 'Avg_Result2', 'Home', 'Away', 'SOS', 'Opp_SOS', 'Prev_SOS', 'Opp_Prev_SOS', 'SOS_NC', 'Opp_SOS_NC', 'Opp_Avg_FG_Pct', 'Opp_Avg_FG_Pct2', 'Scoring_Pace_Diff']
data = cbb_df[cbb_df['Game_Number'] > 9].dropna(subset=features)

opp_features = ['Avg_Result2', 'Avg_Result', 'Home2', 'Away2', 'Opp_SOS', 'SOS', 'Opp_Prev_SOS', 'Prev_SOS', 'Opp_SOS_NC', 'SOS_NC', 'Opp_Avg_FG_Pct2', 'Opp_Avg_FG_Pct', 'Opp_Scoring_Pace_Diff']

X = data[features]
y = data['result'] # points spread
y2 = data['Win'] # Win (1) or Loss (0) result

# This cell deliberately does not rebind `sc` or `rf_class`: the search below is fitted on the
# unscaled X and only tunes the regressor, and other cells use those two names for the fitted
# scaler and the fitted classifier.
rf_regr = RandomForestRegressor()

# Depth and feature-count ranges are bounded by the number of columns in X
params = {'n_estimators': range(150, 301, 50), 'max_depth': range(6, X.shape[1]), 'max_features': range(3, X.shape[1])}
# To tune the classifier instead: GridSearchCV(RandomForestClassifier(), params).fit(X, y2)
cvtree = GridSearchCV(rf_regr, params, scoring='neg_mean_squared_error', n_jobs=2)
cvtree.fit(X, y)
best_params = cvtree.best_params_
print(best_params)
playsound('mixkit-intro-transition-1146.wav')
# {'max_depth': 9, 'max_features': 8, 'n_estimators': 200} - 70.9788,71.05131 - 100,250,50 - 3,14 - 3,14 - no PctMargins
#{'max_depth': 9, 'max_features': 6, 'n_estimators': 175} - 71.0607, 70.9631 - 150,250,25 - 3,13 - 3,13 - no PctMargins
#{'max_depth': 12, 'max_features': 3, 'n_estimators': 180} - 71.0638,70.9851 160,220,10 - 3,14 - 3,14 - PctMargins included
#{'max_depth': 9, 'max_features': 5, 'n_estimators': 188} - 71.0387, 71.0072 - 170,180,2 - 3,14 - 3,14 - PctMargins included

#{'max_depth': 12, 'max_features': 5, 'n_estimators': 250} - 126.0945,125.9407 - 100,250,50 - 3,14 - 3,14 - PctMargins included
#{'max_depth': 11, 'max_features': 5, 'n_estimators': 250} - 126.0905,126.2703 - 250,350,50 - 7,14 - 3,10 - PctMargins included
#{'max_depth': 11, 'max_features': 8, 'n_estimators': 250} - 126.2227,126.3123 - 150,300,50 - 4,12 - 3,12 - no PctMargins
#{'max_depth': 11, 'max_features': 7, 'n_estimators': 255} - 126.1556,126.2016 - 225,275,10 - 6,12 - 3,12 - no PctMargins
#{'max_depth': 11, 'max_features': 8, 'n_estimators': 262} - 126.1543,126.5247 - 248,262,2 - 8,12 - 5,12 - no PctMargins
#{'max_depth': 10, 'max_features': 7, 'n_estimators': 300} - 126.3155,126.3439 - 150,300,50 - 6,12 - 3,12 - no PctMargins, specified MSE

{'max_depth': 10, 'max_features': 7, 'n_estimators': 300}


In [26]:
# Mean negative MSE of the tuned random-forest regressor over 10 shuffled folds
tuned_rf_regr = RandomForestRegressor(n_estimators=300, max_depth=10, max_features=7)
fold_scores = cross_val_score(
    tuned_rf_regr,
    X,
    y,
    cv=KFold(10, shuffle=True),
    scoring='neg_mean_squared_error',
)
accs = fold_scores.mean()
accs

-126.34392648988083

In [None]:
# Hyperparameter search for the random-forest win/loss classifier on the moving-average features
features = ['SMA_Result', 'SMA_Result2', 'Home', 'Away', 'SOS', 'Opp_SOS', 'Prev_SOS', 'Opp_Prev_SOS', 'SMA_Pct_Margin', 'SMA_Pct_Margin2', 
            'Prev_Pct_Margin', 'Prev_Pct_Margin2', 'SOS_NC', 'Opp_SOS_NC', 'EMA_Opp_FG_Pct', 'EMA_Opp_FG_Pct2', 'SMA_Comb_Loc', 'SMA_Comb_Loc2', 'Prev_Comb_Loc', 'Prev_Comb_Loc2',
            'Scoring_Pace_Diff', 'Avg_Result_NC', 'Avg_Result_NC2']

# number of features
num_features = len(features)

#data = cbb_df.dropna(subset=features)
data = cbb_df[cbb_df['Game_Number'] > 9].dropna(subset=features)

X = data[features]
y2 = data['Win']

# Scaler used only for this search (kept separate from the `sc` used with the logistic model)
sc_grid = StandardScaler()
Xsc = sc_grid.fit_transform(X)

# The estimator is passed inline so the name `rf_class`, which other cells use for the
# fitted classifier, is not rebound to an unfitted one.
# Depth and feature-count ranges are bounded by the number of columns in Xsc
params = {'n_estimators': range(175, 276, 25), 'max_depth': range(4, Xsc.shape[1]), 'max_features': range(4, Xsc.shape[1])}
cvtree = GridSearchCV(RandomForestClassifier(), params)
cvtree.fit(Xsc, y2)
best_params = cvtree.best_params_
print(best_params)