In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
import sklearn.metrics as metrics
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
# Let wide frames show up to 100 columns before pandas truncates the display.
pd.set_option('display.max_columns', 100)

## Pre-processing

In [2]:
# Read both input tables (paths are relative to the notebook's working directory):
# first the historical over/under lines, then the per-team statistics.
over_unders, stats = (
    pd.read_csv(csv_name)
    for csv_name in ('historical_over_under_data.csv', 'team_stats_00_22.csv')
)

FileNotFoundError: [Errno 2] No such file or directory: 'historical_over_under_data.csv'

In [None]:
# Print (rows, columns), then display the first five rows of the over/under table.
print(over_unders.shape)
over_unders.head(n=5)

In [None]:
# Print (rows, columns), then display the first five rows of the team-stats table.
print(stats.shape)
stats.head(n=5)

In [None]:
# Strip the last three characters of each 'Team' value and count how often each
# remaining prefix appears.
# NOTE(review): the 3-character suffix is presumably a season tag — confirm against the data.
# `.str[:-3]` is the vectorized form of the slice and propagates missing values
# instead of raising on them; `value_counts()` already returns counts in
# descending order, so no extra sort is needed.
o_u_teams = over_unders['Team'].str[:-3]
o_u_teams.value_counts()

In [None]:
# Same check for the stats table: drop the last three characters of 'Team' and
# count occurrences of each prefix, for comparison with the over/under table.
# `.str[:-3]` propagates missing values instead of raising on them, and
# `value_counts()` already sorts counts in descending order.
stats_teams = stats['Team'].str[:-3]
stats_teams.value_counts()

In [None]:
# Inner-join the over/under lines with the team statistics on the shared 'Team' key,
# then show the size of the combined frame.
df = over_unders.merge(stats, on='Team')
df.shape

In [None]:
# Number of missing values in each column of the merged frame.
df.isna().sum()

In [None]:
# Encode the target column 'result' as 1 for 'O' and 0 for 'U'.
# Any value that is not one of those two codes becomes NaN under `.map`.
result_codes = {'O': 1, 'U': 0}
df['result'] = df['result'].map(result_codes)
df.head(n=5)

In [None]:
# Remove every row that contains at least one missing value
# (the original note says this is a single row).
df = df.dropna(axis=0, how='any')

In [None]:
# Baseline ("null model"): the share of each class in the target. Per the original
# analysis, always predicting Under is 51.5% accurate — the number the models must beat.
class_shares = df['result'].value_counts(normalize=True)
class_shares

In [None]:
# (rows, columns) of the frame at this point, i.e. after rows with missing values were dropped.
df.shape

## Feature Engineering

In [None]:
# Look at the first five rows before adding engineered columns.
df.head(n=5)

In [None]:
# 'PW' minus 'W'.
# NOTE(review): presumably Pythagorean (expected) wins minus actual wins — confirm column meanings.
pythag_gap = df['PW'].sub(df['W'])
df['pythag_difference'] = pythag_gap

# Pairwise interaction terms between age, the posted win total, PW and the gap above.
df['age*win_total'] = df['Age'].mul(df['win_total'])
df['age*PW'] = df['Age'].mul(df['PW'])
df['win_total*pythag'] = df['win_total'].mul(pythag_gap)

df.head()

## Modeling

### Logistic Regression

In [None]:
# Set up X and y: the features are every column except the identifier, the target,
# and the columns explicitly excluded below.
X = df.drop(columns = ['Team', 'result', 'actual_wins','MOV', 'SRS', 'TS%', 'TOV%', 'SOS'])
y = df['result']

# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 33)

# Learn the scaling statistics from the training rows only, then apply them to both splits.
ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

# Fit on standardized features; anything passed to `logreg` later must be standardized too.
logreg = LogisticRegression()
logreg.fit(X_train_sc, y_train)
print(f'train score is {logreg.score(X_train_sc, y_train)}')
print(f'test score is {logreg.score(X_test_sc, y_test)}')

# Cross-validate a scaler+model pipeline on the *raw* training rows so the scaler is
# re-fit inside each fold. Cross-validating on X_train_sc would let every validation
# fold see scaling statistics computed from the whole training set (data leakage).
cv_pipeline = make_pipeline(StandardScaler(), LogisticRegression())
print(f'cross val score is {cross_val_score(cv_pipeline, X_train, y_train).mean()}')

In [None]:
# Predicted class for each row of the standardized test matrix.
logreg.predict(X=X_test_sc)

In [None]:
# Pair each feature name with its fitted logistic-regression coefficient (rounded to
# three decimals) and list them from most negative to most positive.
rounded_coefs = np.round(logreg.coef_[0], 3)
coef_df = pd.DataFrame({'features': X_train.columns, 'coef': rounded_coefs})

coef_df.sort_values(by='coef')

In [None]:
# Side-by-side table of the held-out labels and the model's predictions for them,
# indexed like y_test.
test_predictions = logreg.predict(X_test_sc)
pred_df = y_test.to_frame('true_values').assign(prediction=test_predictions)

In [None]:
# How many test rows were predicted as each class.
pred_df.loc[:, 'prediction'].value_counts()

In [None]:
# `logreg` was fit on standardized features (X_train_sc), so it has to be evaluated on
# the standardized test matrix; feeding it the raw X_test produces meaningless predictions.
plot_confusion_matrix(logreg, X_test_sc, y_test, display_labels = ['Under', 'Over']);

In [None]:
# Predict on the standardized test matrix — the model was trained on standardized
# features, so the raw X_test is on the wrong scale for it.
preds = logreg.predict(X_test_sc)

In [None]:
# Fraction of held-out rows whose predicted class matches the true class.
metrics.accuracy_score(y_true=y_test, y_pred=preds)

### Random Forest

In [None]:
# Default-parameter random forest used as a baseline. The seed is fixed so that
# bootstrap sampling and feature selection — and therefore the scores — are
# reproducible across runs (same seed as the grid-searched forest).
rf = RandomForestClassifier(random_state=42)

In [None]:
# Re-create the 80/20 split for the random-forest section. It uses the same seed as
# the logistic-regression split, so the rows in each partition are the same.
X_train, X_test_gs, y_train, y_test_gs = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=33,
)

In [None]:
# Baseline cross-validated accuracy for the untuned forest. This is computed on the
# training split: the held-out split must stay untouched until final evaluation,
# and it is also too small to give stable fold scores.
cross_val_score(rf, X_train, y_train).mean()

In [None]:
# Hyper-parameter grid: every combination below is cross-validated on the training split.
params = dict(
    max_depth=[7, 11, 12, 13],
    max_features=[5, 6, 7, 8],
    min_samples_split=[2, 3, 4, 7, 9],
    n_estimators=[10, 13, 20, 23, 26, 30, 33],
)

# Exhaustive search over the grid with a seeded forest; the best configuration is
# refit on the full training split by GridSearchCV's defaults.
gs = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=params,
    verbose=0,
)
gs.fit(X_train, y_train)

# Winning parameter set and its accuracy on the held-out split.
gs.best_params_, gs.score(X_test_gs, y_test_gs)

In [None]:
# Held-out predictions from the best estimator found by the grid search.
preds_gs = gs.predict(X=X_test_gs)

In [None]:
# Share of held-out rows the tuned forest classifies correctly.
metrics.accuracy_score(y_true=y_test_gs, y_pred=preds_gs)

In [None]:
# Confusion matrix of the tuned forest on the held-out split (class 0 -> 'Under', 1 -> 'Over').
plot_confusion_matrix(
    gs,
    X_test_gs,
    y_test_gs,
    display_labels=['Under', 'Over'],
);

In [None]:
from matplotlib.legend_handler import HandlerLine2D

# Forest sizes to compare.
n_estimators = [1, 2, 4, 8, 16, 32, 64, 100]

train_results = []
test_results = []
for estimator in n_estimators:
    # Seeded so the curve is the same on every run.
    rf = RandomForestClassifier(n_estimators=estimator, n_jobs=-1, random_state=42)
    rf.fit(X_train, y_train)

    # ROC/AUC needs a continuous score: use the predicted probability of the positive
    # class (column 1). Hard 0/1 labels collapse the ROC curve to a single operating
    # point and understate the AUC.
    train_scores = rf.predict_proba(X_train)[:, 1]
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_train, train_scores)
    train_results.append(auc(false_positive_rate, true_positive_rate))

    test_scores = rf.predict_proba(X_test)[:, 1]
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, test_scores)
    test_results.append(auc(false_positive_rate, true_positive_rate))

# Train vs. test AUC as a function of forest size.
line1, = plt.plot(n_estimators, train_results, 'b', label='Train AUC')
line2, = plt.plot(n_estimators, test_results, 'r', label='Test AUC')
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.ylabel('AUC Score')
plt.xlabel('n_estimators')
plt.show()