In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# import sklearn

# Read the processed, feature-engineered table that the later cells model on.
ENGINEERED_CSV = '../../data/processed/engineered.csv'
df = pd.read_csv(ENGINEERED_CSV)

## Initial set of columns

As a first pass, I feed all of the rolling-average columns into an XGBoost model.

In [None]:
# Model inputs, grouped by the naming pattern of the columns. The groups are
# concatenated in a fixed order, which determines the column order of the
# feature matrix.
team_rolling_cols = [
    'eFG_pct_avg_last_10',
    'tov_rate_avg_last_10',
    'oreb_pct_avg_last_10',
    'ftr_avg_last_10',
    'ortg_avg_last_10',
    'drtg_avg_last_10',
    'covered_avg_last_10',
]
game_context_cols = ['days_of_rest', 'home_team']
opponent_rolling_cols = ['opp_ortg_avg_last_10', 'opp_drtg_avg_last_10']

features = team_rolling_cols + game_context_cols + opponent_rolling_cols

# Name of the label column.
target = 'covered'

# Sanity check: (number of rows, number of selected feature columns).
df.loc[:, features].shape

## Baseline model

XGBoost classifier predicting the `covered` target for a game from the features listed above, evaluated with a 5-fold `TimeSeriesSplit`.

In [None]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
)
from sklearn.model_selection import TimeSeriesSplit
from xgboost import XGBClassifier

# Feature matrix and label vector. The label column is taken from `target` so
# the column name is defined in exactly one place.
X = df[features]
y = df[target]

# Each split trains on an initial run of rows and tests on the rows that follow.
# NOTE(review): this is only an out-of-time evaluation if `df` is already in
# chronological order -- confirm the row order of the CSV before trusting it.
tscv = TimeSeriesSplit(n_splits=5)

xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'n_estimators': 600,
    'learning_rate': 0.05,
    'max_depth': 6,
    'min_child_weight': 4,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'gamma': 0.3,
    'reg_lambda': 2.0,
    'reg_alpha': 0.5,
    'random_state': 42,
}

# (auc, accuracy) for every fold, collected for the summary printed below.
fold_scores = []

for train_idx, test_idx in tscv.split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # A fresh estimator per fold makes it explicit that folds are independent.
    model = XGBClassifier(**xgb_params)
    model.fit(X_train, y_train)

    # Probability of the positive class for each test row.
    preds = model.predict_proba(X_test)[:, 1]

    auc = roc_auc_score(y_test, preds)
    acc = accuracy_score(y_test, (preds > 0.5).astype(int))
    fold_scores.append((auc, acc))
    print(f"AUC: {auc:.3f} | Accuracy: {acc:.3f}")

# `model`, `preds`, `y_test` and `test_idx` intentionally keep their values from
# the final fold; the plotting cells further down read them.

# Summarise across folds so a single lucky or unlucky split is not over-read.
mean_auc, mean_acc = np.mean(fold_scores, axis=0)
std_auc, std_acc = np.std(fold_scores, axis=0)
print(
    f"Mean AUC: {mean_auc:.3f} (+/- {std_auc:.3f}) | "
    f"Mean Accuracy: {mean_acc:.3f} (+/- {std_acc:.3f})"
)



In [None]:
from xgboost import plot_importance

# Feature-importance chart for whatever `model` currently holds, drawn on an
# explicitly created axes instead of the implicit one.
importance_fig, importance_ax = plt.subplots(1, 1)
plot_importance(model, ax=importance_ax)
plt.show()

In [None]:
from sklearn.calibration import calibration_curve

# Calibration curve computed from `y_test` and `preds` as they stand when this
# cell runs, using 10 probability bins.
prob_true, prob_pred = calibration_curve(y_test, preds, n_bins=10)

calibration_fig, calibration_ax = plt.subplots()
calibration_ax.plot(prob_pred, prob_true, marker='o')
calibration_ax.set_xlabel('Predicted Probability')
calibration_ax.set_ylabel('True Frequency')
calibration_ax.set_title('Calibration Curve')
plt.show()


In [None]:
# `test_idx` holds row *positions* produced by the CV splitter, whereas `.loc`
# selects by index *label*. Translate positions to labels first so the right
# rows are written even when `df` does not carry a default 0..n-1 index.
test_rows = df.index[test_idx]

df.loc[test_rows, 'pred_prob'] = preds
df.loc[test_rows, 'edge'] = df.loc[test_rows, 'pred_prob'] - 0.5  # edge vs implied 50/50 coin flip

# Rows that were never scored have no edge value (NaN); plot only scored rows.
edge_fig, edge_ax = plt.subplots()
edge_ax.hist(df['edge'].dropna(), bins=50)
edge_ax.set_xlabel('Edge (predicted probability - 0.5)')
edge_ax.set_ylabel('Number of games')
edge_ax.set_title('Distribution of predicted edge')
plt.show()