In [76]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb 
from sklearn.svm import SVC 
from sklearn.neural_network import  MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import brier_score_loss 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import log_loss 
from sklearn.metrics import roc_auc_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.calibration import CalibratedClassifierCV
import seaborn as sns 
import matplotlib.pyplot as plt 


In [77]:
# Never truncate columns when a frame is displayed.
pd.set_option("display.max_columns", None)

# Read the modelling table, restrict it to the five columns used below, and
# one-hot encode both seed columns into integer indicator columns
# (prefixed T1_Seed_* / T2_Seed_*).
df = pd.get_dummies(
    pd.read_csv("../data/modeling/final_ml.csv").loc[
        :, ['Season', 'Team1_Wins', 'reg_season_pred', 'Seed_1', 'Seed_2']
    ],
    columns=['Seed_1', 'Seed_2'],
    prefix=['T1_Seed', 'T2_Seed'],
    dtype=int,
)

In [None]:
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression

def train_and_save_logistic_regression(X, y, model_filename='log_reg_model.joblib', test_size=0.2, random_state=57,
                                       preprocessor_filename=None):
    """
    Trains a Logistic Regression model on preprocessed features and saves it with joblib.

    The data is split with ``train_test_split`` and only the training portion is
    used for fitting; the held-out portion is not used by this function.
    Columns whose name contains "seed" (case-insensitive) are passed through
    unchanged; every other column is standardised with a ``StandardScaler``
    fitted on the training rows only.

    Important: the saved classifier was fitted on the *transformed* matrix
    (scaled columns first, then the passthrough columns). Raw features must be
    run through the fitted preprocessor before calling ``predict`` /
    ``predict_proba`` on the loaded model. Pass ``preprocessor_filename`` to
    persist that preprocessor next to the model so it survives the session.

    Parameters:
    - X: DataFrame of features (column names are used to pick the columns to scale).
    - y: Series or array of target variable.
    - model_filename: Name of the file to save the trained model.
    - test_size: Proportion of data held out by the split (not used for fitting).
    - random_state: Random seed for the train/test split.
    - preprocessor_filename: Optional file name; when given, the fitted
      preprocessor is also saved there. Defaults to None (not saved), which
      matches the previous behaviour.

    Returns:
    - Trained logistic regression model.
    - Fitted preprocessor (for future data transformations).
    """

    # Split the data; only the training part is needed here, but the split is
    # kept so the model is fitted on exactly the same rows as before.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Identify columns to scale (exclude seed-related columns)
    scaled_columns = [col for col in X.columns if 'seed' not in col.lower()]
    passthrough_columns = [col for col in X.columns if 'seed' in col.lower()]

    # Define the preprocessing pipeline
    preprocessor = ColumnTransformer(
        transformers=[
            ('scale', StandardScaler(), scaled_columns),  # Scale selected columns
            ('passthrough', 'passthrough', passthrough_columns)  # Leave seed columns unchanged
        ])

    # Fit the preprocessor on the training rows only, to avoid leaking
    # statistics from the held-out rows into the scaler.
    X_train_transformed = preprocessor.fit_transform(X_train)

    # Train the Logistic Regression model
    logreg = LogisticRegression()
    logreg.fit(X_train_transformed, y_train)

    # Save the trained model
    joblib.dump(logreg, model_filename)
    print(f"Model saved as {model_filename}")

    # Optionally save the fitted preprocessor so the model can be applied to
    # raw features after a kernel restart.
    if preprocessor_filename is not None:
        joblib.dump(preprocessor, preprocessor_filename)
        print(f"Preprocessor saved as {preprocessor_filename}")

    return logreg, preprocessor  # Returning preprocessor helps with future transformations


['log_reg_model.joblib']

In [60]:
# Never truncate rows when a frame is displayed.
pd.set_option("display.max_rows", None)

# Pair every training column with its logistic-regression coefficient and
# order the table from the most negative to the most positive coefficient.
feature_importance = pd.DataFrame(
    {
        'Feature': X_train.columns,
        'Importance': logreg.coef_[0],
    }
).sort_values(by='Importance', ascending=True)

In [61]:
# Candidate values sampled by the randomized search.
param_dist = dict(
    n_estimators=np.arange(100, 1001, 100),
    learning_rate=np.linspace(0.01, 0.3, 10),
    max_depth=np.arange(3, 10),
    min_child_weight=np.arange(1, 10),
    subsample=np.linspace(0.5, 1.0, 5),
    colsample_bytree=np.linspace(0.5, 1.0, 5),
)

# Base estimator: binary logistic objective, evaluated with log-loss.
xgb_model = xgb.XGBClassifier(objective="binary:logistic", eval_metric="logloss")

# Randomized search scored by the negated Brier score: 20 sampled
# configurations, each evaluated with 3-fold cross-validation on all cores.
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=20,
    scoring='neg_brier_score',
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train, y_train)

# Report the winning configuration.
print("Best Parameters:", random_search.best_params_)

# The scorer is negated, so flip the sign to recover the Brier score itself.
best_brier_score = -random_search.best_score_
print("Best Brier Score:", best_brier_score)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] END colsample_bytree=0.875, learning_rate=0.2677777777777778, max_depth=8, min_child_weight=1, n_estimators=200, subsample=0.875; total time=   0.2s
[CV] END colsample_bytree=0.875, learning_rate=0.2677777777777778, max_depth=8, min_child_weight=1, n_estimators=200, subsample=0.875; total time=   0.2s
[CV] END colsample_bytree=0.875, learning_rate=0.2677777777777778, max_depth=8, min_child_weight=1, n_estimators=200, subsample=0.875; total time=   0.2s
[CV] END colsample_bytree=1.0, learning_rate=0.20333333333333334, max_depth=7, min_child_weight=4, n_estimators=400, subsample=0.75; total time=   0.4s
[CV] END colsample_bytree=1.0, learning_rate=0.20333333333333334, max_depth=7, min_child_weight=4, n_estimators=400, subsample=0.75; total time=   0.4s
[CV] END colsample_bytree=0.875, learning_rate=0.1711111111111111, max_depth=3, min_child_weight=1, n_estimators=400, subsample=0.875; total time=   0.3s
[CV] END colsample_

In [67]:
# Fit an XGBoost classifier with fixed, hand-entered hyperparameters and
# write the fitted booster to disk in XGBoost's JSON format.
xgb_model = xgb.XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    n_estimators=500,
    learning_rate=0.01,
    max_depth=7,
    min_child_weight=1,
    subsample=0.5,
    colsample_bytree=0.75,
)
xgb_model.fit(X_train, y_train)
xgb_model.save_model('xgb_model.json')

In [64]:
def test_model_on_each_season(model, df, feature_columns):
    seasons = df['Season'].unique()
    brier_scores = {}

    for test_season in seasons:
        print(f"Testing model on Season {test_season}...")

        test_data = df[df['Season'] == test_season]
        X_test = test_data[feature_columns]
        y_test = test_data['Team1_Wins']

        # Predict probabilities
        y_pred_probs = model.predict_proba(X_test)[:, 1]  # Probabilities for class 1 (Team1 winning)

        # Compute Brier Score
        brier = brier_score_loss(y_test, y_pred_probs)
        brier_scores[test_season] = brier

        print(f"Season {test_season} - Brier Score: {brier:.4f}")

    # Print summary
    print("\nBrier Scores by Season:")
    for season, score in brier_scores.items():
        print(f"Season {season}: {score:.4f}")

    # Average Brier Score across all seasons
    avg_brier = np.mean(list(brier_scores.values()))
    print(f"\nAverage Brier Score: {avg_brier:.4f}")

    return brier_scores, avg_brier


In [75]:

# Restore both persisted estimators from disk.
# NOTE: joblib.load unpickles the file, so only load artefacts you produced yourself.
xgb_model = xgb.XGBClassifier()
xgb_model.load_model("xgb_model.json")
log_model = joblib.load('log_reg_model.joblib')

# Model inputs: every column of df except the season and the target.
features = df.columns.drop(['Season', 'Team1_Wins'])

# Per-season Brier scores of the XGBoost model over all rows of df.
brier_scores, avg_brier = test_model_on_each_season(
    model=xgb_model,
    df=df,
    feature_columns=features,
)

Testing model on Season 2014...
Season 2014 - Brier Score: 0.1774
Testing model on Season 2015...
Season 2015 - Brier Score: 0.1420
Testing model on Season 2016...
Season 2016 - Brier Score: 0.1734
Testing model on Season 2017...
Season 2017 - Brier Score: 0.1273
Testing model on Season 2018...
Season 2018 - Brier Score: 0.1612
Testing model on Season 2019...
Season 2019 - Brier Score: 0.1308
Testing model on Season 2021...
Season 2021 - Brier Score: 0.1565
Testing model on Season 2022...
Season 2022 - Brier Score: 0.1613
Testing model on Season 2023...
Season 2023 - Brier Score: 0.1795
Testing model on Season 2024...
Season 2024 - Brier Score: 0.1561

Brier Scores by Season:
Season 2014: 0.1774
Season 2015: 0.1420
Season 2016: 0.1734
Season 2017: 0.1273
Season 2018: 0.1612
Season 2019: 0.1308
Season 2021: 0.1565
Season 2022: 0.1613
Season 2023: 0.1795
Season 2024: 0.1561

Average Brier Score: 0.1566
