In [50]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import brier_score_loss 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import log_loss 
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import PCA 
import seaborn as sns 
import matplotlib.pyplot as plt 

In [27]:
# Preprocessed data model 
df = pd.read_csv('../data/modeling/reg_season_ml.csv')
pd.set_option("display.max_columns", None)
len(df.columns)

113

# Logistic Regression 

Explained Variance Ratio: [0.05475942 0.04145777]


In [None]:
x = df.drop(columns=['Season', 'DayNum', 'Team1', 'Team2', 'Team1_Wins', 
                    'Effective_FG_Percentage_1', 'Effective_FG_Percentage_2',
                    'Conf_pac_ten_1', 'Conf_pac_ten_2', 'Conf_gwc_1', 'Conf_gwc_2', 'Conf_mid_cont_1', 'Conf_mid_cont_2'])
y = df['Team1_Wins']

# Train the model 
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Normalize the data 
scaler = StandardScaler()

# Fit on training data and transform both train & test sets
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the Logistic Regression model
logreg = LogisticRegression(C=100)
logreg.fit(X_train_scaled, y_train)

### Feature Engineering

Analyze which features are the most important

In [45]:
pd.set_option("display.max_rows", None)

feature_importance = pd.DataFrame({
    'Feature': X_train.columns, 
    'Importance': logreg.coef_[0]
})

feature_importance = feature_importance.sort_values(by='Importance', ascending=True)
feature_importance 

Unnamed: 0,Feature,Importance
50,Win_Percentage_2,-0.879207
99,srs_rating_2,-0.613658
73,Conf_big_twelve_2,-0.280156
72,Conf_big_ten_2,-0.275829
89,Conf_sec_2,-0.265627
44,Conf_swac_1,-0.261507
69,Conf_big_east_2,-0.260098
67,Conf_acc_2,-0.247776
32,Conf_meac_1,-0.220292
87,Conf_pac_twelve_2,-0.215969


Identify which features are highly correlated and can be removed

In [43]:
corr_matrix = X_train.corr().abs() 
# Select upper triangle of correlation matrix (to avoid duplicate pairs)
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
# Find features with correlation > 0.9
high_correlation = [(column, index) for column in upper.columns for index in upper.index if upper[column][index] > 0.9]

# Print correlated feature pairs
for f1, f2 in high_correlation:
    print(f"High correlation: {f1} <--> {f2} (r = {corr_matrix.loc[f1, f2]:.2f})")

High correlation: srs_rating_1 <--> Win_Percentage_1 (r = 0.91)
High correlation: srs_rating_2 <--> Win_Percentage_2 (r = 0.91)


### Model Evaluation

In [67]:
# Make predictions on the test set
y_pred = logreg.predict(X_test)
y_pred_probs = logreg.predict_proba(X_test_scaled)[:, 1]

brier = brier_score_loss(y_test, y_pred_probs)
print(f'Brier Score: {brier:.4f}')

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')

logloss = log_loss(y_test, y_pred_probs)
print(f'Log Loss: {logloss:.4f}')

auc = roc_auc_score(y_test, y_pred_probs)
print(f'ROC-AUC Score: {auc:.4f}')

Brier Score: 0.1649
Accuracy: 0.7163
Log Loss: 0.4943
ROC-AUC Score: 0.8370




### Outcome 
After analyzing the different solvers I identified that 'saga' performed the best and worked well with the large feature set.

**Feature Importance** 

Features that were found to have minimal impact on the model include: 
  - **Conf_pac_10**: Conference rebranded to the Pac 12 with the addition of Colorado and Utah 
  - **Conf_gwc**: Great Western Conference became defunct in 2014
  - **Conf_mid_cont**: Mid Continental conference rebranded to the summit league in 2007

**Feature Correlation** 
Effective FG Percentage and FG Percentage were highly correlated, as expceted. This led me to remove Effective FG Percentage which helped increase the accuracy of hte model 