In [147]:
import pandas as pd
import numpy as np
import joblib
import json 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb 
from sklearn.svm import SVC 
from sklearn.neural_network import  MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import brier_score_loss 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import log_loss 
from sklearn.metrics import roc_auc_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.calibration import CalibratedClassifierCV
import seaborn as sns 
import matplotlib.pyplot as plt 


In [None]:
def predict_tournament_games(year, model_path="best_xgb_model.model", best_features_path="best_features.json", data_path="../data/modeling/final_ml.csv", teams_path="../data/MTeams.csv"):
    """Score one season's games with the saved XGBoost model.

    Parameters
    ----------
    year : int
        Season to score; rows of ``data_path`` whose ``Season`` equals this
        value are used.
    model_path : str
        File handed to ``XGBClassifier.load_model``.
    best_features_path : str
        JSON file providing ``selected_features``, ``brier_scores`` (a list of
        entries with ``season`` and ``brier_score`` keys) and ``brier_score``.
    data_path : str
        CSV of matchups. Must contain ``Team1``, ``Team2``, ``Season``,
        ``Seed_1``, ``Seed_2``, ``Team1_Wins``, ``reg_season_pred`` and every
        feature named in ``selected_features``.
    teams_path : str
        CSV with ``TeamID`` and ``TeamName`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per game of ``year`` (in file order, fresh 0..n-1 index) with
        columns ``Seed_1, Team1, TeamName_1, Seed_2, Team2, TeamName_2,
        Predicted_Prob_Team1_Wins, Team1_Wins``. Empty (same columns) when the
        file has no rows for ``year``.
    """
    output_columns = [
        'Seed_1', 'Team1', 'TeamName_1', 'Seed_2', 'Team2', 'TeamName_2',
        'Predicted_Prob_Team1_Wins', 'Team1_Wins'
    ]

    # Load the data
    games = pd.read_csv(data_path)
    in_season = games['Season'] == year

    # One-hot encode the seeds over the whole file (not only the requested
    # season) so the dummy columns do not depend on which seeds occur in `year`.
    encoded = pd.get_dummies(games, columns=['Seed_1', 'Seed_2'], prefix=['T1_Seed', 'T2_Seed'], dtype=int)

    # Load the best features from JSON
    with open(best_features_path, "r") as f:
        best_features = json.load(f)

    selected_features = ['reg_season_pred'] + best_features["selected_features"]
    print("Selected Features:", selected_features)

    # Brier score recorded for the requested season (None when not recorded)
    brier_score_for_year = next(
        (entry["brier_score"] for entry in best_features["brier_scores"] if entry["season"] == year),
        None,
    )
    print(f"Season {year} Brier Score: {brier_score_for_year}")
    print(f"Brier Score on test set: {best_features['brier_score']}")

    seed_columns = [col for col in encoded.columns if 'seed' in col.lower()]
    feature_columns = selected_features + seed_columns

    # Model inputs for the requested season
    X_year = encoded.loc[in_season, feature_columns]

    # Raw seeds, team ids and outcomes come straight from the same rows.
    # Re-attaching seeds with a merge on (Team1, Team2, Season) would multiply
    # rows whenever a key is repeated in the file.
    predictions_year = games.loc[
        in_season, ['Seed_1', 'Team1', 'Seed_2', 'Team2', 'Team1_Wins']
    ].reset_index(drop=True)

    if X_year.empty:
        # Nothing to score for this season; skip the model call entirely.
        return predictions_year.reindex(columns=output_columns)

    # Load the trained model
    loaded_model = xgb.XGBClassifier()
    loaded_model.load_model(model_path)

    # Column 1 of predict_proba is the probability of the positive class (Team1 wins).
    # X_year and predictions_year share row order, so positional assignment is aligned.
    predictions_year['Predicted_Prob_Team1_Wins'] = loaded_model.predict_proba(X_year)[:, 1]

    # Look team names up by id; unknown ids become NaN, as with a left merge.
    teams = pd.read_csv(teams_path)
    team_names = teams.drop_duplicates(subset='TeamID').set_index('TeamID')['TeamName']
    predictions_year['TeamName_1'] = predictions_year['Team1'].map(team_names)
    predictions_year['TeamName_2'] = predictions_year['Team2'].map(team_names)

    return predictions_year.reindex(columns=output_columns)

In [160]:
# Example usage
year = 2023
model_path = "best_xgb_model.model"
best_features_path = "best_features.json"
data_path = "../data/modeling/final_ml.csv"
teams_path = "../data/MTeams.csv"

# Pass the paths explicitly so that editing the variables above actually
# changes what gets loaded (they were previously defined but never used).
predictions = predict_tournament_games(
    year,
    model_path=model_path,
    best_features_path=best_features_path,
    data_path=data_path,
    teams_path=teams_path,
)

# Show all rows
pd.set_option("display.max_rows", None)
predictions

Selected Features: ['reg_season_pred', 'Win_Percentage_1', 'Win_Percentage_2', 'Win_pct_last_10_games_1', 'Win_pct_last_10_games_2', 'defensiveRating_1', 'defensiveRating_2', 'Turnovers_Per_Game_1', 'Turnovers_Per_Game_2', '3PAr_1', '3PAr_2', 'Offensive_Rebound_Rate_1', 'Offensive_Rebound_Rate_2', 'offensiveRating_1', 'offensiveRating_2', 'Pace_1', 'Pace_2']
Season 2023 Brier Score: 0.17844007966399103
Average Brier Score: 0.16002318726712422


Unnamed: 0,Seed_1,Team1,TeamName_1,Seed_2,Team2,TeamName_2,Predicted_Prob_Team1_Wins,Team1_Wins
0,11,1338,Pittsburgh,11,1280,Mississippi St,0.607492,1
1,16,1394,TAM C. Christi,16,1369,SE Missouri St,0.65662,1
2,11,1113,Arizona St,11,1305,Nevada,0.419854,1
3,16,1192,F Dickinson,16,1411,TX Southern,0.63888,1
4,1,1104,Alabama,16,1394,TAM C. Christi,0.853838,1
5,8,1116,Arkansas,9,1228,Illinois,0.590486,1
6,9,1120,Auburn,8,1234,Iowa,0.4635,1
7,5,1181,Duke,12,1331,Oral Roberts,0.798741,1
8,13,1202,Furman,4,1438,Virginia,0.314721,1
9,1,1222,Houston,16,1297,N Kentucky,0.885139,1


In [None]:
# Show every column when a frame is displayed
pd.set_option("display.max_columns", None)
df_full = pd.read_csv("../data/modeling/final_ml.csv")

# Modeling frame: drop the team ids, conference labels and the per-game
# three-point / free-throw columns, then one-hot encode both seed columns.
df = pd.get_dummies(
    df_full.drop(columns=[
        'Team1', 'Team2',
        'ConfAbbrev_1', 'ConfAbbrev_2',
        'Threes_Per_Game_1', 'Threes_Per_Game_2',
        'Free_Throws_Per_Game_1', 'Free_Throws_Per_Game_2',
    ]),
    columns=['Seed_1', 'Seed_2'],
    prefix=['T1_Seed', 'T2_Seed'],
    dtype=int,
)

# Target is Team1_Wins; features are everything except the season and the target
y = df['Team1_Wins']
X = df.drop(columns=['Season', 'Team1_Wins'])
df_full.head(5)

In [None]:
# Split the data: hold out 20% of the rows, with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Load the saved XGBoost model from disk (nothing is trained in this cell)
loaded_model = xgb.XGBClassifier()
loaded_model.load_model("best_xgb_model.model")

# Disabled: fit and save a default-hyperparameter model on the split above.
# Kept as comments rather than a bare string literal so the cell does not
# echo the text as its output.
# xgb_model = xgb.XGBClassifier(
#     objective="binary:logistic", eval_metric="logloss",
# )
# xgb_model.fit(X_train, y_train)
# xgb_model.save_model('xgb_model_2.json')

In [None]:
# Leave out team ids, seeds, conference labels and every column whose name
# ends in "_2" before computing correlations.
columns_to_drop = ['Team1', 'Team2', 'Seed_1', 'Seed_2', 'ConfAbbrev_1', 'ConfAbbrev_2']
columns_to_drop.extend(name for name in df_full.columns if name.endswith('_2'))

# Pairwise correlations of the remaining columns
df_corr = df_full.drop(columns=columns_to_drop)
correlation_matrix = df_corr.corr()

# Annotated heatmap on a 12x10 figure, colour scale pinned to [-1, 1]
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    vmin=-1,
    vmax=1,
    linewidths=0.5,
    ax=ax,
)
ax.set_title("Feature Correlation Matrix")
plt.show()

In [None]:
# Pairs whose absolute correlation exceeds this value are reported
threshold = 0.8

# Walk the strict lower triangle of the matrix so each pair is visited once
corr_columns = correlation_matrix.columns
highly_correlated = {
    (corr_columns[row], corr_columns[col], correlation_matrix.iloc[row, col])
    for row in range(len(corr_columns))
    for col in range(row)
    if abs(correlation_matrix.iloc[row, col]) > threshold
}

# Strongest relationships first, regardless of sign
highly_correlated_sorted = sorted(
    highly_correlated,
    key=lambda entry: abs(entry[2]),
    reverse=True,
)

print("Highly Correlated Feature Pairs (Descending Order):")
for first_feature, second_feature, corr_value in highly_correlated_sorted:
    print(f"{first_feature} and {second_feature} (Correlation: {corr_value:.2f})")

In [None]:
# Show all rows
pd.set_option("display.max_rows", None)

# Use the model that is actually loaded earlier in this notebook; `xgb_model`
# is never defined on a fresh kernel because its training code is disabled.
importances = loaded_model.feature_importances_

# Prefer the feature names stored with the model, since it may have been
# trained on a different column set than X_train.
importance_names = loaded_model.get_booster().feature_names
if importance_names is None:
    # No names stored in the model: use X_train's columns when they line up,
    # otherwise fall back to positional placeholders.
    if len(X_train.columns) == len(importances):
        importance_names = list(X_train.columns)
    else:
        importance_names = [f"f{i}" for i in range(len(importances))]

feature_importance = pd.DataFrame({
    'Feature': importance_names,
    'Importance': importances
})

# Most important features first
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)
feature_importance
