In [18]:
import pandas as pd
from itertools import combinations
import numpy as np
import joblib
import json 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb 
from sklearn.svm import SVC 
from sklearn.neural_network import  MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import brier_score_loss 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import log_loss 
from sklearn.metrics import roc_auc_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.calibration import CalibratedClassifierCV
import seaborn as sns 
import matplotlib.pyplot as plt 


In [19]:
# Display every column when a DataFrame is rendered.
pd.set_option("display.max_columns", None)

# Mens: read the 2025 modeling table, keep an untouched copy of the
# team/seed columns, then one-hot encode both seed columns as 0/1 ints.
mens_season_25_data = pd.read_csv("../data/modeling/mens_25_final_ml.csv")
mens_df_seeds = mens_season_25_data.loc[:, ['Team1', 'Team2', 'Seed_1', 'Seed_2']]
mens_season_25_data = pd.get_dummies(
    mens_season_25_data,
    columns=['Seed_1', 'Seed_2'],
    prefix=['T1_Seed', 'T2_Seed'],
    dtype=int,
)

# Womens: same steps applied to the women's modeling table.
womens_season_25_data = pd.read_csv("../data/modeling/womens_25_final_ml.csv")
womens_df_seeds = womens_season_25_data.loc[:, ['Team1', 'Team2', 'Seed_1', 'Seed_2']]
womens_season_25_data = pd.get_dummies(
    womens_season_25_data,
    columns=['Seed_1', 'Seed_2'],
    prefix=['T1_Seed', 'T2_Seed'],
    dtype=int,
)

# Load the saved feature selections (one JSON file per bracket). Each file
# is expected to hold a "selected_features" list.
with open("../modeling/mens_best_features.json", "r") as f:
    mens_best_features = json.load(f)

with open("../modeling/womens_best_features.json", "r") as f:
    womens_best_features = json.load(f)

# Mens Features: 'reg_season_pred' first, then the saved selection, then
# every column of the men's frame whose name contains "seed"
# (case-insensitive), i.e. the one-hot seed columns.
mens_select_features = ['reg_season_pred']
mens_best_feats = mens_best_features["selected_features"]
mens_selected_features = mens_select_features + mens_best_feats

mens_seed_columns = [col for col in mens_season_25_data.columns if 'seed' in col.lower()]
mens_feature_columns = mens_selected_features + mens_seed_columns

# Womens Features: same layout, built from the women's frame.
womens_select_features = ['reg_season_pred']
womens_best_feats = womens_best_features["selected_features"]
womens_selected_features = womens_select_features + womens_best_feats

# Fix: the seed columns must come from the women's frame. Reading them from
# the men's frame ties the women's feature list to whichever seed dummies
# happen to exist in the men's data.
womens_seed_columns = [col for col in womens_season_25_data.columns if 'seed' in col.lower()]
womens_feature_columns = womens_selected_features + womens_seed_columns

# ---- Mens: score every matchup row with the saved XGBoost model ----
mens_x = mens_season_25_data[mens_feature_columns]

mens_loaded_model = xgb.XGBClassifier()
mens_loaded_model.load_model("../modeling/mens_best_xgb_model.model")

# Column 1 of predict_proba is used as the probability that Team1 wins.
y_pred_probs_year_mens = mens_loaded_model.predict_proba(mens_x)[:, 1]

# Assemble the result table, attach the original seeds by (Team1, Team2),
# and keep only the presentation columns in a fixed order.
mens_predictions_year = (
    pd.DataFrame({
        'Predicted_Prob_Team1_Wins': y_pred_probs_year_mens,
        'Team1': mens_season_25_data['Team1'],
        'TeamName1': mens_season_25_data['TeamName_1'],
        'Team2': mens_season_25_data['Team2'],
        'TeamName2': mens_season_25_data['TeamName_2'],
        'Season': 2025
    })
    .merge(mens_df_seeds, on=['Team1', 'Team2'], how='left')
    .reindex(columns=[
        'Seed_1', 'Team1', 'TeamName1', 'Seed_2', 'Team2', 'TeamName2',
        'Predicted_Prob_Team1_Wins'
    ])
)

# ---- Womens: identical steps with the women's data and model ----
womens_x = womens_season_25_data[womens_feature_columns]

womens_loaded_model = xgb.XGBClassifier()
womens_loaded_model.load_model("../modeling/womens_best_xgb_model.model")

# Column 1 of predict_proba is used as the probability that Team1 wins.
y_pred_probs_year_womens = womens_loaded_model.predict_proba(womens_x)[:, 1]

womens_predictions_year = (
    pd.DataFrame({
        'Predicted_Prob_Team1_Wins': y_pred_probs_year_womens,
        'Team1': womens_season_25_data['Team1'],
        'TeamName1': womens_season_25_data['TeamName_1'],
        'Team2': womens_season_25_data['Team2'],
        'TeamName2': womens_season_25_data['TeamName_2'],
        'Season': 2025
    })
    .merge(womens_df_seeds, on=['Team1', 'Team2'], how='left')
    .reindex(columns=[
        'Seed_1', 'Team1', 'TeamName1', 'Seed_2', 'Team2', 'TeamName2',
        'Predicted_Prob_Team1_Wins'
    ])
)

In [None]:
def filter_seed_pair(predictions_df, seed_1, seed_2):
    """Return the rows whose two seeds are exactly ``seed_1`` and ``seed_2``.

    The pairing is order-insensitive: a row matches when
    (Seed_1, Seed_2) equals (seed_1, seed_2) or (seed_2, seed_1).

    Parameters
    ----------
    predictions_df : pandas.DataFrame
        Frame containing 'Seed_1' and 'Seed_2' columns.
    seed_1, seed_2 : values compared with ``==`` against the seed columns.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with the original index and columns preserved.
    """
    first_seed = predictions_df['Seed_1']
    second_seed = predictions_df['Seed_2']

    # Build one boolean mask per orientation, then accept either.
    as_given = (first_seed == seed_1) & (second_seed == seed_2)
    swapped = (first_seed == seed_2) & (second_seed == seed_1)

    return predictions_df.loc[as_given | swapped]

# Men's predictions restricted to 5-seed vs 12-seed pairings (either order).
filtered_mens = filter_seed_pair(mens_predictions_year, seed_1=5, seed_2=12)

# Women's predictions restricted to the same seed pairing.
filtered__womens = filter_seed_pair(womens_predictions_year, seed_1=5, seed_2=12)

# Show the men's subset as the cell output.
filtered_mens

Unnamed: 0,Seed_1,Team1,TeamName1,Seed_2,Team2,TeamName2,Predicted_Prob_Team1_Wins
268,12,1251,Liberty,5,1332,Oregon,0.277657
286,12,1270,McNeese St,5,1332,Oregon,0.34309
303,5,1332,Oregon,12,1471,UC San Diego,0.624769
320,12,1161,Colorado St,5,1332,Oregon,0.256912
691,5,1155,Clemson,12,1251,Liberty,0.643539
708,12,1251,Liberty,5,1276,Michigan,0.30681
726,12,1251,Liberty,5,1272,Memphis,0.531621
1204,5,1155,Clemson,12,1270,McNeese St,0.588001
1221,5,1155,Clemson,12,1471,UC San Diego,0.60347
1238,5,1155,Clemson,12,1161,Colorado St,0.669448


In [None]:
# Season being predicted; used for the activity filters and the matchup rows.
year = 2025

# Stack both brackets into one prediction table. pd.concat expects a single
# iterable of frames; passing the two frames as separate positional
# arguments raises a TypeError.
predictions_year = pd.concat(
    [mens_predictions_year, womens_predictions_year],
    ignore_index=True,
)

# Rows where Team1 has the numerically larger seed yet is given at least a
# 50% chance. Stored under a name so the result is not silently discarded
# mid-cell and can be inspected later.
upset_predictions = predictions_year[
    (predictions_year['Seed_1'] > predictions_year['Seed_2'])
    & (predictions_year['Predicted_Prob_Team1_Wins'] >= 0.5)
]

# Load the team data
m_teams_df = pd.read_csv('../data/MTeams.csv')
w_teams_df = pd.read_csv('../data/WTeams.csv')
w_regular_season_df = pd.read_csv('../data/WRegularSeasonCompactResults.csv')

# Men's teams: active when `year` falls inside [FirstD1Season, LastD1Season].
active_m_teams = m_teams_df[(m_teams_df['LastD1Season'] >= year) & (m_teams_df['FirstD1Season'] <= year)]

# Women's teams: active when the team appears as winner or loser in a
# regular-season game of `year`. The season filter is computed once.
w_games_this_year = w_regular_season_df[w_regular_season_df['Season'] == year]
active_w_teams_ids = pd.concat([w_games_this_year['WTeamID'],
                                w_games_this_year['LTeamID']]).unique()
active_w_teams = w_teams_df[w_teams_df['TeamID'].isin(active_w_teams_ids)]

print(f"Active men's teams: {len(active_m_teams)}")
print(f"Active women's teams: {len(active_w_teams)}")

# Every unordered pair of active teams, with the lower TeamID as Team1 and a
# neutral 0.50 placeholder prediction.
m_matchups = [
    {'Year': year, 'Team1': min(team_a, team_b), 'Team2': max(team_a, team_b), 'Pred': 0.50}
    for team_a, team_b in combinations(active_m_teams['TeamID'], 2)
]
w_matchups = [
    {'Year': year, 'Team1': min(team_a, team_b), 'Team2': max(team_a, team_b), 'Pred': 0.50}
    for team_a, team_b in combinations(active_w_teams['TeamID'], 2)
]

m_list = pd.DataFrame(m_matchups)
w_list = pd.DataFrame(w_matchups)
# Combine the men's and women's submissions
combined_submission_df = pd.concat([m_list, w_list], ignore_index=True)

# Left-join the model probabilities onto the full matchup grid, overwrite the
# placeholder wherever a prediction exists, and drop the helper column.
# The result is reassigned rather than mutated in place.
combined_submission_df = combined_submission_df.merge(
    predictions_year[['Team1', 'Team2', 'Predicted_Prob_Team1_Wins']],
    on=['Team1', 'Team2'],
    how='left'
)
combined_submission_df['Pred'] = combined_submission_df['Predicted_Prob_Team1_Wins'].fillna(combined_submission_df['Pred'])
combined_submission_df = combined_submission_df.drop(columns=['Predicted_Prob_Team1_Wins'])

# Show only the matchups whose prediction differs from the 0.5 placeholder.
combined_submission_df[combined_submission_df['Pred'] != 0.5]



Active men's teams: 364
Active women's teams: 362


Unnamed: 0,Year,Team1,Team2,Pred
725,2025,1103,1104,0.086928
727,2025,1103,1106,0.702047
730,2025,1103,1110,0.682962
732,2025,1103,1112,0.100046
736,2025,1103,1116,0.191606
...,...,...,...,...
65838,2025,1459,1463,0.370722
65846,2025,1459,1471,0.291230
65895,2025,1462,1463,0.776248
65903,2025,1462,1471,0.566048


In [13]:
# Build the submission key "<Year>_<Team1>_<Team2>" for every matchup row.
combined_submission_df['Matchup'] = (
    combined_submission_df['Year'].astype(str) + '_'
    + combined_submission_df['Team1'].astype(str) + '_'
    + combined_submission_df['Team2'].astype(str)
)

# Final two-column table: the matchup key and its predicted probability.
final_submission = combined_submission_df[['Matchup', 'Pred']]

# Fix: write the two-column table that was built for the submission. The
# working frame also carries the Year/Team1/Team2 helper columns, which
# previously ended up in the CSV while `final_submission` went unused.
# NOTE(review): if the consumer of this file expects a different key column
# name (e.g. `ID`), rename 'Matchup' before writing. Confirm against the
# required format.
submission_path = '../data/submission/submission.csv'
final_submission.to_csv(submission_path, index=False)

print(f"Submission file successfully created at {submission_path}")

Submission file successfully created at ../data/submission/submission.csv
