In [13]:
import pandas as pd
from itertools import combinations
import numpy as np
import joblib
import json 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb 
from sklearn.svm import SVC 
from sklearn.neural_network import  MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import brier_score_loss 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import log_loss 
from sklearn.metrics import roc_auc_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.calibration import CalibratedClassifierCV
import seaborn as sns 
import matplotlib.pyplot as plt 


In [None]:
pd.set_option("display.max_columns", None)
mens_season_25_data = pd.read_csv("../data/modeling/mens_25_final_ml.csv")
df_seeds = mens_season_25_data[['Team1', 'Team2', 'Seed_1', 'Seed_2']]
mens_season_25_data = pd.get_dummies(mens_season_25_data, columns=['Seed_1', 'Seed_2'], prefix=['T1_Seed','T2_Seed'], dtype=int)
mens_season_25_data

# Load the best features from JSON
with open("../modeling/best_features.json", "r") as f:
    best_features = json.load(f)

# Extract the selected features
select_features = ['reg_season_pred']
best_feats = best_features["selected_features"]
selected_features = select_features + best_feats

seed_columns = [col for col in mens_season_25_data.columns if 'seed' in col.lower()]
feature_columns = selected_features + seed_columns

x = mens_season_25_data[feature_columns]

# Load the trained model
loaded_model = xgb.XGBClassifier()
loaded_model.load_model("../modeling/best_xgb_model.model")

y_pred_probs_year = loaded_model.predict_proba(x)[:, 1]  # Prob for Team1 winning

# Create a DataFrame with results
predictions_year = pd.DataFrame({
    'Predicted_Prob_Team1_Wins': y_pred_probs_year,
    'Team1': mens_season_25_data['Team1'], 
    'TeamName1': mens_season_25_data['TeamName_1'],
    'Team2': mens_season_25_data['Team2'], 
    'TeamName2': mens_season_25_data['TeamName_2'],
    'Season': 2025
})

# Merge with seed data
predictions_year = predictions_year.merge(df_seeds, on=['Team1', 'Team2'], how='left')

# Clean up the DataFrame

predictions_year = predictions_year.reindex(columns=[
   'Seed_1', 'Team1', 'TeamName_1', 'Seed_2', 'Team2', 'TeamName_2', 
   'Predicted_Prob_Team1_Wins'
])
predictions_year 


Unnamed: 0,Seed_1,Team1,TeamName_1,Seed_2,Team2,TeamName_2,Predicted_Prob_Team1_Wins,Team1_Wins
0,2,1104,,1,1181,,0.334572,
1,1,1181,,3,1458,,0.829738,
2,4,1112,,1,1181,,0.133873,
3,1,1181,,5,1332,,0.864013,
4,6,1140,,1,1181,,0.104629,
...,...,...,...,...,...,...,...,...
2273,13,1213,,15,1303,,0.688045,
2274,13,1213,,16,1313,,0.683153,
2275,15,1303,,14,1423,,0.457414,
2276,16,1313,,14,1423,,0.477566,


In [None]:

# Load the team data
m_teams_df = pd.read_csv('../data/MTeams.csv')
w_teams_df = pd.read_csv('../data/WTeams.csv')
w_regular_season_df = pd.read_csv('../data/WRegularSeasonCompactResults.csv')

# Filter teams that are active in the year 2025
active_m_teams = m_teams_df[(m_teams_df['LastD1Season'] >= 2025) & (m_teams_df['FirstD1Season'] <= 2025)]
active_w_teams_ids = pd.concat([w_regular_season_df[w_regular_season_df['Season'] == 2025]['WTeamID'], 
                                w_regular_season_df[w_regular_season_df['Season'] == 2025]['LTeamID']]).unique()
active_w_teams = w_teams_df[w_teams_df['TeamID'].isin(active_w_teams_ids)]

print(f"Active men's teams: {len(active_m_teams)}")
print(f"Active women's teams: {len(active_w_teams)}")

# Generate matchups for the year 2025
year = 2025
m_matchups = []
w_matchups = []

# Ensure matchups are only between active teams
for team1, team2 in combinations(active_m_teams['TeamID'], 2):
    lower_id = min(team1, team2)
    higher_id = max(team1, team2)
    m_matchups.append(f"{year}_{lower_id}_{higher_id}")

for team1, team2 in combinations(active_w_teams['TeamID'], 2):
    lower_id = min(team1, team2)
    higher_id = max(team1, team2)
    w_matchups.append(f"{year}_{lower_id}_{higher_id}")

# Create DataFrames for the submissions
m_submission_df = pd.DataFrame({
    'ID': m_matchups,
    'Pred': [0.5] * len(m_matchups)  # Set all probabilities to 0.5
})

w_submission_df = pd.DataFrame({
    'ID': w_matchups,
    'Pred': [0.5] * len(w_matchups)  # Set all probabilities to 0.5
})

# Combine the men's and women's submissions
combined_submission_df = pd.concat([m_submission_df, w_submission_df], ignore_index=True)

# Save the combined submission DataFrame to a CSV file
submission_path = '../data/submission/submission.csv'
combined_submission_df.to_csv(submission_path, index=False)

print(f"Submission file successfully created at {submission_path}")
