In [4]:
# Run the training part first; this cell picks up where it left off.
# It loads the saved per-fold models (every fold whose checkpoint exists) and uses them to make predictions.

import os
import torch
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import brier_score_loss


# First load the models from the folds that were completed
def load_models(device, n_folds=5):
    """Load the per-fold ``NetModule`` checkpoints that exist on disk.

    For each fold in ``range(n_folds)`` the file ``netmodule_{fold}.pt`` is
    read from the current working directory. Folds whose checkpoint file is
    missing are reported and skipped, so the result may contain fewer than
    ``n_folds`` models (possibly none).

    Args:
        device: Device the models are placed on and the stored weights are
            mapped onto (e.g. ``"cpu"`` or ``"cuda"``).
        n_folds: Number of fold checkpoints to look for.

    Returns:
        list: The loaded models, in fold order, switched to eval mode.
    """
    models = []
    for fold_n in range(n_folds):
        try:
            # map_location lets a checkpoint that was saved from a GPU run be
            # loaded when only the CPU is available.
            state_dict = torch.load(f"netmodule_{fold_n}.pt", map_location=device)
        except FileNotFoundError:
            print(f"Model for fold {fold_n} not found, skipping")
            continue

        model = NetModule()
        model.to(device)
        model.load_state_dict(state_dict)
        model.eval()
        models.append(model)
    return models


# Locate the competition data: use ../datasets when that directory exists,
# otherwise fall back to ../input
data_dir = "../datasets/march-machine-learning-mania-2025"
if not os.path.isdir(data_dir):
    data_dir = "../input/march-machine-learning-mania-2025"

# Read the submission template and the men's / women's team tables
sample_sub = pd.read_csv(f"{data_dir}/SampleSubmissionStage2.csv")
m_teams = pd.read_csv(f"{data_dir}/MTeams.csv")
w_teams = pd.read_csv(f"{data_dir}/WTeams.csv")

# Split every ID on "_" into three integer fields: Season, Team1, Team2
id_fields = sample_sub["ID"].str.split("_", expand=True)
sample_sub[["Season", "Team1", "Team2"]] = id_fields.astype(int)


# Function to prepare features for a matchup
def prepare_features(season, team1, team2, season_df):
    """Build the single-row feature frame for one matchup.

    The rows of ``season_df`` matching ``season`` and each team are looked up,
    the ``Season`` and ``TeamID`` columns are dropped, and the remaining
    columns are suffixed with ``_1`` (team1) and ``_2`` (team2) and placed
    side by side.

    If a team has no row for that season, a message is printed and that team
    alone is replaced by the numeric column means of the season; the other
    team keeps its own statistics. If a team matches several rows, the first
    one is used.

    Args:
        season: Season value to match against ``season_df["Season"]``.
        team1: ``TeamID`` of the first team.
        team2: ``TeamID`` of the second team.
        season_df: Frame with ``Season`` and ``TeamID`` columns plus the
            per-team statistic columns.

    Returns:
        pandas.DataFrame: Exactly one row, indexed 0, holding the ``*_1``
        columns followed by the ``*_2`` columns.
    """
    season_rows = season_df[season_df["Season"] == season]
    team1_data = season_rows[season_rows["TeamID"] == team1]
    team2_data = season_rows[season_rows["TeamID"] == team2]

    if team1_data.empty or team2_data.empty:
        # Fall back to average stats for the season, but only for the team
        # that is actually missing
        print(f"Missing data for Season: {season}, Team1: {team1}, Team2: {team2}")
        season_avg = season_rows.mean(numeric_only=True).to_frame().T
        if team1_data.empty:
            team1_data = season_avg
        if team2_data.empty:
            team2_data = season_avg

    sides = []
    for suffix, team_data in (("_1", team1_data), ("_2", team2_data)):
        side = (
            team_data.iloc[[0]]
            .drop(columns=["Season", "TeamID"])
            .add_suffix(suffix)
            # Both halves must share the same row label: concat(axis=1) aligns
            # on the index, and the two teams come from different rows of
            # season_df, which would otherwise yield two half-empty rows.
            .reset_index(drop=True)
        )
        sides.append(side)

    return pd.concat(sides, axis=1)


# Create features for all matchups in sample submission.
# NOTE(review): `season` (passed to prepare_features as the per-team season
# frame) and `X_df` (whose columns are treated as the training feature
# columns) are not defined in this cell; they have to exist already from the
# training part, otherwise this raises NameError.
feature_rows = []
n_matchups = len(sample_sub)
matchups = zip(sample_sub["Season"], sample_sub["Team1"], sample_sub["Team2"])

for idx, (matchup_season, team1, team2) in enumerate(matchups):
    if idx % 100 == 0:
        print(f"Processing matchup {idx}/{n_matchups}")
    feature_rows.append(prepare_features(matchup_season, team1, team2, season))

# Concatenate once at the end: growing a DataFrame with pd.concat inside the
# loop re-copies every earlier row on each iteration.
X_submit_df = (
    pd.concat(feature_rows, ignore_index=True) if feature_rows else pd.DataFrame()
)

# Ensure columns match training data: columns absent from the submission
# features are added as 0.0, extra columns are dropped, and the order follows
# X_df.columns.
X_submit_df = X_submit_df.reindex(columns=X_df.columns, fill_value=0.0)

# Scale features with statistics taken from the training features rather
# than from the submission rows, so the models see inputs on the same scale
# as the training data.
# NOTE(review): assumes the training inputs were produced by a StandardScaler
# fitted on X_df; if the training part kept its fitted scaler, reuse that
# object here instead - confirm.
scaler = StandardScaler().fit(X_df.values)
X_submit = scaler.transform(X_submit_df.values)

# Load models and make predictions
device = "cuda" if torch.cuda.is_available() else "cpu"
models = load_models(device)

# Collect one flat prediction vector per fold model
all_predictions = []
if models:
    # The input tensor is identical for every model, so build and move it to
    # the device once instead of once per model.
    inputs = torch.tensor(X_submit, dtype=torch.float32).to(device)
    with torch.no_grad():
        all_predictions = [
            model(inputs).cpu().numpy().flatten() for model in models
        ]

# Average predictions across folds
if all_predictions:
    predictions = np.mean(all_predictions, axis=0)
else:
    # Best-effort fallback so the cell still produces files; these values
    # carry no information.
    print("No models found, using random predictions")
    predictions = np.random.normal(0, 1, size=len(X_submit))

# Convert to win probabilities with a logistic function
win_probs = 1 / (
    1 + np.exp(-predictions * 0.25)
)  # Scaling factor from the original code

# Create submission file
sample_sub["Pred"] = win_probs
sample_sub[["ID", "Pred"]].to_csv("submission.csv", index=False)

# Create a binary predictions version as well (as mentioned in the original code):
# each matchup is drawn as 1.0 with probability win_probs, using a fixed seed.
# Note that this overwrites sample_sub["Pred"] with the binary values.
np.random.seed(42)
binary_preds = (np.random.random(len(win_probs)) < win_probs).astype(float)
sample_sub["Pred"] = binary_preds
sample_sub[["ID", "Pred"]].to_csv("submission_binary.csv", index=False)

# Calculate the Brier score of the fold-averaged models on the training data
# (if possible).
# NOTE(review): `X` and `train` are not defined in this cell; presumably `X`
# is the scaled training feature array, row-aligned with `train` - confirm.
# The whole step is diagnostic only, so any failure is reported and ignored.
try:
    train_predictions = []
    if models:
        # Same inputs for every model: build the tensor once
        inputs = torch.tensor(X, dtype=torch.float32).to(device)
        with torch.no_grad():
            train_predictions = [
                model(inputs).cpu().numpy().flatten() for model in models
            ]

    if train_predictions:
        train_preds = np.mean(train_predictions, axis=0)
        train_probs = 1 / (1 + np.exp(-train_preds * 0.25))

        # The true labels: 1 where Margin is positive, 0 otherwise
        y_true = (train["Margin"] > 0).astype(int)

        brier_score = brier_score_loss(y_true, train_probs)
        # Reported as a training-data score: the models are evaluated on the
        # same `X` / `train` objects that come from the training part.
        print(f"\nBrier score on training data: {brier_score:.4f}")
    else:
        print("\nBrier score calculation skipped - no models available")
except Exception as e:
    print(f"\nBrier score calculation failed: {e}")


# Create Top 25 Rankings for Men's and Women's teams with statistics
def create_top_25(team_df, season_df, season_year=2025, gender="M"):
    """Create a Top 25 ranking with statistics based on our model's features.

    Teams are scored as ``Score_poss_o - Score_poss_d + 0.5 * FGPct_diff +
    0.5 * AstTO_diff`` and sorted by that score, highest first.

    Args:
        team_df: Team table with ``TeamID`` and ``TeamName`` columns; only
            teams present in it are ranked. If ``None``, the module-level
            ``m_teams`` / ``w_teams`` table is used according to ``gender``.
        season_df: Per-team season statistics with ``Season``, ``TeamID`` and
            the statistic columns used below.
        season_year: Season to rank.
        gender: ``"M"`` or anything else for women; only consulted when
            ``team_df`` is ``None``.

    Returns:
        pandas.DataFrame: Up to 25 rows (fewer when fewer teams have data for
        the season) with a 1-based ``Rank`` column, ``TeamID``, ``TeamName``,
        ``RankingScore`` and the renamed statistic columns, all numeric
        statistics rounded to 3 decimals.
    """
    if team_df is None:
        team_df = m_teams if gender == "M" else w_teams

    # Keep the requested season and attach team names; the inner merge also
    # restricts the ranking to teams listed in team_df
    current_season = season_df[season_df["Season"] == season_year]
    teams_in_season = pd.merge(current_season, team_df, on="TeamID")

    # Calculate a score based on key metrics (offensive and defensive efficiency)
    teams_in_season["RankingScore"] = (
        teams_in_season["Score_poss_o"]  # Offensive points per possession
        - teams_in_season["Score_poss_d"]  # Defensive points per possession
        + 0.5 * teams_in_season["FGPct_diff"]  # Field goal percentage difference
        + 0.5 * teams_in_season["AstTO_diff"]  # Assist to turnover ratio difference
    )

    # Output name for each reported statistic, in display order
    stat_labels = {
        "Score_pg_o": "PPG_Off",
        "Score_pg_d": "PPG_Def",
        "Score_poss_o": "PPP_Off",
        "Score_poss_d": "PPP_Def",
        "FGPct_o": "FG%_Off",
        "FGPct_d": "FG%_Def",
        "FGPct3_o": "3PT%",
        "FTPct_o": "FT%",
        "AstTO_o": "Ast/TO_Off",
        "AstTO_d": "Ast/TO_Def",
    }
    top_columns = ["TeamID", "TeamName", "RankingScore", *stat_labels]

    # Best 25 teams by score, with readable column names
    ranked = teams_in_season.sort_values("RankingScore", ascending=False)
    top_25 = ranked.head(25)[top_columns].rename(columns=stat_labels)

    # Round values for better readability (identifiers are left untouched)
    numeric_cols = [c for c in top_25.columns if c not in ("TeamID", "TeamName")]
    top_25[numeric_cols] = top_25[numeric_cols].round(3)

    # Rank by the number of rows actually present: a fixed range(1, 26) fails
    # whenever fewer than 25 teams are available.
    top_25.insert(0, "Rank", range(1, len(top_25) + 1))

    return top_25


# Men's ranking: build it, save it as CSV, then show the first rows
mens_top_25 = create_top_25(
    team_df=m_teams, season_df=season, season_year=2025, gender="M"
)
mens_top_25.to_csv("mens_top_25.csv", index=False)
print("\nMen's Top 25:")
print(mens_top_25.head())

# Women's ranking: same steps with the women's team table
womens_top_25 = create_top_25(
    team_df=w_teams, season_df=season, season_year=2025, gender="W"
)
womens_top_25.to_csv("womens_top_25.csv", index=False)
print("\nWomen's Top 25:")
print(womens_top_25.head())

print("\nAll outputs generated successfully!")

Processing matchup 0/131407


NameError: name 'season' is not defined