# March Machine Learning Mania 2025 [using TensorFlow]

* 컴피티션 목표  
  : NCAA Division 1 미국 대학농구 토너먼트, 혹은 다른 말로 **3월의 광란**의 결과를 예측하는 것!
  
* 전략  
  : 매 시즌 정규시즌 결과를 팀별로 정리한 후, 팀별 승률, 마진(점수차) 등을 사용한다.  
    추가적으로 평균득점, 리바운드, 어시스트 등의 1차 스탯과 시드, 랭킹 등 외부 평가자료를 학습데이터로 사용한다.  
    2024년까지의 정규시즌 성적을 특성 X, 토너먼트 성적을 라벨 y로 하여 머신러닝을 수행한다.  
    
    올해 처음 참가하는 것이므로 데이터를 형식에 맞춰 작성하는 것을 기본 목표로 한다.

## 1. Environment Settings

In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import brier_score_loss

## 2. Gamescores, Seeds (RS, from 1985)

- MRegularSeasonCompactResults.csv  
  하나의 로우에 승리팀, 패배팀의 데이터가 섞여있으므로 구분해준다.    
  (패배팀의 경우, 마진에 -1을 곱해줘야 한다.)  
  W/L 컬럼에는 승패를 1,0으로 인코딩해서 넣어준다.  

- MNCAATourneySeeds.csv  
  토너먼트 시드 데이터는 별개의 파일에서 가져온다.

두 데이터를 정리한 뒤, 같은 시즌에 속한 모든 팀 쌍(카테시안 곱)을 구한다.  
그중 앞에 오는 팀의 TeamID가 더 작은 조합만 남겨서 예측에 사용한다.

In [3]:
# Kaggle input paths for the men's data. Every URL consumed by the
# feature-building cells below is defined here so the notebook can be run
# top to bottom; the prediction cells re-assign these names per division.
compact_results_url = '/kaggle/input/march-machine-learning-mania-2025/MRegularSeasonCompactResults.csv'
detailed_results_url = '/kaggle/input/march-machine-learning-mania-2025/MRegularSeasonDetailedResults.csv'
tournament_results_url = '/kaggle/input/march-machine-learning-mania-2025/MNCAATourneyCompactResults.csv'
seeds_url = '/kaggle/input/march-machine-learning-mania-2025/MNCAATourneySeeds.csv'
rankings_url = '/kaggle/input/march-machine-learning-mania-2025/MMasseyOrdinals.csv'

In [4]:
def regular_season(compact_results_url, seeds_url):
    """Build pairwise regular-season features for every team pair in a season.

    Each game row is split into one winner row and one loser row, the rows
    are averaged per (Season, TeamID), tournament seeds are attached, and the
    per-team table is self-joined on Season so that every pair with
    ``TeamID_x < TeamID_y`` yields one row of "x minus y" differences.

    Args:
        compact_results_url: Path or buffer accepted by ``pd.read_csv`` with
            columns Season, WTeamID, WScore, LTeamID, LScore.
        seeds_url: Path or buffer accepted by ``pd.read_csv`` with columns
            Season, Seed, TeamID. Characters 1-2 of ``Seed`` are parsed as
            the numeric seed.

    Returns:
        DataFrame with columns ID (``"<Season>_<TeamID_x>_<TeamID_y>"``),
        Season, margin_diff, Seed_diff, Score_diff and W/L_diff.
    """
    # Regular Season Data from 1985
    games = pd.read_csv(compact_results_url)
    games['margin'] = games['WScore'] - games['LScore']

    W_columns = {'WTeamID': 'TeamID', 'WScore': 'Score'}
    L_columns = {'LTeamID': 'TeamID', 'LScore': 'Score'}

    # One row per team per game; W/L encodes win=1 / loss=0.
    W_results = (
        games[['Season', 'margin'] + list(W_columns)]
        .rename(columns=W_columns)
        .assign(**{'W/L': 1})
    )
    L_results = (
        games[['Season', 'margin'] + list(L_columns)]
        .rename(columns=L_columns)
        .assign(**{'W/L': 0})
    )
    # The margin was computed from the winner's side; flip it for the loser.
    L_results['margin'] = -L_results['margin']

    df_flatten = pd.concat([W_results, L_results], axis=0).reset_index(drop=True)
    summary_mean = df_flatten.groupby(['Season', 'TeamID'], as_index=False).mean()

    # Add seeds from 1985
    seeds = pd.read_csv(seeds_url)
    seeds['Seed'] = seeds['Seed'].str[1:3].astype(int)

    df_seeds = pd.merge(summary_mean, seeds, on=['Season', 'TeamID'], how='left')
    df_seeds['Seed'] = df_seeds['Seed'].fillna(16)  # Teams that didn't make the tournament

    # Self join to create a Cartesian product with TeamID ordering.
    # .copy() detaches the filtered rows so the column assignments below do
    # not trigger pandas' SettingWithCopyWarning.
    df_cartesian = df_seeds.merge(df_seeds, on='Season', suffixes=('_x', '_y'))
    df_cartesian = df_cartesian[df_cartesian['TeamID_x'] < df_cartesian['TeamID_y']].copy()

    # Create variables
    df_cartesian['ID'] = (
        df_cartesian['Season'].astype(str)
        + "_" + df_cartesian['TeamID_x'].astype(str)
        + "_" + df_cartesian['TeamID_y'].astype(str)
    )
    for stat in ('margin', 'Seed', 'Score', 'W/L'):
        df_cartesian[f'{stat}_diff'] = df_cartesian[f'{stat}_x'] - df_cartesian[f'{stat}_y']

    return df_cartesian[['ID', 'Season', 'margin_diff', 'Seed_diff', 'Score_diff', 'W/L_diff']]

# Build the pairwise compact-results/seed features from the URLs assigned
# above, then show the resulting frame as the cell output.
df_rs = regular_season(compact_results_url, seeds_url)
df_rs

Unnamed: 0,ID,Season,margin_diff,Seed_diff,Score_diff,W/L_diff
1,1985_1102_1103,1985,-2.748188,0.0,2.039855,-0.182971
2,1985_1102_1104,1985,-13.591667,9.0,-5.416667,-0.491667
3,1985_1102_1106,1985,-2.000000,0.0,-8.541667,-0.208333
4,1985_1102_1108,1985,-13.751667,0.0,-19.916667,-0.551667
5,1985_1102_1109,1985,23.333333,0.0,9.250000,0.166667
...,...,...,...,...,...,...
4397248,2025_1477_1479,2025,-4.551843,0.0,-1.430876,-0.267281
4397249,2025_1477_1480,2025,0.550538,0.0,-3.411828,-0.005376
4397612,2025_1478_1479,2025,-3.502381,0.0,6.147619,-0.195238
4397613,2025_1478_1480,2025,1.600000,0.0,4.166667,0.066667


## 3. Gamestats, Rankings (RS, from 2003)

In [None]:
def regular_season_detailed(detailed_results_url, rankings_url):
    """Build pairwise box-score and ranking features for every team pair.

    Each detailed game row is split into a winner row and a loser row and
    averaged per (Season, TeamID). For each season only the rankings
    published on that season's largest ``RankingDayNum`` are kept and
    averaged per team. Ratio features are derived from the per-team
    averages, and the table is self-joined on Season so that every pair
    with ``TeamID_x < TeamID_y`` yields one row.

    Args:
        detailed_results_url: Path or buffer accepted by ``pd.read_csv``
            holding Season plus the W*/L* box-score columns listed in
            ``W_columns`` / ``L_columns`` below.
        rankings_url: Path or buffer accepted by ``pd.read_csv`` with
            columns Season, RankingDayNum, TeamID and OrdinalRank.

    Returns:
        DataFrame with Season, every per-team column suffixed ``_x`` / ``_y``,
        ID (``"<Season>_<TeamID_x>_<TeamID_y>"``) and Rank_diff
        (``OrdinalRank_x - OrdinalRank_y``).
    """
    # Regular Season Detailed Data from 2003
    rs_detailed = pd.read_csv(detailed_results_url)

    W_columns = {'WTeamID': 'TeamID', 'WScore': 'Score',
                 'WFGM': 'FGM', 'WFGA': 'FGA', 'WFGM3': 'FGM3', 'WFGA3': 'FGA3', 'WFTM': 'FTM', 'WFTA': 'FTA',
                 'WOR': 'OR', 'WDR': 'DR', 'WAst': 'Ast', 'WTO': 'TO', 'WStl': 'Stl', 'WBlk': 'Blk', 'WPF': 'PF'}
    L_columns = {'LTeamID': 'TeamID', 'LScore': 'Score',
                 'LFGM': 'FGM', 'LFGA': 'FGA', 'LFGM3': 'FGM3', 'LFGA3': 'FGA3', 'LFTM': 'FTM', 'LFTA': 'FTA',
                 'LOR': 'OR', 'LDR': 'DR', 'LAst': 'Ast', 'LTO': 'TO', 'LStl': 'Stl', 'LBlk': 'Blk', 'LPF': 'PF'}

    W_results = rs_detailed[['Season'] + list(W_columns)].rename(columns=W_columns)
    L_results = rs_detailed[['Season'] + list(L_columns)].rename(columns=L_columns)

    df_flatten = pd.concat([W_results, L_results], axis=0).reset_index(drop=True)
    summary_mean = df_flatten.groupby(['Season', 'TeamID'], as_index=False).mean()

    # Add rankings from 2003
    rankings = pd.read_csv(rankings_url)

    # Keep only the rows from each season's last ranking day, then average
    # the ranks that remain for each team.
    latest_ranking_days = rankings.groupby("Season")["RankingDayNum"].max().reset_index()
    df_latest_ranking = rankings.merge(latest_ranking_days, on=["Season", "RankingDayNum"], how="inner")
    df_latest_ranking = df_latest_ranking.groupby(["Season", "TeamID"])["OrdinalRank"].mean().reset_index()

    team_stats = pd.merge(summary_mean, df_latest_ranking, on=['Season', 'TeamID'], how='left')
    # Teams with no ranking on the season's last ranking day get rank 351.
    team_stats['OrdinalRank'] = team_stats['OrdinalRank'].fillna(351)

    # Create variables (ratios of the per-game averages)
    team_stats['FG%'] = team_stats['FGM'] / team_stats['FGA']
    team_stats['FG3%'] = team_stats['FGM3'] / team_stats['FGA3']
    team_stats['FT%'] = team_stats['FTM'] / team_stats['FTA']

    team_stats['TOR'] = team_stats['OR'] + team_stats['DR']
    team_stats['AST/TO'] = team_stats['Ast'] / team_stats['TO']

    # Self join to create a Cartesian product with TeamID ordering.
    # .copy() detaches the filtered rows so the column assignments below do
    # not trigger pandas' SettingWithCopyWarning.
    df_cartesian = team_stats.merge(team_stats, on='Season', suffixes=('_x', '_y'))
    df_cartesian = df_cartesian[df_cartesian['TeamID_x'] < df_cartesian['TeamID_y']].copy()

    df_cartesian['ID'] = (
        df_cartesian['Season'].astype(str)
        + "_" + df_cartesian['TeamID_x'].astype(str)
        + "_" + df_cartesian['TeamID_y'].astype(str)
    )
    df_cartesian['Rank_diff'] = df_cartesian['OrdinalRank_x'] - df_cartesian['OrdinalRank_y']

    return df_cartesian

# Build the pairwise box-score/ranking features, then show the resulting
# frame as the cell output.
df_rs_detailed = regular_season_detailed(detailed_results_url, rankings_url)
df_rs_detailed

In [None]:
# Inner join: keep only the (ID, Season) pairs present in both the compact
# and the detailed feature tables.
df_rs_all = pd.merge(df_rs, df_rs_detailed, on=['ID', 'Season'], how='inner')
df_rs_all

In [None]:
def tournament(tournament_results_url):
    """Turn tournament results into labels keyed by the ordered team pair.

    For every game, Team1 is the smaller and Team2 the larger of the two
    TeamIDs. The label and margin are expressed from Team1's point of view.

    Args:
        tournament_results_url: Path or buffer accepted by ``pd.read_csv``
            with columns Season, WTeamID, WScore, LTeamID, LScore.

    Returns:
        DataFrame with columns ID (``"<Season>_<Team1>_<Team2>"``), W/L
        (1 if Team1 won, else 0) and Margin (Team1 score minus Team2 score).
    """
    df_y = pd.read_csv(tournament_results_url)
    team_ids = df_y[["WTeamID", "LTeamID"]]
    df_y["Team1"] = team_ids.min(axis=1)
    df_y["Team2"] = team_ids.max(axis=1)
    df_y["ID"] = df_y["Season"].astype(str) + "_" + df_y["Team1"].astype(str) + "_" + df_y["Team2"].astype(str)

    # Vectorized sign flip: the winner's margin is positive, so negate it
    # whenever Team1 is the losing side.
    winner_margin = df_y["WScore"] - df_y["LScore"]
    team1_won = df_y["WTeamID"] == df_y["Team1"]
    df_y["Margin"] = np.where(team1_won, winner_margin, -winner_margin)
    df_y['W/L'] = (df_y['Margin'] > 0).astype(int)

    return df_y[['ID', 'W/L', 'Margin']]
 
# Tournament labels keyed by matchup ID; shown as cell output.
df_tourney = tournament(tournament_results_url)
df_tourney

# Training set: tournament labels joined to the regular-season features of
# the same matchup ID (inner join drops matchups missing from either side).
df_train = pd.merge(df_tourney, df_rs_all, on='ID', how='inner')
df_train

In [None]:
# Men's prediction
compact_results_url = '/kaggle/input/march-machine-learning-mania-2025/MRegularSeasonCompactResults.csv'
detailed_results_url = '/kaggle/input/march-machine-learning-mania-2025/MRegularSeasonDetailedResults.csv'
tournament_results_url = '/kaggle/input/march-machine-learning-mania-2025/MNCAATourneyCompactResults.csv'
seeds_url = '/kaggle/input/march-machine-learning-mania-2025/MNCAATourneySeeds.csv'
rankings_url = '/kaggle/input/march-machine-learning-mania-2025/MMasseyOrdinals.csv'

df_rs = regular_season(compact_results_url, seeds_url)
df_rs_detailed = regular_season_detailed(detailed_results_url, rankings_url)
df_rs_all = pd.merge(df_rs, df_rs_detailed, on=['ID', 'Season'], how='inner').fillna(0)
df_tourney = tournament(tournament_results_url)
df_train = pd.merge(df_tourney, df_rs_all, on='ID', how='inner').fillna(0)

import tensorflow as tf
from tensorflow import keras

features = ['margin_diff', 'Seed_diff', 'Score_diff', 'W/L_diff', 'Rank_diff', 
            'Score_x', 'FGM_x', 'FGA_x', 'FGM3_x', 'FGA3_x', 'FTM_x', 'FTA_x', 
            'OR_x', 'DR_x', 'Ast_x', 'TO_x', 'Stl_x', 'Blk_x', 'PF_x', 
            'OrdinalRank_x', 'FG%_x', 'FG3%_x', 'FT%_x', 'TOR_x', 'AST/TO_x', 
            'Score_y', 'FGM_y', 'FGA_y', 'FGM3_y', 'FGA3_y', 'FTM_y', 'FTA_y', 
            'OR_y', 'DR_y', 'Ast_y', 'TO_y', 'Stl_y', 'Blk_y', 'PF_y', 
            'OrdinalRank_y', 'FG%_y', 'FG3%_y', 'FT%_y', 'TOR_y', 'AST/TO_y',]

X_train, y_train = df_train[features], df_train["W/L"]

model_tf = keras.Sequential([
    keras.layers.Dense(32, activation='relu', kernel_regularizer=keras.regularizers.l2(0.001)),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(8, activation='relu', kernel_regularizer=keras.regularizers.l2(0.001)),
    keras.layers.Dropout(0.5),
    keras.layers.Flatten(),
    keras.layers.Dense(1, activation='sigmoid')
])

model_tf.compile(optimizer='adam',
                 loss='binary_crossentropy',
                 metrics=['accuracy'])
model_tf.fit(X_train, y_train, epochs=1000, batch_size=32)

In [None]:
# Score every 2025 men's matchup. .copy() detaches the filtered rows from
# df_rs_all so that adding the 'Pred' column does not trigger pandas'
# SettingWithCopyWarning.
df_submission = df_rs_all[df_rs_all.Season == 2025].copy()
# The network ends in a single sigmoid unit, so predict() yields one column;
# ravel() flattens it to one value per matchup row.
df_submission['Pred'] = model_tf.predict(df_submission[features]).ravel()
df_submission = df_submission.reset_index()
df_submission_men = df_submission[['ID', 'Pred']]

df_submission_men

In [None]:
# Women's prediction
compact_results_url = '/kaggle/input/march-machine-learning-mania-2025/WRegularSeasonCompactResults.csv'
# detailed_results_url = '/kaggle/input/march-machine-learning-mania-2025/WRegularSeasonDetailedResults.csv'
tournament_results_url = '/kaggle/input/march-machine-learning-mania-2025/WNCAATourneyCompactResults.csv'
# seeds_url = '/kaggle/input/march-machine-learning-mania-2025/WNCAATourneySeeds.csv'
rankings_url = '/kaggle/input/march-machine-learning-mania-2025/WMasseyOrdinals.csv'

df_rs = regular_season(compact_results_url, seeds_url)
# df_rs_detailed = regular_season_detailed(detailed_results_url, rankings_url)
# df_rs_all = pd.merge(df_rs, df_rs_detailed, on=['ID', 'Season'], how='inner').fillna(0)
df_tourney = tournament(tournament_results_url)
df_train = pd.merge(df_tourney, df_rs, on='ID', how='inner').fillna(0)

import tensorflow as tf
from tensorflow import keras

features = ['margin_diff', 'Seed_diff', 'Score_diff', 'W/L_diff']

X_train, y_train = df_train[features], df_train["W/L"]

model_tf = keras.Sequential([
    keras.layers.Dense(32, activation='relu', kernel_regularizer=keras.regularizers.l2(0.001)),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(8, activation='relu', kernel_regularizer=keras.regularizers.l2(0.001)),
    keras.layers.Dropout(0.5),
    keras.layers.Flatten(),
    keras.layers.Dense(1, activation='sigmoid')
])

model_tf.compile(optimizer='adam',
                 loss='binary_crossentropy',
                 metrics=['accuracy'])
model_tf.fit(X_train, y_train, epochs=1000, batch_size=32)

In [None]:
# Score every 2025 women's matchup. .copy() detaches the filtered rows from
# df_rs so that adding the 'Pred' column does not trigger pandas'
# SettingWithCopyWarning.
df_submission = df_rs[df_rs.Season == 2025].copy()
# The network ends in a single sigmoid unit, so predict() yields one column;
# ravel() flattens it to one value per matchup row.
df_submission['Pred'] = model_tf.predict(df_submission[features]).ravel()
df_submission = df_submission.reset_index()
df_submission_women = df_submission[['ID', 'Pred']]

df_submission_women

In [None]:
# Create final submission file
submission_df = pd.concat([
    df_submission_men,
    df_submission_women
], axis=0).sort_values(by='ID')

submission_df.reset_index(drop=True, inplace=True)

# Save submission file
submission_df.to_csv("submission.csv", index=False)
print("Submission file created successfully.")