Requires the following input files in the working directory:

- MRegularSeasonDetailedResults.csv

- MNCAATourneySeeds.csv

- MNCAATourneyCompactResults.csv

- MTeams.csv

In [None]:
import pandas as pd
from itertools import combinations

# Raw inputs, read from the current working directory.
teams = pd.read_csv("MTeams.csv")
results = pd.read_csv("MRegularSeasonDetailedResults.csv")
tourney = pd.read_csv("MNCAATourneyCompactResults.csv")
seeds = pd.read_csv("MNCAATourneySeeds.csv")

# Season that every season-keyed table below is narrowed to.
season = 2025

# Regular-season and tournament games: keep only rows of the chosen season.
results, tourney = (
    frame[frame['Season'] == season] for frame in (results, tourney)
)

# Characters 1-2 of the seed string are parsed as the integer seed; this is
# done on the full table before it is narrowed to the chosen season.
# NOTE(review): assumes values shaped like "W01" / "X16a" - confirm.
seeds['Seed'] = seeds['Seed'].str.slice(1, 3).astype(int)
seeds = seeds.loc[seeds['Season'] == season]

def calc_poss(row):
    """Estimate the possessions in a game, averaged over both teams.

    For each side (``W`` and ``L`` column prefixes) the estimate is
    ``FGA + 0.475 * FTA - OR + TO``; the two estimates are then averaged.

    Args:
        row: Mapping-like object indexable by the column names ``WFGA``,
            ``WFTA``, ``WOR``, ``WTO``, ``LFGA``, ``LFTA``, ``LOR``, ``LTO``.

    Returns:
        The mean of the two per-side possession estimates.
    """
    per_side = []
    for side in ('W', 'L'):
        per_side.append(
            row[side + 'FGA'] + 0.475 * row[side + 'FTA']
            - row[side + 'OR'] + row[side + 'TO']
        )
    winner_poss, loser_poss = per_side
    return 0.5 * (winner_poss + loser_poss)

# Per-game possession estimate. calc_poss only indexes by column name and does
# arithmetic, so it can be applied to whole columns at once instead of row by
# row. Rebinding through assign() yields a new frame rather than writing a
# column into the season-filtered slice, which avoids SettingWithCopyWarning.
results = results.assign(Possessions=calc_poss(results))

def stack_results(results):
    """Reshape one-row-per-game results into one-row-per-team-per-game.

    Every game produces two output rows: one from the winner's point of view
    and one from the loser's. Side-specific ``W*``/``L*`` columns are mapped to
    side-neutral names, and the opponent's id and score are kept alongside.

    Args:
        results: DataFrame with ``WTeamID``, ``LTeamID``, ``WScore``,
            ``LScore``, the ``W``/``L``-prefixed box-score columns ``FGM``,
            ``FGA``, ``FGM3``, ``FGA3``, ``FTM``, ``FTA``, ``OR``, ``DR``,
            ``TO``, and a ``Possessions`` column.

    Returns:
        DataFrame with columns ``TeamID``, ``OppID``, ``Score``, ``OppScore``,
        the nine box-score columns and ``Poss``. All winner rows come first,
        followed by all loser rows, on a fresh 0..n-1 index.

    Raises:
        KeyError: If any required source column is missing from ``results``.
    """
    box_stats = ('FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'OR', 'DR', 'TO')

    def one_side(own, opp):
        # DataFrame.rename expects {existing name: new name}, so the mapping is
        # keyed by the source column.
        source_to_target = {
            own + 'TeamID': 'TeamID',
            opp + 'TeamID': 'OppID',
            own + 'Score': 'Score',
            opp + 'Score': 'OppScore',
        }
        for stat in box_stats:
            source_to_target[own + stat] = stat
        # The possession estimate is shared by both sides of a game.
        source_to_target['Possessions'] = 'Poss'
        return results[list(source_to_target)].rename(columns=source_to_target)

    return pd.concat([one_side('W', 'L'), one_side('L', 'W')], ignore_index=True)

# One row per team per game (each game appears once for each participant).
stacked = stack_results(results)

# Season aggregates per team: scores are averaged per game, box-score counts
# and possessions are summed over all of the team's games.
team_stats = stacked.groupby('TeamID').agg({
    'Score': 'mean',
    'OppScore': 'mean',
    'FGM': 'sum', 'FGA': 'sum', 'FGM3': 'sum', 'FGA3': 'sum',
    'FTM': 'sum', 'FTA': 'sum',
    'OR': 'sum', 'DR': 'sum',
    'TO': 'sum',
    'Poss': 'sum'
}).reset_index()

# Games played by each team, aligned to team_stats rows. Tempo must be
# normalised by the team's own game count; the number of distinct game days in
# the season is the same for every team and is not a per-game denominator.
games_played = team_stats['TeamID'].map(stacked.groupby('TeamID').size())

team_stats['margin_of_victory'] = team_stats['Score'] - team_stats['OppScore']
# Effective field-goal percentage: made threes get an extra half weight.
team_stats['efg_pct'] = (team_stats['FGM'] + 0.5 * team_stats['FGM3']) / team_stats['FGA']
team_stats['turnover_rate'] = team_stats['TO'] / team_stats['Poss']
# Average possessions per game.
team_stats['tempo'] = team_stats['Poss'] / games_played
# NOTE(review): both rebound rates use only the team's own rebounds, so they
# are each other's complement (they sum to 1). Rates measured against the
# opponent's rebounds would need opponent OR/DR columns in `stacked`.
team_stats['def_reb_rate'] = team_stats['DR'] / (team_stats['DR'] + team_stats['OR'])  # vs. total reb
team_stats['off_reb_rate'] = team_stats['OR'] / (team_stats['OR'] + team_stats['DR'])  # proxy
team_stats['power_rating'] = team_stats['margin_of_victory'] + team_stats['efg_pct'] * 10

features = team_stats[['TeamID', 'efg_pct', 'turnover_rate', 'def_reb_rate',
                       'off_reb_rate', 'margin_of_victory', 'power_rating', 'tempo']]

# Left merge keeps every team; teams without a seed row get NaN in 'Seed'.
features = features.merge(seeds[['TeamID', 'Seed']], how='left', on='TeamID')

# Features that become 'diff_<name>' (team A minus team B) in each matchup row.
diff_features = ['efg_pct', 'turnover_rate', 'def_reb_rate', 'off_reb_rate',
                 'margin_of_victory', 'power_rating', 'tempo']
# Seed value substituted for teams that have no seed.
unseeded_seed = 17

# Build the lookups once so the pair loop does dictionary reads instead of
# re-scanning `features` and `tourney` for every one of the O(n^2) pairs.
team_rows = features.set_index('TeamID').to_dict('index')
seed_of = {
    team_id: (row['Seed'] if pd.notna(row['Seed']) else unseeded_seed)
    for team_id, row in team_rows.items()
}

# Winner of the tournament game between each unordered pair of teams. If a
# pair appears more than once, the first listed game is the one that counts.
winner_of = {}
for winner, loser in zip(tourney['WTeamID'], tourney['LTeamID']):
    winner_of.setdefault(frozenset((winner, loser)), winner)

matchups = []
for teamA, teamB in combinations(features['TeamID'], 2):
    rowA = team_rows[teamA]
    rowB = team_rows[teamB]

    m = {'Season': season, 'teamA_id': teamA, 'teamB_id': teamB}
    for name in diff_features:
        m['diff_' + name] = rowA[name] - rowB[name]
    m['diff_seed'] = seed_of[teamA] - seed_of[teamB]

    # label: 1 if team A won the pair's tournament game, 0 if team B won,
    # None when the two teams did not meet in the tournament.
    winner = winner_of.get(frozenset((teamA, teamB)))
    m['label'] = None if winner is None else int(winner == teamA)

    matchups.append(m)

# === Save matchups ===
df_matchups = pd.DataFrame(matchups)
# The file name follows `season` so that changing the season above cannot
# produce a file labelled with the wrong year.
output_path = f"full_matchups_{season}.csv"
df_matchups.to_csv(output_path, index=False)
print(f"✅ Saved {output_path} with", len(df_matchups), "matchups.")
