In [None]:
%pip install pandas
%pip install matplotlib
%pip install numpy
%pip install seaborn
%pip install scikit-learn


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Directory holding the competition CSV files (relative to the notebook).
folder = './march-machine-learning-mania-2024/'

# Read the four tables used below, in the same order as they are named:
#   Seeds          - indexed later by (Season, TeamID) to look up seed strings
#   Conferences    - loaded but not referenced in the cells visible here
#   RegularDetail  - per-game regular-season rows with W*/L* box-score columns
#   TourneyCompact - tournament games (Season, WTeamID, LTeamID are used below)
Seeds, Conferences, RegularDetail, TourneyCompact = (
    pd.read_csv(folder + file_name)
    for file_name in (
        'MNCAATourneySeeds.csv',
        'MTeamConferences.csv',
        'MRegularSeasonDetailedResults.csv',
        'MNCAATourneyCompactResults.csv',
    )
)

# Show the available regular-season columns.
RegularDetail.columns.values

In [None]:
# Side-neutral column names: the team's own numbers first, then its opponent's.
columns = ['Season', 'TeamID', 'Points', 'OppPoints', 
           'Loc', 'NumOT', 'FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA',
           'OR', 'DR', 'Ast', 'TO', 'Stl', 'Blk', 'PF', 'OppFGM', 'OppFGA',
           'OppFGM3', 'OppFGA3', 'OppFTM', 'OppFTA', 'OppOR', 'OppDR', 'OppAst', 'OppTO',
           'OppStl', 'OppBlk', 'OppPF']

# Source columns, listed in the same order as `columns`, seen from the winner.
winner_source = ['Season', 'WTeamID', 'WScore', 'LScore',
       'WLoc', 'NumOT', 'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA',
       'WOR', 'WDR', 'WAst', 'WTO', 'WStl', 'WBlk', 'WPF', 'LFGM', 'LFGA',
       'LFGM3', 'LFGA3', 'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO',
       'LStl', 'LBlk', 'LPF']

# Same, seen from the loser (own L* columns first, W* columns as the opponent).
# 'WLoc' is copied as-is here; it is flipped to the loser's viewpoint afterwards.
loser_source = ['Season', 'LTeamID', 'LScore', 'WScore',
       'WLoc', 'NumOT', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3', 'LFTM', 'LFTA',
       'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF', 'WFGM', 'WFGA',
       'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR', 'WAst', 'WTO',
       'WStl', 'WBlk', 'WPF']

# One row per game for the winning team, tagged as a win.
WinTeams = (
    RegularDetail[winner_source]
    .set_axis(columns, axis=1)
    .assign(Wins=1, Losses=0)
)

# One row per game for the losing team; Wins/Losses are added after the
# location has been flipped.
LoseTeams = RegularDetail[loser_source].set_axis(columns, axis=1)

def change_loc(loc):
    """Return the game location as seen from the other team.

    'H' and 'A' are swapped; every other value (including 'N') maps to 'N'.
    Presumably H/A/N stand for home/away/neutral - confirm against the data
    dictionary.
    """
    swapped = {'H': 'A', 'A': 'H'}
    return swapped.get(loc, 'N')

# 'Loc' was copied from the winner's column, so flip it to the loser's
# viewpoint, then tag every one of these rows as a loss.
LoseTeams = LoseTeams.assign(
    Loc=LoseTeams['Loc'].map(change_loc),
    Wins=0,
    Losses=1,
)

# One row per team per game: all winner rows followed by all loser rows.
WinLoseTeams = pd.concat((WinTeams, LoseTeams))
WinLoseTeams

In [None]:
# Season totals per team. `numeric_only=True` keeps the string column 'Loc'
# out of the aggregation: summing it would either concatenate every game's
# location letter into one long string or be dropped with a warning,
# depending on the pandas version. No later cell reads a summed 'Loc'.
combinedTeams = WinLoseTeams.groupby(['Season', 'TeamID']).sum(numeric_only=True)

# Each row of WinLoseTeams is exactly one game, flagged as a win or a loss.
combinedTeams['NumGames'] = combinedTeams['Wins'] + combinedTeams['Losses']
combinedTeams.columns.values

In [None]:
def _per_game(stat):
    """Season total of `stat` divided by the number of games played."""
    return combinedTeams[stat] / combinedTeams['NumGames']


def _ratio(numerator, denominator):
    """Season total of one column divided by the season total of another."""
    return combinedTeams[numerator] / combinedTeams[denominator]


# Per-team, per-season model features, indexed like combinedTeams
# (Season, TeamID). Dict order fixes the column order used by the model.
RegularSeasonInput = pd.DataFrame({
    # Results and scoring
    'WinRatio': _per_game('Wins'),
    'PointsPerGame': _per_game('Points'),
    'PointsAllowedPerGame': _per_game('OppPoints'),
    'PointsRatio': _ratio('Points', 'OppPoints'),
    'OTsPerGame': _per_game('NumOT'),

    # Field goals
    'FGPerGame': _per_game('FGM'),
    'FGRatio': _ratio('FGM', 'FGA'),
    'FGAllowedPerGame': _per_game('OppFGM'),

    # Three-pointers
    'FG3PerGame': _per_game('FGM3'),
    'FG3Ratio': _ratio('FGM3', 'FGA3'),
    'FG3AllowedPerGame': _per_game('OppFGM3'),

    # Free throws
    'FTPerGame': _per_game('FTM'),
    'FTRatio': _ratio('FTM', 'FTA'),
    'FTAllowedPerGame': _per_game('OppFTM'),

    # Rebounds relative to the opponents' rebounds
    'ORRatio': _ratio('OR', 'OppOR'),
    'DRRatio': _ratio('DR', 'OppDR'),

    # Remaining box-score counts
    'AstPerGame': _per_game('Ast'),
    'TOPerGame': _per_game('TO'),
    'StlPerGame': _per_game('Stl'),
    'BlkPerGame': _per_game('Blk'),
    'PFPerGame': _per_game('PF'),
})


RegularSeasonInput

In [None]:
# Seed strings addressable by (Season, TeamID).
seed_dict = Seeds.set_index(['Season', 'TeamID'])

display(seed_dict)

# Not referenced by the cells visible here; kept in case other cells use them.
winIDs = TourneyCompact['WTeamID']
loseIDS = TourneyCompact['LTeamID']
season = TourneyCompact['Season']

# Every tournament game is listed twice so both labels are represented:
# once with the winner as Team1 (Result 1) ...
winners = (
    TourneyCompact[['Season', 'WTeamID', 'LTeamID']]
    .rename(columns={'WTeamID': 'Team1', 'LTeamID': 'Team2'})
    .assign(Result=1)
)

# ... and once with the loser as Team1 (Result 0).
losers = (
    TourneyCompact[['Season', 'LTeamID', 'WTeamID']]
    .rename(columns={'LTeamID': 'Team1', 'WTeamID': 'Team2'})
    .assign(Result=0)
)

# Keep seasons from 2003 onward (presumably where the detailed regular-season
# results begin - confirm) and renumber the rows 0..n-1, which the following
# cells rely on.
both_views = pd.concat([winners, losers])
TourneyInput = both_views[both_views['Season'] >= 2003].reset_index(drop=True)

TourneyInput 

In [None]:
def _seed_number(season_year, team_id):
    """Return the numeric part of a team's seed string for one season.

    The first character of the seed string is skipped; a four-character
    string also has its last character dropped before conversion to int.
    """
    seed = seed_dict.loc[(season_year, team_id)].values[0]
    digits = seed[1:-1] if len(seed) == 4 else seed[1:]
    return int(digits)


# Look up the seed of each side of every matchup, in TourneyInput row order.
team1seeds = [
    _seed_number(season_year, team_id)
    for season_year, team_id in zip(TourneyInput['Season'], TourneyInput['Team1'])
]
team2seeds = [
    _seed_number(season_year, team_id)
    for season_year, team_id in zip(TourneyInput['Season'], TourneyInput['Team2'])
]

TourneyInput['Team1Seed'] = team1seeds
TourneyInput['Team2Seed'] = team2seeds

TourneyInput

In [None]:
# Build the training table: for every matchup, Team1's regular-season features
# (plus seed) minus Team2's, followed by the Result label.
#
# The lookups are done for all rows at once instead of row by row. This avoids
# writing a 'Seed' entry onto a row returned by `.loc` (chained assignment on a
# slice of RegularSeasonInput) and is much faster. `.loc` with a list of
# (Season, TeamID) tuples still raises KeyError when a team has no
# regular-season row, as the per-row lookup did.
team1_keys = list(zip(TourneyInput['Season'], TourneyInput['Team1']))
team2_keys = list(zip(TourneyInput['Season'], TourneyInput['Team2']))

# reset_index(drop=True) lines both sides up positionally with TourneyInput.
team1_features = RegularSeasonInput.loc[team1_keys].reset_index(drop=True)
team1_features['Seed'] = TourneyInput['Team1Seed'].to_numpy()

team2_features = RegularSeasonInput.loc[team2_keys].reset_index(drop=True)
team2_features['Seed'] = TourneyInput['Team2Seed'].to_numpy()

# Everything is kept as float (including 'Seed' and 'Result') so the column
# dtypes, and therefore the class labels the model learns, stay 0.0 / 1.0.
outscores = (team1_features - team2_features).astype(float)
outscores['Result'] = TourneyInput['Result'].to_numpy(dtype=float)

outscores

In [None]:
# Pairwise correlations between all feature differences and the label,
# rounded to two decimals.
corrs = outscores.corr().round(2)

# Strength (sign ignored) of each column's correlation with the outcome.
display(corrs['Result'].abs())

In [None]:
# Heatmap of the rounded correlation matrix on an explicit 15x10 inch axes.
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(corrs, ax=ax)
plt.show()

In [None]:
# Feature matrix: every column except the trailing 'Result' label.
X = outscores[outscores.columns[:-1]].values
y = outscores['Result'].values

# Reproducible shuffled split: the last 20% of the permutation is the test set.
np.random.seed(1)
idx = np.random.permutation(len(X))

# Split on an explicit position so that a tiny data set (where 20% rounds to
# zero) yields an empty test set rather than an empty training set.
n_test = int(.2 * len(X))
split_at = len(X) - n_test
train_idx = idx[:split_at]
test_idx = idx[split_at:]

X_train = X[train_idx]
X_test = X[test_idx]
y_train = y[train_idx]
y_test = y[test_idx]

# Per-feature range of the training data, kept for inspection.
mins = X_train.min(axis=0)
maxs = X_train.max(axis=0)

# Min-max scaling is deliberately NOT applied to X_train / X_test. The
# prediction cells feed raw (unscaled) feature differences to the model, so the
# training data must stay unscaled too. Tree ensembles split on thresholds and
# gain nothing from per-feature rescaling. The two bare expressions that used
# to sit here computed a scaled array and discarded it, so they were removed.

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with trees limited to depth 9; the fixed random_state makes
# the fit repeatable. `fit` returns the estimator itself.
model = RandomForestClassifier(random_state=1, max_depth=9).fit(X_train, y_train)

# Mean accuracy on the held-out matchups.
model.score(X_test, y_test)

In [None]:
# Example Prediction

team1ID, team2ID = 1177, 1474

# 2024 regular-season features of each team, in RegularSeasonInput column
# order, followed by the seed used for this example (1 for team 1, 2 for team 2).
team1_stats = RegularSeasonInput.loc[(2024, team1ID)].tolist()
team1_stats.append(1)

team2_stats = RegularSeasonInput.loc[(2024, team2ID)].tolist()
team2_stats.append(2)


# Column vectors, so the difference has to be transposed into a single row
# before it is shown and passed to the model.
out1 = pd.DataFrame(team1_stats)
out2 = pd.DataFrame(team2_stats)

out = out1 - out2
matchup_row = out.transpose()

display(matchup_row)

# Prints the predicted 'Result' label for (team 1 minus team 2).
print(model.predict(matchup_row))

In [None]:
# 2024 bracket seeds ('Seed' and 'TeamID' columns are used below). Read from
# the same `folder` as the other competition files so the data directory is
# configured in one place only.
df = pd.read_csv(folder+'2024_tourney_seeds.csv')

def simulate_tournament_and_predict_winners(df, season=2024, team_stats=None, classifier=None):
    """Simulate every region's bracket with the fitted model and print the winners.

    Regions are taken from the first character of 'Seed'. Within a region the
    teams are ordered by seed number and, in each round, the i-th team from the
    top of the current list plays the i-th team from the bottom (1 v 16,
    2 v 15, ... in a 16-team region). Winners are kept in game order and NOT
    re-sorted, so pairing top-with-bottom again reproduces a fixed bracket
    (the 1/16 winner meets the 8/9 winner, and so on).

    Teams that share a seed number (seed strings with a trailing letter, such
    as 'W16a' / 'W16b') first play each other and only the winner enters the
    bracket.

    Parameters
    ----------
    df : DataFrame
        Must contain a 'Seed' column (region letter, two-digit seed, optional
        trailing letter) and a 'TeamID' column.
    season : int, default 2024
        Season whose regular-season features are looked up.
    team_stats : DataFrame, optional
        Features indexed by (Season, TeamID). Defaults to the notebook-level
        ``RegularSeasonInput``.
    classifier : object with ``predict``, optional
        Defaults to the notebook-level ``model``. It receives a one-row frame
        of (team 1 - team 2) feature differences with the seed difference
        last; a prediction greater than 0 means team 1 wins.

    Returns
    -------
    None. Results are printed.

    Raises
    ------
    KeyError
        If a team in `df` has no row in `team_stats` for `season`.
        NOTE(review): if the seeds file mixes several tournaments, filter it
        to the one matching `team_stats` before calling - confirm against the
        file's columns.
    """
    if team_stats is None:
        team_stats = RegularSeasonInput
    if classifier is None:
        classifier = model

    def feature_vector(team_row):
        # Same layout as the training matrix: season features, then the seed.
        values = team_stats.loc[(season, team_row['TeamID'])].tolist()
        values.append(team_row['SeedNo'])
        return values

    def predicted_winner(team1_row, team2_row):
        # Column vectors are subtracted and transposed into the single row
        # the classifier expects.
        diff = pd.DataFrame(feature_vector(team1_row)) - pd.DataFrame(feature_vector(team2_row))
        prediction = classifier.predict(diff.transpose())[0]
        return team1_row if prediction > 0 else team2_row

    regions = df['Seed'].str[0].unique()

    for region in regions:
        print(f"--- Region: {region} ---")
        region_df = df[df['Seed'].str.startswith(region)].copy()
        # Characters 1-2 are the seed number; slicing [1:3] ignores an
        # optional trailing letter instead of failing on it.
        region_df['SeedNo'] = region_df['Seed'].str[1:3].astype(int)
        region_df = region_df.sort_values(['SeedNo', 'Seed']).reset_index(drop=True)

        # Resolve shared seed lines first so each seed number has one team.
        contenders = []
        play_in_announced = False
        for _, same_seed in region_df.groupby('SeedNo', sort=True):
            survivor = same_seed.iloc[0]
            for k in range(1, len(same_seed)):
                if not play_in_announced:
                    print(f"Play-in winners (Region {region}):")
                    play_in_announced = True
                survivor = predicted_winner(survivor, same_seed.iloc[k])
                print(f"  Seed {survivor['Seed']}: TeamID {survivor['TeamID']}")
            contenders.append(survivor)

        round_number = 1
        while len(contenders) > 1:
            print(f"Round {round_number} winners (Region {region}):")
            next_round_teams = []
            n_teams = len(contenders)

            # Top of the list against the bottom of the list, moving inwards.
            for i in range(n_teams // 2):
                winner_row = predicted_winner(contenders[i], contenders[n_teams - 1 - i])
                next_round_teams.append(winner_row)
                print(f"  Seed {winner_row['Seed']}: TeamID {winner_row['TeamID']}")

            if n_teams % 2:
                # Odd number of teams: the unpaired middle team advances.
                bye_row = contenders[n_teams // 2]
                print(f"  Seed {bye_row['Seed']}: TeamID {bye_row['TeamID']}")
                next_round_teams.append(bye_row)

            contenders = next_round_teams
            round_number += 1

        champion = contenders[0]
        print(f"Region {region} Champion: Seed {champion['Seed']}, TeamID {champion['TeamID']}\n")

# Run the bracket simulation on the seeds table `df` read at the top of this
# cell; the round-by-round winners and each region's champion are printed.
simulate_tournament_and_predict_winners(df)