In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from datetime import datetime, timedelta
from IPython.display import clear_output
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/march-machine-learning-mania-2023/Conferences.csv
/kaggle/input/march-machine-learning-mania-2023/WNCAATourneyDetailedResults.csv
/kaggle/input/march-machine-learning-mania-2023/WRegularSeasonCompactResults.csv
/kaggle/input/march-machine-learning-mania-2023/MNCAATourneySeedRoundSlots.csv
/kaggle/input/march-machine-learning-mania-2023/MRegularSeasonDetailedResults.csv
/kaggle/input/march-machine-learning-mania-2023/MNCAATourneyCompactResults.csv
/kaggle/input/march-machine-learning-mania-2023/MGameCities.csv
/kaggle/input/march-machine-learning-mania-2023/WGameCities.csv
/kaggle/input/march-machine-learning-mania-2023/MSeasons.csv
/kaggle/input/march-machine-learning-mania-2023/WNCAATourneySlots.csv
/kaggle/input/march-machine-learning-mania-2023/MSecondaryTourneyTeams.csv
/kaggle/input/march-machine-learning-mania-2023/Cities.csv
/kaggle/input/march-machine-learning-mania-2023/SampleSubmission2023.csv
/kaggle/input/march-machine-learning-mania-2023/MTeamSpellings.csv
/k

In [2]:
start_data_season = {
    'M': 2003,
    'W': 2009
}
columns = [
            'Season', 'TeamA', 'TeamB', 
            'ASeedNum', 'ALastTournPct', 'ARegSznPct',
            'APts', 'APtsOpp', 'AFG', 'AFG3', 'AFT', 'AAst', 'ATO', 'AOR', 'ADR', 'AStl', 'ABlk', 'APF',
            'BSeedNum', 'BLastTournPct', 'BRegSznPct',
            'BPts', 'BPtsOpp', 'BFG', 'BFG3', 'BFT', 'BAst', 'BTO', 'BOR', 'BDR', 'BStl', 'BBlk', 'BPF',
            'Winner'
        ]

In [3]:
def get_seed_number(seed):
    if 'a' in seed or 'b' in seed:
        return 17
    return int(seed[1:])

def generate_numeric_cols(df, team):
    df[f'{team}FG'] = (df[f'{team}FGM'] * 100) / df[f'{team}FGA']
    df[f'{team}FG3'] = (df[f'{team}FGM3'] * 100) / df[f'{team}FGA3']
    df[f'{team}FT'] = (df[f'{team}FTM'] * 100) / df[f'{team}FTA']

def get_files_by_competition(gender):
    # Getting teams df
    teams_file = f'/kaggle/input/march-machine-learning-mania-2023/{gender}Teams.csv'
    teams_df = pd.read_csv(teams_file)
    display(teams_df)
    
    # Getting regular season games df
    reg_szn_file = f'/kaggle/input/march-machine-learning-mania-2023/{gender}RegularSeasonDetailedResults.csv'
    reg_szn_df = pd.read_csv(reg_szn_file)
    generate_numeric_cols(reg_szn_df, 'W')
    generate_numeric_cols(reg_szn_df, 'L')
    display(reg_szn_df)
    
    # Getting tournament season games df
    tourn_file = f'/kaggle/input/march-machine-learning-mania-2023/{gender}NCAATourneyCompactResults.csv'
    tourn_df = pd.read_csv(tourn_file)
    display(tourn_df)
    
    # Getting seeding df
    seed_file = f'/kaggle/input/march-machine-learning-mania-2023/{gender}NCAATourneySeeds.csv'
    seed_df = pd.read_csv(seed_file)
    seed_df['SeedNum'] = seed_df.Seed.apply(lambda x: get_seed_number(x))
    display(seed_df)
    
    return teams_df, reg_szn_df, tourn_df, seed_df

In [4]:
def get_reg_szn_stats(season, team, reg_szn_df):
    w_games = reg_szn_df[reg_szn_df['WTeamID'] == team].rename(columns = {
        'WFG': 'FG', 'WFG3': 'FG3', 'WFT': 'FT', 'WAst': 'Ast', 'WTO': 'TO', 'WOR': 'OR', 'WDR': 'DR', 'WStl': 'Stl', 'WBlk': 'Blk', 'WPF': 'PF',
        'WScore': 'Score',
        
        'LFG': 'OppFG', 'LFG3': 'OppFG3', 'LFT': 'OppFT', 'LAst': 'OppAst', 'LTO': 'OppTO', 'LOR': 'OppOR', 'LDR': 'OppDR', 'LStl': 'OppStl', 'LBlk': 'OppBlk', 'LPF': 'OppPF',
        'LScore': 'OppScore',
        })
    w_games['Won'] = 1
    
    l_games = reg_szn_df[reg_szn_df['LTeamID'] == team].rename(columns = {
        'LFG': 'FG', 'LFG3': 'FG3', 'LFT': 'FT', 'LAst': 'Ast', 'LTO': 'TO', 'LOR': 'OR', 'LDR': 'DR', 'LStl': 'Stl', 'LBlk': 'Blk', 'LPF': 'PF',
        'LScore': 'Score',
        
        'WFG': 'OppFG', 'WFG3': 'OppFG3', 'WFT': 'OppFT', 'WAst': 'OppAst', 'WTO': 'OppTO', 'WOR': 'OppOR', 'WDR': 'OppDR', 'WStl': 'OppStl', 'WBlk': 'OppBlk', 'WPF': 'OppPF',
        'WScore': 'OppScore',
        })
    l_games['Won'] = 0
    
    games = pd.concat([w_games, l_games], axis=0, ignore_index=True)
    
    reg_szn_pct = (len(w_games) * 100) / len(games)
    
    return [   
               reg_szn_pct, 
               games['Score'].mean(), games['OppScore'].mean(), games['FG'].mean(), games['FG3'].mean(), games['FT'].mean(), games['Ast'].mean(), games['TO'].mean(), 
               games['OR'].mean(), games['DR'].mean(), games['Stl'].mean(), games['Blk'].mean(), games['PF'].mean()
           ]

In [5]:
def get_last_tourn_pct(season, team, tourn_df):
    wins = len(tourn_df[(tourn_df['Season'] == season-1) & (tourn_df['WTeamID'] == team)])
    total_games = len(tourn_df[(tourn_df['Season'] == season-1) & ((tourn_df['WTeamID'] == team) | (tourn_df['LTeamID'] == team))])
    
    if not total_games: return 0
    return (wins * 100) / total_games

In [6]:
def get_game_stats(season, team_a, team_b, reg_szn_df, seed_df, tourn_df):
    a_reg_szn_stats = get_reg_szn_stats(season, team_a, reg_szn_df)
    b_reg_szn_stats = get_reg_szn_stats(season, team_b, reg_szn_df)

    try:
        a_seed = seed_df[(seed_df['Season'] == season) & (seed_df['TeamID'] == team_a)].reset_index().loc[0, 'SeedNum']
    except:
        a_seed = 24
    try:
        b_seed = seed_df[(seed_df['Season'] == season) & (seed_df['TeamID'] == team_b)].reset_index().loc[0, 'SeedNum']
    except:
        b_seed = 24

    a_last_tourn_pct = get_last_tourn_pct(season, team_a, tourn_df)
    b_last_tourn_pct = get_last_tourn_pct(season, team_b, tourn_df)

    stats_a = [a_seed, a_last_tourn_pct] + (a_reg_szn_stats)
    stats_b = [b_seed, b_last_tourn_pct] + (b_reg_szn_stats)
    
    return [season, team_a, team_b] + stats_a + stats_b

def build_dataset(start_data_season, reg_szn_df, tourn_df, seed_df):
    data = []

    for season in range(start_data_season + 1, 2024):
        tourney_games = tourn_df[tourn_df['Season'] == season].reset_index(drop=True)
        for idx, g in tourney_games.iterrows():

            clear_output(wait=True)
            print(f"{season}: {idx}/{len(tourney_games)}")

            team_a = min([g['WTeamID'], g['LTeamID']])
            team_b = max([g['WTeamID'], g['LTeamID']])

            if team_a == g['WTeamID']:
                winner = 'A'
                team_a_score = g['WScore']
                team_b_score = g['LScore']
            else:
                winner = 'B'
                team_a_score = g['LScore']
                team_b_score = g['WScore']

            print(f"{team_a} x {team_b}")

            game_stats = get_game_stats(season, team_a, team_b, reg_szn_df, seed_df, tourn_df)

            data.append(game_stats + [winner])

    data_df = pd.DataFrame(data, columns=columns)
    display(data_df)
    
    return data_df

In [7]:
def train_model(x_train, y_train):
    classifier = RandomForestClassifier(random_state=0)
    classifier.fit(x_train, y_train)
    return classifier

def predict_seasons(gender, start_data_season, data_df, detailed_results):

    acc_sum = 0
    seasons_count = 0

    for season in range(start_data_season + 2, 2024):
        data_train = data_df[(data_df['Season'] < season)].reset_index(drop=True)
        data_test = data_df[(data_df['Season'] == season)].reset_index(drop=True)

        if not len(data_test):
            continue

        x_train = data_train.drop(['Winner'], axis=1)
        y_train = data_train.loc[:, 'Winner']
            
        classifier = train_model(x_train, y_train)

        x_test = data_test.drop(['Winner'], axis=1)
        y_test = data_test.loc[:, 'Winner']

        predictions = classifier.predict(x_test)
        
        acc = accuracy_score(y_test, predictions)
        acc_sum += acc
        seasons_count += 1
        
        if detailed_results:
            print(f'\nResults for {gender} season {season}:')
            print('Accuracy predictions:', acc)

            cm = confusion_matrix(y_test, predictions)
            print('Confusion matrix:')
            cm_disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classifier.classes_)
            cm_disp.plot()
            plt.show()

    print(f'\n\n{gender} Mean Accuracy: {acc_sum/seasons_count}')
    
    return classifier


In [8]:
def get_current_predictions(classifier, reg_szn_df, tourn_df, seed_df):
    season = 2023
    data = []
    teams = seed_df[seed_df['Season'] == season].TeamID.unique()
    teams.sort()
    
    for idx, team_a in enumerate(teams):
        for idx_b in range(idx + 1, len(teams)):
            team_b = teams[idx_b]
            
            clear_output(wait=True)
            print(f"{idx}/{len(teams)}")
            print(f"{team_a} x {team_b}")
            
            game_stats = get_game_stats(season, team_a, team_b, reg_szn_df, seed_df, tourn_df)
            data.append(game_stats)
            
    data_df = pd.DataFrame(data, columns=columns[:-1])
    probs = classifier.predict_proba(data_df)
    pred = classifier.predict(data_df)

    data_df['AProb'] = probs[:, 0]
    data_df['BProb'] = probs[:, 1]
    data_df['Pred'] = pred
    
    return data_df

In [9]:
def pipeline(gender, detailed_results=False):
    teams_df, reg_szn_df, tourn_df, seed_df = get_files_by_competition(gender)
    data_df = build_dataset(start_data_season[gender], reg_szn_df, tourn_df, seed_df)
    classifier = predict_seasons(gender, start_data_season[gender], data_df, detailed_results)
    current_szn_df = get_current_predictions(classifier, reg_szn_df, tourn_df, seed_df)
    display(current_szn_df)
    return current_szn_df

In [10]:
m_current_szn_df = pipeline('M')

66/68
1452 x 1462


Unnamed: 0,Season,TeamA,TeamB,ASeedNum,ALastTournPct,ARegSznPct,APts,APtsOpp,AFG,AFG3,...,BAst,BTO,BOR,BDR,BStl,BBlk,BPF,AProb,BProb,Pred
0,2023,1104,1112,1,0.0,60.242792,71.825493,67.534143,44.422728,33.656733,...,15.358098,12.747400,11.185736,25.421991,6.343239,3.707281,16.549777,0.34,0.66,B
1,2023,1104,1113,1,0.0,60.242792,71.825493,67.534143,44.422728,33.656733,...,13.918210,12.776235,10.004630,23.925926,6.274691,3.537037,17.854938,0.63,0.37,A
2,2023,1104,1116,1,0.0,60.242792,71.825493,67.534143,44.422728,33.656733,...,14.160902,13.010526,11.321805,23.792481,7.751880,4.845113,18.589474,0.68,0.32,A
3,2023,1104,1120,1,0.0,60.242792,71.825493,67.534143,44.422728,33.656733,...,13.383667,13.702619,11.543914,23.477658,7.554700,4.306626,18.710324,0.60,0.40,A
4,2023,1104,1124,1,0.0,60.242792,71.825493,67.534143,44.422728,33.656733,...,13.993610,13.298722,12.121406,23.394569,6.976038,4.079872,17.400958,0.46,0.54,B
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2273,2023,1436,1452,15,0.0,70.433437,70.496904,63.402477,46.029676,35.703908,...,14.575075,12.232733,12.731231,22.457958,7.527027,3.773273,17.944444,0.55,0.45,A
2274,2023,1436,1462,15,0.0,70.433437,70.496904,63.402477,46.029676,35.703908,...,15.060241,12.823795,11.030120,25.522590,6.112952,3.426205,17.191265,0.23,0.77,B
2275,2023,1438,1452,4,0.0,66.666667,68.063863,62.077882,44.835277,35.952306,...,14.575075,12.232733,12.731231,22.457958,7.527027,3.773273,17.944444,0.61,0.39,A
2276,2023,1438,1462,4,0.0,66.666667,68.063863,62.077882,44.835277,35.952306,...,15.060241,12.823795,11.030120,25.522590,6.112952,3.426205,17.191265,0.44,0.56,B


In [11]:
w_current_szn_df = pipeline('W')

66/68
3450 x 3452


Unnamed: 0,Season,TeamA,TeamB,ASeedNum,ALastTournPct,ARegSznPct,APts,APtsOpp,AFG,AFG3,...,BAst,BTO,BOR,BDR,BStl,BBlk,BPF,AProb,BProb,Pred
0,2023,3104,3112,10,0.0,51.190476,66.235714,64.509524,39.625995,30.661384,...,12.533654,15.298077,11.088942,24.290865,8.271635,3.293269,16.353365,0.51,0.49,A
1,2023,3104,3124,10,0.0,51.190476,66.235714,64.509524,39.625995,30.661384,...,19.627232,13.997768,14.080357,30.404018,7.915179,6.060268,15.031250,0.29,0.71,B
2,2023,3104,3151,10,0.0,51.190476,66.235714,64.509524,39.625995,30.661384,...,12.606132,14.181604,10.327830,25.212264,6.761792,3.766509,13.547170,0.56,0.44,A
3,2023,3104,3156,10,0.0,51.190476,66.235714,64.509524,39.625995,30.661384,...,12.767500,15.845000,11.517500,24.905000,8.672500,3.057500,16.550000,0.65,0.35,A
4,2023,3104,3160,10,0.0,51.190476,66.235714,64.509524,39.625995,30.661384,...,13.138015,15.883777,12.985472,25.067797,8.326877,3.716707,18.167070,0.45,0.55,B
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2273,2023,3437,3450,4,50.0,62.470309,61.923990,58.261283,40.221702,31.912471,...,13.314554,15.293427,12.260563,24.007042,8.732394,3.659624,17.492958,0.53,0.47,A
2274,2023,3437,3452,4,50.0,62.470309,61.923990,58.261283,40.221702,31.912471,...,13.735632,15.857471,13.059770,25.657471,9.094253,4.737931,17.967816,0.69,0.31,A
2275,2023,3439,3450,1,0.0,53.699284,64.708831,61.300716,40.201133,31.720223,...,13.314554,15.293427,12.260563,24.007042,8.732394,3.659624,17.492958,0.62,0.38,A
2276,2023,3439,3452,1,0.0,53.699284,64.708831,61.300716,40.201133,31.720223,...,13.735632,15.857471,13.059770,25.657471,9.094253,4.737931,17.967816,0.60,0.40,A


In [12]:
def set_game_id(row):
    return f"{row['Season']}_{row['TeamA']}_{row['TeamB']}"

current_szn_df = pd.concat([m_current_szn_df, w_current_szn_df], axis=0)
current_szn_df['ID'] = current_szn_df.apply(lambda row: set_game_id(row), axis=1)
display(current_szn_df.loc[:, ['ID', 'AProb']].rename({'AProb': 'Pred'}, axis=1))

Unnamed: 0,ID,Pred
0,2023_1104_1112,0.34
1,2023_1104_1113,0.63
2,2023_1104_1116,0.68
3,2023_1104_1120,0.60
4,2023_1104_1124,0.46
...,...,...
2273,2023_3437_3450,0.53
2274,2023_3437_3452,0.69
2275,2023_3439_3450,0.62
2276,2023_3439_3452,0.60
