# Kaggle March Madness Competition

In [1]:
from pathlib import Path
import os
import pandas as pd

In [2]:
# set path to data directories
raw_data = Path('../march-machine-learning-mania-2023/')
data_path = Path('../data/')

## Load Relevant Raw Data

In [3]:
# teams
m_teams = pd.read_csv(raw_data/'MTeams.csv')
w_teams = pd.read_csv(raw_data/'WTeams.csv')

# seasons
m_seasons = pd.read_csv(raw_data/'MSeasons.csv')
w_seasons = pd.read_csv(raw_data/'WSeasons.csv')

# NCAA tournament seeds
m_seeds = pd.read_csv(raw_data/'MNCAATourneySeeds.csv')
w_seeds = pd.read_csv(raw_data/'WNCAATourneySeeds.csv')

# compact regular season results
m_season_compact = pd.read_csv(raw_data/'MRegularSeasonCompactResults.csv')
w_season_compact = pd.read_csv(raw_data/'WRegularSeasonCompactResults.csv')

# compact tournament results
m_ncaa_compact = pd.read_csv(raw_data/'MNCAATourneyCompactResults.csv')
w_ncaa_compact = pd.read_csv(raw_data/'WNCAATourneyCompactResults.csv')

# sample submission file
samp_submission = pd.read_csv(raw_data/'SampleSubmission2023.csv')

## Preprocess Data

In [4]:
# concat mens and womens datasets
teams = pd.concat([m_teams, w_teams], axis=0)
seasons = pd.concat([m_seasons, w_seasons], axis=0)
seeds = pd.concat([m_seeds, w_seeds], axis=0)
season_results = pd.concat([m_season_compact, w_season_compact], axis=0)
ncaa_results = pd.concat([m_ncaa_compact, w_ncaa_compact])

In [5]:
# add winning team seeds to ncaa_results
ncaa_results = ncaa_results.merge(seeds, how='left', left_on=['WTeamID', 'Season'], right_on=['TeamID', 'Season'])
ncaa_results.rename(columns={'Seed': 'WTeamSeed'}, inplace=True)
ncaa_results.drop('TeamID', axis=1, inplace=True)

# add losing team seeds
ncaa_results = ncaa_results.merge(seeds, how='left', left_on=['LTeamID', 'Season'], right_on=['TeamID', 'Season'])
ncaa_results.rename(columns={'Seed':'LTeamSeed'}, inplace=True)
ncaa_results.drop('TeamID', axis=1, inplace=True)

In [6]:
# rename season columns
seasons.columns = ['Season', 'SeasonStart', 'RegionW', 'RegionX', 'RegionY', 'RegionZ']

# add to ncaa_results
ncaa_results = ncaa_results.merge(seasons, how='left', on='Season')

In [7]:
# identify team regions based on seed
ncaa_results['WTeamRegion'] = ncaa_results['WTeamSeed'].apply(lambda x: x[0])
ncaa_results['LTeamRegion'] = ncaa_results['LTeamSeed'].apply(lambda x: x[0])

In [8]:
# create mapping dict for regions
ncaa_results['mapping_dict'] = ncaa_results.apply(lambda x: {'W': x['RegionW'], 
                                                             'X': x['RegionX'],
                                                             'Y': x['RegionY'],
                                                             'Z': x['RegionZ']}, axis=1)

In [9]:
# replace region codes with the proper region name
ncaa_results['WTeamRegion'] = ncaa_results.apply(lambda x: x['mapping_dict'][x['WTeamRegion']], axis=1)
ncaa_results['LTeamRegion'] = ncaa_results.apply(lambda x: x['mapping_dict'][x['LTeamRegion']], axis=1)

In [10]:
# drop unnecessary columns
ncaa_results.drop(['RegionW', 'RegionX', 'RegionY', 'RegionZ', 'mapping_dict'], axis=1, inplace=True)

In [11]:
# winning team details 
ncaa_results = ncaa_results.merge(teams, how='left', left_on='WTeamID', right_on='TeamID')
ncaa_results.rename(columns={'TeamName': 'WTeamName',
                             'FirstD1Season': 'WTeamFirstD1Season',
                             'LastD1Season': 'WTeamLastD1Season'}, inplace=True)
ncaa_results.drop('TeamID', axis=1, inplace=True)

# losing team details 
ncaa_results = ncaa_results.merge(teams, how='left', left_on='LTeamID', right_on='TeamID')
ncaa_results.rename(columns={'TeamName':'LTeamName',
                             'FirstD1Season': 'LTeamFirstD1Season',
                             'LastD1Season': 'LTeamLastD1Season'}, inplace=True)
ncaa_results.drop('TeamID', axis=1, inplace=True)

In [12]:
# add columns 
season_results['WTeamScoreDiff'] = season_results['WScore'] - season_results['LScore']
season_results['LTeamScoreDiff'] = season_results['LScore'] - season_results['WScore']

In [13]:
# regular season games won
grouped = season_results.groupby(['Season', 'WTeamID']).count().reset_index()
grouped = grouped[['Season', 'WTeamID', 'WScore']]
grouped.columns = ['Season', 'TeamID', 'GamesWon']

# concat into ncaa_results on WTeam
ncaa_results = ncaa_results.merge(grouped, how='left', left_on=['Season', 'WTeamID'], right_on=['Season', 'TeamID'])
ncaa_results.drop('TeamID', axis=1, inplace=True)
ncaa_results.rename(columns={'GamesWon':'WTeamGamesWon'}, inplace=True)

# concat into ncaa_results on LTeam
ncaa_results = ncaa_results.merge(grouped, how='left', left_on=['Season', 'LTeamID'], right_on=['Season', 'TeamID'])
ncaa_results.drop('TeamID', axis=1, inplace=True)
ncaa_results.rename(columns={'GamesWon':'LTeamGamesWon'}, inplace=True)

In [14]:
# regular season games lost
grouped = season_results.groupby(['Season', 'LTeamID']).count().reset_index()
grouped = grouped[['Season', 'LTeamID', 'LScore']]
grouped.columns = ['Season', 'TeamID', 'GamesLost']

# concat into ncaa_results on WTeam
ncaa_results = ncaa_results.merge(grouped, how='left', left_on=['Season', 'WTeamID'], right_on=['Season', 'TeamID'])
ncaa_results.drop('TeamID', axis=1, inplace=True)
ncaa_results.rename(columns={'GamesLost':'WTeamGamesLost'}, inplace=True)

# concat into ncaa_results on LTeam
ncaa_results = ncaa_results.merge(grouped, how='left', left_on=['Season', 'LTeamID'], right_on=['Season', 'TeamID'])
ncaa_results.drop('TeamID', axis=1, inplace=True)
ncaa_results.rename(columns={'GamesLost':'LTeamGamesLost'}, inplace=True)

In [15]:
# average regular season point differential
grouped_win = season_results.groupby(['Season', 'WTeamID']).mean(numeric_only=True).reset_index()
grouped_lose = season_results.groupby(['Season', 'LTeamID']).mean(numeric_only=True).reset_index()

# Win score diff
grouped_win = grouped_win[['Season', 'WTeamID', 'WTeamScoreDiff']]
grouped_win.columns = ['Season', 'TeamID', 'MeanWinScoreDiff']

# Lose score diff
grouped_lose = grouped_lose[['Season', 'LTeamID', 'LTeamScoreDiff']]
grouped_lose.columns = ['Season', 'TeamID', 'MeanLoseScoreDiff']

In [16]:
# concat winning team meanwinscorediff
ncaa_results = ncaa_results.merge(grouped_win, how='left', left_on=['Season', 'WTeamID'], right_on=['Season', 'TeamID'])
ncaa_results.drop('TeamID', axis=1, inplace=True)
ncaa_results.rename(columns={'MeanWinScoreDiff':'WTeamMeanWinDiff'}, inplace=True)

# concat losing team meanwinscorediff
ncaa_results = ncaa_results.merge(grouped_win, how='left', left_on=['Season', 'LTeamID'], right_on=['Season', 'TeamID'])
ncaa_results.drop('TeamID', axis=1, inplace=True)
ncaa_results.rename(columns={'MeanWinScoreDiff':'LTeamMeanWinDiff'}, inplace=True)

In [17]:
# concat winning team mean lose score diff
ncaa_results = ncaa_results.merge(grouped_lose, how='left', left_on=['Season', 'WTeamID'], right_on=['Season', 'TeamID'])
ncaa_results.drop('TeamID', axis=1, inplace=True)
ncaa_results.rename(columns={'MeanLoseScoreDiff':'WTeamMeanLoseDiff'}, inplace=True)

# concat losing team mean lose score diff
ncaa_results = ncaa_results.merge(grouped_lose, how='left', left_on=['Season', 'LTeamID'], right_on=['Season', 'TeamID'])
ncaa_results.drop('TeamID', axis=1, inplace=True)
ncaa_results.rename(columns={'MeanLoseScoreDiff':'LTeamMeanLoseDiff'}, inplace=True)

In [18]:
ncaa_results.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WTeamSeed,LTeamSeed,...,LTeamFirstD1Season,LTeamLastD1Season,WTeamGamesWon,LTeamGamesWon,WTeamGamesLost,LTeamGamesLost,WTeamMeanWinDiff,LTeamMeanWinDiff,WTeamMeanLoseDiff,LTeamMeanLoseDiff
0,1985,136,1116,63,1234,54,N,0,X09,X08,...,1985.0,2023.0,21,20,12.0,10.0,10.333333,18.45,-8.083333,-5.5
1,1985,136,1120,59,1345,58,N,0,Z11,Z06,...,1985.0,2023.0,18,17,11.0,8.0,11.833333,12.529412,-9.636364,-14.75
2,1985,136,1207,68,1250,43,N,0,W01,W16,...,1985.0,2023.0,25,11,2.0,18.0,17.04,6.0,-1.5,-10.833333
3,1985,136,1229,58,1425,55,N,0,Y09,Y08,...,1985.0,2023.0,20,19,7.0,9.0,11.35,9.421053,-9.428571,-8.111111
4,1985,136,1242,49,1325,38,N,0,Z03,Z14,...,1985.0,2023.0,23,20,7.0,7.0,10.043478,8.8,-8.857143,-7.571429


In [19]:
ncaa_results.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WTeamSeed,LTeamSeed,...,LTeamFirstD1Season,LTeamLastD1Season,WTeamGamesWon,LTeamGamesWon,WTeamGamesLost,LTeamGamesLost,WTeamMeanWinDiff,LTeamMeanWinDiff,WTeamMeanLoseDiff,LTeamMeanLoseDiff
0,1985,136,1116,63,1234,54,N,0,X09,X08,...,1985.0,2023.0,21,20,12.0,10.0,10.333333,18.45,-8.083333,-5.5
1,1985,136,1120,59,1345,58,N,0,Z11,Z06,...,1985.0,2023.0,18,17,11.0,8.0,11.833333,12.529412,-9.636364,-14.75
2,1985,136,1207,68,1250,43,N,0,W01,W16,...,1985.0,2023.0,25,11,2.0,18.0,17.04,6.0,-1.5,-10.833333
3,1985,136,1229,58,1425,55,N,0,Y09,Y08,...,1985.0,2023.0,20,19,7.0,9.0,11.35,9.421053,-9.428571,-8.111111
4,1985,136,1242,49,1325,38,N,0,Z03,Z14,...,1985.0,2023.0,23,20,7.0,7.0,10.043478,8.8,-8.857143,-7.571429


In [20]:
list(ncaa_results.columns)

['Season',
 'DayNum',
 'WTeamID',
 'WScore',
 'LTeamID',
 'LScore',
 'WLoc',
 'NumOT',
 'WTeamSeed',
 'LTeamSeed',
 'SeasonStart',
 'WTeamRegion',
 'LTeamRegion',
 'WTeamName',
 'WTeamFirstD1Season',
 'WTeamLastD1Season',
 'LTeamName',
 'LTeamFirstD1Season',
 'LTeamLastD1Season',
 'WTeamGamesWon',
 'LTeamGamesWon',
 'WTeamGamesLost',
 'LTeamGamesLost',
 'WTeamMeanWinDiff',
 'LTeamMeanWinDiff',
 'WTeamMeanLoseDiff',
 'LTeamMeanLoseDiff']

In [21]:
# drop unnecessary columns
ncaa_results = ncaa_results.drop(['WLoc', 'NumOT', 'SeasonStart', 'WTeamRegion', 'LTeamRegion',
                                  'WTeamFirstD1Season', 'WTeamLastD1Season',
                                  'LTeamFirstD1Season', 'LTeamLastD1Season'], axis=1)

In [22]:
# create copy and drop additional columns
preprocessed = ncaa_results.copy() 
preprocessed = preprocessed.drop(['WScore', 'LScore', 'WTeamName', 'LTeamName', 'DayNum'], axis=1)

In [23]:
# pull out seed numbers
import re
preprocessed['WTeamSeed'] = preprocessed['WTeamSeed'].apply(lambda x: re.sub('[^0-9]', '', x))
preprocessed['LTeamSeed'] = preprocessed['LTeamSeed'].apply(lambda x: re.sub('[^0-9]', '', x))

In [24]:
# preview sample submission
samp_submission.head()

Unnamed: 0,ID,Pred
0,2023_1101_1102,0.5
1,2023_1101_1103,0.5
2,2023_1101_1104,0.5
3,2023_1101_1105,0.5
4,2023_1101_1106,0.5


In [25]:
# team listed first will always be the winning team
preprocessed['Pred'] = 1

In [26]:
# rename columns
preprocessed.columns = ['season', 'team_a', 'team_b', 
                        'seed_a', 'seed_b', 
                        'reg_season_wins_a', 'reg_season_wins_b',
                        'reg_season_losses_a', 'reg_season_losses_b',
                        'mean_win_diff_a', 'mean_win_diff_b',
                        'mean_lose_diff_a', 'mean_lose_diff_b',
                        'Pred']

## Prepare for Modeling

In [27]:
import numpy as np
from sklearn.impute import SimpleImputer

In [28]:
# check seed columns for weird values
preprocessed['seed_a'].value_counts()

01    1559
02    1082
03     853
04     703
05     508
06     426
07     388
11     296
08     281
09     249
10     237
12     184
13      81
16      72
14      37
15      25
Name: seed_a, dtype: int64

In [29]:
# convert seed columns to numeric
preprocessed['seed_a'] = preprocessed['seed_a'].astype('int')
preprocessed['seed_b'] = preprocessed['seed_b'].astype('int')

In [30]:
# check column types
preprocessed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6981 entries, 0 to 6980
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   season               6981 non-null   int64  
 1   team_a               6981 non-null   int64  
 2   team_b               6981 non-null   int64  
 3   seed_a               6981 non-null   int64  
 4   seed_b               6981 non-null   int64  
 5   reg_season_wins_a    6981 non-null   int64  
 6   reg_season_wins_b    6981 non-null   int64  
 7   reg_season_losses_a  6843 non-null   float64
 8   reg_season_losses_b  6962 non-null   float64
 9   mean_win_diff_a      6981 non-null   float64
 10  mean_win_diff_b      6981 non-null   float64
 11  mean_lose_diff_a     6843 non-null   float64
 12  mean_lose_diff_b     6962 non-null   float64
 13  Pred                 6981 non-null   int64  
dtypes: float64(6), int64(8)
memory usage: 818.1 KB


In [31]:
# for now fill missing values with the column mean
preprocessed['reg_season_losses_a'].fillna(value=preprocessed['reg_season_losses_a'].mean(), inplace=True)
preprocessed['reg_season_losses_b'].fillna(value=preprocessed['reg_season_losses_b'].mean(), inplace=True)
preprocessed['mean_lose_diff_a'].fillna(value=preprocessed['mean_lose_diff_a'].mean(), inplace=True)
preprocessed['mean_lose_diff_b'].fillna(value=preprocessed['mean_lose_diff_b'].mean(), inplace=True)

Our dataset shows significant class imbalance, so we will sample half of the dataframe and present losers first for that half.

In [32]:
# sample datasets to handle class imbalance
df_pre_a = preprocessed.sample(frac=0.5)
df_pre_b = preprocessed.drop(df_pre_a.index)

In [33]:
# relabel one of the dataframes
df_pre_b.columns = ['season', 'team_b', 'team_a',
                    'seed_b', 'seed_a',
                    'reg_season_wins_b', 'reg_season_wins_a',
                    'reg_season_losses_b', 'reg_season_losses_a',
                    'mean_win_diff_b', 'mean_win_diff_a',
                    'mean_lose_diff_b','mean_lose_diff_a',
                    'Pred']

In [34]:
# losers will now be presented first
df_pre_b['Pred'] = 0 

In [35]:
# concat the two back together
df_pre = pd.concat([df_pre_a, df_pre_b], axis=0).reset_index(drop=True)

## Modeling

#### Training and Testing Sets

In [36]:
SEED = 23

In [37]:
# before creating training and test sets check class imbalance
df_pre['Pred'].value_counts(normalize=True)

0    0.500072
1    0.499928
Name: Pred, dtype: float64

In [38]:
# save a clean copy for modeling
df_clean = df_pre.copy()

In [39]:
# drop team columns
df_clean.drop(['team_a', 'team_b'], axis=1, inplace=True)

In [40]:
# split into training and test sets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# identify variables 
y = df_clean['Pred']
X = df_clean.drop('Pred', axis=1)

# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=SEED)

#### Scale Features

In [41]:
# use standard scaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled = scaler.transform(X_test)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

#### Save Final Processed Datasets

In [42]:
X_train_final = X_train_scaled.copy()
X_test_final = X_test_scaled.copy()
y_train_final = y_train.copy()
y_test_final = y_test.copy()

#### Define Function to Print Results

In [43]:
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_curve, auc
from sklearn import tree

In [44]:
def print_model_scores(X_train, X_test, y_train, y_test, model, model_name):
    """
    Function to return accuracy, recall, precision, f1, roc_auc, and neg_log_loss from given 
    X_train, X_test, y_train, y_test, model, and string for the model name
    """
    
    # create predictions using our model
    y_train_preds = model.predict(X_train)
    y_test_preds = model.predict(X_test)
    
    # accuracy
    train_acc = accuracy_score(y_train, y_train_preds)
    test_acc = accuracy_score(y_test, y_test_preds)
    
    # precision
    train_prec = precision_score(y_train, y_train_preds)
    test_prec = precision_score(y_test, y_test_preds)
    
    # recall 
    train_recall = recall_score(y_train, y_train_preds)
    test_recall = recall_score(y_test, y_test_preds)
    
    # f1 score
    train_f1 = f1_score(y_train, y_train_preds)
    test_f1 = f1_score(y_test, y_test_preds)
    
    # AUC
    train_fpr, train_tpr, train_thresh = roc_curve(y_train, y_train_preds)
    train_roc_auc = auc(train_fpr, train_tpr)
    test_fpr, test_tpr, test_thresh = roc_curve(y_test, y_test_preds)
    test_roc_auc = auc(test_fpr, test_tpr)
    
    # print results
    print('Accuracy:')
    print(f'Training Set: {train_acc}')
    print(f'Testing Set: {test_acc}')
    print('---------------------------')
    print('Precision:')
    print(f'Training Set: {train_prec}')
    print(f'Testing Set: {test_prec}')
    print('---------------------------')
    print('Recall:')
    print(f'Training Set: {train_recall}')
    print(f'Testing Set: {test_recall}')
    print('---------------------------')
    print('F1 Score:')
    print(f'Training Set: {train_f1}')
    print(f'Testing Set: {test_f1}')
    print('---------------------------')
    print(f'ROC AUC:')
    print(f'Training Set: {train_roc_auc}')
    print(f'Test Set: {test_roc_auc}')
    
    # store results in dataframes
    test_results = pd.DataFrame([[f'Test-{model_name}', test_acc, test_prec, test_recall, test_f1, test_roc_auc]],
                                columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC'])
    
    train_results = pd.DataFrame([[f'Training-{model_name}', train_acc, train_prec, train_recall, train_f1, train_roc_auc]],
                                columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC'])
    
    # concat results
    results = pd.concat([test_results, train_results], axis=0)
    
    return results

### XGBoost

In [45]:
from xgboost import XGBClassifier

In [46]:
# create baseline classifier
base_xgb = XGBClassifier(random_state=SEED)

# fit to training set
base_xgb.fit(X_train_final, y_train_final)

# print and store results
base_xgb_results = print_model_scores(X_train_final, X_test_final,
                                      y_train_final, y_test_final,
                                      base_xgb, 'base_xgb')

Accuracy:
Training Set: 0.9829990448901623
Testing Set: 0.800114547537228
---------------------------
Precision:
Training Set: 0.9820815859702631
Testing Set: 0.8009205983889528
---------------------------
Recall:
Training Set: 0.983957219251337
Testing Set: 0.7981651376146789
---------------------------
F1 Score:
Training Set: 0.9830185079183362
Testing Set: 0.7995404939689833
---------------------------
ROC AUC:
Training Set: 0.9829988618228408
Test Set: 0.8001123170910923


Without any tuning our model is achieving ~80% accuracy. Given a our class split, a simple model that guessed one class would do substantially worse, with accuracy falling close to 50%. 

#### Tune Hyperparams

In [47]:
# set up gridsearch
grid_search_params = {
    'max_depth': [10, 11, 12],
    'eta': [0.1, 0.05],
    'gamma': [5, 10]
}

# instantiate xgboost
xgb_clf = XGBClassifier(random_state=SEED)
xgb_grid = GridSearchCV(estimator=xgb_clf,
                        param_grid=grid_search_params,
                        cv=3,
                        scoring='accuracy',
                        return_train_score=True,
                        verbose=1)

# fit to training data
xgb_grid.fit(X_train_final, y_train_final)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


In [48]:
# print grid search results 
mean_train_score = np.mean(xgb_grid.cv_results_['mean_train_score'])
mean_test_score = np.mean(xgb_grid.cv_results_['mean_test_score'])
print(f'Grid Search Train Accuracy: {mean_train_score}')
print(f'Grid Search Test Accuracy: {mean_test_score}')

Grid Search Train Accuracy: 0.8327523081821075
Grid Search Test Accuracy: 0.7442534224769181


In [49]:
# train model with the best params
xgb_tuned = xgb_grid.best_estimator_
xgb_tuned.fit(X_train_final, y_train_final)
xgb_tuned_results = print_model_scores(X_train_final, X_test_final,
                                       y_train_final, y_test_final,
                                       xgb_tuned, 'xgb_tuned')

Accuracy:
Training Set: 0.9021967526265521
Testing Set: 0.7703321878579611
---------------------------
Precision:
Training Set: 0.9028309104820199
Testing Set: 0.775438596491228
---------------------------
Recall:
Training Set: 0.9014514896867838
Testing Set: 0.7603211009174312
---------------------------
F1 Score:
Training Set: 0.9021406727828747
Testing Set: 0.7678054429646787
---------------------------
ROC AUC:
Training Set: 0.9021968950153445
Test Set: 0.7703207335250771


Seeing an improvement to overfitting -- we will use this model to create our submissions.

## Submission

In [50]:
X_train_final.head()

Unnamed: 0,season,seed_a,seed_b,reg_season_wins_a,reg_season_wins_b,reg_season_losses_a,reg_season_losses_b,mean_win_diff_a,mean_win_diff_b,mean_lose_diff_a,mean_lose_diff_b
0,1.511093,0.292898,-1.228893,-1.366665,-0.862942,0.542795,-1.04369,-0.289011,-0.28888,-0.277141,-0.654484
1,-0.496129,-1.006895,-0.797362,-0.112195,-0.1034,-0.093986,-0.728421,1.025242,-0.415776,0.449893,0.430968
2,0.284458,-0.790263,-0.150067,0.138699,0.14978,0.224404,0.217387,-0.89559,0.128062,-0.265138,0.309569
3,1.176556,-1.223528,-1.228893,1.393169,2.175224,-1.367547,0.238399,1.025242,4.075026,-1.429421,-0.132971
4,-0.719153,0.726163,0.065698,0.138699,-0.356581,-0.093986,1.163194,-0.32271,-1.267908,0.573351,-0.083193


In [51]:
# for each ID, create a prediction
first_submission = samp_submission.copy()
first_submission.drop('Pred', axis=1, inplace=True)
first_submission.head()

Unnamed: 0,ID
0,2023_1101_1102
1,2023_1101_1103
2,2023_1101_1104
3,2023_1101_1105
4,2023_1101_1106


In [52]:
X_train_final.head()

Unnamed: 0,season,seed_a,seed_b,reg_season_wins_a,reg_season_wins_b,reg_season_losses_a,reg_season_losses_b,mean_win_diff_a,mean_win_diff_b,mean_lose_diff_a,mean_lose_diff_b
0,1.511093,0.292898,-1.228893,-1.366665,-0.862942,0.542795,-1.04369,-0.289011,-0.28888,-0.277141,-0.654484
1,-0.496129,-1.006895,-0.797362,-0.112195,-0.1034,-0.093986,-0.728421,1.025242,-0.415776,0.449893,0.430968
2,0.284458,-0.790263,-0.150067,0.138699,0.14978,0.224404,0.217387,-0.89559,0.128062,-0.265138,0.309569
3,1.176556,-1.223528,-1.228893,1.393169,2.175224,-1.367547,0.238399,1.025242,4.075026,-1.429421,-0.132971
4,-0.719153,0.726163,0.065698,0.138699,-0.356581,-0.093986,1.163194,-0.32271,-1.267908,0.573351,-0.083193


In [53]:
# add relevant columns
first_submission['season'] = first_submission['ID'].apply(lambda x: x.split('_')[0]).astype('int64')
first_submission['team_a'] = first_submission['ID'].apply(lambda x: x.split('_')[1]).astype('int64')
first_submission['team_b'] = first_submission['ID'].apply(lambda x: x.split('_')[2]).astype('int64')

In [54]:
# add seed_a information 
seeds['Seed'] = seeds['Seed'].apply(lambda x: re.sub('[^0-9]', '', x)).astype('int64')

In [88]:
# add seed_a to first submission
first_sub = first_submission.merge(seeds, how='left', left_on=['season', 'team_a'], right_on=['Season', 'TeamID'])
first_sub.rename(columns={'Seed':'seed_a'}, inplace=True)
first_sub.drop(['Season', 'TeamID'], axis=1, inplace=True)

In [89]:
# add seed_b to first_submission
first_sub = first_sub.merge(seeds, how='left', left_on=['season', 'team_b'], right_on=['Season', 'TeamID'])
first_sub.rename(columns={'Seed':'seed_b'}, inplace=True)
first_sub.drop(['Season', 'TeamID'], axis=1, inplace=True)

In [90]:
# add reg season wins
wins_group = season_results.groupby(['Season', 'WTeamID']).count().reset_index()
lose_group = season_results.groupby(['Season', 'LTeamID']).count().reset_index()

In [91]:
wins_group = wins_group[['Season', 'WTeamID', 'WScore']]
lose_group = lose_group[['Season', 'LTeamID', 'LScore']]

In [92]:
first_sub = first_sub.merge(wins_group, how='left', left_on=['season', 'team_a'], right_on=['Season', 'WTeamID'])
first_sub.rename(columns={'WScore':'reg_season_wins_a'}, inplace=True)
first_sub.drop(['Season', 'WTeamID'], axis=1, inplace=True)

first_sub = first_sub.merge(wins_group, how='left', left_on=['season', 'team_b'], right_on=['Season', 'WTeamID'])
first_sub.rename(columns={'WScore':'reg_season_wins_b'}, inplace=True)
first_sub.drop(['Season', 'WTeamID'], axis=1, inplace=True)

In [93]:
first_sub = first_sub.merge(lose_group, how='left', left_on=['season', 'team_a'], right_on=['Season', 'LTeamID'])
first_sub.rename(columns={'LScore':'reg_season_losses_a'}, inplace=True)
first_sub.drop(['Season', 'LTeamID'], axis=1, inplace=True)

first_sub = first_sub.merge(lose_group, how='left', left_on=['season', 'team_b'], right_on=['Season', 'LTeamID'])
first_sub.rename(columns={'LScore':'reg_season_losses_b'}, inplace=True)
first_sub.drop(['Season', 'LTeamID'], axis=1, inplace=True)

In [94]:
# add win and lose diffs 
wins_group = season_results.groupby(['Season', 'WTeamID']).mean(numeric_only=True).reset_index()
lose_group = season_results.groupby(['Season', 'LTeamID']).mean(numeric_only=True).reset_index()

In [95]:
wins_group = wins_group[['Season', 'WTeamID', 'WTeamScoreDiff']]
lose_group = lose_group[['Season', 'LTeamID', 'LTeamScoreDiff']]

In [96]:
first_sub = first_sub.merge(wins_group, how='left', left_on=['season', 'team_a'], right_on=['Season', 'WTeamID'])
first_sub.rename(columns={'WTeamScoreDiff':'mean_win_diff_a'}, inplace=True)
first_sub.drop(['Season', 'WTeamID'], axis=1, inplace=True)

first_sub = first_sub.merge(wins_group, how='left', left_on=['season', 'team_b'], right_on=['Season', 'WTeamID'])
first_sub.rename(columns={'WTeamScoreDiff':'mean_win_diff_b'}, inplace=True)
first_sub.drop(['Season', 'WTeamID'], axis=1, inplace=True)

In [97]:
first_sub = first_sub.merge(lose_group, how='left', left_on=['season', 'team_a'], right_on=['Season', 'LTeamID'])
first_sub.rename(columns={'LTeamScoreDiff':'mean_lose_diff_a'}, inplace=True)
first_sub.drop(['Season', 'LTeamID'], axis=1, inplace=True)

first_sub = first_sub.merge(lose_group, how='left', left_on=['season', 'team_b'], right_on=['Season', 'LTeamID'])
first_sub.rename(columns={'LTeamScoreDiff':'mean_lose_diff_b'}, inplace=True)
first_sub.drop(['Season', 'LTeamID'], axis=1, inplace=True)

In [98]:
first_sub.head()

Unnamed: 0,ID,season,team_a,team_b,seed_a,seed_b,reg_season_wins_a,reg_season_wins_b,reg_season_losses_a,reg_season_losses_b,mean_win_diff_a,mean_win_diff_b,mean_lose_diff_a,mean_lose_diff_b
0,2023_1101_1102,2023,1101,1102,,,9.0,14.0,17.0,18.0,11.333333,12.642857,-11.647059,-10.055556
1,2023_1101_1103,2023,1101,1103,,,9.0,20.0,17.0,11.0,11.333333,15.35,-11.647059,-11.454545
2,2023_1101_1104,2023,1101,1104,,1.0,9.0,29.0,17.0,5.0,11.333333,18.241379,-11.647059,-12.8
3,2023_1101_1105,2023,1101,1105,,,9.0,12.0,17.0,18.0,11.333333,9.416667,-11.647059,-11.388889
4,2023_1101_1106,2023,1101,1106,,,9.0,7.0,17.0,23.0,11.333333,7.285714,-11.647059,-15.304348


In [100]:
# fill missing seed values with 1000
first_sub['seed_a'].fillna(value=1000, inplace=True)
first_sub['seed_b'].fillna(value=1000, inplace=True)

In [101]:
# fill everything else with 0s
first_sub.fillna(value=0, inplace=True)
first_sub.isna().sum()

ID                     0
season                 0
team_a                 0
team_b                 0
seed_a                 0
seed_b                 0
reg_season_wins_a      0
reg_season_wins_b      0
reg_season_losses_a    0
reg_season_losses_b    0
mean_win_diff_a        0
mean_win_diff_b        0
mean_lose_diff_a       0
mean_lose_diff_b       0
dtype: int64

In [102]:
first_sub.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 130683 entries, 0 to 130682
Data columns (total 14 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   ID                   130683 non-null  object 
 1   season               130683 non-null  int64  
 2   team_a               130683 non-null  int64  
 3   team_b               130683 non-null  int64  
 4   seed_a               130683 non-null  float64
 5   seed_b               130683 non-null  float64
 6   reg_season_wins_a    130683 non-null  float64
 7   reg_season_wins_b    130683 non-null  float64
 8   reg_season_losses_a  130683 non-null  float64
 9   reg_season_losses_b  130683 non-null  float64
 10  mean_win_diff_a      130683 non-null  float64
 11  mean_win_diff_b      130683 non-null  float64
 12  mean_lose_diff_a     130683 non-null  float64
 13  mean_lose_diff_b     130683 non-null  float64
dtypes: float64(10), int64(3), object(1)
memory usage: 15.0+ MB
