In [1]:
import pandas as pd 
import numpy as np 
import os 
from pathlib import Path 

In [16]:
### Join each team's season stats to its advanced ratings, using every known
### spelling of the team name as a candidate join key
stats_summary = pd.read_csv('../data/preprocessing/mens_summary_season_data.csv')
team_spellings = pd.read_csv('../data/MTeamSpellings.csv', encoding='ISO-8859-1')
advanced_stats = pd.read_csv("../data/preprocessing/mens_advanced_stats.csv")

# One row per (stats row, spelling). Spellings and school names are lower-cased
# so the name-based join below is case-insensitive.
all_spellings = (
    stats_summary
    .merge(team_spellings, how='left', on='TeamID')
    .assign(TeamNameSpelling=lambda frame: frame['TeamNameSpelling'].str.lower())
)
advanced_stats = advanced_stats.assign(School=advanced_stats['School'].str.lower())

# Left join: spelling rows without a matching School keep NaN in the rating columns
combined = all_spellings.merge(
    advanced_stats,
    how='left',
    left_on=['Season', 'TeamNameSpelling'],
    right_on=['Season', 'School'],
)

combined.head()

Unnamed: 0,Season,TeamName,TeamID,Win_Percentage,Points_Per_Game,FG_Percentage,Threes_Per_Game,Three_Point_Percentage,Free_Throws_Per_Game,Free_Throw_Percentage,...,Opp_Turnovers_Per_Game,ConfAbbrev,Win_pct_last_10_games,TeamNameSpelling,School,SRS,SOS,Pace,FTr,3PAr
0,2003,Air Force,1102.0,0.428571,57.25,48.114901,20.821429,37.564322,17.107143,65.135699,...,12.964286,mwc,0.2,air force,air force,1.71,1.46,,0.43,0.523
1,2003,Air Force,1102.0,0.428571,57.25,48.114901,20.821429,37.564322,17.107143,65.135699,...,12.964286,mwc,0.2,air-force,,,,,,
2,2003,Akron,1103.0,0.481481,78.777778,48.607427,16.074074,33.870968,25.851852,73.638968,...,15.333333,mac,0.5,akron,akron,-0.8,-1.43,,0.462,0.286
3,2003,Alabama,1104.0,0.607143,69.285714,42.036227,19.857143,32.014388,20.928571,70.989761,...,13.857143,sec,0.4,alabama,alabama,13.36,9.4,,0.364,0.344
4,2003,Alabama A&M,1105.0,0.269231,71.769231,39.575531,20.769231,36.481481,21.846154,70.598592,...,18.807692,swac,0.3,alabama a&m,alabama a&m,-16.71,-11.83,,0.349,0.335


In [17]:
### Find team-seasons for which none of the team's spellings matched a School

# Per (Season, TeamID): True only when every spelling row is missing a School
school_is_missing = combined['School'].isna()
team_null_check = (
    school_is_missing
    .groupby([combined['Season'], combined['TeamID']])
    .all()
    .reset_index()
)

# Keep only the team-seasons with no match at all
team_null_check = team_null_check.loc[team_null_check['School']]

# These are the teams that need manual handling before the final join
team_null_check.head(50)

Unnamed: 0,Season,TeamID,School
1765,2008,1223.0,True
2195,2009,1315.0,True
2249,2009,1370.0,True
2256,2009,1377.0,True
3222,2012,1303.0,True


In [19]:
### Impute the rating columns (SRS, SOS, Pace, FTr, 3PAr) with their lower quartile
### for the five team-seasons that have no match in the advanced-stats data, then
### collapse the spelling-level rows to one row per (Season, TeamID)

rating_cols = ['SRS', 'SOS', 'Pace', 'FTr', '3PAr']

# One row per team-season, built only from spelling rows that matched a School
non_null_teams = combined[combined['School'].notnull()]
final_result = non_null_teams.groupby(['Season', 'TeamID']).first().reset_index()

# Lower quartile (25th percentile) of each rating, pooled over all seasons
lower_quartile_srs = final_result['SRS'].quantile(0.25)
lower_quartile_sos = final_result['SOS'].quantile(0.25)
lower_quartile_Pace = final_result['Pace'].quantile(0.25)
lower_quartile_FTr = final_result['FTr'].quantile(0.25)
lower_quartile_3PAr = final_result['3PAr'].quantile(0.25)

# Display the results
print(f'Lower Quartile (25th percentile) for SRS Rating: {lower_quartile_srs}')
print(f'Lower Quartile (25th percentile) for SOS Rating: {lower_quartile_sos}')
print(f'Lower Quartile (25th percentile) for Pace: {lower_quartile_Pace}')
print(f'Lower Quartile (25th percentile) for FTr: {lower_quartile_FTr}')
print(f'Lower Quartile (25th percentile) for 3PAr: {lower_quartile_3PAr}')

# Values written into rating_cols, in the same order as rating_cols
fill_values = [
    lower_quartile_srs,
    lower_quartile_sos,
    lower_quartile_Pace,
    lower_quartile_FTr,
    lower_quartile_3PAr,
]

# (Season, TeamID) -> placeholder School name for the team-seasons that had no
# matching spelling in the ratings data
unmatched_teams = {
    (2008, 1223): 'houston chr',
    (2009, 1315): 'north dakota',
    (2009, 1370): 'seattle',
    (2009, 1377): 'south dakota',
    (2012, 1303): 'ne omaha',
}

# Every spelling row of an unmatched team-season gets the same placeholder values
for (season, team_id), school in unmatched_teams.items():
    is_team_season = (combined['TeamID'] == team_id) & (combined['Season'] == season)
    combined.loc[is_team_season, ['School'] + rating_cols] = [school] + fill_values


### Retrieve final joined dataset

# Group by 'Season' and 'TeamID', and use the first non-null value for each column
# (NaN when a column is entirely null within the group)
mens_season_data = (
    combined
    .groupby(['Season', 'TeamID'])
    .agg(lambda x: x.dropna().iloc[0] if x.notna().any() else np.nan)
    .reset_index()
    # The name columns were only needed as join keys
    .drop(columns=['School', 'TeamNameSpelling'])
)

mens_season_data

Lower Quartile (25th percentile) for SRS Rating: -7.99
Lower Quartile (25th percentile) for SOS Rating: -4.53
Lower Quartile (25th percentile) for Pace: 66.1
Lower Quartile (25th percentile) for FTr: 0.315
Lower Quartile (25th percentile) for 3PAr: 0.311


Unnamed: 0,Season,TeamID,TeamName,Win_Percentage,Points_Per_Game,FG_Percentage,Threes_Per_Game,Three_Point_Percentage,Free_Throws_Per_Game,Free_Throw_Percentage,...,Opp_Three_Point_Percentage,Opp_Free_Throws_Per_Game,Opp_Turnovers_Per_Game,ConfAbbrev,Win_pct_last_10_games,SRS,SOS,Pace,FTr,3PAr
0,2003,1102.0,Air Force,0.428571,57.250000,48.114901,20.821429,37.564322,17.107143,65.135699,...,38.218391,19.250000,12.964286,mwc,0.2,1.71,1.46,,0.430,0.523
1,2003,1103.0,Akron,0.481481,78.777778,48.607427,16.074074,33.870968,25.851852,73.638968,...,36.290323,22.148148,15.333333,mac,0.5,-0.80,-1.43,,0.462,0.286
2,2003,1104.0,Alabama,0.607143,69.285714,42.036227,19.857143,32.014388,20.928571,70.989761,...,33.208955,17.142857,13.857143,sec,0.4,13.36,9.40,,0.364,0.344
3,2003,1105.0,Alabama A&M,0.269231,71.769231,39.575531,20.769231,36.481481,21.846154,70.598592,...,35.745614,24.500000,18.807692,swac,0.3,-16.71,-11.83,,0.349,0.335
4,2003,1106.0,Alabama St,0.464286,63.607143,42.377261,17.642857,34.615385,16.464286,64.642082,...,31.455399,21.964286,15.071429,swac,0.4,-10.33,-10.00,,0.299,0.319
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7976,2025,1476.0,Stonehill,0.480000,68.480000,43.860947,24.480000,36.274510,16.800000,72.380952,...,33.261803,19.320000,10.280000,nec,0.5,-12.86,-9.29,66.3,0.319,0.445
7977,2025,1477.0,East Texas A&M,0.111111,63.925926,41.465054,26.703704,31.345354,15.407407,63.942308,...,37.476809,20.370370,13.185185,southland,0.1,-12.38,-1.86,67.8,0.280,0.483
7978,2025,1478.0,Le Moyne,0.280000,72.000000,44.094488,23.200000,31.896552,21.640000,70.794824,...,36.304700,22.840000,10.880000,nec,0.4,-17.52,-8.06,69.0,0.367,0.414
7979,2025,1479.0,Mercyhurst,0.384615,64.884615,41.601144,19.461538,34.584980,16.692308,80.184332,...,36.007463,20.076923,12.692308,nec,0.6,-15.34,-9.38,65.4,0.308,0.358


In [23]:
# Team-seasons that still have no 'FTr' value after the join and imputation.
# Note: the variable is named for SOS, but the filter is on the 'FTr' column.
ftr_is_missing = mens_season_data['FTr'].isna()
sos_null_rows = mens_season_data.loc[ftr_is_missing]

sos_null_rows

Unnamed: 0,Season,TeamID,TeamName,Win_Percentage,Points_Per_Game,FG_Percentage,Threes_Per_Game,Three_Point_Percentage,Free_Throws_Per_Game,Free_Throw_Percentage,...,Opp_Three_Point_Percentage,Opp_Free_Throws_Per_Game,Opp_Turnovers_Per_Game,ConfAbbrev,Win_pct_last_10_games,SRS,SOS,Pace,FTr,3PAr
23,2003,1128.0,Birmingham So,0.653846,67.846154,44.281298,21.653846,35.523979,22.230769,72.318339,...,35.969388,19.153846,15.076923,ind,0.7,,,,,


In [36]:
## Add team metrics to regular season games to create dataset for supervised ML model
## Result: two rows per game (one from each team's perspective), each carrying the
## season metrics of Team1 (suffix _1) and Team2 (suffix _2)
pd.set_option("display.max_columns", None)

mens_reg_season_data = pd.read_csv('../data/MRegularSeasonCompactResults.csv')
# .copy() so the column assignments below act on an independent frame rather than
# on a filtered slice (avoids pandas' SettingWithCopyWarning)
mens_reg_season_data = mens_reg_season_data[mens_reg_season_data['Season'] >= 2003].copy()

# Winner's perspective: Team1 is the winning team
mens_reg_season_data['Team1'] = mens_reg_season_data['WTeamID']
mens_reg_season_data['Team2'] = mens_reg_season_data['LTeamID']
mens_reg_season_data['Team1_Wins'] = 1

# Loser's perspective: same games with the roles swapped. Built directly from the
# source columns instead of tuple-swapping Team1/Team2 on the frame.
flipped = mens_reg_season_data.assign(
    Team1=mens_reg_season_data['LTeamID'],
    Team2=mens_reg_season_data['WTeamID'],
    Team1_Wins=0,
)

games = pd.concat([mens_reg_season_data, flipped]).drop(
    columns=['WTeamID', 'WScore', 'LTeamID', 'LScore', 'WLoc', 'NumOT']
)

# Every column of the team table except the join keys gets a _1 / _2 suffix
metric_cols = [col for col in mens_season_data.columns if col not in ("Season", "TeamID")]

# Merge games dataset with Team1's stats
full_games = games.merge(mens_season_data, left_on=['Season', 'Team1'], right_on=['Season', 'TeamID'], how='left')
full_games = full_games.rename(columns={col: col + "_1" for col in metric_cols})

# Merge with Team2's stats; the two 'TeamID' columns collide here and become
# TeamID_x / TeamID_y, which are dropped together with the team names
full_games = full_games.merge(mens_season_data, left_on=['Season', 'Team2'], right_on=['Season', 'TeamID'], how='left')
full_games = full_games.rename(columns={col: col + "_2" for col in metric_cols})
full_games = full_games.drop(columns=['TeamID_x', 'TeamName_1', 'TeamID_y', 'TeamName_2'])

full_games

Unnamed: 0,Season,DayNum,Team1,Team2,Team1_Wins,Win_Percentage_1,Points_Per_Game_1,FG_Percentage_1,Threes_Per_Game_1,Three_Point_Percentage_1,Free_Throws_Per_Game_1,Free_Throw_Percentage_1,Offensive_Rebound_Rate_1,Defensive_Rebound_Rate_1,Turnovers_Per_Game_1,Opp_FG_Percentage_1,Opp_Three_Point_Percentage_1,Opp_Free_Throws_Per_Game_1,Opp_Turnovers_Per_Game_1,ConfAbbrev_1,Win_pct_last_10_games_1,SRS_1,SOS_1,Pace_1,FTr_1,3PAr_1,Win_Percentage_2,Points_Per_Game_2,FG_Percentage_2,Threes_Per_Game_2,Three_Point_Percentage_2,Free_Throws_Per_Game_2,Free_Throw_Percentage_2,Offensive_Rebound_Rate_2,Defensive_Rebound_Rate_2,Turnovers_Per_Game_2,Opp_FG_Percentage_2,Opp_Three_Point_Percentage_2,Opp_Free_Throws_Per_Game_2,Opp_Turnovers_Per_Game_2,ConfAbbrev_2,Win_pct_last_10_games_2,SRS_2,SOS_2,Pace_2,FTr_2,3PAr_2
0,2003,10,1104,1328,1,0.607143,69.285714,42.036227,19.857143,32.014388,20.928571,70.989761,37.475345,68.717949,13.285714,41.891892,33.208955,17.142857,13.857143,sec,0.4,13.36,9.40,,0.364,0.344,0.800000,71.166667,44.693396,18.966667,39.367311,18.600000,70.788530,35.135135,70.593779,11.800000,40.474703,32.678133,18.633333,13.700000,big_twelve,0.8,18.86,8.53,,0.329,0.334
1,2003,10,1272,1393,1,0.793103,74.517241,43.793103,20.068966,34.879725,22.896552,65.361446,37.362637,67.776778,13.793103,40.226460,32.203390,20.724138,15.068966,cusa,0.9,13.23,5.27,,0.379,0.337,0.827586,80.103448,47.006652,15.862069,33.043478,23.620690,69.343066,39.114043,63.004847,13.620690,39.007471,30.687023,18.896552,14.448276,big_east,0.8,19.02,9.02,,0.376,0.252
2,2003,11,1266,1437,1,0.821429,78.392857,48.380952,15.250000,37.939110,23.607143,77.004539,41.375423,65.500486,13.571429,40.596470,32.752613,19.142857,12.500000,cusa,0.8,16.28,7.28,,0.406,0.272,0.500000,72.200000,42.042889,19.100000,34.904014,22.266667,71.257485,39.095745,67.521368,16.033333,42.892157,37.521515,23.400000,16.966667,big_east,0.3,9.11,7.57,,0.380,0.322
3,2003,11,1296,1457,1,0.548387,69.612903,45.896657,16.419355,38.310413,22.387097,65.273775,39.278752,66.920877,17.000000,43.995172,33.816425,22.677419,14.451613,mac,0.5,0.92,1.11,,0.422,0.309,0.642857,69.428571,43.210660,20.107143,35.168739,21.571429,63.576159,33.565737,66.238894,14.642857,41.316979,36.234818,21.321429,15.642857,big_south,0.7,-7.74,-10.58,,0.398,0.354
4,2003,11,1400,1208,1,0.785714,78.857143,44.851259,16.785714,34.893617,23.785714,71.471471,42.376052,66.910420,13.428571,41.133896,34.615385,21.035714,14.178571,big_twelve,0.8,18.91,9.28,,0.398,0.268,0.703704,79.185185,46.413502,17.629630,38.025210,21.629630,71.404110,35.055724,65.547878,11.555556,43.472981,33.826248,19.074074,13.444444,sec,0.7,17.30,11.30,,0.352,0.287
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235491,2025,106,1102,1461,0,0.115385,62.346154,42.517268,23.615385,33.550489,18.615385,63.429752,20.676203,72.400000,11.807692,47.776994,37.080868,19.076923,9.730769,mwc,0.0,-7.74,3.98,64.7,0.354,0.485,0.423077,66.115385,43.772242,21.653846,35.346359,17.038462,65.462754,26.769627,75.572519,12.192308,44.593640,32.952381,20.884615,9.923077,mwc,0.3,-0.28,4.60,65.7,0.300,0.399
235492,2025,106,1139,1462,0,0.461538,74.192308,46.142857,21.576923,37.254902,22.346154,73.666093,24.666667,73.830735,10.961538,42.788462,30.704698,14.038462,7.384615,big_east,0.5,10.34,10.31,67.3,0.373,0.409,0.629630,77.296296,45.868575,21.259259,38.327526,21.370370,79.202773,21.923077,75.377468,10.851852,44.002525,32.807018,16.814815,11.666667,big_east,0.7,15.57,8.41,69.4,0.372,0.374
235493,2025,106,1480,1466,0,0.148148,67.851852,42.839352,18.148148,29.795918,16.740741,68.584071,24.363234,72.831633,10.740741,48.573281,37.130178,20.555556,11.037037,a_sun,0.2,-15.07,-4.00,69.3,0.286,0.301,0.692308,78.230769,45.304878,22.653846,33.106961,19.115385,71.026157,32.762557,72.488584,8.038462,43.657437,31.736527,16.230769,10.923077,a_sun,0.9,1.64,-5.36,67.6,0.315,0.351
235494,2025,106,1122,1468,0,0.384615,68.461538,40.663630,24.538462,33.072100,17.500000,70.109890,22.972973,70.308483,9.576923,46.580907,36.310680,20.153846,11.153846,a_sun,0.5,-9.13,-3.43,67.2,0.288,0.409,0.076923,71.115385,45.804677,24.076923,35.463259,14.769231,76.822917,18.276762,71.448087,10.730769,49.388587,39.341917,20.538462,9.807692,a_sun,0.1,-14.73,-4.35,67.6,0.255,0.445


In [41]:
## Add team metrics and seeds to NCAA tournament games to create dataset for supervised ML model
## Result: two rows per game (one from each team's perspective), each carrying the
## season metrics and seed of Team1 (suffix _1) and Team2 (suffix _2)
pd.set_option("display.max_columns", None)

mens_tourney_data = pd.read_csv('../data/MNCAATourneyCompactResults.csv')
# .copy() so the column assignments below act on an independent frame rather than
# on a filtered slice (avoids pandas' SettingWithCopyWarning)
mens_tourney_data = mens_tourney_data[mens_tourney_data['Season'] >= 2003].copy()

mens_tourney_seeds = pd.read_csv("../data/MNCAATourneySeeds.csv")
# Drop the first character of the seed and any trailing 'a'/'b'; the result stays
# a string (e.g. '01', '16')
mens_tourney_seeds['Seed'] = mens_tourney_seeds['Seed'].str[1:].str.rstrip('ab')

# Winner's perspective: Team1 is the winning team
mens_tourney_data['Team1'] = mens_tourney_data['WTeamID']
mens_tourney_data['Team2'] = mens_tourney_data['LTeamID']
mens_tourney_data['Team1_Wins'] = 1

# Loser's perspective: same games with the roles swapped. Built directly from the
# source columns instead of tuple-swapping Team1/Team2 on the frame.
flipped = mens_tourney_data.assign(
    Team1=mens_tourney_data['LTeamID'],
    Team2=mens_tourney_data['WTeamID'],
    Team1_Wins=0,
)

games = pd.concat([mens_tourney_data, flipped]).drop(
    columns=['WTeamID', 'WScore', 'LTeamID', 'LScore', 'WLoc', 'NumOT']
)

# Every column of the team table except the join keys gets a _1 / _2 suffix
metric_cols = [col for col in mens_season_data.columns if col not in ("Season", "TeamID")]

# Merge games dataset with Team1's stats
tourney_games = games.merge(mens_season_data, left_on=['Season', 'Team1'], right_on=['Season', 'TeamID'], how='left')
tourney_games = tourney_games.rename(columns={col: col + "_1" for col in metric_cols})

# Merge with Team2's stats; the two 'TeamID' columns collide here and become
# TeamID_x / TeamID_y, which are dropped together with the team names
tourney_games = tourney_games.merge(mens_season_data, left_on=['Season', 'Team2'], right_on=['Season', 'TeamID'], how='left')
tourney_games = tourney_games.rename(columns={col: col + "_2" for col in metric_cols})
tourney_games = tourney_games.drop(columns=['TeamID_x', 'TeamName_1', 'TeamID_y', 'TeamName_2'])

# Attach each team's seed; the second merge again produces TeamID_x / TeamID_y
tourney_games = tourney_games.merge(mens_tourney_seeds, how='left', left_on=['Season', 'Team1'], right_on=['Season', 'TeamID'])
tourney_games = tourney_games.rename(columns={'Seed': 'Seed_1'})
tourney_games = tourney_games.merge(mens_tourney_seeds, how='left', left_on=['Season', 'Team2'], right_on=['Season', 'TeamID'])
tourney_games = tourney_games.rename(columns={'Seed': 'Seed_2'})
tourney_games = tourney_games.drop(columns=['TeamID_x', 'TeamID_y'])
tourney_games


Unnamed: 0,Season,DayNum,Team1,Team2,Team1_Wins,Win_Percentage_1,Points_Per_Game_1,FG_Percentage_1,Threes_Per_Game_1,Three_Point_Percentage_1,Free_Throws_Per_Game_1,Free_Throw_Percentage_1,Offensive_Rebound_Rate_1,Defensive_Rebound_Rate_1,Turnovers_Per_Game_1,Opp_FG_Percentage_1,Opp_Three_Point_Percentage_1,Opp_Free_Throws_Per_Game_1,Opp_Turnovers_Per_Game_1,ConfAbbrev_1,Win_pct_last_10_games_1,SRS_1,SOS_1,Pace_1,FTr_1,3PAr_1,Win_Percentage_2,Points_Per_Game_2,FG_Percentage_2,Threes_Per_Game_2,Three_Point_Percentage_2,Free_Throws_Per_Game_2,Free_Throw_Percentage_2,Offensive_Rebound_Rate_2,Defensive_Rebound_Rate_2,Turnovers_Per_Game_2,Opp_FG_Percentage_2,Opp_Three_Point_Percentage_2,Opp_Free_Throws_Per_Game_2,Opp_Turnovers_Per_Game_2,ConfAbbrev_2,Win_pct_last_10_games_2,SRS_2,SOS_2,Pace_2,FTr_2,3PAr_2,Seed_1,Seed_2
0,2003,134,1421,1411,1,0.448276,71.206897,42.926533,18.000000,36.015326,20.931034,76.276771,34.970530,62.803738,16.206897,45.588235,36.710963,22.551724,12.827586,big_south,0.5,-13.19,-5.60,,0.360,0.317,0.600000,72.800000,44.752714,18.500000,32.072072,28.066667,61.995249,36.540241,67.513612,15.233333,42.494481,32.517986,18.733333,14.333333,swac,0.5,-9.56,-11.21,,0.509,0.334,16,16
1,2003,136,1112,1436,1,0.892857,85.214286,46.141304,20.071429,35.053381,25.000000,70.142857,39.461467,67.835232,14.785714,40.750966,31.663685,17.714286,16.857143,pac_ten,0.9,23.42,8.95,,0.368,0.306,0.655172,67.793103,44.444444,15.482759,34.075724,19.551724,65.784832,37.227723,72.851562,14.068966,41.327489,32.958199,15.758621,13.000000,aec,0.8,-1.32,-4.85,,0.342,0.278,01,16
2,2003,136,1113,1272,1,0.620690,75.965517,47.818182,12.586207,31.780822,26.206897,66.973684,39.979859,67.939698,14.000000,44.547708,34.273319,20.517241,15.517241,pac_ten,0.6,15.02,9.27,,0.453,0.220,0.793103,74.517241,43.793103,20.068966,34.879725,22.896552,65.361446,37.362637,67.776778,13.793103,40.226460,32.203390,20.724138,15.068966,cusa,0.9,13.23,5.27,,0.379,0.337,10,07
3,2003,136,1141,1166,1,0.793103,79.344828,50.523560,17.931034,38.076923,25.172414,76.575342,35.906433,65.533981,18.241379,45.476190,35.732010,21.965517,16.068966,mac,0.9,6.72,1.79,,0.460,0.335,0.878788,79.242424,49.947257,20.484848,38.905325,20.030303,69.288956,33.740602,67.699115,13.363636,43.154436,34.110169,16.696970,17.060606,mvc,0.8,15.32,1.03,,0.347,0.354,11,06
4,2003,136,1143,1301,1,0.724138,74.482759,46.564885,17.034483,37.651822,19.517241,68.551237,32.829809,68.574200,14.172414,43.167155,33.118971,17.448276,14.931034,pac_ten,0.6,11.78,7.58,,0.330,0.291,0.600000,72.400000,45.625000,22.500000,35.407407,20.466667,77.035831,31.229947,67.656090,14.200000,44.102886,33.725490,21.200000,14.633333,acc,0.5,11.76,7.57,,0.383,0.420,08,09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2759,2024,146,1181,1301,0,0.750000,79.843750,48.179420,22.125000,37.711864,20.000000,72.187500,29.538462,76.019417,9.375000,43.244710,32.568807,15.906250,10.906250,acc,0.7,20.67,8.36,67.1,0.343,0.375,0.611111,76.361111,44.920273,20.388889,34.604905,19.805556,73.352034,27.121464,73.570191,9.222222,44.365193,34.905660,18.666667,11.555556,acc,0.6,12.81,8.93,68.5,0.320,0.333,04,11
2760,2024,146,1397,1345,0,0.750000,79.468750,44.433198,25.500000,34.191176,21.187500,74.926254,31.881372,73.063063,9.968750,38.930481,31.395349,20.906250,12.500000,sec,0.7,21.81,10.42,70.3,0.337,0.412,0.878788,83.393939,48.832382,20.424242,40.801187,25.000000,72.121212,36.954315,76.721883,10.969697,41.888620,31.380208,14.393939,9.515152,big_ten,0.8,24.93,11.60,68.3,0.413,0.347,02,01
2761,2024,152,1104,1163,0,0.656250,90.750000,47.731660,30.281250,36.532508,22.812500,78.356164,33.523267,72.147350,11.812500,44.067797,31.886024,24.843750,11.281250,sec,0.5,20.69,11.80,73.9,0.353,0.465,0.911765,81.470588,49.598796,23.970588,36.687117,19.529412,74.246988,35.360825,76.840215,9.117647,39.802802,31.924883,18.411765,10.088235,big_east,0.9,26.70,8.70,66.0,0.319,0.402,04,01
2762,2024,152,1301,1345,0,0.611111,76.361111,44.920273,20.388889,34.604905,19.805556,73.352034,27.121464,73.570191,9.222222,44.365193,34.905660,18.666667,11.555556,acc,0.6,12.81,8.93,68.5,0.320,0.333,0.878788,83.393939,48.832382,20.424242,40.801187,25.000000,72.121212,36.954315,76.721883,10.969697,41.888620,31.380208,14.393939,9.515152,big_ten,0.8,24.93,11.60,68.3,0.413,0.347,11,01


In [42]:
# Write both modelling tables to ../data/modeling, creating the folder if needed
output_dir = Path("../data/modeling")
output_dir.mkdir(parents=True, exist_ok=True)

season_output_path = output_dir / "reg_season_ml2.csv"
tourney_output_path = output_dir / "tourney_ml2.csv"

# The row index is not written to either file
for frame, destination in (
    (full_games, season_output_path),
    (tourney_games, tourney_output_path),
):
    frame.to_csv(destination, index=False)