In [31]:
import pandas as pd 
import numpy as np 
import os 
from pathlib import Path 

In [32]:
### Join season stats, advanced stats and ratings for each team, matching on team name

# --- Load inputs -------------------------------------------------------------
stats_summary = pd.read_csv('../data/preprocessing/mens_summary_season_data.csv')
team_spellings = pd.read_csv('../data/MTeamSpellings.csv', encoding='ISO-8859-1')

# Ratings: netRating is not carried forward; names are lower-cased for joining
ratings_summary = (
    pd.read_csv('../data/preprocessing/mens_season_ratings.csv')
    .drop(columns=['netRating'])
    .assign(team=lambda frame: frame['team'].str.lower())
)

# Advanced stats: school names are lower-cased for joining
advanced_stats = (
    pd.read_csv("../data/preprocessing/mens_advanced_stats.csv")
    .assign(School=lambda frame: frame['School'].str.lower())
)

# --- Season stats x every known spelling ---------------------------------------
# Only 2014 and later is kept because that is what the ratings data covers
filtered_stats = stats_summary.loc[stats_summary['Season'] >= 2014]

# One row per (team-season, spelling), with the spelling lower-cased
all_spellings = (
    filtered_stats
    .merge(team_spellings, how='left', left_on=['TeamID'], right_on=['TeamID'])
    .assign(TeamNameSpelling=lambda frame: frame['TeamNameSpelling'].str.lower())
)

# --- Attach advanced stats by (Season, spelling) -------------------------------
combined = all_spellings.merge(
    advanced_stats,
    how='left',
    left_on=['Season', 'TeamNameSpelling'],
    right_on=['Season', 'School'],
)

# --- Manual spelling overrides -------------------------------------------------
# TeamIDs that need to be renamed - Fix joining issues
# 1107, 1111, 1216, 1271, 1274, 1363, 1366, 1383, 1410, 1419, 1472, 1474
# Every spelling row of the listed TeamIDs is replaced with the given name
# before the ratings join below.
ratings_name_by_team_id = {
    1107: 'ualbany',
    1111: 'app state',
    1271: 'maryland eastern shore',
    1274: 'miami',
    1410: 'ut rio grande valley',
    1419: 'ul monroe',
    1472: 'st. thomas-minnesota',
    1474: 'queens university',
}
for team_id, ratings_name in ratings_name_by_team_id.items():
    combined.loc[combined['TeamID'] == team_id, 'TeamNameSpelling'] = ratings_name

# One override is applied on the ratings side instead (ratings teamID 260)
ratings_summary.loc[ratings_summary['teamID'] == 260, 'team'] = 'san jose st'

# --- Attach ratings by (Season, spelling) --------------------------------------
combined = combined.merge(
    ratings_summary,
    how='left',
    left_on=['Season', 'TeamNameSpelling'],
    right_on=['season', 'team'],
)

combined.head(5)

Unnamed: 0,Season,TeamName,TeamID,Win_Percentage,Points_Per_Game,FG_Percentage,Threes_Per_Game,Three_Point_Percentage,Free_Throws_Per_Game,Free_Throw_Percentage,Offensive_Rebound_Rate,Defensive_Rebound_Rate,Turnovers_Per_Game,Opp_FG_Percentage,Opp_Three_Point_Percentage,Opp_Free_Throws_Per_Game,Opp_Turnovers_Per_Game,ConfAbbrev,Win_pct_last_10_games,TeamNameSpelling,School,SRS,SOS,Pace,FTr,3PAr,season,teamID,team,offensiveRating,defensiveRating
0,2014,Abilene Chr,1101.0,0.095238,63.142857,40.550807,17.857143,37.333333,21.190476,74.606742,24.925816,66.20155,15.0,50.802139,37.058824,25.809524,12.142857,southland,0.1,abilene chr,,,,,,,,,,,
1,2014,Abilene Chr,1101.0,0.095238,63.142857,40.550807,17.857143,37.333333,21.190476,74.606742,24.925816,66.20155,15.0,50.802139,37.058824,25.809524,12.142857,southland,0.1,abilene christian,abilene christian,-19.6,-4.12,67.7,0.4,0.36,2014.0,1.0,abilene christian,110.5,113.6
2,2014,Abilene Chr,1101.0,0.095238,63.142857,40.550807,17.857143,37.333333,21.190476,74.606742,24.925816,66.20155,15.0,50.802139,37.058824,25.809524,12.142857,southland,0.1,abilene-christian,,,,,,,,,,,
3,2014,Air Force,1102.0,0.357143,64.571429,42.591316,21.714286,32.894737,19.25,69.016698,26.378378,69.792803,13.464286,44.129555,33.724832,23.25,11.678571,mwc,0.3,air force,air force,-4.08,1.71,65.4,0.377,0.418,2014.0,2.0,air force,110.0,111.7
4,2014,Air Force,1102.0,0.357143,64.571429,42.591316,21.714286,32.894737,19.25,69.016698,26.378378,69.792803,13.464286,44.129555,33.724832,23.25,11.678571,mwc,0.3,air-force,,,,,,,,,,,


In [33]:
### Find team-seasons whose name never matched, so they can be renamed and re-joined

def _all_missing(values):
    """Return True when every entry in ``values`` is null."""
    return values.isnull().all()


season_team_groups = combined.groupby(['Season', 'TeamID'])

# One flag per (Season, TeamID): True when none of its rows has a value in the column
team_null_check = season_team_groups['team'].apply(_all_missing).reset_index()
team_null_check2 = season_team_groups['School'].apply(_all_missing).reset_index()

# Keep only the groups where every row is null
team_null_check = team_null_check[team_null_check['team'].eq(True)]
team_null_check2 = team_null_check2[team_null_check2['School'].eq(True)]

# TeamIDs that need to be renamed
# 1107, 1111, 1216, 1271, 1274, 1363, 1366, 1383, 1410, 1419, 1472, 1474

# Team IDs with no associated ranking, set to the lower quartile
# 1216, 1366, 1383

# Team-seasons that have no matching spelling in the ratings dataset
team_null_check.head(50)

Unnamed: 0,Season,TeamID,team
108,2014,1216.0,True
255,2014,1366.0,True
272,2014,1383.0,True
459,2015,1216.0,True
606,2015,1366.0,True
623,2015,1383.0,True
810,2016,1216.0,True
957,2016,1366.0,True
974,2016,1383.0,True
1161,2017,1216.0,True


In [34]:
### Impute ratings for the three teams missing from the ratings data,
### then collapse to one row per (Season, TeamID)

# TeamIDs with no ratings match, and the name written into `team` for them
IMPUTED_TEAM_NAMES = {
    1216: 'hartford',
    1366: 'savannah st',
    1383: 'st francis ny',
}

# Reference population: rows that did match the ratings data, reduced to one
# row per (Season, TeamID) using the first non-null value of each column.
non_null_teams = combined[combined['team'].notnull()]
final_result = non_null_teams.groupby(['Season', 'TeamID']).first().reset_index()

# 25th percentile of each metric across all team-seasons in the reference population
lower_quartiles = final_result[
    ['offensiveRating', 'defensiveRating', 'SRS', 'SOS', 'Pace', 'FTr', '3PAr']
].quantile(0.25)
lower_quartile_offensive = lower_quartiles['offensiveRating']
lower_quartile_defensive = lower_quartiles['defensiveRating']
# NOTE(review): the five quartiles below are computed but not applied anywhere
# in this cell; only offensiveRating/defensiveRating are imputed. Confirm
# whether SRS/SOS/Pace/FTr/3PAr were also meant to be filled.
lower_quartile_srs = lower_quartiles['SRS']
lower_quartile_sos = lower_quartiles['SOS']
lower_quartile_Pace = lower_quartiles['Pace']
lower_quartile_FTr = lower_quartiles['FTr']
lower_quartile_3PAr = lower_quartiles['3PAr']

# Display the results
print(f'Lower Quartile (25th percentile) for Offensive Rating: {lower_quartile_offensive}')
print(f'Lower Quartile (25th percentile) for Defensive Rating: {lower_quartile_defensive}')

# Fill in the quartile values for TeamIDs 1216, 1366 and 1383 (all seasons).
# The fill is done on a copy so `combined` is left as the join produced it:
# re-running this cell then recomputes the quartiles from the same reference
# rows instead of from rows that already contain imputed values.
# NOTE(review): the 25th percentile is used for both ratings. If a lower
# defensiveRating means a better defence, the 75th percentile would be the
# "weak team" counterpart - confirm the intended direction.
combined_imputed = combined.copy()
for team_id, ratings_name in IMPUTED_TEAM_NAMES.items():
    combined_imputed.loc[
        combined_imputed['TeamID'] == team_id,
        ['team', 'offensiveRating', 'defensiveRating'],
    ] = [ratings_name, lower_quartile_offensive, lower_quartile_defensive]

### Retrieve final joined dataset

# One row per (Season, TeamID). GroupBy.first() takes the first non-null value
# of every column (NaN when the whole group is null), the same reduction used
# for `final_result` above. The join helper columns are dropped afterwards.
mens_season_data = (
    combined_imputed
    .groupby(['Season', 'TeamID'])
    .first()
    .reset_index()
    .drop(columns=['School', 'TeamNameSpelling', 'season', 'teamID', 'team'])
)

mens_season_data[mens_season_data['TeamID'] == 1186]

Lower Quartile (25th percentile) for Offensive Rating: 102.5
Lower Quartile (25th percentile) for Defensive Rating: 102.8


Unnamed: 0,Season,TeamID,TeamName,Win_Percentage,Points_Per_Game,FG_Percentage,Threes_Per_Game,Three_Point_Percentage,Free_Throws_Per_Game,Free_Throw_Percentage,Offensive_Rebound_Rate,Defensive_Rebound_Rate,Turnovers_Per_Game,Opp_FG_Percentage,Opp_Three_Point_Percentage,Opp_Free_Throws_Per_Game,Opp_Turnovers_Per_Game,ConfAbbrev,Win_pct_last_10_games,SRS,SOS,Pace,FTr,3PAr,offensiveRating,defensiveRating
79,2014,1186.0,E Washington,0.448276,75.62069,44.858689,21.758621,37.242472,23.172414,69.345238,29.941292,69.830827,12.275862,44.964871,35.028249,24.655172,10.344828,big_sky,0.6,-5.32,-4.15,69.5,0.391,0.378,111.4,111.9
430,2015,1186.0,E Washington,0.741935,79.354839,46.81793,25.16129,39.615385,20.741935,71.384137,29.378531,69.426152,11.0,45.286195,38.362761,20.225806,12.258065,big_sky,0.7,0.79,-3.84,68.3,0.352,0.421,110.1,106.6
781,2016,1186.0,E Washington,0.5,79.666667,48.109366,26.533333,38.065327,21.066667,68.512658,26.785714,70.964467,11.533333,49.216483,37.636761,22.133333,11.366667,big_sky,0.5,-4.47,-6.03,70.4,0.361,0.465,111.3,113.6
1132,2017,1186.0,E Washington,0.645161,78.741935,47.397564,21.322581,36.611195,20.516129,76.572327,27.68635,72.231986,12.225806,43.941842,37.053571,22.387097,10.290323,big_sky,0.7,-2.77,-4.43,68.0,0.342,0.375,111.3,113.6
1483,2018,1186.0,E Washington,0.5625,74.3125,46.619021,23.3125,38.605898,16.0625,76.653696,21.853659,75.0,11.71875,44.657534,35.620915,19.96875,11.53125,big_sky,0.8,-0.03,-1.54,69.2,0.284,0.411,130.6,128.4
1834,2019,1186.0,E Washington,0.454545,71.757576,42.791878,25.878788,34.777518,16.090909,72.504708,24.620573,76.408451,12.030303,45.121951,37.449664,19.909091,11.727273,big_sky,0.7,-6.77,-4.17,69.5,0.268,0.434,103.1,105.6
2187,2020,1186.0,E Washington,0.724138,77.793103,44.683196,26.551724,34.415584,18.517241,68.715084,27.297794,72.965388,13.103448,43.831723,34.317343,20.413793,13.724138,big_sky,0.8,2.01,-1.81,74.3,0.287,0.416,109.0,104.2
2535,2021,1186.0,E Washington,0.681818,77.954545,46.949807,23.318182,35.867446,17.909091,79.949239,22.08589,75.761589,11.727273,41.928251,31.64557,16.818182,11.681818,big_sky,0.9,4.03,-3.15,71.8,0.302,0.403,110.3,102.6
2887,2022,1186.0,E Washington,0.516129,75.258065,45.128779,24.483871,35.968379,19.096774,75.675676,23.676012,76.315789,12.548387,43.329776,33.806452,18.709677,10.677419,big_sky,0.7,-5.2,-5.14,71.6,0.333,0.428,103.8,108.4
3245,2023,1186.0,E Washington,0.677419,75.354839,48.678802,24.129032,35.695187,17.612903,75.274725,26.820809,76.162216,13.225806,42.794521,32.537688,17.419355,10.258065,big_sky,0.7,1.47,-1.62,68.9,0.31,0.44,107.5,106.4


In [35]:
# Sanity check: list any team-seasons whose 'offensiveRating' is still missing
# NOTE(review): the variable name refers to SOS but the filter is on
# offensiveRating - confirm which column was meant to be checked.
missing_rating_mask = mens_season_data['offensiveRating'].isna()
sos_null_rows = mens_season_data.loc[missing_rating_mask]

sos_null_rows

Unnamed: 0,Season,TeamID,TeamName,Win_Percentage,Points_Per_Game,FG_Percentage,Threes_Per_Game,Three_Point_Percentage,Free_Throws_Per_Game,Free_Throw_Percentage,Offensive_Rebound_Rate,Defensive_Rebound_Rate,Turnovers_Per_Game,Opp_FG_Percentage,Opp_Three_Point_Percentage,Opp_Free_Throws_Per_Game,Opp_Turnovers_Per_Game,ConfAbbrev,Win_pct_last_10_games,SRS,SOS,Pace,FTr,3PAr,offensiveRating,defensiveRating


In [36]:
## Add team metrics to regular season games to create dataset for supervised ML model
pd.set_option("display.max_columns",None)

# Regular-season results from 2014 onward, with the winner in Team1.
# `.assign` returns a new frame, so the extra columns are not written into a
# boolean-filtered slice (the pattern that can raise SettingWithCopyWarning).
mens_reg_season_data = pd.read_csv('../data/MRegularSeasonCompactResults.csv')
mens_reg_season_data = mens_reg_season_data[mens_reg_season_data['Season'] >= 2014].assign(
    Team1=lambda frame: frame['WTeamID'],
    Team2=lambda frame: frame['LTeamID'],
    Team1_Wins=1,
)

# Duplicate the data so there is a record from both the winning and the losing
# team's point of view: same games with Team1/Team2 swapped and the label set to 0.
flipped = mens_reg_season_data.assign(
    Team1=mens_reg_season_data['Team2'],
    Team2=mens_reg_season_data['Team1'],
    Team1_Wins=0,
)

games = pd.concat([mens_reg_season_data, flipped]).drop(
    columns=['WTeamID', 'WScore', 'LTeamID', 'LScore', 'WLoc', 'NumOT']
)

# Merge games dataset with teams stats, once per side.
# The stat columns are suffixed ("_1" / "_2") BEFORE merging, so the result does
# not depend on pandas' automatic "_x"/"_y" suffixes; the join key and TeamName
# are not kept in the output.
full_games = games
for team_col, suffix in (('Team1', '_1'), ('Team2', '_2')):
    side_stats = (
        mens_season_data
        .drop(columns=['TeamName'])
        .rename(columns=lambda col: col if col in ('Season', 'TeamID') else col + suffix)
    )
    full_games = (
        full_games
        .merge(side_stats, left_on=['Season', team_col], right_on=['Season', 'TeamID'], how='left')
        .drop(columns=['TeamID'])
    )

full_games

Unnamed: 0,Season,DayNum,Team1,Team2,Team1_Wins,Win_Percentage_1,Points_Per_Game_1,FG_Percentage_1,Threes_Per_Game_1,Three_Point_Percentage_1,Free_Throws_Per_Game_1,Free_Throw_Percentage_1,Offensive_Rebound_Rate_1,Defensive_Rebound_Rate_1,Turnovers_Per_Game_1,Opp_FG_Percentage_1,Opp_Three_Point_Percentage_1,Opp_Free_Throws_Per_Game_1,Opp_Turnovers_Per_Game_1,ConfAbbrev_1,Win_pct_last_10_games_1,SRS_1,SOS_1,Pace_1,FTr_1,3PAr_1,offensiveRating_1,defensiveRating_1,Win_Percentage_2,Points_Per_Game_2,FG_Percentage_2,Threes_Per_Game_2,Three_Point_Percentage_2,Free_Throws_Per_Game_2,Free_Throw_Percentage_2,Offensive_Rebound_Rate_2,Defensive_Rebound_Rate_2,Turnovers_Per_Game_2,Opp_FG_Percentage_2,Opp_Three_Point_Percentage_2,Opp_Free_Throws_Per_Game_2,Opp_Turnovers_Per_Game_2,ConfAbbrev_2,Win_pct_last_10_games_2,SRS_2,SOS_2,Pace_2,FTr_2,3PAr_2,offensiveRating_2,defensiveRating_2
0,2014,4,1102,1119,1,0.357143,64.571429,42.591316,21.714286,32.894737,19.250000,69.016698,26.378378,69.792803,13.464286,44.129555,33.724832,23.250000,11.678571,mwc,0.3,-4.08,1.71,65.4,0.377,0.418,110.0,111.7,0.466667,72.533333,44.629523,22.300000,33.781764,18.900000,69.841270,31.977294,66.465257,13.000000,46.447446,38.996139,20.033333,13.700000,patriot,0.4,-6.16,-4.66,68.2,0.322,0.386,111.4,111.9
1,2014,4,1103,1157,1,0.636364,67.909091,43.190661,20.393939,34.769688,22.212121,61.800819,35.273675,67.692308,13.212121,42.833517,32.525253,19.606061,13.030303,mac,0.5,1.16,-0.48,65.5,0.404,0.372,111.2,110.7,0.600000,70.533333,42.923434,17.900000,32.029795,22.333333,69.253731,34.762774,70.062556,14.100000,40.375587,32.088520,22.200000,12.833333,big_south,0.8,-5.21,-6.31,68.6,0.385,0.319,110.0,109.9
2,2014,4,1107,1373,1,0.562500,66.031250,43.914373,13.468750,36.426914,21.718750,74.820144,32.387476,70.905764,12.625000,41.863140,35.247209,15.843750,11.968750,aec,0.7,-3.31,-5.22,63.1,0.420,0.261,110.7,110.2,0.468750,69.906250,42.102397,13.406250,32.400932,24.000000,71.875000,38.946459,64.650767,14.156250,42.063492,33.636364,28.843750,12.718750,maac,0.5,-0.86,-0.15,67.2,0.417,0.238,128.5,164.1
3,2014,4,1112,1142,1,0.882353,73.058824,46.848739,14.852941,35.643564,23.352941,65.491184,37.249782,74.006623,10.441176,38.097856,31.446541,18.441176,12.411765,pac_twelve,0.7,23.36,9.04,64.1,0.410,0.264,111.9,106.5,0.366667,62.000000,40.371517,18.900000,33.509700,17.766667,68.667917,30.842912,66.953714,8.966667,43.837442,33.978495,21.466667,10.833333,big_west,0.5,-1.34,1.69,60.8,0.326,0.354,110.2,110.9
4,2014,4,1113,1420,1,0.656250,75.031250,45.044543,21.031250,38.632987,23.562500,69.363395,24.193548,69.879518,11.468750,41.513400,32.442748,22.031250,12.031250,pac_twelve,0.5,13.32,7.41,69.0,0.425,0.371,110.5,109.1,0.275862,64.965517,40.000000,19.103448,30.685921,22.344828,66.358025,29.853480,69.075452,14.931034,44.959128,36.711281,29.137931,14.965517,aec,0.3,-14.21,-6.55,68.4,0.419,0.342,110.4,111.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125179,2025,106,1102,1461,0,0.115385,62.346154,42.517268,23.615385,33.550489,18.615385,63.429752,20.676203,72.400000,11.807692,47.776994,37.080868,19.076923,9.730769,mwc,0.0,-7.74,3.98,64.7,0.354,0.485,98.4,112.3,0.423077,66.115385,43.772242,21.653846,35.346359,17.038462,65.462754,26.769627,75.572519,12.192308,44.593640,32.952381,20.884615,9.923077,mwc,0.3,-0.28,4.60,65.7,0.300,0.399,103.5,106.6
125180,2025,106,1139,1462,0,0.461538,74.192308,46.142857,21.576923,37.254902,22.346154,73.666093,24.666667,73.830735,10.961538,42.788462,30.704698,14.038462,7.384615,big_east,0.5,10.34,10.31,67.3,0.373,0.409,116.5,105.4,0.629630,77.296296,45.868575,21.259259,38.327526,21.370370,79.202773,21.923077,75.377468,10.851852,44.002525,32.807018,16.814815,11.666667,big_east,0.7,15.57,8.41,69.4,0.372,0.374,117.4,99.2
125181,2025,106,1480,1466,0,0.148148,67.851852,42.839352,18.148148,29.795918,16.740741,68.584071,24.363234,72.831633,10.740741,48.573281,37.130178,20.555556,11.037037,a_sun,0.2,-15.07,-4.00,69.3,0.286,0.301,98.1,117.1,0.692308,78.230769,45.304878,22.653846,33.106961,19.115385,71.026157,32.762557,72.488584,8.038462,43.657437,31.736527,16.230769,10.923077,a_sun,0.9,1.64,-5.36,67.6,0.315,0.351,112.3,107.4
125182,2025,106,1122,1468,0,0.384615,68.461538,40.663630,24.538462,33.072100,17.500000,70.109890,22.972973,70.308483,9.576923,46.580907,36.310680,20.153846,11.153846,a_sun,0.5,-9.13,-3.43,67.2,0.288,0.409,101.6,112.3,0.076923,71.115385,45.804677,24.076923,35.463259,14.769231,76.822917,18.276762,71.448087,10.730769,49.388587,39.341917,20.538462,9.807692,a_sun,0.1,-14.73,-4.35,67.6,0.255,0.445,102.9,122.2


In [37]:
## Add team metrics and seeds to NCAA tournament games to create dataset for supervised ML model
pd.set_option("display.max_columns",None)

# Tournament results from 2014 onward, with the winner in Team1.
# `.assign` returns a new frame, so the extra columns are not written into a
# boolean-filtered slice (the pattern that can raise SettingWithCopyWarning).
mens_tourney_data = pd.read_csv('../data/MNCAATourneyCompactResults.csv')
mens_tourney_data = mens_tourney_data[mens_tourney_data['Season'] >= 2014].assign(
    Team1=lambda frame: frame['WTeamID'],
    Team2=lambda frame: frame['LTeamID'],
    Team1_Wins=1,
)

# Seeds: drop the first character and any trailing 'a'/'b' from the seed label.
# The result is still text (zero-padded, e.g. '07'), not a number.
mens_tourney_seeds = pd.read_csv("../data/MNCAATourneySeeds.csv")
mens_tourney_seeds['Seed'] = mens_tourney_seeds['Seed'].str[1:].str.rstrip('ab')

# Duplicate the data so there is a record from both the winning and the losing
# team's point of view: same games with Team1/Team2 swapped and the label set to 0.
flipped = mens_tourney_data.assign(
    Team1=mens_tourney_data['Team2'],
    Team2=mens_tourney_data['Team1'],
    Team1_Wins=0,
)

games = pd.concat([mens_tourney_data, flipped]).drop(
    columns=['WTeamID', 'WScore', 'LTeamID', 'LScore', 'WLoc', 'NumOT']
)

team_sides = (('Team1', '_1'), ('Team2', '_2'))

# Merge games dataset with teams stats, once per side.
# The stat columns are suffixed BEFORE merging, so the result does not depend on
# pandas' automatic "_x"/"_y" suffixes; the join key and TeamName are not kept.
tourney_games = games
for team_col, suffix in team_sides:
    side_stats = (
        mens_season_data
        .drop(columns=['TeamName'])
        .rename(columns=lambda col: col if col in ('Season', 'TeamID') else col + suffix)
    )
    tourney_games = (
        tourney_games
        .merge(side_stats, left_on=['Season', team_col], right_on=['Season', 'TeamID'], how='left')
        .drop(columns=['TeamID'])
    )

# Attach each side's seed as Seed_1 / Seed_2 (after all stat columns)
for team_col, suffix in team_sides:
    side_seeds = mens_tourney_seeds.rename(columns={'Seed': 'Seed' + suffix})
    tourney_games = (
        tourney_games
        .merge(side_seeds, how='left', left_on=['Season', team_col], right_on=['Season', 'TeamID'])
        .drop(columns=['TeamID'])
    )

tourney_games


Unnamed: 0,Season,DayNum,Team1,Team2,Team1_Wins,Win_Percentage_1,Points_Per_Game_1,FG_Percentage_1,Threes_Per_Game_1,Three_Point_Percentage_1,Free_Throws_Per_Game_1,Free_Throw_Percentage_1,Offensive_Rebound_Rate_1,Defensive_Rebound_Rate_1,Turnovers_Per_Game_1,Opp_FG_Percentage_1,Opp_Three_Point_Percentage_1,Opp_Free_Throws_Per_Game_1,Opp_Turnovers_Per_Game_1,ConfAbbrev_1,Win_pct_last_10_games_1,SRS_1,SOS_1,Pace_1,FTr_1,3PAr_1,offensiveRating_1,defensiveRating_1,Win_Percentage_2,Points_Per_Game_2,FG_Percentage_2,Threes_Per_Game_2,Three_Point_Percentage_2,Free_Throws_Per_Game_2,Free_Throw_Percentage_2,Offensive_Rebound_Rate_2,Defensive_Rebound_Rate_2,Turnovers_Per_Game_2,Opp_FG_Percentage_2,Opp_Three_Point_Percentage_2,Opp_Free_Throws_Per_Game_2,Opp_Turnovers_Per_Game_2,ConfAbbrev_2,Win_pct_last_10_games_2,SRS_2,SOS_2,Pace_2,FTr_2,3PAr_2,offensiveRating_2,defensiveRating_2,Seed_1,Seed_2
0,2014,134,1107,1291,1,0.562500,66.031250,43.914373,13.468750,36.426914,21.718750,74.820144,32.387476,70.905764,12.625000,41.863140,35.247209,15.843750,11.968750,aec,0.7,-3.31,-5.22,63.1,0.420,0.261,110.7,110.2,0.500000,76.250000,44.068706,24.906250,35.633626,21.687500,74.063401,28.345070,66.855524,12.000000,49.312896,33.626374,20.312500,13.281250,nec,0.6,-6.32,-4.44,70.0,0.367,0.434,112.5,112.6,16,16
1,2014,134,1301,1462,1,0.617647,70.794118,45.865434,14.764706,30.278884,20.852941,66.149506,34.790060,63.735343,10.411765,42.438765,31.951641,23.794118,11.529412,acc,0.5,9.02,7.44,64.9,0.370,0.255,111.1,110.9,0.636364,72.212121,47.130919,14.727273,35.390947,22.848485,68.832891,34.105653,71.216098,12.363636,42.714127,35.394127,21.696970,11.272727,big_east,0.5,11.84,8.14,65.7,0.416,0.270,111.6,110.9,12,12
2,2014,135,1142,1411,1,0.366667,62.000000,40.371517,18.900000,33.509700,17.766667,68.667917,30.842912,66.953714,8.966667,43.837442,33.978495,21.466667,10.833333,big_west,0.5,-1.34,1.69,60.8,0.326,0.354,110.2,110.9,0.562500,75.062500,45.821326,17.593750,35.879218,26.812500,71.095571,33.456905,68.924640,13.218750,44.632768,35.528596,17.906250,10.843750,swac,0.9,-8.41,-10.07,68.2,0.491,0.325,111.9,112.3,16,16
3,2014,135,1397,1234,1,0.625000,70.500000,44.249292,16.875000,32.962963,22.781250,70.781893,39.692586,72.423146,10.875000,41.238318,34.047109,17.687500,10.843750,sec,0.6,17.34,7.84,62.8,0.416,0.308,112.2,109.1,0.625000,82.000000,46.469367,16.562500,35.283019,27.593750,73.272933,38.107639,71.063479,11.375000,41.401274,31.954351,20.500000,13.468750,big_ten,0.3,18.58,7.43,69.8,0.453,0.275,113.3,110.1,11,11
4,2014,136,1163,1386,1,0.764706,71.852941,44.835046,18.382353,38.720000,21.000000,76.050420,31.238616,66.563467,11.676471,38.733126,32.894737,20.088235,12.911765,aac,0.7,17.23,8.63,65.5,0.383,0.343,111.4,107.7,0.727273,71.272727,46.858790,19.363636,38.028169,22.818182,64.143426,29.488372,72.393661,12.333333,41.842105,33.742331,17.636364,10.151515,a_ten,0.8,9.05,5.08,66.0,0.436,0.369,111.4,110.2,07,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1333,2024,146,1181,1301,0,0.750000,79.843750,48.179420,22.125000,37.711864,20.000000,72.187500,29.538462,76.019417,9.375000,43.244710,32.568807,15.906250,10.906250,acc,0.7,20.67,8.36,67.1,0.343,0.375,121.8,95.5,0.611111,76.361111,44.920273,20.388889,34.604905,19.805556,73.352034,27.121464,73.570191,9.222222,44.365193,34.905660,18.666667,11.555556,acc,0.6,12.81,8.93,68.5,0.320,0.333,114.5,100.1,04,11
1334,2024,146,1397,1345,0,0.750000,79.468750,44.433198,25.500000,34.191176,21.187500,74.926254,31.881372,73.063063,9.968750,38.930481,31.395349,20.906250,12.500000,sec,0.7,21.81,10.42,70.3,0.337,0.412,117.3,90.6,0.878788,83.393939,48.832382,20.424242,40.801187,25.000000,72.121212,36.954315,76.721883,10.969697,41.888620,31.380208,14.393939,9.515152,big_ten,0.8,24.93,11.60,68.3,0.413,0.347,125.3,94.6,02,01
1335,2024,152,1104,1163,0,0.656250,90.750000,47.731660,30.281250,36.532508,22.812500,78.356164,33.523267,72.147350,11.812500,44.067797,31.886024,24.843750,11.281250,sec,0.5,20.69,11.80,73.9,0.353,0.465,125.6,101.4,0.911765,81.470588,49.598796,23.970588,36.687117,19.529412,74.246988,35.360825,76.840215,9.117647,39.802802,31.924883,18.411765,10.088235,big_east,0.9,26.70,8.70,66.0,0.319,0.402,126.7,92.3,04,01
1336,2024,152,1301,1345,0,0.611111,76.361111,44.920273,20.388889,34.604905,19.805556,73.352034,27.121464,73.570191,9.222222,44.365193,34.905660,18.666667,11.555556,acc,0.6,12.81,8.93,68.5,0.320,0.333,114.5,100.1,0.878788,83.393939,48.832382,20.424242,40.801187,25.000000,72.121212,36.954315,76.721883,10.969697,41.888620,31.380208,14.393939,9.515152,big_ten,0.8,24.93,11.60,68.3,0.413,0.347,125.3,94.6,11,01


In [38]:
# Write both modeling tables as CSV (no index column) under ../data/modeling,
# creating the directory first if it does not exist yet.
output_dir = Path("..", "data", "modeling")
output_dir.mkdir(parents=True, exist_ok=True)

season_output_path = output_dir / "reg_season_ml.csv"
tourney_output_path = output_dir / "tourney_ml.csv"

for frame, destination in (
    (full_games, season_output_path),
    (tourney_games, tourney_output_path),
):
    frame.to_csv(destination, index=False)
