In [38]:
import pandas as pd 
import numpy as np 
import os 
from pathlib import Path 

In [39]:
### Join season stats, advanced stats and ratings per team, matching on lower-cased team-name spellings
stats_summary = pd.read_csv('../../data/preprocessing/mens_summary_season_data.csv')
ratings_summary = pd.read_csv('../../data/preprocessing/mens_season_ratings.csv').drop(columns=['netRating'])
advanced_stats = pd.read_csv("../../data/preprocessing/mens_advanced_stats.csv")
team_spellings = pd.read_csv('../../data/MTeamSpellings.csv', encoding='ISO-8859-1')

# Keep 2014 and later because that was what was available in the ratings data
season_mask = stats_summary['Season'] >= 2014
filtered_stats = stats_summary.loc[season_mask]

# Attach every known spelling to each team's season row (one row per spelling)
all_spellings = filtered_stats.merge(team_spellings, how='left', on='TeamID')

# Lower-case the name columns on all three sources so the joins below compare like with like
for frame, name_column in (
    (all_spellings, 'TeamNameSpelling'),
    (ratings_summary, 'team'),
    (advanced_stats, 'School'),
):
    frame[name_column] = frame[name_column].str.lower()

# Season stats + advanced stats, matched on (season, spelling == school)
combined = all_spellings.merge(
    advanced_stats,
    how='left',
    left_on=['Season', 'TeamNameSpelling'],
    right_on=['Season', 'School'],
)

# Manual spelling fixes for teams that otherwise fail to join.
# TeamIDs flagged as problematic:
# 1107, 1111, 1216, 1271, 1274, 1363, 1366, 1383, 1410, 1419, 1472, 1474
# Only the IDs listed in the mapping are overridden here; every spelling row of
# such a team is set to the same value, so all of its rows use that key in the
# ratings merge below.
spelling_overrides = {
    1107: 'ualbany',
    1111: 'app state',
    1271: 'maryland eastern shore',
    1274: 'miami',
    1410: 'ut rio grande valley',
    1419: 'ul monroe',
    1472: 'st. thomas-minnesota',
    1474: 'queens university',
}
for team_id, spelling in spelling_overrides.items():
    combined.loc[combined['TeamID'] == team_id, 'TeamNameSpelling'] = spelling

# This fix is applied on the ratings side instead (ratings teamID 260);
# presumably it covers TeamID 1363 from the list above - TODO confirm.
ratings_summary.loc[ratings_summary['teamID'] == 260, 'team'] = 'san jose st'

# Add the ratings, matched on (season, spelling == ratings team name)
combined = combined.merge(
    ratings_summary,
    how='left',
    left_on=['Season', 'TeamNameSpelling'],
    right_on=['season', 'team'],
)

combined.head(5) 

Unnamed: 0,Season,TeamName,TeamID,Win_Percentage,Points_Per_Game,FG_Percentage,Threes_Per_Game,Three_Point_Percentage,Free_Throws_Per_Game,Free_Throw_Percentage,Offensive_Rebound_Rate,Defensive_Rebound_Rate,Turnovers_Per_Game,Opp_FG_Percentage,Opp_Three_Point_Percentage,Opp_Free_Throws_Per_Game,Opp_Turnovers_Per_Game,Opp_Threes_Per_Game,Turnover_Margin,ConfAbbrev,Win_pct_last_10_games,TeamNameSpelling,School,SRS,SOS,Pace,FTr,3PAr,season,teamID,team,offensiveRating,defensiveRating,avg_height
0,2014,Abilene Chr,1101.0,0.095238,63.142857,40.550807,17.857143,37.333333,21.190476,74.606742,24.925816,66.20155,15.0,50.802139,37.058824,25.809524,12.142857,16.190476,-2.857143,southland,0.1,abilene chr,,,,,,,,,,,,
1,2014,Abilene Chr,1101.0,0.095238,63.142857,40.550807,17.857143,37.333333,21.190476,74.606742,24.925816,66.20155,15.0,50.802139,37.058824,25.809524,12.142857,16.190476,-2.857143,southland,0.1,abilene christian,abilene christian,-19.6,-4.12,67.7,0.4,0.36,2014.0,1.0,abilene christian,110.5,113.6,75.79
2,2014,Abilene Chr,1101.0,0.095238,63.142857,40.550807,17.857143,37.333333,21.190476,74.606742,24.925816,66.20155,15.0,50.802139,37.058824,25.809524,12.142857,16.190476,-2.857143,southland,0.1,abilene-christian,,,,,,,,,,,,
3,2014,Air Force,1102.0,0.357143,64.571429,42.591316,21.714286,32.894737,19.25,69.016698,26.378378,69.792803,13.464286,44.129555,33.724832,23.25,11.678571,21.285714,-1.785714,mwc,0.3,air force,air force,-4.08,1.71,65.4,0.377,0.418,2014.0,2.0,air force,110.0,111.7,78.28
4,2014,Air Force,1102.0,0.357143,64.571429,42.591316,21.714286,32.894737,19.25,69.016698,26.378378,69.792803,13.464286,44.129555,33.724832,23.25,11.678571,21.285714,-1.785714,mwc,0.3,air-force,,,,,,,,,,,,


In [40]:
### Impute ratings for the three teams that have no ratings match, using the
### lower quartile (25th percentile) of the matched teams, then collapse the
### per-spelling rows into one row per (Season, TeamID)

# Reference population: rows that matched a ratings 'team', reduced to one row
# per (Season, TeamID) by taking the first non-null value of every column
non_null_teams = combined[combined['team'].notnull()]
final_result = non_null_teams.groupby(['Season', 'TeamID']).first().reset_index()

# Find the lower quartile of each rating / advanced stat.
# NOTE(review): only the offensive, defensive and height quartiles are used in
# this cell; the SRS/SOS/Pace/FTr/3PAr quartiles are computed but not assigned
# to any team here - confirm whether those columns should be imputed as well.
lower_quartile_offensive = final_result['offensiveRating'].quantile(0.25)
lower_quartile_defensive = final_result['defensiveRating'].quantile(0.25)
lower_quartile_srs = final_result['SRS'].quantile(0.25)
lower_quartile_sos = final_result['SOS'].quantile(0.25)
lower_quartile_Pace = final_result['Pace'].quantile(0.25)
lower_quartile_FTr = final_result['FTr'].quantile(0.25)
lower_quartile_3PAr = final_result['3PAr'].quantile(0.25)
lower_quartile_height = final_result['avg_height'].quantile(0.25)

# Display the results
print(f'Lower Quartile (25th percentile) for Offensive Rating: {lower_quartile_offensive}')
print(f'Lower Quartile (25th percentile) for Defensive Rating: {lower_quartile_defensive}')

# Use lower quartile values of ratings for the 3 teams missing ratings
# (TeamIDs 1216, 1366, 1383). The assignment covers every row of those teams.
# NOTE(review): if a lower defensiveRating means a better defense, the 25th
# percentile is a favorable value rather than a pessimistic one - confirm that
# this is the intended direction.
imputed_columns = ['team', 'offensiveRating', 'defensiveRating', 'avg_height']
teams_missing_ratings = {
    1216: 'hartford',
    1366: 'savannah st',
    1383: 'st francis ny',
}
for team_id, team_name in teams_missing_ratings.items():
    combined.loc[combined['TeamID'] == team_id, imputed_columns] = [
        team_name,
        lower_quartile_offensive,
        lower_quartile_defensive,
        lower_quartile_height,
    ]

### Retrieve final joined dataset

# One row per (Season, TeamID): GroupBy.first() takes the first non-null value
# of each column within the group (NaN when the whole group is null), which is
# what the previous hand-written lambda did, without a Python call per group
# and column. The helper join columns are dropped afterwards.
mens_season_data = (
    combined
    .groupby(['Season', 'TeamID'])
    .first()
    .reset_index()
    .drop(columns=['School', 'TeamNameSpelling', 'season', 'teamID', 'team'])
)

Lower Quartile (25th percentile) for Offensive Rating: 102.5
Lower Quartile (25th percentile) for Defensive Rating: 102.8


In [41]:
# Export the 2025 season rows; they are the inputs for the final predictions
pd.set_option("display.max_columns", None)
is_2025 = mens_season_data['Season'] == 2025
mens_25_data = mens_season_data.loc[is_2025]

# Create the modeling directory if needed, then write the CSV without the index
output_dir = Path("../..", "data", "modeling")
output_dir.mkdir(parents=True, exist_ok=True)
season_output_path = output_dir.joinpath("mens_25_season_ml.csv")
mens_25_data.to_csv(season_output_path, index=False)
mens_25_data

Unnamed: 0,Season,TeamID,TeamName,Win_Percentage,Points_Per_Game,FG_Percentage,Threes_Per_Game,Three_Point_Percentage,Free_Throws_Per_Game,Free_Throw_Percentage,Offensive_Rebound_Rate,Defensive_Rebound_Rate,Turnovers_Per_Game,Opp_FG_Percentage,Opp_Three_Point_Percentage,Opp_Free_Throws_Per_Game,Opp_Turnovers_Per_Game,Opp_Threes_Per_Game,Turnover_Margin,ConfAbbrev,Win_pct_last_10_games,SRS,SOS,Pace,FTr,3PAr,offensiveRating,defensiveRating,avg_height
3891,2025,1101.0,Abilene Chr,0.448276,67.551724,43.041079,14.206897,28.883495,20.896552,71.947195,27.704194,69.099757,14.206897,46.102151,32.644628,26.344828,16.034483,16.689655,1.827586,wac,0.7,-5.46,-2.01,71.1,0.361,0.262,98.4,104.7,77.29
3892,2025,1102.0,Air Force,0.125000,61.937500,42.079208,24.468750,33.077905,17.875000,63.461538,19.895288,72.510823,12.187500,48.065984,36.263736,19.468750,9.687500,19.906250,-2.500000,mwc,0.1,-7.74,3.98,64.7,0.354,0.485,98.4,112.3,77.76
3893,2025,1103.0,Akron,0.812500,83.968750,47.124756,29.093750,36.627282,17.187500,74.909091,31.552918,73.772011,11.750000,43.482688,31.860776,20.687500,12.000000,23.343750,0.250000,mac,0.9,3.55,-4.51,72.4,0.259,0.461,115.1,107.9,75.62
3894,2025,1104.0,Alabama,0.757576,91.121212,48.259643,29.787879,34.994914,25.848485,71.629543,32.700994,73.296789,12.121212,42.456609,30.798479,23.060606,10.060606,23.909091,-2.060606,sec,0.5,25.97,16.27,76.2,0.401,0.462,126.2,96.8,78.36
3895,2025,1105.0,Alabama A&M,0.241379,69.310345,38.696655,24.862069,30.790569,23.068966,66.666667,31.848984,67.361111,15.103448,46.979866,39.024390,26.379310,14.137931,21.206897,-0.965517,swac,0.3,-20.34,-9.76,73.0,0.375,0.410,91.8,117.6,77.19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4250,2025,1476.0,Stonehill,0.433333,67.333333,43.174799,24.000000,35.416667,16.966667,72.102161,24.348811,71.948608,11.300000,45.035461,34.929078,18.466667,9.966667,18.800000,-1.333333,nec,0.4,-12.86,-9.29,66.3,0.319,0.445,101.5,114.2,77.25
4251,2025,1477.0,East Texas A&M,0.161290,64.354839,41.598600,26.709677,31.400966,15.483871,64.375000,25.256674,65.444444,14.612903,47.365407,37.417219,20.548387,13.032258,19.483871,-1.580645,southland,0.3,-12.38,-1.86,67.8,0.280,0.483,96.1,111.5,76.00
4252,2025,1478.0,Le Moyne,0.233333,71.933333,44.765343,22.900000,32.751092,20.866667,71.086262,24.400871,67.708333,12.933333,47.012842,36.977058,22.000000,10.933333,24.700000,-2.000000,nec,0.2,-17.52,-8.06,69.0,0.367,0.414,102.0,120.9,76.75
4253,2025,1479.0,Mercyhurst,0.428571,65.785714,42.136695,19.607143,35.701275,16.642857,80.686695,20.813397,68.640646,9.750000,47.915243,35.677530,20.000000,12.535714,20.821429,2.785714,nec,0.7,-15.34,-9.38,65.4,0.308,0.358,100.4,116.9,75.79


In [42]:
## Add team metrics to regular season games to create dataset for supervised ML model
pd.set_option("display.max_columns", None)

# .copy() makes the filtered frame independent of the frame returned by
# read_csv, so the column assignments below do not write into a slice
# (which raises SettingWithCopyWarning).
mens_reg_season_data = pd.read_csv('../../data/MRegularSeasonCompactResults.csv')
mens_reg_season_data = mens_reg_season_data[mens_reg_season_data['Season'] >= 2014].copy()

# Duplicate the data so there is a record for each winning and losing team.
# Winner's perspective: Team1 is the winner, label 1.
mens_reg_season_data['Team1'] = mens_reg_season_data['WTeamID']
mens_reg_season_data['Team2'] = mens_reg_season_data['LTeamID']
mens_reg_season_data['Team1_Wins'] = 1

# Loser's perspective: same games with the teams swapped, label 0.
flipped = mens_reg_season_data.assign(
    Team1=mens_reg_season_data['LTeamID'],
    Team2=mens_reg_season_data['WTeamID'],
    Team1_Wins=0,
)

# Stack both perspectives and drop the raw result columns
games = pd.concat([mens_reg_season_data, flipped]).drop(
    columns=['WTeamID', 'WScore', 'LTeamID', 'LScore', 'WLoc', 'NumOT']
)

# Merge games dataset with team stats, once per side.
# The stat columns are suffixed BEFORE each merge, so the result does not
# depend on pandas' automatic _x/_y suffixes and no game column can be renamed
# by accident. TeamName is not carried into the modeling table.
merge_keys = ('Season', 'TeamID')
season_metrics = mens_season_data.drop(columns=['TeamName'])

full_games = games
for suffix, team_column in (('_1', 'Team1'), ('_2', 'Team2')):
    side_metrics = season_metrics.rename(
        columns=lambda col: col if col in merge_keys else col + suffix
    )
    full_games = full_games.merge(
        side_metrics,
        left_on=['Season', team_column],
        right_on=['Season', 'TeamID'],
        how='left',
    ).drop(columns=['TeamID'])  # join key from the stats side, duplicates Team1/Team2

full_games

Unnamed: 0,Season,DayNum,Team1,Team2,Team1_Wins,Win_Percentage_1,Points_Per_Game_1,FG_Percentage_1,Threes_Per_Game_1,Three_Point_Percentage_1,Free_Throws_Per_Game_1,Free_Throw_Percentage_1,Offensive_Rebound_Rate_1,Defensive_Rebound_Rate_1,Turnovers_Per_Game_1,Opp_FG_Percentage_1,Opp_Three_Point_Percentage_1,Opp_Free_Throws_Per_Game_1,Opp_Turnovers_Per_Game_1,Opp_Threes_Per_Game_1,Turnover_Margin_1,ConfAbbrev_1,Win_pct_last_10_games_1,SRS_1,SOS_1,Pace_1,FTr_1,3PAr_1,offensiveRating_1,defensiveRating_1,avg_height_1,Win_Percentage_2,Points_Per_Game_2,FG_Percentage_2,Threes_Per_Game_2,Three_Point_Percentage_2,Free_Throws_Per_Game_2,Free_Throw_Percentage_2,Offensive_Rebound_Rate_2,Defensive_Rebound_Rate_2,Turnovers_Per_Game_2,Opp_FG_Percentage_2,Opp_Three_Point_Percentage_2,Opp_Free_Throws_Per_Game_2,Opp_Turnovers_Per_Game_2,Opp_Threes_Per_Game_2,Turnover_Margin_2,ConfAbbrev_2,Win_pct_last_10_games_2,SRS_2,SOS_2,Pace_2,FTr_2,3PAr_2,offensiveRating_2,defensiveRating_2,avg_height_2
0,2014,4,1102,1119,1,0.357143,64.571429,42.591316,21.714286,32.894737,19.250000,69.016698,26.378378,69.792803,13.464286,44.129555,33.724832,23.250000,11.678571,21.285714,-1.785714,mwc,0.3,-4.08,1.71,65.4,0.377,0.418,110.0,111.7,78.28,0.466667,72.533333,44.629523,22.300000,33.781764,18.900000,69.841270,31.977294,66.465257,13.000000,46.447446,38.996139,20.033333,13.700000,17.266667,0.700000,patriot,0.4,-6.16,-4.66,68.2,0.322,0.386,111.4,111.9,76.60
1,2014,4,1103,1157,1,0.636364,67.909091,43.190661,20.393939,34.769688,22.212121,61.800819,35.273675,67.692308,13.212121,42.833517,32.525253,19.606061,13.030303,15.000000,-0.181818,mac,0.5,1.16,-0.48,65.5,0.404,0.372,111.2,110.7,77.31,0.600000,70.533333,42.923434,17.900000,32.029795,22.333333,69.253731,34.762774,70.062556,14.100000,40.375587,32.088520,22.200000,12.833333,24.100000,-1.266667,big_south,0.8,-5.21,-6.31,68.6,0.385,0.319,110.0,109.9,76.50
2,2014,4,1107,1373,1,0.562500,66.031250,43.914373,13.468750,36.426914,21.718750,74.820144,32.387476,70.905764,12.625000,41.863140,35.247209,15.843750,11.968750,19.593750,-0.656250,aec,0.7,-3.31,-5.22,63.1,0.420,0.261,110.7,110.2,77.29,0.468750,69.906250,42.102397,13.406250,32.400932,24.000000,71.875000,38.946459,64.650767,14.156250,42.063492,33.636364,28.843750,12.718750,17.187500,-1.437500,maac,0.5,-0.86,-0.15,67.2,0.417,0.238,128.5,164.1,76.75
3,2014,4,1112,1142,1,0.882353,73.058824,46.848739,14.852941,35.643564,23.352941,65.491184,37.249782,74.006623,10.441176,38.097856,31.446541,18.441176,12.411765,14.029412,1.970588,pac_twelve,0.7,23.36,9.04,64.1,0.410,0.264,111.9,106.5,77.62,0.366667,62.000000,40.371517,18.900000,33.509700,17.766667,68.667917,30.842912,66.953714,8.966667,43.837442,33.978495,21.466667,10.833333,15.500000,1.866667,big_west,0.5,-1.34,1.69,60.8,0.326,0.354,110.2,110.9,75.76
4,2014,4,1113,1420,1,0.656250,75.031250,45.044543,21.031250,38.632987,23.562500,69.363395,24.193548,69.879518,11.468750,41.513400,32.442748,22.031250,12.031250,16.375000,0.562500,pac_twelve,0.5,13.32,7.41,69.0,0.425,0.371,110.5,109.1,76.17,0.275862,64.965517,40.000000,19.103448,30.685921,22.344828,66.358025,29.853480,69.075452,14.931034,44.959128,36.711281,29.137931,14.965517,18.034483,0.034483,aec,0.3,-14.21,-6.55,68.4,0.419,0.342,110.4,111.3,75.79
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127447,2025,132,1397,1196,0,0.794118,74.735294,45.482866,24.382353,34.016888,20.058824,74.340176,33.637285,73.809524,9.588235,38.171488,27.803204,16.764706,10.705882,25.705882,1.117647,sec,0.7,25.04,12.70,65.4,0.346,0.428,120.6,89.9,77.20,0.882353,85.411765,47.266881,27.911765,35.511064,20.852941,71.791255,37.059859,73.671498,10.264706,39.825581,29.610390,20.058824,11.441176,22.647059,1.176471,sec,0.9,27.82,11.43,70.9,0.322,0.436,125.9,92.0,78.71
127448,2025,132,1412,1272,0,0.636364,81.727273,45.678439,19.181818,34.123223,20.636364,75.624082,37.162750,71.468927,9.484848,45.548451,35.683453,18.000000,10.333333,21.060606,0.848485,aac,0.7,4.54,-1.12,70.1,0.311,0.294,116.9,111.2,76.35,0.852941,80.147059,47.350173,19.676471,37.967115,23.411765,70.226131,34.076137,70.086957,13.382353,41.921397,32.219570,20.352941,12.852941,24.647059,-0.529412,aac,0.9,12.51,5.75,71.3,0.396,0.336,113.8,99.7,75.80
127449,2025,132,1458,1276,0,0.742857,79.714286,45.181598,28.400000,34.909457,19.914286,82.783357,24.836601,75.125628,9.314286,41.431262,33.012048,17.028571,9.685714,23.714286,0.371429,big_ten,0.6,21.15,11.41,68.7,0.340,0.478,121.9,98.2,77.71,0.735294,78.264706,47.397676,24.735294,33.412604,20.352941,72.832370,31.224900,73.067332,13.735294,40.215356,31.325301,17.882353,10.705882,24.411765,-3.029412,big_ten,0.6,19.97,12.91,70.7,0.354,0.421,115.7,95.5,77.81
127450,2025,132,1206,1433,0,0.757576,70.030303,45.198390,18.878788,35.313002,22.000000,71.487603,29.968454,72.364925,12.545455,38.123324,30.306604,17.818182,11.666667,25.696970,-0.878788,a_ten,0.7,7.65,0.80,65.2,0.416,0.364,107.7,96.7,76.88,0.818182,76.333333,44.404701,28.090909,33.549083,19.121212,74.484945,36.397749,72.641509,10.818182,39.031180,30.599369,19.666667,12.878788,19.212121,2.060606,a_ten,0.9,15.26,1.17,66.8,0.322,0.475,116.7,95.5,78.38


In [43]:
## Add team metrics and seeds to NCAA tournament games to create dataset for supervised ML model
pd.set_option("display.max_columns", None)

# .copy() makes the filtered frame independent of the frame returned by
# read_csv, so the column assignments below do not write into a slice
# (which raises SettingWithCopyWarning).
mens_tourney_data = pd.read_csv('../../data/MNCAATourneyCompactResults.csv')
mens_tourney_data = mens_tourney_data[mens_tourney_data['Season'] >= 2014].copy()

# Drop the first character of each seed and any trailing 'a'/'b' characters.
# The result is still a string (e.g. '07', '16'), not an integer.
mens_tourney_seeds = pd.read_csv("../../data/MNCAATourneySeeds.csv")
mens_tourney_seeds['Seed'] = mens_tourney_seeds['Seed'].str[1:].str.rstrip('ab')

# Duplicate the data so there is a record for each winning and losing team.
# Winner's perspective: Team1 is the winner, label 1.
mens_tourney_data['Team1'] = mens_tourney_data['WTeamID']
mens_tourney_data['Team2'] = mens_tourney_data['LTeamID']
mens_tourney_data['Team1_Wins'] = 1

# Loser's perspective: same games with the teams swapped, label 0.
flipped = mens_tourney_data.assign(
    Team1=mens_tourney_data['LTeamID'],
    Team2=mens_tourney_data['WTeamID'],
    Team1_Wins=0,
)

# Stack both perspectives and drop the raw result columns
games = pd.concat([mens_tourney_data, flipped]).drop(
    columns=['WTeamID', 'WScore', 'LTeamID', 'LScore', 'WLoc', 'NumOT']
)

sides = (('_1', 'Team1'), ('_2', 'Team2'))

# Merge games dataset with team stats, once per side.
# The stat columns are suffixed BEFORE each merge, so the result does not
# depend on pandas' automatic _x/_y suffixes. TeamName is not carried over.
merge_keys = ('Season', 'TeamID')
season_metrics = mens_season_data.drop(columns=['TeamName'])

tourney_games = games
for suffix, team_column in sides:
    side_metrics = season_metrics.rename(
        columns=lambda col: col if col in merge_keys else col + suffix
    )
    tourney_games = tourney_games.merge(
        side_metrics,
        left_on=['Season', team_column],
        right_on=['Season', 'TeamID'],
        how='left',
    ).drop(columns=['TeamID'])

# Attach each side's seed in a second pass, so Seed_1 and Seed_2 end up as the
# last two columns, after all stat columns.
for suffix, team_column in sides:
    side_seeds = mens_tourney_seeds.rename(columns={'Seed': 'Seed' + suffix})
    tourney_games = tourney_games.merge(
        side_seeds,
        how='left',
        left_on=['Season', team_column],
        right_on=['Season', 'TeamID'],
    ).drop(columns=['TeamID'])

tourney_games


Unnamed: 0,Season,DayNum,Team1,Team2,Team1_Wins,Win_Percentage_1,Points_Per_Game_1,FG_Percentage_1,Threes_Per_Game_1,Three_Point_Percentage_1,Free_Throws_Per_Game_1,Free_Throw_Percentage_1,Offensive_Rebound_Rate_1,Defensive_Rebound_Rate_1,Turnovers_Per_Game_1,Opp_FG_Percentage_1,Opp_Three_Point_Percentage_1,Opp_Free_Throws_Per_Game_1,Opp_Turnovers_Per_Game_1,Opp_Threes_Per_Game_1,Turnover_Margin_1,ConfAbbrev_1,Win_pct_last_10_games_1,SRS_1,SOS_1,Pace_1,FTr_1,3PAr_1,offensiveRating_1,defensiveRating_1,avg_height_1,Win_Percentage_2,Points_Per_Game_2,FG_Percentage_2,Threes_Per_Game_2,Three_Point_Percentage_2,Free_Throws_Per_Game_2,Free_Throw_Percentage_2,Offensive_Rebound_Rate_2,Defensive_Rebound_Rate_2,Turnovers_Per_Game_2,Opp_FG_Percentage_2,Opp_Three_Point_Percentage_2,Opp_Free_Throws_Per_Game_2,Opp_Turnovers_Per_Game_2,Opp_Threes_Per_Game_2,Turnover_Margin_2,ConfAbbrev_2,Win_pct_last_10_games_2,SRS_2,SOS_2,Pace_2,FTr_2,3PAr_2,offensiveRating_2,defensiveRating_2,avg_height_2,Seed_1,Seed_2
0,2014,134,1107,1291,1,0.562500,66.031250,43.914373,13.468750,36.426914,21.718750,74.820144,32.387476,70.905764,12.625000,41.863140,35.247209,15.843750,11.968750,19.593750,-0.656250,aec,0.7,-3.31,-5.22,63.1,0.420,0.261,110.7,110.2,77.29,0.500000,76.250000,44.068706,24.906250,35.633626,21.687500,74.063401,28.345070,66.855524,12.000000,49.312896,33.626374,20.312500,13.281250,14.218750,1.281250,nec,0.6,-6.32,-4.44,70.0,0.367,0.434,112.5,112.6,76.00,16,16
1,2014,134,1301,1462,1,0.617647,70.794118,45.865434,14.764706,30.278884,20.852941,66.149506,34.790060,63.735343,10.411765,42.438765,31.951641,23.794118,11.529412,17.029412,1.117647,acc,0.5,9.02,7.44,64.9,0.370,0.255,111.1,110.9,77.60,0.636364,72.212121,47.130919,14.727273,35.390947,22.848485,68.832891,34.105653,71.216098,12.363636,42.714127,35.394127,21.696970,11.272727,19.606061,-1.090909,big_east,0.5,11.84,8.14,65.7,0.416,0.270,111.6,110.9,77.12,12,12
2,2014,135,1142,1411,1,0.366667,62.000000,40.371517,18.900000,33.509700,17.766667,68.667917,30.842912,66.953714,8.966667,43.837442,33.978495,21.466667,10.833333,15.500000,1.866667,big_west,0.5,-1.34,1.69,60.8,0.326,0.354,110.2,110.9,75.76,0.562500,75.062500,45.821326,17.593750,35.879218,26.812500,71.095571,33.456905,68.924640,13.218750,44.632768,35.528596,17.906250,10.843750,18.031250,-2.375000,swac,0.9,-8.41,-10.07,68.2,0.491,0.325,111.9,112.3,76.50,16,16
3,2014,135,1397,1234,1,0.625000,70.500000,44.249292,16.875000,32.962963,22.781250,70.781893,39.692586,72.423146,10.875000,41.238318,34.047109,17.687500,10.843750,14.593750,-0.031250,sec,0.6,17.34,7.84,62.8,0.416,0.308,112.2,109.1,77.60,0.625000,82.000000,46.469367,16.562500,35.283019,27.593750,73.272933,38.107639,71.063479,11.375000,41.401274,31.954351,20.500000,13.468750,21.906250,2.093750,big_ten,0.3,18.58,7.43,69.8,0.453,0.275,113.3,110.1,78.73,11,11
4,2014,136,1163,1386,1,0.764706,71.852941,44.835046,18.382353,38.720000,21.000000,76.050420,31.238616,66.563467,11.676471,38.733126,32.894737,20.088235,12.911765,17.882353,1.235294,aac,0.7,17.23,8.63,65.5,0.383,0.343,111.4,107.7,77.35,0.727273,71.272727,46.858790,19.363636,38.028169,22.818182,64.143426,29.488372,72.393661,12.333333,41.842105,33.742331,17.636364,10.151515,19.757576,-2.181818,a_ten,0.8,9.05,5.08,66.0,0.436,0.369,111.4,110.2,77.62,07,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1333,2024,146,1181,1301,0,0.750000,79.843750,48.179420,22.125000,37.711864,20.000000,72.187500,29.538462,76.019417,9.375000,43.244710,32.568807,15.906250,10.906250,20.437500,1.531250,acc,0.7,20.67,8.36,67.1,0.343,0.375,121.8,95.5,78.80,0.611111,76.361111,44.920273,20.388889,34.604905,19.805556,73.352034,27.121464,73.570191,9.222222,44.365193,34.905660,18.666667,11.555556,20.611111,2.333333,acc,0.6,12.81,8.93,68.5,0.320,0.333,114.5,100.1,76.06,04,11
1334,2024,146,1397,1345,0,0.750000,79.468750,44.433198,25.500000,34.191176,21.187500,74.926254,31.881372,73.063063,9.968750,38.930481,31.395349,20.906250,12.500000,24.187500,2.531250,sec,0.7,21.81,10.42,70.3,0.337,0.412,117.3,90.6,77.25,0.878788,83.393939,48.832382,20.424242,40.801187,25.000000,72.121212,36.954315,76.721883,10.969697,41.888620,31.380208,14.393939,9.515152,23.272727,-1.454545,big_ten,0.8,24.93,11.60,68.3,0.413,0.347,125.3,94.6,78.35,02,01
1335,2024,152,1104,1163,0,0.656250,90.750000,47.731660,30.281250,36.532508,22.812500,78.356164,33.523267,72.147350,11.812500,44.067797,31.886024,24.843750,11.281250,23.031250,-0.531250,sec,0.5,20.69,11.80,73.9,0.353,0.465,125.6,101.4,78.80,0.911765,81.470588,49.598796,23.970588,36.687117,19.529412,74.246988,35.360825,76.840215,9.117647,39.802802,31.924883,18.411765,10.088235,18.794118,0.970588,big_east,0.9,26.70,8.70,66.0,0.319,0.402,126.7,92.3,78.36,04,01
1336,2024,152,1301,1345,0,0.611111,76.361111,44.920273,20.388889,34.604905,19.805556,73.352034,27.121464,73.570191,9.222222,44.365193,34.905660,18.666667,11.555556,20.611111,2.333333,acc,0.6,12.81,8.93,68.5,0.320,0.333,114.5,100.1,76.06,0.878788,83.393939,48.832382,20.424242,40.801187,25.000000,72.121212,36.954315,76.721883,10.969697,41.888620,31.380208,14.393939,9.515152,23.272727,-1.454545,big_ten,0.8,24.93,11.60,68.3,0.413,0.347,125.3,94.6,78.35,11,01


In [44]:
# Write the regular-season and tournament modeling tables to data/modeling
output_dir = Path("../..", "data", "modeling")
output_dir.mkdir(parents=True, exist_ok=True)
season_output_path = output_dir.joinpath("mens_reg_season_ml.csv")
tourney_output_path = output_dir.joinpath("mens_tourney_ml.csv")

# Both frames are written without their index
for frame, destination in (
    (full_games, season_output_path),
    (tourney_games, tourney_output_path),
):
    frame.to_csv(destination, index=False)