In [9]:
import pandas as pd 
import numpy as np 

In [69]:
### Join each team's season stats to its season ratings, matching on the team name spelling
stats_summary = pd.read_csv('../data/preprocessing/mens_summary_season_data.csv')
# netRating is discarded immediately after loading.
ratings_summary = pd.read_csv('../data/preprocessing/mens_season_ratings.csv').drop(columns=['netRating'])
team_spellings = pd.read_csv('../data/MTeamSpellings.csv', encoding='ISO-8859-1')

# Manual spelling overrides found while reconciling the two sources.
# TeamIDs flagged as needing a rename:
# 1107, 1111, 1216, 1271, 1274, 1363, 1366, 1383, 1410, 1419, 1472, 1474
# Only the eight IDs below get a spelling override in this cell; 1216, 1363, 1366 and 1383 do not.
# Note that every spelling row of a listed TeamID is overwritten with the same value.
spelling_overrides = {
    1107: 'ualbany',
    1111: 'app state',
    1271: 'maryland eastern shore',
    1274: 'miami',
    1410: 'ut rio grande valley',
    1419: 'ul monroe',
    1472: 'st. thomas-minnesota',
    1474: 'queens university',
}
for team_id, spelling in spelling_overrides.items():
    team_spellings.loc[team_spellings['TeamID'] == team_id, 'TeamNameSpelling'] = spelling

# One fix is applied on the ratings side instead: ratings teamID 260 is renamed.
ratings_summary.loc[ratings_summary['teamID'] == 260, 'team'] = 'san jose st'

# TeamIDs that need new records: none listed yet.

# Normalise the ratings team names to lower case so they compare equal to the spellings.
ratings_summary['team'] = ratings_summary['team'].str.lower()

# Keep seasons from 2014 onwards because that was what was available in the ratings data.
filtered_stats = stats_summary[stats_summary['Season'] >= 2014]

# Attach every known spelling to each season-stats row (one output row per spelling),
# lower-casing the spelling so it lines up with the ratings names.
all_spellings = (
    filtered_stats
    .merge(team_spellings, how='left', on='TeamID')
    .assign(TeamNameSpelling=lambda frame: frame['TeamNameSpelling'].str.lower())
)

# Left-join the ratings on (season, spelling); spellings without a ratings match keep
# NaN in the ratings columns.
combined = all_spellings.merge(
    ratings_summary,
    how='left',
    left_on=['Season', 'TeamNameSpelling'],
    right_on=['season', 'team'],
)



In [70]:
### Identify teams whose spellings never matched a ratings row (rename candidates)

# TeamIDs that need to be renamed
# 1107, 1111, 1216, 1271, 1274, 1363, 1366, 1383, 1410, 1419, 1472, 1474

# Team IDs with no associated ranking, set to the lower quartile
# 1216, 1366, 1383

# For each TeamID, flag whether 'team' is null on every one of its rows in `combined`.
all_rows_unmatched = combined['team'].isnull().groupby(combined['TeamID']).all()
team_null_check = all_rows_unmatched.reset_index()

# Keep only the TeamIDs for which no row picked up a ratings match.
team_null_check = team_null_check[team_null_check['team']]

# View teams that do not have a matching spelling in the ratings dataset
team_null_check.head(50)

Unnamed: 0,TeamID,team
108,1216.0,True
255,1366.0,True
272,1383.0,True


In [78]:
### Impute ratings for the three teams that have no ratings row, using the 25th percentile
### of offensive rating, defensive rating, and srs rating across all matched team-seasons

# TeamIDs without any ratings match, mapped to the placeholder name written into 'team'.
teams_missing_ratings = {
    1216: 'hartford',
    1366: 'savannah st',
    1383: 'st francis ny',
}
rating_columns = ['offensiveRating', 'defensiveRating', 'srs_rating']

# Build the quantile sample from rows that matched a real ratings record.
# The placeholder teams are excluded explicitly: this cell writes imputed values into
# `combined` below, so without the exclusion a re-run would feed those imputed rows back
# into the quartile calculation and shift the results.
has_real_rating = combined['team'].notnull() & ~combined['TeamID'].isin(list(teams_missing_ratings))
non_null_teams = combined[has_real_rating]
# Collapse the per-spelling duplicates to one row per (Season, TeamID).
final_result = non_null_teams.groupby(['Season', 'TeamID']).first().reset_index()

# Find the lower quartile
# NOTE(review): the sample rows displayed in this notebook suggest that a lower
# defensiveRating accompanies a higher srs_rating; if so, the 25th percentile of
# defensiveRating is a better-than-typical defence rather than a weak one.
# Confirm whether quantile(0.75) was intended for that column.
lower_quartile_offensive = final_result['offensiveRating'].quantile(0.25)
lower_quartile_defensive = final_result['defensiveRating'].quantile(0.25)
lower_quartile_srs = final_result['srs_rating'].quantile(0.25)

# Display the results
print(f"Lower Quartile (25th percentile) for Offensive Rating: {lower_quartile_offensive}")
print(f"Lower Quartile (25th percentile) for Defensive Rating: {lower_quartile_defensive}")
print(f"Lower Quartile (25th percentile) for SRS Rating: {lower_quartile_srs}")

# Write the placeholder name and the lower-quartile ratings onto every row of each
# team that is missing ratings (1216, 1366, 1383).
for team_id, placeholder_name in teams_missing_ratings.items():
    combined.loc[combined['TeamID'] == team_id, ['team'] + rating_columns] = [
        placeholder_name,
        lower_quartile_offensive,
        lower_quartile_defensive,
        lower_quartile_srs,
    ]

combined.head(5)




Lower Quartile (25th percentile) for Offensive Rating: 102.5
Lower Quartile (25th percentile) for Defensive Rating: 102.9
Lower Quartile (25th percentile) for SRS Rating: -4.625


Unnamed: 0,Season,TeamName,TeamID,Win_Percentage,Points_Per_Game,FG_Percentage,Threes_Per_Game,Three_Point_Percentage,Free_Throws_Per_Game,Free_Throw_Percentage,...,Conf_swac,Conf_wac,Conf_wcc,TeamNameSpelling,season,teamID,team,offensiveRating,defensiveRating,srs_rating
0,2014,Abilene Chr,1101.0,0.095238,63.142857,40.550807,17.857143,37.333333,21.190476,74.606742,...,0,0,0,abilene chr,,,,,,
1,2014,Abilene Chr,1101.0,0.095238,63.142857,40.550807,17.857143,37.333333,21.190476,74.606742,...,0,0,0,abilene christian,2014.0,1.0,abilene christian,110.5,113.6,-15.4
2,2014,Abilene Chr,1101.0,0.095238,63.142857,40.550807,17.857143,37.333333,21.190476,74.606742,...,0,0,0,abilene-christian,,,,,,
3,2014,Air Force,1102.0,0.357143,64.571429,42.591316,21.714286,32.894737,19.25,69.016698,...,0,0,0,air force,2014.0,2.0,air force,110.0,111.7,-7.0
4,2014,Air Force,1102.0,0.357143,64.571429,42.591316,21.714286,32.894737,19.25,69.016698,...,0,0,0,air-force,,,,,,


In [None]:
### Build the final joined dataset: one row per (Season, TeamID)

# Helper columns that only existed to perform the join.
join_helper_columns = ['TeamNameSpelling', 'season', 'teamID', 'team']

mens_season_data = (
    combined
    .dropna(subset=['team'])            # keep rows where the 'team' field is not null
    .groupby(['Season', 'TeamID'])
    .first()                            # collapse the remaining rows to one per team-season
    .reset_index()
    .drop(columns=join_helper_columns)  # drop unnecessary columns
)

mens_season_data



Unnamed: 0,Season,TeamID,TeamName,Win_Percentage,Points_Per_Game,FG_Percentage,Threes_Per_Game,Three_Point_Percentage,Free_Throws_Per_Game,Free_Throw_Percentage,...,Conf_swac,Conf_wac,Conf_wcc,TeamNameSpelling,season,teamID,team,offensiveRating,defensiveRating,srs_rating
0,2014,1101.0,Abilene Chr,0.095238,63.142857,40.550807,17.857143,37.333333,21.190476,74.606742,...,0,0,0,abilene christian,2014.0,1.0,abilene christian,110.5,113.6,-15.4
1,2014,1102.0,Air Force,0.357143,64.571429,42.591316,21.714286,32.894737,19.250000,69.016698,...,0,0,0,air force,2014.0,2.0,air force,110.0,111.7,-7.0
2,2014,1103.0,Akron,0.636364,67.909091,43.190661,20.393939,34.769688,22.212121,61.800819,...,0,0,0,akron,2014.0,3.0,akron,111.2,110.7,1.8
3,2014,1104.0,Alabama,0.387097,66.709677,43.835616,17.258065,33.271028,22.774194,68.271955,...,0,0,0,alabama,2014.0,5.0,alabama,109.7,109.8,2.8
4,2014,1105.0,Alabama A&M,0.428571,63.821429,40.551446,18.892857,32.325142,22.071429,66.343042,...,1,0,0,alabama a&m,2014.0,4.0,alabama a&m,109.8,110.7,-4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4249,2025,1476.0,Stonehill,0.480000,68.480000,43.860947,24.480000,36.274510,16.800000,72.380952,...,0,0,0,stonehill,2025.0,284.0,stonehill,102.1,114.1,-5.8
4250,2025,1477.0,East Texas A&M,0.111111,63.925926,41.465054,26.703704,31.345354,15.407407,63.942308,...,0,0,0,east texas a&m,2025.0,76.0,east texas a&m,96.0,111.8,-10.7
4251,2025,1478.0,Le Moyne,0.280000,72.000000,44.094488,23.200000,31.896552,21.640000,70.794824,...,0,0,0,le moyne,2025.0,140.0,le moyne,101.6,120.3,-10.3
4252,2025,1479.0,Mercyhurst,0.384615,64.884615,41.601144,19.461538,34.584980,16.692308,80.184332,...,0,0,0,mercyhurst,2025.0,165.0,mercyhurst,100.3,116.6,-8.4
