In [21]:
import pandas as pd
import numpy as np 
import os 
from pathlib import Path

In [22]:
# Team Info 
mens_teams = pd.read_csv("../../data/MTeams.csv")

# Season Info 
mens_seasons = pd.read_csv("../../data/MSeasons.csv")

# Seed Info 
mens_tourney_seeds = pd.read_csv("../../data/MNCAATourneySeeds.csv")

# Regular Season Data - Each record: gamed
# Regular Seasons Games - All games before day num 132 
mens_reg_season_data = pd.read_csv("../../data/MRegularSeasonCompactResults.csv")

# Tourney Data 
mens_tourney_data = pd.read_csv("../../data/MNCAATourneyCompactResults.csv")

# Regular Season Box Scores 
mens_reg_season_box_scores = pd.read_csv("../../data/MRegularSeasonDetailedResults.csv")

# Tournament Box Scores 
mens_tourney_box_scores = pd.read_csv("../../data/MNCAATourneyDetailedResults.csv")

# Conferences 
conferences = pd.read_csv("../../data/Conferences.csv")
mens_conferences = pd.read_csv("../../data/MTeamConferences.csv")

# Alternate Spellings 
mens_spellings = pd.read_csv("../../data/MTeamSpellings.csv", encoding='ISO-8859-1')

#Tourney Slots 
mens_tourney_slots = pd.read_csv("../../data/MNCAATourneySlots.csv")
mens_tourney_seed_rounds = pd.read_csv("../../data/MNCAATourneySeedRoundSlots.csv")

# Conference Tourney 
mens_conf_tourney = pd.read_csv("../../data/MConferenceTourneyGames.csv")

# Geography  
cities = pd.read_csv("../../data/Cities.csv")
mens_cities = pd.read_csv("../../data/MGameCities.csv")

# Public Rankings 
public_rankings = pd.read_csv("../../data/MMasseyOrdinals.csv")

# Coaches 
mens_coaches = pd.read_csv("../../data/MTeamCoaches.csv")

# Sample Submission
sample = pd.read_csv("../../data/SampleSubmissionStage1.csv")


In [23]:
### Calculate summary statistics about each mens basketball team for each season 

winning_teams = mens_reg_season_box_scores.groupby(['Season', 'WTeamID']).agg({
    'WScore': 'sum', 
    'LScore': 'sum',     
    'WFGM': 'sum', 
    'WFGA': 'sum',
    'WFGM3': 'sum',
    'WFGA3': 'sum',
    'WFTM': 'sum',
    'WFTA': 'sum',
    'WOR': 'sum', 
    'WDR': 'sum', 
    'WAst': 'sum', 
    'WTO': 'sum', 
    'LFGM': 'sum', 
    'LFGA': 'sum', 
    'LFGM3': 'sum',
    'LFGA3': 'sum',
    'LFTM': 'sum', 
    'LFTA': 'sum', 
    'LOR': 'sum', 
    'LDR': 'sum', 
    'LTO': 'sum'
}).reset_index() 

winning_teams['Wins'] = mens_reg_season_box_scores.groupby(['Season', 'WTeamID']).size().values

losing_teams = mens_reg_season_box_scores.groupby(['Season', 'LTeamID']).agg({  
    'LScore': 'sum',  
    'WScore': 'sum', 
    'LFGM': 'sum', 
    'LFGA': 'sum', 
    'LFGM3': 'sum',
    'LFGA3': 'sum',
    'LFTM': 'sum', 
    'LFTA': 'sum', 
    'LOR': 'sum', 
    'LDR': 'sum', 
    'LTO': 'sum',
    'LAst': 'sum',  
    'WFGM': 'sum', 
    'WFGA': 'sum',
    'WFGM3': 'sum',
    'WFGA3': 'sum',
    'WFTM': 'sum',
    'WFTA': 'sum',
    'WOR': 'sum', 
    'WDR': 'sum', 
    'WTO': 'sum'
}).reset_index()

losing_teams['Losses'] = mens_reg_season_box_scores.groupby(['Season', 'LTeamID']).size().values

merged_df = pd.merge(winning_teams, losing_teams, left_on=['Season', 'WTeamID'], right_on = ['Season', 'LTeamID'], how='outer', suffixes=('_1', '_2'))

# Combine the Stats for Winning and Losing games 
merged_df['Points_Scored'] = merged_df['WScore_1'] + merged_df['LScore_2']
merged_df['FG_Made'] = merged_df['WFGM_1'] + merged_df['LFGM_2']
merged_df['FG_Atts'] = merged_df['WFGA_1'] + merged_df['LFGA_2']
merged_df['Threes_Made'] = merged_df['WFGM3_1'] + merged_df['LFGM3_2']
merged_df['Threes_Attempted'] = merged_df['WFGA3_1'] + merged_df['LFGA3_2']
merged_df['FreeThrows_Made'] = merged_df['WFTM_1'] + merged_df['LFTM_2']
merged_df['FreeThrows_Attempted'] = merged_df['WFTA_1'] + merged_df['LFTA_2']
merged_df['Offensive_Rebounds'] = merged_df['WOR_1'] + merged_df['LOR_2']
merged_df['Defensive_Rebounds'] = merged_df['WDR_1'] + merged_df['LDR_2']
merged_df['Assists'] = merged_df['WAst'] + merged_df['LAst']
merged_df['Turnovers'] = merged_df['WTO_1'] + merged_df['LTO_2']
merged_df['Points_Allowed'] = merged_df['LScore_1'] + merged_df['WScore_2']
merged_df['Opposing_FG_Made'] = merged_df['LFGM_1'] + merged_df['WFGM_2']
merged_df['Opposing_FG_Atts'] = merged_df['LFGA_1'] + merged_df['WFGA_2']
merged_df['Opposing_Threes_Made'] = merged_df['LFGM3_1'] + merged_df['WFGM3_2']
merged_df['Opposing_Threes_Attempted'] = merged_df['LFGA3_1'] + merged_df['WFGA3_2']
merged_df['Opposing_FreeThrows_Made'] = merged_df['LFTM_1'] + merged_df['WFTM_2']
merged_df['Opposing_FreeThrows_Attempted'] = merged_df['LFTA_1'] + merged_df['WFTA_2']
merged_df['Opposing_Offensive_Rebounds'] = merged_df['LOR_1'] + merged_df['WOR_2']
merged_df['Opposing_Defensive_Rebounds'] = merged_df['LDR_1'] + merged_df['WDR_2']
merged_df['Opposing_Turnovers'] = merged_df['LTO_1'] + merged_df['WTO_2']

merged_df['TeamID'] = merged_df['WTeamID'].combine_first(merged_df['LTeamID'])

output = [
    'Season', 'Losses', 'Wins', 'TeamID', 'Points_Scored', 'FG_Made', 'FG_Atts', 
    'Threes_Made', 'Threes_Attempted', 'FreeThrows_Made', 
    'FreeThrows_Attempted', 'Offensive_Rebounds', 'Defensive_Rebounds', 
    'Assists', 'Turnovers', 'Points_Allowed', 'Opposing_FG_Made', 
    'Opposing_FG_Atts', 'Opposing_Threes_Made', 'Opposing_Threes_Attempted', 
    'Opposing_FreeThrows_Made', 'Opposing_FreeThrows_Attempted', 
    'Opposing_Offensive_Rebounds', 'Opposing_Defensive_Rebounds', 'Opposing_Turnovers'
]

team_summary_stats = merged_df.copy() 
team_summary_stats = team_summary_stats[output]

team_summary_stats['Total_Games'] = team_summary_stats['Wins'] + team_summary_stats['Losses']
team_summary_stats['Points_Per_Game'] = team_summary_stats['Points_Scored'] / team_summary_stats['Total_Games']
team_summary_stats['Win_Percentage'] = team_summary_stats['Wins'] / team_summary_stats['Total_Games']
team_summary_stats['FG_Percentage'] = (team_summary_stats['FG_Made'] / team_summary_stats['FG_Atts']) * 100
team_summary_stats['Threes_Per_Game'] = team_summary_stats['Threes_Attempted'] / team_summary_stats['Total_Games']
team_summary_stats['Turnovers_Per_Game'] = team_summary_stats['Turnovers'] / team_summary_stats['Total_Games']
team_summary_stats['Three_Point_Percentage'] = (team_summary_stats['Threes_Made'] / team_summary_stats['Threes_Attempted']) * 100
team_summary_stats['Free_Throws_Per_Game'] = team_summary_stats['FreeThrows_Attempted'] / team_summary_stats['Total_Games']
team_summary_stats['Free_Throw_Percentage'] = (team_summary_stats['FreeThrows_Made'] / team_summary_stats['FreeThrows_Attempted']) * 100
team_summary_stats['Offensive_Rebound_Rate'] = (team_summary_stats['Offensive_Rebounds'] / (team_summary_stats['Offensive_Rebounds'] + team_summary_stats['Opposing_Defensive_Rebounds'])) * 100
team_summary_stats['Defensive_Rebound_Rate'] = (team_summary_stats['Defensive_Rebounds'] / (team_summary_stats['Defensive_Rebounds'] + team_summary_stats['Opposing_Offensive_Rebounds'])) * 100
team_summary_stats['Opp_FG_Percentage'] = (team_summary_stats['Opposing_FG_Made'] / team_summary_stats['Opposing_FG_Atts']) * 100
team_summary_stats['Opp_Three_Point_Percentage'] = (team_summary_stats['Opposing_Threes_Made'] / team_summary_stats['Opposing_Threes_Attempted']) * 100
team_summary_stats['Opp_Free_Throws_Per_Game'] = team_summary_stats['Opposing_FreeThrows_Attempted'] / team_summary_stats['Total_Games']
team_summary_stats['Opp_Turnovers_Per_Game'] = team_summary_stats['Opposing_Turnovers'] / team_summary_stats['Total_Games']

team_summary_stats.fillna(0, inplace=True)

# Add team names to records so external data can be joined in 
mens_teams_subset = mens_teams[['TeamID', 'TeamName']]
team_summary_stats = team_summary_stats.merge(mens_teams_subset, on=['TeamID'], how='left')

percentage_columns = [
    'Season', 'TeamName', 'TeamID', 'Win_Percentage', 'Points_Per_Game', 'FG_Percentage', 'Threes_Per_Game', 'Three_Point_Percentage', 
    'Free_Throws_Per_Game', 'Free_Throw_Percentage', 'Offensive_Rebound_Rate', 'Defensive_Rebound_Rate', 
    'Turnovers_Per_Game', 'Opp_FG_Percentage', 'Opp_Three_Point_Percentage', 'Opp_Free_Throws_Per_Game', 'Opp_Turnovers_Per_Game'
]

team_summary_stats = team_summary_stats[percentage_columns]

# Add Conferences and each record and one-hot encode them 
team_summary_stats = team_summary_stats.merge(mens_conferences, on=['Season', 'TeamID'], how='left')
#team_summary_stats = pd.get_dummies(team_summary_stats, columns=['ConfAbbrev'], prefix='Conf', dtype=int)
team_summary_stats.head(5) 

Unnamed: 0,Season,TeamName,TeamID,Win_Percentage,Points_Per_Game,FG_Percentage,Threes_Per_Game,Three_Point_Percentage,Free_Throws_Per_Game,Free_Throw_Percentage,Offensive_Rebound_Rate,Defensive_Rebound_Rate,Turnovers_Per_Game,Opp_FG_Percentage,Opp_Three_Point_Percentage,Opp_Free_Throws_Per_Game,Opp_Turnovers_Per_Game,ConfAbbrev
0,2003,Air Force,1102.0,0.428571,57.25,48.114901,20.821429,37.564322,17.107143,65.135699,17.180617,63.648649,11.428571,45.454545,38.218391,19.25,12.964286,mwc
1,2003,Akron,1103.0,0.481481,78.777778,48.607427,16.074074,33.870968,25.851852,73.638968,30.733411,62.340672,12.62963,48.732943,36.290323,22.148148,15.333333,mac
2,2003,Alabama,1104.0,0.607143,69.285714,42.036227,19.857143,32.014388,20.928571,70.989761,37.475345,68.717949,13.285714,41.891892,33.208955,17.142857,13.857143,sec
3,2003,Alabama A&M,1105.0,0.269231,71.769231,39.575531,20.769231,36.481481,21.846154,70.598592,33.847637,63.665254,18.653846,45.792564,35.745614,24.5,18.807692,swac
4,2003,Alabama St,1106.0,0.464286,63.607143,42.377261,17.642857,34.615385,16.464286,64.642082,35.463918,67.817259,17.035714,40.668896,31.455399,21.964286,15.071429,swac


In [24]:
### Calculate the Win Percentage for teams in the final 10 games of the season 


# Combine the winning and losing teams into a single DataFrame
win_data = mens_reg_season_data[['Season', 'DayNum', 'WTeamID']].copy()
win_data['IsWin'] = 1  # 1 for win

loss_data = mens_reg_season_data[['Season', 'DayNum', 'LTeamID']].copy()
loss_data['IsWin'] = 0  # 0 for loss

# Rename the columns to match for concatenation
win_data = win_data.rename(columns={'WTeamID': 'TeamID'})
loss_data = loss_data.rename(columns={'LTeamID': 'TeamID'})

# Combine the win and loss data
all_games = pd.concat([win_data, loss_data], ignore_index=True)

# Rank the games for each team (by DayNum, descending)
all_games['GameRank'] = all_games.groupby(['TeamID', 'Season'])['DayNum'].rank(method='first', ascending=False)

# Filter the last 10 games for each team
last_10_games = all_games[all_games['GameRank'] <= 10]

# Calculate wins and win percentage for each team in each season
win_percentage = last_10_games.groupby(['TeamID', 'Season']).agg(
    Wins=('IsWin', 'sum'),
    TotalGames=('IsWin', 'size')
).reset_index()

# Calculate win percentage
win_percentage['WinPercentage'] = (win_percentage['Wins'] / win_percentage['TotalGames'])
win_percentage = win_percentage.drop(columns=['Wins', 'TotalGames'])
win_percentage = win_percentage.rename(columns={'WinPercentage':'Win_pct_last_10_games'})

team_full_stats = team_summary_stats.merge(win_percentage, how='left', on=['Season','TeamID'])
team_full_stats.head(5) 

Unnamed: 0,Season,TeamName,TeamID,Win_Percentage,Points_Per_Game,FG_Percentage,Threes_Per_Game,Three_Point_Percentage,Free_Throws_Per_Game,Free_Throw_Percentage,Offensive_Rebound_Rate,Defensive_Rebound_Rate,Turnovers_Per_Game,Opp_FG_Percentage,Opp_Three_Point_Percentage,Opp_Free_Throws_Per_Game,Opp_Turnovers_Per_Game,ConfAbbrev,Win_pct_last_10_games
0,2003,Air Force,1102.0,0.428571,57.25,48.114901,20.821429,37.564322,17.107143,65.135699,17.180617,63.648649,11.428571,45.454545,38.218391,19.25,12.964286,mwc,0.2
1,2003,Akron,1103.0,0.481481,78.777778,48.607427,16.074074,33.870968,25.851852,73.638968,30.733411,62.340672,12.62963,48.732943,36.290323,22.148148,15.333333,mac,0.5
2,2003,Alabama,1104.0,0.607143,69.285714,42.036227,19.857143,32.014388,20.928571,70.989761,37.475345,68.717949,13.285714,41.891892,33.208955,17.142857,13.857143,sec,0.4
3,2003,Alabama A&M,1105.0,0.269231,71.769231,39.575531,20.769231,36.481481,21.846154,70.598592,33.847637,63.665254,18.653846,45.792564,35.745614,24.5,18.807692,swac,0.3
4,2003,Alabama St,1106.0,0.464286,63.607143,42.377261,17.642857,34.615385,16.464286,64.642082,35.463918,67.817259,17.035714,40.668896,31.455399,21.964286,15.071429,swac,0.4


In [25]:
output_dir = Path("../..") / "data" / "preprocessing"
output_dir.mkdir(parents=True, exist_ok =True)
output_path = output_dir / "mens_summary_season_data.csv"
team_full_stats.to_csv(output_path, index=False)