In [1]:
import pandas as pd
import numpy as np

# Seasons to process: 2003 through 2017 inclusive (the range end is exclusive).
seasons = range(2003, 2018)

# Absolute location of the project checkout; every path below is built from it.
# NOTE(review): machine-specific path -- adjust before running elsewhere.
root = "/home/austin/Github/kaggle-ncaa-2018/"

# Derived input: per-season Elo ratings (read later via the
# "season", "team_id" and "season_elo" columns).
elo = pd.read_csv(root + "derived_data/season_elos.csv")

# Original inputs, loaded unmodified.
seeds = pd.read_csv(root + "original_data/NCAATourneySeeds.csv")
reg_season_composite = pd.read_csv(root + "original_data/RegularSeasonCompositeStats.csv")
tourney_detailed = pd.read_csv(root + "original_data/NCAATourneyDetailedResults.csv")
reg_season_detailed = pd.read_csv(root + "original_data/RegularSeasonDetailedResults.csv")

In [2]:
# Make & start filling initial dataframe
#
# all_data gets one row per (Season, TeamID) pair, where the team appears as
# either the winner or the loser of at least one tournament game that season.
# Rows are collected in a plain list and the frame is built once at the end:
# appending through all_data.loc[index, ...] re-allocates on every write and
# leaves both columns with object dtype.
season_team_records = []
for season in seasons:
    tourney_teams = tourney_detailed.loc[tourney_detailed["Season"] == season, ["WTeamID", "LTeamID"]]
    # pd.unique keeps first-appearance order, so row order matches a
    # team-by-team fill of the same flattened array.
    for team_id in pd.unique(tourney_teams.values.ravel('K')):
        season_team_records.append({"Season": season, "TeamID": team_id})

# Passing columns= keeps the two expected columns even when no rows were found.
all_data = pd.DataFrame(season_team_records, columns=["Season", "TeamID"])

In [3]:
# Add regular season stats
#
# For every (Season, TeamID) row of all_data, store the team's per-game mean of
# each box-score stat over that season's regular-season games. Teams with no
# regular-season games in the data get NaN.
#
# The per-team iterrows loop relied on .as_matrix(), which was removed in
# pandas 1.0; the averages are now computed with a single groupby.

# Relevant stats
stats_columns = ["FGM", "FGA", "FGM3", "FGA3", "FTM", "FTA", "OR", "DR", "Ast", "TO", "Stl", "Blk", "PF"]
# In the detailed results each stat appears twice: prefixed with "W" for the
# winning team and with "L" for the losing team.
w_team_stats = ["W" + stat for stat in stats_columns]
l_team_stats = ["L" + stat for stat in stats_columns]

# Only games from the seasons being processed contribute to the averages.
games_in_scope = reg_season_detailed.loc[reg_season_detailed["Season"].isin(list(seasons))]

# Reshape to one row per (game, team): the winner's row carries the W* stats and
# the loser's row carries the L* stats, both renamed to the neutral stat names.
team_game_frames = []
for id_column, side_columns in (("WTeamID", w_team_stats), ("LTeamID", l_team_stats)):
    renames = dict(zip(side_columns, stats_columns))
    renames[id_column] = "TeamID"
    team_game_frames.append(
        games_in_scope[["Season", id_column] + side_columns].rename(columns=renames)
    )
team_games = pd.concat(team_game_frames, ignore_index=True)

# Mean of every stat per (Season, TeamID).
season_means = team_games.groupby(["Season", "TeamID"])[stats_columns].mean()

# Align the means to all_data's rows by key. The keys are cast to int so the
# lookup works whether all_data stores them as integers or as objects.
row_keys = pd.MultiIndex.from_arrays(
    [all_data["Season"].astype(int).values, all_data["TeamID"].astype(int).values],
    names=["Season", "TeamID"],
)
# Assigning a DataFrame (rather than a bare array) creates the new columns and
# aligns on all_data's own index.
all_data[stats_columns] = pd.DataFrame(
    season_means.reindex(row_keys).values,
    index=all_data.index,
    columns=stats_columns,
)

In [4]:
# Add composite stats, seeds, and elo
#
# For each (Season, TeamID) row of all_data, copy in the matching composite
# stats row, the numeric tournament seed and the season Elo rating.
# .as_matrix() was removed in pandas 1.0, so .values is used instead.

comp_columns = ["PIE", "FG_PCT", "TURNOVER_RATE", "OFF_REB_PCT", "FT_RATE", "4FACTOR", "OFF_EFF", "DEF_EFF",
                "ASSIST_RATIO", "DEF_REB_PCT", "FT_PCT", "WINPCT"]
new_columns = comp_columns + ["Seed", "Elo"]

# Add empty columns for new stats being added (indexed like all_data so the
# assignment aligns whatever all_data's index happens to be).
all_data[new_columns] = pd.DataFrame(index=all_data.index, columns=new_columns)

# Loop through all season/team pairs
for i, row in all_data.iterrows():
    # Get season and team ID for the row
    season = row["Season"]
    team_id = row["TeamID"]

    # Get composite stats
    season_team_data = reg_season_composite.loc[(reg_season_composite["Season"] == season) & (reg_season_composite["TeamID"] == team_id)]
    if len(season_team_data) != 1:
        # Zero or several matches cannot be written into a single row; say so
        # explicitly instead of failing on a shape mismatch during assignment.
        raise ValueError(
            "expected exactly one composite stats row for season %s, team %s; found %d"
            % (season, team_id, len(season_team_data))
        )

    # Add composite stats
    all_data.loc[i, comp_columns] = season_team_data[comp_columns].values.ravel()

    # Add seed: characters 1-2 of the seed string are parsed as the seed number.
    # NOTE(review): presumably the string is a region letter followed by a
    # two-digit seed (plus an optional suffix) -- confirm against the seeds file.
    seed = seeds.loc[(seeds["Season"] == season) & (seeds["TeamID"] == team_id), "Seed"].iloc[0]
    all_data.loc[i, "Seed"] = int(seed[1:3])

    # Add elo (first matching row)
    all_data.loc[i, "Elo"] = elo.loc[(elo["season"] == season) & (elo["team_id"] == team_id), "season_elo"].iloc[0]

In [5]:
# Persist the assembled per-(Season, TeamID) table under derived_data/.
# to_csv is called with its defaults, so the row index is written as well.
master_csv_path = root + "derived_data/Master.csv"
all_data.to_csv(master_csv_path)