In [8]:
import pandas as pd

master_df= pd.read_csv('./master_data/all_seasons_v2.csv')
master_df.shape

(481311, 59)

In [9]:
renamed_columns = {
    'Team_ID': 'team_id',
    'Game_ID': 'game_id',
    'PLAYER_ID': 'player_id',
    'GAME_DATE': 'game_date',
    'MATCHUP': 'matchup',
    'W': 'team_wins', # per game
    'L': 'team_losses', # per game
    'W_PCT': 'team_win_percentage',
    'MIN_x': 'team_minutes_played',
    'FGM': 'team_field_goals_made',
    'FGA': 'team_field_goals_attempted',
    'FG_PCT': 'team_field_goal_percentage',
    'FG3M': 'team_three_points_made',
    'FG3A': 'team_three_points_attempted',
    'FG3_PCT': 'team_three_point_percentage',
    'FTM': 'team_free_throws_made',
    'FTA': 'team_free_throws_attempted',
    'FT_PCT': 'team_free_throw_percentage',
    'OREB': 'team_offensive_rebounds',
    'DREB': 'team_defensive_rebounds',
    'REB': 'team_total_rebounds',
    'AST': 'team_assists',
    'STL': 'team_steals',
    'BLK': 'team_blocks',
    'TOV': 'team_turnovers',
    'PF': 'team_personal_fouls',
    'PTS': 'team_points',
    'TEAM_ABBREVIATION': 'team_abbreviation',
    'TEAM_CITY': 'team_city',
    'PLAYER_NAME': 'player_name',
    'NICKNAME': 'nickname',
    'START_POSITION': 'start_position',
    'COMMENT': 'comment',
    'MIN_y': 'minutes_played_player_per_game',
    'E_OFF_RATING': 'estimated_offensive_rating',
    'OFF_RATING': 'offensive_rating',
    'E_DEF_RATING': 'estimated_defensive_rating',
    'DEF_RATING': 'defensive_rating',
    'E_NET_RATING': 'estimated_net_rating',
    'NET_RATING': 'net_rating',
    'AST_PCT': 'assist_percentage',
    'AST_TOV': 'assist_to_turnover_ratio',
    'AST_RATIO': 'assist_ratio',
    'OREB_PCT': 'offensive_rebound_percentage',
    'DREB_PCT': 'defensive_rebound_percentage',
    'REB_PCT': 'rebound_percentage',
    'TM_TOV_PCT': 'team_turnover_percentage',
    'EFG_PCT': 'effective_field_goal_percentage',
    'TS_PCT': 'true_shooting_percentage',
    'USG_PCT': 'usage_percentage',
    'E_USG_PCT': 'estimated_usage_percentage',
    'E_PACE': 'estimated_pace',
    'PACE': 'pace',
    'PACE_PER40': 'pace_per_40_minutes',
    'POSS': 'possessions',
    'PIE': 'player_impact_estimate',
    'WL': 'win_loss'
}

master_df.rename(columns=renamed_columns, inplace=True)
master_df.head()

Unnamed: 0,team_id,game_id,game_date,matchup,win_loss,team_wins,team_losses,team_win_percentage,team_minutes_played,team_field_goals_made,...,team_turnover_percentage,effective_field_goal_percentage,true_shooting_percentage,usage_percentage,estimated_usage_percentage,estimated_pace,pace,pace_per_40_minutes,possessions,player_impact_estimate
0,1610612747,22001072,"MAY 16, 2021",LAL @ NOP,W,42,30,0.583,240,45,...,6.5,0.545,0.546,0.357,0.36,109.99,106.6,88.83,60,0.182
1,1610612747,22001072,"MAY 16, 2021",LAL @ NOP,W,42,30,0.583,240,45,...,6.3,0.417,0.478,0.195,0.198,106.36,102.23,85.2,65,0.087
2,1610612747,22001072,"MAY 16, 2021",LAL @ NOP,W,42,30,0.583,240,45,...,13.3,0.545,0.528,0.263,0.27,111.96,107.26,89.38,47,0.145
3,1610612747,22001072,"MAY 16, 2021",LAL @ NOP,W,42,30,0.583,240,45,...,0.0,0.625,0.625,0.121,0.119,106.92,101.3,84.41,58,0.089
4,1610612747,22001072,"MAY 16, 2021",LAL @ NOP,W,42,30,0.583,240,45,...,0.0,0.545,0.545,0.169,0.175,100.63,99.02,82.52,59,0.089


In [10]:
# Addinng seasons as a factor

import numpy as np
 
# Step 1: Determine the Season Year
# Create a new column 'season_year' to identify the season based on the 'GAME_DATE'

master_df['game_date'] = pd.to_datetime(master_df['game_date'], format='%b %d, %Y')

master_df['season_year'] = np.where(master_df['game_date'].dt.month >= 10, 
                                      master_df['game_date'].dt.year, 
                                      master_df['game_date'].dt.year - 1)
 
# Step 2: Map each season year to a unique season identifier
# First, get the sorted unique season years
unique_season_years = master_df['season_year'].unique()
unique_season_years.sort()
 
# Create a mapping from season years to "Season1", "Season2", etc.
season_mapping = {year: f'Season-{i+1}' for i, year in enumerate(unique_season_years)}
 
# Map the 'season_year' to the new 'Seasons' column using the mapping
master_df['seasons'] = master_df['season_year'].map(season_mapping)
 
# Example output
master_df[['game_date', 'season_year', 'seasons']].tail(5)  

Unnamed: 0,game_date,season_year,seasons
481306,2022-10-19,2022,Season-8
481307,2022-10-19,2022,Season-8
481308,2022-10-19,2022,Season-8
481309,2022-10-19,2022,Season-8
481310,2022-10-19,2022,Season-8


Check if it was a home/away game

In [11]:
master_df['home_away'] = master_df['matchup'].apply(lambda x: 1 if "vs." in x else 0)
master_df[[ 'season_year', 'home_away']].head()


Unnamed: 0,season_year,home_away
0,2020,0
1,2020,0
2,2020,0
3,2020,0
4,2020,0


In [12]:
team_attribute_col_drop_list = [col for col in master_df.columns if col.startswith('team_')]
master_df.drop(columns=team_attribute_col_drop_list, inplace=True)

In [13]:
master_df.head()

Unnamed: 0,game_id,game_date,matchup,win_loss,GAME_ID,TEAM_ID,player_id,player_name,nickname,start_position,...,usage_percentage,estimated_usage_percentage,estimated_pace,pace,pace_per_40_minutes,possessions,player_impact_estimate,season_year,seasons,home_away
0,22001072,2021-05-16,LAL @ NOP,W,22001072,1610612747,2544,LeBron James,LeBron,F,...,0.357,0.36,109.99,106.6,88.83,60,0.182,2020,Season-6,0
1,22001072,2021-05-16,LAL @ NOP,W,22001072,1610612747,203076,Anthony Davis,Anthony,F,...,0.195,0.198,106.36,102.23,85.2,65,0.087,2020,Season-6,0
2,22001072,2021-05-16,LAL @ NOP,W,22001072,1610612747,203083,Andre Drummond,Andre,C,...,0.263,0.27,111.96,107.26,89.38,47,0.145,2020,Season-6,0
3,22001072,2021-05-16,LAL @ NOP,W,22001072,1610612747,203484,Kentavious Caldwell-Pope,Kentavious,G,...,0.121,0.119,106.92,101.3,84.41,58,0.089,2020,Season-6,0
4,22001072,2021-05-16,LAL @ NOP,W,22001072,1610612747,203471,Dennis Schroder,Dennis,G,...,0.169,0.175,100.63,99.02,82.52,59,0.089,2020,Season-6,0
