# Title: NBA Games - Data Preparation Part 3
## Author: Andy Orie

**Objectives:** 
Perform the following data cleaning, feature engineering and transformation tasks on the gm_details file, which contains player stats for specific games played per season:
1. Fix column names to improve readability.
2. Fix structural issues, such as how the minutes column displays data.
3. Identify and fix anomalies in the data types.
4. Identify and deal with missing values.
5. Transform the data into a format that yields aggregated/summarized data used to calculate advanced stats.
6. Transform the dataset to include new features with advanced stats, using a Python script I created named "nba_adv_stat".

In [None]:
# Record the date on which this notebook is being run.

import datetime

# Take the calendar-date part of the current local timestamp.
right_now = datetime.datetime.now()
current_date = right_now.date()

print('The Current Date is:', current_date)

In [None]:
# Global Settings and Imported Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import os


%matplotlib inline
%config InlineBackend.figure_format='retina'

from matplotlib import rcParams
# Set the default figure size used by matplotlib for the rest of the session.
# The value is a (width, height) pair in inches: 12 wide by 5 tall.
rcParams['figure.figsize'] = 12,5

from matplotlib import style
from IPython.display import display

In [None]:
# Let's look at our folder and files to ensure we have what we need.

import chkdir

In [None]:
# Load the per-player, per-game details, keeping only the columns used in this notebook.

detail_columns = [
    'GAME_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'PLAYER_ID', 'PLAYER_NAME',
    'START_POSITION', 'MIN',
    'FGM', 'FGA', 'FG_PCT',
    'FG3M', 'FG3A', 'FG3_PCT',
    'FTM', 'FTA', 'FT_PCT',
    'OREB', 'DREB', 'REB',
    'AST', 'STL', 'BLK', 'TO', 'PF', 'PTS',
    'PLUS_MINUS',
]

gm_details_df = pd.read_csv('games_details.csv', usecols=detail_columns)

# Preview the first five rows.
gm_details_df.head(5)

In [None]:
# Shorten three verbose column names; rename() returns a new frame and leaves gm_details_df as is.

readable_names = {
    'TEAM_ABBREVIATION': 'TEAM_ABB',
    'START_POSITION': 'POS',
    'PLAYER_NAME': 'PLAYER',
}
new_gm_df = gm_details_df.rename(columns=readable_names)

# Preview the first five rows with the new headers.
new_gm_df.head(5)

In [None]:
# Inspect the renamed frame: print its (rows, columns) shape, then a 100-character
# divider, and finally show the dtype pandas assigned to each column.

print('The data set shape is:',new_gm_df.shape)
print(100*'-')
new_gm_df.dtypes

In [None]:
# Insert a new column called "SECS" with each row's playing time converted to whole seconds.
# 'MIN' entries are read as text: either "MM:SS" or a bare minute count. Missing entries count as 0.


def _min_to_secs(value):
    """Convert a single 'MIN' entry to a whole number of seconds.

    A missing value (its text form is "nan") yields 0.
    A two-field "MM:SS" entry yields MM*60 + SS.
    Any other entry is treated as a minute count: only the field before the
    first ':' is used and it is multiplied by 60.
    Each field goes through abs(), so a negative entry cannot produce negative time.

    Raises ValueError if a field is not an integer literal.
    """
    parts = str(value).split(':')
    if "nan" in parts:
        return 0

    # The earlier `parts[0] != 0` test compared text with an int and was always
    # true, so its fallback branch could never run; it has been removed.
    secs = abs(int(parts[0]) * 60)
    if len(parts) == 2:
        secs += abs(int(parts[1]))
    return secs


seconds_list = [_min_to_secs(t) for t in new_gm_df['MIN']]

# Column position 7 places 'SECS' directly after 'MIN'.
new_gm_df.insert(7, 'SECS', seconds_list)

display(new_gm_df.head(5))

In [None]:
# Derive decimal minutes from 'SECS' (seconds / 60) and place them in a new 'MINS' column.


def calc_mins(x):
    """Return x seconds expressed in minutes."""
    return x/60


mins_list = [calc_mins(secs) for secs in new_gm_df['SECS']]

# Column position 8 places 'MINS' directly after 'SECS'.
new_gm_df.insert(8, 'MINS', mins_list)

display(new_gm_df.head(5))

In [None]:
# 'MINS' now carries the playing time, so drop the raw 'MIN' text and the intermediate
# 'SECS' column, then round the numeric columns to 2 decimal places.

new_gm_df1 = new_gm_df.drop(columns=['MIN', 'SECS']).round(2)

In [None]:
# Count the missing (NaN) values in each column.
# Author's reading: rows with missing stats presumably belong to players who did not
# contribute to the box score in that game - confirm against the data.

new_gm_df1.isna().sum()

In [None]:
# Look for anomalies in the numeric features using their summary statistics
# (count, mean, standard deviation, min, quartiles, max).

new_gm_df1.describe()

In [None]:
# Read in a second dataset holding the box scores for each game ID.
# See the Jupyter notebook titled "NBA Games - Preprocessing and Data Cleaning Part 2" for how this file was prepared.
# index_col=False tells pandas not to use the first column of the file as the row index.

gm_box_sc = pd.read_csv('game_boxscores.csv', index_col=False)
display(gm_box_sc.head(5))

In [None]:
# Inspect the box score frame: print its (rows, columns) shape, then a 100-character
# divider, and finally show the dtype of each column.

print(gm_box_sc.shape)
print(100*'-')
gm_box_sc.dtypes

In [None]:
# Parse the 'GAME_DATE' column into datetime values, stored in a new 'NEW_GAME_DATE' column
# so the original column is kept untouched.
# The `infer_datetime_format` argument is no longer passed: it is deprecated since pandas 2.0,
# where inferring the format is already the default and the argument only triggers a warning.

gm_box_sc['NEW_GAME_DATE'] = pd.to_datetime(gm_box_sc['GAME_DATE'])
gm_box_sc.head(5)

In [None]:
# Attach the year, month, day and parsed game date from the box scores to every player row.
# how='right' keeps every row of the box score selection; a game with no matching player
# rows still appears, with NaN in all of the player-detail columns.

box_score_cols = ['GAME_ID', 'GM_YR', 'GM_MTH', 'GM_DAY', 'NEW_GAME_DATE']
gm_det_df = new_gm_df1.merge(gm_box_sc[box_score_cols], on='GAME_ID', how='right')

display(gm_det_df.head(5))

In [None]:
# Compare the (rows, columns) shape before and after the merge to see how many
# rows and columns the merge added.

print('The original dataframe shape was: ', new_gm_df1.shape)
print()
print('The new dataframe shape is: ', gm_det_df.shape)

In [None]:
# Replace every NaN in the frame with zero, in place.
# A scalar fill reaches every missing cell whichever axis is used, so the default axis is kept.
# NOTE(review): on some pandas versions a scalar fill with axis=1 is carried out through a
# transpose, which can leave the columns of a mixed-dtype frame as `object` - confirm the
# dtypes with gm_det_df.dtypes after this cell.

gm_det_df.fillna(0, inplace=True)

In [None]:
# Count the NaN values left in each column; every count should be 0 after the fill above.

gm_det_df.isna().sum()

In [None]:
# Total minutes played per team per game: sum 'MINS' over all rows that share a
# (GAME_ID, TEAM_ID) pair. The result is a Series indexed by those two levels.

total_team_minutes_ser = gm_det_df.groupby(['GAME_ID', 'TEAM_ID'])['MINS'].sum()
display(total_team_minutes_ser)

In [None]:
# Pivot the Series into a lookup table: unstack() moves the inner index level (TEAM_ID)
# into the columns, leaving one row per GAME_ID. Game/team combinations that do not
# occur come out as NaN and are filled with 0.
# This table is used later when calculating the advanced stats for a player.

total_team_minutes_df = total_team_minutes_ser.unstack().fillna(0)
display(total_team_minutes_df)

In [None]:
# Total FGM per team per game: sum 'FGM' over all rows that share a (GAME_ID, TEAM_ID) pair.
# This will be used to calculate the advanced stats for a player later.

total_team_fgm_ser = gm_det_df.groupby(['GAME_ID', 'TEAM_ID'])['FGM'].sum()
display(total_team_fgm_ser)

In [None]:
# Pivot the team FGM totals into a GAME_ID x TEAM_ID lookup table; combinations that
# do not occur are filled with 0. Used later when calculating the advanced stats.

total_team_fgm_df = total_team_fgm_ser.unstack().fillna(0)
display(total_team_fgm_df)

In [None]:
# Import a python script I created for calculating several Advanced Stats.

import nba_adv_stat as nas

In [None]:
# Calculate the advanced stat Assist % for every player row.
# Each row's AST, MINS and FGM are combined with that player's team totals for the same
# game (minutes and FGM, looked up by GAME_ID / TEAM_ID) and handed to nas.asst_perct.
# The columns are walked in lockstep as arrays instead of being indexed cell by cell with
# df[col][i]; this avoids one label lookup per cell and does not depend on the frame
# having a default 0..n-1 row index.

tot_tm_asst_list = [ ]

player_rows = zip(
    gm_det_df['GAME_ID'].to_numpy(),
    gm_det_df['TEAM_ID'].to_numpy(),
    gm_det_df['AST'].to_numpy(),
    gm_det_df['MINS'].to_numpy(),
    gm_det_df['FGM'].to_numpy(),
)

for gm_id, tm_id, asst, mins, fg_ply in player_rows:
    tot_tm_mins = total_team_minutes_df.loc[gm_id, tm_id] # Get the total team Mins.
    tm_fg = total_team_fgm_df.loc[gm_id, tm_id] # Get the total team FGM.

    asst_pert = nas.asst_perct(asst, mins, tot_tm_mins, tm_fg, fg_ply)

    tot_tm_asst_list.append(asst_pert)

In [None]:
# Calculate the advanced stat Turnover % for every player row.
# Only the row's own TO, FGA and FTA are needed, so the unused game/team ID lookups
# were dropped and the three columns are walked in lockstep as arrays.

turnover_list = [
    nas.turnover_perct(turnover_ply, fga, fta)
    for turnover_ply, fga, fta in zip(
        gm_det_df['TO'].to_numpy(),
        gm_det_df['FGA'].to_numpy(),
        gm_det_df['FTA'].to_numpy(),
    )
]

In [None]:
# List the column labels again so the following cells reference the correct names.

gm_det_df.columns

In [None]:
# Total OREB per team per game: sum 'OREB' over all rows that share a (GAME_ID, TEAM_ID) pair.
# The result is a Series indexed by those two levels.

total_team_oreb_ser = gm_det_df.groupby(['GAME_ID', 'TEAM_ID'])['OREB'].sum()
display(total_team_oreb_ser)

In [None]:
# Pivot the team OREB totals into a GAME_ID x TEAM_ID lookup table; combinations that
# do not occur are filled with 0. Used later when calculating the advanced stats.

total_team_oreb_df = total_team_oreb_ser.unstack().fillna(0)
total_team_oreb_df

In [None]:
# Calculate the advanced stat ORB % for every player row.
# Inputs per row: the player's OREB and MINS, the player's team totals for that game
# (minutes and OREB), and the opposing team's OREB total for the same game.
# The columns are walked in lockstep as arrays instead of being indexed cell by cell with
# df[col][i]; this avoids one label lookup per cell and does not depend on the frame
# having a default 0..n-1 row index.
# NOTE(review): the commonly published ORB% formula uses the opponent's DEFENSIVE rebounds
# in the denominator; this cell passes the opponent's OREB - confirm what nas.off_rebs_pct expects.

offensive_reb_list = [ ]

player_rows = zip(
    gm_det_df['GAME_ID'].to_numpy(),
    gm_det_df['TEAM_ID'].to_numpy(),
    gm_det_df['OREB'].to_numpy(),
    gm_det_df['MINS'].to_numpy(),
)

for gm_id, tm_id, oreb_ply, mins_ply in player_rows:
    tot_tm_mins = total_team_minutes_df.loc[gm_id, tm_id] # Total minutes of the player's team.
    oreb_tm = total_team_oreb_df.loc[gm_id, tm_id] # Total OREB of the player's team.

    # All team totals for this game, indexed by TEAM_ID.
    # Assumes exactly two teams per game; a game with a single team raises IndexError below.
    tm_id_ser = total_team_oreb_ser.loc[gm_id]

    # The opponent is whichever of the two entries does not carry the player's TEAM_ID.
    if tm_id_ser.index[0] != tm_id:
        oreb_opp = tm_id_ser.iloc[0]
    else:
        oreb_opp = tm_id_ser.iloc[1]

    oreb_pert = nas.off_rebs_pct(oreb_ply,tot_tm_mins, mins_ply, oreb_tm, oreb_opp)

    offensive_reb_list.append(oreb_pert)

In [None]:
# Calculate the advanced stat eFG% for every player row.
# Only the row's own FGM, FG3M and FGA are needed, so the unused game/team ID lookups
# were dropped and the three columns are walked in lockstep as arrays.

eff_fg_list = [
    nas.eff_fg_pct(fg_ply, fg_3m_ply, fga)
    for fg_ply, fg_3m_ply, fga in zip(
        gm_det_df['FGM'].to_numpy(),
        gm_det_df['FG3M'].to_numpy(),
        gm_det_df['FGA'].to_numpy(),
    )
]

In [None]:
# Calculate the advanced stat TS% for every player row.
# Only the row's own PTS, FGA and FTA are needed, so the unused game/team ID lookups
# were dropped and the three columns are walked in lockstep as arrays.

true_sht_list = [
    nas.true_sht_pct(pts, fga, fta)
    for pts, fga, fta in zip(
        gm_det_df['PTS'].to_numpy(),
        gm_det_df['FGA'].to_numpy(),
        gm_det_df['FTA'].to_numpy(),
    )
]

In [None]:
# Total FTM per team per game: sum 'FTM' over all rows that share a (GAME_ID, TEAM_ID) pair.
# This will be used to calculate the advanced stats for a player later.

total_team_ft_ser = gm_det_df.groupby(['GAME_ID', 'TEAM_ID'])['FTM'].sum()
display(total_team_ft_ser)

In [None]:
# Pivot the team FTM totals into a GAME_ID x TEAM_ID lookup table; combinations that
# do not occur are filled with 0. Used later when calculating the advanced stats.

total_team_ft_df = total_team_ft_ser.unstack().fillna(0)
total_team_ft_df

In [None]:
# Total turnovers per team per game: sum 'TO' over all rows that share a (GAME_ID, TEAM_ID) pair.
# This will be used to calculate the advanced stats for a player later.

total_team_to_ser = gm_det_df.groupby(['GAME_ID', 'TEAM_ID'])['TO'].sum()
display(total_team_to_ser)

In [None]:
# Pivot the team turnover totals into a GAME_ID x TEAM_ID lookup table; combinations that
# do not occur are filled with 0. Used later when calculating the advanced stats.

total_team_to_df = total_team_to_ser.unstack().fillna(0)
total_team_to_df

In [None]:
# Calculate the advanced stat USG% for every player row.
# Inputs per row: the player's FGA, FTA, TO and MINS, plus the player's team totals for
# that game (minutes, FGM, FTM and TO) looked up by GAME_ID / TEAM_ID.
# The columns are walked in lockstep as arrays instead of being indexed cell by cell with
# df[col][i]; this avoids one label lookup per cell and does not depend on the frame
# having a default 0..n-1 row index.
# NOTE(review): the commonly published USG% formula uses team field goal and free throw
# ATTEMPTS; this cell passes the team's MADE totals (FGM / FTM) - confirm what
# nas.usage_rate expects.

usage_rate_list = [ ]

player_rows = zip(
    gm_det_df['GAME_ID'].to_numpy(),
    gm_det_df['TEAM_ID'].to_numpy(),
    gm_det_df['TO'].to_numpy(),
    gm_det_df['FGA'].to_numpy(),
    gm_det_df['FTA'].to_numpy(),
    gm_det_df['MINS'].to_numpy(),
)

for gm_id, tm_id, turnover_ply, fga, fta, mins in player_rows:
    tot_tm_mins = total_team_minutes_df.loc[gm_id, tm_id]
    tm_fg = total_team_fgm_df.loc[gm_id, tm_id]
    tm_ft = total_team_ft_df.loc[gm_id, tm_id]
    tm_to = total_team_to_df.loc[gm_id, tm_id]

    usg = nas.usage_rate(fga, fta, turnover_ply, tot_tm_mins, mins, tm_fg, tm_ft, tm_to)

    usage_rate_list.append(usg)

In [None]:
# Calculate the advanced stat PPP (points per possession) for every player row.
# Only the row's own PTS, FGA, FTA and TO are needed, so the unused game/team ID lookups
# were dropped and the four columns are walked in lockstep as arrays.

ppp_list = [
    nas.pts_per_poss(pts, fga, fta, turnover_ply)
    for pts, fga, fta, turnover_ply in zip(
        gm_det_df['PTS'].to_numpy(),
        gm_det_df['FGA'].to_numpy(),
        gm_det_df['FTA'].to_numpy(),
        gm_det_df['TO'].to_numpy(),
    )
]

In [None]:
# Add the seven advanced stat lists to gm_det_df as new columns.
# They are inserted one after another at column positions 1 through 7, in the order listed.

adv_stat_columns = [
    ('ASST_%', tot_tm_asst_list),
    ('TO_%', turnover_list),
    ('OREB_%', offensive_reb_list),
    ('eFG%', eff_fg_list),
    ('TS%', true_sht_list),
    ('USG%', usage_rate_list),
    ('PPP%', ppp_list),
]

for position, (col_name, col_values) in enumerate(adv_stat_columns, start=1):
    gm_det_df.insert(position, col_name, col_values)

In [None]:
# Preview the first five rows to confirm the advanced stat columns were added.

gm_det_df.head(5)

In [None]:
# Check the frame's (rows, columns) shape; it should now have seven more columns.

gm_det_df.shape

In [None]:
# Store the three ID columns as strings: they are identifiers, not quantities.

for id_col in ('GAME_ID', 'TEAM_ID', 'PLAYER_ID'):
    gm_det_df[id_col] = gm_det_df[id_col].astype(str)

In [None]:
# Print each column's dtype and non-null count to verify the data types are as intended.

gm_det_df.info()

In [None]:
# Check for anomalies in the numeric columns, including the new advanced stats,
# using their summary statistics.

gm_det_df.describe()

In [None]:
# Save the dataframe to a csv file in the working directory for later use.
# NOTE(review): `index` is left at its default, so the row index is written as an
# unnamed first column - confirm that downstream readers expect it.

gm_det_df.to_csv('nba_game_data.csv')

End of Code