# Packages & data

In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler


In [12]:
# Read the processed NBA shot CSV into a DataFrame and preview its first five rows.
data_nba = pd.read_csv(filepath_or_buffer="../data/processed/nba_data.csv")
data_nba.head(n=5)

Unnamed: 0,PLAYER_ID,PLAYER_NAME,PLAYER_POS,PLAYER_YEAR_START,PLAYER_GAME_AGE,PLAYER_HEIGHT,PLAYER_WEIGHT,PLAYER_BORN_YEAR,PLAYER_EXP,GAME_ID,...,SHOT_ACTION_TYPE,SHOT_ACTION_CATEGORY,SHOT_TYPE,SHOT_ZONE_BASIC,SHOT_ZONE_AREA,SHOT_ZONE_RANGE,SHOT_DISTANCE,SHOT_X_LOCATION,SHOT_Y_LOCATION,SHOT_MADE_FLAG
0,100,Tim Legler,SG,1990,32,193,90,1966,8,29700427,...,Jump Shot,Jump Shot,2PT Field Goal,Mid-Range,Right Side(R),8-16 ft.,15,117,109,1
1,100,Tim Legler,SG,1990,32,193,90,1966,8,29700427,...,Jump Shot,Jump Shot,2PT Field Goal,Mid-Range,Right Side(R),8-16 ft.,14,143,25,0
2,100,Tim Legler,SG,1990,32,193,90,1966,8,29700427,...,Jump Shot,Jump Shot,2PT Field Goal,Mid-Range,Left Side(L),8-16 ft.,10,-87,55,0
3,100,Tim Legler,SG,1990,32,193,90,1966,8,29700427,...,Jump Shot,Jump Shot,2PT Field Goal,In The Paint (Non-RA),Center(C),Less Than 8 ft.,5,-1,53,0
4,100,Tim Legler,SG,1990,32,193,90,1966,8,29700427,...,Jump Shot,Jump Shot,2PT Field Goal,Mid-Range,Right Side(R),8-16 ft.,14,89,113,0


# Preprocessing

In [13]:
# --- Column groups driving the preprocessing below ---

# Columns removed from the frame before it is saved.
columns_to_drop = [
    'PLAYER_ID', 'PLAYER_NAME',
    'GAME_ID', 'GAME_EVENT_ID', 'GAME_YEAR', 'GAME_DATE', 'GAME_SEASON_TYPE',
    'GAME_TEAM_ID', 'GAME_TEAM_NAME', 'GAME_HOME_TEAM', 'GAME_AWAY_TEAM',
    'SHOT_ACTION_TYPE', 'SHOT_X_LOCATION', 'SHOT_Y_LOCATION',
]

# Columns standardized with StandardScaler (order is the scaler's feature order).
columns_to_scale = [
    'PLAYER_YEAR_START', 'PLAYER_GAME_AGE',
    'PLAYER_HEIGHT', 'PLAYER_WEIGHT',
    'PLAYER_BORN_YEAR', 'PLAYER_EXP',
    'GAME_PERIOD', 'GAME_PERIODE_SECOND_REMAINGING',
    'SHOT_DISTANCE',
]

# Columns replaced by one-hot indicator columns.
categorical_columns = [
    'PLAYER_POS',
    'SHOT_ACTION_CATEGORY', 'SHOT_TYPE',
    'SHOT_ZONE_BASIC', 'SHOT_ZONE_AREA', 'SHOT_ZONE_RANGE',
]

# One-hot encode the categorical columns; spaces in the generated
# indicator names are turned into underscores.
data_encoded = pd.get_dummies(
    data_nba[categorical_columns],
    columns=categorical_columns,
).rename(columns=lambda name: name.replace(' ', '_'))

# Append the indicators to the raw frame, then remove the original
# categorical columns followed by the columns listed for dropping.
data = (
    pd.concat([data_nba, data_encoded], axis=1)
    .drop(columns=categorical_columns)
    .drop(columns=columns_to_drop)
)

# Standardize the selected columns; the scaler is fitted on the whole frame.
scaler = StandardScaler()
data[columns_to_scale] = scaler.fit_transform(data[columns_to_scale])

# Persist the preprocessed frame without writing the index.
data.to_csv("../data/processed/data_preprocessed.csv", index=False)

data.head()

Unnamed: 0,PLAYER_YEAR_START,PLAYER_GAME_AGE,PLAYER_HEIGHT,PLAYER_WEIGHT,PLAYER_BORN_YEAR,PLAYER_EXP,GAME_PERIOD,GAME_PERIODE_SECOND_REMAINGING,SHOT_DISTANCE,SHOT_MADE_FLAG,...,SHOT_ZONE_AREA_Center(C),SHOT_ZONE_AREA_Left_Side_Center(LC),SHOT_ZONE_AREA_Left_Side(L),SHOT_ZONE_AREA_Right_Side_Center(RC),SHOT_ZONE_AREA_Right_Side(R),SHOT_ZONE_RANGE_16-24_ft.,SHOT_ZONE_RANGE_24+_ft.,SHOT_ZONE_RANGE_8-16_ft.,SHOT_ZONE_RANGE_Back_Court_Shot,SHOT_ZONE_RANGE_Less_Than_8_ft.
0,-1.788159,0.235575,-0.766211,-0.682981,-1.140319,0.609749,1.336712,1.595207,0.3084,1,...,False,False,False,False,True,False,False,True,False,False
1,-1.788159,0.235575,-0.766211,-0.682981,-1.140319,0.609749,1.336712,1.088236,0.205792,0,...,False,False,False,False,True,False,False,True,False,False
2,-1.788159,0.235575,-0.766211,-0.682981,-1.140319,0.609749,1.336712,-0.772253,-0.204643,0,...,False,False,True,False,False,False,False,True,False,False
3,-1.788159,0.235575,-0.766211,-0.682981,-1.140319,0.609749,1.336712,-1.164438,-0.717686,0,...,True,False,False,False,False,False,False,False,False,True
4,-1.788159,0.235575,-0.766211,-0.682981,-1.140319,0.609749,1.336712,-1.451403,0.205792,0,...,False,False,False,False,True,False,False,True,False,False


In [14]:
# Print a summary of the preprocessed frame (index range, column names, dtypes).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3986275 entries, 0 to 3986274
Data columns (total 53 columns):
 #   Column                                 Dtype  
---  ------                                 -----  
 0   PLAYER_YEAR_START                      float64
 1   PLAYER_GAME_AGE                        float64
 2   PLAYER_HEIGHT                          float64
 3   PLAYER_WEIGHT                          float64
 4   PLAYER_BORN_YEAR                       float64
 5   PLAYER_EXP                             float64
 6   GAME_PERIOD                            float64
 7   GAME_PERIODE_SECOND_REMAINGING         float64
 8   SHOT_DISTANCE                          float64
 9   SHOT_MADE_FLAG                         int64  
 10  PLAYER_POS_C                           bool   
 11  PLAYER_POS_C-PF                        bool   
 12  PLAYER_POS_C-SF                        bool   
 13  PLAYER_POS_PF                          bool   
 14  PLAYER_POS_PF-C                        bool   
 15