In [97]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split

df = pd.read_csv('../data/data.csv').dropna()

defensive_stats = []
with open('../data/defensive_ratings_adjusted.pkl' ,'rb') as pkl_file:
    defensive_stats = pickle.load(pkl_file)
    
print(df.shape)
print(df.columns)

(25697, 25)
Index([u'action_type', u'combined_shot_type', u'game_event_id', u'game_id',
       u'lat', u'loc_x', u'loc_y', u'lon', u'minutes_remaining', u'period',
       u'playoffs', u'season', u'seconds_remaining', u'shot_distance',
       u'shot_made_flag', u'shot_type', u'shot_zone_area', u'shot_zone_basic',
       u'shot_zone_range', u'team_id', u'team_name', u'game_date', u'matchup',
       u'opponent', u'shot_id'],
      dtype='object')


In [98]:
df.head()

Unnamed: 0,action_type,combined_shot_type,game_event_id,game_id,lat,loc_x,loc_y,lon,minutes_remaining,period,...,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,team_id,team_name,game_date,matchup,opponent,shot_id
1,Jump Shot,Jump Shot,12,20000012,34.0443,-157,0,-118.4268,10,1,...,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,2
2,Jump Shot,Jump Shot,35,20000012,33.9093,-101,135,-118.3708,7,1,...,2PT Field Goal,Left Side Center(LC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,3
3,Jump Shot,Jump Shot,43,20000012,33.8693,138,175,-118.1318,6,1,...,2PT Field Goal,Right Side Center(RC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,4
4,Driving Dunk Shot,Dunk,155,20000012,34.0443,0,0,-118.2698,6,2,...,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,5
5,Jump Shot,Jump Shot,244,20000012,34.0553,-145,-11,-118.4148,9,3,...,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,6


## Extract the season number

In [99]:
def get_season_number(dt):
        start_year = 1996
        # Kobe's rookie season was 1996-1997
        # Seasons start in October, end by July of the next year
        if dt.month < 8:
            return dt.year - start_year
        else:
            return dt.year - start_year + 1

def extract_season_info(df):
    df['game_date'] = pd.to_datetime(df['game_date'])
    df = df.sort_values(['game_date'])
    df['year'] = df['game_date'].dt.year
    df['month'] = df['game_date'].dt.month
    df['day'] = df['game_date'].dt.day
    # Next, let's extract the season number 
    df['season'] = df['game_date'].apply(get_season_number)
    return df

# Note that a lot of things after this will require the season number so I'm doing it here
df = extract_season_info(df)

## Get an Adjusted Defensive Rating column (DRtg/A)
I created a webscraper to get all of the defensive ratings of NBA teams over the years that Kobe played. Note that this isn't in just the data from Kaggle so we probably shouldn't use it for Kaggle.

In [100]:
def get_defensive_rating(x):
    try:
        return defensive_stats[x[0]-1][x[1]]
    except:
        return defensive_stats[x[0]-1]['NOH'] # This is really hacky but New Orleans was having an identity crisis... 
    
def extract_defensive_ratings(df):
    df['Opponent DRtg/A'] = df[['season', 'opponent']].apply(get_defensive_rating, axis=1)
    return df

## Extract Shot Information
Here we'll get the Euclidean distance of the shot as well as the side of the court he shot from

In [113]:
def extract_shot_distance(df):
    df['shot_distance'] = df[['loc_x', 'loc_y']].apply(lambda x: np.linalg.norm(x[1]-x[0]), axis=1)
    return df

def extract_shot_side(df):
    df['left_side'] = np.where(df['loc_x'] >= 80, 1, 0)
    df['right_side'] = np.where(df['loc_x'] <= -80, 1, 0)
    df['middle'] = np.where(np.logical_and(df['loc_x'] > -80, df['loc_x'] < 80), 1, 0)
    return df

def extract_shot_moving(df):
    df['moving'] = np.where(np.logical_or('Driving' in df['action_type'], 'Running' in df['action_type']), 1, 0)
    return df

# Use this one to extract all shot information
def extract_shot_info(df):
    df = extract_shot_distance(df)
    df = extract_shot_side(df)
    df = extract_shot_moving(df)
    df = pd.get_dummies(df, columns=['combined_shot_type'])
    return df

In [92]:
pd.get_dummies(df, columns=['combined_shot_type']).columns.values

array(['action_type', 'game_event_id', 'game_id', 'lat', 'loc_x', 'loc_y',
       'lon', 'minutes_remaining', 'period', 'playoffs', 'season',
       'seconds_remaining', 'shot_distance', 'shot_made_flag', 'shot_type',
       'shot_zone_area', 'shot_zone_basic', 'shot_zone_range', 'team_id',
       'team_name', 'game_date', 'matchup', 'opponent', 'shot_id', 'year',
       'month', 'day', 'left_side', 'right_side', 'middle',
       'combined_shot_type_Bank Shot', 'combined_shot_type_Dunk',
       'combined_shot_type_Hook Shot', 'combined_shot_type_Jump Shot',
       'combined_shot_type_Layup', 'combined_shot_type_Tip Shot'], dtype=object)

## Combine all of our extractions

In [114]:
keep_columns = ['season', 'Opponent DRtg/A', 'shot_distance', 'left_side', 'right_side', 'moving',
                'combined_shot_type_Bank Shot', 'combined_shot_type_Dunk', 'combined_shot_type_Hook Shot', 
                'combined_shot_type_Jump Shot', 'combined_shot_type_Layup', 'combined_shot_type_Tip Shot']
def process_data(df):
    # df = extract_season_info(df) - DON'T INCLUDE THIS, IT SHOULD ALREADY BE DONE
    df = extract_defensive_ratings(df)
    df = extract_shot_info(df)
    return df[keep_columns]
    
df_processed = process_data(df)

In [115]:
df_processed.head()

Unnamed: 0,season,Opponent DRtg/A,shot_distance,left_side,right_side,moving,combined_shot_type_Bank Shot,combined_shot_type_Dunk,combined_shot_type_Hook Shot,combined_shot_type_Jump Shot,combined_shot_type_Layup,combined_shot_type_Tip Shot
22901,1,108.06,256.0,0,1,0,0.0,0.0,0.0,1.0,0.0,0.0
22902,1,101.68,228.0,0,1,0,0.0,0.0,0.0,1.0,0.0,0.0
22903,1,109.65,323.0,0,1,0,0.0,0.0,0.0,1.0,0.0,0.0
22904,1,109.65,0.0,0,0,0,0.0,0.0,0.0,1.0,0.0,0.0
22905,1,109.65,148.0,0,0,0,0.0,0.0,0.0,1.0,0.0,0.0
