In [20]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split

df = pd.read_csv('../data/data.csv')

defensive_stats = []
with open('../data/defensive_ratings_adjusted.pkl' ,'rb') as pkl_file:
    defensive_stats = pickle.load(pkl_file)

print(df.shape)
print(df.columns)

(30697, 25)
Index([u'action_type', u'combined_shot_type', u'game_event_id', u'game_id',
       u'lat', u'loc_x', u'loc_y', u'lon', u'minutes_remaining', u'period',
       u'playoffs', u'season', u'seconds_remaining', u'shot_distance',
       u'shot_made_flag', u'shot_type', u'shot_zone_area', u'shot_zone_basic',
       u'shot_zone_range', u'team_id', u'team_name', u'game_date', u'matchup',
       u'opponent', u'shot_id'],
      dtype='object')


## Drop some unimportant columns

In [21]:
drop = ['game_event_id', 'game_id', 'lat', 'lon', 'team_name', 'team_id']
df = df.drop(drop, axis=1)

In [22]:
df.head()

Unnamed: 0,action_type,combined_shot_type,loc_x,loc_y,minutes_remaining,period,playoffs,season,seconds_remaining,shot_distance,shot_made_flag,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,game_date,matchup,opponent,shot_id
0,Jump Shot,Jump Shot,167,72,10,1,0,2000-01,27,18,,2PT Field Goal,Right Side(R),Mid-Range,16-24 ft.,2000-10-31,LAL @ POR,POR,1
1,Jump Shot,Jump Shot,-157,0,10,1,0,2000-01,22,15,0.0,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,2000-10-31,LAL @ POR,POR,2
2,Jump Shot,Jump Shot,-101,135,7,1,0,2000-01,45,16,1.0,2PT Field Goal,Left Side Center(LC),Mid-Range,16-24 ft.,2000-10-31,LAL @ POR,POR,3
3,Jump Shot,Jump Shot,138,175,6,1,0,2000-01,52,22,0.0,2PT Field Goal,Right Side Center(RC),Mid-Range,16-24 ft.,2000-10-31,LAL @ POR,POR,4
4,Driving Dunk Shot,Dunk,0,0,6,2,0,2000-01,19,0,1.0,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,2000-10-31,LAL @ POR,POR,5


## Convert categorical columns into dummies

In [23]:
def extract_dummies(df, *categories):
    for category in categories:
        dummies = pd.get_dummies(df[category])
        dummies = dummies.add_prefix("{}#".format(category))
        df.drop(category, axis=1, inplace=True)
        df = df.join(dummies)
    return df

## Extract the season number

In [24]:
def get_season_number(dt):
        start_year = 1996
        # Kobe's rookie season was 1996-1997
        # Seasons start in October, end by July of the next year
        if dt.month < 8:
            return dt.year - start_year
        else:
            return dt.year - start_year + 1

def extract_season_info(df):
    df['game_date'] = pd.to_datetime(df['game_date'])
    df = df.sort_values(['game_date'])
    df['year'] = df['game_date'].dt.year
    df['month'] = df['game_date'].dt.month
    df['day'] = df['game_date'].dt.day
    # Next, let's extract the season number 
    df['season'] = df['game_date'].apply(get_season_number)
    df = extract_dummies(df, 'year', 'month', 'day')
    return df

# Note that a lot of things after this will require the season number so I'm doing it here
df = extract_season_info(df)

In [25]:
df.head()

Unnamed: 0,action_type,combined_shot_type,loc_x,loc_y,minutes_remaining,period,playoffs,season,seconds_remaining,shot_distance,...,day#22,day#23,day#24,day#25,day#26,day#27,day#28,day#29,day#30,day#31
22901,Jump Shot,Jump Shot,-140,116,0,1,0,1,42,18,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22902,Jump Shot,Jump Shot,-131,97,10,2,0,1,8,16,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22903,Jump Shot,Jump Shot,-142,181,8,2,0,1,37,23,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22904,Jump Shot,Jump Shot,0,0,6,2,0,1,34,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22905,Jump Shot,Jump Shot,-10,138,5,2,0,1,27,13,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Extract Opponent Team's Defensive Rating

In [26]:
def get_defensive_rating(x):
    try:
        return defensive_stats[x[0]-1][x[1]]
    except:
        return defensive_stats[x[0]-1]['NOH'] # This is really hacky but New Orleans was having an identity crisis... 
    
def extract_defensive_ratings(df):
    df['Opponent DRtg/A'] = df[['season', 'opponent']].apply(get_defensive_rating, axis=1)
    return df

## Extract Shot Information
Here we'll get the Euclidean distance of the shot as well as the side of the court he shot from

In [27]:
def extract_shot_binned(df):
    df['loc_x'] = pd.cut(df['loc_x'], 25)
    df['loc_y'] = pd.cut(df['loc_y'], 25)
    df = extract_dummies(df, 'loc_x', 'loc_y')
    return df

In [28]:
def get_shot_moving(action_type):
    if 'Driving' in action_type:
        return 1
    if 'Running' in action_type:
        return 1
    return 0

def extract_shot_moving(df):
    df['moving'] = df['action_type'].apply(get_shot_moving)
    return df

In [29]:
# Use this one to extract all shot information
def extract_shot_info(df):
    drop = ['shot_type', 'shot_zone_basic', 'shot_zone_area']
    # df = extract_shot_binned(df)
    df = extract_shot_moving(df)
    df.drop(drop, axis=1, inplace=True)
    df = extract_dummies(df, 'combined_shot_type', 'shot_zone_range')
    return df

## Extract Home/Away and if this is the second game of a back to back

In [30]:
# Whether or not this was a home game
def home_game(matchup):
    if '@' in matchup:
        return 0
    else:
        return 1

def extract_home_games(df):
    df['home'] = df['matchup'].apply(home_game)
    df.drop('matchup', axis=1, inplace=True)
    return df

In [31]:
def get_segababa(game_date, all_game_dates):
    this_game_index = all_game_dates.index(game_date)
    if this_game_index == 0: # First game of the season
        return 0
    previous_game_date = all_game_dates[this_game_index - 1]
    delta = game_date - previous_game_date
    if delta.days == 1:
        return 1
    else:
        return 0

def extract_segababa(df):
    all_game_dates = sorted(list(set(df['game_date'])))
    df['segababa'] = df['game_date'].apply(get_segababa, args=(all_game_dates,))
    df.drop('game_date', axis=1, inplace=True)
    return df

## Extract if this was a shot in the last 5 seconds of the period

In [32]:
# This function is used for determine how much time is left in the game (in minutes)
def extract_seconds_remaining(df):
    df['seconds_left_in_period'] = 60 * df['minutes_remaining'] + df['seconds_remaining']
    df['last_5_seconds'] = df['seconds_left_in_period'] < 5
    df.drop('seconds_left_in_period', axis=1, inplace=True)
    df.drop('minutes_remaining', axis=1, inplace=True)
    df.drop('seconds_remaining', axis=1, inplace=True)
    return df

## Combine all of our extractions

In [33]:
# Keep the shot_id for the test stuff
def process_data(df):
    # df = extract_season_info(df) - DON'T INCLUDE THIS, IT SHOULD ALREADY BE DONE
    df = extract_defensive_ratings(df)
    df = extract_shot_info(df)
    df = extract_home_games(df)
    df = extract_segababa(df)
    df = extract_seconds_remaining(df)
    df = extract_dummies(df, 'opponent', 'action_type')
    return df
    
df_processed = process_data(df)

In [34]:
submit_mask = df['shot_made_flag'].isnull()
df_submit = df_processed[submit_mask]

df_processed = df_processed.dropna().drop('shot_id', axis=1)

## Choose the best 20 features

In [35]:
X = df_processed.drop('shot_made_flag', axis=1)
y = df_processed['shot_made_flag']

In [36]:
X.head()

Unnamed: 0,loc_x,loc_y,period,playoffs,season,shot_distance,year#1996,year#1997,year#1998,year#1999,...,action_type#Slam Dunk Shot,action_type#Step Back Jump shot,action_type#Tip Layup Shot,action_type#Tip Shot,action_type#Turnaround Bank shot,action_type#Turnaround Fadeaway Bank Jump Shot,action_type#Turnaround Fadeaway shot,action_type#Turnaround Finger Roll Shot,action_type#Turnaround Hook Shot,action_type#Turnaround Jump Shot
22901,-140,116,1,0,1,18,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22902,-131,97,2,0,1,16,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22903,-142,181,2,0,1,23,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22904,0,0,2,0,1,0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22905,-10,138,2,0,1,13,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
from sklearn.feature_selection import VarianceThreshold

threshold = 0.90
vt = VarianceThreshold().fit(X)

# Find feature names
feat_var_threshold = df_processed.columns[vt.variances_ > threshold * (1-threshold)]
feat_var_threshold

Index([u'loc_x', u'loc_y', u'period', u'playoffs', u'season', u'shot_distance',
       u'year#2016', u'month#1', u'month#2', u'month#3', u'month#10',
       u'month#11', u'day#31', u'Opponent DRtg/A',
       u'combined_shot_type#Hook Shot', u'combined_shot_type#Jump Shot',
       u'combined_shot_type#Tip Shot', u'shot_zone_range#16-24 ft.',
       u'shot_zone_range#24+ ft.', u'shot_zone_range#Back Court Shot',
       u'shot_zone_range#Less Than 8 ft.', u'home',
       u'action_type#Jump Hook Shot'],
      dtype='object')

## Best features according to the Random Forest Classifier

In [39]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
rfc.fit(X,y)

feature_imp = pd.DataFrame(rfc.feature_importances_, index=X.columns, columns=["importance"])
feat_imp_20 = feature_imp.sort_values("importance", ascending=False).head(20).index
feat_imp_20

Index([u'loc_y', u'loc_x', u'shot_distance', u'period', u'Opponent DRtg/A',
       u'action_type#Jump Shot', u'season', u'combined_shot_type#Dunk',
       u'home', u'moving', u'action_type#Layup Shot', u'segababa', u'month#1',
       u'month#12', u'month#3', u'action_type#Driving Layup Shot',
       u'shot_zone_range#16-24 ft.', u'month#4', u'month#2',
       u'shot_zone_range#8-16 ft.'],
      dtype='object')

## Using Univariate Classifier

In [40]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2


X_minmax = MinMaxScaler(feature_range=(0,1)).fit_transform(X)
X_scored = SelectKBest(score_func=chi2, k='all').fit(X_minmax, y)
feature_scoring = pd.DataFrame({
        'feature': X.columns,
        'score': X_scored.scores_
    })

feat_scored_20 = feature_scoring.sort_values('score', ascending=False).head(20)['feature'].values
feat_scored_20

array(['moving', 'combined_shot_type#Dunk', 'action_type#Jump Shot',
       'action_type#Driving Layup Shot', 'shot_zone_range#Less Than 8 ft.',
       'action_type#Slam Dunk Shot', 'action_type#Driving Dunk Shot',
       'action_type#Running Jump Shot', 'shot_zone_range#24+ ft.',
       'combined_shot_type#Layup', 'combined_shot_type#Jump Shot',
       'last_5_seconds', 'action_type#Jump Bank Shot',
       'action_type#Pullup Jump shot', 'action_type#Dunk Shot',
       'action_type#Alley Oop Dunk Shot', 'shot_distance',
       'action_type#Turnaround Jump Shot',
       'action_type#Fadeaway Jump Shot', 'combined_shot_type#Bank Shot'], dtype=object)

## Using RFE

In [41]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

rfe = RFE(LogisticRegression(), 20)
rfe.fit(X, y)

feature_rfe_scoring = pd.DataFrame({
        'feature': X.columns,
        'score': rfe.ranking_
    })

feat_rfe_20 = feature_rfe_scoring[feature_rfe_scoring['score'] == 1]['feature'].values
feat_rfe_20

array(['combined_shot_type#Bank Shot', 'combined_shot_type#Dunk',
       'combined_shot_type#Tip Shot', 'shot_zone_range#Back Court Shot',
       'last_5_seconds', 'action_type#Driving Dunk Shot',
       'action_type#Driving Finger Roll Layup Shot',
       'action_type#Driving Finger Roll Shot',
       'action_type#Driving Jump shot', 'action_type#Dunk Shot',
       'action_type#Fadeaway Bank shot',
       'action_type#Finger Roll Layup Shot', 'action_type#Hook Shot',
       'action_type#Jump Shot', 'action_type#Layup Shot',
       'action_type#Pullup Bank shot',
       'action_type#Putback Slam Dunk Shot',
       'action_type#Running Finger Roll Shot',
       'action_type#Running Hook Shot', 'action_type#Slam Dunk Shot'], dtype=object)

## Final feature selection

In [102]:
features = np.hstack([
        feat_var_threshold, 
        feat_imp_20,
        feat_scored_20,
        feat_rfe_20
    ])

features = np.unique(features)
print('Final features set:\n')
for f in features:
    print("\t-{}".format(f))

Final features set:

	-Opponent DRtg/A
	-action_type#Alley Oop Dunk Shot
	-action_type#Driving Dunk Shot
	-action_type#Driving Finger Roll Layup Shot
	-action_type#Driving Finger Roll Shot
	-action_type#Driving Jump shot
	-action_type#Driving Layup Shot
	-action_type#Dunk Shot
	-action_type#Fadeaway Bank shot
	-action_type#Hook Shot
	-action_type#Jump Bank Shot
	-action_type#Jump Hook Shot
	-action_type#Jump Shot
	-action_type#Layup Shot
	-action_type#Pullup Jump shot
	-action_type#Putback Slam Dunk Shot
	-action_type#Running Finger Roll Shot
	-action_type#Running Hook Shot
	-action_type#Running Jump Shot
	-action_type#Slam Dunk Shot
	-combined_shot_type#Dunk
	-combined_shot_type#Hook Shot
	-combined_shot_type#Jump Shot
	-combined_shot_type#Layup
	-combined_shot_type#Tip Shot
	-day#31
	-home
	-last_5_seconds
	-loc_x#(-10.96, 8.96]
	-loc_x#(-30.88, -10.96]
	-loc_x#(128.48, 148.4]
	-loc_y#(-10.6, 22.8]
	-loc_y#(-44.835, -10.6]
	-loc_y#(123, 156.4]
	-loc_y#(223.2, 256.6]
	-loc_y#(290, 323

In [103]:
train_labels = np.hstack([features, 'shot_made_flag'])
test_labels = np.hstack([features, 'shot_id'])

df_processed = df_processed[train_labels]
df_submit = df_submit[test_labels]

In [105]:
df_processed.to_csv('../data/processed_data.csv')
df_submit.to_csv('../data/submit_data.csv')