In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np

Importing data and removing entries with no `shot_made_flag`

In [3]:
# Read the raw CSV into a DataFrame (the path is relative to the user's home directory).
csv_path = '~/data/kobe/data.csv'
data = pd.read_csv(csv_path)

In [4]:
# Keep only the rows whose `shot_made_flag` is known.
# drop=True renumbers the rows without inserting the old index as a stray
# `index` column, so `data` keeps its original set of columns.
data = data[data['shot_made_flag'].notna()].reset_index(drop=True)

Identifying key features and filtering data set for these features

In [5]:
# Columns carried forward from the raw data: the model inputs plus the
# `shot_made_flag` target.
features = [
    'combined_shot_type',
    'period',
    'playoffs',
    'seconds_remaining',
    'minutes_remaining',
    'season',
    'shot_distance',
    'shot_made_flag',
    'shot_zone_area',
    'game_date',
    'matchup',
]

In [6]:
# Take an explicit copy of the selected columns. Later cells add and drop
# columns on `dataset`; without .copy() pandas treats the frame as derived
# from `data` and raises SettingWithCopyWarning on those assignments.
dataset = data[features].copy()

Some feature engineering: computing the total seconds remaining, extracting the month of the game, and flagging whether the game was home or away

In [7]:
# Collapse the minutes/seconds pair into a single `total_seconds_remaining`
# column, then discard the two source columns.
dataset = (
    dataset
    .assign(total_seconds_remaining=dataset['minutes_remaining'] * 60 + dataset['seconds_remaining'])
    .drop(columns=['minutes_remaining', 'seconds_remaining'])
)

In [8]:
# The month is the second '-'-separated field of `game_date`, kept as a string.
# NOTE(review): assumes dates are formatted like YYYY-MM-DD -- confirm against the CSV.
dataset['month'] = [game_date.split('-')[1] for game_date in dataset['game_date']]

In [9]:
# `game_date` has served its purpose once `month` is extracted; remove it.
dataset = dataset.drop(columns=['game_date'])

In [10]:
# `home` is 1 when the matchup string contains 'vs' and 0 otherwise;
# the raw `matchup` column is then dropped.
dataset = (
    dataset
    .assign(home=dataset['matchup'].str.contains('vs').astype('int'))
    .drop(columns=['matchup'])
)

Making dummy variables from the categorical features

In [12]:
# Cast the discrete columns (including the target) to pandas' `category` dtype
# in a single astype call.
dataset = dataset.astype(
    dict.fromkeys(
        [
            'combined_shot_type',
            'period',
            'playoffs',
            'season',
            'shot_made_flag',
            'month',
            'home',
        ],
        'category',
    )
)

In [13]:
# Columns that get expanded into dummy (one-hot) indicator columns.
multi_categorical_features = [
    'combined_shot_type',
    'period',
    'season',
    'month',
    'shot_zone_area',
]

In [14]:
# One-hot encode every column listed in `multi_categorical_features` in one call.
# Each indicator column is named "<column>#<level>", the first level of each
# column is dropped, and the indicators are appended after the remaining
# columns in the order the features are listed.
dataset = pd.get_dummies(
    dataset,
    columns=multi_categorical_features,
    prefix_sep='#',
    drop_first=True,
)

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
# Target vector and feature matrix: everything except `shot_made_flag` is an input.
Y = dataset['shot_made_flag']
X = dataset.drop(columns=['shot_made_flag'])

In [17]:
# Hold out part of the data for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=42,
)

In [37]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

Decision tree classifier:

In [100]:
# Fit a single decision tree capped at `max_leaf_nodes` leaves (entropy split
# criterion, fixed seed) and report its size and held-out accuracy.
max_leaf_nodes = 10
dt = DecisionTreeClassifier(
    max_leaf_nodes=max_leaf_nodes,
    criterion='entropy',
    random_state=42,
)
dt.fit(X_train, Y_train)
node_count = dt.tree_.node_count
predicted = dt.predict(X_test)
print(f'Decision tree with {node_count} nodes')
# accuracy_score expects (y_true, y_pred): ground truth first, predictions second.
print(f"Accuracy: {accuracy_score(Y_test, predicted):.3f}")

Decision tree with 19 nodes
Accuracy: 0.614


Random forest classifier: 

In [102]:
# Fit a 100-tree random forest whose trees are each capped at `max_leaf_nodes`
# leaves (entropy split criterion, fixed seed) and report held-out accuracy.
max_leaf_nodes = 10
clf = RandomForestClassifier(
    max_leaf_nodes=max_leaf_nodes,  # use the variable rather than a second hard-coded 10
    n_estimators=100,
    criterion='entropy',
    random_state=42,
)
clf.fit(X_train, Y_train)
predicted = clf.predict(X_test)
# accuracy_score expects (y_true, y_pred): ground truth first, predictions second.
print(f"Accuracy: {accuracy_score(Y_test, predicted):.3f}")

Accuracy: 0.616


We can rank the features in order of importance:

In [103]:
# Pair each forest importance with its column name and sort, most important first.
# NOTE: this rebinds `features`; from here on it holds the model's input
# column names rather than the raw column selection used earlier.
features = [column for column in dataset.columns if column != 'shot_made_flag']
feature_imp = (
    pd.Series(clf.feature_importances_, index=features)
    .sort_values(ascending=False)
)
feature_imp

combined_shot_type#Dunk                 0.289627
shot_distance                           0.201578
combined_shot_type#Jump Shot            0.167059
shot_zone_area#Center(C)                0.120087
combined_shot_type#Layup                0.088109
total_seconds_remaining                 0.052467
shot_zone_area#Left Side Center(LC)     0.024179
shot_zone_area#Right Side Center(RC)    0.007388
period#4                                0.006008
season#2015-16                          0.005249
combined_shot_type#Tip Shot             0.003963
season#1997-98                          0.002754
home                                    0.002278
shot_zone_area#Left Side(L)             0.002070
month#12                                0.002054
season#2000-01                          0.001881
month#04                                0.001509
season#2006-07                          0.001499
season#2005-06                          0.001377
season#2008-09                          0.001341
season#2014-15      