# Kobe Shot Selection Neural Network

In [1]:
import pandas as pd
import numpy as np

In [2]:
import tensorflow as tf
import tflearn
from tflearn.data_utils import to_categorical

In [3]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

## Preprocess the data

Before we implement any models, we first load and preprocess the data. 

So let's load the data from file and have a look.

In [4]:
# Read the raw shot log from the CSV in the working directory and preview
# its first five rows.
data = pd.read_csv('data.csv')
data.head(n=5)

Unnamed: 0,action_type,combined_shot_type,game_event_id,game_id,lat,loc_x,loc_y,lon,minutes_remaining,period,...,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,team_id,team_name,game_date,matchup,opponent,shot_id
0,Jump Shot,Jump Shot,10,20000012,33.9723,167,72,-118.1028,10,1,...,2PT Field Goal,Right Side(R),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,1
1,Jump Shot,Jump Shot,12,20000012,34.0443,-157,0,-118.4268,10,1,...,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,2
2,Jump Shot,Jump Shot,35,20000012,33.9093,-101,135,-118.3708,7,1,...,2PT Field Goal,Left Side Center(LC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,3
3,Jump Shot,Jump Shot,43,20000012,33.8693,138,175,-118.1318,6,1,...,2PT Field Goal,Right Side Center(RC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,4
4,Driving Dunk Shot,Dunk,155,20000012,34.0443,0,0,-118.2698,6,2,...,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,5


Next we'll check out the features we have at our disposal

In [5]:
# Show every column name of the frame as a plain Python list.
data.columns.tolist()

['action_type',
 'combined_shot_type',
 'game_event_id',
 'game_id',
 'lat',
 'loc_x',
 'loc_y',
 'lon',
 'minutes_remaining',
 'period',
 'playoffs',
 'season',
 'seconds_remaining',
 'shot_distance',
 'shot_made_flag',
 'shot_type',
 'shot_zone_area',
 'shot_zone_basic',
 'shot_zone_range',
 'team_id',
 'team_name',
 'game_date',
 'matchup',
 'opponent',
 'shot_id']

`shot_made_flag` is what we'd like to predict. However, about 5000 of these labels have been left out of the data set, so first we'll go through and remove all the rows/instances which don't have a shot label.

In [6]:
# Keep only the rows whose target `shot_made_flag` is present, then preview
# the first 15 remaining labels (note the gaps left in the index).
has_label = data['shot_made_flag'].notnull()
data = data[has_label]
data['shot_made_flag'].head(15)

1     0.0
2     1.0
3     0.0
4     1.0
5     0.0
6     1.0
8     1.0
9     0.0
10    0.0
11    1.0
12    1.0
13    0.0
14    0.0
15    0.0
17    1.0
Name: shot_made_flag, dtype: float64

That's a bit better. Let's clean up the indices and do some more trimming of the feature space.

In [9]:
# Renumber the rows 0..n-1 so the index has no gaps after the filtering above.
data = data.reset_index(drop=True)

### Pruning Features

It seems a bit redundant to have both the `action_type` and the `combined_shot_type` features (however, this might not be the case - this could be changed later on), in particular, the `action_type` seems like a more fine grained version of `combined_shot_type`, so we'll get rid of the `combined_shot_type` feature. 

In [10]:
# `action_type` is kept as the finer-grained description of the shot, so the
# coarser `combined_shot_type` column is removed.
data = data.drop(labels='combined_shot_type', axis='columns')

The `game_id` seems irrelevant - a case can be made that this feature gives information regarding the fact that some shots are from the same game and some from others, but for now we'll drop that one. The `game_event_id` also seems irrelevant, however it does provide some indication as to how far into a given game we are. I think we'll try rescaling this feature between 0 (start of the game) and 1 (end of the game) - we'll do this later on.

In [11]:
# Remove the game identifier; `game_event_id` stays and is rescaled later.
data = data.drop(['game_id'], axis='columns')

The `team_id` and `team_name` features are constant and not useful - we'll remove those. The `shot_id` seems useless, being just an enumeration of all of Kobe's shots. However, like `game_event_id`, it is an indication of how far into Kobe's career the shot was taken - we would expect a player's shooting to improve, plateau and decline throughout the course of their career - so we'll leave that feature in.

In [12]:
# Both team columns hold a single constant value, so they carry no signal.
constant_columns = ['team_id', 'team_name']
data = data.drop(constant_columns, axis='columns')

Lastly, we'll drop the date, only because I need to find a nice way to incorporate it ...

In [13]:
# The date is dropped for now (no encoding has been chosen for it yet);
# then preview the pruned frame.
data = data.drop(labels='game_date', axis='columns')
data.head(n=5)

Unnamed: 0,action_type,game_event_id,lat,loc_x,loc_y,lon,minutes_remaining,period,playoffs,season,seconds_remaining,shot_distance,shot_made_flag,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,matchup,opponent,shot_id
0,Jump Shot,12,34.0443,-157,0,-118.4268,10,1,0,2000-01,22,15,0.0,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,LAL @ POR,POR,2
1,Jump Shot,35,33.9093,-101,135,-118.3708,7,1,0,2000-01,45,16,1.0,2PT Field Goal,Left Side Center(LC),Mid-Range,16-24 ft.,LAL @ POR,POR,3
2,Jump Shot,43,33.8693,138,175,-118.1318,6,1,0,2000-01,52,22,0.0,2PT Field Goal,Right Side Center(RC),Mid-Range,16-24 ft.,LAL @ POR,POR,4
3,Driving Dunk Shot,155,34.0443,0,0,-118.2698,6,2,0,2000-01,19,0,1.0,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,LAL @ POR,POR,5
4,Jump Shot,244,34.0553,-145,-11,-118.4148,9,3,0,2000-01,32,14,0.0,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,LAL @ POR,POR,6


### Dummy variables

We have some categorical variables like `action_type`, `period`, etc., which we convert to dummy indicator variables using `pd.get_dummies()`. This converts our data set from one with 18 input features to one with 219 input features.

In [14]:
# Categorical columns that are expanded into one-hot indicator columns.
dummy_fields = ['action_type', 'period', 'season', 'shot_type', 'shot_zone_area',
                'shot_zone_basic', 'shot_zone_range', 'matchup', 'opponent']

# A single get_dummies call encodes every listed column at once: each
# indicator is named `<column>_<value>`, the indicators are appended after the
# untouched columns in the order given by `dummy_fields`, and the original
# categorical columns are removed. This replaces growing the frame with
# pd.concat inside a loop followed by a separate drop.
data = pd.get_dummies(data, columns=dummy_fields, drop_first=False)
data.head()

Unnamed: 0,game_event_id,lat,loc_x,loc_y,lon,minutes_remaining,playoffs,seconds_remaining,shot_distance,shot_made_flag,...,opponent_PHI,opponent_PHX,opponent_POR,opponent_SAC,opponent_SAS,opponent_SEA,opponent_TOR,opponent_UTA,opponent_VAN,opponent_WAS
0,12,34.0443,-157,0,-118.4268,10,0,22,15,0.0,...,0,0,1,0,0,0,0,0,0,0
1,35,33.9093,-101,135,-118.3708,7,0,45,16,1.0,...,0,0,1,0,0,0,0,0,0,0
2,43,33.8693,138,175,-118.1318,6,0,52,22,0.0,...,0,0,1,0,0,0,0,0,0,0
3,155,34.0443,0,0,-118.2698,6,0,19,0,1.0,...,0,0,1,0,0,0,0,0,0,0
4,244,34.0553,-145,-11,-118.4148,9,0,32,14,0.0,...,0,0,1,0,0,0,0,0,0,0


### Scaling Continuous Variables

To help the network train we scale `lat`, `loc_x`, `loc_y` and `lon` to have mean 0 and standard deviation 1. 

In [15]:
# Standardise the coordinate features to zero mean and unit standard deviation.
cont_variables = ['lat', 'loc_x', 'loc_y', 'lon']

# Remember [mean, std] per column so the transformation can be inverted or
# re-applied to new data later.
scaled_cont_features = {}
for each in cont_variables:
    mean, std = data[each].mean(), data[each].std()
    scaled_cont_features[each] = [mean, std]
    # Assign the whole column rather than writing through `.loc[:, each]`:
    # `loc_x` and `loc_y` are integer columns, and writing floats into an
    # integer column through .loc is an incompatible-dtype setitem that recent
    # pandas versions deprecate. Column assignment simply replaces the column.
    data[each] = (data[each] - mean) / std

data.head()

Unnamed: 0,game_event_id,lat,loc_x,loc_y,lon,minutes_remaining,playoffs,seconds_remaining,shot_distance,shot_made_flag,...,opponent_PHI,opponent_PHX,opponent_POR,opponent_SAC,opponent_SAS,opponent_SEA,opponent_TOR,opponent_UTA,opponent_VAN,opponent_WAS
0,12,1.035226,-1.491267,-1.035226,-1.491267,10,0,22,15,0.0,...,0,0,1,0,0,0,0,0,0,0
1,35,-0.496218,-0.982514,0.496218,-0.982514,7,0,45,16,1.0,...,0,0,1,0,0,0,0,0,0,0
2,43,-0.949979,1.188769,0.949979,1.188769,6,0,52,22,0.0,...,0,0,1,0,0,0,0,0,0,0
3,155,1.035226,-0.064942,-1.035226,-0.064942,6,0,19,0,1.0,...,0,0,1,0,0,0,0,0,0,0
4,244,1.16001,-1.382248,-1.16001,-1.382248,9,0,32,14,0.0,...,0,0,1,0,0,0,0,0,0,0


We also scale `game_event_id`, `minutes_remaining`, `seconds_remaining`, `shot_distance` and `shot_id` to lie within the interval $[0,1]$

In [16]:
# Min-max scale the count-like features into the interval [0, 1].
interv_variables = ['game_event_id', 'minutes_remaining', 'seconds_remaining', 'shot_distance', 'shot_id']

# Remember [min, max] per column so the transformation can be inverted or
# re-applied to new data later.
scaled_interv_features = {}
for each in interv_variables:
    minn, maxx = data[each].min(), data[each].max()
    scaled_interv_features[each] = [minn, maxx]
    # Assign the whole column rather than writing through `.loc[:, each]`:
    # these columns are integer-valued, and writing floats into an integer
    # column through .loc is an incompatible-dtype setitem that recent pandas
    # versions deprecate. Column assignment simply replaces the column.
    data[each] = (data[each] - minn) / (maxx - minn)

data.head()

Unnamed: 0,game_event_id,lat,loc_x,loc_y,lon,minutes_remaining,playoffs,seconds_remaining,shot_distance,shot_made_flag,...,opponent_PHI,opponent_PHX,opponent_POR,opponent_SAC,opponent_SAS,opponent_SEA,opponent_TOR,opponent_UTA,opponent_VAN,opponent_WAS
0,0.015361,1.035226,-1.491267,-1.035226,-1.491267,0.909091,0,0.372881,0.189873,0.0,...,0,0,1,0,0,0,0,0,0,0
1,0.050691,-0.496218,-0.982514,0.496218,-0.982514,0.636364,0,0.762712,0.202532,1.0,...,0,0,1,0,0,0,0,0,0,0
2,0.06298,-0.949979,1.188769,0.949979,1.188769,0.545455,0,0.881356,0.278481,0.0,...,0,0,1,0,0,0,0,0,0,0
3,0.235023,1.035226,-0.064942,-1.035226,-0.064942,0.545455,0,0.322034,0.0,1.0,...,0,0,1,0,0,0,0,0,0,0
4,0.371736,1.16001,-1.382248,-1.16001,-1.382248,0.818182,0,0.542373,0.177215,0.0,...,0,0,1,0,0,0,0,0,0,0


## Building Models

Here we construct models to predict whether or not Kobe will make a given shot. In particular, we'll try a few things: 

  1) Training models on randomly selected shots throughout Kobe's career, and then, for a given shot we
  haven't seen, trying to predict if that shot will go in. This is clearly a naive way to train a model and make
  "predictions". It is analogous to trying to "predict" if the Jays will win a game knowing that they won the 
  World Series that year - knowing this likely means they had a good team, and in turn were more likely to win 
  throughout that season.
  
  2) Training/validating the models on shots from the early part of Kobe's career, and then for a given shot taken 
  at any point after (chronologically) we'll try to predict if that shot will go in.
  
  3) We'll also try training the models on a "historical" data set of fixed size (say a data set consisting of 
  10000 sequential shots) and then make predictions for the next game.

## Prediction Paradigm #1:

### Train, Validation and Test Sets

We now split the data up into training and testing sets. Creation of the validation set will be handled by TFLearn. We use the `to_categorical` function on the labels, yielding two output units, so that we can use the softmax function to classify the data.

In [14]:
# Feature matrix: every column except the label, as a float64 array.
# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0;
# the explicit cast keeps the matrix numeric even when the dummy columns have
# a non-float dtype.
X = data.drop('shot_made_flag', axis=1).values.astype(np.float64)
# Integer labels: 1 for a made shot, 0 otherwise.
Y = (data['shot_made_flag'] == 1.0).astype(np.int_)
records = len(Y)

# Seeded permutation of the row positions so the random split (and therefore
# every score computed from it) is reproducible across kernel restarts.
SPLIT_SEED = 42
shuffle = np.random.RandomState(SPLIT_SEED).permutation(records)

# NOTE: despite its name, `test_fraction` is the share of rows used for
# TRAINING; the remaining 1 - test_fraction of the rows form the test set.
test_fraction = 0.9
split_at = int(records*test_fraction)

train_split, test_split = shuffle[:split_at], shuffle[split_at:]
# Labels are expanded to two columns with to_categorical for the 2-unit
# softmax output of the networks.
trainX, trainY = X[train_split, :], to_categorical(Y.values[train_split], 2)
testX, testY = X[test_split, :], to_categorical(Y.values[test_split], 2)

### SKLearn

In [15]:
# GaussianNB wants the plain 0/1 label vector rather than the two-column
# trainY, so the labels are taken straight from Y using the split indices.
train_labels = Y.values[train_split]
test_labels = Y.values[test_split]

# initialise and fit the baseline classifier in one step (fit returns the estimator)
clfNB = GaussianNB().fit(trainX, train_labels)

# predict the held-out shots
predNB = clfNB.predict(testX)

# report accuracy on the test set
nb_accuracy = accuracy_score(test_labels, predNB)
print("Naive Bayes accuracy: ", nb_accuracy)

Naive Bayes accuracy:  0.615564202335


### TFLearn

#### Network Building

In [45]:
# Network building
def _build_dense_net(hidden_units, lr, n_inputs=218):
    """Build a fully connected softmax classifier wrapped in a tflearn.DNN.

    Args:
        hidden_units: iterable of hidden-layer widths, in order from the input
            side to the output side; one ReLU layer is created per entry.
        lr: learning rate passed to the SGD optimizer.
        n_inputs: number of input features per sample.

    Returns:
        A tflearn.DNN wrapping the network (input -> ReLU hidden layers ->
        2-unit softmax), set up with SGD and categorical cross-entropy loss.
    """
    # This resets all parameters and variables, leave this here
    tf.reset_default_graph()

    net = tflearn.input_data([None, n_inputs])                      # input
    for units in hidden_units:
        net = tflearn.fully_connected(net, units, activation='ReLU')  # hidden
    net = tflearn.fully_connected(net, 2, activation='softmax')     # output
    net = tflearn.regression(net, optimizer='sgd', learning_rate=lr, loss='categorical_crossentropy')

    return tflearn.DNN(net)


def build_model_1(h1, lr, n_inputs=218):
    """Return a network with one hidden layer of `h1` units.

    `lr` is the SGD learning rate; `n_inputs` is the number of input features
    (defaults to the 218 columns used in this notebook).
    """
    return _build_dense_net([h1], lr, n_inputs)


def build_model_2(h1, h2, lr, n_inputs=218):
    """Return a network with two hidden layers of `h1` and `h2` units.

    `lr` is the SGD learning rate; `n_inputs` is the number of input features
    (defaults to the 218 columns used in this notebook).
    """
    return _build_dense_net([h1, h2], lr, n_inputs)


def build_model_3(h1, h2, h3, lr, n_inputs=218):
    """Return a network with three hidden layers of `h1`, `h2` and `h3` units.

    `lr` is the SGD learning rate; `n_inputs` is the number of input features
    (defaults to the 218 columns used in this notebook).
    """
    return _build_dense_net([h1, h2, h3], lr, n_inputs)

#### Initialization and Training

In [17]:
# One-hidden-layer network: 50 hidden units, SGD learning rate 0.1.
model1 = build_model_1(h1=50, lr=0.1)

In [18]:
# Train for 25 epochs in mini-batches of 128; validation_set=0.1 lets TFLearn
# hold out a validation share of the training data itself.
model1.fit(
    trainX, trainY,
    n_epoch=25,
    batch_size=128,
    validation_set=0.1,
    show_metric=True,
)

Training Step: 4074  | total loss: [1m[32m0.60906[0m[0m | time: 0.414s
| SGD | epoch: 025 | loss: 0.60906 - acc: 0.6693 -- iter: 20736/20814
Training Step: 4075  | total loss: [1m[32m0.59811[0m[0m | time: 1.424s
| SGD | epoch: 025 | loss: 0.59811 - acc: 0.6805 | val_loss: 0.60861 - val_acc: 0.6853 -- iter: 20814/20814
--


In [19]:
# Two-hidden-layer network: 70 and 20 hidden units, SGD learning rate 0.1.
model2 = build_model_2(h1=70, h2=20, lr=0.1)

In [20]:
# Train for 25 epochs in mini-batches of 128; validation_set=0.1 lets TFLearn
# hold out a validation share of the training data itself.
model2.fit(
    trainX, trainY,
    n_epoch=25,
    batch_size=128,
    validation_set=0.1,
    show_metric=True,
)

Training Step: 4074  | total loss: [1m[32m0.59073[0m[0m | time: 0.431s
| SGD | epoch: 025 | loss: 0.59073 - acc: 0.6924 -- iter: 20736/20814
Training Step: 4075  | total loss: [1m[32m0.59417[0m[0m | time: 1.439s
| SGD | epoch: 025 | loss: 0.59417 - acc: 0.6896 | val_loss: 0.61452 - val_acc: 0.6680 -- iter: 20814/20814
--


In [26]:
# Three-hidden-layer network: 120, 60 and 20 hidden units, SGD learning rate 0.1.
model3 = build_model_3(h1=120, h2=60, h3=20, lr=0.1)

In [27]:
# Train for 25 epochs in mini-batches of 128; validation_set=0.1 lets TFLearn
# hold out a validation share of the training data itself.
model3.fit(
    trainX, trainY,
    n_epoch=25,
    batch_size=128,
    validation_set=0.1,
    show_metric=True,
)

Training Step: 4074  | total loss: [1m[32m0.60668[0m[0m | time: 0.493s
| SGD | epoch: 025 | loss: 0.60668 - acc: 0.6837 -- iter: 20736/20814
Training Step: 4075  | total loss: [1m[32m0.60574[0m[0m | time: 1.503s
| SGD | epoch: 025 | loss: 0.60574 - acc: 0.6809 | val_loss: 0.60327 - val_acc: 0.6814 -- iter: 20814/20814
--


#### Testing

One hidden layer network

In [28]:
# Compare output column 0 with label column 0: a sample counts as correct when
# the thresholded (>= 0.5) first output equals the first one-hot label entry.
first_output = np.array(model1.predict(testX))[:, 0]
predictions = (first_output >= 0.5).astype(np.int_)
is_correct = predictions == testY[:, 0]
test_accuracy = np.mean(is_correct, axis=0)
print("Test accuracy: ", test_accuracy)

Test accuracy:  0.681712062257


Two hidden layer network

In [29]:
# Compare output column 0 with label column 0: a sample counts as correct when
# the thresholded (>= 0.5) first output equals the first one-hot label entry.
first_output = np.array(model2.predict(testX))[:, 0]
predictions = (first_output >= 0.5).astype(np.int_)
is_correct = predictions == testY[:, 0]
test_accuracy = np.mean(is_correct, axis=0)
print("Test accuracy: ", test_accuracy)

Test accuracy:  0.671206225681


Three hidden layer network

In [30]:
# Compare output column 0 with label column 0: a sample counts as correct when
# the thresholded (>= 0.5) first output equals the first one-hot label entry.
first_output = np.array(model3.predict(testX))[:, 0]
predictions = (first_output >= 0.5).astype(np.int_)
is_correct = predictions == testY[:, 0]
test_accuracy = np.mean(is_correct, axis=0)
print("Test accuracy: ", test_accuracy)

Test accuracy:  0.678988326848


## Prediction paradigm #2:

In [30]:
# Sequential (unshuffled) split by row position: the first 75% of the rows
# train, the next 15% validate, the final 10% test.
# NOTE(review): this is only a chronological split if the rows of `data` are
# in chronological order - confirm against the source file.
test_fraction = 0.9    # row share at which the test block starts
valid_fraction = 0.75  # row share at which the validation block starts

valid_start = int(records*valid_fraction)
test_start = int(records*test_fraction)
labels = Y.values

trainX, trainY = X[:valid_start, :], to_categorical(labels[:valid_start], 2)
validX, validY = X[valid_start:test_start, :], to_categorical(labels[valid_start:test_start], 2)
testX, testY = X[test_start:, :], to_categorical(labels[test_start:], 2)

### SKLearn

In [43]:
# Naive Bayes needs no validation set, so it is fitted on the training and
# validation rows together (everything before the test block). GaussianNB
# wants the plain 0/1 label vector, hence Y rather than trainY/validY.
test_start = int(records*test_fraction)
fit_X = np.concatenate((trainX, validX), axis=0)
fit_labels = Y.values[:test_start]
test_labels = Y.values[test_start:]

# initialise and fit the baseline classifier in one step (fit returns the estimator)
clfNB = GaussianNB().fit(fit_X, fit_labels)

# predict the held-out shots
predNB = clfNB.predict(testX)

# report accuracy on the test set
nb_accuracy = accuracy_score(test_labels, predNB)
print("Naive Bayes accuracy: ", nb_accuracy)

Naive Bayes accuracy:  0.613229571984


#### Initialization and Training

In [92]:
# One-hidden-layer network: 13 hidden units, SGD learning rate 0.2.
# Tuning notes (hidden units, learning rate, epochs -> validation accuracy):
#   50, 0.2, 30-35 epochs ~68% validation accuracy
#   13, 0.2, 30-35 epochs ~68% validation accuracy
model1 = build_model_1(h1=13, lr=0.2)

In [93]:
# Train for 35 epochs in mini-batches of 128, validating on the explicit
# (validX, validY) block instead of a share picked by TFLearn.
model1.fit(
    trainX, trainY,
    n_epoch=35,
    batch_size=128,
    validation_set=(validX, validY),
    show_metric=True,
)

Training Step: 5284  | total loss: [1m[32m0.58206[0m[0m | time: 0.363s
| SGD | epoch: 035 | loss: 0.58206 - acc: 0.6958 -- iter: 19200/19272
Training Step: 5285  | total loss: [1m[32m0.59025[0m[0m | time: 1.374s
| SGD | epoch: 035 | loss: 0.59025 - acc: 0.6872 | val_loss: 0.61172 - val_acc: 0.6843 -- iter: 19272/19272
--


In [127]:
# Two-hidden-layer network: 10 and 10 hidden units, SGD learning rate 0.2.
# Tuning notes (hidden units, learning rate, epochs -> validation accuracy):
#   60, 15, 0.2, 25-30 epochs ~68% validation accuracy
#   10, 10, 0.2, 30-40 epochs ~68% validation accuracy
model2 = build_model_2(h1=10, h2=10, lr=0.2)

In [134]:
# Train for 35 epochs in mini-batches of 128, validating on the explicit
# (validX, validY) block.
# NOTE(review): the recorded output below this cell ends at epoch 050 although
# n_epoch is 35, which suggests the cell was run more than once on the same
# model; re-initialise model2 before training to reproduce a single 35-epoch run.
model2.fit(
    trainX, trainY,
    n_epoch=35,
    batch_size=128,
    validation_set=(validX, validY),
    show_metric=True,
)

Training Step: 7549  | total loss: [1m[32m0.57116[0m[0m | time: 0.392s
| SGD | epoch: 050 | loss: 0.57116 - acc: 0.6994 -- iter: 19200/19272
Training Step: 7550  | total loss: [1m[32m0.57137[0m[0m | time: 1.400s
| SGD | epoch: 050 | loss: 0.57137 - acc: 0.6998 | val_loss: 0.62073 - val_acc: 0.6789 -- iter: 19272/19272
--


In [160]:
# Three-hidden-layer network: 100, 40 and 5 hidden units, SGD learning rate 0.3.
# Tuning notes (hidden units, learning rate, epochs -> validation accuracy):
#   100, 40, 5, 0.3, 25-30 epochs ~68% validation accuracy
model3 = build_model_3(h1=100, h2=40, h3=5, lr=0.3)

In [161]:
# Train for 30 epochs in mini-batches of 128, validating on the explicit
# (validX, validY) block instead of a share picked by TFLearn.
model3.fit(
    trainX, trainY,
    n_epoch=30,
    batch_size=128,
    validation_set=(validX, validY),
    show_metric=True,
)

Training Step: 4529  | total loss: [1m[32m0.60995[0m[0m | time: 0.435s
| SGD | epoch: 030 | loss: 0.60995 - acc: 0.6601 -- iter: 19200/19272
Training Step: 4530  | total loss: [1m[32m0.60735[0m[0m | time: 1.441s
| SGD | epoch: 030 | loss: 0.60735 - acc: 0.6605 | val_loss: 0.62023 - val_acc: 0.6799 -- iter: 19272/19272
--


#### Testing

One hidden layer network

In [165]:
# Compare output column 0 with label column 0: a sample counts as correct when
# the thresholded (>= 0.5) first output equals the first one-hot label entry.
first_output = np.array(model1.predict(testX))[:, 0]
predictions = (first_output >= 0.5).astype(np.int_)
is_correct = predictions == testY[:, 0]
test_accuracy = np.mean(is_correct, axis=0)
print("Test accuracy: ", test_accuracy)

Test accuracy:  0.670817120623


Two hidden layer network

In [163]:
# Compare output column 0 with label column 0: a sample counts as correct when
# the thresholded (>= 0.5) first output equals the first one-hot label entry.
first_output = np.array(model2.predict(testX))[:, 0]
predictions = (first_output >= 0.5).astype(np.int_)
is_correct = predictions == testY[:, 0]
test_accuracy = np.mean(is_correct, axis=0)
print("Test accuracy: ", test_accuracy)

Test accuracy:  0.673929961089


Three hidden layer network

In [164]:
# Compare output column 0 with label column 0: a sample counts as correct when
# the thresholded (>= 0.5) first output equals the first one-hot label entry.
first_output = np.array(model3.predict(testX))[:, 0]
predictions = (first_output >= 0.5).astype(np.int_)
is_correct = predictions == testY[:, 0]
test_accuracy = np.mean(is_correct, axis=0)
print("Test accuracy: ", test_accuracy)

Test accuracy:  0.674708171206
