In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Read the raw shot log, then discard every row that contains a missing value.
df = pd.read_csv('../data/data.csv')
df = df.dropna()

# Quick sanity check: frame dimensions followed by the column names.
print(df.shape)
print(df.columns)

(25697, 25)
Index([u'action_type', u'combined_shot_type', u'game_event_id', u'game_id',
       u'lat', u'loc_x', u'loc_y', u'lon', u'minutes_remaining', u'period',
       u'playoffs', u'season', u'seconds_remaining', u'shot_distance',
       u'shot_made_flag', u'shot_type', u'shot_zone_area', u'shot_zone_basic',
       u'shot_zone_range', u'team_id', u'team_name', u'game_date', u'matchup',
       u'opponent', u'shot_id'],
      dtype='object')


In [2]:
# Preview the first five rows of the frame.
df.head(n=5)

Unnamed: 0,action_type,combined_shot_type,game_event_id,game_id,lat,loc_x,loc_y,lon,minutes_remaining,period,...,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,team_id,team_name,game_date,matchup,opponent,shot_id
1,Jump Shot,Jump Shot,12,20000012,34.0443,-157,0,-118.4268,10,1,...,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,2
2,Jump Shot,Jump Shot,35,20000012,33.9093,-101,135,-118.3708,7,1,...,2PT Field Goal,Left Side Center(LC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,3
3,Jump Shot,Jump Shot,43,20000012,33.8693,138,175,-118.1318,6,1,...,2PT Field Goal,Right Side Center(RC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,4
4,Driving Dunk Shot,Dunk,155,20000012,34.0443,0,0,-118.2698,6,2,...,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,5
5,Jump Shot,Jump Shot,244,20000012,34.0553,-145,-11,-118.4148,9,3,...,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,6


## Information about the data
As we can see, there are 25,697 labelled shots and 25 columns (one column, shot_made_flag, is what we're trying to predict: whether or not Kobe made the shot). Here are brief descriptions of what each column means:

- action_type:

Values: Jump Shot, Driving Dunk, Layup Shot, Running Jump Shot, etc.

This basically means the type of shot taken


- combined_shot_type

Values: Jump Shot, Dunk, Layup

Same thing as action_type but more generalized. action_type will say "Running Jump Shot" while combined_shot_type will say "Jump Shot," so action_type is more descriptive.

- game_event_id, game_id, shot_id

Identifiers

- loc_x, loc_y

(x,y) location of the position where the shot was taken. (0,0) refers to the location of the basket

- lat, lon

GPS coordinates of the stadium where the game was played

- minutes_remaining

Minutes remaining in the quarter

- period

The quarter

- playoffs

Flag for whether or not this was a playoff game

- season

The year of the game

- seconds_remaining

Seconds remaining in the minute

- shot_distance

Euclidean distance from the basket in feet

- shot_type

2 PT or 3 PT shot (this can also be determined by shot distance)

- shot_zone_area

Side of the court

- shot_zone_basic

Descriptive locations of where the shots are taken (Restricted Area, in the paint, Mid-Range, etc.)

- shot_zone_range

Range of where the shot was taken

- team_id, team_name

ID, name of the team (Kobe's team)

- game_date

Date of the game

- matchup

The away vs home team

- opponent

The name of the opponent team

- shot_made_flag

Flag for if the shot is made or not

## Pre Processing - Get Dummies

In [3]:
# Columns removed before encoding; none of them are used as model inputs below.
drop_list = ['combined_shot_type', 'game_event_id', 'game_id', 'lat', 'lon', 'shot_type', 'shot_zone_area',
            'shot_zone_basic', 'shot_zone_range', 'team_id', 'team_name', 'shot_id', 'opponent']

def drop_columns(df):
    """Return a new frame without the columns named in ``drop_list``.

    Columns that are already absent are skipped rather than raising, so this
    cell can be re-run after ``df`` has been reassigned below.
    """
    return df.drop(drop_list, axis=1, errors='ignore')

df = drop_columns(df)
# One-hot encode the remaining non-numeric (object/category) columns.
df = pd.get_dummies(df)
print(df.shape)

# Separate the prediction target from the feature matrix.
y = df['shot_made_flag']
X = df.drop('shot_made_flag', axis=1)

# Hold out 33% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

(25697, 1715)


In [None]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# Baseline model: use only the (x, y) court position of each shot as features.
X_train_simple = X_train[['loc_x', 'loc_y']]
X_test_simple = X_test[['loc_x', 'loc_y']]

# Try using XGBoost
# Hyper-parameters explored by the grid search (3 x 3 = 9 combinations).
cv_params = {'max_depth': [3,5,7], 'min_child_weight': [1,3,5]}
# Hyper-parameters held fixed for every candidate in the grid.
ind_params = {'learning_rate': 0.1, 'n_estimators': 1000, 'seed':0, 'subsample': 0.8, 'colsample_bytree': 0.8,
             'objective': 'binary:logistic'}

# 3-fold cross-validated grid search, scored on accuracy; n_jobs=-1 runs the fits in parallel.
clf_simple = GridSearchCV(
    xgb.XGBClassifier(**ind_params),
    cv_params,
    scoring = 'accuracy', cv=3, n_jobs=-1
)


clf_simple.fit(X_train_simple, y_train)

# Predictions on the held-out rows; reused later to count misclassified shots.
predictions_simple = clf_simple.predict(X_test_simple)

# Function-call form of print works on both Python 2 and Python 3.
print("Done") # Takes awhile to run...

Done


In [5]:
# Dump the per-candidate grid-search diagnostics (scores, ranks, timings) for the two-feature model.
print(clf_simple.cv_results_)

{'std_train_score': array([ 0.0029669 ,  0.00320376,  0.00298467,  0.00231833,  0.00247814,
        0.00173389,  0.00404398,  0.0024756 ,  0.00177908]), 'rank_test_score': array([2, 1, 3, 8, 5, 4, 9, 7, 6]), 'mean_score_time': array([ 0.08400003,  0.0893333 ,  0.09499995,  0.14633338,  0.22066681,
        0.18266662,  0.21033343,  0.21033335,  0.19999997]), 'param_max_depth': masked_array(data = [3 3 3 5 5 5 7 7 7],
             mask = [False False False False False False False False False],
       fill_value = ?)
, 'std_test_score': array([ 0.00175272,  0.0024609 ,  0.00144506,  0.00218803,  0.00424183,
        0.00389584,  0.00341969,  0.00387335,  0.00296613]), 'split1_train_score': array([ 0.63988847,  0.63962708,  0.63710029,  0.64877581,  0.64668467,
        0.6456391 ,  0.65086695,  0.64877581,  0.6481659 ]), 'split0_test_score': array([ 0.59156647,  0.59383168,  0.58999826,  0.57762676,  0.57832375,
        0.57797526,  0.57623279,  0.57640704,  0.57658129]), 'mean_test_score':

In [7]:
# Score of the two-feature grid-searched model on the held-out test split.
print(clf_simple.score(X_test_simple, y_test))

0.586959085014


In [None]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# Same search as the two-feature baseline, but trained on every encoded feature column.
# Hyper-parameters explored by the grid search (3 x 3 = 9 combinations).
cv_params = {'max_depth': [3,5,7], 'min_child_weight': [1,3,5]}
# Hyper-parameters held fixed for every candidate in the grid.
ind_params = {'learning_rate': 0.1, 'n_estimators': 1000, 'seed':0, 'subsample': 0.8, 'colsample_bytree': 0.8,
             'objective': 'binary:logistic'}

# 3-fold cross-validated grid search, scored on accuracy; n_jobs=-1 runs the fits in parallel.
clf = GridSearchCV(
    xgb.XGBClassifier(**ind_params),
    cv_params,
    scoring = 'accuracy', cv=3, n_jobs=-1
)


clf.fit(X_train, y_train)

# Function-call form of print works on both Python 2 and Python 3.
print("Done") # Takes awhile to run...

In [None]:
# Dump the per-candidate grid-search diagnostics for the full-feature model.
print(clf.cv_results_)

In [None]:
# Score of the full-feature grid-searched model on the held-out test split.
print(clf.score(X_test, y_test))

In [5]:
# Positions in the test split where the two-feature model's prediction differs from the label.
# np.where with a single argument returns a tuple of index arrays; element 0 holds the positions.
misses_simple = np.where(y_test != predictions_simple)
# Number of misclassified shots, followed by the size of the test split.
print(len(misses_simple[0]))
print(len(y_test))

3403
8481
