In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Read the shot log, then discard every row that contains a missing value.
df = pd.read_csv('../data/data.csv')
df = df.dropna()

# Quick look at the size of the remaining data and the columns available.
print(df.shape)
print(df.columns)

(25697, 25)
Index([u'action_type', u'combined_shot_type', u'game_event_id', u'game_id',
       u'lat', u'loc_x', u'loc_y', u'lon', u'minutes_remaining', u'period',
       u'playoffs', u'season', u'seconds_remaining', u'shot_distance',
       u'shot_made_flag', u'shot_type', u'shot_zone_area', u'shot_zone_basic',
       u'shot_zone_range', u'team_id', u'team_name', u'game_date', u'matchup',
       u'opponent', u'shot_id'],
      dtype='object')


In [2]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,action_type,combined_shot_type,game_event_id,game_id,lat,loc_x,loc_y,lon,minutes_remaining,period,...,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,team_id,team_name,game_date,matchup,opponent,shot_id
1,Jump Shot,Jump Shot,12,20000012,34.0443,-157,0,-118.4268,10,1,...,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,2
2,Jump Shot,Jump Shot,35,20000012,33.9093,-101,135,-118.3708,7,1,...,2PT Field Goal,Left Side Center(LC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,3
3,Jump Shot,Jump Shot,43,20000012,33.8693,138,175,-118.1318,6,1,...,2PT Field Goal,Right Side Center(RC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,4
4,Driving Dunk Shot,Dunk,155,20000012,34.0443,0,0,-118.2698,6,2,...,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,5
5,Jump Shot,Jump Shot,244,20000012,34.0553,-145,-11,-118.4148,9,3,...,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,6


## Information about the data
As we can see, there are 25,697 labelled shots and 25 columns (one of the columns, shot_made_flag, is what we're trying to predict: whether or not Kobe made the shot). Here are brief descriptions of what each column means:

- action_type:

Values: Jump Shot, Driving Dunk, Layup Shot, Running Jump Shot, etc.

This basically means the type of shot taken


- combined_shot_type

Values: Jump Shot, Dunk, Layup

Same thing as action_type but more generalized. action_type will say "Running Jump Shot" while combined_shot_type will say "Jump Shot," so action_type is more descriptive.

- game_event_id, game_id, shot_id

Identifiers

- loc_x, loc_y

(x,y) location of the position where the shot was taken. (0,0) refers to the location of the basket

- lat, lon

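Latitude/longitude values attached to each shot. Note that in the sample rows above they change from shot to shot within the same game and move in step with loc_x/loc_y, so they appear to encode the shot position rather than the location of the stadium where the game was played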

- minutes_remaining

Minutes remaining in the quarter

- period

The quarter

- playoffs

Flag for whether or not this was a playoff game

- season

The year of the game

- seconds_remaining

Seconds remaining in the minute

- shot_distance

Euclidean distance from the basket in feet

- shot_type

2 PT or 3 PT shot (this can also be determined by shot distance)

- shot_zone_area

Side of the court

- shot_zone_basic

Descriptive locations of where the shots are taken (Restricted Area, In the Paint, Mid-Range, etc.)

- shot_zone_range

Range of where the shot was taken

- team_id, team_name

ID, name of the team (Kobe's team)

- game_date

Date of the game

- matchup

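The two teams in the game, written from the Lakers' side: an "@" marks an away game (e.g. "LAL @ POR" means the Lakers played away at POR); a matchup without "@" is a home game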

- opponent

The name of the opponent team

- shot_made_flag

Flag for if the shot is made or not

## Pre Processing

In [3]:
from sklearn import preprocessing

# TODO - Compare performance between using combined_shot_type and action_type
# Columns kept after pre-processing: the model inputs plus the label.
used_columns = [
    # shot description and position
    'action_type',
    'loc_x',
    'loc_y',
    # game context ('minutes_remaining' and 'season' are recomputed and
    # 'home' is derived from 'matchup' by the processing functions below)
    'minutes_remaining',
    'playoffs',
    'season',
    'seconds_remaining',
    'home',
    # label to predict
    'shot_made_flag',
]

# Whether or not this was a home game
def home_game(matchup):
    """Return 1 for a home game and 0 for an away game.

    A matchup string that contains '@' (for example 'LAL @ POR') is treated
    as an away game; any other matchup string counts as a home game.
    """
    return 0 if '@' in matchup else 1

# Next 2 functions are used to get the season # where 1 is his first year
def extract_season(dt):
    """Return the career season number for a game date (1 = the 1996-97 season).

    dt must expose .year and .month. Dates from August onwards are counted
    as part of the season starting that calendar year; dates from January
    to July belong to the season that started the previous calendar year.
    """
    rookie_start_year = 1996
    season_number = dt.year - rookie_start_year
    if dt.month >= 8:
        # A new season has started in this calendar year.
        season_number += 1
    return season_number

def process_season_number(df):
    """Replace 'season' with the career season number and drop 'game_date'.

    The 'game_date' column is parsed with pd.to_datetime, the rows are sorted
    by that date, and 'season' is set to extract_season(date) for every row.

    The caller's DataFrame is not modified: every step works on a new frame.
    (The previous version overwrote df['game_date'] on the caller's frame and
    created temporary 'year'/'month'/'day' columns that were never used.)

    Returns a new DataFrame, sorted by game date, without 'game_date'.
    """
    # assign() returns a new frame, so the input keeps its original 'game_date'
    dated = df.assign(game_date=pd.to_datetime(df['game_date']))
    dated = dated.sort_values(['game_date'])
    dated = dated.assign(season=dated['game_date'].apply(extract_season))
    return dated.drop(['game_date'], axis=1)

# This function is used for determine how much time is left in the game (in minutes)
def process_minutes_left(df):
    """Turn per-period 'minutes_remaining' into minutes left in the game.

    Each period still to be played after the current one, up to period 4,
    adds 12 minutes. For overtime periods (period > 4) no further periods
    are counted, so the value stays equal to the minutes left in that period;
    the previous formula produced negative values there.

    The caller's DataFrame is not modified. Returns a new DataFrame with
    'minutes_remaining' replaced and the 'period' column removed.
    """
    # Clamp at zero so overtime periods do not subtract time.
    periods_after_current = (4 - df['period']).clip(lower=0)
    minutes_left = periods_after_current * 12 + df['minutes_remaining']
    # assign() builds a new frame and keeps the column in its original position
    return df.assign(minutes_remaining=minutes_left).drop(['period'], axis=1)

# All pre-processing
def process_data(df):
    """Run all pre-processing steps and return only the columns in used_columns.

    Steps, in order:
      1. label-encode 'action_type' and 'opponent' as integers,
      2. add 'home' (1/0) derived from 'matchup' via home_game,
      3. replace 'season' via process_season_number (also sorts by date),
      4. replace 'minutes_remaining' via process_minutes_left.

    The work is done on a copy, so the DataFrame passed in keeps its original
    values; previously the caller's frame had several columns overwritten.
    """
    # Copy first: the column assignments below would otherwise write into the
    # caller's frame.
    df = df.copy()
    # Convert the action types and opponents to integer codes.
    # NOTE: the fitted encoders are discarded, so the codes cannot be mapped
    # back to their original labels after this function returns.
    le_action = preprocessing.LabelEncoder()
    le_opp = preprocessing.LabelEncoder()
    df['action_type'] = le_action.fit_transform(df['action_type'].values)
    df['opponent'] = le_opp.fit_transform(df['opponent'].values)
    df['home'] = df['matchup'].apply(home_game)
    df = process_season_number(df)
    df = process_minutes_left(df)
    return df[used_columns]
    
df_p = process_data(df)

# Separate the label from the model inputs.
y = df_p['shot_made_flag']
X = df_p.drop('shot_made_flag', axis=1)

# Hold out a third of the shots for testing; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# A short preview is enough to check the processed columns.
df_p.head(10)

Unnamed: 0,action_type,loc_x,loc_y,minutes_remaining,playoffs,season,seconds_remaining,home,shot_made_flag
22901,25,-140,116,36,0,1,42,1,0.0
22902,25,-131,97,34,0,1,8,0,0.0
22903,25,-142,181,32,0,1,37,0,1.0
22904,25,0,0,30,0,1,34,0,0.0
22905,25,-10,138,29,0,1,27,0,1.0
22907,25,-64,223,26,0,1,16,0,1.0
22909,25,-79,177,13,0,1,53,0,0.0
22910,25,-103,207,13,0,1,14,0,1.0
22911,11,0,0,12,0,1,2,0,0.0
22912,25,-155,175,9,0,1,9,0,0.0


In [4]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# Baseline model: use only the court position of each shot.
location_columns = ['loc_x', 'loc_y']
X_train_simple = X_train[location_columns]
X_test_simple = X_test[location_columns]

# Try using XGBoost
# Hyper-parameters explored by the grid search.
cv_params = {
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
}
# Hyper-parameters held fixed for every candidate.
ind_params = {
    'learning_rate': 0.1,
    'n_estimators': 1000,
    'seed': 0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'binary:logistic',
}

clf_simple = GridSearchCV(
    estimator=xgb.XGBClassifier(**ind_params),
    param_grid=cv_params,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
clf_simple.fit(X_train_simple, y_train)

predictions_simple = clf_simple.predict(X_test_simple)

In [7]:
# Function-call form of print: runs on Python 3 as well as Python 2 and
# matches the print(...) calls used in the first cell.
print(clf_simple.cv_results_)

{'rank_test_score': array([1, 3, 2, 6, 5, 4, 8, 9, 7]), 'std_score_time': array([ 0.00803988,  0.00282839,  0.00135647,  0.01509434,  0.01166189,
        0.0054185 ,  0.00503592,  0.01018633,  0.00711625]), 'split2_test_score': array([ 0.58495498,  0.58727854,  0.5864072 ,  0.57914609,  0.5811792 ,
        0.5811792 ,  0.57449898,  0.57595121,  0.57827476]), 'mean_fit_time': array([ 4.40360007,  4.59819999,  4.35220003,  6.79360003,  6.37080002,
        6.17339993,  7.31440005,  7.23959999,  6.48359995]), 'split3_train_score': array([ 0.62898424,  0.6297103 ,  0.62891164,  0.63544616,  0.6346475 ,
        0.63435708,  0.63784215,  0.63631743,  0.63559137]), 'std_test_score': array([ 0.00376496,  0.00328812,  0.00392151,  0.00260872,  0.00337703,
        0.00296384,  0.00400761,  0.00289484,  0.00334077]), 'params': ({'max_depth': 3, 'min_child_weight': 1}, {'max_depth': 3, 'min_child_weight': 3}, {'max_depth': 3, 'min_child_weight': 5}, {'max_depth': 5, 'min_child_weight': 1}, {'max_de

In [5]:


# Positions in the test set where the prediction differs from the true label.
misses_simple = np.where(y_test != predictions_simple)
# Function-call form of print so the cell runs on Python 2 and Python 3.
print(len(misses_simple[0]))  # number of misclassified test shots
print(len(y_test))            # total number of test shots

3403
8481
