In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler

from sklearn.metrics import accuracy_score, log_loss
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from dtreeviz.trees import *

In [2]:
data = pd.read_csv('~/desktop/finalproject/data.csv')
data = data[data.shot_made_flag.notnull()].reset_index()

In [3]:
data['opponent'] = data.apply(lambda row: 'NOH' if row['season'] == '2004-05' and row['opponent'] == 'NOP' else row['opponent'], axis=1)

In [4]:
defense = pd.read_pickle('./defense_scrape/defense.pkl')
data = data.merge(defense, left_on=['opponent', 'season'], right_on=['Team_Abb', 'Season'], how='left')

In [5]:
def evaluateModel(model):
    print(f'Test Accuracy: {model.score(X_test, y_test):.4f}')
    print(f'Test Log-Loss: {log_loss(y_test, model.predict_proba(X_test)):.4f}')
    print(f'Training Accuracy: {model.score(X_train, y_train):.4f}')
    
def featureImportance(model):
    feature_importance = list(zip(list(X),model.feature_importances_))
    feature_importance.sort(key=lambda x: x[1], reverse=True)
    for f in feature_importance:
        print(f'{f[0]}: {f[1]}')

In [6]:
features = ['period', 'playoffs', 'seconds_remaining', 'minutes_remaining', 'season',
            'shot_distance', 'shot_made_flag', 'shot_zone_area', 'game_date', 'matchup', 
            'action_type', 'Def_Rtg']

dataset = data[features]
dataset['total_seconds_remaining'] = 60*dataset['minutes_remaining'] + dataset['seconds_remaining']
dataset.drop('minutes_remaining', axis=1, inplace=True)
dataset.drop('seconds_remaining', axis=1, inplace=True)
dataset['month'] = dataset['game_date'].map(lambda x: x.split('-')[1])
dataset.drop('game_date', axis=1, inplace=True)
dataset['home'] = dataset['matchup'].str.contains('vs').astype('int')
dataset.drop('matchup', axis=1, inplace=True)
dataset["action_type"] = dataset["action_type"].astype('category')
dataset["period"] = dataset["period"].astype('category')
dataset["playoffs"] = dataset["playoffs"].astype('category')
dataset["season"] = dataset["season"].astype('category')
dataset["shot_made_flag"] = dataset["shot_made_flag"].astype('category')
dataset["month"] = dataset["month"].astype('category')
dataset["home"] = dataset["home"].astype('category')
multi_categorical_features = ['action_type', 'period', 'season', 'month', 'shot_zone_area']
numerical_features = ['total_seconds_remaining', 'shot_distance']
for c in multi_categorical_features:
    dummies = pd.get_dummies(dataset[c], drop_first=True)
    dummies = dummies.add_prefix("{}#".format(c))
    dataset.drop(c, axis=1, inplace=True)
    dataset = dataset.join(dummies)

X = dataset.loc[:, dataset.columns != 'shot_made_flag']
y = dataset['shot_made_flag']
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [13]:
from sklearn.ensemble import GradientBoostingClassifier

rf = GradientBoostingClassifier(n_estimators=120, max_depth=4, min_samples_split=4)
grid_params = {'min_samples_split':np.arange(2,10,1)}
gs = GridSearchCV(estimator=rf,  
                  param_grid=grid_params,
                  scoring='neg_log_loss',
                  cv=5)
gs.fit(X_train, y_train)
evaluateModel(gs)
print(f"{gs.best_estimator_}")

Test Accuracy: -0.6007
Test Log-Loss: 0.6007
Training Accuracy: -0.5824
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=4,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=4,
              min_weight_fraction_leaf=0.0, n_estimators=120,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)


In [None]:
gs.best_estimator_timator_

In [None]:
features = ['period', 'playoffs', 'seconds_remaining', 'minutes_remaining', 'season',
            'shot_distance', 'shot_made_flag', 'shot_zone_area', 'game_date', 'matchup', 
            'action_type', 'Def_Rtg']

dataset = data[features]
dataset['total_seconds_remaining'] = 60*dataset['minutes_remaining'] + dataset['seconds_remaining']
dataset.drop('minutes_remaining', axis=1, inplace=True)
dataset.drop('seconds_remaining', axis=1, inplace=True)
dataset['month'] = dataset['game_date'].map(lambda x: x.split('-')[1])
dataset.drop('game_date', axis=1, inplace=True)
dataset['home'] = dataset['matchup'].str.contains('vs').astype('int')
dataset.drop('matchup', axis=1, inplace=True)
dataset["action_type"] = dataset["action_type"].astype('category')
dataset["period"] = dataset["period"].astype('category')
dataset["playoffs"] = dataset["playoffs"].astype('category')
dataset["season"] = dataset["season"].astype('category')
dataset["shot_made_flag"] = dataset["shot_made_flag"].astype('category')
dataset["month"] = dataset["month"].astype('category')
dataset["home"] = dataset["home"].astype('category')
multi_categorical_features = ['action_type', 'period', 'season', 'month', 'shot_zone_area']
numerical_features = ['total_seconds_remaining', 'shot_distance']
for c in multi_categorical_features:
    dummies = pd.get_dummies(dataset[c], drop_first=True)
    dummies = dummies.add_prefix("{}#".format(c))
    dataset.drop(c, axis=1, inplace=True)
    dataset = dataset.join(dummies)

X = dataset.loc[:, dataset.columns != 'shot_made_flag']
y = dataset['shot_made_flag']
X_train, X_test, y_train, y_test = train_test_split(X, y)