In [1]:
reset -fs

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import pandas as pd
import numpy as np

# Data Preprocessing

In [4]:
data = pd.read_csv('../data.csv')

In [5]:
data = data[data.shot_made_flag.notnull()].reset_index()

In [6]:
features = ['combined_shot_type', 'period', 'playoffs', 'seconds_remaining', 'minutes_remaining', 'season',
          'shot_distance', 'shot_made_flag', 'shot_zone_area', 'game_date', 'matchup']

In [7]:
dataset = data[features]

In [8]:
dataset['total_seconds_remaining'] = 60*dataset['minutes_remaining'] + dataset['seconds_remaining']
dataset.drop('minutes_remaining', axis=1, inplace=True)
dataset.drop('seconds_remaining', axis=1, inplace=True)

In [9]:
dataset['month'] = dataset['game_date'].map(lambda x: x.split('-')[1])

In [10]:
dataset.drop('game_date', axis=1, inplace=True)

In [11]:
dataset['home'] = dataset['matchup'].str.contains('vs').astype('int')
dataset.drop('matchup', axis=1, inplace=True)

In [12]:
dataset.head()

Unnamed: 0,combined_shot_type,period,playoffs,season,shot_distance,shot_made_flag,shot_zone_area,total_seconds_remaining,month,home
0,Jump Shot,1,0,2000-01,15,0.0,Left Side(L),622,10,0
1,Jump Shot,1,0,2000-01,16,1.0,Left Side Center(LC),465,10,0
2,Jump Shot,1,0,2000-01,22,0.0,Right Side Center(RC),412,10,0
3,Dunk,2,0,2000-01,0,1.0,Center(C),379,10,0
4,Jump Shot,3,0,2000-01,14,0.0,Left Side(L),572,10,0


In [13]:
dataset["combined_shot_type"] = dataset["combined_shot_type"].astype('category')
dataset["period"] = dataset["period"].astype('category')
dataset["playoffs"] = dataset["playoffs"].astype('category')
dataset["season"] = dataset["season"].astype('category')
dataset["shot_made_flag"] = dataset["shot_made_flag"].astype('category')
dataset["month"] = dataset["home"].astype('category')
dataset["home"] = dataset["home"].astype('category')

In [14]:
multi_categorical_features = ['combined_shot_type', 'period', 'season', 'month', 'shot_zone_area']

In [15]:
for c in multi_categorical_features:
    dummies = pd.get_dummies(dataset[c], drop_first=True)
    dummies = dummies.add_prefix("{}#".format(c))
    dataset.drop(c, axis=1, inplace=True)
    dataset = dataset.join(dummies)

# Sample from dataset since it's too big

In [16]:
sample = dataset.sample(10000)

In [17]:
X = sample.loc[:, dataset.columns != 'shot_made_flag']
Y = sample['shot_made_flag']

In [18]:
X.columns

Index(['playoffs', 'shot_distance', 'total_seconds_remaining', 'home',
       'combined_shot_type#Dunk', 'combined_shot_type#Hook Shot',
       'combined_shot_type#Jump Shot', 'combined_shot_type#Layup',
       'combined_shot_type#Tip Shot', 'period#2', 'period#3', 'period#4',
       'period#5', 'period#6', 'period#7', 'season#1997-98', 'season#1998-99',
       'season#1999-00', 'season#2000-01', 'season#2001-02', 'season#2002-03',
       'season#2003-04', 'season#2004-05', 'season#2005-06', 'season#2006-07',
       'season#2007-08', 'season#2008-09', 'season#2009-10', 'season#2010-11',
       'season#2011-12', 'season#2012-13', 'season#2013-14', 'season#2014-15',
       'season#2015-16', 'month#1', 'shot_zone_area#Center(C)',
       'shot_zone_area#Left Side Center(LC)', 'shot_zone_area#Left Side(L)',
       'shot_zone_area#Right Side Center(RC)', 'shot_zone_area#Right Side(R)'],
      dtype='object')

# Start of linear SVM modeling

In [19]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.20)

In [22]:
X_train.shape

(8000, 40)

In [23]:
svclassifier = SVC(kernel='linear')  
svclassifier.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [24]:
y_pred = svclassifier.predict(X_test)

In [25]:
from sklearn.metrics import classification_report, confusion_matrix  
print(confusion_matrix(y_test,y_pred))  
print(classification_report(y_test,y_pred))  

[[935 173]
 [588 304]]
              precision    recall  f1-score   support

         0.0       0.61      0.84      0.71      1108
         1.0       0.64      0.34      0.44       892

   micro avg       0.62      0.62      0.62      2000
   macro avg       0.63      0.59      0.58      2000
weighted avg       0.62      0.62      0.59      2000



In [26]:
svclassifier.score(X_train, y_train)

0.6145

In [27]:
accuracy_score(y_pred, y_test)

0.6195