In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np

In [3]:
data = pd.read_csv('all/data.csv')

In [4]:
data = data[data.shot_made_flag.notnull()].reset_index()

In [5]:
features = ['combined_shot_type', 'period', 'playoffs', 'seconds_remaining', 'minutes_remaining', 'season',
          'shot_distance', 'shot_made_flag', 'shot_zone_area', 'game_date', 'matchup']

In [6]:
dataset = data[features]

In [7]:
dataset.head()

Unnamed: 0,combined_shot_type,period,playoffs,seconds_remaining,minutes_remaining,season,shot_distance,shot_made_flag,shot_zone_area,game_date,matchup
0,Jump Shot,1,0,22,10,2000-01,15,0.0,Left Side(L),2000-10-31,LAL @ POR
1,Jump Shot,1,0,45,7,2000-01,16,1.0,Left Side Center(LC),2000-10-31,LAL @ POR
2,Jump Shot,1,0,52,6,2000-01,22,0.0,Right Side Center(RC),2000-10-31,LAL @ POR
3,Dunk,2,0,19,6,2000-01,0,1.0,Center(C),2000-10-31,LAL @ POR
4,Jump Shot,3,0,32,9,2000-01,14,0.0,Left Side(L),2000-10-31,LAL @ POR


In [8]:
dataset.matchup[100]

'LAL vs. HOU'

In [9]:
dataset['total_seconds_remaining'] = 60*dataset['minutes_remaining'] + dataset['seconds_remaining']
dataset.drop('minutes_remaining', axis=1, inplace=True)
dataset.drop('seconds_remaining', axis=1, inplace=True)

In [10]:
dataset['month'] = dataset['game_date'].map(lambda x: x.split('-')[1])

In [11]:
dataset.drop('game_date', axis=1, inplace=True)

In [12]:
dataset['home'] = dataset['matchup'].str.contains('vs').astype('int')
dataset.drop('matchup', axis=1, inplace=True)

In [13]:
dataset.head()

Unnamed: 0,combined_shot_type,period,playoffs,season,shot_distance,shot_made_flag,shot_zone_area,total_seconds_remaining,month,home
0,Jump Shot,1,0,2000-01,15,0.0,Left Side(L),622,10,0
1,Jump Shot,1,0,2000-01,16,1.0,Left Side Center(LC),465,10,0
2,Jump Shot,1,0,2000-01,22,0.0,Right Side Center(RC),412,10,0
3,Dunk,2,0,2000-01,0,1.0,Center(C),379,10,0
4,Jump Shot,3,0,2000-01,14,0.0,Left Side(L),572,10,0


In [14]:
dataset["combined_shot_type"] = dataset["combined_shot_type"].astype('category')
dataset["period"] = dataset["period"].astype('category')
dataset["playoffs"] = dataset["playoffs"].astype('category')
dataset["season"] = dataset["season"].astype('category')
dataset["shot_made_flag"] = dataset["shot_made_flag"].astype('category')
dataset["month"] = dataset["home"].astype('category')
dataset["home"] = dataset["home"].astype('category')

In [15]:
multi_categorical_features = ['combined_shot_type', 'period', 'season', 'month', 'shot_zone_area']

In [16]:
for c in multi_categorical_features:
    dummies = pd.get_dummies(dataset[c], drop_first=True)
    dummies = dummies.add_prefix("{}#".format(c))
    dataset.drop(c, axis=1, inplace=True)
    dataset = dataset.join(dummies)

In [17]:
dataset.head()

Unnamed: 0,playoffs,shot_distance,shot_made_flag,total_seconds_remaining,home,combined_shot_type#Dunk,combined_shot_type#Hook Shot,combined_shot_type#Jump Shot,combined_shot_type#Layup,combined_shot_type#Tip Shot,...,season#2012-13,season#2013-14,season#2014-15,season#2015-16,month#1,shot_zone_area#Center(C),shot_zone_area#Left Side Center(LC),shot_zone_area#Left Side(L),shot_zone_area#Right Side Center(RC),shot_zone_area#Right Side(R)
0,0,15,0.0,622,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,16,1.0,465,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,22,0.0,412,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,1.0,379,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,14,0.0,572,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0


In [18]:
X = dataset.loc[:, dataset.columns != 'shot_made_flag']
Y = dataset['shot_made_flag']

In [19]:
X.columns

Index(['playoffs', 'shot_distance', 'total_seconds_remaining', 'home',
       'combined_shot_type#Dunk', 'combined_shot_type#Hook Shot',
       'combined_shot_type#Jump Shot', 'combined_shot_type#Layup',
       'combined_shot_type#Tip Shot', 'period#2', 'period#3', 'period#4',
       'period#5', 'period#6', 'period#7', 'season#1997-98', 'season#1998-99',
       'season#1999-00', 'season#2000-01', 'season#2001-02', 'season#2002-03',
       'season#2003-04', 'season#2004-05', 'season#2005-06', 'season#2006-07',
       'season#2007-08', 'season#2008-09', 'season#2009-10', 'season#2010-11',
       'season#2011-12', 'season#2012-13', 'season#2013-14', 'season#2014-15',
       'season#2015-16', 'month#1', 'shot_zone_area#Center(C)',
       'shot_zone_area#Left Side Center(LC)', 'shot_zone_area#Left Side(L)',
       'shot_zone_area#Right Side Center(RC)', 'shot_zone_area#Right Side(R)'],
      dtype='object')

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, Y)

In [23]:
clf = LogisticRegression().fit(X_train, y_train)

In [24]:
clf.score(X_test, y_test)

0.6107392996108949

In [25]:
from sklearn.metrics import log_loss

In [33]:
log_loss(y_test, clf.predict_proba(X_test))

0.6534494775292955