### clean data

In [1]:
import pandas as pd
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split

In [3]:
ALLdata = pd.read_csv('../Kobe-Project-master/data.csv')
missing_value = ALLdata['shot_made_flag'].isnull()

In [4]:
data = ALLdata.copy() # create a copy of data frame
ALL_Y = data['shot_made_flag'].copy()

In [5]:
print(data.shape)
data.drop('team_name', axis=1, inplace=True) # Always LA Lakers
print(data.shape)
data.drop('team_id', axis=1, inplace=True) # Always one number
print(data.shape)
data.drop('shot_made_flag', axis=1, inplace=True)
print(data.shape)

(30697, 25)
(30697, 24)
(30697, 23)
(30697, 22)


In [7]:
data.head(0)  

Unnamed: 0,action_type,combined_shot_type,game_event_id,game_id,lat,loc_x,loc_y,lon,minutes_remaining,period,...,seconds_remaining,shot_distance,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,game_date,matchup,opponent,shot_id


In [8]:
# Remaining time
data['seconds_from_period_end'] = 60 * data['minutes_remaining'] + data['seconds_remaining']

data.drop('minutes_remaining', axis=1, inplace=True)
data.drop('seconds_remaining', axis=1, inplace=True)

In [9]:
data.head(0)  #21

Unnamed: 0,action_type,combined_shot_type,game_event_id,game_id,lat,loc_x,loc_y,lon,period,playoffs,...,shot_distance,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,game_date,matchup,opponent,shot_id,seconds_from_period_end


In [10]:
## Matchup - (away/home)
data['away/home'] = data['matchup'].str.contains('vs').astype('int')
data.drop('matchup', axis=1, inplace=True)

In [11]:
# Game date
data['game_date'] = pd.to_datetime(data['game_date'])
data['game_year'] = data['game_date'].dt.year
data['game_month'] = data['game_date'].dt.month
data['game_day'] = data['game_date'].dt.day
data.drop('game_date', axis=1, inplace=True)

In [12]:
print(data.shape)
data.head(0)

(30697, 23)


Unnamed: 0,action_type,combined_shot_type,game_event_id,game_id,lat,loc_x,loc_y,lon,period,playoffs,...,shot_zone_area,shot_zone_basic,shot_zone_range,opponent,shot_id,seconds_from_period_end,away/home,game_year,game_month,game_day


In [13]:
# Partition interval
data['loc_x'] = pd.cut(data['loc_x'], 50)
data['loc_y'] = pd.cut(data['loc_y'], 50)
data['lat'] = pd.cut(data['lat'],10)
data['lon'] = pd.cut(data['lon'],10)


In [14]:
data_remov0 = data.copy()
data_remov1 = data.drop('game_day',axis = 1, inplace = False)
data_remov2 = data_remov1.drop('game_id',axis = 1, inplace = False)
data_remov3 = data_remov1.drop('shot_zone_basic',axis = 1, inplace = False)
print(data_remov0.shape)
print(data_remov1.shape)
print(data_remov2.shape)

(30697, 23)
(30697, 22)
(30697, 21)


In [15]:
#One Hot Coding
# Get different feature data set
categorial_cols = list(data_remov0 .columns.values)
for col in categorial_cols:
    dummies = pd.get_dummies(data_remov0 [col])
    dummies = dummies.add_prefix("{}_".format(col))
    data_remov0.drop(col, axis=1, inplace=True)
    data_remov0  = data_remov0 .join(dummies)
data_submit = data_remov0[missing_value]
print(data_submit.shape)
print(data_remov0.shape)

(5000, 33993)
(30697, 33993)


In [16]:
# data removel the game day
categorial_cols = list(data_remov1.columns.values)
for col in categorial_cols:
    dummies = pd.get_dummies(data_remov1[col])
    dummies = dummies.add_prefix("{}_".format(col))
    data_remov1.drop(col, axis=1, inplace=True)
    data_remov1 = data_remov1.join(dummies)
data_submit = data_remov1[missing_value]
print(data_submit.shape)
print(data_remov1.shape)

(5000, 33962)
(30697, 33962)


In [17]:
# data removel the game_day,game_id
categorial_cols = list(data_remov2.columns.values)
for col in categorial_cols:
    dummies = pd.get_dummies(data_remov2[col])
    dummies = dummies.add_prefix("{}_".format(col))
    data_remov2.drop(col, axis=1, inplace=True)
    data_remov2 = data_remov2.join(dummies)
data_submit = data_remov2[missing_value]
print(data_submit.shape)
print(data_remov2.shape)

(5000, 32403)
(30697, 32403)


In [18]:
# data removel the game_day,game_id and shot_zone_basic
categorial_cols = list(data_remov3.columns.values)
for col in categorial_cols:
    dummies = pd.get_dummies(data_remov3[col])
    dummies = dummies.add_prefix("{}_".format(col))
    data_remov3.drop(col, axis=1, inplace=True)
    data_remov3 = data_remov3.join(dummies)
data_submit = data_remov3[missing_value]
print(data_submit.shape)
print(data_remov3.shape)

(5000, 33955)
(30697, 33955)


### divide dataset

In [19]:
data_remov1.shape

(30697, 33962)

In [20]:
# Separate dataset for training
X0 = data_remov0[~missing_value]
Y = ALL_Y[~missing_value]
X_train0, X_test0, y_train0, y_test0 = train_test_split(X0, Y, test_size=0.75, random_state=123)
X1 = data_remov1[~missing_value]
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, Y, test_size=0.75, random_state=123)
X2 = data_remov2[~missing_value]
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, Y, test_size=0.75, random_state=123)
X3 = data_remov3[~missing_value]
X_train3, X_test3, y_train3, y_test3 = train_test_split(X3, Y, test_size=0.75, random_state=123)

In [21]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# train model & predict

completed data

In [23]:
#LogisticRegression model
clf_LR = LogisticRegression()
L_mode0 = clf_LR.fit(X0, Y)
y_pred0 = L_mode0.predict(X_test0)
print("Accuracy of Logistic Regression model 0: %f" % (accuracy_score(y_pred0, y_test0)))



Accuracy of Logistic Regression model 0: 0.848441


data with removing game_day col

In [24]:
L_mode1 = clf_LR.fit(X1, Y)
y_pred1 = L_mode1.predict(X_test1)
print("Accuracy of Logistic Regression 1: %f" % (accuracy_score(y_pred1, y_test1)))



Accuracy of Logistic Regression 1: 0.847818


data with removing game_day and game_id cols

In [25]:
L_mode2 = clf_LR.fit(X2, Y)
y_pred2 = L_mode2.predict(X_test2)
print("Accuracy of Logistic Regression 2: %f" % (accuracy_score(y_pred2, y_test2)))



Accuracy of Logistic Regression 2: 0.830333


data with removing game_day, game_id and shot_zone_basic cols

In [26]:
L_mode3 = clf_LR.fit(X3, Y)
y_pred3 = L_mode3.predict(X_test3)
print("Accuracy of Logistic Regression 3: %f" % (accuracy_score(y_pred3, y_test3)))



Accuracy of Logistic Regression 3: 0.848026


### Cross validation : accuracy & F1 score

In [27]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
cv = ShuffleSplit(n_splits=3, test_size=.3, random_state=123)

 completed data

In [29]:
LR_acc = cross_val_score(clf_LR, X0, Y, cv=cv, scoring='accuracy')
B_f1 = cross_val_score(clf_LR, X0, Y, cv=cv, scoring='f1')
print("Accuracy of model and its mean are :",LR_acc,LR_acc.mean())
print("f1 score of model and its mean are :" ,B_f1,B_f1.mean())



Accuracy of model and its mean are : [0.64941634 0.63437095 0.6464332 ] 0.643406830955469
f1 score of model and its mean are : [0.56661857 0.54772983 0.56356068] 0.5593030235257862


data with removing game_day col

In [30]:
LR_acc = cross_val_score(clf_LR, X1, Y, cv=cv, scoring='accuracy')
B_f1 = cross_val_score(clf_LR, X1, Y, cv=cv, scoring='f1')
print("Accuracy of model and its mean are :",LR_acc,LR_acc.mean())
print("f1 score of model and its mean are :" ,B_f1,B_f1.mean())



Accuracy of model and its mean are : [0.65084306 0.63398184 0.6461738 ] 0.6436662343277129
f1 score of model and its mean are : [0.56831302 0.5470305  0.56296059] 0.5594347027714468


data with removing game_day and game_id cols

In [31]:
LR_acc = cross_val_score(clf_LR, X2, Y, cv=cv, scoring='accuracy')
B_f1 = cross_val_score(clf_LR, X2, Y, cv=cv, scoring='f1')
print("Accuracy of model and its mean are :",LR_acc,LR_acc.mean())
print("f1 score of model and its mean are :" ,B_f1,B_f1.mean())



Accuracy of model and its mean are : [0.66355383 0.64954604 0.66108949] 0.658063121487246
f1 score of model and its mean are : [0.5739159  0.55427252 0.5678849 ] 0.5653577730188412


data with removing game_day, game_id and shot_zone_basic cols

In [33]:
LR_acc = cross_val_score(clf_LR, X3, Y, cv=cv, scoring='accuracy')
B_f1 = cross_val_score(clf_LR, X3, Y, cv=cv, scoring='f1')
print("Accuracy of model and its mean are :",LR_acc,LR_acc.mean())
print("f1 score of model and its mean are :" ,B_f1,B_f1.mean())



Accuracy of model and its mean are : [0.65071336 0.63579767 0.64708171] 0.6445309122351923
f1 score of model and its mean are : [0.56849864 0.54913295 0.56373256] 0.5604547165827394
