### clean data

In [1]:
import pandas as pd
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split

In [2]:
ALLdata = pd.read_csv('../Kobe-Project-master/data.csv')
missing_value = ALLdata['shot_made_flag'].isnull()

In [3]:
data = ALLdata.copy() # create a copy of data frame
ALL_Y = data['shot_made_flag'].copy()

In [4]:
data.drop('team_name', axis=1, inplace=True) # Always LA Lakers
data.drop('team_id', axis=1, inplace=True) # Always one number
data.drop('shot_made_flag', axis=1, inplace=True)

In [5]:
data.head(0)  

Unnamed: 0,action_type,combined_shot_type,game_event_id,game_id,lat,loc_x,loc_y,lon,minutes_remaining,period,...,seconds_remaining,shot_distance,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,game_date,matchup,opponent,shot_id


In [6]:
# Remaining time
data['seconds_from_period_end'] = 60 * data['minutes_remaining'] + data['seconds_remaining']

data.drop('minutes_remaining', axis=1, inplace=True)
data.drop('seconds_remaining', axis=1, inplace=True)

In [7]:
data.head(0)  #21

Unnamed: 0,action_type,combined_shot_type,game_event_id,game_id,lat,loc_x,loc_y,lon,period,playoffs,...,shot_distance,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,game_date,matchup,opponent,shot_id,seconds_from_period_end


In [8]:
## Matchup - (away/home)
data['away/home'] = data['matchup'].str.contains('vs').astype('int')
data.drop('matchup', axis=1, inplace=True)

In [9]:
# Game date
data['game_date'] = pd.to_datetime(data['game_date'])
data['game_year'] = data['game_date'].dt.year
data['game_month'] = data['game_date'].dt.month
data['game_day'] = data['game_date'].dt.day
data.drop('game_date', axis=1, inplace=True)

In [10]:
data.head(0)

Unnamed: 0,action_type,combined_shot_type,game_event_id,game_id,lat,loc_x,loc_y,lon,period,playoffs,...,shot_zone_area,shot_zone_basic,shot_zone_range,opponent,shot_id,seconds_from_period_end,away/home,game_year,game_month,game_day


In [11]:
# Partition interval
data['loc_x'] = pd.cut(data['loc_x'], 50)
data['loc_y'] = pd.cut(data['loc_y'], 50)
data['lat'] = pd.cut(data['lat'],10)
data['lon'] = pd.cut(data['lon'],10)

In [12]:
#One Hot Coding
categorial_cols = list(data.columns.values)
for col in categorial_cols:
    dummies = pd.get_dummies(data[col])
    dummies = dummies.add_prefix("{}_".format(col))
    data.drop(col, axis=1, inplace=True)
    data = data.join(dummies)

In [13]:
data_submit = data[missing_value]

In [14]:
data_submit.shape

(5000, 33993)

### divide dataset

In [15]:
# Separate dataset for training
X = data[~missing_value]
Y = ALL_Y[~missing_value]
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.75, random_state=123)

### train model & predict

In [16]:
X.shape

(25697, 33993)

In [18]:
#Bernoulli model
clf = BernoulliNB()
B_model = clf.fit(X, Y)
y_pred1 = B_model.predict(X_test)
print("Accuracy of Bernoulli model: %f" % ( (y_test == y_pred1).sum()/X_test.shape[0]))

Accuracy of Bernoulli model: 0.685000


In [19]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [20]:
# Multinomial model
clf2 = MultinomialNB()
M_model = clf2.fit(X, Y)
y_pred2=M_model.predict(X_test)
M_score = accuracy_score(y_pred2, y_test)
print("Accuracy of Multinomial model: %f"%M_score)

Accuracy of Multinomial model: 0.699891


In [21]:
#Gaussian model
clf3 = GaussianNB()
G_model = clf3.fit(X, Y)
y_pred3=G_model.predict(X_test)
G_score = accuracy_score(y_pred3, y_test)
print("Accuracy of Gaussian model: %f"%G_score)

Accuracy of Gaussian model: 1.000000


In [24]:
#LogisticRegression model
clf_LR = LogisticRegression()
L_mode0 = clf_LR.fit(X, Y)
y_pred0 = L_mode0.predict(X_test)
print("Accuracy of Bernoulli model: %f" % (accuracy_score(y_pred0, y_test)))



Accuracy of Bernoulli model: 0.848493


### Cross validation : accuracy & F1 score

In [25]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

 Bernoulli model

In [26]:
cv = ShuffleSplit(n_splits=3, test_size=.3, random_state=123)
B_acc = cross_val_score(clf, X, Y, cv=cv, scoring='accuracy')
print("Cross validation : Accuracy of Bernoulli model is :" ,B_acc)

Cross validation : Accuracy of Bernoulli model is : [0.63800259 0.62217899 0.62931258]


In [27]:
B_f1 = cross_val_score(clf, X, Y, cv=cv, scoring='f1')
print("Cross validation : f1 score of Bernoulli model is :" ,B_f1)

Cross validation : f1 score of Bernoulli model is : [0.52428839 0.50484447 0.51460598]


Multinomial model

In [28]:
M_acc = cross_val_score(clf2, X, Y, cv=cv,scoring='accuracy')
print("Cross validation : Accuracy of Multinomial model is :" ,M_acc)

Cross validation : Accuracy of Multinomial model is : [0.63463035 0.62075227 0.62749676]


In [29]:
M_f1 = cross_val_score(clf2, X, Y, cv=cv,scoring='f1')
print("Cross validation :  f1 score of Multinomial model is :" ,M_f1)

Cross validation :  f1 score of Multinomial model is : [0.52310818 0.50807537 0.51519244]


Gaussian model

In [30]:
G_acc = cross_val_score(clf3, X, Y, cv=cv,scoring='accuracy')
print("Cross validation : Accuracy of Gaussian model is :",G_acc)

Cross validation : Accuracy of Gaussian model is : [0.45071336 0.4536965  0.45486381]


In [31]:
G_f1 = cross_val_score(clf3, X, Y, cv=cv,scoring='f1')
print("Cross validation :  f1 score of Gaussian model is :",G_f1)

Cross validation :  f1 score of Gaussian model is : [0.61384152 0.61729965 0.61891377]


Logist Regression model

In [32]:
LR_acc = cross_val_score(clf_LR, X, Y, cv=cv, scoring='accuracy')
print("Cross validation : Accuracy of Bernoulli model is :" ,LR_acc)



Cross validation : Accuracy of Bernoulli model is : [0.64941634 0.63437095 0.6464332 ]


In [33]:
B_f1 = cross_val_score(clf_LR, X, Y, cv=cv, scoring='f1')
print("Cross validation : f1 score of Bernoulli model is :" ,B_f1)



Cross validation : f1 score of Bernoulli model is : [0.56661857 0.54772983 0.56356068]
