### clean data

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# load data set
ALLdata = pd.read_csv('../Kobe-Project-master/data.csv')

In [3]:
# deal with missing values and set as the test data set
missing_value = ALLdata['shot_made_flag'].isnull()

In [4]:
# create a copy of data set which will not pollute the original data set
df = ALLdata.copy() 
ALL_Y = df['shot_made_flag'].copy()

In [5]:
# choose approriate methods to re-represent corresponding columns
df['secs_from_period_end']   = 60*df['minutes_remaining']+df['seconds_remaining']
df['secs_from_period_start'] = 60*(11-df['minutes_remaining'])+(60-df['seconds_remaining'])
df['secs_from_start']   = (df['period'] <= 4).astype(int)*(df['period']-1)*12*60 + (df['period'] > 4).astype(int)*((df['period']-4)*5*60 + 3*12*60) + df['secs_from_period_start']
df['dist'] = np.sqrt(df['loc_x']**2 + df['loc_y']**2)
df['away/home'] = df['matchup'].str.contains('vs').astype('int')
df['game_date'] = pd.to_datetime(df['game_date'])
df['game_year'] = df['game_date'].dt.year
loc_x_zero = (df['loc_x'] == 0)
df['angle'] = np.array([0]*len(df))
df['angle'][~loc_x_zero] = np.arctan(df['loc_y'][~loc_x_zero] / df['loc_x'][~loc_x_zero])
df['angle'][loc_x_zero] = np.pi / 2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


In [6]:
# drop some columns
to_drop = ['period', 'minutes_remaining', 'seconds_remaining', 'game_id', 'team_name', 'matchup', 'lon', 'lat',\
           'game_event_id', 'game_date', 'secs_from_period_start', 'season', 'team_id']

df = df.drop(to_drop, axis = 1) 

In [7]:
# keep the same way of representing columns with one-hot as the bayes algorithm
categorical_cols = ['action_type', 'combined_shot_type', 'shot_type', 'shot_zone_area',
                        'shot_zone_basic', 'shot_zone_range', 'opponent']

for col in categorical_cols:
    dummies = pd.get_dummies(df[col])
    dummies = dummies.add_prefix("{}_".format(col))
    df.drop(col, axis=1, inplace=True)
    df = df.join(dummies)

In [8]:
Y = df['shot_made_flag'].copy() # set the label column
X = df.drop(['shot_made_flag'], axis = 1) # the processed data set 

In [9]:
# drop some columns and use them to train different models to get different accuracies for analysis
data_remov0 = X.copy()
data_remov1 = data_remov0.drop('shot_distance',axis = 1, inplace = False)
data_remov2 = data_remov0.drop('playoffs',axis = 1, inplace = False)
data_remov3 = data_remov0.drop('shot_id',axis = 1, inplace = False)

### divide dataset

In [10]:
# Separate dataset for training
X0 = data_remov0[~missing_value]
Y = ALL_Y[~missing_value]
X_train0, X_test0, y_train0, y_test0 = train_test_split(X0, Y, test_size=0.33, random_state=123)
X1 = data_remov1[~missing_value]
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, Y, test_size=0.33, random_state=123)
X2 = data_remov2[~missing_value]
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, Y, test_size=0.33, random_state=123)
X3 = data_remov3[~missing_value]
X_train3, X_test3, y_train3, y_test3 = train_test_split(X3, Y, test_size=0.33, random_state=123)

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# train model & predict

completed data

In [12]:
target_names = ['shot_made_flag_0','shot_made_flag_1']

In [13]:
# LogisticRegression model
clf_LR = LogisticRegression()
L_mode0 = clf_LR.fit(X_train0, y_train0)
y_pred0 = L_mode0.predict(X_test0)
print("Accuracy of Logistic Regression model 0: %f\n" % (accuracy_score(y_pred0, y_test0)))
print(classification_report(y_pred0,y_test0,target_names=target_names))



Accuracy of Logistic Regression model 0: 0.683528

                  precision    recall  f1-score   support

shot_made_flag_0       0.84      0.67      0.75      5941
shot_made_flag_1       0.48      0.71      0.57      2540

        accuracy                           0.68      8481
       macro avg       0.66      0.69      0.66      8481
    weighted avg       0.73      0.68      0.70      8481



data with removing removing shot_distance column

In [14]:
L_mode1 = clf_LR.fit(X_train1, y_train1)
y_pred1 = L_mode1.predict(X_test1)
print("Accuracy of Logistic Regression 1: %f\n" % (accuracy_score(y_pred1, y_test1)))
print(classification_report(y_pred1,y_test1,target_names=target_names))



Accuracy of Logistic Regression 1: 0.684117

                  precision    recall  f1-score   support

shot_made_flag_0       0.85      0.67      0.75      5986
shot_made_flag_1       0.48      0.71      0.57      2495

        accuracy                           0.68      8481
       macro avg       0.66      0.69      0.66      8481
    weighted avg       0.74      0.68      0.70      8481



data with removing playoffs column

In [15]:
L_mode2 = clf_LR.fit(X_train2, y_train2)
y_pred2 = L_mode2.predict(X_test2)
print("Accuracy of Logistic Regression 2: %f\n" % (accuracy_score(y_pred2, y_test2)))
print(classification_report(y_pred2,y_test2,target_names=target_names))



Accuracy of Logistic Regression 2: 0.683646

                  precision    recall  f1-score   support

shot_made_flag_0       0.85      0.67      0.75      5982
shot_made_flag_1       0.48      0.71      0.57      2499

        accuracy                           0.68      8481
       macro avg       0.66      0.69      0.66      8481
    weighted avg       0.74      0.68      0.70      8481



data with removing shot_id column

In [17]:
L_mode3 = clf_LR.fit(X_train3, y_train3)
y_pred3 = L_mode3.predict(X_test3)
print("Accuracy of Logistic Regression 3: %f\n" % (accuracy_score(y_pred3, y_test3)))
print(classification_report(y_pred3,y_test3,target_names=target_names))

Accuracy of Logistic Regression 3: 0.684589

                  precision    recall  f1-score   support

shot_made_flag_0       0.85      0.67      0.75      5974
shot_made_flag_1       0.48      0.71      0.57      2507

        accuracy                           0.68      8481
       macro avg       0.66      0.69      0.66      8481
    weighted avg       0.74      0.68      0.70      8481





### Cross validation : accuracy & F1 score

In [18]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
cv = ShuffleSplit(n_splits=3, test_size=0.33, random_state=123)

 completed data

In [19]:
LR_acc = cross_val_score(clf_LR, X0, Y, cv=cv, scoring='accuracy')
LR_f1 = cross_val_score(clf_LR, X0, Y, cv=cv, scoring='f1')
print("Accuracy of model and its mean are :",LR_acc,LR_acc.mean())
print("f1 score of model and its mean are :" ,LR_f1,LR_f1.mean())



Accuracy of model and its mean are : [0.68352789 0.65711591 0.6758637 ] 0.672169162441536
f1 score of model and its mean are : [0.57274753 0.58097983 0.56191235] 0.5718799034399554




data with removing shot_distance column

In [20]:
LR_acc = cross_val_score(clf_LR, X1, Y, cv=cv, scoring='accuracy')
LR_f1 = cross_val_score(clf_LR, X1, Y, cv=cv, scoring='f1')
print("Accuracy of model and its mean are :",LR_acc,LR_acc.mean())
print("f1 score of model and its mean are :" ,LR_f1,LR_f1.mean())



Accuracy of model and its mean are : [0.68411744 0.67385921 0.67657116] 0.6781826042526431
f1 score of model and its mean are : [0.57046657 0.56067344 0.55807959] 0.5630732004950542


data with removing playoffs column

In [21]:
LR_acc = cross_val_score(clf_LR, X2, Y, cv=cv, scoring='accuracy')
LR_f1 = cross_val_score(clf_LR, X2, Y, cv=cv, scoring='f1')
print("Accuracy of model and its mean are :",LR_acc,LR_acc.mean())
print("f1 score of model and its mean are :" ,LR_f1,LR_f1.mean())



Accuracy of model and its mean are : [0.6836458  0.67397713 0.6758637 ] 0.6778288723814015
f1 score of model and its mean are : [0.57010095 0.56090202 0.56330421] 0.5647690572949878




data with removing shot_id column

In [22]:
LR_acc = cross_val_score(clf_LR, X3, Y, cv=cv, scoring='accuracy')
LR_f1 = cross_val_score(clf_LR, X3, Y, cv=cv, scoring='f1')
print("Accuracy of model and its mean are :",LR_acc,LR_acc.mean())
print("f1 score of model and its mean are :",LR_f1,LR_f1.mean())



Accuracy of model and its mean are : [0.68458908 0.67468459 0.67657116] 0.6786149432063828
f1 score of model and its mean are : [0.57193151 0.55635954 0.56034621] 0.5628790872349354
