In [1]:
%matplotlib inline

In [2]:
import scipy as sp
def logloss(act, pred):
    """Return the mean binary cross-entropy (log loss) of ``pred`` against ``act``.

    Parameters
    ----------
    act : sequence of numbers
        Observed outcomes, 1 for a positive case and 0 for a negative one.
    pred : sequence of numbers
        Predicted probability of the positive case for each entry of ``act``.

    Returns
    -------
    float
        ``-mean(act*log(pred) + (1-act)*log(1-pred))``; lower is better.
    """
    # Local import keeps this cell self-contained. The NumPy aliases exposed in
    # the top-level scipy namespace (sp.log, sp.maximum, ...) are deprecated,
    # so NumPy is called directly instead.
    import numpy as np

    epsilon = 1e-15
    act = np.asarray(act, dtype=float)
    # Clamp predictions to [epsilon, 1 - epsilon] so log() never sees 0 or 1.
    pred = np.clip(np.asarray(pred, dtype=float), epsilon, 1 - epsilon)
    ll = np.sum(act * np.log(pred) + (1 - act) * np.log(1 - pred))
    return ll * -1.0 / len(act)

In [3]:
from scipy import stats as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Use seaborn's "poster" plotting context for every figure in this notebook.
sns.set_context("poster")

In [31]:
# Load the raw shot log. Rows with a missing `shot_made_flag` are the ones
# that are later exported as predictions.
kobe = pd.read_csv('data/data.csv')
# 1 where the shot outcome is recorded, 0 where it is missing.
kobe['known_data'] = kobe['shot_made_flag'].notnull().astype('int64')

In [32]:
# Total number of shots in the raw file (known + unknown outcomes).
# print(...) with a single argument behaves the same on Python 2 and 3.
print(len(kobe))

30697


In [None]:
# Preview the first few rows of the raw data.
kobe.head()

In [33]:
# Two-column overview of the raw frame: column name and its dtype.
column_dtypes = kobe.dtypes
kobe_coltypes = pd.DataFrame(
    {'col': column_dtypes.index, 'type': column_dtypes.values},
    columns=['col', 'type'],
)
kobe_coltypes

Unnamed: 0,col,type
0,action_type,object
1,combined_shot_type,object
2,game_event_id,int64
3,game_id,int64
4,lat,float64
5,loc_x,int64
6,loc_y,int64
7,lon,float64
8,minutes_remaining,int64
9,period,int64


Attempt 1 - Let's just take the average of the known shots up to each shot
--

In [40]:
# Running make-rate over every shot seen so far (an expanding window).
# pd.rolling_mean is deprecated in favour of the Series window methods.
# Missing outcomes are skipped by the mean, so each unknown shot receives the
# average of the known shots before it; 0.5 is used until a known shot exists.
kobe['rolling_perc'] = (
    kobe['shot_made_flag']
    .expanding(min_periods=1)
    .mean()
    .fillna(0.5)
)

	Series.rolling(min_periods=1,window=30697,center=False).mean()
  if __name__ == '__main__':


In [58]:
# Build the submission frame: one row per shot with an unknown outcome,
# carrying the running average as the predicted probability.
unknown_shots = kobe['known_data'] == 0
kobe_output = kobe.loc[unknown_shots, ['rolling_perc']].rename(
    columns={'rolling_perc': 'shot_made_flag'}
)
# Submission ids are the row index shifted to start at 1.
kobe_output['shot_id'] = kobe_output.index + 1

In [59]:
# Write the submission file with `shot_id` first and without the frame index.
kobe_output[['shot_id','shot_made_flag']].to_csv('data/first_attempt.csv',index=False)

Submission logloss: 0.68855
    
Better than the ~0.693 logloss of always predicting 50%, but not by much!

Attempt 2 - Let's reuse the previous attempt for small samples (anything with fewer than 200) and then use an incremental naive Bayes for the rest
----

In [4]:
# Load the pre-built feature matrix (kobe_x) and the label frame (kobe_y);
# kobe_y['shot_made_flag'] is used as the target in the cells below.
# The id file is currently not loaded:
#kobe_id = pd.read_csv('data/kobe_x_id.csv')
kobe_x = pd.read_csv('data/kobe_x.csv')
kobe_y = pd.read_csv('data/kobe_y.csv')

In [5]:
# First, let's do some feature selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif

In [6]:
# Sanity check: total rows vs. rows with a known outcome, for both the
# feature matrix and the labels. The mask is computed once and reused, and
# the prints use a format string so the output is identical on Python 2 and 3.
has_label = pd.notnull(kobe_y['shot_made_flag'])
print('%d %d' % (len(kobe_x), len(kobe_x[has_label])))
print('%d %d' % (len(kobe_y), len(kobe_y['shot_made_flag'][has_label])))

30697 25697
30697 25697


In [7]:
# Score every feature with an ANOVA F-test, using only rows whose outcome is
# known. k='all' keeps every column; only the scores and p-values are wanted.
labelled_rows = pd.notnull(kobe_y['shot_made_flag'])
fs = SelectKBest(f_classif, k='all')
fs.fit(kobe_x[labelled_rows], kobe_y['shot_made_flag'][labelled_rows])



SelectKBest(k='all', score_func=<function f_classif at 0x11125f500>)

In [8]:
# Rank the features by the p-value of their F-test and keep the significant ones.
col_p_val = pd.DataFrame()
col_p_val['cols'] = kobe_x.columns
col_p_val['p_score'] = fs.pvalues_
# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
col_p_val = col_p_val.sort_values('p_score').reset_index()
is_significant = col_p_val['p_score'] < 0.01
# Number of features overall vs. number passing the 1% significance cut-off.
print('%d %d' % (len(col_p_val), len(col_p_val[is_significant])))
cols_to_keep = col_p_val[is_significant]['cols']
print(cols_to_keep)

214 94
0                                                streak
1                                 action_type_Jump Shot
2                          combined_shot_type_Jump Shot
3                               combined_shot_type_Dunk
4     combined_shot_type_shot_zone_range_Dunk_Less T...
5           combined_shot_type_zone_area_Dunk_Center(C)
6                                         shot_distance
7                       shot_zone_basic_Restricted Area
8     shot_type_shot_zone_range_2PT Field Goal_Less ...
9                       shot_zone_range_Less Than 8 ft.
10         shot_type_zone_area_2PT Field Goal_Center(C)
11                       action_type_Driving Layup Shot
12                                                  lat
13                             shot_zone_area_Center(C)
14                           action_type_Slam Dunk Shot
15                             shot_type_2PT Field Goal
16                             shot_type_3PT Field Goal
17                              shot_zone



In [9]:
from sklearn.decomposition import PCA

# Fit a 20-component PCA on the selected features to see how much variance
# each component explains.
comps = PCA(n_components=20)
comps.fit(kobe_x[cols_to_keep])
# print(...) with one argument works on Python 2 and 3; float() keeps the
# printed list as plain numbers regardless of the NumPy scalar repr.
print([round(float(x), 2) for x in comps.explained_variance_ratio_])

[0.7, 0.21, 0.05, 0.03, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [12]:
# Project the selected features onto the first two principal components.
comps = PCA(n_components=2)
comps.fit(kobe_x[cols_to_keep])
kobe_x_transformed = comps.transform(kobe_x[cols_to_keep])

# Show the first five projected rows (Python 2/3 compatible print).
print(kobe_x_transformed[:5])

kobe_transformed_df = pd.DataFrame(kobe_x_transformed, columns=['pca1', 'pca2'])

[[ -1.52574535   3.94351757]
 [ -6.40529113   0.74181531]
 [ 16.52154056   2.75742455]
 [ 23.27629428   8.98283965]
 [ -8.83407018 -14.37523097]]


In [13]:
# Persist the two PCA components, without the frame index.
kobe_transformed_df.to_csv('data/kobe_x_transformed.csv',index=False)

In [8]:
# Inspect the feature layout as a nested list. Only the first row is shown:
# dumping the whole matrix floods the notebook output and bloats the file.
kobe_x.head(1).values.tolist()

[[33.9723,
  -118.1028,
  10.0,
  1.0,
  0.0,
  27.0,
  18.0,
  2000.0,
  10.0,
  44.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  

In [None]:
from sklearn.naive_bayes import BernoulliNB

# Incremental naive Bayes: walk the shots in order, update the model with
# every shot whose outcome is known, and score every unknown shot with the
# model as trained so far. `score` collects [row position, P(made)] pairs.
#
# NOTE(review): BernoulliNB binarises every feature at 0 by default, so the
# continuous columns of kobe_x are reduced to "> 0" flags - confirm this is
# intended.
score = []
bnb = BernoulliNB()
new_scores = []
is_fitted = False  # predict_proba cannot be called before the first partial_fit
for i, ((_, features), (_, target)) in enumerate(zip(kobe_x.iterrows(), kobe_y.iterrows())):
    made = target['shot_made_flag']
    if pd.notnull(made):
        # Known outcome: learn from it (including the very first row).
        bnb.partial_fit([features.tolist()], [made], classes=[0, 1])
        is_fitted = True
    elif not is_fitted:
        # Unknown outcome before anything has been learned: fall back to 50%.
        score.append([i, 0.5])
    else:
        # scikit-learn expects a 2-D (n_samples, n_features) input, so the
        # single row is wrapped in a list exactly as for partial_fit.
        score.append([i, bnb.predict_proba([features.tolist()])[0][1]])



In [None]:
# Replace `bnb` with a fresh, unfitted classifier.
bnb = BernoulliNB()

In [28]:
# `tx` is the first (index, row) pair of the feature matrix. It is defined
# here explicitly so the cell does not depend on leftover kernel state.
tx = next(kobe_x.iterrows())
print(tx[1].tolist())

[33.972299999999997, -118.1028, 10.0, 1.0, 0.0, 27.0, 18.0, 2000.0, 10.0, 44.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]


In [29]:
# Smoke test: a fresh classifier accepts a single row through partial_fit.
# The row is read straight from kobe_x so the cell does not rely on a `tx`
# variable left over from an earlier interactive session.
first_row = kobe_x.iloc[0].tolist()
bnb = BernoulliNB()
bnb.partial_fit([first_row], [0], classes=[0, 1])

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [None]:
# Score every test shot using only the training shots that precede it:
#   * no earlier training shots  -> 0.5
#   * index below 200            -> plain make-rate of the earlier shots
#   * otherwise                  -> logistic regression fitted on the earlier shots
# Each test index is appended to `score` exactly once, so the submission
# contains no duplicate rows.
#
# NOTE(review): train_x, train_y and test_x are not created in the cells shown
# here - make sure they are defined before this cell runs. The label slice
# below assumes train_y is in the same row order as train_x - confirm.
from sklearn.linear_model import LogisticRegression

score = []

for i in test_x.index:
    temp_x = train_x[train_x.index < i]
    temp_y = train_y['shot_made_flag'][0:len(temp_x)]
    if len(temp_y) == 0:
        # Nothing to learn from yet (covers i == 0 and avoids dividing by zero).
        sc = 0.5
    elif i < 200:
        sc = float(sum(temp_y) / float(len(temp_y)))
    else:
        lrm = LogisticRegression()
        lrm.fit(temp_x, temp_y)
        sc = float(lrm.predict_proba(test_x[test_x.index == i])[0][1])
    print([i, sc])
    score.append([i, sc])

print(score)

[0, 0.5]
[7, 0.5]
[16, 0.42857142857142855]
[19, 0.4375]
[32, 0.39285714285714285]
[33, 0.39285714285714285]
[34, 0.39285714285714285]
[35, 0.39285714285714285]
[36, 0.39285714285714285]
[37, 0.39285714285714285]
[44, 0.44117647058823528]
[49, 0.47368421052631576]
[54, 0.42857142857142855]
[59, 0.45652173913043476]
[65, 0.45098039215686275]
[66, 0.45098039215686275]
[70, 0.46296296296296297]
[79, 0.45161290322580644]
[84, 0.43939393939393939]
[85, 0.43939393939393939]
[94, 0.47297297297297297]
[103, 0.48780487804878048]
[112, 0.45555555555555555]
[122, 0.45454545454545453]
[125, 0.46534653465346537]
[132, 0.46728971962616822]
[140, 0.45614035087719296]
[143, 0.44827586206896552]
[149, 0.45454545454545453]
[151, 0.45901639344262296]
[152, 0.45901639344262296]
[155, 0.45161290322580644]
[158, 0.44444444444444442]
[164, 0.4580152671755725]
[171, 0.45255474452554745]
[181, 0.43835616438356162]
[191, 0.43870967741935485]
[192, 0.43870967741935485]
[195, 0.43949044585987262]
[201, 0.17272435

In [None]:
# Write the second submission: a header line followed by one
# "shot_id,probability" line per entry of `score`, with ids shifted to start at 1.
with open('data/attempt_2_submission.csv', 'w') as submission:
    submission.write('shot_id,shot_made_flag' + '\n')
    submission.writelines(
        '%s,%s\n' % (str(row[0] + 1), str(row[1])) for row in score
    )