In [2]:
%matplotlib inline

In [3]:
import scipy as sp
def logloss(act, pred):
    epsilon = 1e-15
    pred = sp.maximum(epsilon, pred)
    pred = sp.minimum(1-epsilon, pred)
    ll = sum(act*sp.log(pred) + sp.subtract(1,act)*sp.log(sp.subtract(1,pred)))
    ll = ll * -1.0/len(act)
    return ll

In [4]:
from scipy import stats as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

sns.set_context("poster")

In [4]:
# A whole world of potential analysis...
kobe = pd.read_csv('data/data.csv')
kobe['known_data'] = kobe['shot_made_flag'].apply(lambda x: 0 if pd.isnull(x) else 1)

In [5]:
print len(kobe)

30697


In [None]:
kobe.head()

In [33]:
kobe_coltypes = pd.DataFrame(kobe.dtypes.reset_index())
kobe_coltypes.columns = ['col','type']
kobe_coltypes

Unnamed: 0,col,type
0,action_type,object
1,combined_shot_type,object
2,game_event_id,int64
3,game_id,int64
4,lat,float64
5,loc_x,int64
6,loc_y,int64
7,lon,float64
8,minutes_remaining,int64
9,period,int64


Attempt 1 - Lets just take the average for each known shot up to that shot
--

In [40]:
kobe['rolling_perc'] = pd.rolling_mean(kobe['shot_made_flag'],window=len(kobe['shot_made_flag']),min_periods=1).fillna(0.5)

	Series.rolling(min_periods=1,window=30697,center=False).mean()
  if __name__ == '__main__':


In [58]:
kobe_output = pd.DataFrame(kobe['rolling_perc'][kobe['known_data']==0])
kobe_output['id'] = kobe_output.index
kobe_output['id'] = kobe_output['id']+1
kobe_output.columns = ['shot_made_flag','shot_id']

In [59]:
kobe_output[['shot_id','shot_made_flag']].to_csv('data/first_attempt.csv',index=False)

Submission logloss: 0.68855
    
Better than 50% mark but not by much!

Attempt 2 - Lets take previous attempt for small sample (Anything with less than 200) and then an incremental naive bayes for the rest
----

In [47]:
#kobe_id = pd.read_csv('data/kobe_x_id.csv')
kobe_x = pd.read_csv('data/kobe_x.csv')
kobe_y = pd.read_csv('data/kobe_y.csv')

In [48]:
##First lets do some feature selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif

In [49]:
print len(kobe_x), len(kobe_x[pd.notnull(kobe_y['shot_made_flag'])])
print len(kobe_y), len(kobe_y['shot_made_flag'][pd.notnull(kobe_y['shot_made_flag'])])

30697 25697
30697 25697


In [50]:
fs = SelectKBest(f_classif,k='all')
fs.fit(kobe_x[pd.notnull(kobe_y['shot_made_flag'])],kobe_y['shot_made_flag'][pd.notnull(kobe_y['shot_made_flag'])])



SelectKBest(k='all', score_func=<function f_classif at 0x11125f500>)

In [51]:
col_p_val = pd.DataFrame()
col_p_val['cols'] = kobe_x.columns
col_p_val['p_score'] = fs.pvalues_
col_p_val = col_p_val.sort('p_score').reset_index()
print len(col_p_val), len(col_p_val[col_p_val['p_score']<0.01])
cols_to_keep = col_p_val[col_p_val['p_score']<0.01]['cols']
print cols_to_keep

213 93
0                                 action_type_Jump Shot
1                                                streak
2                          combined_shot_type_Jump Shot
3                               combined_shot_type_Dunk
4           combined_shot_type_zone_area_Dunk_Center(C)
5     combined_shot_type_shot_zone_range_Dunk_Less T...
6                                         shot_distance
7                       shot_zone_basic_Restricted Area
8     shot_type_shot_zone_range_2PT Field Goal_Less ...
9                       shot_zone_range_Less Than 8 ft.
10         shot_type_zone_area_2PT Field Goal_Center(C)
11                       action_type_Driving Layup Shot
12                                                  lat
13                             shot_zone_area_Center(C)
14                           action_type_Slam Dunk Shot
15                             shot_type_2PT Field Goal
16                             shot_type_3PT Field Goal
17                              shot_zone



In [53]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
comps = PCA(n_components=20)
comps.fit(kobe_x[cols_to_keep])
print [round(x,2) for x in comps.explained_variance_ratio_]

[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [45]:
comps = PCA(n_components=2)
comps.fit(normalize(kobe_x[cols_to_keep]))
kobe_x_transformed = comps.transform(normalize(kobe_x[cols_to_keep]))

print kobe_x_transformed[0:5]

kobe_transformed_df = pd.DataFrame(kobe_x_transformed,columns=['pca1','pca2'])

[[ 0.14604095 -0.00247487]
 [ 0.14377955 -0.00475009]
 [ 0.07116479  0.00763481]
 [ 0.04593177  0.01132817]
 [ 0.0300067  -0.00416219]]


In [46]:
kobe_transformed_df.to_csv('data/kobe_x_transformed.csv',index=False)

Attempt 3 - non online learning models
--------------------------------------

In this we need to save the data out as we go line by line, and then at each unknown fit the saved data 

In [4]:
kobe_x = pd.read_csv('data/kobe_x.csv')
kobe_y = pd.read_csv('data/kobe_y.csv')

In [5]:
print kobe_x.head()
print kobe_y.head()

       lat       lon  period  playoffs  shot_distance  game_year  game_month  \
0  33.9723 -118.1028       1         0             18       2000          10   
1  34.0443 -118.4268       1         0             15       2000          10   
2  33.9093 -118.3708       1         0             16       2000          10   
3  33.8693 -118.1318       1         0             22       2000          10   
4  34.0443 -118.2698       2         0              0       2000          10   

   game_week  game_dayofweek  action_type_Alley Oop Dunk Shot       ...        \
0         44               1                              0.0       ...         
1         44               1                              0.0       ...         
2         44               1                              0.0       ...         
3         44               1                              0.0       ...         
4         44               1                              0.0       ...         

   combined_shot_type_shot_zone_

In [6]:
##Normalize x data
from sklearn import preprocessing
X_scaled = preprocessing.scale(kobe_x)

print len(X_scaled), len(kobe_x), len(kobe_y)

30697 30697 30697


In [7]:
kobe_x_train = pd.DataFrame(X_scaled)[pd.notnull(kobe_y['shot_made_flag'])]
kobe_x_test = pd.DataFrame(X_scaled)[pd.isnull(kobe_y['shot_made_flag'])]
kobe_y_train = kobe_y[pd.notnull(kobe_y['shot_made_flag'])]['shot_made_flag']
kobe_y_test = kobe_y[pd.isnull(kobe_y['shot_made_flag'])]['shot_made_flag']

In [8]:
## So we want to build a classifier that takes X_train and y_train and outputs y_test from X_test
print len(kobe_x_train), len(kobe_x_test), len(kobe_y_train), len(kobe_y_test)

25697 5000 25697 5000


## to stop leakage this need be a classifier that will do:
* a split to get its own train and test
* recursively go through the following to find the best classifier
    * An initial variable selection with cutoff p
    * a pca with dimensions n
    * a model with hyper parameters h = {}
    * test on test data using logloss definition
* output the best model with p, n , h defined and fitted across all the known data
* 

In [9]:
from sklearn.feature_selection import SelectFromModel
from sklearn import linear_model, svm
from sklearn.decomposition import PCA
from sklearn.cross_validation import train_test_split
import itertools

def predict_probabilities(X_train,X_test,y_train,threshold,component,m):
    ## Selector phase
    #selector = SelectFromModel(linear_model.LogisticRegression(),threshold=threshold)
    #selector.fit(X_train,y_train)
    #new_X_train = selector.transform(X_train)
    
    ##PCA phase
    pca = PCA(n_components=component)
    
    pca.fit(new_X_train)
    pca_variance =  sum(pca.explained_variance_ratio_)
    pca_X_train = pca.transform(new_X_train)
    
    #convert the X_test
    pca_X_test = pca.transform(selector.transform(X_test))
    
    ##Model phase
    model = m[1]
    model.fit(pca_X_train,y_train)
    return model.predict_proba(pca_X_test), pca_variance


def model_tests(X_train,X_test,y_train,y_test,thresholds,components,models):
    output = []
    for threshold, component, m in itertools.product(thresholds,components,models):
        print threshold, component, m
        y_pred, pca_variance = predict_probabilities(X_train,X_test,y_train,threshold,component,m)
        ##test phase
        output.append([m, threshold, component, pca_variance, logloss(y_test,[x[1] for x in y_pred])])

    return pd.DataFrame(output,columns = ['model','selection_threshold','N_pca','pca_var','logloss'])


def predict_best_hyper_params(X_train,X_test,y_train,y_test,thresholds,components,models):
    output = test_model(X_train,X_test,y_train,y_test,thresholds,components,models)
    hyper_params = output.sort('logloss',ascending=True).reset_index().head(1).values.tolist()[0]
    return hyper_params

def global_prediction(kobe_x_train,kobe_x_test,kobe_y_train,thresholds,components,models):
    X_train, X_test, y_train, y_test = train_test_split(kobe_x_train, kobe_y_train, test_size=0.33, random_state=42)
    output = model_tests(X_train,X_test,y_train,y_test,thresholds,components,models)
    print output[output['logloss']>0.01].sort('logloss',ascending=True)
    hyper_params = output[output['logloss']>0.01].sort('logloss',ascending=True).reset_index().head(1).values.tolist()[0]
    print hyper_params
    probs = predict_probabilities(kobe_x_train,kobe_x_test,kobe_y_train,hyper_params[2],hyper_params[3],hyper_params[1])[0]
    return [x[1] for x in probs]

In [11]:
def x_range(start,end,by):
    while start < end:
        yield start
        start += by
    
thresholds = [x for x in x_range(0.01,0.05,0.005)]
components = [x for x in x_range(1,10,1)]

models = [
    ['logmodel',linear_model.LogisticRegression()]#,
    #['svc',svm.SVC(probability=True)]
    ]

i=0
dx = []
dy = []
predictions = []
for x, y in zip(kobe_x.iterrows(),kobe_y['shot_made_flag'].fillna(-1000)):
    if x[0]==0:
        #print x
        predictions.append([x[0],0.5])
        print predictions[len(predictions)-1]
    else:
        if y != -1000:
            dx.append(x[1])
            dy.append(y)
        else:
            if x[0]<=100:
                predictions.append([x[0],np.mean(dy)])
                print predictions[len(predictions)-1]
            else:
                predictions.append(
                    [x[0],global_prediction(dx,kobe_x_test,dy,thresholds,components,models)]
                )
                print predictions[len(predictions)-1]
    i+=1
    if i>30:
        break

[0, 0.5]
[7, 0.5]
[16, 0.42857142857142855]
[19, 0.4375]


### Final attempts and comparisons

Lets try a number of the online learning methods in scikit learn. 

Some dont calculate probabilities - I'm expecting them nt to be as good but lets try them anyway for completeness

In [5]:
kobe_x = pd.read_csv('data/kobe_x.csv')
kobe_y = pd.read_csv('data/kobe_y.csv')

In [6]:
kobe_x.head(1)

Unnamed: 0,lat,lon,period,playoffs,shot_distance,game_year,game_month,game_week,game_dayofweek,action_type_Alley Oop Dunk Shot,...,combined_shot_type_shot_zone_range_Layup_16-24 ft.,combined_shot_type_shot_zone_range_Layup_8-16 ft.,combined_shot_type_shot_zone_range_Layup_Less Than 8 ft.,combined_shot_type_shot_zone_range_Tip Shot_Less Than 8 ft.,home_away,last_shot_made,streak,game_shot_perc,global_shot_perc,time remaining
0,33.9723,-118.1028,1,0,18,2000,10,44,1,0.0,...,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,627


In [7]:
kobe_y.head(1)

Unnamed: 0,shot_made_flag
0,


In [33]:
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier

prob_models = [
    MultinomialNB(),
    Perceptron(),
    SGDClassifier(loss='log'),
    SGDClassifier(loss='modified_huber')
]

class_models = [
    BernoulliNB(),
    Perceptron(),
    SGDClassifier(loss='log'),
    SGDClassifier(loss='modified_huber'),
    SGDClassifier(loss='hinge'),
    SGDClassifier(loss='squared_hinge'),
    SGDClassifier(loss='perceptron')
]

burner = 150

In [37]:
kobe_x['lon'] = kobe_x['lon'].apply(lambda x: x if x >=0 else -x)

In [38]:
kobe_x.iloc[0]

lat                                                               33.9723
lon                                                              118.1028
period                                                             1.0000
playoffs                                                           0.0000
shot_distance                                                     18.0000
game_year                                                       2000.0000
game_month                                                        10.0000
game_week                                                         44.0000
game_dayofweek                                                     1.0000
action_type_Alley Oop Dunk Shot                                    0.0000
action_type_Alley Oop Layup shot                                   0.0000
action_type_Cutting Finger Roll Layup Shot                         0.0000
action_type_Cutting Layup Shot                                     0.0000
action_type_Driving Bank shot         

In [41]:
test_model = prob_models[0]

score = []
for x, y  in zip(kobe_x.iterrows(),kobe_y.iterrows()):
    #unpack
    i, row = x
    j, flag = y
    
    #print flag['shot_made_flag']
    #print row.values
    
    #hold onto x's for burner to calculate mean
    y_temp = [0.5]
    
    if pd.isnull(flag['shot_made_flag']):
        if i < burner:
            score.append([i+1,np.mean(y_temp)])
        else:
            print score
            score.append([i+1,test_model.predict_proba([row.tolist()])])
    else:
        print score
        print y_temp
        test_model.partial_fit([row.tolist()],[flag['shot_made_flag']],classes=[0,1])
    
    if i == 250:
        break

[[1, 0.5]]
[0.5]


AttributeError: 'MultinomialNB' object has no attribute 'feature_log_prob_'

In [139]:
kobe_y['shot_made_flag'].values.tolist()[0] is None

False

In [111]:
for x, y in zip(kobe_x.iterrows(),kobe_y['shot_made_flag']):
    print x[0]
    print x[1].values
    print y
    if x[0]==10:
        break

0
[  3.39723000e+01  -1.18102800e+02   1.00000000e+00   0.00000000e+00
   1.80000000e+01   2.00000000e+03   1.00000000e+01   4.40000000e+01
   1.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   1.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+

In [78]:
global_prediction(kobe_x_train,kobe_x_test,kobe_y_train,thresholds,components,models)

  if __name__ == '__main__':


[24, ['logmodel', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)], 0.2, 5, 0.9796208968666864, 0.0014429266667051015]


In [79]:
print [x[1] for x in predictions]

[0.00027168670300570279, 0.99999988479111401, 0.00027168670300570279, 0.00058282285795677343, 0.0003448905165769643, 0.0026126205929374014, 0.00027168670300570279, 0.0019051981884677569, 0.99999999999963007, 0.0015013128205368473, 0.0015013128205368473, 0.0015013128205368473, 0.99786844265619012, 0.00047322709779604237, 1.0, 0.00047322709779604237, 0.0019051981884677569, 0.0026126205929374014, 0.99786844265619012, 0.99786844265619012, 0.00027168670300570279, 0.00058282285795677343, 0.996292345069279, 0.0015013128205368473, 0.00027168670300570279, 0.99999999999963007, 0.00027168670300570279, 0.0015013128205368473, 0.9989105772764999, 0.00027168670300570279, 0.99786844265619012, 0.99826876261920738, 0.00027168670300570279, 0.0015013128205368473, 0.996292345069279, 0.0019051981884677569, 0.996292345069279, 0.0003448905165769643, 0.99786844265619012, 0.00027168670300570279, 0.0015013128205368473, 0.0019051981884677569, 0.996292345069279, 0.99826876261920738, 0.99786844265619012, 0.00190519