# Import Libraries

In [54]:
import math
import os
import random
from os import path

import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from pandas import DataFrame, Series
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score

# Load Data

In [2]:
# Directory holding the data files. Set the BASKETBALL_DATA_DIR environment
# variable to run the notebook on another machine; when it is unset the
# original local path is used, so existing runs behave exactly as before.
DATA_DIR = os.environ.get(
    'BASKETBALL_DATA_DIR',
    r'C:\Users\AButa\Downloads\code-basketball-files-main\data',
)

# load the shot-level data
df = pd.read_csv(path.join(DATA_DIR, 'shots.csv'))

# squared distance, used as a regressor alongside dist in the models below
df['dist_sq'] = df['dist']**2
# store the outcome as 0/1 integers so it can be the dependent variable
df['made'] = df['made'].astype(int)

df[['made', 'dist', 'dist_sq']].head()

Unnamed: 0,made,dist,dist_sq
0,1,2,4
1,0,26,676
2,1,25,625
3,0,26,676
4,0,18,324


# Linear Regression

In [3]:
# Ordinary Least Squares (OLS) is just another name for basic linear
# regression. Here the made/missed flag is regressed on distance and
# distance squared.
model = smf.ols(
    data=df,
    formula='made ~ dist + dist_sq',
)

results = model.fit()

In [4]:
# Coefficient table and fit statistics for the made ~ dist + dist_sq model
results.summary()

0,1,2,3
Dep. Variable:,made,R-squared:,0.05
Model:,OLS,Adj. R-squared:,0.05
Method:,Least Squares,F-statistic:,441.0
Date:,"Mon, 08 Jul 2024",Prob (F-statistic):,2.15e-187
Time:,19:26:09,Log-Likelihood:,-11764.0
No. Observations:,16876,AIC:,23530.0
Df Residuals:,16873,BIC:,23560.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.6197,0.007,91.828,0.000,0.607,0.633
dist,-0.0177,0.001,-17.794,0.000,-0.020,-0.016
dist_sq,0.0003,3.23e-05,8.233,0.000,0.000,0.000

0,1,2,3
Omnibus:,68294.39,Durbin-Watson:,2.012
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2269.755
Skew:,0.163,Prob(JB):,0.0
Kurtosis:,1.233,Cond. No.,797.0


In [5]:
def prob_of_make(yds, params=None):
    """Predicted probability of a made shot from the quadratic distance model.

    Evaluates ``b0 + b1*yds + b2*yds**2``.

    Parameters
    ----------
    yds : number
        Shot distance, in the same units as the ``dist`` column the model
        was fitted on.
    params : sequence of three numbers, optional
        Coefficients ordered (intercept, dist, dist_sq). When omitted they
        are read from the notebook-level ``results`` object, so the call only
        makes sense while ``results`` still holds the
        ``made ~ dist + dist_sq`` fit (later cells rebind ``results`` to
        other models).

    Returns
    -------
    number
        The linear prediction. OLS does not constrain it to the 0-1 range.
    """
    # Fall back to the fitted model only when no coefficients are supplied
    b0, b1, b2 = results.params if params is None else params
    return (b0 + b1*yds + b2*(yds**2))

In [6]:
# Predicted make probability for a shot taken at distance 1
prob_of_make(1)

0.6022954444724223

In [7]:
# Predicted make probability for a shot taken at distance 25
prob_of_make(25)

0.3431953624855085

In [8]:
# Predicted make probability for a shot taken at distance 30
prob_of_make(30)

0.3277962585781291

In [9]:
# Store the model's prediction for every shot next to the actual outcome
df['made_hats'] = results.predict(df)
df.loc[:, ['made', 'made_hats']].head()

Unnamed: 0,made,made_hats
0,1,0.58538
1,0,0.339051
2,1,0.343195
3,0,0.339051
4,0,0.387104


## Statistical Significance

In [10]:
# Simulate 100 coin-flip guesses: 'guess' and 'result' are drawn
# independently, so any relationship a regression finds is pure chance.
coin = ['H', 'T']

# Fix the seed so the simulated flips (and the regression fitted on them)
# come out the same every time this cell is run.
random.seed(42)

# make an empty DataFrame
df2 = pd.DataFrame(index=range(100))

# now fill it with a "guess" and a "flip"
df2['guess'] = [random.choice(coin) for _ in range(100)]
df2['result'] = [random.choice(coin) for _ in range(100)]

# did we get it right or not? (1 = guess matched the flip, 0 = it did not)
df2['right'] = (df2['guess'] == df2['result']).astype(int)

df2.head()

Unnamed: 0,guess,result,right
0,H,T,0
1,H,H,1
2,T,H,0
3,T,H,0
4,H,T,0


In [11]:
# Regress "guessed right" on the guess itself.
# Wrapping a column in C(...) marks it as a categorical variable.
model = smf.ols(
    data=df2,
    formula='right ~ C(guess)',
)
results = model.fit()
results.summary()

0,1,2,3
Dep. Variable:,right,R-squared:,0.0
Model:,OLS,Adj. R-squared:,-0.01
Method:,Least Squares,F-statistic:,0.02841
Date:,"Mon, 08 Jul 2024",Prob (F-statistic):,0.867
Time:,19:26:09,Log-Likelihood:,-72.545
No. Observations:,100,AIC:,149.1
Df Residuals:,98,BIC:,154.3
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.4828,0.066,7.282,0.000,0.351,0.614
C(guess)[T.T],0.0172,0.102,0.169,0.867,-0.186,0.220

0,1,2,3
Omnibus:,803.925,Durbin-Watson:,2.319
Prob(Omnibus):,0.0,Jarque-Bera (JB):,16.647
Skew:,0.04,Prob(JB):,0.000243
Kurtosis:,1.003,Cond. No.,2.47


### Regressions hold things constant

#### dunks

In [12]:
# Make probability explained by the dunk flag alone

model = smf.ols(
    data=df,
    formula='made ~ dunk',
)
results = model.fit()
results.summary()

0,1,2,3
Dep. Variable:,made,R-squared:,0.049
Model:,OLS,Adj. R-squared:,0.048
Method:,Least Squares,F-statistic:,860.2
Date:,"Mon, 08 Jul 2024",Prob (F-statistic):,1.83e-184
Time:,19:26:09,Log-Likelihood:,-11774.0
No. Observations:,16876,AIC:,23550.0
Df Residuals:,16874,BIC:,23570.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.4333,0.004,112.541,0.000,0.426,0.441
dunk[T.True],0.4808,0.016,29.329,0.000,0.449,0.513

0,1,2,3
Omnibus:,63998.527,Durbin-Watson:,2.038
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2557.914
Skew:,0.238,Prob(JB):,0.0
Kurtosis:,1.153,Cond. No.,4.39


In [13]:
# On average, dunk attempts are taken from inside 1 foot of the basket,
# while non-dunks come from much farther out.
df['dist'].groupby(df['dunk']).mean()

dunk
False    14.168705
True      0.828142
Name: dist, dtype: float64

In [14]:
# Add distance to the model. With dist held constant, the dunk coefficient
# measures the effect of the dunk itself rather than the fact that dunks
# are taken closer to the basket.

model = smf.ols(
    data=df,
    formula='made ~ dunk + dist',
)
results = model.fit()
results.summary()

0,1,2,3
Dep. Variable:,made,R-squared:,0.073
Model:,OLS,Adj. R-squared:,0.073
Method:,Least Squares,F-statistic:,667.7
Date:,"Mon, 08 Jul 2024",Prob (F-statistic):,8.26e-280
Time:,19:26:09,Log-Likelihood:,-11551.0
No. Observations:,16876,AIC:,23110.0
Df Residuals:,16873,BIC:,23130.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.5427,0.006,84.867,0.000,0.530,0.555
dunk[T.True],0.3778,0.017,22.372,0.000,0.345,0.411
dist,-0.0077,0.000,-21.266,0.000,-0.008,-0.007

0,1,2,3
Omnibus:,70096.276,Durbin-Watson:,2.017
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2276.793
Skew:,0.228,Prob(JB):,0.0
Kurtosis:,1.259,Cond. No.,79.2


#### layups

In [15]:
# Add the layup flag as well.
#
# Reading the coefficients:
#   Intercept -> make probability at 0 ft for a shot that is neither
#                a dunk nor a layup
#   dist      -> the farther the shot, the lower the chance
#
# Example, probability of making a dunk from 2 ft:
#   Intercept + 2*dist + dunk[T.True]

model = smf.ols(
    data=df,
    formula='made ~ dunk + dist + layup',
)
results = model.fit()
results.summary()

0,1,2,3
Dep. Variable:,made,R-squared:,0.075
Model:,OLS,Adj. R-squared:,0.075
Method:,Least Squares,F-statistic:,454.7
Date:,"Mon, 08 Jul 2024",Prob (F-statistic):,3.91e-284
Time:,19:26:09,Log-Likelihood:,-11538.0
No. Observations:,16876,AIC:,23080.0
Df Residuals:,16872,BIC:,23110.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.4913,0.012,41.596,0.000,0.468,0.514
dunk[T.True],0.4273,0.019,22.022,0.000,0.389,0.465
layup[T.True],0.0669,0.013,5.168,0.000,0.042,0.092
dist,-0.0055,0.001,-9.732,0.000,-0.007,-0.004

0,1,2,3
Omnibus:,70806.63,Durbin-Watson:,2.012
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2254.673
Skew:,0.23,Prob(JB):,0.0
Kurtosis:,1.27,Cond. No.,106.0


#### Fixed Effects

In [16]:
# Categorical variables (all shot types, positions, ...) enter a regression
# as a set of dummy columns --> "fixed effects".

cats2 = ['layup', 'pullup', 'float', 'dunk',
         'hook', 'fadeaway', 'step']

# 'basic' marks the shots for which none of the listed type flags is set
df['basic'] = df[cats2].sum(axis=1) == 0
df['dist_sq'] = df['dist'] ** 2

# Start from an object-dtype column. Initialising with np.nan creates a
# float64 column, and writing string labels into it is a deprecated
# incompatible-dtype assignment in recent pandas.
df['shot_type'] = None
for shot in cats2 + ['basic']:
    # If a shot has several flags set, the one latest in the list wins.
    df.loc[df[shot], 'shot_type'] = shot

df[cats2 + ['basic', 'shot_type']].head()

# leave out 1 category so maths work --> basic

Unnamed: 0,layup,pullup,float,dunk,hook,fadeaway,step,basic,shot_type
0,True,False,False,False,False,False,False,False,layup
1,False,True,False,False,False,False,False,False,pullup
2,False,False,False,False,False,False,False,True,basic
3,False,False,False,False,False,False,False,True,basic
4,False,False,False,False,False,False,False,True,basic


In [17]:
# One True/False indicator column per shot_type value
pd.get_dummies(df['shot_type']).head()

Unnamed: 0,basic,dunk,fadeaway,float,hook,layup,pullup,step
0,False,False,False,False,False,True,False,False
1,False,False,False,False,False,False,True,False
2,True,False,False,False,False,False,False,False
3,True,False,False,False,False,False,False,False
4,True,False,False,False,False,False,False,False


#### Squaring Variables

In [18]:
# Wrapping a categorical column in C(...) turns it into a set of fixed
# effects. One category has to be left out as the baseline; here 'basic'
# was dropped automatically.
model = smf.ols(
    data=df,
    formula='made ~ C(shot_type) + dist + dist_sq',
)
results = model.fit()
results.summary()

0,1,2,3
Dep. Variable:,made,R-squared:,0.077
Model:,OLS,Adj. R-squared:,0.076
Method:,Least Squares,F-statistic:,155.5
Date:,"Mon, 08 Jul 2024",Prob (F-statistic):,6.53e-284
Time:,19:26:09,Log-Likelihood:,-11521.0
No. Observations:,16876,AIC:,23060.0
Df Residuals:,16866,BIC:,23140.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.5950,0.024,24.914,0.000,0.548,0.642
C(shot_type)[T.dunk],0.3300,0.028,11.974,0.000,0.276,0.384
C(shot_type)[T.fadeaway],-0.0649,0.023,-2.849,0.004,-0.110,-0.020
C(shot_type)[T.float],-0.0821,0.020,-4.205,0.000,-0.120,-0.044
C(shot_type)[T.hook],-0.0707,0.027,-2.601,0.009,-0.124,-0.017
C(shot_type)[T.layup],-0.0235,0.022,-1.045,0.296,-0.067,0.021
C(shot_type)[T.pullup],-0.0022,0.013,-0.176,0.860,-0.027,0.023
C(shot_type)[T.step],0.0249,0.019,1.299,0.194,-0.013,0.062
dist,-0.0135,0.002,-7.668,0.000,-0.017,-0.010

0,1,2,3
Omnibus:,71509.496,Durbin-Watson:,2.01
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2233.875
Skew:,0.232,Prob(JB):,0.0
Kurtosis:,1.279,Cond. No.,5710.0


In [19]:
# Same model, but with 'layup' as the omitted (reference) category

model = smf.ols(
    data=df,
    formula="made ~ C(shot_type, Treatment(reference='layup')) + dist + dist_sq",
)
results = model.fit()
results.summary()

0,1,2,3
Dep. Variable:,made,R-squared:,0.077
Model:,OLS,Adj. R-squared:,0.076
Method:,Least Squares,F-statistic:,155.5
Date:,"Mon, 08 Jul 2024",Prob (F-statistic):,6.53e-284
Time:,19:26:10,Log-Likelihood:,-11521.0
No. Observations:,16876,AIC:,23060.0
Df Residuals:,16866,BIC:,23140.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.5715,0.008,75.500,0.000,0.557,0.586
"C(shot_type, Treatment(reference='layup'))[T.basic]",0.0235,0.022,1.045,0.296,-0.021,0.067
"C(shot_type, Treatment(reference='layup'))[T.dunk]",0.3535,0.017,20.492,0.000,0.320,0.387
"C(shot_type, Treatment(reference='layup'))[T.fadeaway]",-0.0414,0.025,-1.667,0.096,-0.090,0.007
"C(shot_type, Treatment(reference='layup'))[T.float]",-0.0587,0.018,-3.297,0.001,-0.094,-0.024
"C(shot_type, Treatment(reference='layup'))[T.hook]",-0.0473,0.023,-2.038,0.042,-0.093,-0.002
"C(shot_type, Treatment(reference='layup'))[T.pullup]",0.0212,0.022,0.953,0.341,-0.022,0.065
"C(shot_type, Treatment(reference='layup'))[T.step]",0.0483,0.027,1.762,0.078,-0.005,0.102
dist,-0.0135,0.002,-7.668,0.000,-0.017,-0.010

0,1,2,3
Omnibus:,71509.496,Durbin-Watson:,2.01
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2233.875
Skew:,0.232,Prob(JB):,0.0
Kurtosis:,1.279,Cond. No.,5140.0


#### Logging Variables

In [20]:
# can't take natural log of 0
# --> floor dist at 0.5 ft before logging

# clip(lower=0.5) replaces every value below 0.5 with 0.5 in one vectorised
# step, so np.log never sees zero (which would give -inf) or a negative
# number (which would give NaN).
df['ln_dist'] = np.log(df['dist'].clip(lower=0.5))

df.head()

Unnamed: 0,name,dist,value,made,desc,team,opp,x,y,player_id,...,area,date,period,min_left,sec_left,dist_sq,made_hats,basic,shot_type,ln_dist
0,L. James,2,2,1,Layup Shot,LAC,LAL,-9,23,2544,...,Center(C),20191022,1,11,47,4,0.58538,False,layup,0.693147
1,L. Shamet,26,3,0,Pullup Jump shot,LAC,LAL,201,178,1629013,...,Right Side Center(RC),20191022,1,11,40,676,0.339051,False,pullup,3.258097
2,D. Green,25,3,1,Jump Shot,LAC,LAL,125,221,201980,...,Right Side Center(RC),20191022,1,11,23,625,0.343195,True,basic,3.218876
3,P. Beverley,26,3,0,Jump Shot,LAC,LAL,117,239,201976,...,Right Side Center(RC),20191022,1,11,0,676,0.339051,True,basic,3.258097
4,A. Davis,18,2,0,Jump Shot,LAC,LAL,96,162,203076,...,Right Side Center(RC),20191022,1,10,47,324,0.387104,True,basic,2.890372


In [21]:
# Regress the outcome on log distance. The negative ln_dist coefficient
# means the farther the shot, the lower the probability of making it.
model = smf.ols(
    data=df,
    formula='made ~ ln_dist',
)
results = model.fit()
results.summary()

0,1,2,3
Dep. Variable:,made,R-squared:,0.064
Model:,OLS,Adj. R-squared:,0.064
Method:,Least Squares,F-statistic:,1161.0
Date:,"Mon, 08 Jul 2024",Prob (F-statistic):,3.7399999999999996e-246
Time:,19:26:10,Log-Likelihood:,-11632.0
No. Observations:,16876,AIC:,23270.0
Df Residuals:,16874,BIC:,23280.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.6411,0.006,98.843,0.000,0.628,0.654
ln_dist,-0.0919,0.003,-34.073,0.000,-0.097,-0.087

0,1,2,3
Omnibus:,72657.622,Durbin-Watson:,1.995
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2133.221
Skew:,0.178,Prob(JB):,0.0
Kurtosis:,1.295,Cond. No.,4.72


#### interactions

In [22]:
# Does distance affect the make probability equally for every shot type?
# Probably not: a layup taken from farther away should suffer more.
# Interacting dist with is_layup gives each group its own distance slope.

df['is_layup'] = df['shot_type'].eq('layup')

model = smf.ols(
    data=df,
    formula='made ~ dist:is_layup',
)
results = model.fit()
results.summary()

0,1,2,3
Dep. Variable:,made,R-squared:,0.059
Model:,OLS,Adj. R-squared:,0.059
Method:,Least Squares,F-statistic:,526.7
Date:,"Mon, 08 Jul 2024",Prob (F-statistic):,1.27e-222
Time:,19:26:10,Log-Likelihood:,-11683.0
No. Observations:,16876,AIC:,23370.0
Df Residuals:,16873,BIC:,23390.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.6503,0.007,92.576,0.000,0.637,0.664
dist:is_layup[False],-0.0123,0.000,-32.330,0.000,-0.013,-0.012
dist:is_layup[True],-0.0635,0.004,-17.984,0.000,-0.070,-0.057

0,1,2,3
Omnibus:,69787.594,Durbin-Watson:,2.013
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2208.947
Skew:,0.158,Prob(JB):,0.0
Kurtosis:,1.256,Cond. No.,33.9


# Logistic Regression

In [23]:
# Modelling a probability with linear regression (OLS) often produces
# predictions outside the 0-1 range, so fit a logistic regression instead.

model = smf.logit(
    data=df,
    formula='made ~ layup + dist + dist:layup',
)
logit_results = model.fit()
logit_results.summary()

Optimization terminated successfully.
         Current function value: 0.659379
         Iterations 5


0,1,2,3
Dep. Variable:,made,No. Observations:,16876.0
Model:,Logit,Df Residuals:,16872.0
Method:,MLE,Df Model:,3.0
Date:,"Mon, 08 Jul 2024",Pseudo R-squ.:,0.04426
Time:,19:37:04,Log-Likelihood:,-11128.0
converged:,True,LL-Null:,-11643.0
Covariance Type:,nonrobust,LLR p-value:,4e-223

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.5495,0.042,13.226,0.000,0.468,0.631
layup[T.True],0.2014,0.062,3.248,0.001,0.080,0.323
dist,-0.0480,0.002,-22.719,0.000,-0.052,-0.044
dist:layup[T.True],-0.2687,0.021,-13.090,0.000,-0.309,-0.228


In [26]:
# Then to calculate shot made prob given dist and is_layup
# --> logistic function

def prob_made_logit(dist, is_layup, params=None):
    """Predicted probability of a made shot from the logistic model.

    Builds the linear score
    ``b0 + b1*is_layup + b2*dist + b3*is_layup*dist`` and passes it through
    the logistic function ``1 / (1 + exp(-score))``.

    Parameters
    ----------
    dist : number
        Shot distance, in the same units as the ``dist`` column.
    is_layup : 0/1 or bool
        Whether the shot is a layup.
    params : sequence of four numbers, optional
        Coefficients ordered (Intercept, layup, dist, dist:layup), matching
        the fitted summary table. When omitted they are read from the
        notebook-level ``logit_results`` object.

    Returns
    -------
    float
        A probability between 0 and 1.
    """
    # Fall back to the fitted model only when no coefficients are supplied
    b0, b1, b2, b3 = logit_results.params if params is None else params
    value = (b0 + b1*is_layup + b2*dist + b3*is_layup*dist)
    try:
        return 1/(1 + math.exp(-value))
    except OverflowError:
        # math.exp overflows only for a hugely negative score, where the
        # probability is indistinguishable from zero.
        return 0.0

In [29]:
# Layup (is_layup=1) taken at distance 0
prob_made_logit(0, 1)

0.6793817567896818

In [30]:
# Layup (is_layup=1) taken at distance 24
prob_made_logit(24, 1)

0.0010599337672312977

In [31]:
# Non-layup shot (is_layup=0) taken at distance 24
prob_made_logit(24, 0)

0.35393219503177753

A Logistic model guarantees predicted probability will be between 0 and 1.

Always use a logit instead of OLS when modelling a yes-or-no (0/1) outcome.

Linear and logistic regressions are useful for:

1. Analysing relationships in the data (via the coefficients)
2. Making predictions

# Random Forest

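Random Forest models are much more of a black box: they are more flexible
and make fewer assumptions about the data.

That makes them great for:

1. Analysing relationships between variables (via feature importances)
2. Making predictions

The target here is neither continuous nor 0/1: this is a classification problem with several classes (predicting `shot_type`).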

In [35]:
# List the available columns, including the ones added earlier in this notebook
df.columns

Index(['name', 'dist', 'value', 'made', 'desc', 'team', 'opp', 'x', 'y',
       'player_id', 'game_id', 'event_id', 'shot_id', 'running', 'jump',
       'hook', 'layup', 'driving', 'dunk', 'alley', 'reverse', 'turnaround',
       'fadeaway', 'bank', 'finger', 'putback', 'float', 'pullup', 'step',
       'cutting', 'tip', 'zone', 'area', 'date', 'period', 'min_left',
       'sec_left', 'dist_sq', 'made_hats', 'basic', 'shot_type', 'ln_dist',
       'is_layup'],
      dtype='object')

In [42]:
# time left in quarter - decimal format
df['time_left'] = df['min_left'] + df['sec_left']/60

shot_types = ['layup', 'pullup', 'float', 'dunk', 'hook', 'fadeaway']

# True for shots with none of the listed type flags set
df['other'] = df[shot_types].sum(axis=1) == 0

# NOTE: this overwrites the shot_type column built for the regressions
# above. From here on every shot that is not one of `shot_types` is
# labelled 'other'; re-running the earlier regression cells after this one
# would fit them on the relabelled column.
df['shot_type'] = 'other'
for shot in shot_types:
    # If a shot has several flags set, the one latest in the list wins.
    df.loc[df[shot], 'shot_type'] = shot

# show the new labels (this is the output displayed under the cell)
df['shot_type'].head()

0     layup
1    pullup
2     other
3     other
4     other
Name: shot_type, dtype: object

In [43]:
# Features used to predict the shot type, and the target column
xvars = ['dist', 'x', 'y', 'period', 'time_left']
yvar = 'shot_type'

# random_state pins the rows drawn, so the preview is the same on every run
df[xvars + [yvar]].sample(10, random_state=42)

Unnamed: 0,dist,x,y,period,time_left,shot_type
12930,1,-11,5,2,8.05,layup
4749,9,-61,71,4,4.9,other
11618,13,-132,32,1,8.466667,fadeaway
1662,1,13,10,3,6.983333,layup
14517,24,235,52,4,11.533333,other
14480,0,2,5,3,8.983333,dunk
1305,23,230,11,2,0.533333,other
1555,0,8,5,4,2.766667,layup
13329,4,19,43,4,8.75,hook
3554,12,-62,111,1,3.183333,fadeaway


In [46]:
# Share of each shot type in the data. 'other' is the most common class at
# about 41%, which is what always guessing the largest class would score.
df[yvar].value_counts(normalize=True)

# Let's use dist, x, y, period and time_left to predict shot_type

shot_type
other       0.412064
layup       0.278621
pullup      0.118808
float       0.071996
dunk        0.055167
fadeaway    0.033717
hook        0.029628
Name: proportion, dtype: float64

In [50]:
# Holdout Set: keep 20% of the shots aside to evaluate the model on data it
# has not seen. random_state makes the split the same on every run.
train, test = train_test_split(df, test_size=0.2, random_state=42)
# Work on independent copies so that adding prediction columns to `test`
# later does not write into a slice of df.
train, test = train.copy(), test.copy()

# random_state also fixes the randomness inside the forest itself
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(train[xvars], train[yvar])

In [51]:
# Predict a shot type for every held-out shot and flag the correct ones
test['shot_type_hat'] = model.predict(test[xvars])
test['correct'] = test['shot_type_hat'].eq(test['shot_type'])

# share of held-out shots classified correctly
test['correct'].mean()

0.6863151658767772

In [52]:
# Per-class predicted probabilities: one row per held-out shot, one column per class
model.predict_proba(test[xvars])

array([[0.  , 0.  , 0.55, ..., 0.26, 0.02, 0.12],
       [0.  , 0.03, 0.29, ..., 0.05, 0.22, 0.08],
       [0.42, 0.  , 0.01, ..., 0.57, 0.  , 0.  ],
       ...,
       [0.  , 0.01, 0.35, ..., 0.37, 0.14, 0.03],
       [0.  , 0.05, 0.01, ..., 0.  , 0.5 , 0.44],
       [0.  , 0.11, 0.5 , ..., 0.01, 0.26, 0.09]])

In [56]:
# Label the probability matrix: rows keep the test-set index and columns
# are named after the classes the model learned.
probs = DataFrame(
    data=model.predict_proba(test[xvars]),
    columns=model.classes_,
    index=test.index,
)
probs.head()

Unnamed: 0,dunk,fadeaway,float,hook,layup,other,pullup
7617,0.0,0.0,0.55,0.05,0.26,0.02,0.12
10310,0.0,0.03,0.29,0.33,0.05,0.22,0.08
13685,0.42,0.0,0.01,0.0,0.57,0.0,0.0
6975,0.0,0.0,0.0,0.0,0.0,0.92,0.08
1940,0.0,0.0,0.0,0.0,0.0,0.85,0.15


In [57]:
# Put the identifying columns and the per-class probabilities side by side.
# (`results` now holds a DataFrame, no longer a fitted regression.)
results = pd.concat(
    objs=[
        test[['name', 'dist', 'shot_type', 'correct', 'shot_type_hat']],
        probs,
    ],
    axis='columns',
)

In [60]:
# For each actual shot type: the share classified correctly and the average
# probability the model gave to every class.
results.groupby('shot_type')[
    [
        'correct',
        'layup', 'pullup', 'float', 'dunk',
        'hook', 'fadeaway', 'other',
    ]
].mean().round(decimals=2)

Unnamed: 0_level_0,correct,layup,pullup,float,dunk,hook,fadeaway,other
shot_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
dunk,0.13,0.72,0.0,0.01,0.24,0.01,0.0,0.01
fadeaway,0.25,0.06,0.21,0.19,0.0,0.05,0.21,0.28
float,0.34,0.23,0.13,0.26,0.01,0.1,0.09,0.17
hook,0.08,0.38,0.05,0.26,0.03,0.14,0.05,0.1
layup,0.9,0.74,0.01,0.04,0.15,0.04,0.01,0.02
other,0.85,0.02,0.16,0.03,0.0,0.01,0.03,0.76
pullup,0.3,0.01,0.32,0.07,0.0,0.01,0.06,0.53


In [62]:
# Cross Validation: score the model on 10 different train/test folds rather
# than a single holdout set. random_state fixes the forest's internal
# randomness so the scores are reproducible.
model = RandomForestClassifier(n_estimators=100, random_state=42)
scores = cross_val_score(model, df[xvars], df[yvar], cv=10)
scores

array([0.66943128, 0.67120853, 0.6771327 , 0.66350711, 0.67061611,
       0.66587678, 0.682869  , 0.67575578, 0.68168346, 0.67219917])

In [63]:
# Average accuracy across the 10 cross-validation folds
scores.mean()

0.6730279921451187

In [64]:
# Refit on the full data set. random_state makes the fitted forest, and the
# feature importances read from it afterwards, reproducible.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(df[xvars], df[yvar])

In [65]:
# Feature importance: relative contribution of each input, largest first
Series(
    data=model.feature_importances_,
    index=xvars,
).sort_values(ascending=False)

dist         0.298022
y            0.242542
x            0.212076
time_left    0.195403
period       0.051956
dtype: float64