# Imports

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression
from scipy.stats import norm

# Setup

In [2]:
def maxnorm(elo1, elo2, var=35): # 8 looks best
    nm=elo1-elo2
    var*=2
    return 1-norm.cdf(-nm/var**.5)

In [3]:
NBA_data= pd.read_csv("nbaallelo.csv")
NBA_data

Unnamed: 0,gameorder,game_id,lg_id,_iscopy,year_id,date_game,seasongame,is_playoffs,team_id,fran_id,...,win_equiv,opp_id,opp_fran,opp_pts,opp_elo_i,opp_elo_n,game_location,game_result,forecast,notes
0,1,194611010TRH,NBA,0,1947,11/1/1946,1,0,TRH,Huskies,...,40.294830,NYK,Knicks,68,1300.0000,1306.7233,H,L,0.640065,
1,1,194611010TRH,NBA,1,1947,11/1/1946,1,0,NYK,Knicks,...,41.705170,TRH,Huskies,66,1300.0000,1293.2767,A,W,0.359935,
2,2,194611020CHS,NBA,0,1947,11/2/1946,1,0,CHS,Stags,...,42.012257,NYK,Knicks,47,1306.7233,1297.0712,H,W,0.631101,
3,2,194611020CHS,NBA,1,1947,11/2/1946,2,0,NYK,Knicks,...,40.692783,CHS,Stags,63,1300.0000,1309.6521,A,L,0.368899,
4,3,194611020DTF,NBA,0,1947,11/2/1946,1,0,DTF,Falcons,...,38.864048,WSC,Capitols,50,1300.0000,1320.3811,H,L,0.640065,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126309,63155,201506110CLE,NBA,0,2015,6/11/2015,100,1,CLE,Cavaliers,...,60.309792,GSW,Warriors,103,1790.9591,1809.9791,H,L,0.546572,
126310,63156,201506140GSW,NBA,0,2015,6/14/2015,102,1,GSW,Warriors,...,68.013329,CLE,Cavaliers,91,1704.3949,1700.7391,H,W,0.765565,
126311,63156,201506140GSW,NBA,1,2015,6/14/2015,101,1,CLE,Cavaliers,...,60.010067,GSW,Warriors,104,1809.9791,1813.6349,A,L,0.234435,
126312,63157,201506170CLE,NBA,0,2015,6/16/2015,102,1,CLE,Cavaliers,...,59.290245,GSW,Warriors,105,1813.6349,1822.2881,H,L,0.481450,


In [4]:
NBA_data.columns

Index(['gameorder', 'game_id', 'lg_id', '_iscopy', 'year_id', 'date_game',
       'seasongame', 'is_playoffs', 'team_id', 'fran_id', 'pts', 'elo_i',
       'elo_n', 'win_equiv', 'opp_id', 'opp_fran', 'opp_pts', 'opp_elo_i',
       'opp_elo_n', 'game_location', 'game_result', 'forecast', 'notes'],
      dtype='object')

In [5]:
NBA_data['maxnorm_forecast'] = NBA_data.apply(lambda x: maxnorm(x['elo_i'], x['opp_elo_i']), axis=1)
NBA_data['results'] = (NBA_data['game_result'] == 'W')*1

In [6]:
NBA_var=np.var(NBA_data['elo_i'])
NBA_mean=np.mean(NBA_data[NBA_data['team_id']=='NYK']['elo_i'])

print(NBA_mean, NBA_var/(126314**.5))

1497.6156040908302 35.382742825274356


In [7]:
NBA_data[['elo_i', 'opp_elo_i', 'forecast', 'maxnorm_forecast', 'results']]

Unnamed: 0,elo_i,opp_elo_i,forecast,maxnorm_forecast,results
0,1300.0000,1300.0000,0.640065,5.000000e-01,0
1,1300.0000,1300.0000,0.359935,5.000000e-01,1
2,1300.0000,1306.7233,0.631101,2.108175e-01,1
3,1306.7233,1300.0000,0.368899,7.891825e-01,0
4,1300.0000,1300.0000,0.640065,5.000000e-01,0
...,...,...,...,...,...
126309,1723.4149,1790.9591,0.546572,3.330669e-16,0
126310,1809.9791,1704.3949,0.765565,1.000000e+00,1
126311,1704.3949,1809.9791,0.234435,0.000000e+00,0
126312,1700.7391,1813.6349,0.481450,0.000000e+00,0


In [8]:
regdf1=NBA_data[['game_id','forecast', 'maxnorm_forecast']]
model1 = smf.ols(formula='maxnorm_forecast ~ forecast', data=regdf1).fit()
model1.summary()

0,1,2,3
Dep. Variable:,maxnorm_forecast,R-squared:,0.481
Model:,OLS,Adj. R-squared:,0.481
Method:,Least Squares,F-statistic:,116900.0
Date:,"Mon, 04 Jan 2021",Prob (F-statistic):,0.0
Time:,15:16:06,Log-Likelihood:,-46848.0
No. Observations:,126314,AIC:,93700.0
Df Residuals:,126312,BIC:,93720.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.2837,0.002,-113.695,0.000,-0.289,-0.279
forecast,1.5673,0.005,341.971,0.000,1.558,1.576

0,1,2,3
Omnibus:,4471.458,Durbin-Watson:,2.994
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2007.814
Skew:,-0.0,Prob(JB):,0.0
Kurtosis:,2.382,Cond. No.,5.85


# 538 Model Test Results

In [9]:
regdf2=NBA_data[['game_id','forecast', 'results']]
model2 = smf.ols(formula='forecast ~ results', data=regdf2).fit()
model2.summary()

0,1,2,3
Dep. Variable:,forecast,R-squared:,0.185
Model:,OLS,Adj. R-squared:,0.185
Method:,Least Squares,F-statistic:,28640.0
Date:,"Mon, 04 Jan 2021",Prob (F-statistic):,0.0
Time:,15:16:06,Log-Likelihood:,27686.0
No. Observations:,126314,AIC:,-55370.0
Df Residuals:,126312,BIC:,-55350.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.4075,0.001,526.892,0.000,0.406,0.409
results,0.1851,0.001,169.227,0.000,0.183,0.187

0,1,2,3
Omnibus:,6278.891,Durbin-Watson:,3.0
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2485.259
Skew:,-0.0,Prob(JB):,0.0
Kurtosis:,2.313,Cond. No.,2.62


In [10]:
log_reg1 = sm.Logit(regdf2['results'], regdf2['forecast']).fit() 
log_reg1.summary()

Optimization terminated successfully.
         Current function value: 0.678589
         Iterations 4


0,1,2,3
Dep. Variable:,results,No. Observations:,126314.0
Model:,Logit,Df Residuals:,126313.0
Method:,MLE,Df Model:,0.0
Date:,"Mon, 04 Jan 2021",Pseudo R-squ.:,0.021
Time:,15:16:06,Log-Likelihood:,-85715.0
converged:,True,LL-Null:,-87554.0
Covariance Type:,nonrobust,LLR p-value:,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
forecast,0.6341,0.011,59.958,0.000,0.613,0.655


# Maxnorm Test Results

In [11]:
regdf3=NBA_data[['game_id','maxnorm_forecast', 'results']]
model3 = smf.ols(formula='maxnorm_forecast ~ results', data=regdf3).fit()
model3.summary()

0,1,2,3
Dep. Variable:,maxnorm_forecast,R-squared:,0.086
Model:,OLS,Adj. R-squared:,0.086
Method:,Least Squares,F-statistic:,11960.0
Date:,"Mon, 04 Jan 2021",Prob (F-statistic):,0.0
Time:,15:16:06,Log-Likelihood:,-82527.0
No. Observations:,126314,AIC:,165100.0
Df Residuals:,126312,BIC:,165100.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.3569,0.002,192.873,0.000,0.353,0.361
results,0.2862,0.003,109.343,0.000,0.281,0.291

0,1,2,3
Omnibus:,621261.195,Durbin-Watson:,2.996
Prob(Omnibus):,0.0,Jarque-Bera (JB):,13436.595
Skew:,0.0,Prob(JB):,0.0
Kurtosis:,1.402,Cond. No.,2.62


In [12]:
log_reg2 = sm.Logit(regdf3['results'], regdf3['maxnorm_forecast']).fit() 
log_reg2.summary()

Optimization terminated successfully.
         Current function value: 0.671812
         Iterations 4


0,1,2,3
Dep. Variable:,results,No. Observations:,126314.0
Model:,Logit,Df Residuals:,126313.0
Method:,MLE,Df Model:,0.0
Date:,"Mon, 04 Jan 2021",Pseudo R-squ.:,0.03078
Time:,15:16:06,Log-Likelihood:,-84859.0
converged:,True,LL-Null:,-87554.0
Covariance Type:,nonrobust,LLR p-value:,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
maxnorm_forecast,0.6053,0.008,71.812,0.000,0.589,0.622


# Weighted Mix

In [16]:
a=.1
NBA_data['w_forecast']= NBA_data['maxnorm_forecast']*a+ NBA_data['forecast']*(1-a)

In [17]:
regdf4=NBA_data[['game_id','w_forecast', 'results']]
model4 = smf.ols(formula='w_forecast ~ results', data=regdf4).fit()
model4.summary()

0,1,2,3
Dep. Variable:,w_forecast,R-squared:,0.18
Model:,OLS,Adj. R-squared:,0.18
Method:,Least Squares,F-statistic:,27690.0
Date:,"Mon, 04 Jan 2021",Prob (F-statistic):,0.0
Time:,15:16:35,Log-Likelihood:,18846.0
No. Observations:,126314,AIC:,-37690.0
Df Residuals:,126312,BIC:,-37670.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.4024,0.001,485.184,0.000,0.401,0.404
results,0.1952,0.001,166.406,0.000,0.193,0.197

0,1,2,3
Omnibus:,10013.806,Durbin-Watson:,3.0
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3245.152
Skew:,-0.0,Prob(JB):,0.0
Kurtosis:,2.215,Cond. No.,2.62


In [18]:
log_reg3 = sm.Logit(regdf3['results'], regdf4['w_forecast']).fit() 
log_reg3.summary()

Optimization terminated successfully.
         Current function value: 0.677294
         Iterations 4


0,1,2,3
Dep. Variable:,results,No. Observations:,126314.0
Model:,Logit,Df Residuals:,126313.0
Method:,MLE,Df Model:,0.0
Date:,"Mon, 04 Jan 2021",Pseudo R-squ.:,0.02287
Time:,15:16:35,Log-Likelihood:,-85552.0
converged:,True,LL-Null:,-87554.0
Covariance Type:,nonrobust,LLR p-value:,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
w_forecast,0.6554,0.010,62.476,0.000,0.635,0.676


# Binned Testing

In [19]:
bindf=NBA_data[['maxnorm_forecast','results']]

In [52]:
bins= np.arange(0,1.01,.01)
labels= np.arange(0,100,1)
print(bins)
bindf['binned']=pd.cut(bindf['maxnorm_forecast'],bins=bins,labels=labels)

[0.   0.01 0.02 0.03 0.04 0.05 0.06 0.07 0.08 0.09 0.1  0.11 0.12 0.13
 0.14 0.15 0.16 0.17 0.18 0.19 0.2  0.21 0.22 0.23 0.24 0.25 0.26 0.27
 0.28 0.29 0.3  0.31 0.32 0.33 0.34 0.35 0.36 0.37 0.38 0.39 0.4  0.41
 0.42 0.43 0.44 0.45 0.46 0.47 0.48 0.49 0.5  0.51 0.52 0.53 0.54 0.55
 0.56 0.57 0.58 0.59 0.6  0.61 0.62 0.63 0.64 0.65 0.66 0.67 0.68 0.69
 0.7  0.71 0.72 0.73 0.74 0.75 0.76 0.77 0.78 0.79 0.8  0.81 0.82 0.83
 0.84 0.85 0.86 0.87 0.88 0.89 0.9  0.91 0.92 0.93 0.94 0.95 0.96 0.97
 0.98 0.99 1.  ]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [53]:
bindf['binned']=bindf['binned'].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [54]:
bindf

Unnamed: 0,maxnorm_forecast,results,binned
0,5.000000e-01,0,49
1,5.000000e-01,1,49
2,2.108175e-01,1,21
3,7.891825e-01,0,78
4,5.000000e-01,0,49
...,...,...,...
126309,3.330669e-16,0,0
126310,1.000000e+00,1,99
126311,0.000000e+00,0,0
126312,0.000000e+00,0,0


In [58]:
#set(bindf['binned'])

In [59]:
bin_comp=[]
for i in range(len(bins)-1):
    binaver= np.mean(bindf['maxnorm_forecast'][bindf['binned']==labels[i]])
    bin_comp.append([(bins[i]+bins[i+1])/2,binaver])
    print((bins[i]+bins[i+1])/2,": ", binaver)

0.005 :  0.00017559408105474617
0.015 :  0.014625604127347137
0.025 :  0.024720900014084703
0.035 :  0.034736494276500676
0.045 :  0.04443213716756697
0.055 :  0.055021667955487685
0.065 :  0.06480881195502927
0.07500000000000001 :  0.07496106623677808
0.08499999999999999 :  0.08500340378572184
0.095 :  0.09516151586962268
0.10500000000000001 :  0.10438546198473045
0.11499999999999999 :  0.1150751225220412
0.125 :  0.12471484289942385
0.135 :  0.13486904817590126
0.14500000000000002 :  0.1448564617608738
0.155 :  0.15539327137066572
0.165 :  0.16474709392425102
0.175 :  0.17510438411387236
0.185 :  0.18522666694721723
0.195 :  0.19544900272003918
0.20500000000000002 :  0.2052112696965774
0.215 :  0.21483198909480114
0.225 :  0.22500101612511286
0.235 :  0.2355478007495365
0.245 :  0.24464126061283517
0.255 :  0.25498282095131153
0.265 :  0.26505039233427613
0.275 :  0.27457043561122135
0.28500000000000003 :  0.28496792871226834
0.295 :  0.29510240799657766
0.305 :  0.3052532376995282
0

In [61]:
bin_comp

[[0.005, 0.00017559408105474617],
 [0.015, 0.014625604127347137],
 [0.025, 0.024720900014084703],
 [0.035, 0.034736494276500676],
 [0.045, 0.04443213716756697],
 [0.055, 0.055021667955487685],
 [0.065, 0.06480881195502927],
 [0.07500000000000001, 0.07496106623677808],
 [0.08499999999999999, 0.08500340378572184],
 [0.095, 0.09516151586962268],
 [0.10500000000000001, 0.10438546198473045],
 [0.11499999999999999, 0.1150751225220412],
 [0.125, 0.12471484289942385],
 [0.135, 0.13486904817590126],
 [0.14500000000000002, 0.1448564617608738],
 [0.155, 0.15539327137066572],
 [0.165, 0.16474709392425102],
 [0.175, 0.17510438411387236],
 [0.185, 0.18522666694721723],
 [0.195, 0.19544900272003918],
 [0.20500000000000002, 0.2052112696965774],
 [0.215, 0.21483198909480114],
 [0.225, 0.22500101612511286],
 [0.235, 0.2355478007495365],
 [0.245, 0.24464126061283517],
 [0.255, 0.25498282095131153],
 [0.265, 0.26505039233427613],
 [0.275, 0.27457043561122135],
 [0.28500000000000003, 0.28496792871226834],
