# Imports

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression
from scipy.stats import norm

# Setup

In [2]:
def recalib(forecast): # 2000 looks best
    return forecast-2**(-17)*(2*(forecast-.5))**17

In [3]:
NBA_data= pd.read_csv("nbaallelo.csv")
NBA_data

Unnamed: 0,gameorder,game_id,lg_id,_iscopy,year_id,date_game,seasongame,is_playoffs,team_id,fran_id,...,win_equiv,opp_id,opp_fran,opp_pts,opp_elo_i,opp_elo_n,game_location,game_result,forecast,notes
0,1,194611010TRH,NBA,0,1947,11/1/1946,1,0,TRH,Huskies,...,40.294830,NYK,Knicks,68,1300.0000,1306.7233,H,L,0.640065,
1,1,194611010TRH,NBA,1,1947,11/1/1946,1,0,NYK,Knicks,...,41.705170,TRH,Huskies,66,1300.0000,1293.2767,A,W,0.359935,
2,2,194611020CHS,NBA,0,1947,11/2/1946,1,0,CHS,Stags,...,42.012257,NYK,Knicks,47,1306.7233,1297.0712,H,W,0.631101,
3,2,194611020CHS,NBA,1,1947,11/2/1946,2,0,NYK,Knicks,...,40.692783,CHS,Stags,63,1300.0000,1309.6521,A,L,0.368899,
4,3,194611020DTF,NBA,0,1947,11/2/1946,1,0,DTF,Falcons,...,38.864048,WSC,Capitols,50,1300.0000,1320.3811,H,L,0.640065,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126309,63155,201506110CLE,NBA,0,2015,6/11/2015,100,1,CLE,Cavaliers,...,60.309792,GSW,Warriors,103,1790.9591,1809.9791,H,L,0.546572,
126310,63156,201506140GSW,NBA,0,2015,6/14/2015,102,1,GSW,Warriors,...,68.013329,CLE,Cavaliers,91,1704.3949,1700.7391,H,W,0.765565,
126311,63156,201506140GSW,NBA,1,2015,6/14/2015,101,1,CLE,Cavaliers,...,60.010067,GSW,Warriors,104,1809.9791,1813.6349,A,L,0.234435,
126312,63157,201506170CLE,NBA,0,2015,6/16/2015,102,1,CLE,Cavaliers,...,59.290245,GSW,Warriors,105,1813.6349,1822.2881,H,L,0.481450,


In [4]:
NBA_data.columns

Index(['gameorder', 'game_id', 'lg_id', '_iscopy', 'year_id', 'date_game',
       'seasongame', 'is_playoffs', 'team_id', 'fran_id', 'pts', 'elo_i',
       'elo_n', 'win_equiv', 'opp_id', 'opp_fran', 'opp_pts', 'opp_elo_i',
       'opp_elo_n', 'game_location', 'game_result', 'forecast', 'notes'],
      dtype='object')

In [5]:
NBA_data['rc_forecast'] = NBA_data.apply(lambda x: recalib(x['forecast']), axis=1)
NBA_data['results'] = (NBA_data['game_result'] == 'W')*1

In [6]:
NBA_var=np.var(NBA_data['elo_i'])
NBA_mean=np.mean(NBA_data[NBA_data['team_id']=='NYK']['elo_i'])

print(NBA_mean, NBA_var) #/(126314**.5)

1497.6156040908302 12575.267726994409


In [7]:
NBA_data[['elo_i', 'opp_elo_i', 'forecast', 'rc_forecast', 'results']]

Unnamed: 0,elo_i,opp_elo_i,forecast,rc_forecast,results
0,1300.0000,1300.0000,0.640065,0.640065,0
1,1300.0000,1300.0000,0.359935,0.359935,1
2,1300.0000,1306.7233,0.631101,0.631101,1
3,1306.7233,1300.0000,0.368899,0.368899,0
4,1300.0000,1300.0000,0.640065,0.640065,0
...,...,...,...,...,...
126309,1723.4149,1790.9591,0.546572,0.546572,0
126310,1809.9791,1704.3949,0.765565,0.765544,1
126311,1704.3949,1809.9791,0.234435,0.234456,0
126312,1700.7391,1813.6349,0.481450,0.481450,0


In [8]:
regdf1=NBA_data[['game_id','forecast', 'rc_forecast']]
model1 = smf.ols(formula='rc_forecast ~ forecast', data=regdf1).fit()
model1.summary()

0,1,2,3
Dep. Variable:,rc_forecast,R-squared:,0.994
Model:,OLS,Adj. R-squared:,0.994
Method:,Least Squares,F-statistic:,19900000.0
Date:,"Mon, 04 Jan 2021",Prob (F-statistic):,0.0
Time:,18:18:28,Log-Likelihood:,337790.0
No. Observations:,126314,AIC:,-675600.0
Df Residuals:,126312,BIC:,-675600.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0135,0.000,113.561,0.000,0.013,0.014
forecast,0.9730,0.000,4461.081,0.000,0.973,0.973

0,1,2,3
Omnibus:,54835.578,Durbin-Watson:,2.998
Prob(Omnibus):,0.0,Jarque-Bera (JB):,176826740.932
Skew:,0.0,Prob(JB):,0.0
Kurtosis:,186.296,Cond. No.,5.85


# 538 Model Test Results

In [9]:
regdf2=NBA_data[['game_id','forecast', 'results']]
model2 = smf.ols(formula='forecast ~ results', data=regdf2).fit()
model2.summary()

0,1,2,3
Dep. Variable:,forecast,R-squared:,0.185
Model:,OLS,Adj. R-squared:,0.185
Method:,Least Squares,F-statistic:,28640.0
Date:,"Mon, 04 Jan 2021",Prob (F-statistic):,0.0
Time:,18:18:28,Log-Likelihood:,27686.0
No. Observations:,126314,AIC:,-55370.0
Df Residuals:,126312,BIC:,-55350.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.4075,0.001,526.892,0.000,0.406,0.409
results,0.1851,0.001,169.227,0.000,0.183,0.187

0,1,2,3
Omnibus:,6278.891,Durbin-Watson:,3.0
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2485.259
Skew:,-0.0,Prob(JB):,0.0
Kurtosis:,2.313,Cond. No.,2.62


In [10]:
log_reg1 = sm.Logit(regdf2['results'], regdf2['forecast']).fit() 
log_reg1.summary()

Optimization terminated successfully.
         Current function value: 0.678589
         Iterations 4


0,1,2,3
Dep. Variable:,results,No. Observations:,126314.0
Model:,Logit,Df Residuals:,126313.0
Method:,MLE,Df Model:,0.0
Date:,"Mon, 04 Jan 2021",Pseudo R-squ.:,0.021
Time:,18:18:28,Log-Likelihood:,-85715.0
converged:,True,LL-Null:,-87554.0
Covariance Type:,nonrobust,LLR p-value:,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
forecast,0.6341,0.011,59.958,0.000,0.613,0.655


# RC Test Results

In [11]:
regdf3=NBA_data[['game_id','rc_forecast', 'results']]
model3 = smf.ols(formula='rc_forecast ~ results', data=regdf3).fit()
model3.summary()

0,1,2,3
Dep. Variable:,rc_forecast,R-squared:,0.184
Model:,OLS,Adj. R-squared:,0.184
Method:,Least Squares,F-statistic:,28400.0
Date:,"Mon, 04 Jan 2021",Prob (F-statistic):,0.0
Time:,18:18:28,Log-Likelihood:,30642.0
No. Observations:,126314,AIC:,-61280.0
Df Residuals:,126312,BIC:,-61260.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.4100,0.001,542.702,0.000,0.408,0.411
results,0.1800,0.001,168.522,0.000,0.178,0.182

0,1,2,3
Omnibus:,8417.725,Durbin-Watson:,3.0
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2949.548
Skew:,-0.0,Prob(JB):,0.0
Kurtosis:,2.251,Cond. No.,2.62


In [12]:
log_reg2 = sm.Logit(regdf3['results'], regdf3['rc_forecast']).fit() 
log_reg2.summary()

Optimization terminated successfully.
         Current function value: 0.679274
         Iterations 4


0,1,2,3
Dep. Variable:,results,No. Observations:,126314.0
Model:,Logit,Df Residuals:,126313.0
Method:,MLE,Df Model:,0.0
Date:,"Mon, 04 Jan 2021",Pseudo R-squ.:,0.02001
Time:,18:18:29,Log-Likelihood:,-85802.0
converged:,True,LL-Null:,-87554.0
Covariance Type:,nonrobust,LLR p-value:,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
rc_forecast,0.6209,0.011,58.571,0.000,0.600,0.642


# Weighted Mix

In [13]:
a=.1
NBA_data['w_forecast']= NBA_data['rc_forecast']*a+ NBA_data['forecast']*(1-a)

In [14]:
regdf4=NBA_data[['game_id','w_forecast', 'results','rc_forecast','forecast']]
model4 = smf.ols(formula='w_forecast ~ results', data=regdf4).fit()
model4.summary()

0,1,2,3
Dep. Variable:,w_forecast,R-squared:,0.185
Model:,OLS,Adj. R-squared:,0.185
Method:,Least Squares,F-statistic:,28630.0
Date:,"Mon, 04 Jan 2021",Prob (F-statistic):,0.0
Time:,18:18:29,Log-Likelihood:,28022.0
No. Observations:,126314,AIC:,-56040.0
Df Residuals:,126312,BIC:,-56020.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.4077,0.001,528.621,0.000,0.406,0.409
results,0.1846,0.001,169.215,0.000,0.182,0.187

0,1,2,3
Omnibus:,6665.358,Durbin-Watson:,3.0
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2576.091
Skew:,-0.0,Prob(JB):,0.0
Kurtosis:,2.3,Cond. No.,2.62


In [15]:
log_reg3 = sm.Logit(regdf3['results'], regdf4[['forecast','rc_forecast']]).fit() 
log_reg3.summary()

Optimization terminated successfully.
         Current function value: 0.661104
         Iterations 9


0,1,2,3
Dep. Variable:,results,No. Observations:,126314.0
Model:,Logit,Df Residuals:,126312.0
Method:,MLE,Df Model:,1.0
Date:,"Mon, 04 Jan 2021",Pseudo R-squ.:,0.04623
Time:,18:18:29,Log-Likelihood:,-83507.0
converged:,True,LL-Null:,-87554.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
forecast,72.9317,1.841,39.626,0.000,69.324,76.539
rc_forecast,-72.4079,1.842,-39.313,0.000,-76.018,-68.798


In [16]:
log_reg4 = sm.Logit(regdf4['results'], regdf4['w_forecast']).fit() 
log_reg4.summary()

Optimization terminated successfully.
         Current function value: 0.678656
         Iterations 4


0,1,2,3
Dep. Variable:,results,No. Observations:,126314.0
Model:,Logit,Df Residuals:,126313.0
Method:,MLE,Df Model:,0.0
Date:,"Mon, 04 Jan 2021",Pseudo R-squ.:,0.02091
Time:,18:18:29,Log-Likelihood:,-85724.0
converged:,True,LL-Null:,-87554.0
Covariance Type:,nonrobust,LLR p-value:,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
w_forecast,0.6329,0.011,59.823,0.000,0.612,0.654


# Binned Testing 538

In [17]:
bindff=NBA_data[['forecast','results']]
bins= np.arange(0,1.01,.05)
labels= np.arange(0,100,5)
print(bins)
bindff['binned']=pd.cut(bindff['forecast'],bins=bins,labels=labels)
bindff['binned']=bindff['binned'].fillna(0)
bindff

[0.   0.05 0.1  0.15 0.2  0.25 0.3  0.35 0.4  0.45 0.5  0.55 0.6  0.65
 0.7  0.75 0.8  0.85 0.9  0.95 1.  ]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,forecast,results,binned
0,0.640065,0,60
1,0.359935,1,35
2,0.631101,1,60
3,0.368899,0,35
4,0.640065,0,60
...,...,...,...
126309,0.546572,0,50
126310,0.765565,1,75
126311,0.234435,0,20
126312,0.481450,0,45


In [18]:
bin_compf=[]
for i in range(len(bins)-1):
    binaver= np.mean(bindff['results'][bindff['binned']==labels[i]])
    bin_compf.append([(bins[i]+bins[i+1])/2,binaver])
bin_compf

[[0.025, 0.031055900621118012],
 [0.07500000000000001, 0.07574094401756312],
 [0.125, 0.1243680485338726],
 [0.17500000000000002, 0.17873542917696927],
 [0.225, 0.2273752466873414],
 [0.275, 0.27632074287865477],
 [0.32500000000000007, 0.3236698878151727],
 [0.375, 0.37402207694780837],
 [0.42500000000000004, 0.43101182654402104],
 [0.475, 0.48166611277279275],
 [0.525, 0.5183338872272073],
 [0.5750000000000001, 0.568988173455979],
 [0.625, 0.6259779230521916],
 [0.675, 0.6763301121848273],
 [0.7250000000000001, 0.7236792571213452],
 [0.775, 0.7726247533126586],
 [0.8250000000000001, 0.8212645708230307],
 [0.875, 0.8756319514661274],
 [0.925, 0.9242590559824369],
 [0.9750000000000001, 0.968944099378882]]

# Binned Testing

In [19]:
bindf=NBA_data[['rc_forecast','results']]

In [20]:
bins= np.arange(0,1.01,.05)
labels= np.arange(0,100,5)
print(bins)
bindf['binned']=pd.cut(bindf['rc_forecast'],bins=bins,labels=labels)

[0.   0.05 0.1  0.15 0.2  0.25 0.3  0.35 0.4  0.45 0.5  0.55 0.6  0.65
 0.7  0.75 0.8  0.85 0.9  0.95 1.  ]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [21]:
bindf['binned']=bindf['binned'].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [22]:
bindf

Unnamed: 0,rc_forecast,results,binned
0,0.640065,0,60
1,0.359935,1,35
2,0.631101,1,60
3,0.368899,0,35
4,0.640065,0,60
...,...,...,...
126309,0.546572,0,50
126310,0.765544,1,75
126311,0.234456,0,20
126312,0.481450,0,45


In [23]:
#set(bindf['binned'])

In [24]:
bin_comp=[]
for i in range(len(bins)-1):
    binaver= np.mean(bindf['results'][bindf['binned']==labels[i]])
    bin_comp.append([(bins[i]+bins[i+1])/2,binaver])

In [25]:
bin_comp

[[0.025, nan],
 [0.07500000000000001, nan],
 [0.125, 0.11119812059514488],
 [0.17500000000000002, 0.17052530736068977],
 [0.225, 0.2242861103410036],
 [0.275, 0.2746294681778553],
 [0.32500000000000007, 0.3228451141147795],
 [0.375, 0.3736487209675693],
 [0.42500000000000004, 0.43080625752105894],
 [0.475, 0.48156758551976087],
 [0.525, 0.5184324144802391],
 [0.5750000000000001, 0.5691937424789411],
 [0.625, 0.6263512790324307],
 [0.675, 0.6771548858852204],
 [0.7250000000000001, 0.7253705318221447],
 [0.775, 0.7757138896589963],
 [0.8250000000000001, 0.8294746926393103],
 [0.875, 0.8888018794048551],
 [0.925, nan],
 [0.9750000000000001, nan]]

# Weighted Binned Testing

In [26]:
bindfw=NBA_data[['w_forecast','results']]
bins= np.arange(0,1.01,.05)
labels= np.arange(0,100,5)
print(bins)
bindfw['binned']=pd.cut(bindfw['w_forecast'],bins=bins,labels=labels)
bindfw['binned']=bindfw['binned'].fillna(0)
bindfw

[0.   0.05 0.1  0.15 0.2  0.25 0.3  0.35 0.4  0.45 0.5  0.55 0.6  0.65
 0.7  0.75 0.8  0.85 0.9  0.95 1.  ]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,w_forecast,results,binned
0,0.640065,0,60
1,0.359935,1,35
2,0.631101,1,60
3,0.368899,0,35
4,0.640065,0,60
...,...,...,...
126309,0.546572,0,50
126310,0.765563,1,75
126311,0.234437,0,20
126312,0.481450,0,45


In [27]:
bin_compw=[]
for i in range(len(bins)-1):
    binaver= np.mean(bindfw['results'][bindfw['binned']==labels[i]])
    bin_compw.append([(bins[i]+bins[i+1])/2,binaver])
bin_compw

[[0.025, nan],
 [0.07500000000000001, 0.07232191408374117],
 [0.125, 0.12230392156862745],
 [0.17500000000000002, 0.17869718309859156],
 [0.225, 0.22731116121758738],
 [0.275, 0.27632074287865477],
 [0.32500000000000007, 0.3236698878151727],
 [0.375, 0.37402207694780837],
 [0.42500000000000004, 0.43101182654402104],
 [0.475, 0.48166611277279275],
 [0.525, 0.5183338872272073],
 [0.5750000000000001, 0.568988173455979],
 [0.625, 0.6259779230521916],
 [0.675, 0.6763301121848273],
 [0.7250000000000001, 0.7236792571213452],
 [0.775, 0.7726888387824127],
 [0.8250000000000001, 0.8213028169014085],
 [0.875, 0.8776960784313725],
 [0.925, 0.9276780859162589],
 [0.9750000000000001, nan]]

# End