<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Run-Values-from-Regression" data-toc-modified-id="Run-Values-from-Regression-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Run Values from Regression</a></span><ul class="toc-item"><li><span><a href="#Load-Data" data-toc-modified-id="Load-Data-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Load Data</a></span></li><li><span><a href="#First-Regression" data-toc-modified-id="First-Regression-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>First Regression</a></span></li><li><span><a href="#Are-Ks-more-costly-than-other-outs?" data-toc-modified-id="Are-Ks-more-costly-than-other-outs?-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Are Ks more costly than other outs?</a></span></li><li><span><a href="#What-does-this-regression-look-like-for-1-year?" data-toc-modified-id="What-does-this-regression-look-like-for-1-year?-1.4"><span class="toc-item-num">1.4&nbsp;&nbsp;</span>What does this regression look like for 1 year?</a></span></li><li><span><a href="#How-does-the-relative-value-of-SB-and-CS-compare-to-wSB?" 
data-toc-modified-id="How-does-the-relative-value-of-SB-and-CS-compare-to-wSB?-1.5"><span class="toc-item-num">1.5&nbsp;&nbsp;</span>How does the relative value of SB and CS compare to wSB?</a></span></li><li><span><a href="#Interaction" data-toc-modified-id="Interaction-1.6"><span class="toc-item-num">1.6&nbsp;&nbsp;</span>Interaction</a></span></li></ul></li><li><span><a href="#Four-Factor-Model" data-toc-modified-id="Four-Factor-Model-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Four Factor Model</a></span><ul class="toc-item"><li><span><a href="#Four-Factors-and-Winning-Pct" data-toc-modified-id="Four-Factors-and-Winning-Pct-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Four Factors and Winning Pct</a></span></li><li><span><a href="#Four-Factors-and-the-log-Rating-Ratio" data-toc-modified-id="Four-Factors-and-the-log-Rating-Ratio-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Four Factors and the log Rating Ratio</a></span></li><li><span><a href="#By-Games" data-toc-modified-id="By-Games-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>By Games</a></span></li></ul></li></ul></div>

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import zscore

In [2]:
import statsmodels.formula.api as smf
import statsmodels.api as sm


def multiple_regression(dep_var, ind_vars, data, constant=False, interactions=None):
    """Fit an ordinary least squares model through the statsmodels formula API.

    Parameters
    ----------
    dep_var : str
        Name of the response column in ``data``.
    ind_vars : str or iterable of str
        Predictor column name(s). A single string is treated as one
        predictor; any other iterable (list, tuple, Index, ...) is used as
        the list of predictors.
    data : DataFrame-like
        Passed straight to ``smf.ols`` as ``data``.
    constant : bool, default False
        When True the formula gets ``+ 1`` appended, otherwise ``+ 0``.
    interactions : iterable of (str, str), optional
        Each pair ``(a, b)`` appends an ``a:b`` term to the formula.

    Returns
    -------
    The fitted results object returned by ``smf.ols(formula, data).fit()``.
    """
    # Only a bare string is wrapped; wrapping every non-list would turn a
    # tuple of names into a one-element list and break the join below.
    if isinstance(ind_vars, str):
        ind_vars = [ind_vars]
    terms = list(ind_vars)

    # The intercept marker always follows the main effects, interactions come last.
    terms.append('1' if constant else '0')
    if interactions is not None:
        terms.extend(f'{left}:{right}' for (left, right) in interactions)

    formula = dep_var + '~' + ' + '.join(terms)
    return smf.ols(formula, data=data).fit()


def multiple_regression_2(dep_var, ind_vars, data, constant=False):
    """Fit an ordinary least squares model with the array-based ``sm.OLS`` API.

    Parameters
    ----------
    dep_var : str
        Name of the response column in ``data``.
    ind_vars : str or iterable of str
        Predictor column name(s). A single string is treated as one
        predictor; any other iterable is converted to a list of names.
    data : DataFrame-like
        Indexed with ``dep_var`` and with the list of predictor names.
    constant : bool, default False
        When True, ``sm.add_constant(..., prepend=False)`` is applied to the
        design matrix before fitting.

    Returns
    -------
    The fitted results object returned by ``sm.OLS(y, X).fit()``.
    """
    # Only a bare string is wrapped; a tuple or Index of names must become a
    # list so that ``data[...]`` selects several columns rather than one key.
    if isinstance(ind_vars, str):
        ind_vars = [ind_vars]
    else:
        ind_vars = list(ind_vars)

    y = data[dep_var]
    X = data[ind_vars]
    if constant:
        X = sm.add_constant(X, prepend=False)
    return sm.OLS(y, X).fit()

## Run Values from Regression

### Load Data

In [3]:
# Load the team-season table from lahman_teams.csv. Only a selection of the
# columns is needed; they are picked by position, so this relies on the column
# order of that file.
lahman_main = pd.read_csv("lahman_teams.csv", usecols=[0, 3, 6] + list(range(14, 28)))

# Derived columns: doubles/triples under X-prefixed names, singles, an
# approximate PA, outs, non-strikeout outs and runs above average.
# NOTE(review): the X prefix is presumably there because the formula interface
# used by multiple_regression cannot reference names starting with a digit.
lahman_main['X2B'] = lahman_main['2B']
lahman_main['X3B'] = lahman_main['3B']
lahman_main['X1B'] = lahman_main['H'] - lahman_main['2B'] - lahman_main['3B'] - lahman_main['HR']
del lahman_main['2B'], lahman_main['3B']

# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: the in-place form on a selection is not guaranteed to
# update the frame (it warns in recent pandas and is a no-op under Copy-on-Write).
lahman_main['HBP'] = lahman_main['HBP'].fillna(0)

# SF is not filled, so PA is NaN wherever SF is missing.
lahman_main['PA'] = lahman_main['AB'] + lahman_main['BB'] + lahman_main['HBP'] + lahman_main['SF']
lahman_main['O'] = lahman_main['AB'] - lahman_main['H']
lahman_main['O_nonK'] = lahman_main['O'] - lahman_main['SO']

# The mean is taken over every team-season in the file, i.e. before any
# filtering by year done further down.
lahman_main['RAA'] = lahman_main['R'] - lahman_main['R'].mean()
lahman_main.head()

Unnamed: 0,yearID,franchID,G,R,AB,H,HR,BB,SO,SB,...,SF,RA,ER,X2B,X3B,X1B,PA,O,O_nonK,RAA
0,1871,BNA,31,401,1372,426,3,60,19.0,73.0,...,,303,109,70,37,316,,946,927.0,-281.399295
1,1871,CNA,28,302,1196,323,10,60,22.0,69.0,...,,241,77,52,21,240,,873,851.0,-380.399295
2,1871,CFC,29,249,1186,328,7,26,25.0,18.0,...,,341,116,35,40,246,,858,833.0,-433.399295
3,1871,KEK,19,137,746,178,2,33,9.0,16.0,...,,243,97,19,8,149,,568,559.0,-545.399295
4,1871,NNA,33,302,1404,403,1,33,15.0,46.0,...,,313,121,43,21,338,,1001,986.0,-380.399295


In [4]:
# Keep the seasons from 2000 onward (2000 itself is included).
is_2000_or_later = lahman_main['yearID'] >= 2000
lahman = lahman_main[is_2000_or_later].reset_index(drop=True).copy()
lahman.head()

Unnamed: 0,yearID,franchID,G,R,AB,H,HR,BB,SO,SB,...,SF,RA,ER,X2B,X3B,X1B,PA,O,O_nonK,RAA
0,2000,ANA,162,864,5628,1574,236,608,1024.0,93.0,...,43.0,869,805,309,34,995,6326.0,4054,3030.0,181.600705
1,2000,ARI,162,792,5527,1466,179,535,975.0,97.0,...,58.0,754,698,282,44,961,6179.0,4061,3086.0,109.600705
2,2000,ATL,162,810,5489,1490,179,595,1010.0,148.0,...,45.0,714,648,274,26,1011,6188.0,3999,2989.0,127.600705
3,2000,BAL,162,794,5549,1508,184,558,900.0,126.0,...,54.0,913,855,310,22,992,6210.0,4041,3141.0,111.600705
4,2000,BOS,162,792,5630,1503,167,611,1019.0,43.0,...,48.0,745,683,316,32,988,6331.0,4127,3108.0,109.600705


### First Regression

In [5]:
# Regress runs above average on the event counts; multiple_regression fits
# without an intercept unless constant=True is passed.
dep_var = 'RAA'
ind_vars = [
    'O',
    'X1B', 'X2B', 'X3B', 'HR',
    'BB', 'HBP',
    'SB', 'CS',
]
results = multiple_regression(dep_var, ind_vars, data=lahman)
results.params

O     -0.259851
X1B    0.479376
X2B    0.744706
X3B    1.078258
HR     1.436029
BB     0.288419
HBP    0.335237
SB     0.182543
CS    -0.372263
dtype: float64

### Are Ks more costly than other outs?

In [6]:
# Predictors for the strikeout comparison: outs are split into non-strikeout
# outs (O_nonK) and strikeouts (SO); the other columns match the first regression.
ind_vars_with_K = [
    'O_nonK',
    'X1B', 'X2B', 'X3B', 'HR',
    'BB', 'HBP',
    'SB', 'CS',
    'SO',
]

In [7]:
# Same response and data as the first regression, with strikeouts as their own term.
results_with_K = multiple_regression(
    dep_var,
    ind_vars_with_K,
    data=lahman,
)
results_with_K.params

O_nonK   -0.248914
X1B       0.457801
X2B       0.733130
X3B       1.095368
HR        1.438783
BB        0.288666
HBP       0.335617
SB        0.187726
CS       -0.391668
SO       -0.268120
dtype: float64

### What does this regression look like for 1 year?

In [8]:
# Refit the first regression on the 2016 season only.
is_2016 = lahman['yearID'] == 2016
lahman_2016 = lahman[is_2016].copy()
results_2016 = multiple_regression(dep_var, ind_vars, data=lahman_2016)
results_2016.params

O     -0.251029
X1B    0.388503
X2B    1.132619
X3B    0.816812
HR     1.316626
BB     0.187916
HBP    0.533701
SB     0.135256
CS     0.014569
dtype: float64

### How does the relative value of SB and CS compare to wSB?

In [9]:
# Gap between the fitted SB and CS coefficients of whatever model `results`
# currently holds.
coefs = results.params
coefs['SB'] - coefs['CS']

0.55480575683256306

In [10]:
# Fixed run values used for the wSB comparison.
# NOTE(review): presumably 0.2 runs per steal and a caught-stealing cost of
# 2 * (runs per out, here 0.16) + 0.075 — confirm against the wSB definition.
runSB = .2
runCS_avg = -(0.075 + 0.16 * 2)

# Same SB-minus-CS gap as computed from the regression coefficients.
runSB - runCS_avg

0.595

### Interaction

In [11]:
# Despite the `_z` suffix the predictors are only mean-centred here (they are
# not divided by their standard deviation); R is copied over unchanged.
lahman_z = lahman[ind_vars].apply(lambda column: column - column.mean())
lahman_z['R'] = lahman['R']

In [12]:
# Raw runs on the centred predictors, this time with an intercept term.
results = multiple_regression(
    'R',
    ind_vars,
    data=lahman_z,
    constant=True,
)
results.params

Intercept    738.670588
O             -0.127630
X1B            0.522096
X2B            0.762760
X3B            1.128138
HR             1.460083
BB             0.310557
HBP            0.359109
SB             0.164748
CS            -0.180542
dtype: float64

In [13]:
# Add a single interaction term between singles and doubles (X1B:X2B) to the
# centred model.
singles_by_doubles = ('X1B', 'X2B')
interactions = [singles_by_doubles]
results = multiple_regression(
    'R',
    ind_vars,
    data=lahman_z,
    constant=True,
    interactions=interactions,
)
results.params

Intercept    737.938788
O             -0.132524
X1B            0.520565
X2B            0.760168
X3B            1.135643
HR             1.459150
BB             0.308725
HBP            0.378378
SB             0.165486
CS            -0.200720
X1B:X2B        0.001786
dtype: float64

In [14]:
# Repeat the centred regression on the 1952-1969 seasons.
# NOTE(review): the original comment said "1960s" but the filter starts at
# 1952 — confirm the intended range.
#
# The era frames get their own names so that the 2000+ `lahman` / `lahman_z`
# frames used by the cells above are not overwritten; those cells then give
# the same results when re-run after this one.
in_era = (lahman_main['yearID'] >= 1952) & (lahman_main['yearID'] < 1970)
lahman_60s = lahman_main.loc[in_era].reset_index(drop=True).copy()

# HBP is left out of the predictors for this era (presumably because it is not
# recorded for these seasons and was zero-filled on load — TODO confirm).
ind_vars_60s = ['O', 'X1B', 'X2B', 'X3B', 'HR', 'BB', 'SB', 'CS']

# Mean-centre the predictors; R stays in raw runs.
lahman_60s_z = pd.DataFrame(
    {col: (lahman_60s[col] - lahman_60s[col].mean()) for col in ind_vars_60s}
)
lahman_60s_z['R'] = lahman_60s['R']

results = multiple_regression('R', ind_vars_60s, lahman_60s_z, constant=True)
results.params

Intercept    661.825153
O             -0.088925
X1B            0.530905
X2B            0.684753
X3B            1.013603
HR             1.520621
BB             0.363532
SB             0.259640
CS             0.024825
dtype: float64

## Four Factor Model

In [15]:
def _four_factor_rates(teams, side, other):
    """Return the four raw factor rates for one side of the box score.

    ``side`` and ``other`` are column-name prefixes ('' for the team's own
    columns, 'opp_' for the opponent columns). The rebounding rate divides
    ``side`` offensive rebounds by those plus ``other`` defensive rebounds.
    The returned dict is keyed by the output column names, in insertion order
    efg, to, oreb, ftr (each prefixed with ``side``).
    """
    attempts = teams[side + 'fga']
    turnovers = teams[side + 'tov']
    off_boards = teams[side + 'orb']
    return {
        side + 'efg': (teams[side + 'fg'] + .5 * teams[side + 'fg3']) / attempts,
        side + 'to': turnovers / (turnovers + attempts + .44 * teams[side + 'fta']),
        side + 'oreb': off_boards / (off_boards + teams[other + 'drb']),
        side + 'ftr': teams[side + 'ft'] / attempts,
    }


nba_teams = pd.read_csv('team_season_data.csv')

# Raw rates: first the team's own, then its opponents'.
for side, other in (('', 'opp_'), ('opp_', '')):
    for column, values in _four_factor_rates(nba_teams, side, other).items():
        nba_teams[column] = values

# Model inputs: the team-minus-opponent gap in each rate, passed through zscore.
for factor, rate in (('eFG', 'efg'), ('Tov', 'to'), ('Reb', 'oreb'), ('Ftr', 'ftr')):
    nba_teams[factor] = zscore(nba_teams[rate] - nba_teams['opp_' + rate])

In [16]:
# Response variables: winning percentage (wins over wins + losses) and the
# natural log of the offensive-to-defensive rating ratio.
wins_plus_losses = nba_teams['wins'] + nba_teams['losses']
nba_teams['win_pct'] = nba_teams['wins'] / wins_plus_losses

rating_ratio = nba_teams['off_rtg'] / nba_teams['def_rtg']
nba_teams['rtg_rat'] = rating_ratio
nba_teams['log_rtg_rat'] = np.log(nba_teams['rtg_rat'])

### Four Factors and Winning Pct

In [17]:
# Winning percentage on the four standardised factor gaps (no intercept).
dep_vars = 'win_pct'
ind_vars = ['eFG', 'Tov', 'Reb', 'Ftr']
results = multiple_regression(dep_vars, ind_vars, data=nba_teams)

# Each coefficient as a share of the summed absolute coefficients; signs are kept.
coefs = results.params
coefs / coefs.abs().sum()

eFG    0.471141
Tov   -0.232098
Reb    0.148451
Ftr    0.148310
dtype: float64

### Four Factors and the log Rating Ratio

In [18]:
# Same predictors, with the log rating ratio as the response.
dep_vars = 'log_rtg_rat'
results = multiple_regression(dep_vars, ind_vars, data=nba_teams)

# Each coefficient as a share of the summed absolute coefficients; signs are kept.
coefs = results.params
coefs / coefs.abs().sum()

eFG    0.469041
Tov   -0.239897
Reb    0.156998
Ftr    0.134064
dtype: float64

### By Games

In [19]:
# Load the per-game four-factor table and preview its first five rows.
games = pd.read_csv('four_factor_game_data.csv')
games.head(5)

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_NAME,TEAM_ABBREVIATION,TEAM_CITY,MIN,EFG_PCT,FTA_RATE,TOV_PCT,OREB_PCT,...,DEF_RATING,NET_RATING,OPP_TEAM_ID,OPP_TEAM_NAME,OPP_TEAM_ABBREVIATION,OPP_TEAM_CITY,OPP_OFF_RATING,OPP_DEF_RATING,OPP_NET_RATING,PTS_DIFF
0,21600001,1610612752,Knicks,NYK,New York,240:00,0.42,0.23,0.179,0.245,...,110.0,-22.7,1610612739,Cavaliers,CLE,Cleveland,110.0,87.3,22.7,-29.0
1,21600002,1610612762,Jazz,UTA,Utah,240:00,0.537,0.195,0.144,0.171,...,121.9,-14.8,1610612757,Trail Blazers,POR,Portland,121.9,107.2,14.8,-9.0
2,21600003,1610612759,Spurs,SAS,San Antonio,240:00,0.541,0.265,0.137,0.438,...,99.1,26.8,1610612744,Warriors,GSW,Golden State,99.1,125.9,-26.8,29.0
3,21600004,1610612753,Magic,ORL,Orlando,240:00,0.416,0.315,0.113,0.294,...,108.0,-9.3,1610612748,Heat,MIA,Miami,108.0,98.6,9.3,-12.0
4,21600005,1610612754,Pacers,IND,Indiana,265:00,0.559,0.366,0.138,0.17,...,103.5,8.6,1610612742,Mavericks,DAL,Dallas,103.5,112.1,-8.6,9.0


In [20]:
# For each factor, take the team-minus-opponent gap in the matching game
# column and pass it through zscore. Keys are the new column names, values the
# source columns (the opponent version carries an OPP_ prefix).
factor_sources = {
    'eFG': 'EFG_PCT',
    'Tov': 'TOV_PCT',
    'Reb': 'OREB_PCT',
    'Ftr': 'FTA_RATE',
}
for factor, stat in factor_sources.items():
    games[factor] = zscore(games[stat] - games['OPP_' + stat])

In [21]:
# Natural log of each game's offensive-to-defensive rating ratio.
game_rating_ratio = games['OFF_RATING'] / games['DEF_RATING']
games['log_rtg_rat'] = np.log(game_rating_ratio)

In [22]:
# Game-level version of the log rating ratio regression on the four factors.
dep_vars = 'log_rtg_rat'
results = multiple_regression(dep_vars, ind_vars, data=games)

# Each coefficient as a share of the summed absolute coefficients; signs are kept.
coefs = results.params
coefs / coefs.abs().sum()

eFG    0.496854
Tov   -0.220977
Reb    0.191796
Ftr    0.090373
dtype: float64