<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Run-Values-from-Regression" data-toc-modified-id="Run-Values-from-Regression-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Run Values from Regression</a></span><ul class="toc-item"><li><span><a href="#Load-Data" data-toc-modified-id="Load-Data-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Load Data</a></span></li><li><span><a href="#First-Regression" data-toc-modified-id="First-Regression-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>First Regression</a></span></li><li><span><a href="#Are-Ks-more-costly-than-other-outs?" data-toc-modified-id="Are-Ks-more-costly-than-other-outs?-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Are Ks more costly than other outs?</a></span></li><li><span><a href="#What-does-this-regression-look-like-for-1-year?" data-toc-modified-id="What-does-this-regression-look-like-for-1-year?-1.4"><span class="toc-item-num">1.4&nbsp;&nbsp;</span>What does this regression look like for 1 year?</a></span></li><li><span><a href="#How-does-the-relative-value-of-SB-and-CS-compare-to-wSB?" data-toc-modified-id="How-does-the-relative-value-of-SB-and-CS-compare-to-wSB?-1.5"><span class="toc-item-num">1.5&nbsp;&nbsp;</span>How does the relative value of SB and CS compare to wSB?</a></span></li></ul></li><li><span><a href="#Four-Factor-Model" data-toc-modified-id="Four-Factor-Model-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Four Factor Model</a></span><ul class="toc-item"><li><span><a href="#Four-Factors-and-Winning-Pct" data-toc-modified-id="Four-Factors-and-Winning-Pct-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Four Factors and Winning Pct</a></span></li><li><span><a href="#Four-Factors-and-the-log-Rating-Ratio" data-toc-modified-id="Four-Factors-and-the-log-Rating-Ratio-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Four Factors and the log Rating Ratio</a></span></li></ul></li></ul></div>

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import zscore

In [2]:
import statsmodels.formula.api as smf
import statsmodels.api as sm


def multiple_regression(dep_var, ind_vars, data, constant=False, interactions=None):
    """Fit an ordinary-least-squares model from a formula built out of column names.

    Parameters
    ----------
    dep_var : str
        Name of the response variable (left-hand side of the formula).
    ind_vars : str or iterable of str
        Predictor name(s).  A single name may be passed as a bare string.
    data : object
        Passed unchanged to ``smf.ols`` as ``data``.
    constant : bool, default False
        When True the formula contains ``+ 1`` (fit an intercept); otherwise it
        contains ``+ 0`` (regression through the origin).
    interactions : iterable of (str, str), optional
        Pairs of names; each pair adds an ``a:b`` interaction term.

    Returns
    -------
    The object returned by ``smf.ols(formula, data=data).fit()``.
    """
    # A bare string is one predictor; anything else is treated as a sequence of names.
    if isinstance(ind_vars, str):
        ind_vars = [ind_vars]
    terms = list(ind_vars)

    # '1' / '0' switch the intercept on / off in the formula.
    terms.append('1' if constant else '0')

    # Interaction terms must be joined with ' + ' like every other term; appending
    # them with only a space yields an unparsable formula.
    if interactions is not None:
        terms.extend(f'{first}:{second}' for first, second in interactions)

    formula = dep_var + '~' + ' + '.join(terms)
    return smf.ols(formula, data=data).fit()


def multiple_regression_2(dep_var, ind_vars, data, constant=False):
    """Fit OLS through the array-style ``sm.OLS`` interface instead of a formula.

    ``dep_var`` selects the response from ``data``; ``ind_vars`` (one name or a
    list of names) selects the predictors.  With ``constant=True`` a constant
    column is added via ``sm.add_constant(..., prepend=False)`` before fitting.
    Returns the object produced by ``sm.OLS(...).fit()``.
    """
    predictors = ind_vars if isinstance(ind_vars, list) else [ind_vars]

    response = data[dep_var]
    design = data[predictors]
    if constant:
        design = sm.add_constant(design, prepend=False)

    return sm.OLS(response, design).fit()

## Run Values from Regression

### Load Data

In [3]:
# Load lahman_teams.csv obtained from the Lahman databank.  We only need a selection of the columns.
lahman = pd.read_csv("lahman_teams.csv", usecols=[0, 3, 6] + list(range(14, 28)))

# Restrict to seasons from 2000 onward (the filter includes 2000 itself).
lahman = lahman.loc[lahman['yearID'] >= 2000].copy()

# Copy doubles/triples to names that start with a letter, then drop the originals.
# NOTE(review): presumably needed because the regression helper builds a formula
# string and "2B"/"3B" would not parse as variable names there -- confirm.
lahman['X2B'] = lahman['2B']
lahman['X3B'] = lahman['3B']

# Singles are hits that were not doubles, triples or home runs.
lahman['X1B'] = lahman['H'] - lahman['2B'] - lahman['3B'] - lahman['HR']
del lahman['2B'], lahman['3B']

# Treat missing hit-by-pitch counts as zero.  Assign the result back rather than
# calling fillna(inplace=True) on the column selection: with pandas Copy-on-Write
# the in-place form acts on a temporary and leaves the frame unchanged.
lahman['HBP'] = lahman['HBP'].fillna(0)

# Plate appearances (only approximate): AB + BB + HBP + SF.
lahman['PA'] = lahman['AB'] + lahman['BB'] + lahman['HBP'] + lahman['SF']

# Outs are approximated as at-bats that did not end in a hit; O_nonK removes strikeouts.
lahman['O'] = lahman['AB'] - lahman['H']
lahman['O_nonK'] = lahman['O'] - lahman['SO']

# Runs above average, relative to the mean over every team-season kept above.
lahman['RAA'] = lahman['R'] - lahman['R'].mean()
lahman.head()

Unnamed: 0,yearID,franchID,G,R,AB,H,HR,BB,SO,SB,...,SF,RA,ER,X2B,X3B,X1B,PA,O,O_nonK,RAA
2325,2000,ANA,162,864,5628,1574,236,608,1024.0,93.0,...,43.0,869,805,309,34,995,6326.0,4054,3030.0,125.329412
2326,2000,ARI,162,792,5527,1466,179,535,975.0,97.0,...,58.0,754,698,282,44,961,6179.0,4061,3086.0,53.329412
2327,2000,ATL,162,810,5489,1490,179,595,1010.0,148.0,...,45.0,714,648,274,26,1011,6188.0,3999,2989.0,71.329412
2328,2000,BAL,162,794,5549,1508,184,558,900.0,126.0,...,54.0,913,855,310,22,992,6210.0,4041,3141.0,55.329412
2329,2000,BOS,162,792,5630,1503,167,611,1019.0,43.0,...,48.0,745,683,316,32,988,6331.0,4127,3108.0,53.329412


### First Regression

In [4]:
# Regress runs above average on team offensive event totals.  multiple_regression
# defaults to constant=False, so this fit has no intercept.
dep_vars = 'RAA'
ind_vars = ['O', 'X1B', 'X2B', 'X3B', 'HR', 'BB', 'HBP', 'SB', 'CS']
results = multiple_regression(dep_vars, ind_vars, lahman)
# Fitted coefficient per event.
results.params

O     -0.272014
X1B    0.475447
X2B    0.743046
X3B    1.073670
HR     1.433816
BB     0.286383
HBP    0.333042
SB     0.184180
CS    -0.389898
dtype: float64

### Are Ks more costly than other outs?

In [5]:
# Same predictors as the first regression, but outs are split into non-strikeout
# outs (O_nonK) and strikeouts (SO) so each gets its own coefficient.
ind_vars_with_K = ['O_nonK', 'X1B', 'X2B', 'X3B', 'HR', 'BB', 'HBP', 'SB', 'CS', 'SO']

In [6]:
# Refit with strikeouts separated from other outs; compare the O_nonK and SO coefficients.
results_with_K = multiple_regression(dep_vars, ind_vars_with_K, lahman)
results_with_K.params

O_nonK   -0.261232
X1B       0.454179
X2B       0.731634
X3B       1.090536
HR        1.436531
BB        0.286626
HBP       0.333416
SB        0.189289
CS       -0.409027
SO       -0.280164
dtype: float64

### What does this regression look like for 1 year?

In [7]:
# Repeat the first regression on the 2016 season only.
# Note: RAA was computed before this filter, so it is still measured against the
# mean over all retained seasons, not the 2016 mean.
lahman_2016 = lahman.loc[lahman['yearID'] == 2016].copy()
results_2016 = multiple_regression(dep_vars, ind_vars, lahman_2016)
results_2016.params

O     -0.262792
X1B    0.381496
X2B    1.132650
X3B    0.843668
HR     1.313257
BB     0.186106
HBP    0.536711
SB     0.134893
CS    -0.011809
dtype: float64

### How does the relative value of SB and CS compare to wSB?

In [8]:
# Gap between the regression's stolen-base and caught-stealing coefficients.
# Relies on `results` still holding the first (all-seasons) regression.
results.params['SB'] - results.params['CS']

0.57407748982412499

In [9]:
# Hard-coded run values for comparison with the regression gap above.
# NOTE(review): presumably the wSB run values -- 0.2 runs per SB, and CS costing
# 2 * (runs per out) + 0.075 with 0.16 as the runs-per-out figure.  Confirm the source.
runSB = .2
runCS_avg = -(2 * 0.16 + 0.075)
# Same SB-minus-CS gap as computed from the regression coefficients.
runSB - runCS_avg

0.595

## Four Factor Model

In [10]:
def _four_factors(teams, side, other):
    """Build the four raw factor columns for one side of the box score.

    ``side`` is the column prefix of the side being rated ('' or 'opp_') and
    ``other`` is the prefix of the side it is playing against.  Returns a dict
    mapping new column names to their values.
    """
    attempts = teams[side + 'fga']
    turnovers = teams[side + 'tov']
    off_boards = teams[side + 'orb']
    return {
        # Field goals with three-pointers weighted 1.5x, per attempt.
        side + 'efg': (teams[side + 'fg'] + .5 * teams[side + 'fg3']) / attempts,
        # Turnovers as a share of tov + fga + .44 * fta.
        side + 'to': turnovers / (turnovers + attempts + .44 * teams[side + 'fta']),
        # Offensive rebounds as a share of orb + the other side's defensive rebounds.
        side + 'oreb': off_boards / (off_boards + teams[other + 'drb']),
        # Free throws made per field-goal attempt.
        side + 'ftr': teams[side + 'ft'] / attempts,
    }


nba_teams = pd.read_csv('team_season_data.csv')

# Raw factors for the team, then the mirrored set for its opponents.
nba_teams = nba_teams.assign(
    **_four_factors(nba_teams, '', 'opp_'),
    **_four_factors(nba_teams, 'opp_', ''),
)

# Standardised team-minus-opponent differentials, one column per factor.
nba_teams = nba_teams.assign(
    eFG=lambda t: zscore(t['efg'] - t['opp_efg']),
    Tov=lambda t: zscore(t['to'] - t['opp_to']),
    Reb=lambda t: zscore(t['oreb'] - t['opp_oreb']),
    Ftr=lambda t: zscore(t['ftr'] - t['opp_ftr']),
)

In [11]:
# Response variables for the four-factor regressions: winning percentage, the
# offensive/defensive rating ratio, and the natural log of that ratio.
nba_teams = nba_teams.assign(
    win_pct=lambda t: t['wins'] / (t['wins'] + t['losses']),
    rtg_rat=lambda t: t['off_rtg'] / t['def_rtg'],
    log_rtg_rat=lambda t: np.log(t['rtg_rat']),
)

### Four Factors and Winning Pct

In [12]:
# Regress winning percentage on the four standardised differentials.
# Note: this rebinds dep_vars, ind_vars and results, replacing the values used by
# the baseball cells earlier in the notebook.
dep_vars = 'win_pct'
ind_vars = ['eFG', 'Tov', 'Reb', 'Ftr']
results = multiple_regression(dep_vars, ind_vars, nba_teams)
# Scale the coefficients so their absolute values sum to 1 (signs are kept).
results.params / results.params.abs().sum()

eFG    0.471141
Tov   -0.232098
Reb    0.148451
Ftr    0.148310
dtype: float64

### Four Factors and the log Rating Ratio

In [13]:
# Same regression with the log rating ratio as the response.  Reuses ind_vars as
# set by the winning-percentage cell, so that cell must have been run first.
dep_vars = 'log_rtg_rat'
results = multiple_regression(dep_vars, ind_vars, nba_teams)
# Scale the coefficients so their absolute values sum to 1 (signs are kept).
results.params / results.params.abs().sum()

eFG    0.469041
Tov   -0.239897
Reb    0.156998
Ftr    0.134064
dtype: float64