In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.stats.diagnostic import het_breuschpagan
from statsmodels.stats.stattools import durbin_watson
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the 2021 team batting, pitching and fielding tables, tag every column with
# its source table, and join them into one row per team.
# NOTE(review): absolute local path kept as-is; it is now defined once so it can be
# repointed (ideally to a repo-relative directory) in a single place.
DATA_DIR = r"D:\GitHub\Econometrics_Term_Paper\Yearly_Data"
# Only the first N_TEAMS merged rows are kept.
# NOTE(review): presumably the remaining rows are league summary rows - confirm against the CSVs.
N_TEAMS = 30

batting = pd.read_csv(rf"{DATA_DIR}\2021batting.csv").add_suffix("_bat")
pitching = pd.read_csv(rf"{DATA_DIR}\2021pitching.csv").add_suffix("_pitch")
fielding = pd.read_csv(rf"{DATA_DIR}\2021fielding.csv").add_suffix("_field")

data = (
    batting
    .merge(pitching, left_on="Tm_bat", right_on="Tm_pitch")
    .merge(fielding, left_on="Tm_pitch", right_on="Tm_field")
    # The team name is already carried by "Tm_bat"; the other two copies are redundant.
    .drop(columns=["Tm_pitch", "Tm_field"])
    .iloc[:N_TEAMS]
    # Explicit copy: later cells add columns to `data`, which would otherwise raise
    # SettingWithCopyWarning because `.iloc` returns a slice of the merged frame.
    .copy()
)

# Wide table of season win totals; reshaped and filtered to 2021 in a later cell.
win_totals = pd.read_csv(rf"{DATA_DIR}\win_totals.csv")

In [3]:
# Team abbreviation -> full franchise name. A list holds several names for the same
# abbreviation (e.g. a renamed franchise), so any of them resolves to that abbreviation
# when this mapping is inverted.
initials_dict = {
    "ARI": "Arizona Diamondbacks",
    "ATL": "Atlanta Braves",
    "BAL": "Baltimore Orioles",
    "BOS": "Boston Red Sox",
    "CHC": "Chicago Cubs",
    "CHW": "Chicago White Sox",
    "CIN": "Cincinnati Reds",
    "CLE": ["Cleveland Guardians", "Cleveland Indians"],
    "COL": "Colorado Rockies",
    "DET": "Detroit Tigers",
    "HOU": "Houston Astros",
    "KCR": "Kansas City Royals",
    # Spelling fixed ("Aneheim" -> "Anaheim") so the historical name can actually match.
    "LAA": ["Los Angeles Angels", "Los Angeles Angels of Anaheim"],
    "LAD": "Los Angeles Dodgers",
    "MIA": "Miami Marlins",
    "MIL": "Milwaukee Brewers",
    "MIN": "Minnesota Twins",
    "NYM": "New York Mets",
    "NYY": "New York Yankees",
    "OAK": "Oakland Athletics",
    "PHI": "Philadelphia Phillies",
    "PIT": "Pittsburgh Pirates",
    "SDP": "San Diego Padres",
    "SFG": "San Francisco Giants",
    "SEA": "Seattle Mariners",
    "STL": "St. Louis Cardinals",
    "TBR": "Tampa Bay Rays",
    "TEX": "Texas Rangers",
    "TOR": "Toronto Blue Jays",
    "WSN": "Washington Nationals",
}

In [4]:
# Reshape the win table from wide to long: every column other than "Year" and "G"
# becomes a ("Team", "Wins") pair. Rows with missing values are discarded, the "G"
# column is removed, and only the 2021 season is kept, renumbered from 0.
win_totals = (
    win_totals
    .melt(id_vars=["Year", "G"], var_name="Team", value_name="Wins")
    .dropna()
    .drop(columns="G")
    .loc[lambda long_form: long_form["Year"] == 2021]
    .reset_index(drop=True)
)

In [5]:
# Attach 2021 win totals to the merged team stats, index the frame by a readable
# "Name (ABBR)" label, add an extra-base-hits column, and compute the correlation matrix.

# Invert initials_dict: full team name -> abbreviation (list values are flattened).
invert_dict = {
    team: abbr
    for abbr, teams in initials_dict.items()
    for team in (teams if isinstance(teams, list) else [teams])
}
data["Team Abbreviation"] = data["Tm_bat"].map(invert_dict)

# Surface names that failed to map instead of letting dropna() remove them silently.
unmapped_teams = data.loc[data["Team Abbreviation"].isna(), "Tm_bat"].tolist()
if unmapped_teams:
    print(f"Warning: no abbreviation found for {unmapped_teams}; these rows get no win total and are removed by dropna().")

# Left join keeps every team row; win_totals["Team"] holds the abbreviation.
data = data.merge(win_totals, left_on="Team Abbreviation", right_on="Team", how="left", suffixes=("", "_wintotals"))
data.drop(["Team Abbreviation", "Year"], axis=1, inplace=True)
data.rename(columns={"Team": "Abbreviation"}, inplace=True)

# Build the display label, then drop the two columns it was built from.
data["Team"] = data["Tm_bat"] + " " + "(" + data["Abbreviation"] + ")"
data.drop(["Tm_bat", "Abbreviation"], axis=1, inplace=True)
data.set_index("Team", inplace=True)

# Dependent variable first, followed by the batting/pitching/fielding columns.
data.insert(0, "Wins", data.pop("Wins"))

# Drops every team that has a missing value in any column.
data.dropna(inplace=True)

# Extra-base hits = doubles + triples + home runs, placed right after HR_bat
# (replaces the hard-coded column position 12).
xbh = data["2B_bat"] + data["3B_bat"] + data["HR_bat"]
data.insert(data.columns.get_loc("HR_bat") + 1, "XBH_bat", xbh)

# numeric_only=True keeps this working on pandas >= 2.0 even if a text column slips in.
corr_matrix = data.corr(numeric_only=True)

In [6]:
# Variables noted for consideration: WHIP, pitcher strikeouts, strikeout-to-walk ratio,
# fielding percentage, defensive efficiency, home runs or XBH, on-base percentage,
# maybe stolen bases, hitter strikeouts (to measure over time), maybe LOB, OPS+.

# Dependent variable and the two regressors used in this specification.
Y = data["Wins"]
X = data[["OPS_bat", "WHIP_pitch"]]

# Z-score both sides (pandas .std() is the sample standard deviation), then add the
# intercept column to the standardized regressors.
Y_Standardized = Y.sub(Y.mean()).div(Y.std())
X_Standardized = sm.add_constant(X.sub(X.mean()).div(X.std()))

# Unstandardized alternative: model = sm.OLS(Y, sm.add_constant(X)).fit()
model = sm.OLS(Y_Standardized, X_Standardized).fit()
print(model.summary())

# Two-sided critical t value at significance level alpha, using n - k - 1 degrees
# of freedom (n observations, k regressors excluding the constant).
alpha = 0.05
n, k = X.shape
df = n - k - 1
critical_t = stats.t.ppf(1 - alpha / 2, df)
print(f"Critical t-stat: {critical_t}")

                            OLS Regression Results                            
Dep. Variable:                   Wins   R-squared:                       0.805
Model:                            OLS   Adj. R-squared:                  0.791
Method:                 Least Squares   F-statistic:                     55.76
Date:                Thu, 05 Dec 2024   Prob (F-statistic):           2.59e-10
Time:                        00:45:32   Log-Likelihood:                -17.531
No. Observations:                  30   AIC:                             41.06
Df Residuals:                      27   BIC:                             45.27
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -1.683e-16      0.084  -2.01e-15      1.0

In [7]:
# Post-estimation diagnostics for `model`.

# Heteroskedasticity: Breusch-Pagan test on the residuals against the model's design
# matrix. The second element of the returned tuple (the LM-test p-value) is reported.
_, pval, _, _ = het_breuschpagan(model.resid, model.model.exog)
print(f"Breusch-Pagan P-value: {pval}")

# Autocorrelation: Durbin-Watson statistic of the residuals.
# NOTE(review): rows are teams from a single season, so this statistic depends on row
# order - confirm it is meaningful for this data.
dw_stat = durbin_watson(model.resid)
print(f"Durbin-Watson Statistic: {dw_stat}")

# Multicollinearity: one variance inflation factor per design-matrix column
# (the constant is included in the table).
vif = pd.DataFrame(
    {
        "Variable": X_Standardized.columns,
        "VIF": [
            variance_inflation_factor(X_Standardized.values, i)
            for i in range(X_Standardized.shape[1])
        ],
    }
)
print(vif)

Bresuch-Pagan P-value: 0.286348719260159
Durbin-Watson Statistic: 1.5211996671446708
     Variable      VIF
0       const  1.00000
1     OPS_bat  1.21982
2  WHIP_pitch  1.21982
