In [1]:
# Single-game HR records

# With HR rates at all-time highs since 2015, and so many HR records falling routinely, why have we not seen
# more single-game HR records set (by individual teams or across the league/MLB)?

# The implied question:
# * Is there some reason that single-game highs would not increase along with the recent increase of HR rates?



In [2]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go # for dual-Y-axis plots
from plotly.subplots import make_subplots

import boxball_loader as bbl

In [3]:
# Load the team-level game logs (one row per team per game, judging by the columns listed below).
# NOTE(review): GameType.RS presumably means regular season and Eras.Integration the 1947-onward
# era (1947 is the earliest season in the tables further down) -- confirm against boxball_loader.
glt = bbl.load_gamelog_teams(bbl.GameType.RS, bbl.Eras.Integration)
# Display the available column names; 'yr', 'team' and 'hr' are the ones used in this analysis.
glt.columns

Index(['game_id', 'date', 'double_header', 'yr', 'game_type', 'park_id',
       'team', 'team_league', 'team_game_number', 'runs_scored', 'line_score',
       'ab', 'h', 'd', 't', 'hr', 'rbi', 'sh', 'sf', 'hbp', 'bb', 'ibb', 'k',
       'sb', 'cs', 'gdp', 'ci', 'lob', 'pitchers', 'er', 'ter', 'wp', 'balks',
       'po', 'a', 'e', 'passed', 'db', 'tp', 'manager_id', 'manager_name',
       'starting_pitcher_id', 'starting_pitcher_name', 'batting_1_player_id',
       'batting_1_name', 'batting_1_position', 'batting_2_player_id',
       'batting_2_name', 'batting_2_position', 'batting_3_player_id',
       'batting_3_name', 'batting_3_position', 'batting_4_player_id',
       'batting_4_name', 'batting_4_position', 'batting_5_player_id',
       'batting_5_name', 'batting_5_position', 'batting_6_player_id',
       'batting_6_name', 'batting_6_position', 'batting_7_player_id',
       'batting_7_name', 'batting_7_position', 'batting_8_player_id',
       'batting_8_name', 'batting_8_position', '

In [4]:
# Per team-season summary of single-game HR totals:
#   max  -- the most HR the team hit in any one game that season
#   mean -- the team's HR per game
#   g    -- number of game-log rows (games) for that team-season
# Aggregators are given as string names rather than the callables max / np.mean / len:
# pandas >= 2.1 emits a FutureWarning when builtins or NumPy functions are passed to agg.
# 'size' (not 'count') matches len: it counts every row, including any with a missing 'hr'.
teams = glt.groupby(['yr', 'team'])['hr'].agg(max='max', mean='mean', g='size')
# Fixed seed so the preview shows the same rows on every Restart & Run All.
teams.sample(10, random_state=0)

Unnamed: 0_level_0,Unnamed: 1_level_0,max,mean,g
yr,team,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017,NYN,7.0,1.382716,162
1948,BRO,3.0,0.587097,155
2015,TOR,5.0,1.432099,162
1974,MIL,3.0,0.740741,162
2018,ANA,5.0,1.320988,162
1951,CIN,3.0,0.567742,155
1970,NYA,4.0,0.680982,163
1979,MIN,5.0,0.691358,162
2009,CHA,6.0,1.135802,162
1951,BRO,5.0,1.164557,158


In [5]:
# Roll the team-seasons up to one row per season by averaging across teams, then
# keep only seasons whose average team played more than MIN_SEASON_SIZE games
# (i.e. discard partial seasons).
MIN_SEASON_SIZE = 150
col_mapper = {'max': 'mean_high', 'mean': 'lg_hr_rate', 'g': 'mean_g'}
years = (
    teams.groupby('yr')
    .mean()
    .rename(columns=col_mapper)
    .loc[lambda season: season['mean_g'] > MIN_SEASON_SIZE]
)
years

Unnamed: 0_level_0,mean_high,lg_hr_rate,mean_g
yr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1947,4.125000,0.629114,155.375000
1948,3.812500,0.628105,154.625000
1949,4.375000,0.686785,155.000000
1950,4.812500,0.837188,154.750000
1951,4.062500,0.750769,154.875000
...,...,...,...
2015,4.800000,1.010455,161.933333
2016,5.233333,1.155147,161.866667
2017,5.366667,1.256173,162.000000
2018,5.033333,1.148493,162.066667


In [6]:
# League HR rate per season: the mean, across teams, of each team's HR per game.
# Title and axis labels are set so the figure can be read on its own.
px.scatter(
    years,
    y='lg_hr_rate',
    title='HR per team-game by season',
    labels={'yr': 'Season', 'lg_hr_rate': 'HR per game (mean of team rates)'},
)

In [7]:
# Average, across teams, of each team's highest single-game HR total, per season.
# Title and axis labels are set so the figure can be read on its own.
px.scatter(
    years,
    y='mean_high',
    title='Average team single-game HR high by season',
    labels={'yr': 'Season', 'mean_high': "Mean of teams' single-game HR highs"},
)

In [8]:
def make_dual_axis_plot(df, ys):
    """Plot columns of ``df`` against its index on one x-axis with two y-axes.

    The first column named in ``ys`` is drawn on the primary (left) y-axis;
    every later column is drawn on the secondary (right) y-axis. Each y-axis
    is titled with the column name(s) it carries so the two scales can be
    told apart.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; its index supplies the x values.
    ys : iterable of column labels
        Columns of ``df`` to plot, in order.

    Returns
    -------
    plotly.graph_objects.Figure
        The figure containing one Scatter trace per requested column.
    """
    columns = list(ys)

    # A single subplot whose spec enables a secondary y-axis.
    fig = make_subplots(specs=[[{"secondary_y": True}]])

    for position, column in enumerate(columns):
        series = df[column]
        # secondary_y is documented as a bool; pass one explicitly rather than
        # relying on the truthiness of the loop index.
        fig.add_trace(
            go.Scatter(x=df.index, y=series, name=series.name),
            secondary_y=position > 0,
        )

    # Label the axes so the reader knows which scale belongs to which series.
    if df.index.name is not None:
        fig.update_xaxes(title_text=str(df.index.name))
    if columns:
        fig.update_yaxes(title_text=str(columns[0]), secondary_y=False)
    if len(columns) > 1:
        fig.update_yaxes(
            title_text=", ".join(str(column) for column in columns[1:]),
            secondary_y=True,
        )

    return fig

make_dual_axis_plot(years, ['lg_hr_rate', 'mean_high'])

In [9]:
# Season level: does the average single-game high track the league HR rate?
# Title and axis labels are set so the figure can be read on its own.
px.scatter(
    years,
    x='lg_hr_rate',
    y='mean_high',
    title='Seasons: average single-game HR high vs. league HR rate',
    labels={
        'lg_hr_rate': 'HR per game (mean of team rates)',
        'mean_high': "Mean of teams' single-game HR highs",
    },
)

In [10]:
# Team-season level: each point is one team in one season.
# Title and axis labels are set so the figure can be read on its own.
px.scatter(
    teams,
    x='mean',
    y='max',
    title='Team-seasons: single-game HR high vs. HR per game',
    labels={'mean': 'Team HR per game', 'max': 'Team single-game HR high'},
)

In [11]:
# Split the team-seasons into 40 quantile bins by HR per game, then summarise each bin:
#   bin_size    -- number of team-seasons in the bin
#   bin_hr_rate -- mean HR per game of the team-seasons in the bin
#   avg_high    -- mean single-game HR high of the team-seasons in the bin
bins = pd.qcut(teams['mean'], q=40)
# Aggregators are string names rather than the callables len / np.mean, which trigger a
# FutureWarning for NumPy functions in pandas >= 2.1; 'size' counts rows just as len did.
# observed=True is explicit because `bins` is categorical and relying on the default is
# deprecated; quantile bins are never empty, so the result is unchanged.
binned_teams = teams.groupby(bins, observed=True).agg(
    bin_size=('max', 'size'),
    bin_hr_rate=('mean', 'mean'),
    avg_high=('max', 'mean'),
)
binned_teams

Unnamed: 0_level_0,bin_size,bin_hr_rate,avg_high
mean,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"(0.2, 0.452]",46,0.387835,3.086957
"(0.452, 0.515]",45,0.48697,3.266667
"(0.515, 0.568]",46,0.542291,3.695652
"(0.568, 0.592]",45,0.581021,3.577778
"(0.592, 0.617]",47,0.60229,3.446809
"(0.617, 0.648]",47,0.631872,3.914894
"(0.648, 0.667]",52,0.659582,3.942308
"(0.667, 0.683]",36,0.676267,4.027778
"(0.683, 0.704]",53,0.695089,3.924528
"(0.704, 0.726]",38,0.71625,4.105263


In [12]:
# One point per HR/G bin; hovering a point also shows how many team-seasons it holds.
px.scatter(
    data_frame=binned_teams,
    x='bin_hr_rate',
    y='avg_high',
    hover_data=['bin_size'],
    title="Team-seasons binned by HR/G rate",
)

In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [14]:
# Fit a one-feature linear model: a team-season's single-game HR high ('max')
# as a function of its HR per game ('mean'). Only team-seasons with at least
# MIN_SEASON_SIZE games are used.
qualified_tms = teams[teams['g'] >= MIN_SEASON_SIZE]

X = qualified_tms.loc[:, ['mean']]      # 2-D feature frame, as sklearn expects
y = qualified_tms['max'].astype(int)    # target: single-game high as an integer

lr = LinearRegression().fit(X, y)       # fit() returns the fitted estimator

# In-sample predictions, indexed like the inputs so they line up with y.
y_pred = pd.Series(lr.predict(X), index=X.index, name='pred')
y_pred


yr    team
1947  BOS     3.880311
      BRO     3.553362
      BSN     3.598009
      CHA     3.028506
      CHN     3.343420
                ...   
2019  SLN     5.616502
      TBA     5.733677
      TEX     5.834112
      TOR     6.235854
      WAS     5.968026
Name: pred, Length: 1708, dtype: float64

In [15]:
# Put actual and predicted highs side by side and derive the residuals,
# so the model's "errors" can be tracked over time.
ys = (
    pd.concat([y, y_pred], axis=1)
    .assign(
        err=lambda frame: frame['max'] - frame['pred'],   # signed: actual minus predicted
        abs_err=lambda frame: frame['err'].abs(),         # magnitude only
    )
)
ys

Unnamed: 0_level_0,Unnamed: 1_level_0,max,pred,err,abs_err
yr,team,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1947,BOS,4,3.880311,0.119689,0.119689
1947,BRO,3,3.553362,-0.553362,0.553362
1947,BSN,4,3.598009,0.401991,0.401991
1947,CHA,3,3.028506,-0.028506,0.028506
1947,CHN,4,3.343420,0.656580,0.656580
...,...,...,...,...,...
2019,SLN,5,5.616502,-0.616502,0.616502
2019,TBA,6,5.733677,0.266323,0.266323
2019,TEX,5,5.834112,-0.834112,0.834112
2019,TOR,6,6.235854,-0.235854,0.235854


In [16]:
# Mean error by year.
# This measures bias -- are the predictions off in one direction or the other?
# (err is actual minus predicted, so positive values mean the model under-predicts.)
# Title and axis labels are set so the figure can be read on its own.
px.scatter(
    ys.groupby('yr')['err'].mean(),
    title='Mean prediction error by season (actual - predicted single-game high)',
    labels={'yr': 'Season', 'value': 'Mean error (HR)'},
)

In [17]:
# Mean absolute error by year.
# This measures the size of the error -- how far off are the predictions, regardless of direction?
# Title and axis labels are set so the figure can be read on its own.
px.scatter(
    ys.groupby('yr')['abs_err'].mean(),
    title='Mean absolute prediction error by season',
    labels={'yr': 'Season', 'value': 'Mean absolute error (HR)'},
)