In [1]:
from dataclasses import dataclass, field
from concurrent.futures import ProcessPoolExecutor
import pandas as pd
import numpy as np
import json
import datetime
import glob
import requests
from bs4 import BeautifulSoup
import re

In [18]:
# Raw game data (gzip-compressed CSV).
all_games = pd.read_csv("./adv_metrics/raw_game_data.csv.gz", compression = "gzip")

# The weights file is only read here, so open it read-only; "r+" would
# needlessly require write permission on the file.
with open("./adv_metrics/wOBA_weights.json", "r") as f:
    wOBA_weights = json.load(f)

# Stadium table; the first CSV column becomes the index.
all_stadiums = pd.read_csv("./data/all_stadiums_w_park_ids.csv", index_col = 0)

# Prepare all games dataframe

In [19]:
# Fill gaps with 0, parse the date column, derive the season (calendar year)
# from it, then put the games in chronological order with a fresh index.
all_games = (
    all_games
    .fillna(0.)
    .assign(date = lambda games: pd.to_datetime(games["date"], format = r"%Y-%m-%d"))
    .assign(season = lambda games: pd.DatetimeIndex(games["date"]).year)
    .sort_values(by = ["date"])
    .reset_index(drop = True)
)

In [20]:
# Preview the first rows of the prepared game table.
all_games.head()

Unnamed: 0,date,home_team,road_team,is_doubleheader,is_tripleheader,home_PA,home_AB,home_H,home_1B,home_2B,...,home_starter_ER,home_relief_IP,home_relief_H,home_relief_HR,home_relief_BB,home_relief_IBB,home_relief_HBP,home_relief_K,home_relief_ER,season
0,1918-04-15,WS1,NYA,0.0,0.0,38,31,6,6,0,...,4,0.0,0,0,0,0,0,0,0,1918
1,1918-04-15,BOS,PHA,0.0,0.0,38,31,9,7,2,...,1,0.0,0,0,0,0,0,0,0,1918
2,1918-04-16,PHI,BSN,0.0,0.0,35,33,10,8,1,...,1,0.0,0,0,0,0,0,0,0,1918
3,1918-04-16,CIN,PIT,0.0,0.0,27,25,3,3,0,...,0,0.0,0,0,0,0,0,0,0,1918
4,1918-04-16,SLN,CHN,0.0,0.0,31,27,9,4,2,...,0,0.0,0,0,0,0,0,0,0,1918


# PREPARE wOBA weights

In [21]:
# Turn the weights loaded from JSON into a DataFrame.
wOBA = pd.DataFrame(wOBA_weights)

# Inspect the dtypes: the output below shows every column arriving as `object`.
wOBA.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149 entries, 0 to 148
Data columns (total 14 columns):
Season       149 non-null object
wOBA         149 non-null object
wOBAScale    149 non-null object
wBB          149 non-null object
wHBP         149 non-null object
w1B          149 non-null object
w2B          149 non-null object
w3B          149 non-null object
wHR          149 non-null object
runSB        149 non-null object
runCS        149 non-null object
R/PA         149 non-null object
R/W          149 non-null object
cFIP         149 non-null object
dtypes: object(14)
memory usage: 16.4+ KB


In [22]:
# Preview the first rows of the weights table.
wOBA.head()

Unnamed: 0,Season,wOBA,wOBAScale,wBB,wHBP,w1B,w2B,w3B,wHR,runSB,runCS,R/PA,R/W,cFIP
0,2019,0.32,1.157,0.69,0.719,0.87,1.217,1.529,1.94,0.2,-0.435,0.126,10.296,3.214
1,2017,0.321,1.185,0.693,0.723,0.877,1.232,1.552,1.98,0.2,-0.423,0.122,10.048,3.158
2,2015,0.313,1.251,0.687,0.718,0.881,1.256,1.594,2.065,0.2,-0.392,0.113,9.421,3.134
3,2013,0.314,1.277,0.69,0.722,0.888,1.271,1.616,2.101,0.2,-0.384,0.11,9.264,3.048
4,2011,0.316,1.264,0.694,0.726,0.89,1.27,1.611,2.086,0.2,-0.394,0.112,9.454,3.025


In [23]:
# Every column except Season holds a numeric weight / constant.
change_cols = ["wOBA", "wOBAScale", "wBB", "wHBP", "w1B", "w2B", "w3B", "wHR",
              "runSB", "runCS", "R/PA", "R/W", "cFIP"]

# Cast all columns in one pass: Season to int64, the rest to float32.
target_dtypes = {"Season" : "int64"}

target_dtypes.update({name : "float32" for name in change_cols})

wOBA = wOBA.astype(target_dtypes)

# Confirm the conversion took effect.
wOBA.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149 entries, 0 to 148
Data columns (total 14 columns):
Season       149 non-null int64
wOBA         149 non-null float32
wOBAScale    149 non-null float32
wBB          149 non-null float32
wHBP         149 non-null float32
w1B          149 non-null float32
w2B          149 non-null float32
w3B          149 non-null float32
wHR          149 non-null float32
runSB        149 non-null float32
runCS        149 non-null float32
R/PA         149 non-null float32
R/W          149 non-null float32
cFIP         149 non-null float32
dtypes: float32(13), int64(1)
memory usage: 8.9 KB


In [24]:
# Align the key name with all_games so both frames share a "season" column.
wOBA = wOBA.rename(columns = {"Season" : "season"})

# Attach that season's weights to every game (left join keeps all games).
all_games = pd.merge(all_games, wOBA, how = "left", on = ["season"])

all_games.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 170754 entries, 0 to 170753
Data columns (total 77 columns):
date                170754 non-null datetime64[ns]
home_team           170754 non-null object
road_team           170754 non-null object
is_doubleheader     170754 non-null float64
is_tripleheader     170754 non-null float64
home_PA             170754 non-null int64
home_AB             170754 non-null int64
home_H              170754 non-null int64
home_1B             170754 non-null int64
home_2B             170754 non-null int64
home_3B             170754 non-null int64
home_HR             170754 non-null int64
home_TB             170754 non-null int64
home_BB             170754 non-null int64
home_IBB            170754 non-null int64
home_HBP            170754 non-null int64
home_R              170754 non-null int64
road_starter        170754 non-null object
road_starter_H      170754 non-null int64
road_starter_HR     170754 non-null int64
road_starter_BB     170754 non-nu

In [161]:
# Read the stadium table again (same file and arguments as the load cell near
# the top of the notebook); the first CSV column becomes the index.
all_stadiums = pd.read_csv("./data/all_stadiums_w_park_ids.csv", index_col = 0)

# Preview the first rows.
all_stadiums.head()

Unnamed: 0,team_code,team,state_code,year,primary_stadium,primary_latitude,primary_longitude,secondary_stadium,secondary_latitude,secondary_longitude,attendance/game,pitching_park_factor,batting_park_factor,park_ids
0,TBD,Tampa Bay Rays,FL,2019,Tropicana Field,27.768333,82.653333,,,,14552,96,97,STP01
1,TBD,Tampa Bay Rays,FL,2018,Tropicana Field,27.768333,82.653333,,,,14259,96,97,STP01
2,TBD,Tampa Bay Rays,FL,2017,Tropicana Field,27.768333,82.653333,,,,15477,96,96,STP01
3,TBD,Tampa Bay Rays,FL,2016,Tropicana Field,27.768333,82.653333,,,,15879,95,95,STP01
4,TBD,Tampa Bay Rays,FL,2015,Tropicana Field,27.768333,82.653333,,,,15322,97,96,STP01


In [162]:
# Keep only the merge keys (team_code, year) and the batting park factor.
park_factors = all_stadiums[["team_code", "year", "batting_park_factor"]]

In [163]:
# The mapping is only read, so open the file read-only ("r+" would demand
# write permission for no benefit).
with open("./intermediate_data/modern_rc.json", "r") as f:
    retrosheet_codes = json.load(f)

# Number of team codes in the mapping.
len(retrosheet_codes)

43

In [165]:
# Add the MIA -> FLA entry to the code mapping.
retrosheet_codes.update({"MIA" : "FLA"})

In [34]:
# Translate both team columns through the code mapping into temporary
# "*_team_" columns; the originals are still present at this point.
all_games = all_games.assign(
    home_team_ = all_games.home_team.map(retrosheet_codes),
    road_team_ = all_games.road_team.map(retrosheet_codes),
)

In [36]:
# Replace the original team columns with their translated counterparts.
all_games = (
    all_games
    .drop(columns = ["home_team", "road_team"])
    .rename(columns = {"home_team_" : "home_team", "road_team_" : "road_team"})
)

all_games.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 170754 entries, 0 to 170753
Data columns (total 77 columns):
date                170754 non-null datetime64[ns]
is_doubleheader     170754 non-null float64
is_tripleheader     170754 non-null float64
home_PA             170754 non-null int64
home_AB             170754 non-null int64
home_H              170754 non-null int64
home_1B             170754 non-null int64
home_2B             170754 non-null int64
home_3B             170754 non-null int64
home_HR             170754 non-null int64
home_TB             170754 non-null int64
home_BB             170754 non-null int64
home_IBB            170754 non-null int64
home_HBP            170754 non-null int64
home_R              170754 non-null int64
road_starter        170754 non-null object
road_starter_H      170754 non-null int64
road_starter_HR     170754 non-null int64
road_starter_BB     170754 non-null int64
road_starter_IBB    170754 non-null int64
road_starter_HBP    170754 non-null

In [38]:
# Look up the batting park factor for the home team in the game's season.
all_games = pd.merge(
    all_games,
    park_factors,
    how = "left",
    left_on = ["home_team", "season"],
    right_on = ["team_code", "year"],
)

all_games.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 170754 entries, 0 to 170753
Data columns (total 80 columns):
date                   170754 non-null datetime64[ns]
is_doubleheader        170754 non-null float64
is_tripleheader        170754 non-null float64
home_PA                170754 non-null int64
home_AB                170754 non-null int64
home_H                 170754 non-null int64
home_1B                170754 non-null int64
home_2B                170754 non-null int64
home_3B                170754 non-null int64
home_HR                170754 non-null int64
home_TB                170754 non-null int64
home_BB                170754 non-null int64
home_IBB               170754 non-null int64
home_HBP               170754 non-null int64
home_R                 170754 non-null int64
road_starter           170754 non-null object
road_starter_H         170754 non-null int64
road_starter_HR        170754 non-null int64
road_starter_BB        170754 non-null int64
road_starter_IBB    

In [39]:
# Drop the right-hand merge keys brought in by the park-factor join.
all_games = all_games.drop(columns = ["team_code", "year"])

## IN LOOKING AT THE CALCULATIONS FOR wOBA and wRC+, I realized that I did not collect Sac Flies or Sac Bunts... Collecting now...

In [2]:
# Collect the parsed event files. glob returns paths in arbitrary
# (filesystem-dependent) order, so files are not guaranteed to be sorted.
all_files = glob.glob("./parsed/all*.csv")

In [3]:
# Column names for the header-less event files. The file is only read,
# so open it read-only rather than "r+".
with open("./intermediate_data/all_event_header.json", "r") as f:
    header = json.load(f)

In [15]:
def get_sacs(game_df, game_id):
    """Count sacrifice plays (event text containing "SH" or "SF") per team for one game.

    Parameters
    ----------
    game_df : pandas.DataFrame
        Event rows of a single game. The columns read here are
        ``AWAY_TEAM_ID``, ``BAT_HOME_ID`` (1 = home team batting, 0 = road
        team batting) and ``EVENT_TX``.
    game_id : str
        Game identifier, sliced as follows: characters 0-2 are the home team
        code, characters 3-10 the date as ``YYYYMMDD`` and character 11 the
        game-number digit.

    Returns
    -------
    dict
        Keys ``date`` (``YYYY-MM-DD`` string), ``home_team``, ``road_team``,
        ``is_doubleheader``, ``is_tripleheader``, ``home_SAC`` and
        ``road_SAC`` (plain ``int`` counts).

    Raises
    ------
    ValueError
        If characters 3-10 of ``game_id`` are not a valid ``YYYYMMDD`` date.
    IndexError
        If ``game_df`` is empty or ``game_id`` has fewer than 12 characters.
    """

    game_master = {}

    game_master["date"] = datetime.datetime.strptime(game_id[3:11], "%Y%m%d").strftime("%Y-%m-%d")

    game_master["home_team"] = game_id[0:3]

    game_master["road_team"] = game_df.iloc[0]["AWAY_TEAM_ID"]

    # Only game numbers "2" and "3" raise a flag; every other value is
    # recorded as neither a doubleheader nor a tripleheader game.
    game_number = game_id[11]

    game_master["is_doubleheader"] = int(game_number == "2")

    game_master["is_tripleheader"] = int(game_number == "3")

    # One pass over the event text for both markers. na = False makes rows
    # with missing event text count as "not a sacrifice"; without it the mask
    # would contain NaN and could not be used for counting or filtering.
    is_sac = game_df.EVENT_TX.str.contains("SH|SF", na = False)

    for prefix, bat_home_flag in (("home_", 1), ("road_", 0)):

        batting_side = game_df.BAT_HOME_ID == bat_home_flag

        game_master[prefix + "SAC"] = int((is_sac & batting_side).sum())

    return game_master

In [16]:
# One result dict per game, accumulated across every season file.
_all_games = []

for file in all_files:

    # Event files have no header row; column names come from `header`.
    season_df = pd.read_csv(file, low_memory = False, header = None)

    season_df.columns = header

    with ProcessPoolExecutor(max_workers = None) as executor:

        # Split the season into per-game frames in a single groupby pass
        # (sort = False keeps first-appearance order) instead of re-filtering
        # the whole season once per game. The ids live in a local loop
        # variable so the `all_games` DataFrame built earlier in the notebook
        # is no longer overwritten.
        season_futures = [
            executor.submit(get_sacs, game_df.reset_index(drop = True), game_id)
            for game_id, game_df in season_df.groupby("GAME_ID", sort = False)
        ]

    # Leaving the `with` block waits for all submitted tasks to finish.
    _all_games += [future.result() for future in season_futures]

In [19]:
# One row per game, built from the list of per-game result dicts.
all_sacs = pd.DataFrame(_all_games)

all_sacs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170754 entries, 0 to 170753
Data columns (total 7 columns):
date               170754 non-null object
home_team          170754 non-null object
road_team          170754 non-null object
is_doubleheader    170754 non-null int64
is_tripleheader    170754 non-null int64
home_SAC           170754 non-null int64
road_SAC           170754 non-null int64
dtypes: int64(4), object(3)
memory usage: 9.1+ MB


In [22]:
# Parse the date strings and derive the season (calendar year) from them.
all_sacs = all_sacs.assign(
    date = lambda sacs: pd.to_datetime(sacs["date"], format = "%Y-%m-%d")
)

all_sacs = all_sacs.assign(
    season = lambda sacs: pd.DatetimeIndex(sacs["date"]).year
)

all_sacs.head()

Unnamed: 0,date,home_team,road_team,is_doubleheader,is_tripleheader,home_SAC,road_SAC,season
0,1994-04-12,ATL,SFN,0,0,2,2,1994
1,1994-04-13,ATL,SFN,0,0,2,2,1994
2,1994-04-14,ATL,SFN,0,0,2,1,1994
3,1994-04-18,ATL,SLN,0,0,1,1,1994
4,1994-04-19,ATL,SLN,0,0,1,1,1994


In my infinite wisdom, I wasn't thinking and wrote over all_games... Reading back in dataframe.

In [30]:
# The mapping is only read, so open the file read-only instead of "r+".
with open("./intermediate_data/modern_rc.json", "r") as f:
    retrosheet_codes = json.load(f)

retrosheet_codes.update({"MIA" : "FLA"})

# Translate both team columns through the mapping into temporary columns...
all_sacs["_home_team"] = all_sacs.home_team.map(retrosheet_codes)

all_sacs["_road_team"] = all_sacs.road_team.map(retrosheet_codes)

# ...then swap the translated columns in under the original names.
all_sacs = (
    all_sacs
    .drop(columns = ["home_team", "road_team"])
    .rename(columns = {"_home_team" : "home_team",
                       "_road_team" : "road_team"})
)

In [29]:
# Preview the sacrifice table after the team-code translation.
all_sacs.head()

Unnamed: 0,date,is_doubleheader,is_tripleheader,home_SAC,road_SAC,season,home_team,road_team
0,1994-04-12,0,0,2,2,1994,ATL,SFG
1,1994-04-13,0,0,2,2,1994,ATL,SFG
2,1994-04-14,0,0,2,1,1994,ATL,SFG
3,1994-04-18,0,0,1,1,1994,ATL,STL
4,1994-04-19,0,0,1,1,1994,ATL,STL


In [24]:
# Read the saved master table back in and parse its date column.
master = (
    pd.read_csv("./adv_metrics/master_raw.csv.gz", compression = "gzip")
    .assign(date = lambda frame: pd.to_datetime(frame["date"], format = "%Y-%m-%d"))
)

In [25]:
# Preview the reloaded master table.
master.head()

Unnamed: 0,date,is_doubleheader,is_tripleheader,home_PA,home_AB,home_H,home_1B,home_2B,home_3B,home_HR,...,w3B,wHR,runSB,runCS,R/PA,R/W,cFIP,home_team,road_team,batting_park_factor
0,1918-04-15,0.0,0.0,38,31,6,6,0,0,0,...,1.835,2.446,0.2,-0.342,0.097,8.417,2.229,MIN,NYY,98
1,1918-04-15,0.0,0.0,38,31,9,7,2,0,0,...,1.835,2.446,0.2,-0.342,0.097,8.417,2.229,BOS,OAK,99
2,1918-04-16,0.0,0.0,35,33,10,8,1,1,0,...,1.835,2.446,0.2,-0.342,0.097,8.417,2.229,PHI,ATL,107
3,1918-04-16,0.0,0.0,27,25,3,3,0,0,0,...,1.835,2.446,0.2,-0.342,0.097,8.417,2.229,CIN,PIT,99
4,1918-04-16,0.0,0.0,31,27,9,4,2,3,0,...,1.835,2.446,0.2,-0.342,0.097,8.417,2.229,STL,CHC,96


In [31]:
# Columns that together identify a game in both frames.
sac_merge_keys = ["date", "is_doubleheader", "is_tripleheader",
                  "season", "home_team", "road_team"]

# Attach the per-game sacrifice counts; the left join keeps every master row.
master = master.merge(all_sacs, how = "left", on = sac_merge_keys)

master.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 170754 entries, 0 to 170753
Data columns (total 80 columns):
date                   170754 non-null datetime64[ns]
is_doubleheader        170754 non-null float64
is_tripleheader        170754 non-null float64
home_PA                170754 non-null int64
home_AB                170754 non-null int64
home_H                 170754 non-null int64
home_1B                170754 non-null int64
home_2B                170754 non-null int64
home_3B                170754 non-null int64
home_HR                170754 non-null int64
home_TB                170754 non-null int64
home_BB                170754 non-null int64
home_IBB               170754 non-null int64
home_HBP               170754 non-null int64
home_R                 170754 non-null int64
road_starter           170754 non-null object
road_starter_H         170754 non-null int64
road_starter_HR        170754 non-null int64
road_starter_BB        170754 non-null int64
road_starter_IBB    

# WHOOPS! FORGOT ONE MORE THING... I NEED LEAGUE wRC/PA.... SCRAPING FROM FANGRAPHS AS WE SPEAK

In [142]:
# League-level wRC tables, one CSV per league (AL and NL).
al_league_wRC = pd.read_csv("./adv_metrics/league_wRC_AL.csv")

nl_league_wRC = pd.read_csv("./adv_metrics/league_wRC_NL.csv")

In [143]:
# Source column -> name used in this notebook. The same selection and rename
# is applied to both league tables.
league_column_names = {
    "Season" : "season",
    "PA" : "league_PA",
    "wRC" : "league_wRC"
}

al_league_wRC = al_league_wRC[list(league_column_names)].rename(columns = league_column_names)

nl_league_wRC = nl_league_wRC[list(league_column_names)].rename(columns = league_column_names)

In [88]:
# Download the Retrosheet team-abbreviation file. A timeout keeps the cell
# from hanging indefinitely if the server does not answer.
response = requests.get("https://www.retrosheet.org/TEAMABR.TXT", timeout = 30)

# Fail loudly on an HTTP error instead of parsing an error page as team rows.
response.raise_for_status()

teams = response.content

# Strip any markup and split the text into one string per line.
team_soup = BeautifulSoup(teams, "html.parser").get_text().split("\n")

In [79]:
# The mapping is only read, so open the file read-only instead of "r+".
with open("./intermediate_data/modern_rc.json", "r") as f:
    retrosheet_codes = json.load(f)

retrosheet_codes.update({"MIA" : "FLA"})

In [99]:
# Map each team code (first word on a line) to its league (second word).
# When a code appears on several lines, the last line wins.
leagues = {}

pattern = r'([\w]+)'

for row in team_soup:

    vals = re.findall(pattern, row)

    # Skip blank or malformed lines explicitly rather than hiding every
    # possible error behind a bare `except`.
    if len(vals) < 2:

        continue

    leagues[vals[0]] = vals[1]

In [104]:
# Team codes present in `leagues` that have no entry in the code mapping.
pop_list = [key for key in leagues if key not in retrosheet_codes]

In [105]:
# Remove the unmapped codes collected in pop_list (KeyError if one is absent).
for key in pop_list:

    del leagues[key]

In [108]:
# Set the league for the MIA code by hand.
leagues["MIA"] = "NL"

In [113]:
# One row per team code with its league, plus the code it translates to.
team_codes = list(leagues.keys())

team_leagues = list(leagues.values())

df = pd.DataFrame({"team_code" : team_codes, "league" : team_leagues})

df = df.assign(elo_code = df.team_code.map(retrosheet_codes))

In [116]:
# Print every translated code whose source codes span more than one league.
for code in df.elo_code.unique():

    leagues_for_code = df.loc[df.elo_code == code, "league"].value_counts()

    if len(leagues_for_code) > 1:

        print(code)

MIL


In [129]:
# Translated code -> league. When several source codes share a translated
# code, the one iterated last wins.
elo_leagues = {
    elo_code : leagues[source_code]
    for source_code, elo_code in retrosheet_codes.items()
}

### Problem that has appeared: Houston and Milwaukee have both switched leagues, so this is not as simple as a 1-1 mapping... I'm going to map the rest of the DataFrame and then write over Milwaukee and Houston manually

In [131]:
# First pass: a static team -> league lookup for both sides of every game.
master["home_league"] = master.home_team.map(elo_leagues)

master["road_league"] = master.road_team.map(elo_leagues)

# Rows where the static lookup is overridden: MIL games in seasons 1969-1997
# and HOU games in seasons 2013-2019 are set to "AL" (`between` is inclusive).
# NOTE(review): the year ranges are hard-coded; extend the HOU range if
# seasons after 2019 are ever added to the data.
MIL_home_indices = master[(master.home_team == "MIL") & (master.season.between(1969, 1997))].index

MIL_road_indices = master[(master.road_team == "MIL") & (master.season.between(1969, 1997))].index

HOU_home_indices = master[(master.home_team == "HOU") & (master.season.between(2013, 2019))].index

HOU_road_indices = master[(master.road_team == "HOU") & (master.season.between(2013, 2019))].index

# Apply each override as a single vectorised assignment rather than a Python
# loop issuing one `.at` write per row.
master.loc[MIL_home_indices, "home_league"] = "AL"

master.loc[MIL_road_indices, "road_league"] = "AL"

master.loc[HOU_home_indices, "home_league"] = "AL"

master.loc[HOU_road_indices, "road_league"] = "AL"


In [144]:
# Label each league table with its league code so the two can be stacked.
al_league_wRC, nl_league_wRC = (
    frame.assign(league = league_code)
    for frame, league_code in ((al_league_wRC, "AL"), (nl_league_wRC, "NL"))
)

In [147]:
# Stack both leagues into one table ordered by season, with a fresh index.
league_wRC = (
    pd.concat([al_league_wRC, nl_league_wRC], axis = 0)
    .sort_values(by = ["season"])
    .reset_index(drop = True)
)

In [149]:
# Preview master, which now carries the home_league / road_league columns.
master.head()

Unnamed: 0,date,is_doubleheader,is_tripleheader,home_PA,home_AB,home_H,home_1B,home_2B,home_3B,home_HR,home_TB,home_BB,home_IBB,home_HBP,home_R,road_starter,road_starter_H,road_starter_HR,road_starter_BB,road_starter_IBB,road_starter_HBP,road_starter_K,road_starter_IP,road_starter_ER,road_relief_IP,road_relief_H,road_relief_HR,road_relief_BB,road_relief_IBB,road_relief_HBP,road_relief_K,road_relief_ER,road_PA,road_AB,road_H,road_1B,road_2B,road_3B,road_HR,road_TB,road_BB,road_IBB,road_HBP,road_R,home_starter,home_starter_H,home_starter_HR,home_starter_BB,home_starter_IBB,home_starter_HBP,home_starter_K,home_starter_IP,home_starter_ER,home_relief_IP,home_relief_H,home_relief_HR,home_relief_BB,home_relief_IBB,home_relief_HBP,home_relief_K,home_relief_ER,season,wOBA,wOBAScale,wBB,wHBP,w1B,w2B,w3B,wHR,runSB,runCS,R/PA,R/W,cFIP,home_team,road_team,batting_park_factor,home_SAC,road_SAC,home_league,road_league
0,1918-04-15,0.0,0.0,38,31,6,6,0,0,0,6,6,0,0,3,mogrg101,5,0,1,0,0,0,3.67,3,5.33,1,0,5,0,0,2,0,43,34,11,10,1,0,0,12,5,0,0,6,johnw102,11,0,5,0,0,2,9.0,4,0.0,0,0,0,0,0,0,0,1918,0.317,1.521,0.733,0.771,0.969,1.425,1.835,2.446,0.2,-0.342,0.097,8.417,2.229,MIN,NYY,98,1,3,AL,AL
1,1918-04-15,0.0,0.0,38,31,9,7,2,0,0,11,4,0,0,7,myere101,7,0,3,0,0,0,6.0,3,2.0,2,0,1,0,0,3,0,33,30,4,4,0,0,0,4,2,0,0,1,ruthb101,4,0,2,0,0,3,9.0,1,0.0,0,0,0,0,0,0,0,1918,0.317,1.521,0.733,0.771,0.969,1.425,1.835,2.446,0.2,-0.342,0.097,8.417,2.229,BOS,OAK,99,3,1,AL,AL
2,1918-04-16,0.0,0.0,35,33,10,8,1,1,0,13,2,0,0,5,ragap101,10,0,2,0,0,3,7.0,4,1.0,0,0,0,0,0,0,0,35,32,5,4,1,0,0,6,2,0,0,2,mayee101,5,0,2,0,0,6,9.0,1,0.0,0,0,0,0,0,0,0,1918,0.317,1.521,0.733,0.771,0.969,1.425,1.835,2.446,0.2,-0.342,0.097,8.417,2.229,PHI,ATL,107,0,1,NL,NL
3,1918-04-16,0.0,0.0,27,25,3,3,0,0,0,3,1,0,0,2,coopw101,3,0,1,0,0,0,7.0,0,1.0,0,0,0,0,0,0,0,32,27,1,0,1,0,0,2,5,0,0,0,schnp101,1,0,5,0,0,3,9.0,0,0.0,0,0,0,0,0,0,0,1918,0.317,1.521,0.733,0.771,0.969,1.425,1.835,2.446,0.2,-0.342,0.097,8.417,2.229,CIN,PIT,99,1,0,NL,NL
4,1918-04-16,0.0,0.0,31,27,9,4,2,3,0,17,1,0,1,4,alexg102,9,0,1,0,1,2,8.0,3,0.0,0,0,0,0,0,0,0,34,31,5,5,0,0,0,5,2,0,0,2,meadl101,5,0,2,0,0,1,9.0,0,0.0,0,0,0,0,0,0,0,1918,0.317,1.521,0.733,0.771,0.969,1.425,1.835,2.446,0.2,-0.342,0.097,8.417,2.229,STL,CHC,96,2,1,NL,NL


In [150]:
# Attach the league totals for the home team's league in that season.
master = pd.merge(
    master,
    league_wRC,
    how = "left",
    left_on = ["season", "home_league"],
    right_on = ["season", "league"],
)

In [152]:
# Drop the join key from the right-hand table and mark the totals as home-side.
home_league_names = {"league_PA" : "home_league_PA",
                     "league_wRC" : "home_league_wRC"}

master = master.drop(columns = ["league"])

master = master.rename(columns = home_league_names)

In [155]:
# Same join for the road team's league, with the totals marked as road-side.
master = (
    pd.merge(
        master,
        league_wRC,
        how = "left",
        left_on = ["season", "road_league"],
        right_on = ["season", "league"],
    )
    .drop(columns = ["league"])
    .rename(columns = {"league_PA" : "road_league_PA",
                       "league_wRC" : "road_league_wRC"})
)

In [156]:
# Check column count and null counts after both league joins.
master.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 170754 entries, 0 to 170753
Data columns (total 86 columns):
date                   170754 non-null object
is_doubleheader        170754 non-null float64
is_tripleheader        170754 non-null float64
home_PA                170754 non-null int64
home_AB                170754 non-null int64
home_H                 170754 non-null int64
home_1B                170754 non-null int64
home_2B                170754 non-null int64
home_3B                170754 non-null int64
home_HR                170754 non-null int64
home_TB                170754 non-null int64
home_BB                170754 non-null int64
home_IBB               170754 non-null int64
home_HBP               170754 non-null int64
home_R                 170754 non-null int64
road_starter           170754 non-null object
road_starter_H         170754 non-null int64
road_starter_HR        170754 non-null int64
road_starter_BB        170754 non-null int64
road_starter_IBB       17075

In [159]:
# Give the existing park-factor column an explicit home_ prefix.
master = master.rename(columns = {"batting_park_factor" : "home_batting_park_factor"})

In [166]:
# Look up the batting park factor for the road team in the game's season.
master = pd.merge(
    master,
    park_factors,
    how = "left",
    left_on = ["road_team", "season"],
    right_on = ["team_code", "year"],
)

master.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 170754 entries, 0 to 170753
Data columns (total 89 columns):
date                        170754 non-null object
is_doubleheader             170754 non-null float64
is_tripleheader             170754 non-null float64
home_PA                     170754 non-null int64
home_AB                     170754 non-null int64
home_H                      170754 non-null int64
home_1B                     170754 non-null int64
home_2B                     170754 non-null int64
home_3B                     170754 non-null int64
home_HR                     170754 non-null int64
home_TB                     170754 non-null int64
home_BB                     170754 non-null int64
home_IBB                    170754 non-null int64
home_HBP                    170754 non-null int64
home_R                      170754 non-null int64
road_starter                170754 non-null object
road_starter_H              170754 non-null int64
road_starter_HR             17075

In [167]:
# Drop the right-hand merge keys and mark the new park factor as road-side.
master = (
    master
    .drop(columns = ["team_code", "year"])
    .rename(columns = {"batting_park_factor" : "road_batting_park_factor"})
)

In [168]:
# Persist the enriched table as a gzip-compressed CSV without the index.
# NOTE(review): this is the same path that an earlier cell reads `master`
# from, so running this cell overwrites that input file.
master.to_csv("./adv_metrics/master_raw.csv.gz", index = False, compression = "gzip")

# READ IN PREPARED DATASET HERE

In [2]:
# Load the prepared dataset written by the preparation steps above.
master = pd.read_csv("./adv_metrics/master_raw.csv.gz", compression = "gzip")

# Use the full option key. The abbreviated "max.columns" is only resolved
# through pattern matching on option names and breaks as soon as the pattern
# matches more than one option.
pd.set_option("display.max_columns", 100)

master.head()

Unnamed: 0,date,is_doubleheader,is_tripleheader,home_PA,home_AB,home_H,home_1B,home_2B,home_3B,home_HR,home_TB,home_BB,home_IBB,home_HBP,home_R,road_starter,road_starter_H,road_starter_HR,road_starter_BB,road_starter_IBB,road_starter_HBP,road_starter_K,road_starter_IP,road_starter_ER,road_relief_IP,road_relief_H,road_relief_HR,road_relief_BB,road_relief_IBB,road_relief_HBP,road_relief_K,road_relief_ER,road_PA,road_AB,road_H,road_1B,road_2B,road_3B,road_HR,road_TB,road_BB,road_IBB,road_HBP,road_R,home_starter,home_starter_H,home_starter_HR,home_starter_BB,home_starter_IBB,home_starter_HBP,home_starter_K,home_starter_IP,home_starter_ER,home_relief_IP,home_relief_H,home_relief_HR,home_relief_BB,home_relief_IBB,home_relief_HBP,home_relief_K,home_relief_ER,season,wOBA,wOBAScale,wBB,wHBP,w1B,w2B,w3B,wHR,runSB,runCS,R/PA,R/W,cFIP,home_team,road_team,home_batting_park_factor,home_SAC,road_SAC,home_league,road_league,home_league_PA,home_league_wRC,road_league_PA,road_league_wRC,road_batting_park_factor
0,1918-04-15,0.0,0.0,38,31,6,6,0,0,0,6,6,0,0,3,mogrg101,5,0,1,0,0,0,3.67,3,5.33,1,0,5,0,0,2,0,43,34,11,10,1,0,0,12,5,0,0,6,johnw102,11,0,5,0,0,2,9.0,4,0.0,0,0,0,0,0,0,0,1918,0.317,1.521,0.733,0.771,0.969,1.425,1.835,2.446,0.2,-0.342,0.097,8.417,2.229,MIN,NYY,98,1,3,AL,AL,34688,3628,34688,3628,102
1,1918-04-15,0.0,0.0,38,31,9,7,2,0,0,11,4,0,0,7,myere101,7,0,3,0,0,0,6.0,3,2.0,2,0,1,0,0,3,0,33,30,4,4,0,0,0,4,2,0,0,1,ruthb101,4,0,2,0,0,3,9.0,1,0.0,0,0,0,0,0,0,0,1918,0.317,1.521,0.733,0.771,0.969,1.425,1.835,2.446,0.2,-0.342,0.097,8.417,2.229,BOS,OAK,99,3,1,AL,AL,34688,3628,34688,3628,100
2,1918-04-16,0.0,0.0,35,33,10,8,1,1,0,13,2,0,0,5,ragap101,10,0,2,0,0,3,7.0,4,1.0,0,0,0,0,0,0,0,35,32,5,4,1,0,0,6,2,0,0,2,mayee101,5,0,2,0,0,6,9.0,1,0.0,0,0,0,0,0,0,0,1918,0.317,1.521,0.733,0.771,0.969,1.425,1.835,2.446,0.2,-0.342,0.097,8.417,2.229,PHI,ATL,107,0,1,NL,NL,34257,3458,34257,3458,95
3,1918-04-16,0.0,0.0,27,25,3,3,0,0,0,3,1,0,0,2,coopw101,3,0,1,0,0,0,7.0,0,1.0,0,0,0,0,0,0,0,32,27,1,0,1,0,0,2,5,0,0,0,schnp101,1,0,5,0,0,3,9.0,0,0.0,0,0,0,0,0,0,0,1918,0.317,1.521,0.733,0.771,0.969,1.425,1.835,2.446,0.2,-0.342,0.097,8.417,2.229,CIN,PIT,99,1,0,NL,NL,34257,3458,34257,3458,104
4,1918-04-16,0.0,0.0,31,27,9,4,2,3,0,17,1,0,1,4,alexg102,9,0,1,0,1,2,8.0,3,0.0,0,0,0,0,0,0,0,34,31,5,5,0,0,0,5,2,0,0,2,meadl101,5,0,2,0,0,1,9.0,0,0.0,0,0,0,0,0,0,0,1918,0.317,1.521,0.733,0.771,0.969,1.425,1.835,2.446,0.2,-0.342,0.097,8.417,2.229,STL,CHC,96,2,1,NL,NL,34257,3458,34257,3458,102


# STATS TO COLLECT: 

## OFFENSIVE:

* wOBA
* OPS+
* wRAA+
* wRC+

## BULLPEN

* ERA
* WHIP
* K / BB
* FIP

## STARTING PITCHERS

* ERA
* WHIP
* K / BB
* FIP

In [43]:
# Build the compiled metrics table from `master_list`.
# NOTE(review): `master_list` is not defined in any cell above this one;
# confirm which cell produces it, or this fails on Restart & Run All.
compiled = pd.DataFrame(master_list)

compiled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170754 entries, 0 to 170753
Data columns (total 50 columns):
date                         170754 non-null object
is_doubleheader              170754 non-null float64
is_tripleheader              170754 non-null float64
season                       170754 non-null int64
home_team                    170754 non-null object
road_team                    170754 non-null object
home_starter                 170754 non-null object
road_starter                 170754 non-null object
home_wOBA                    170754 non-null float64
home_wRAA                    170754 non-null float64
home_wRC                     170754 non-null float64
home_OPS                     170754 non-null float64
home_relief_FIP              170754 non-null float64
home_relief_WHIP             170754 non-null float64
home_relief_ERA              170754 non-null float64
home_relief_K_BB             170754 non-null float64
home_relief_K_9              170754 non-null flo

In [44]:
# Persist the compiled metrics as a gzip-compressed CSV without the index.
compiled.to_csv("./adv_metrics/adv_compiled.csv.gz", index = False, compression = "gzip")

In [47]:
# Preview the master table again before collecting more per-game fields.
master.head()

Unnamed: 0,date,is_doubleheader,is_tripleheader,home_PA,home_AB,home_H,home_1B,home_2B,home_3B,home_HR,home_TB,home_BB,home_IBB,home_HBP,home_R,road_starter,road_starter_H,road_starter_HR,road_starter_BB,road_starter_IBB,road_starter_HBP,road_starter_K,road_starter_IP,road_starter_ER,road_relief_IP,road_relief_H,road_relief_HR,road_relief_BB,road_relief_IBB,road_relief_HBP,road_relief_K,road_relief_ER,road_PA,road_AB,road_H,road_1B,road_2B,road_3B,road_HR,road_TB,road_BB,road_IBB,road_HBP,road_R,home_starter,home_starter_H,home_starter_HR,home_starter_BB,home_starter_IBB,home_starter_HBP,home_starter_K,home_starter_IP,home_starter_ER,home_relief_IP,home_relief_H,home_relief_HR,home_relief_BB,home_relief_IBB,home_relief_HBP,home_relief_K,home_relief_ER,season,wOBA,wOBAScale,wBB,wHBP,w1B,w2B,w3B,wHR,runSB,runCS,R/PA,R/W,cFIP,home_team,road_team,home_batting_park_factor,home_SAC,road_SAC,home_league,road_league,home_league_PA,home_league_wRC,road_league_PA,road_league_wRC,road_batting_park_factor
0,1918-04-15,0.0,0.0,38,31,6,6,0,0,0,6,6,0,0,3,mogrg101,5,0,1,0,0,0,3.67,3,5.33,1,0,5,0,0,2,0,43,34,11,10,1,0,0,12,5,0,0,6,johnw102,11,0,5,0,0,2,9.0,4,0.0,0,0,0,0,0,0,0,1918,0.317,1.521,0.733,0.771,0.969,1.425,1.835,2.446,0.2,-0.342,0.097,8.417,2.229,MIN,NYY,98,1,3,AL,AL,34688,3628,34688,3628,102
1,1918-04-15,0.0,0.0,38,31,9,7,2,0,0,11,4,0,0,7,myere101,7,0,3,0,0,0,6.0,3,2.0,2,0,1,0,0,3,0,33,30,4,4,0,0,0,4,2,0,0,1,ruthb101,4,0,2,0,0,3,9.0,1,0.0,0,0,0,0,0,0,0,1918,0.317,1.521,0.733,0.771,0.969,1.425,1.835,2.446,0.2,-0.342,0.097,8.417,2.229,BOS,OAK,99,3,1,AL,AL,34688,3628,34688,3628,100
2,1918-04-16,0.0,0.0,35,33,10,8,1,1,0,13,2,0,0,5,ragap101,10,0,2,0,0,3,7.0,4,1.0,0,0,0,0,0,0,0,35,32,5,4,1,0,0,6,2,0,0,2,mayee101,5,0,2,0,0,6,9.0,1,0.0,0,0,0,0,0,0,0,1918,0.317,1.521,0.733,0.771,0.969,1.425,1.835,2.446,0.2,-0.342,0.097,8.417,2.229,PHI,ATL,107,0,1,NL,NL,34257,3458,34257,3458,95
3,1918-04-16,0.0,0.0,27,25,3,3,0,0,0,3,1,0,0,2,coopw101,3,0,1,0,0,0,7.0,0,1.0,0,0,0,0,0,0,0,32,27,1,0,1,0,0,2,5,0,0,0,schnp101,1,0,5,0,0,3,9.0,0,0.0,0,0,0,0,0,0,0,1918,0.317,1.521,0.733,0.771,0.969,1.425,1.835,2.446,0.2,-0.342,0.097,8.417,2.229,CIN,PIT,99,1,0,NL,NL,34257,3458,34257,3458,104
4,1918-04-16,0.0,0.0,31,27,9,4,2,3,0,17,1,0,1,4,alexg102,9,0,1,0,1,2,8.0,3,0.0,0,0,0,0,0,0,0,34,31,5,5,0,0,0,5,2,0,0,2,meadl101,5,0,2,0,0,1,9.0,0,0.0,0,0,0,0,0,0,0,1918,0.317,1.521,0.733,0.771,0.969,1.425,1.835,2.446,0.2,-0.342,0.097,8.417,2.229,STL,CHC,96,2,1,NL,NL,34257,3458,34257,3458,102


In [45]:
# Collect the parsed event files. glob returns paths in arbitrary
# (filesystem-dependent) order, so files are not guaranteed to be sorted.
all_files = glob.glob("./parsed/all*.csv")

In [46]:
# Column names for the header-less event files. The file is only read,
# so open it read-only rather than "r+".
with open("./intermediate_data/all_event_header.json", "r") as f:
    header = json.load(f)

In [63]:
def get_starter_hits(game_df, game_id):
    """Count plate appearances, singles, doubles, triples and sacrifices allowed
    by each side's starting pitcher and by its relievers in one game.

    Parameters
    ----------
    game_df : pandas.DataFrame
        Play-by-play events of a single game, one row per event. The columns
        read here are AWAY_TEAM_ID, BAT_HOME_ID, RESP_PIT_START_FL,
        RESP_PIT_ID, GAME_PA_CT, EVENT_CD and EVENT_TX.
    game_id : str
        Game identifier. Characters 0-2 are used as the home team code,
        characters 3-10 as the date (YYYYMMDD) and character 11 as the game
        number.

    Returns
    -------
    dict
        Keys, in order: date, home_team, road_team, is_doubleheader,
        is_tripleheader, then for "road_" followed by "home_":
        <side>starter, <side>starter_{PA,1B,2B,3B,SAC} and
        <side>relief_{PA,1B,2B,3B,SAC}.

    Raises
    ------
    IndexError
        If game_df is empty or one of the two sides has no events.

    Notes
    -----
    is_doubleheader is 1 only when character 11 of game_id is "2" and
    is_tripleheader only when it is "3"; any other value gives 0 for both.
    """

    game_number = game_id[11]

    game_master = {
        "date": datetime.datetime.strptime(game_id[3:11], "%Y%m%d").strftime("%Y-%m-%d"),
        "home_team": game_id[0:3],
        "road_team": game_df.iloc[0]["AWAY_TEAM_ID"],
        "is_doubleheader": int(game_number == "2"),
        "is_tripleheader": int(game_number == "3"),
    }

    def count_hits(events):
        """Return (1B, 2B, 3B, SAC) counts for a set of events."""

        # Sacrifices are the "/SH" and "/SF" play modifiers. Matching the bare
        # substrings "SH" / "SF" also hits caught-stealing-home ("CSH",
        # "POCSH") and hit-location codes such as "7LSF" / "9LSF", which
        # inflated the sacrifice totals. Missing event text counts as no match.
        is_sacrifice = events.EVENT_TX.str.contains(r"/S[HF]", regex = True, na = False)

        return (int((events.EVENT_CD == 20).sum()),
                int((events.EVENT_CD == 21).sum()),
                int((events.EVENT_CD == 22).sum()),
                int(is_sacrifice.sum()))

    stat_names = ("1B", "2B", "3B", "SAC")

    # Events with BAT_HOME_ID == 1 are charged to the "road_" pitchers and
    # events with BAT_HOME_ID == 0 to the "home_" pitchers.
    for bat_home_id, alt in ((1, "road_"), (0, "home_")):

        team_events = game_df[game_df.BAT_HOME_ID == bat_home_id]

        charged_to_starter = team_events.RESP_PIT_START_FL == "T"

        if charged_to_starter.any():

            starter_events = team_events[charged_to_starter]

            relief_events = team_events[~charged_to_starter]

        else:

            # No event is flagged as the starter's: treat the pitcher
            # responsible for the first event as the starter.
            de_facto_starter = team_events.iloc[0]["RESP_PIT_ID"]

            starter_events = team_events[team_events.RESP_PIT_ID == de_facto_starter]

            relief_events = team_events[team_events.RESP_PIT_ID != de_facto_starter]

        game_master[alt + "starter"] = starter_events.iloc[0]["RESP_PIT_ID"]

        # Largest GAME_PA_CT seen for the starter, plus one.
        game_master[alt + "starter_PA"] = starter_events.GAME_PA_CT.max() + 1

        for stat, value in zip(stat_names, count_hits(starter_events)):

            game_master[alt + "starter_" + stat] = value

        if len(relief_events) == 0:

            game_master[alt + "relief_PA"] = 0

            relief_counts = (0, 0, 0, 0)

        else:

            # Relievers get whatever the running count advanced by after the starter.
            game_master[alt + "relief_PA"] = relief_events.GAME_PA_CT.max() - starter_events.GAME_PA_CT.max()

            relief_counts = count_hits(relief_events)

        for stat, value in zip(stat_names, relief_counts):

            game_master[alt + "relief_" + stat] = value

    return game_master

In [64]:
_all_games = []

for file in all_files:
    
    season = file.split("all")[1][0:4]

    season_df = pd.read_csv(file, low_memory = False, header = None)

    season_df.columns = header

    season_list = []

    with ProcessPoolExecutor(max_workers = None) as executor:

        # Split the season by game in a single pass instead of re-filtering the
        # whole frame once per game. sort = False yields the games in order of
        # first appearance, the same order GAME_ID.unique() gave. The game ids
        # are no longer stored in `all_games`, which is the name of the game
        # DataFrame loaded at the top of the notebook.
        for game, game_df in season_df.groupby("GAME_ID", sort = False):

            game_df = game_df.reset_index(drop = True)

            season_list.append(executor.submit(get_starter_hits, game_df, game))

    # Futures are resolved in submission order, so results stay in game order.
    season_list = [i.result() for i in season_list]

    _all_games += season_list
    
    print("Season {} complete".format(season))

Season 1994 complete
Season 1980 complete
Season 1957 complete
Season 1943 complete
Season 1942 complete
Season 1956 complete
Season 1981 complete
Season 1995 complete
Season 1983 complete
Season 1997 complete
Season 1968 complete
Season 1940 complete
Season 1954 complete
Season 1955 complete
Season 1941 complete
Season 1969 complete
Season 1996 complete
Season 1982 complete
Season 1986 complete
Season 1992 complete
Season 1945 complete
Season 1951 complete
Season 1979 complete
Season 1978 complete
Season 1950 complete
Season 1944 complete
Season 1993 complete
Season 1987 complete
Season 1991 complete
Season 1985 complete
Season 1952 complete
Season 1946 complete
Season 1947 complete
Season 1953 complete
Season 1984 complete
Season 1990 complete
Season 1934 complete
Season 1920 complete
Season 2018 complete
Season 2019 complete
Season 1921 complete
Season 1935 complete
Season 1923 complete
Season 1937 complete
Season 1936 complete
Season 1922 complete
Season 1926 complete
Season 1932 c

In [66]:
# One row per game: each element of _all_games is the dict returned for one game.
add_on = pd.DataFrame(_all_games)

add_on.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170754 entries, 0 to 170753
Data columns (total 27 columns):
date                170754 non-null object
home_team           170754 non-null object
road_team           170754 non-null object
is_doubleheader     170754 non-null int64
is_tripleheader     170754 non-null int64
road_starter        170754 non-null object
road_starter_PA     170754 non-null int64
road_starter_1B     170754 non-null int64
road_starter_2B     170754 non-null int64
road_starter_3B     170754 non-null int64
road_starter_SAC    170754 non-null int64
road_relief_PA      170754 non-null int64
road_relief_1B      170754 non-null int64
road_relief_2B      170754 non-null int64
road_relief_3B      170754 non-null int64
road_relief_SAC     170754 non-null int64
home_starter        170754 non-null object
home_starter_PA     170754 non-null int64
home_starter_1B     170754 non-null int64
home_starter_2B     170754 non-null int64
home_starter_3B     170754 non-null int64
ho

In [72]:
# Parse the date strings, then order the games chronologically with a fresh 0..n-1 index.
add_on = (
    add_on
    .assign(date = pd.to_datetime(add_on.date, format = "%Y-%m-%d"))
    .sort_values(by = ["date"])
    .reset_index(drop = True)
)

In [83]:
# Team-code translation table; presumably Retrosheet codes -> the codes used
# in master (verify against the JSON file). The file is only read, so it is
# opened read-only instead of "r+".
with open("./intermediate_data/modern_rc.json", "r") as f:
    retrosheet_codes = json.load(f)
    
retrosheet_codes.update({"MIA" : "FLA"})

# Series.map yields NaN for any code that is not a key of the mapping.
add_on["_home_team"] = add_on.home_team.map(retrosheet_codes)

add_on["_road_team"] = add_on.road_team.map(retrosheet_codes)

# Swap the translated columns in under the original names.
add_on = add_on.drop(columns = ["home_team", "road_team"]).\
rename(columns = {"_home_team" : "home_team",
                 "_road_team" : "road_team"})

In [89]:
def get_starter_ABs(game_df, game_id):
    """Count the at-bats charged to each side's starter and to its relievers.

    Parameters
    ----------
    game_df : pandas.DataFrame
        Events of a single game, one row per event. Reads the columns
        AWAY_TEAM_ID, BAT_HOME_ID, RESP_PIT_START_FL, RESP_PIT_ID and AB_FL.
    game_id : str
        Game identifier: characters 0-2 give the home team code, 3-10 the
        date (YYYYMMDD) and character 11 the game number.

    Returns
    -------
    dict
        date, home_team, road_team, is_doubleheader, is_tripleheader, then
        road_starter, road_starter_AB, road_relief_AB and the same three
        keys for "home_".
    """

    nth_game = game_id[11]

    summary = {}

    summary["date"] = datetime.datetime.strptime(game_id[3:11], "%Y%m%d").strftime("%Y-%m-%d")
    summary["home_team"] = game_id[0:3]
    summary["road_team"] = game_df.iloc[0]["AWAY_TEAM_ID"]

    # Flag is 1 only for game number "2" (double) or "3" (triple).
    summary["is_doubleheader"] = 1 if nth_game == "2" else 0
    summary["is_tripleheader"] = 1 if nth_game == "3" else 0

    # BAT_HOME_ID == 1 events go to the "road_" pitchers, BAT_HOME_ID == 0 to "home_".
    for batting_flag, side in ((1, "road_"), (0, "home_")):

        side_events = game_df[game_df.BAT_HOME_ID == batting_flag]

        by_starter = side_events[side_events.RESP_PIT_START_FL == "T"]

        if len(by_starter) == 0:
            # Nothing flagged for a starter: use the pitcher responsible for
            # the first event instead.
            first_pitcher = side_events.iloc[0]["RESP_PIT_ID"]
            by_starter = side_events[side_events.RESP_PIT_ID == first_pitcher]
            by_bullpen = side_events[side_events.RESP_PIT_ID != first_pitcher]
        else:
            by_bullpen = side_events[side_events.RESP_PIT_START_FL != "T"]

        summary[side + "starter"] = by_starter.iloc[0]["RESP_PIT_ID"]
        summary[side + "starter_AB"] = len(by_starter[by_starter.AB_FL == 'T'])

        if len(by_bullpen) != 0:
            summary[side + "relief_AB"] = len(by_bullpen[by_bullpen.AB_FL == 'T'])
        else:
            summary[side + "relief_AB"] = 0

    return summary

In [90]:
_all_games = []

for file in all_files:
    
    season = file.split("all")[1][0:4]

    season_df = pd.read_csv(file, low_memory = False, header = None)

    season_df.columns = header

    season_list = []

    with ProcessPoolExecutor(max_workers = None) as executor:

        # Split the season by game in a single pass instead of re-filtering the
        # whole frame once per game. sort = False yields the games in order of
        # first appearance, the same order GAME_ID.unique() gave. The game ids
        # are no longer stored in `all_games`, which is the name of the game
        # DataFrame loaded at the top of the notebook.
        for game, game_df in season_df.groupby("GAME_ID", sort = False):

            game_df = game_df.reset_index(drop = True)

            season_list.append(executor.submit(get_starter_ABs, game_df, game))

    # Futures are resolved in submission order, so results stay in game order.
    season_list = [i.result() for i in season_list]

    _all_games += season_list
    
    print("Season {} complete".format(season))

Season 1994 complete
Season 1980 complete
Season 1957 complete
Season 1943 complete
Season 1942 complete
Season 1956 complete
Season 1981 complete
Season 1995 complete
Season 1983 complete
Season 1997 complete
Season 1968 complete
Season 1940 complete
Season 1954 complete
Season 1955 complete
Season 1941 complete
Season 1969 complete
Season 1996 complete
Season 1982 complete
Season 1986 complete
Season 1992 complete
Season 1945 complete
Season 1951 complete
Season 1979 complete
Season 1978 complete
Season 1950 complete
Season 1944 complete
Season 1993 complete
Season 1987 complete
Season 1991 complete
Season 1985 complete
Season 1952 complete
Season 1946 complete
Season 1947 complete
Season 1953 complete
Season 1984 complete
Season 1990 complete
Season 1934 complete
Season 1920 complete
Season 2018 complete
Season 2019 complete
Season 1921 complete
Season 1935 complete
Season 1923 complete
Season 1937 complete
Season 1936 complete
Season 1922 complete
Season 1926 complete
Season 1932 c

In [92]:
# One row per game: each element of _all_games is the dict returned for one game.
AB = pd.DataFrame(_all_games)

AB.head()

Unnamed: 0,date,home_team,road_team,is_doubleheader,is_tripleheader,road_starter,road_starter_AB,road_relief_AB,home_starter,home_starter_AB,home_relief_AB
0,1994-04-12,ATL,SFN,0,0,portm001,11,24,smolj001,32,3
1,1994-04-13,ATL,SFN,0,0,hickb001,24,19,avers001,20,18
2,1994-04-14,ATL,SFN,0,0,burkj001,24,7,maddg002,30,0
3,1994-04-18,ATL,SLN,0,0,sutcr001,13,19,avers001,19,11
4,1994-04-19,ATL,SLN,0,0,watsa001,23,11,maddg002,24,8


In [94]:
# Parse the dates and translate both team codes through retrosheet_codes; the
# translated columns end up last, under the original column names.
AB = (
    AB
    .assign(date = pd.to_datetime(AB.date, format = "%Y-%m-%d"),
            _home_team = AB.home_team.map(retrosheet_codes),
            _road_team = AB.road_team.map(retrosheet_codes))
    .drop(columns = ["home_team", "road_team"])
    .rename(columns = {"_home_team" : "home_team",
                       "_road_team" : "road_team"})
)

In [98]:
# Left join: every row of add_on is kept; rows with no match in AB get NaN in
# the columns coming from AB.
# NOTE(review): merge_cols is not defined in this part of the notebook — confirm
# it is set by an earlier cell before this one runs.
add_on = add_on.merge(AB, how = "left", left_on = merge_cols, right_on = merge_cols)

In [103]:
# Parse master's dates before joining; presumably so the date key has the same
# dtype as add_on.date — TODO confirm date is one of merge_cols.
master["date"] = pd.to_datetime(master.date, format = "%Y-%m-%d")

# Left join: every row of master is kept; rows without a match in add_on get NaN
# in the added columns.
master = master.merge(add_on, how = "left", left_on = merge_cols, right_on = merge_cols)

In [126]:
# Only home_batting_park_factor is kept from here on.
master = master.drop(columns = ["road_batting_park_factor"])

In [20]:
# Not idempotent: every run of this cell halves the column again.
master["home_batting_park_factor"] = master.home_batting_park_factor / 2

In [22]:
# Persist the merged frame as gzip-compressed CSV; index = False keeps the row
# index out of the file.
master.to_csv("./adv_metrics/master_raw.csv.gz", index = False, compression = "gzip")

# BEGIN UPDATED STAT COMPILING

In [3]:
master = pd.read_csv("./adv_metrics/master_raw.csv.gz", compression = "gzip")

# Use the full option key. The partial pattern "max.columns" is resolved by
# regex search and becomes ambiguous (OptionError) on pandas versions that also
# define styler.render.max_columns.
pd.set_option("display.max_columns", 200)

master.head()

Unnamed: 0,date,is_doubleheader,is_tripleheader,home_PA,home_AB,home_H,home_1B,home_2B,home_3B,home_HR,home_TB,home_BB,home_IBB,home_HBP,home_R,road_starter,road_starter_H,road_starter_HR,road_starter_BB,road_starter_IBB,road_starter_HBP,road_starter_K,road_starter_IP,road_starter_ER,road_relief_IP,road_relief_H,road_relief_HR,road_relief_BB,road_relief_IBB,road_relief_HBP,road_relief_K,road_relief_ER,road_PA,road_AB,road_H,road_1B,road_2B,road_3B,road_HR,road_TB,road_BB,road_IBB,road_HBP,road_R,home_starter,home_starter_H,home_starter_HR,home_starter_BB,home_starter_IBB,home_starter_HBP,home_starter_K,home_starter_IP,home_starter_ER,home_relief_IP,home_relief_H,home_relief_HR,home_relief_BB,home_relief_IBB,home_relief_HBP,home_relief_K,home_relief_ER,season,wOBA,wOBAScale,wBB,wHBP,w1B,w2B,w3B,wHR,runSB,runCS,R/PA,R/W,cFIP,home_team,road_team,home_batting_park_factor,home_SAC,road_SAC,home_league,road_league,home_league_PA,home_league_wRC,road_league_PA,road_league_wRC,road_starter_PA,road_starter_1B,road_starter_2B,road_starter_3B,road_starter_SAC,road_relief_PA,road_relief_1B,road_relief_2B,road_relief_3B,road_relief_SAC,home_starter_PA,home_starter_1B,home_starter_2B,home_starter_3B,home_starter_SAC,home_relief_PA,home_relief_1B,home_relief_2B,home_relief_3B,home_relief_SAC,road_starter_AB,road_relief_AB,home_starter_AB,home_relief_AB
0,1918-04-15,0.0,0.0,38,31,6,6,0,0,0,6,6,0,0,3,mogrg101,5,0,1,0,0,0,3.67,3,5.33,1,0,5,0,0,2,0,43,34,11,10,1,0,0,12,5,0,0,6,johnw102,11,0,5,0,0,2,9.0,4,0.0,0,0,0,0,0,0,0,1918,0.317,1.521,0.733,0.771,0.969,1.425,1.835,2.446,0.2,-0.342,0.097,8.417,2.229,MIN,NYY,1.96,1,3,AL,AL,34688,3628,34688,3628,17,5,0,0,0,21,1,0,0,1,43,10,1,0,3,0,0,0,0,0,16,15,34,0
1,1918-04-15,0.0,0.0,38,31,9,7,2,0,0,11,4,0,0,7,myere101,7,0,3,0,0,0,6.0,3,2.0,2,0,1,0,0,3,0,33,30,4,4,0,0,0,4,2,0,0,1,ruthb101,4,0,2,0,0,3,9.0,1,0.0,0,0,0,0,0,0,0,1918,0.317,1.521,0.733,0.771,0.969,1.425,1.835,2.446,0.2,-0.342,0.097,8.417,2.229,BOS,OAK,1.98,3,1,AL,AL,34688,3628,34688,3628,29,6,1,0,3,9,1,1,0,0,33,4,0,0,1,0,0,0,0,0,23,8,30,0
2,1918-04-16,0.0,0.0,35,33,10,8,1,1,0,13,2,0,0,5,ragap101,10,0,2,0,0,3,7.0,4,1.0,0,0,0,0,0,0,0,35,32,5,4,1,0,0,6,2,0,0,2,mayee101,5,0,2,0,0,6,9.0,1,0.0,0,0,0,0,0,0,0,1918,0.317,1.521,0.733,0.771,0.969,1.425,1.835,2.446,0.2,-0.342,0.097,8.417,2.229,PHI,ATL,2.14,0,1,NL,NL,34257,3458,34257,3458,32,8,1,1,0,3,0,0,0,0,35,4,1,0,1,0,0,0,0,0,30,3,32,0
3,1918-04-16,0.0,0.0,27,25,3,3,0,0,0,3,1,0,0,2,coopw101,3,0,1,0,0,0,7.0,0,1.0,0,0,0,0,0,0,0,32,27,1,0,1,0,0,2,5,0,0,0,schnp101,1,0,5,0,0,3,9.0,0,0.0,0,0,0,0,0,0,0,1918,0.317,1.521,0.733,0.771,0.969,1.425,1.835,2.446,0.2,-0.342,0.097,8.417,2.229,CIN,PIT,1.98,1,0,NL,NL,34257,3458,34257,3458,24,3,0,0,1,3,0,0,0,0,32,0,1,0,0,0,0,0,0,0,22,3,27,0
4,1918-04-16,0.0,0.0,31,27,9,4,2,3,0,17,1,0,1,4,alexg102,9,0,1,0,1,2,8.0,3,0.0,0,0,0,0,0,0,0,34,31,5,5,0,0,0,5,2,0,0,2,meadl101,5,0,2,0,0,1,9.0,0,0.0,0,0,0,0,0,0,0,1918,0.317,1.521,0.733,0.771,0.969,1.425,1.835,2.446,0.2,-0.342,0.097,8.417,2.229,STL,CHC,1.92,2,1,NL,NL,34257,3458,34257,3458,31,4,2,3,2,0,0,0,0,0,34,5,0,0,1,0,0,0,0,0,27,0,31,0


In [32]:
@dataclass
class AdvancedMetricsCreator():
    master: pd.core.frame.DataFrame
    n_jobs: int = None
        
    def recreate(self):
        
        all_results = []
        
        with ProcessPoolExecutor(max_workers = self.n_jobs) as executor:
            
            for k in self.master.index:
                
                if k > 0 and k % 10000 == 0:
                    
                    print("{} observations processed".format(str(k)))
                        
                prior = self.master.iloc[:k+1]
                
                all_results.append(executor.submit(self.calc_stats, prior, k))
                
        all_results = [i.result() for i in all_results]
        
        return(all_results)
              
    @staticmethod
    def calc_stats(prior_df: pd.core.frame.DataFrame, k: int):
        
        game_master= {}
        
        game_master["date"] = prior_df.iloc[k]["date"]
        
        game_master["is_doubleheader"] = prior_df.iloc[k]["is_doubleheader"]
        
        game_master["is_tripleheader"] = prior_df.iloc[k]["is_tripleheader"]
        
        season = prior_df.iloc[k]["season"]
        
        home_team = prior_df.iloc[k]["home_team"]
        
        road_team = prior_df.iloc[k]["road_team"]
        
        home_starter = prior_df.iloc[k]["home_starter"]
        
        road_starter = prior_df.iloc[k]["road_starter"]
        
        game_master["season"] = season
        
        game_master["home_team"] = home_team
        
        game_master["road_team"] = road_team
        
        game_master["home_starter"] = home_starter
        
        game_master["road_starter"] = road_starter
        
        prior_df = prior_df.drop(k, axis = 0)
        
        wOBA_weights = ["wBB", "wHBP", "w1B", "w2B", "w3B", "wHR"]

        wOBA_home = ["home_BB", "home_HBP", "home_1B", "home_2B", "home_3B", "home_HR"]

        wOBA_road = ["road_BB", "road_HBP", "road_1B", "road_2B", "road_3B", "road_HR"]

        denom_home = ["home_AB", "home_BB", "home_SAC", "home_HBP"]

        denom_road = ["road_AB", "road_BB", "road_SAC", "road_HBP"]

        home_IBB = "home_IBB"

        road_IBB = "road_IBB"
        
        wOBA_home_SP = ["home_starter_BB", "home_starter_HBP", "home_starter_1B", "home_starter_2B", 
                 "home_starter_3B", "home_starter_HR"]

        wOBA_road_SP = ["road_starter_BB", "road_starter_HBP", "road_starter_1B", "road_starter_2B", 
                     "road_starter_3B", "road_starter_HR"]

        wOBA_home_R = ["home_relief_BB", "home_relief_HBP", "home_relief_1B", "home_relief_2B", 
                     "home_relief_3B", "home_relief_HR"]

        wOBA_road_R = ["road_relief_BB", "road_relief_HBP", "road_relief_1B", "road_relief_2B", 
                     "road_relief_3B", "road_relief_HR"]

        denom_home_SP = ["home_starter_AB", "home_starter_BB", "home_starter_SAC", "home_starter_HBP"]

        denom_road_SP = ["road_starter_AB", "road_starter_BB", "road_starter_SAC", "road_starter_HBP"]

        denom_home_R = ["home_relief_AB", "home_relief_BB", "home_relief_SAC", "home_relief_HBP"]

        denom_road_R = ["road_relief_AB", "road_relief_BB", "road_relief_SAC", "road_relief_HBP"]

        home_IBB_SP = "home_starter_IBB"

        road_IBB_SP = "road_starter_IBB"

        home_IBB_R = "home_relief_IBB"

        road_IBB_R = "road_relief_IBB"
        
        OBP_home = ["home_H", "home_BB", "home_IBB", "home_HBP"]
        
        OBP_denomH = ["home_AB", "home_BB", "home_HBP", "home_SAC"]
        
        OBP_road = ["road_H", "road_BB", "road_IBB", "road_HBP"]
        
        OBP_denomR = ["road_AB", "road_BB", "road_HBP", "road_SAC"]
        
        SLG_home = ["home_1B", "home_2B", "home_3B", "home_HR"]
        
        SLG_road = ["road_1B", "road_2B", "road_3B", "road_HR"]
        
        SLG_mlt = np.array([1, 2, 3, 4])
        
        prefixes = ["home_", "road_"]
        
        for prefix in prefixes:
            
            if prefix == "home_":
                
                home_df = prior_df[(prior_df.home_team == home_team) & (prior_df.season == season)]
                
                road_df = prior_df[(prior_df.road_team == home_team) & (prior_df.season == season)]
                
                home_s_df = home_df[home_df.home_starter == home_starter]
                
                road_s_df = road_df[road_df.road_starter == home_starter]
                
                home_career = prior_df[prior_df.home_starter == home_starter]
                
                road_career = prior_df[prior_df.road_starter == home_starter]
                
            else:
                
                home_df = prior_df[(prior_df.home_team == road_team) & (prior_df.season == season)]
                
                road_df = prior_df[(prior_df.road_team == road_team) & (prior_df.season == season)]
                
                home_s_df = home_df[home_df.home_starter == road_starter]
                
                road_s_df = road_df[road_df.road_starter == road_starter]
                
                home_career = prior_df[prior_df.home_starter == road_starter]
                
                road_career = prior_df[prior_df.road_starter == road_starter]
        
            if len(home_df) != 0 or len(road_df) != 0:

                if len(home_df) != 0:
                    
                    park_factor = pd.concat([home_df.home_batting_park_factor, 
                                            road_df.home_batting_park_factor], axis = 0).mean()

                    wOBA = sum((home_df[wOBA_home].sum().values + road_df[wOBA_road].sum().values) * \
                                home_df[wOBA_weights].max().values) /\
                                (sum(home_df[denom_home].sum().values + road_df[denom_road].sum().values) -\
                                home_df[home_IBB].sum() - road_df[road_IBB].sum())

                    wRAA = ((wOBA - home_df.wOBA.max()) / home_df.wOBAScale.max()) * \
                            (home_df.home_PA.sum() + road_df.road_PA.sum())

                    wRC = ((((wRAA / (home_df.home_PA.sum() + road_df.road_PA.sum())) + home_df["R/PA"].max()) +\
                            (home_df["R/PA"].max() - (park_factor * home_df["R/PA"].max()))) /\
                            (home_df.home_league_wRC.max() / home_df.home_league_PA.max())) * 100
                    
                    if home_df.home_relief_IP.sum() + road_df.road_relief_IP.sum() == 0.:
                        
                        relief_wOBA = 0.
                        
                        relief_wRAA = 0.
                        
                        relief_wRC = 0.
                        
                    else:
                        
                        relief_wOBA = sum((home_df[wOBA_home_R].sum().values + road_df[wOBA_road_R].sum().values) * \
                                    home_df[wOBA_weights].max().values) /\
                                    (sum(home_df[denom_home_R].sum().values + road_df[denom_road_R].sum().values) -\
                                    home_df[home_IBB_R].sum() - road_df[road_IBB_R].sum())

                        relief_wRAA = ((relief_wOBA - home_df.wOBA.max()) / home_df.wOBAScale.max()) * \
                                      (home_df.home_relief_PA.sum() + road_df.road_relief_PA.sum())

                        relief_wRC = ((((relief_wRAA / (home_df.home_relief_PA.sum() +\
                                                        road_df.road_relief_PA.sum())) + home_df["R/PA"].max()) +\
                                    (home_df["R/PA"].max() - (park_factor * home_df["R/PA"].max()))) /\
                                    (home_df.home_league_wRC.max() / home_df.home_league_PA.max())) * 100
                    
                    if home_df.home_relief_IP.sum() + road_df.road_relief_IP.sum() == 0.:
                        
                        FIP = 0.
                        
                    else:
                    
                        FIP = (((13 * (home_df.home_relief_HR.sum() + road_df.road_relief_HR.sum())) +\
                              (3 * (home_df.home_relief_BB.sum() + home_df.home_relief_HBP.sum() +\
                                   home_df.home_relief_IBB.sum() + road_df.road_relief_BB.sum() +\
                                   road_df.road_relief_HBP.sum() + road_df.road_relief_IBB.sum())) -\
                              (2 * (home_df.home_relief_K.sum() + road_df.road_relief_K.sum()))) /\
                              (home_df.home_relief_IP.sum() + road_df.road_relief_IP.sum())) + \
                              home_df.cFIP.max()

                else:
                    
                    park_factor = road_df.home_batting_park_factor.mean()

                    wOBA = sum((home_df[wOBA_home].sum().values + road_df[wOBA_road].sum().values) * \
                                road_df[wOBA_weights].max().values) /\
                                (sum(home_df[denom_home].sum().values + road_df[denom_road].sum().values) -\
                                home_df[home_IBB].sum() - road_df[road_IBB].sum())

                    wRAA = ((wOBA - road_df.wOBA.max()) / road_df.wOBAScale.max()) * \
                            (home_df.home_PA.sum() + road_df.road_PA.sum())

                    wRC = ((((wRAA / (home_df.home_PA.sum() + road_df.road_PA.sum())) + road_df["R/PA"].max()) +\
                            (road_df["R/PA"].max() - (park_factor * road_df["R/PA"].max()))) /\
                            (road_df.road_league_wRC.max() / road_df.road_league_PA.max())) * 100
                    
                    if road_df.road_relief_IP.sum() == 0.:
                        
                        relief_wOBA = 0.
                        
                        relief_wRAA = 0.
                        
                        relief_wRC = 0.
                        
                    else:
                    
                        relief_wOBA = sum((home_df[wOBA_home_R].sum().values + road_df[wOBA_road_R].sum().values) * \
                                    road_df[wOBA_weights].max().values) /\
                                    (sum(home_df[denom_home_R].sum().values + road_df[denom_road_R].sum().values) -\
                                    home_df[home_IBB_R].sum() - road_df[road_IBB_R].sum())

                        relief_wRAA = ((relief_wOBA - road_df.wOBA.max()) / road_df.wOBAScale.max()) * \
                                      (home_df.home_relief_PA.sum() + road_df.road_relief_PA.sum())

                        relief_wRC = ((((relief_wRAA / (home_df.home_relief_PA.sum() +\
                                                        road_df.road_relief_PA.sum())) + road_df["R/PA"].max()) +\
                                    (road_df["R/PA"].max() - (park_factor * road_df["R/PA"].max()))) /\
                                    (road_df.road_league_wRC.max() / road_df.road_league_PA.max())) * 100
                    
                    if road_df.road_relief_IP.sum() == 0.:
                        
                        FIP = 0.
                        
                    else:
                    
                        FIP = (((13 * (home_df.home_relief_HR.sum() + road_df.road_relief_HR.sum())) +\
                              (3 * (home_df.home_relief_BB.sum() + home_df.home_relief_HBP.sum() +\
                                   home_df.home_relief_IBB.sum() + road_df.road_relief_BB.sum() +\
                                   road_df.road_relief_HBP.sum() + road_df.road_relief_IBB.sum())) -\
                              (2 * (home_df.home_relief_K.sum() + road_df.road_relief_K.sum()))) /\
                              (home_df.home_relief_IP.sum() + road_df.road_relief_IP.sum())) + \
                              road_df.cFIP.max()

                OBP = sum(home_df[OBP_home].sum().values + road_df[OBP_road].sum().values) /\
                      sum(home_df[OBP_denomH].sum().values + road_df[OBP_denomR].sum().values)

                SLG = (sum(home_df[SLG_home].sum().values * SLG_mlt) +\
                       sum(road_df[SLG_road].sum().values * SLG_mlt)) / (home_df["home_AB"].sum() +\
                                                                        road_df["road_AB"].sum())
                
                if home_df.home_relief_IP.sum() + road_df.road_relief_IP.sum() == 0.:
                    
                    WHIP = 0.
                    
                    ERA = 0.
                    
                    K_9 = 0.
                    
                else:
                
                    WHIP = (home_df.home_relief_H.sum() + home_df.home_relief_BB.sum() +\
                           home_df.home_relief_IBB.sum() + road_df.road_relief_H.sum() +\
                           road_df.road_relief_BB.sum() + road_df.road_relief_IBB.sum()) /\
                           (home_df.home_relief_IP.sum() + road_df.road_relief_IP.sum())
                
                    ERA = ((home_df.home_relief_ER.sum() + road_df.road_relief_ER.sum()) /\
                          (home_df.home_relief_IP.sum() + road_df.road_relief_IP.sum())) * 9
                    
                    K_9 = ((home_df.home_relief_K.sum() + road_df.road_relief_K.sum()) /\
                          (home_df.home_relief_IP.sum() + road_df.road_relief_IP.sum())) * 9
                
                if home_df.home_relief_BB.sum() + road_df.road_relief_BB.sum() == 0:
                    
                    K_BB = home_df.home_relief_K.sum() + road_df.road_relief_K.sum()
                    
                else:
                
                    K_BB = (home_df.home_relief_K.sum() + road_df.road_relief_K.sum()) /\
                           (home_df.home_relief_BB.sum() + road_df.road_relief_BB.sum())
                
                game_master[prefix + "wOBA"] = wOBA

                game_master[prefix + "wRAA"] = wRAA

                game_master[prefix + "wRC"] = wRC

                game_master[prefix + "OPS"] = OBP + SLG
                
                game_master[prefix + "relief_wOBA"] = relief_wOBA
                
                game_master[prefix + "relief_wRAA"] = relief_wRAA
                
                game_master[prefix + "relief_wRC"] = relief_wRC
                
                game_master[prefix + "relief_FIP"] = FIP
                
                game_master[prefix + "relief_WHIP"] = WHIP
                
                game_master[prefix + "relief_ERA"] = ERA
                
                game_master[prefix + "relief_K_BB"] = K_BB
                
                game_master[prefix + "relief_K_9"] = K_9

            else:

                game_master[prefix + "wOBA"] = 0.

                game_master[prefix + "wRAA"] = 0.

                game_master[prefix + "wRC"] = 0.

                game_master[prefix + "OPS"] = 0.
                
                game_master[prefix + "relief_wOBA"] = 0.
                
                game_master[prefix + "relief_wRAA"] = 0.
                
                game_master[prefix + "relief_wRC"] = 0. 
                
                game_master[prefix + "relief_FIP"] = 0.
                
                game_master[prefix + "relief_WHIP"] = 0.
                
                game_master[prefix + "relief_ERA"] = 0.
                
                game_master[prefix + "relief_K_BB"] = 0.
                
                game_master[prefix + "relief_K_9"] = 0.
                
            if len(home_career) != 0 or len(road_career) != 0:
                
                if len(home_career) != 0:
                    
                    cpark_factor = pd.concat([home_career.home_batting_park_factor, 
                                            road_career.home_batting_park_factor], axis = 0).mean()
                    
                    cwOBA_avg = pd.concat([home_career.wOBA, road_career.wOBA], axis = 0).mean()
                    
                    cwOBA_scale = pd.concat([home_career.wOBAScale, road_career.wOBAScale],
                                            axis = 0).mean()
                    
                    cRPA = pd.concat([home_career["R/PA"], road_career["R/PA"]], axis = 0).mean()
                    
                    cleague_wRC = pd.concat([home_career.home_league_wRC, road_career.road_league_wRC],
                                           axis = 0).mean()
                    
                    cleague_PA = pd.concat([home_career.home_league_PA, road_career.road_league_PA],
                                          axis = 0).mean()
                    
                    cwOBA = sum((home_career[wOBA_home_SP].sum().values + road_career[wOBA_road_SP].sum().values) * \
                                home_career[wOBA_weights].max().values) /\
                            (sum(home_career[denom_home_SP].sum().values +road_career[denom_road_SP].sum().values)\
                                 - home_career[home_IBB_SP].sum() - road_career[road_IBB_SP].sum())

                    cwRAA = ((cwOBA - cwOBA_avg) / cwOBA_scale) * \
                            (home_career.home_starter_PA.sum() + road_career.road_starter_PA.sum())

                    cwRC = ((((cwRAA / (home_career.home_starter_PA.sum() +\
                                        road_career.road_starter_PA.sum())) + cRPA) +\
                            (cRPA - (cpark_factor * cRPA))) /\
                            (cleague_wRC / cleague_PA)) * 100
                    
                    if home_career.home_starter_IP.sum() + road_career.road_starter_IP.sum() == 0.:
                        
                        caFIP = "inf"
                        
                    else:
                    
                        caFIP = (((13 * (home_career.home_starter_HR.sum() + road_career.road_starter_HR.sum())) +\
                              (3 * (home_career.home_starter_BB.sum() + home_career.home_starter_HBP.sum() +\
                                   home_career.home_starter_IBB.sum() + road_career.road_starter_BB.sum() +\
                                   road_career.road_starter_HBP.sum() + road_career.road_starter_IBB.sum())) -\
                              (2 * (home_career.home_starter_K.sum() + road_career.road_starter_K.sum()))) /\
                              (home_career.home_starter_IP.sum() + road_career.road_starter_IP.sum())) + \
                              home_career.cFIP.mean()
    
                else:
            
                    cpark_factor = road_career.home_batting_park_factor.mean()
                    
                    cwOBA_avg = road_career.wOBA.mean()
                    
                    cwOBA_scale = road_career.wOBAScale.mean()
                    
                    cRPA = road_career["R/PA"].mean()
                    
                    cleague_wRC = road_career.road_league_wRC.mean()
                    
                    cleague_PA = road_career.road_league_PA.mean()
                    
                    cwOBA = sum((home_career[wOBA_home_SP].sum().values + road_career[wOBA_road_SP].sum().values) * \
                                road_career[wOBA_weights].max().values) /\
                            (sum(home_career[denom_home_SP].sum().values +road_career[denom_road_SP].sum().values)\
                                 - home_career[home_IBB_SP].sum() - road_career[road_IBB_SP].sum())

                    cwRAA = ((cwOBA - cwOBA_avg) / cwOBA_scale) * \
                            (home_career.home_starter_PA.sum() + road_career.road_starter_PA.sum())

                    cwRC = ((((cwRAA / (home_career.home_starter_PA.sum() +\
                                        road_career.road_starter_PA.sum())) + cRPA) +\
                            (cRPA - (cpark_factor * cRPA))) /\
                            (cleague_wRC / cleague_PA)) * 100
        
                    if road_career.road_starter_IP.sum() == 0.:
                        
                        caFIP = "inf"
                    
                    else:
                    
                        caFIP = (((13 * (home_career.home_starter_HR.sum() + road_career.road_starter_HR.sum())) +\
                              (3 * (home_career.home_starter_BB.sum() + home_career.home_starter_HBP.sum() +\
                                   home_career.home_starter_IBB.sum() + road_career.road_starter_BB.sum() +\
                                   road_career.road_starter_HBP.sum() + road_career.road_starter_IBB.sum())) -\
                              (2 * (home_career.home_starter_K.sum() + road_career.road_starter_K.sum()))) /\
                              (home_career.home_starter_IP.sum() + road_career.road_starter_IP.sum())) + \
                              road_career.cFIP.mean()
                
                if home_career.home_starter_IP.sum() + road_career.road_starter_IP.sum() == 0.:
                    
                    caWHIP = "inf"
                    
                    caERA = "inf"
                    
                    caK_9 = 0.
                    
                    caAVG_IP = 0.
                    
                else:
                    
                    caWHIP = (home_career.home_starter_H.sum() + home_career.home_starter_BB.sum() +\
                           home_career.home_starter_IBB.sum() + road_career.road_starter_H.sum() +\
                           road_career.road_starter_BB.sum() + road_career.road_starter_IBB.sum()) /\
                           (home_career.home_starter_IP.sum() + road_career.road_starter_IP.sum())

                    caERA = ((home_career.home_starter_ER.sum() + road_career.road_starter_ER.sum()) /\
                          (home_career.home_starter_IP.sum() + road_career.road_starter_IP.sum())) * 9
                    
                    caK_9 = ((home_career.home_starter_K.sum() + road_career.road_starter_K.sum()) /\
                          (home_career.home_starter_IP.sum() + road_career.road_starter_IP.sum())) * 9
                    
                    caAVG_IP = (home_career.home_starter_IP.sum() + road_career.road_starter_IP.sum()) /\
                                (len(home_career) + len(road_career))
                
                if home_career.home_starter_BB.sum() + road_career.road_starter_BB.sum() == 0:
                    
                    caK_BB = home_career.home_starter_K.sum() + road_career.road_starter_K.sum()
                    
                else:
                    caK_BB = (home_career.home_starter_K.sum() + road_career.road_starter_K.sum()) /\
                             (home_career.home_starter_BB.sum() + road_career.road_starter_BB.sum())
                 
                game_master[prefix + "starter_career_wOBA"] = cwOBA
                
                game_master[prefix + "starter_career_wRAA"] = cwRAA
                
                game_master[prefix + "starter_career_wRC"] = cwRC
                
                game_master[prefix + "starter_career_FIP"] = caFIP
                
                game_master[prefix + "starter_career_WHIP"] = caWHIP
                
                game_master[prefix + "starter_career_ERA"] = caERA
                
                game_master[prefix + "starter_careerK/BB"] = caK_BB
                
                game_master[prefix + "starter_careerK/9"] = caK_9
                
                game_master[prefix + "starter_career_AVGIP"] = caAVG_IP
                
            else:
                
                game_master[prefix + "starter_career_wOBA"] = 0.
                
                game_master[prefix + "starter_career_wRAA"] = 0.
                
                game_master[prefix + "starter_career_wRC"] = 0.
                
                game_master[prefix + "starter_career_FIP"] = 0.
                
                game_master[prefix + "starter_career_WHIP"] = 0.
                
                game_master[prefix + "starter_career_ERA"] = 0.
                
                game_master[prefix + "starter_careerK/BB"] = 0.
                
                game_master[prefix + "starter_careerK/9"] = 0.
                
                game_master[prefix + "starter_career_AVGIP"] = 0.
                
            if len(home_s_df) != 0 or len(road_s_df) != 0:
                
                if len(home_s_df) != 0:
                    
                    spark_factor = pd.concat([home_s_df.home_batting_park_factor, 
                                            road_s_df.home_batting_park_factor], axis = 0).mean()
                    
                    swOBA = sum((home_s_df[wOBA_home_SP].sum().values + road_s_df[wOBA_road_SP].sum().values) * \
                                home_s_df[wOBA_weights].max().values) /\
                            (sum(home_s_df[denom_home_SP].sum().values +road_s_df[denom_road_SP].sum().values)\
                                 - home_s_df[home_IBB_SP].sum() - road_s_df[road_IBB_SP].sum())

                    swRAA = ((swOBA - home_s_df.wOBA.max()) / home_s_df.wOBAScale.max()) * \
                            (home_s_df.home_starter_PA.sum() + road_s_df.road_starter_PA.sum())

                    swRC = ((((swRAA / (home_s_df.home_starter_PA.sum() +\
                                        road_s_df.road_starter_PA.sum())) + home_s_df["R/PA"].max()) +\
                            (home_s_df["R/PA"].max() - (spark_factor * home_s_df["R/PA"].max()))) /\
                            (home_s_df.home_league_wRC.max() / home_s_df.home_league_PA.max())) * 100
                    
                    if home_s_df.home_starter_IP.sum() + road_s_df.road_starter_IP.sum() == 0.:
                        
                        sFIP = "inf"
                        
                    else:
                    
                        sFIP = (((13 * (home_s_df.home_starter_HR.sum() + road_s_df.road_starter_HR.sum())) +\
                              (3 * (home_s_df.home_starter_BB.sum() + home_s_df.home_starter_HBP.sum() +\
                                   home_s_df.home_starter_IBB.sum() + road_s_df.road_starter_BB.sum() +\
                                   road_s_df.road_starter_HBP.sum() + road_s_df.road_starter_IBB.sum())) -\
                              (2 * (home_s_df.home_starter_K.sum() + road_s_df.road_starter_K.sum()))) /\
                              (home_s_df.home_starter_IP.sum() + road_s_df.road_starter_IP.sum())) + \
                              home_s_df.cFIP.max()
                
                else:
                    
                    spark_factor = road_s_df.home_batting_park_factor.mean()
                    
                    swOBA = sum((home_s_df[wOBA_home_SP].sum().values + road_s_df[wOBA_road_SP].sum().values) * \
                                road_s_df[wOBA_weights].max().values) /\
                            (sum(home_s_df[denom_home_SP].sum().values +road_s_df[denom_road_SP].sum().values)\
                                 - home_s_df[home_IBB_SP].sum() - road_s_df[road_IBB_SP].sum())

                    swRAA = ((swOBA - road_s_df.wOBA.max()) / road_s_df.wOBAScale.max()) * \
                            (home_s_df.home_starter_PA.sum() + road_s_df.road_starter_PA.sum())

                    swRC = ((((swRAA / (home_s_df.home_starter_PA.sum() +\
                                        road_s_df.road_starter_PA.sum())) + road_s_df["R/PA"].max()) +\
                            (road_s_df["R/PA"].max() - (spark_factor * road_s_df["R/PA"].max()))) /\
                            (road_s_df.road_league_wRC.max() / road_s_df.road_league_PA.max())) * 100
                    
                    if road_s_df.road_starter_IP.sum() == 0.:
                        
                        sFIP = "inf"
                        
                    else:
                    
                        sFIP = (((13 * (home_s_df.home_starter_HR.sum() + road_s_df.road_starter_HR.sum())) +\
                              (3 * (home_s_df.home_starter_BB.sum() + home_s_df.home_starter_HBP.sum() +\
                                   home_s_df.home_starter_IBB.sum() + road_s_df.road_starter_BB.sum() +\
                                   road_s_df.road_starter_HBP.sum() + road_s_df.road_starter_IBB.sum())) -\
                              (2 * (home_s_df.home_starter_K.sum() + road_s_df.road_starter_K.sum()))) /\
                              (home_s_df.home_starter_IP.sum() + road_s_df.road_starter_IP.sum())) + \
                              road_s_df.cFIP.max()
                    
                if home_s_df.home_starter_IP.sum() + road_s_df.road_starter_IP.sum() == 0.:
                    
                    sWHIP = "inf"
                    
                    sERA = "inf"
                    
                    sK_9 = 0.
                    
                    sAVG_IP = 0.
                    
                else:
                
                    sWHIP = (home_s_df.home_starter_H.sum() + home_s_df.home_starter_BB.sum() +\
                           home_s_df.home_starter_IBB.sum() + road_s_df.road_starter_H.sum() +\
                           road_s_df.road_starter_BB.sum() + road_s_df.road_starter_IBB.sum()) /\
                           (home_s_df.home_starter_IP.sum() + road_s_df.road_starter_IP.sum())

                    sERA = ((home_s_df.home_starter_ER.sum() + road_s_df.road_starter_ER.sum()) /\
                          (home_s_df.home_starter_IP.sum() + road_s_df.road_starter_IP.sum())) * 9
                    
                    sK_9 = ((home_s_df.home_starter_K.sum() + road_s_df.road_starter_K.sum()) /\
                      (home_s_df.home_starter_IP.sum() + road_s_df.road_starter_IP.sum())) * 9
                    
                    sAVG_IP = (home_s_df.home_starter_IP.sum() + road_s_df.road_starter_IP.sum()) /\
                              (len(home_s_df) + len(road_s_df))
                
                if home_s_df.home_starter_BB.sum() + road_s_df.road_starter_BB.sum() == 0:
                    
                    sK_BB = home_s_df.home_starter_K.sum() + road_s_df.road_starter_K.sum()
                
                else:
                    sK_BB = (home_s_df.home_starter_K.sum() + road_s_df.road_starter_K.sum()) /\
                           (home_s_df.home_starter_BB.sum() + road_s_df.road_starter_BB.sum())
                
                game_master[prefix + "starter_season_wOBA"] = swOBA
                
                game_master[prefix + "starter_season_wRAA"] = swRAA
                
                game_master[prefix + "starter_season_wRC"] = swRC
                
                game_master[prefix + "starter_season_FIP"] = sFIP
                
                game_master[prefix + "starter_season_WHIP"] = sWHIP
                
                game_master[prefix + "starter_season_ERA"] = sERA
                
                game_master[prefix + "starter_seasonK/BB"] = sK_BB
                
                game_master[prefix + "starter_seasonK/9"] = sK_9
                
                game_master[prefix + "starter_seasonAVG_IP"] = sAVG_IP
                
            else:
                
                game_master[prefix + "starter_season_wOBA"] = 0.
                
                game_master[prefix + "starter_season_wRAA"] = 0.
                
                game_master[prefix + "starter_season_wRC"] = 0.
                
                game_master[prefix + "starter_season_FIP"] = 0.
                
                game_master[prefix + "starter_season_WHIP"] = 0.
                
                game_master[prefix + "starter_season_ERA"] = 0.
                
                game_master[prefix + "starter_seasonK/BB"] = 0.
                
                game_master[prefix + "starter_seasonK/9"] = 0.
                
                game_master[prefix + "starter_seasonAVG_IP"] = 0.
                        
        return(game_master)

In [38]:
# Flat accumulator for whatever AdvancedMetricsCreator.recreate() returns
# for each season; it is turned into a DataFrame in a later cell.
master_list = list()

# Handle one season per iteration, in the order Series.unique() yields them.
for season in master["season"].unique():

    # Restrict to this season's rows and renumber them from 0 so the
    # creator receives a frame with a clean positional index.
    all_season = master.loc[master["season"] == season].reset_index(drop = True)

    AMC = AdvancedMetricsCreator(all_season)
    season_results = AMC.recreate()

    # Append this season's results to the running collection (in place).
    master_list.extend(season_results)

    print("All data compiled for {}".format(season))

All data compiled for 1918
All data compiled for 1919
All data compiled for 1920
All data compiled for 1921
All data compiled for 1922
All data compiled for 1923
All data compiled for 1924
All data compiled for 1925
All data compiled for 1926
All data compiled for 1927
All data compiled for 1928
All data compiled for 1929
All data compiled for 1930
All data compiled for 1931
All data compiled for 1932
All data compiled for 1933
All data compiled for 1934
All data compiled for 1935
All data compiled for 1936
All data compiled for 1937




All data compiled for 1938
All data compiled for 1939
All data compiled for 1940
All data compiled for 1941
All data compiled for 1942
All data compiled for 1943
All data compiled for 1944
All data compiled for 1945
All data compiled for 1946
All data compiled for 1947
All data compiled for 1948
All data compiled for 1949
All data compiled for 1950
All data compiled for 1951
All data compiled for 1952
All data compiled for 1953
All data compiled for 1954
All data compiled for 1955




All data compiled for 1956
All data compiled for 1957
All data compiled for 1958
All data compiled for 1959
All data compiled for 1960
All data compiled for 1961
All data compiled for 1962
All data compiled for 1963
All data compiled for 1964
All data compiled for 1965
All data compiled for 1966
All data compiled for 1967
All data compiled for 1968
All data compiled for 1969
All data compiled for 1970
All data compiled for 1971




All data compiled for 1972
All data compiled for 1973
All data compiled for 1974
All data compiled for 1975
All data compiled for 1976
All data compiled for 1977
All data compiled for 1978
All data compiled for 1979
All data compiled for 1980
All data compiled for 1981
All data compiled for 1982
All data compiled for 1983
All data compiled for 1984
All data compiled for 1985
All data compiled for 1986
All data compiled for 1987
All data compiled for 1988
All data compiled for 1989
All data compiled for 1990
All data compiled for 1991
All data compiled for 1992
All data compiled for 1993
All data compiled for 1994
All data compiled for 1995
All data compiled for 1996
All data compiled for 1997
All data compiled for 1998
All data compiled for 1999
All data compiled for 2000
All data compiled for 2001
All data compiled for 2002
All data compiled for 2003
All data compiled for 2004
All data compiled for 2005
All data compiled for 2006
All data compiled for 2007
All data compiled for 2008
A

In [40]:
# Build a single DataFrame from the results accumulated in `master_list`.
compiled_df = pd.DataFrame(data = master_list)

In [44]:
# Write the compiled frame to a gzip-compressed CSV, leaving out the index column.
compiled_df.to_csv(
    "./adv_metrics/compiled_unstable.csv.gz",
    compression = "gzip",
    index = False,
)