# Processing statcast pitch/hit/distance data
#### Christopher Callahan
#### Christopher.W.Callahan.GR@dartmouth.edu

#### Mechanics
Dependencies

In [1]:
import xarray as xr
import numpy as np
import sys
import os
import datetime
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap, cm
from matplotlib import rcParams
import matplotlib.gridspec as gridspec
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols as reg

Data locations

In [2]:
# Directory the per-year Statcast CSV exports are read from
loc_in = "../Data/Statcast/"
# Directory of the game-log/climate panel; the merged output is also written here
loc_panel = "../Data/Panel/"

#### Analysis

Years of Statcast data to process

In [3]:
# Seasons to process, inclusive on both ends.
y1 = 2015  # first season used: there's no distance data before then
y2 = 2019
years = np.arange(y1, y2 + 1)

For each year read in data

In [4]:
# Columns kept from every Statcast export.
cols = ["pitch_type","game_date","events","home_team","away_team",
        "bb_type","vx0","vy0","vz0","ax","ay","az","hit_distance_sc",
        "launch_speed","launch_angle","release_spin_rate","game_pk"]

# File-name stems of the three per-year exports, in the order they are stacked
# within each year (home runs, then 1B/2B/3B, then outs).
statcast_file_stems = ["statcast_homeruns_","statcast_1b2b3b_","statcast_outs_"]

# Collect every piece first and concatenate once: calling pd.concat inside the
# loop re-copies all previously stacked rows on each iteration.
statcast_pieces = []
for yy in years:
    print(yy)

    for stem in statcast_file_stems:
        statcast_yr = pd.read_csv(loc_in+stem+str(yy)+".csv")
        # Reverse the file's row order and keep only the columns of interest.
        statcast_pieces.append(statcast_yr.loc[::-1,cols])

# ignore_index=True gives the stacked frame a single clean 0..N-1 index instead
# of repeating each piece's own 0..n-1 labels.
distance_data = pd.concat(statcast_pieces, ignore_index=True)

2015
2016
2017
2018
2019


In [5]:
# Average the numeric pitch/batted-ball measurements within each
# date / matchup / event / batted-ball-type group.
gby = ["game_date","home_team","away_team","events","bb_type"]
# numeric_only=True skips the remaining string column (pitch_type); without it
# pandas >= 2.0 raises a TypeError when trying to average strings, while older
# versions silently dropped such columns, so the result is unchanged there.
mean_distance_data = distance_data.groupby(gby).mean(numeric_only=True).reset_index()

Now read in the gamelogs panel so we can use home/away/date to determine which park each game was played in

In [6]:
# Year span that appears in the panel file's name.
y1_panel, y2_panel = 1954, 2019
panel_path = loc_panel+"baseball_climate_data_"+str(y1_panel)+"-"+str(y2_panel)+".csv"
data = pd.read_csv(panel_path)

In [7]:
# Note on parks: the 2016-07-03 game (ATL as home team vs. MIA) was played at
# Fort Bragg Field, but there is no distance data for it.

In [8]:
# Panel columns carried forward for the merge with the batted-ball data.
tx_cols = [
    "date", "visitor", "home", "daynight", "park_id",
    "tmax_hadisd", "windspeed_hadisd", "dome", "dome_status",
]
# Keep only panel rows whose year falls inside the y1..y2 window (inclusive),
# then renumber the rows from zero.
in_year_window = (data.year.values >= y1) & (data.year.values <= y2)
tx_data = data.loc[in_year_window, tx_cols].reset_index(drop=True)

synchronize team codes

In [9]:
# Distinct home-team codes as spelled in the panel ...
data_team_codes = np.unique(tx_data["home"].values)
# ... and as spelled in the stacked batted-ball data.
distance_team_codes = np.unique(distance_data["home_team"].values)

In [10]:
# List the panel team codes that never appear as a home team in the
# batted-ball data, then report how many there are.
# Membership is checked against the unique codes already computed instead of
# scanning the full home_team column once per panel code.
known_distance_codes = set(distance_team_codes)
unmatched_codes = [code for code in data_team_codes if code not in known_distance_codes]
for code in unmatched_codes:
    print(code)
count = len(unmatched_codes)
print(count)

ANA
CHA
CHN
KCA
LAN
NYA
NYN
SDN
SFN
SLN
TBA
WAS
12


dictionary for team codes that don't match

In [11]:
# Panel team code -> batted-ball-data team code, for codes that differ.
# Only covers teams that exist in the 2015-2019 time frame
# (e.g., ignoring brooklyn dodgers, montreal expos, etc.).
team_code_mapping = {
    "ANA": "LAA",
    "CAL": "LAA",
    "CHA": "CWS",
    "CHN": "CHC",
    "FLO": "MIA",
    "KCA": "KC",
    "LAN": "LAD",
    "NYA": "NYY",
    "NYN": "NYM",
    "SDN": "SD",
    "SFN": "SF",
    "SLN": "STL",
    "TBA": "TB",
    "WAS": "WSH",
}

In [12]:
# Create the translated team-code columns, pre-filled with the placeholder
# string "NAN" on every row.
for team_col in ("away_team", "home_team"):
    tx_data[team_col] = np.full(len(tx_data), "NAN")

In [13]:
# Translate the panel's visitor/home codes into the codes used by the
# batted-ball data. Whole columns are mapped at once; locating each row with a
# boolean mask over the full frame is quadratic in the number of games and
# only works while the index is exactly 0..n-1.
statcast_team_codes = set(distance_team_codes)


def _to_statcast_code(code):
    """Return `code` unchanged if the batted-ball data already uses it,
    otherwise look it up in `team_code_mapping`.

    Raises KeyError for a code that is in neither, so an unmapped team
    surfaces immediately instead of passing through silently.
    """
    if code in statcast_team_codes:
        return code
    return team_code_mapping[code]


tx_data["away_team"] = tx_data["visitor"].map(_to_statcast_code)
tx_data["home_team"] = tx_data["home"].map(_to_statcast_code)

create date column in tx_data

In [14]:
# Slice the text form of each date into year / month / day strings.
# The slice positions assume the date reads as YYYYMMDD - TODO confirm.
date_text = [str(d) for d in tx_data.date.values]
tx_data["year"] = [txt[0:4] for txt in date_text]
tx_data["month"] = [txt[4:6] for txt in date_text]
tx_data["day"] = [txt[6:8] for txt in date_text]

In [15]:
# Assemble a "year-month-day" string, the same key name (game_date) the
# batted-ball data uses, then derive the day of the year from it.
year_part, month_part, day_part = tx_data["year"], tx_data["month"], tx_data["day"]
tx_data["game_date"] = year_part+"-"+month_part+"-"+day_part
tx_data["dayofyear"] = [pd.Period(date_str).dayofyear for date_str in tx_data["game_date"].values]

In [16]:
# Panel columns to attach to every batted-ball row (includes the merge keys).
tx_cols_final = [
    "daynight", "park_id", "tmax_hadisd", "windspeed_hadisd",
    "away_team", "home_team", "game_date", "year", "dayofyear",
    "dome", "dome_status",
]
# Left merge: every batted-ball row is kept, and rows with no matching game in
# the panel get missing values in the panel columns.
# NOTE(review): if tx_data holds more than one row for the same
# (game_date, home_team, away_team) - possibly doubleheaders - each matching
# batted-ball row is duplicated once per panel row; confirm against the panel.
merge_keys = ["game_date","home_team","away_team"]
distance_data_final = distance_data.merge(tx_data[tx_cols_final], on=merge_keys, how="left")

Write out!

In [17]:
# Write the merged panel next to the input panel; the year span is part of the name.
panel_out_path = loc_panel+"statcast_battedball_panel_"+str(y1)+"-"+str(y2)+".csv"
distance_data_final.to_csv(panel_out_path)