# Process temperature data from Retrosheet game files
#### Christopher Callahan
#### Christopher.W.Callahan.GR@dartmouth.edu

#### Mechanics
Dependencies

In [1]:
import xarray as xr
import numpy as np
import sys
import os
import datetime
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap, cm
from matplotlib import rcParams
import matplotlib.gridspec as gridspec
import seaborn as sns
from sklearn import linear_model
import statsmodels.api as sm
import statsmodels.formula.api as smf

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_Gram=True, verbose=0,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes

Locations

In [2]:
# Data directories, given relative to the notebook's working directory.
# Root data folder.
loc_otherbaseball = "../Data/"
# Game-log CSVs: the input panel is read from here and the output is written here.
loc_gamelogs = "../Data/RetrosheetGameLogs/"
# Event files, organised in one "<decade>seve/" sub-folder per decade.
loc_eventfiles = "../Data/RetrosheetEventFiles/"

Years

In [3]:
# First and last year that appear in the name of the input game-log file.
y1_in, y2_in = 1954, 2019
# First and last year to process; also used in the name of the output file.
y1, y2 = 1954, 2019

#### Analysis

Read in game logs

In [4]:
# Game logs for the full input period; the first CSV column is used as the index.
gamelogs_path = loc_gamelogs+"GameLogs_"+str(y1_in)+"-"+str(y2_in)+".csv"
gamelogs = pd.read_csv(gamelogs_path, index_col=0)

In [5]:
%%capture
# Keep the games from year y1 onward. The explicit .copy() makes `panel` an
# independent DataFrame, so adding columns below neither triggers pandas'
# SettingWithCopyWarning nor risks writing into (or silently missing) `gamelogs`.
panel = gamelogs.loc[gamelogs["year"].values>=y1,:].copy()

# Columns filled in from the event files later on; NaN means "no value found".
panel["temp_retrosheet_f"] = np.nan
panel["dome_status"] = np.nan

Get unique list of home teams and years

In [6]:
# Every year from y1 through y2, inclusive.
years = np.arange(y1, y2+1)

# Distinct home-team codes present in the panel (order is whatever the set yields).
unique_home_codes = set(panel["home"].values)
hometeams = np.array(list(unique_home_codes))

Now loop through teams and years and get temps for each game 

In [7]:
def _first_info_value(header_lines, field):
    """Return the value of the first ``info,<field>`` record in ``header_lines``.

    Parameters
    ----------
    header_lines : list of str
        Event-file lines to search, in file order.
    field : str
        Name of the info record to look for (e.g. "temp" or "sky").

    Returns
    -------
    str or None
        The third comma-separated token of the first line that contains
        ``"info," + field``, or None when no line contains it.
    """
    tag = "info,"+field
    for line in header_lines:
        if tag in line:
            return line.split(",")[2]
    return None


# Number of event-file lines, counted from a game's "id," line, that are
# searched for that game's "info,temp" and "info,sky" records.
n_header_lines = 35

# Sky values that are recorded as "not a dome" (dome_status = 0).
open_air_skies = ["cloudy","night","overcast","sunny"]

# Pull the columns used for matching out of the panel once; the loop below
# only writes to "temp_retrosheet_f" and "dome_status", never to these.
panel_home = panel["home"].values
panel_years = panel["year"].values
panel_months = panel["month"].values
panel_days = panel["day"].values

for yy in years:
    print(yy)

    # Event files live in one folder per decade, e.g. "1950seve/" for 1954.
    decade = str(yy - np.mod(yy,10))
    folder = loc_eventfiles+decade+"seve/"
    # The folder only depends on the year, so list it once instead of once per team.
    folder_files = os.listdir(folder)

    for hh in hometeams:

        # Find the event file for this team and year; skip the pair if there is none.
        teamyear = str(yy)+str(hh)
        filename = [f for f in folder_files if teamyear in f]
        if len(filename) == 0:
            continue

        # Each line of the event file becomes one string (first tab-separated column).
        eventfile = pd.read_csv(folder+filename[0],sep="\t",header=None)[0]
        # Convert to a plain list once per file; previously this conversion was
        # repeated for every single game just to locate its "id," line.
        event_lines = list(eventfile)

        # Game ids are the second token of every line containing "id,<team><year>".
        game_tag = "id,"+str(hh)+str(yy)
        gameids = [line.split(",")[1] for line in event_lines if game_tag in line]

        # Rows of the panel for this home team and year (same for every game below).
        teamyear_rows = (panel_home == hh) & (panel_years == yy)

        # Values are added game by game since it's not guaranteed that each
        # game has a temperature (or sky) record associated with it.
        for gameid in gameids:
            game_index = event_lines.index("id,"+gameid)
            header = event_lines[game_index:game_index+n_header_lines]

            # Characters 7-8 and 9-10 of the game id are read as month and day.
            game_month = int(gameid[7:9])
            game_day = int(gameid[9:11])
            # NOTE(review): rows are matched on home team and calendar date only.
            # If the panel holds several rows for the same home team and date,
            # all of them receive the values of whichever game id is processed
            # last -- confirm whether that is intended.
            gamedate_indices = teamyear_rows & (panel_months == game_month) & (panel_days == game_day)

            # Temperature: missing record, "unknown", or a recorded 0 all become NaN.
            tsplit = _first_info_value(header, "temp")
            if tsplit is not None and tsplit != "unknown":
                temp = float(tsplit)
                panel.loc[gamedate_indices,"temp_retrosheet_f"] = np.nan if temp == 0 else temp
            else:
                panel.loc[gamedate_indices,"temp_retrosheet_f"] = np.nan

            # Dome status is derived from the "sky" record.
            skysplit = _first_info_value(header, "sky")
            if skysplit is None or skysplit == "unknown":
                panel.loc[gamedate_indices,"dome_status"] = np.nan
            elif skysplit in open_air_skies:
                panel.loc[gamedate_indices,"dome_status"] = 0
            elif skysplit == "dome":
                panel.loc[gamedate_indices,"dome_status"] = 1
            # Any other sky value leaves dome_status as it was.

1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019


In [8]:
### Fahrenheit -> Celsius
# Add the Celsius column to the panel, then build the output frame without
# the Fahrenheit column.
temp_f = panel["temp_retrosheet_f"].values
panel["temp_retrosheet"] = (temp_f - 32) * (5.0/9.0)
panel_out = panel.drop(columns=["temp_retrosheet_f"])

In [9]:
# Write the panel with the Celsius temperature column next to the input game logs.
output_path = loc_gamelogs+"GameLogs_Temp_"+str(y1)+"-"+str(y2)+".csv"
panel_out.to_csv(output_path)