# Effect of Travel on Baseball Performance
***
**Team Members:** Brandon Zink, Cameron Connor, Abiel Fattore

In this project, we look at how travel, measured as distance in miles and as the number of time zones crossed, affects the traveling team's performance.

In [60]:
import numpy as np 
import pandas as pd
import matplotlib.pylab as plt
%matplotlib inline
from math import sin, cos, sqrt, atan2, radians
import math

In [115]:
#Read in the data, store in appropriate data frames

#Stadium table: later cells read it by position (column 1 = team code, 2 = latitude, 3 = longitude, 4 = time zone)
dfStadiumsInfo = pd.read_csv('data/Stadium_Info.csv')

#Import and clean the game logs data
#header=None: the first line of the file is treated as data, so the column names are assigned by hand below
dfGameInfo = pd.read_csv('data/Game_Logs.csv', low_memory=False, header=None)
#Name all of the columns
#The names are assigned by position, so the list must have exactly one entry per column in the file
dfGameInfo.columns = (['date','gameNumber','day','awayTeam','awayTeamLg','awayTeamGmNmbr','homeTeam','homeTeamLg','homeTeamGmNmbr','awayScore','homeScore','lengthInOuts','timeOfDay','completion','forfeit','protest','parkID','attendance','lengthOfGameInMinutes','awayLineScore','homeLineScore','awayAB','awayH','away2B','away3B','awayHR','awayRBI','awaySH','awaySF','awayHBP','awayBB','awayIBB','awaySO','awaySB','awayCS','awayGIDP','awayCatchInf','awayLOB','awayPitchersUsed','awayIndivER','awayER','awayWildPitch','awayBalk','awayPO','awayAssists','awayE','awayPassedBalls','awayDoubPlay','awayTripPlay','homeAB','homeH','home2B','home3B','homeHR','homeRBI','homeSH','homeSF','homeHBP','homeBB','homeIBB','homeSO','homeSB','homeCS','homeGIDP','homeCatchInf','homeLOB','homePitchersUsed','homeIndivER','homeER','homeWildPitch','homeBalk','homePO','homeAssists','homeE','homePassedBalls','homeDoubPlay','homeTripPlay','homePlateUmpID','homePlateUmpName','1BUmpID','1BUmpName','2BUmpID','2BUmpName','3BUmpID','3BUmpName','LFUmpID','LFUmpName','RFUmpID','RFUmpName','awayManagerID','awayManagerName','homeManagerID','homeManagerName','winPitcherID','winPitcherName','losePitcherID','losePitcherName','savePitcherID','savePitcherName','GWRBIHitterID','GWRBIHitterName','awaySPID','awaySPName','homeSPID','homeSPName','away1ID','away1Name','away1POS','away2ID','away2Name','away2POS','away3ID','away3Name','away3POS','away4ID','away4Name','away4POS','away5ID','away5Name','away5POS','away6ID','away6Name','away6POS','away7ID','away7Name','away7POS','away8ID','away8Name','away8POS','away9ID','away9Name','away9POS','home1ID','home1Name','home1POS','home2ID','home2Name','home2POS','home3ID','home3Name','home3POS','home4ID','home4Name','home4POS','home5ID','home5Name','home5POS','home6ID','home6Name','home6POS','home7ID','home7Name','home7POS','home8ID','home8Name','home8POS','home9ID','home9Name','home9POS','addInfo','infoAquisition'])
#Keep only the leading part of each parkID; the final two characters are not needed for this analysis
dfGameInfo['parkID'] = dfGameInfo['parkID'].astype(str).str[:-2]
#Drop every game played at one of these parks, in a single pass:
#Tokyo, Montreal, Puerto Rico, Disney World, Sydney, Fort Bragg
dfGameInfo = dfGameInfo[~dfGameInfo['parkID'].isin(['TOK', 'MON', 'SJU', 'LBV', 'SYD', 'FTB'])]

In [8]:
#Team ID Lookup

def team_ID_lookup(teamID):
    """Return the fixed position (0 through 29) of a three-letter team code, or -1 if unknown.

    The position is used by other functions as a row/column index, so the
    order of the codes below must not change.
    """
    teamCodes = ('ANA','ARI','ATL','BAL','BOS','CHN','CHA','CIN','CLE','COL',
                 'DET','FLO','HOU','KCA','LAN','MIL','MIN','WAS','NYN','NYA',
                 'OAK','PHI','PIT','SLN','SDN','SFN','SEA','TBA','TEX','TOR')
    matches = (position for position, code in enumerate(teamCodes) if code == teamID)
    return next(matches, -1)

In [9]:
#Find Distance

def find_distance(latitude1, longitude1, latitude2, longitude2):
    """Return the great-circle distance in miles between two points.

    All four arguments are in degrees. The haversine formula is used with
    an approximate Earth radius of 3959 miles.
    """
    earth_radius_mi = 3959.0

    phi1, lam1, phi2, lam2 = (
        radians(angle) for angle in (latitude1, longitude1, latitude2, longitude2)
    )
    half_dphi = (phi2 - phi1) / 2
    half_dlam = (lam2 - lam1) / 2

    # Haversine of the central angle between the two points
    hav = sin(half_dphi)**2 + cos(phi1) * cos(phi2) * sin(half_dlam)**2
    central_angle = 2 * atan2(sqrt(hav), sqrt(1 - hav))

    return earth_radius_mi * central_angle


In [10]:
#Get the time zone difference between two stadiums

def find_time_zone_difference(awayTeam, homeTeam):
    """Return the home stadium's time zone value minus the away stadium's.

    Both arguments are three-letter team codes. They are looked up in
    dfStadiumsInfo by position: column 1 holds the team code and column 4
    the time zone value.

    Returns a value between -3 and 3. According to the original notes, -3
    means the team traveled 3 hours westward and 3 means 3 hours eastward
    (this assumes the time zone column grows eastward -- TODO confirm
    against Stadium_Info.csv).

    Returns -5 if either team is not in the table, or if the difference
    falls outside -3..3.
    """
    # Scan every row instead of assuming exactly 30 of them. When a team
    # appears more than once the last row wins.
    timeZoneByTeam = {}
    for stadiumRow in dfStadiumsInfo.values:
        timeZoneByTeam[stadiumRow[1]] = stadiumRow[4]

    if awayTeam not in timeZoneByTeam or homeTeam not in timeZoneByTeam:
        return -5

    difference = timeZoneByTeam[homeTeam] - timeZoneByTeam[awayTeam]
    if difference > 3 or difference < -3:
        return -5
    return difference

In [11]:
#Gets the distance stored in the distance matrix

def get_distance_matrix(matrix, team1ID, team2ID):
    """Return the stored distance for a pair of team IDs, or 0 if none is stored yet.

    The (team1ID, team2ID) cell is checked first, then the mirrored cell;
    a cell counts as filled only when its value is greater than 0.
    """
    for rowID, colID in ((team1ID, team2ID), (team2ID, team1ID)):
        if matrix[rowID][colID] > 0:
            return matrix[rowID][colID]
    return 0

In [12]:
#Creates/resets the distance matrix

def start_distance_matrix():
    """Return a fresh 30 x 30 float matrix with every entry set to 0."""
    side = 30
    return np.zeros(shape=(side, side), dtype=np.float64)

#Start with an empty distance matrix
dist_matrix = start_distance_matrix()

In [13]:
#Updates a position in the matrix to the specified distance

def update_distance_matrix(matrix, team1ID, team2ID, distance):
    """Store distance for a pair of team IDs in both mirrored cells.

    The matrix is modified in place and the same object is returned.
    """
    matrix[team1ID][team2ID] = matrix[team2ID][team1ID] = distance
    return matrix

In [14]:
#This gets the distance between two stadiums from the stored matrix, and if it's not there, it calculates it and stores it
#in there (dist_matrix)

def get_distance(awayTeam, homeTeam):
    """Return the distance between the stadiums of two teams.

    Both arguments are three-letter team codes. The result is the value
    produced by find_distance for the two stadiums' coordinates. It is
    cached in the module-level dist_matrix so each pair is calculated only
    once.

    Returns -1.0 if either code is not known to team_ID_lookup or has no
    row in dfStadiumsInfo. In that case nothing is written to the cache.
    """
    global dist_matrix

    awayID = team_ID_lookup(awayTeam)
    homeID = team_ID_lookup(homeTeam)
    # team_ID_lookup reports an unknown code as -1. Used as an index, -1
    # would silently address the last row/column of the matrix, so stop here.
    if awayID < 0 or homeID < 0:
        return -1.0

    #get the distance between the stadiums from the distance matrix if it has already been updated
    distance = get_distance_matrix(dist_matrix, awayID, homeID)

    #if it has not been updated yet, update it
    if distance == 0:
        # Column 1 is the team code, columns 2 and 3 are latitude and
        # longitude. Every row is scanned; the last row for a team wins.
        coordsByTeam = {}
        for stadiumRow in dfStadiumsInfo.values:
            coordsByTeam[stadiumRow[1]] = (stadiumRow[2], stadiumRow[3])

        # Without coordinates there is nothing meaningful to calculate or cache.
        if awayTeam not in coordsByTeam or homeTeam not in coordsByTeam:
            return -1.0

        awayLat, awayLong = coordsByTeam[awayTeam]
        homeLat, homeLong = coordsByTeam[homeTeam]
        distance = find_distance(awayLat, awayLong, homeLat, homeLong)

        dist_matrix = update_distance_matrix(dist_matrix, awayID, homeID, distance)

    return distance

In [62]:
def OBP_calc(H, BB, HBP, AB, SF):
    """On-base percentage: (H + BB + HBP) divided by (BB + HBP + AB + SF)."""
    timesOnBase = H+BB+HBP
    chances = BB+HBP+AB+SF
    return timesOnBase/chances

def SLG_calc(H, twoB, threeB, HR, AB):
    """Slugging percentage: total bases divided by at-bats.

    Hits that are not doubles, triples or home runs count as one base each.
    """
    singles = H-(twoB+threeB+HR)
    totalBases = singles+(twoB*2)+(threeB*3)+(HR*4)
    return totalBases/AB
    
def ISO_calc(SLG, BA):
    """Isolated power: slugging percentage minus batting average."""
    isolatedPower = SLG-BA
    return isolatedPower
    
def FIP_calc(HR, HBP, BB, K, IP):
    """Fielding-independent pitching: (13*HR + 3*(HBP+BB) - 2*K) / IP, plus a constant of 3.2."""
    weightedEvents = (HR*13)+(3*(HBP+BB))-(2*K)
    perInning = weightedEvents/IP
    return perInning+3.2

In [20]:
#Debug helper for the distance matrix

def check_distance_matrix():
    """Print the first 30 rows of dist_matrix, one per line, to check it is updating correctly."""
    for rowNumber in range(30):
        print(dist_matrix[rowNumber])

In [131]:
#This gets us the data that we need for the analysis
dfGameData = dfGameInfo[['awayTeam','homeTeam','awayScore','homeScore','lengthInOuts','attendance','awayAB','awayH',
                                  'away2B','away3B','awayHR','awayRBI','awaySH','awaySF','awayHBP','awayBB','awaySO',
                                 'awayER','homeAB','homeH','home2B','home3B','homeHR','homeRBI','homeSH','homeSF','homeHBP',
                                  'homeBB','homeSO','homeER']].copy()

#Here we add the columns that we need including distance, time zones, OBP, SLG, etc.
#Every column is calculated for the whole table at once. The former row-by-row loop relied on
#DataFrame.set_value, which newer pandas versions no longer provide, and it created 'distance' as an
#integer column even though distances are floats.

#The number of innings the away team hit and pitched. lengthInOuts is divided by 6 to get innings;
#a partial last inning is counted for hitting (ceil) but not for pitching (floor).
inningsPlayed = dfGameData['lengthInOuts']/6.0
awayInningsHit = np.ceil(inningsPlayed)
awayInningsPitch = np.floor(inningsPlayed)
perNineHitting = 9.0/awayInningsHit
perNinePitching = 9.0/awayInningsPitch

#distance and time zone change come from lookups per (away, home) pair, so these two stay element-wise
teamPairs = list(zip(dfGameData['awayTeam'].astype(str), dfGameData['homeTeam'].astype(str)))
dfGameData['distance'] = [float(get_distance(away, home)) for away, home in teamPairs]
dfGameData['timeZoneChange'] = [find_time_zone_difference(away, home) for away, home in teamPairs]

#away batting, scaled to nine innings at the plate
dfGameData['away Batting R/9'] = dfGameData['awayScore']*perNineHitting
dfGameData['away Batting H/9'] = dfGameData['awayH']*perNineHitting
dfGameData['away Batting BB/9'] = dfGameData['awayBB']*perNineHitting
dfGameData['away Batting K/9'] = dfGameData['awaySO']*perNineHitting

#away BA, OBP, SLG, ISO
dfGameData['awayBA'] = dfGameData['awayH']/dfGameData['awayAB']
dfGameData['awayOBP'] = OBP_calc(dfGameData['awayH'], dfGameData['awayBB'], dfGameData['awayHBP'],
                                 dfGameData['awayAB'], dfGameData['awaySF'])
dfGameData['awaySLG'] = SLG_calc(dfGameData['awayH'], dfGameData['away2B'], dfGameData['away3B'],
                                 dfGameData['awayHR'], dfGameData['awayAB'])
#ISO reuses the SLG and BA columns calculated just above
dfGameData['awayISO'] = ISO_calc(dfGameData['awaySLG'], dfGameData['awayBA'])

#away pitching: what the away pitchers allowed is what the home batters did, so the home batting columns are used
dfGameData['away Pitching H/9'] = dfGameData['homeH']*perNinePitching
dfGameData['away Pitching BB/9'] = dfGameData['homeBB']*perNinePitching
dfGameData['away Pitching K/9'] = dfGameData['homeSO']*perNinePitching

#away ERA and FIP
dfGameData['awayERA'] = dfGameData['homeER']*perNinePitching
dfGameData['awayFIP'] = FIP_calc(dfGameData['homeHR'], dfGameData['homeHBP'], dfGameData['homeBB'],
                                 dfGameData['homeSO'], awayInningsPitch)

In [132]:
#Preview the first rows of the derived per-game table to spot-check the added columns
dfGameData.head()

Unnamed: 0,awayTeam,homeTeam,awayScore,homeScore,lengthInOuts,attendance,awayAB,awayH,away2B,away3B,...,away Batting K/9,awayBA,awayOBP,awaySLG,awayISO,away Pitching H/9,away Pitching BB/9,away Pitching K/9,awayERA,awayFIP
2,COL,ATL,0,2,51,42255.0,31,6,2,0,...,7.0,0.193548,0.242424,0.258065,0.064516,7.875,1.125,6.75,0.0,5.7
3,MIL,CIN,3,3,31,55596.0,22,7,1,0,...,1.5,0.318182,0.444444,0.363636,0.045455,9.0,1.8,7.2,3.6,4.8
4,SFN,FLO,4,6,51,35101.0,35,10,2,2,...,8.0,0.285714,0.305556,0.542857,0.257143,13.5,1.125,7.875,4.5,2.2
6,SDN,NYN,1,2,51,52308.0,31,5,0,0,...,8.0,0.16129,0.1875,0.258065,0.096774,4.5,2.25,7.875,1.125,4.2
7,CHN,SLN,1,7,51,48156.0,27,3,1,0,...,7.0,0.111111,0.21875,0.148148,0.037037,11.25,7.875,6.75,1.125,9.2
