# Retrosheet Gamelog Data

After collecting weather data, I realized that weather data is not meaningful without knowing the time the game was played. The retrosheet_collector.py script collects all of the data and writes it to a dataframe, but Retrosheet and FiveThirtyEight do not use the same team codes, so merging the dataframes together required some additional work to create common keys.

In [1]:
import requests
import pandas as pd
import numpy as np
import json

In [2]:
#retrieve metadata from Retrosheet site to parse in gamelogs
dot_txt = requests.get('https://www.retrosheet.org/gamelogs/glfields.txt')
dot_txt = dot_txt.content.decode('utf-8')
with open('retrograde_meta.txt', 'w') as f:
    f.writelines(dot_txt)

In [2]:
#retrieve meta data for Retrosheet team codes
page = requests.get('https://www.retrosheet.org/TEAMABR.TXT')
string = page.content
string = string.decode('utf-8')

In [38]:
#write data to .csv file
with open('data/retrograde_team_codes.csv', 'w') as f:
    f.write('team_code,league,city,team_name,year_1,year_2')
    f.write('\n')
    f.writelines(string)

In [42]:
#retrieve meta data for Retrosheet park codes
park_page = requests.get('https://www.retrosheet.org/parkcode.txt')
park_string = park_page.content.decode('utf-8')

In [43]:
#write data to .csv file
with open('retrograde_park_codes.csv', 'w') as f:
    f.writelines(park_string)

In [3]:
park_codes = pd.read_csv('data/retrograde_park_codes.csv')

In [12]:
park_codes

Unnamed: 0,PARKID,NAME,AKA,CITY,STATE,START,END,LEAGUE,NOTES
0,ALB01,Riverside Park,,Albany,NY,09/11/1880,05/30/1882,NL,TRN:9/11/80;6/15&9/10/1881;5/16-5/18&5/30/1882
1,ALT01,Columbia Park,,Altoona,PA,04/30/1884,05/31/1884,UA,
2,ANA01,Angel Stadium of Anaheim,Edison Field; Anaheim Stadium,Anaheim,CA,04/19/1966,,AL,
3,ARL01,Arlington Stadium,,Arlington,TX,04/21/1972,10/03/1993,AL,
4,ARL02,Rangers Ballpark in Arlington,The Ballpark in Arlington; Ameriquest Fl,Arlington,TX,04/11/1994,,AL,
...,...,...,...,...,...,...,...,...,...
247,WIL02,BB&T Ballpark at Bowman Field,,Williamsport,PA,08/20/2017,08/20/2017,NL,PIT
248,WNY01,West New York Field Club Grounds,,West New York,NJ,09/11/1898,09/17/1899,NL,"BRO:9/18&10/2/1898; NY1:9/11/98, 6/4&7/16&8/13..."
249,WOR01,Agricultural County Fair Grounds I,,Worcester,MA,05/01/1880,09/29/1882,NL,
250,WOR02,Agricultural County Fair Grounds II,,Worcester,MA,08/17/1887,08/17/1887,NL,1 BSN game


In [13]:
#drop unecessary columns
park_codes = park_codes.drop(columns = ['START', 'END', 'LEAGUE', 'NOTES'])

In [4]:
#read in stadium data from baseball-reference.com
stadiums_ = pd.read_csv('data/complete_stadiums.csv', index_col = [0])

In [16]:
stadiums_.head()

Unnamed: 0,team_code,team,state_code,year,primary_stadium,primary_latitude,primary_longitude,secondary_stadium,secondary_latitude,secondary_longitude,attendance/game,pitching_park_factor,batting_park_factor
0,TBD,Tampa Bay Rays,FL,2019,Tropicana Field,27.768333,82.653333,,,,14552,96,97
1,TBD,Tampa Bay Rays,FL,2018,Tropicana Field,27.768333,82.653333,,,,14259,96,97
2,TBD,Tampa Bay Rays,FL,2017,Tropicana Field,27.768333,82.653333,,,,15477,96,96
3,TBD,Tampa Bay Rays,FL,2016,Tropicana Field,27.768333,82.653333,,,,15879,95,95
4,TBD,Tampa Bay Rays,FL,2015,Tropicana Field,27.768333,82.653333,,,,15322,97,96


In [17]:
all_stadiums_ = stadiums_.primary_stadium.unique()

In [30]:
coordinates_list = []
for stadium in all_stadiums_:
    dict_iter = {'stadium_name' : stadium,
                'lat' : None,
                'lon' : None,
                'state_code' : None}
    dict_iter['lat'] = stadiums_[stadiums_.primary_stadium == stadium].primary_latitude.value_counts().idxmax()
    dict_iter['lon'] = stadiums_[stadiums_.primary_stadium == stadium].primary_longitude.value_counts().idxmax()
    dict_iter['state_code'] = stadiums_[stadiums_.primary_stadium == stadium].state_code.value_counts().idxmax()
    coordinates_list.append(dict_iter)

In [31]:
unique_stadiums_df = pd.DataFrame(coordinates_list)

In [32]:
unique_stadiums_df.head()

Unnamed: 0,stadium_name,lat,lon,state_code
0,Tropicana Field,27.768333,82.653333,FL
1,Oakland-Alameda County Coliseum,37.751667,122.200556,CA
2,O.co Coliseum,37.751667,122.200556,CA
3,McAfee Coliseum,37.751667,122.200556,CA
4,Network Associates Coliseum,37.751667,122.200556,CA


In [22]:
park_codes.head()

Unnamed: 0,PARKID,NAME,AKA,CITY,STATE
0,ALB01,Riverside Park,,Albany,NY
1,ALT01,Columbia Park,,Altoona,PA
2,ANA01,Angel Stadium of Anaheim,Edison Field; Anaheim Stadium,Anaheim,CA
3,ARL01,Arlington Stadium,,Arlington,TX
4,ARL02,Rangers Ballpark in Arlington,The Ballpark in Arlington; Ameriquest Fl,Arlington,TX


In [34]:
park_ids = []
for j in range(len(park_codes)):
    dict_iter = {'state_code' : park_codes.iloc[j]['STATE']}
    park_id = park_codes.iloc[j]['PARKID']
    stadiums = [park_codes.iloc[j]['NAME']]
    if type(park_codes.iloc[j]['AKA']) == str and ';' in park_codes.iloc[j]['AKA']:
        other_names = park_codes.iloc[j]['AKA'].split(';')
        stadiums += other_names
    elif type(park_codes.iloc[j]['AKA']) == str and ';' not in park_codes.iloc[j]['AKA']:
        stadiums.append(park_codes.iloc[j]['AKA'])
    else:
        pass
    for k in range(len(stadiums)):
        dict_iter[stadiums[k]] = park_id
    park_ids.append(dict_iter)
    

In [37]:
unique_stadiums_df.head()

Unnamed: 0,stadium_name,lat,lon,state_code,park_id
0,Tropicana Field,27.768333,82.653333,FL,
1,Oakland-Alameda County Coliseum,37.751667,122.200556,CA,
2,O.co Coliseum,37.751667,122.200556,CA,
3,McAfee Coliseum,37.751667,122.200556,CA,
4,Network Associates Coliseum,37.751667,122.200556,CA,


In [38]:
park_codes.head()

Unnamed: 0,PARKID,NAME,AKA,CITY,STATE
0,ALB01,Riverside Park,,Albany,NY
1,ALT01,Columbia Park,,Altoona,PA
2,ANA01,Angel Stadium of Anaheim,Edison Field; Anaheim Stadium,Anaheim,CA
3,ARL01,Arlington Stadium,,Arlington,TX
4,ARL02,Rangers Ballpark in Arlington,The Ballpark in Arlington; Ameriquest Fl,Arlington,TX


In [57]:
issue_stadiums = []
for j in range(len(unique_stadiums_df)):
    stad = unique_stadiums_df.iloc[j]['stadium_name']
    try:
        park_id = park_codes[park_codes.NAME == stad].PARKID.value_counts().idxmax()
    except:
        issue_stadiums.append(stad)
        continue
    unique_stadiums_df.at[j, unique_stadiums_df.columns.get_loc('park_id')] = park_id

In [60]:
unique_stadiums_df = unique_stadiums_df.drop(columns = ['park_id'])
unique_stadiums_df.columns = ['stadium_name', 'lat', 'lon', 'state_code', 'park_id']

In [61]:
unique_stadiums_df

Unnamed: 0,stadium_name,lat,lon,state_code,park_id
0,Tropicana Field,27.768333,82.653333,FL,STP01
1,Oakland-Alameda County Coliseum,37.751667,122.200556,CA,OAK01
2,O.co Coliseum,37.751667,122.200556,CA,
3,McAfee Coliseum,37.751667,122.200556,CA,
4,Network Associates Coliseum,37.751667,122.200556,CA,
5,Municipal Stadium,39.086,94.555,MO,KAN05
6,Connie Mack Stadium,39.996111,75.165,PA,
7,Shibe Park,39.996111,75.165,PA,PHI11
8,Columbia Park,39.981111,75.182778,PA,ALT01
9,Oracle Park,37.778611,122.389167,CA,


In [69]:
for k in range(1, len(unique_stadiums_df) - 1):
    if type(unique_stadiums_df.iloc[k]['park_id']) != str and \
    unique_stadiums_df.iloc[k]['lat'] == unique_stadiums_df.iloc[k-1]['lat'] and \
    unique_stadiums_df.iloc[k]['lon'] == unique_stadiums_df.iloc[k-1]['lon']:
        unique_stadiums_df.at[k, 'park_id'] = unique_stadiums_df.iloc[k-1]['park_id']
    elif type(unique_stadiums_df.iloc[k]['park_id']) != str and \
    unique_stadiums_df.iloc[k]['lat'] == unique_stadiums_df.iloc[k+1]['lat'] and \
    unique_stadiums_df.iloc[k]['lon'] == unique_stadiums_df.iloc[k+1]['lon']:
        unique_stadiums_df.at[k, 'park_id'] = unique_stadiums_df.iloc[k+1]['park_id']
    else:
        continue

In [71]:
unique_stadiums_df[unique_stadiums_df.park_id.isnull()]

Unnamed: 0,stadium_name,lat,lon,state_code,park_id
9,Oracle Park,37.778611,122.389167,CA,
10,AT&T; Park,37.778611,122.389167,CA,
11,SBC Park,37.778611,122.389167,CA,
12,Pacific Bell Park,37.778611,122.389167,CA,
32,Guaranteed Rate Field,41.83,87.633889,IL,
33,U.S. Cellular Field,41.83,87.633889,IL,
34,Comiskey Park II,41.83,87.633889,IL,
70,Great American Ball Park,39.0975,84.506667,OH,
93,SunTrust Park,33.89,84.468056,GA,
110,Edison Field,33.8003,117.883,CA,


In [72]:
for k in range(9, 13):
    unique_stadiums_df.at[k, 'park_id'] = 'SFO03'
for k in range(32, 35):
    unique_stadiums_df.at[k, 'park_id'] = 'CHI12'
unique_stadiums_df.at[70, 'park_id'] = 'CIN09'
unique_stadiums_df.at[93, 'park_id'] = 'ATL03'
for k in range(110, 112):
    unique_stadiums_df.at[k, 'park_id'] = 'ANA01'
unique_stadiums_df.at[117, 'park_id'] = 'WAS10'
unique_stadiums_df.at[118, 'park_id'] = 'SAN02'
unique_stadiums_df.at[124, 'park_id'] = 'STL07'

In [74]:
unique_stadiums_df

Unnamed: 0,stadium_name,lat,lon,state_code,park_id
0,Tropicana Field,27.768333,82.653333,FL,STP01
1,Oakland-Alameda County Coliseum,37.751667,122.200556,CA,OAK01
2,O.co Coliseum,37.751667,122.200556,CA,OAK01
3,McAfee Coliseum,37.751667,122.200556,CA,OAK01
4,Network Associates Coliseum,37.751667,122.200556,CA,OAK01
5,Municipal Stadium,39.086,94.555,MO,KAN05
6,Connie Mack Stadium,39.996111,75.165,PA,PHI11
7,Shibe Park,39.996111,75.165,PA,PHI11
8,Columbia Park,39.981111,75.182778,PA,ALT01
9,Oracle Park,37.778611,122.389167,CA,SFO03


In [75]:
d = {}
for j in range(len(unique_stadiums_df)):
    stad = unique_stadiums_df.iloc[j]['stadium_name']
    park_id = unique_stadiums_df.iloc[j]['park_id']
    d[stad] = park_id
    

In [78]:
with open('data/retrograde_park_codes.json', 'w') as f:
    json.dump(d, f)

In [2]:
team_codes = pd.read_csv('retrograde_team_codes.csv')
team_codes['year_2'] = pd.to_numeric(team_codes['year_2'])
team_codes['year_1'] = pd.to_numeric(team_codes['year_1'])
team_codes = team_codes.assign(full_name = team_codes.city + ' ' + team_codes.team_name)
team_codes = team_codes.drop(columns = ['city', 'team_name'])
team_codes = team_codes[team_codes.year_2 >= 1900]
team_codes = team_codes.reset_index().drop(columns = ['index'])


In [3]:
team_codes.to_csv('data/retrograde_team_codes.csv')

In [5]:
stadiums_.head()

Unnamed: 0,team_code,team,state_code,year,primary_stadium,primary_latitude,primary_longitude,secondary_stadium,secondary_latitude,secondary_longitude,attendance/game,pitching_park_factor,batting_park_factor
0,TBD,Tampa Bay Rays,FL,2019,Tropicana Field,27.768333,82.653333,,,,14552,96,97
1,TBD,Tampa Bay Rays,FL,2018,Tropicana Field,27.768333,82.653333,,,,14259,96,97
2,TBD,Tampa Bay Rays,FL,2017,Tropicana Field,27.768333,82.653333,,,,15477,96,96
3,TBD,Tampa Bay Rays,FL,2016,Tropicana Field,27.768333,82.653333,,,,15879,95,95
4,TBD,Tampa Bay Rays,FL,2015,Tropicana Field,27.768333,82.653333,,,,15322,97,96


In [6]:
with open('data/retrograde_park_codes.json', 'r') as f:
    stadiums_codes_ = json.load(f)

In [8]:
stadiums_['park_ids'] = stadiums_['primary_stadium'].map(stadiums_codes_)

In [11]:
stadiums_.to_csv('data/all_stadiums_w_park_ids.csv')

In [13]:
stadiums_.head()

Unnamed: 0,team_code,team,state_code,year,primary_stadium,primary_latitude,primary_longitude,secondary_stadium,secondary_latitude,secondary_longitude,attendance/game,pitching_park_factor,batting_park_factor,park_ids
0,TBD,Tampa Bay Rays,FL,2019,Tropicana Field,27.768333,82.653333,,,,14552,96,97,STP01
1,TBD,Tampa Bay Rays,FL,2018,Tropicana Field,27.768333,82.653333,,,,14259,96,97,STP01
2,TBD,Tampa Bay Rays,FL,2017,Tropicana Field,27.768333,82.653333,,,,15477,96,96,STP01
3,TBD,Tampa Bay Rays,FL,2016,Tropicana Field,27.768333,82.653333,,,,15879,95,95,STP01
4,TBD,Tampa Bay Rays,FL,2015,Tropicana Field,27.768333,82.653333,,,,15322,97,96,STP01


In [14]:
team_codes.head()

Unnamed: 0,team_code,league,year_1,year_2,full_name
0,BSN,NL,1876,1952,Boston Braves
1,CHN,NL,1876,2010,Chicago Cubs
2,NY1,NL,1883,1957,New York Giants
3,PHI,NL,1883,2010,Philadelphia Phillies
4,PIT,NL,1887,2010,Pittsburgh Pirates


In [15]:
teams = list(stadiums_.team.unique())

In [27]:
stadiums_ = stadiums_.assign(rg_teamcode = None)

In [28]:
for k in range(len(stadiums_)):
    try:
        rg_tc = team_codes[team_codes.full_name == stadiums_.iloc[k]['team']].team_code.value_counts().idxmax()
        stadiums_.at[k, 'rg_teamcode'] = rg_tc
    except Exception as e:
        print(e)
        print(stadiums_.iloc[k]['team'])
        continue

attempt to get argmax of an empty sequence
Tampa Bay Rays
attempt to get argmax of an empty sequence
Tampa Bay Rays
attempt to get argmax of an empty sequence
Tampa Bay Rays
attempt to get argmax of an empty sequence
Tampa Bay Rays
attempt to get argmax of an empty sequence
Tampa Bay Rays
attempt to get argmax of an empty sequence
Tampa Bay Rays
attempt to get argmax of an empty sequence
Tampa Bay Rays
attempt to get argmax of an empty sequence
Tampa Bay Rays
attempt to get argmax of an empty sequence
Tampa Bay Rays
attempt to get argmax of an empty sequence
Tampa Bay Rays
attempt to get argmax of an empty sequence
Tampa Bay Rays
attempt to get argmax of an empty sequence
Tampa Bay Rays
attempt to get argmax of an empty sequence
Chicago Orphans
attempt to get argmax of an empty sequence
Chicago Orphans
attempt to get argmax of an empty sequence
Chicago Orphans
attempt to get argmax of an empty sequence
Miami Marlins
attempt to get argmax of an empty sequence
Miami Marlins
attempt to ge

attempt to get argmax of an empty sequence
New York Highlanders
attempt to get argmax of an empty sequence
New York Highlanders
attempt to get argmax of an empty sequence
New York Highlanders
attempt to get argmax of an empty sequence
New York Highlanders
attempt to get argmax of an empty sequence
New York Highlanders
attempt to get argmax of an empty sequence
New York Highlanders
attempt to get argmax of an empty sequence
New York Highlanders
attempt to get argmax of an empty sequence
New York Highlanders
attempt to get argmax of an empty sequence
New York Highlanders
attempt to get argmax of an empty sequence
New York Highlanders
attempt to get argmax of an empty sequence
Los Angeles Angels of Anaheim
attempt to get argmax of an empty sequence
Los Angeles Angels of Anaheim
attempt to get argmax of an empty sequence
Los Angeles Angels of Anaheim
attempt to get argmax of an empty sequence
Los Angeles Angels of Anaheim
attempt to get argmax of an empty sequence
Los Angeles Angels of Ana

In [31]:
problem_teams = list(stadiums_[stadiums_.rg_teamcode.isnull()].team.unique())

In [32]:
problem_teams

['Tampa Bay Rays',
 'Chicago Orphans',
 'Miami Marlins',
 'Brooklyn Robins',
 'Brooklyn Superbas',
 'Boston Americans',
 'Houston Astros',
 'Houston Colt .45s',
 'Cincinnati Redlegs',
 'Cleveland Naps',
 'Cleveland Bronchos',
 'Cleveland Blues',
 'Boston Bees',
 'Boston Rustlers',
 'Boston Doves',
 'Boston Beaneaters',
 'New York Highlanders',
 'Los Angeles Angels of Anaheim',
 'St. Louis Browns']

In [34]:
df_list = []
for team in problem_teams:
    df = stadiums_[stadiums_.team == team]
    df_list.append(df)
problem_data = pd.concat(df_list, sort = False)

In [43]:
team_codes[team_codes.team_code == 'BRO']

Unnamed: 0,team_code,league,year_1,year_2,full_name
5,BRO,NL,1890,1957,Brooklyn Dodgers


In [36]:
pd.set_option('max.rows', 230)
problem_data

Unnamed: 0,team_code,team,state_code,year,primary_stadium,primary_latitude,primary_longitude,secondary_stadium,secondary_latitude,secondary_longitude,attendance/game,pitching_park_factor,batting_park_factor,park_ids,rg_teamcode
0,TBD,Tampa Bay Rays,FL,2019,Tropicana Field,27.768333,82.653333,,,,14552,96,97,STP01,
1,TBD,Tampa Bay Rays,FL,2018,Tropicana Field,27.768333,82.653333,,,,14259,96,97,STP01,
2,TBD,Tampa Bay Rays,FL,2017,Tropicana Field,27.768333,82.653333,,,,15477,96,96,STP01,
3,TBD,Tampa Bay Rays,FL,2016,Tropicana Field,27.768333,82.653333,,,,15879,95,95,STP01,
4,TBD,Tampa Bay Rays,FL,2015,Tropicana Field,27.768333,82.653333,,,,15322,97,96,STP01,
5,TBD,Tampa Bay Rays,FL,2014,Tropicana Field,27.768333,82.653333,,,,17858,97,97,STP01,
6,TBD,Tampa Bay Rays,FL,2013,Tropicana Field,27.768333,82.653333,,,,18646,96,97,STP01,
7,TBD,Tampa Bay Rays,FL,2012,Tropicana Field,27.768333,82.653333,,,,19255,94,95,STP01,
8,TBD,Tampa Bay Rays,FL,2011,Tropicana Field,27.768333,82.653333,,,,18879,92,93,STP01,
9,TBD,Tampa Bay Rays,FL,2010,Tropicana Field,27.768333,82.653333,,,,23025,94,95,STP01,


In [39]:
for l in range(0, 12):
    stadiums_.at[l, 'rg_teamcode'] = 'TBA'
for l in range(378, 381):
    stadiums_.at[l, 'rg_teamcode'] = 'CHN'
for l in range(424, 432):
    stadiums_.at[l, 'rg_teamcode'] = 'FLO'
for l in range(731, 763):
    stadiums_.at[l, 'rg_teamcode'] = 'BRO'
for l in range(1038, 1045):
    stadiums_.at[l, 'rg_teamcode'] = 'BOS'
for l in range(1164, 1222):
    stadiums_.at[l, 'rg_teamcode'] = 'HOU'
for l in range(1334, 1339):
    stadiums_.at[l, 'rg_teamcode'] = 'CIN'
for l in range(1498, 1512):
    stadiums_.at[l, 'rg_teamcode'] = 'CLE'
for l in range(1847, 1852):
    stadiums_.at[l, 'rg_teamcode'] = 'BSN'
for l in range(1876, 1888):
    stadiums_.at[l, 'rg_teamcode'] = 'BSN'
for l in range(2114, 2124):
    stadiums_.at[l, 'rg_teamcode'] = 'NYA'
for l in range(2128, 2139):
    stadiums_.at[l, 'rg_teamcode'] = 'ANA'
for l in range(2479, 2531):
    stadiums_.at[l, 'rg_teamcode'] = 'SLA'

In [42]:
stadiums_.to_csv('data/relational_retrograde.csv')

In [51]:
gamelogs_full = pd.read_csv('data/retrograde_gamelog.csv')

In [52]:
gamelogs_full = gamelogs_full.drop(columns = ['Unnamed: 6'])

In [53]:
gamelogs_full

Unnamed: 0,date,away_team,home_team,game_time,park_id,attendance
0,20080325,BOS,OAK,N,TOK01,44628.0
1,20080326,BOS,OAK,N,TOK01,44735.0
2,20080330,ATL,WAS,N,WAS11,39389.0
3,20080331,PIT,ATL,N,ATL02,45269.0
4,20080331,MIL,CHN,D,CHI11,41089.0
...,...,...,...,...,...,...
197737,20051002,MIL,PIT,D,PIT08,23008.0
197738,20051002,LAN,SDN,D,SAN02,37748.0
197739,20051002,ARI,SFN,D,SFO03,40239.0
197740,20051002,CIN,SLN,D,STL09,50434.0


In [54]:
date_format = '%Y%m%d'

In [56]:
gamelogs_full['date'] = pd.to_datetime(gamelogs_full['date'], format = date_format)

In [57]:
gamelogs_full

Unnamed: 0,date,away_team,home_team,game_time,park_id,attendance
0,2008-03-25,BOS,OAK,N,TOK01,44628.0
1,2008-03-26,BOS,OAK,N,TOK01,44735.0
2,2008-03-30,ATL,WAS,N,WAS11,39389.0
3,2008-03-31,PIT,ATL,N,ATL02,45269.0
4,2008-03-31,MIL,CHN,D,CHI11,41089.0
...,...,...,...,...,...,...
197737,2005-10-02,MIL,PIT,D,PIT08,23008.0
197738,2005-10-02,LAN,SDN,D,SAN02,37748.0
197739,2005-10-02,ARI,SFN,D,SFO03,40239.0
197740,2005-10-02,CIN,SLN,D,STL09,50434.0


In [2]:
stadiums = pd.read_csv('data/relational_retrograde.csv', index_col = [0])

In [3]:
stadiums.head()

Unnamed: 0,team_code,team,state_code,year,primary_stadium,primary_latitude,primary_longitude,secondary_stadium,secondary_latitude,secondary_longitude,attendance/game,pitching_park_factor,batting_park_factor,park_ids,rg_teamcode
0,TBD,Tampa Bay Rays,FL,2019,Tropicana Field,27.768333,82.653333,,,,14552,96,97,STP01,TBA
1,TBD,Tampa Bay Rays,FL,2018,Tropicana Field,27.768333,82.653333,,,,14259,96,97,STP01,TBA
2,TBD,Tampa Bay Rays,FL,2017,Tropicana Field,27.768333,82.653333,,,,15477,96,96,STP01,TBA
3,TBD,Tampa Bay Rays,FL,2016,Tropicana Field,27.768333,82.653333,,,,15879,95,95,STP01,TBA
4,TBD,Tampa Bay Rays,FL,2015,Tropicana Field,27.768333,82.653333,,,,15322,97,96,STP01,TBA


In [5]:
rg_codes = list(stadiums.rg_teamcode.unique())

In [10]:
teams = stadiums.team.unique()

In [11]:
team_code_simplified = []
for team in teams:
    dict_iter = {'team_name' : team}
    rg_code = stadiums[stadiums.team == team].rg_teamcode.value_counts().idxmax()
    dict_iter['rg_code'] = rg_code
    team_code_simplified.append(dict_iter)
    

In [13]:
t = pd.DataFrame(team_code_simplified)

In [16]:
t.head()

Unnamed: 0,team_name,rg_code
0,Tampa Bay Rays,TBA
1,Tampa Bay Devil Rays,TBA
2,Oakland Athletics,OAK
3,Kansas City Athletics,KC1
4,Philadelphia Athletics,PHA


In [15]:
t.to_csv('data/rg_codes_by_team.csv')

In [18]:
di = {}
for j in range(len(t)):
    di[t.iloc[j]['team_name']] = t.iloc[j]['rg_code']

In [19]:
di

{'Tampa Bay Rays': 'TBA',
 'Tampa Bay Devil Rays': 'TBA',
 'Oakland Athletics': 'OAK',
 'Kansas City Athletics': 'KC1',
 'Philadelphia Athletics': 'PHA',
 'San Francisco Giants': 'SFN',
 'New York Giants': 'NY1',
 'Chicago Cubs': 'CHN',
 'Chicago Orphans': 'CHN',
 'Seattle Mariners': 'SEA',
 'Miami Marlins': 'FLO',
 'Florida Marlins': 'FLO',
 'Chicago White Sox': 'CHA',
 'Washington Nationals': 'WAS',
 'Montreal Expos': 'MON',
 'Arizona Diamondbacks': 'ARI',
 'Los Angeles Dodgers': 'LAN',
 'Brooklyn Dodgers': 'BRO',
 'Brooklyn Robins': 'BRO',
 'Brooklyn Superbas': 'BRO',
 'Pittsburgh Pirates': 'PIT',
 'Toronto Blue Jays': 'TOR',
 'Boston Red Sox': 'BOS',
 'Boston Americans': 'BOS',
 'Minnesota Twins': 'MIN',
 'Washington Senators': 'WS2',
 'Houston Astros': 'HOU',
 'Houston Colt .45s': 'HOU',
 'Kansas City Royals': 'KCA',
 'Cincinnati Reds': 'CIN',
 'Cincinnati Redlegs': 'CIN',
 'Cleveland Indians': 'CLE',
 'Cleveland Naps': 'CLE',
 'Cleveland Bronchos': 'CLE',
 'Cleveland Blues': 'CLE