# Stadium Data Cleaning and Dealing with Problem Stadiums 

After using scraper_team_stadium.py to create a Scraper object and collect stadium names and coordinates for each of the 30 active MLB teams since the creation of the MLB, the next step is to prepare this data in a usable format and to deal with any stadiums for which the Scraper object was unable to collect coordinates.

In [2]:
import json
import glob
import pandas as pd
import numpy as np
import os

In [21]:
#gather the path of every per-team .json file written by the scraper
json_pattern = 'data/stadiums_coordinates/*.json'
stadium_files = glob.glob(json_pattern)

The tables on baseball-reference.com were complete; however, the column containing the team stadium names was formatted such that, if a team played games in multiple stadiums in a given year, sometimes the primary stadium was listed first and sometimes the secondary stadium. To correct this, I determined which stadium was the primary stadium and, where necessary, swapped the order of the stadiums.

In [22]:
#flag records whose second stadium (stadium_name_2) is the first stadium (stadium_name_1)
#of the record directly before or after it in the same team file; the scraper only split
#multi-stadium entries into numbered columns and had no logic for deciding which stadium
#was the primary one, because the scraped table did not provide that
incorrect_primary_stadiums = []
for path in stadium_files:
    with open(path, 'r') as handle:
        seasons = json.load(handle)
    #slide a window of three records so every middle record sees both neighbours;
    #the first and last record of a file are never the middle of a window
    for before, current, after in zip(seasons, seasons[1:], seasons[2:]):
        try:
            second_park = current['stadium_name_2']
            misordered = (second_park == before['stadium_name_1']
                          or second_park == after['stadium_name_1'])
        except KeyError:
            #a record missing one of the stadium keys cannot be compared, so skip it
            continue
        if misordered:
            incorrect_primary_stadiums.append(current)

In [23]:
#use the full option name: the short pattern 'max.columns' is treated as a regex and is
#ambiguous on pandas versions that also define styler.render.max_columns
pd.set_option('display.max_columns', 25)
#render the flagged records as a table so they can be inspected by eye
pd.DataFrame(incorrect_primary_stadiums)

Unnamed: 0,team_code,year,team,attendance/game,pitching_park_factor,batting_park_factor,stadium_name_1,latitude_1,longitude_1,stadium_name_2,latitude_2,longitude_2,stadium_name_3,latitude_3,longitude_3,stadium_name_4,latitude_4,longitude_4,stadium_name_5,latitude_5,longitude_5
0,TBD,2008,Tampa Bay Rays,22370.0,101,101,Ballpark at Disney's Wide World of Sports,ERROR,ERROR,Tropicana Field,27.7683,82.6533,,,,,,,,,
1,TBD,2007,Tampa Bay Devil Rays,17131.0,100,98,Ballpark at Disney's Wide World of Sports,ERROR,ERROR,Tropicana Field,27.7683,82.6533,,,,,,,,,
2,OAK,1996,Oakland Athletics,14178.0,102,101,Cashman Field,36.1794,115.13,Oakland-Alameda County Coliseum,37.7517,122.201,,,,,,,,,
3,SFG,1889,New York Giants,,99,103,Oakdale Park,39.9917,75.1511,Polo Grounds III,40.7981,73.9503,St. George Cricket Grounds,40.6467,74.0783,,,,,,
4,CHC,1893,Chicago Colts,3062.0,104,103,South Side Park II,ERROR,ERROR,West Side Grounds,41.8703,87.6725,,,,,,,,,
5,CHC,1891,Chicago Colts,2708.0,103,104,West Side Park I,41.8703,87.6725,South Side Park II,ERROR,ERROR,,,,,,,,,
6,SEA,1999,Seattle Mariners,36004.0,102,102,Kingdome,47.5953,122.331,Safeco Field,47.5911,122.333,,,,,,,,,
7,FLA,2004,Florida Marlins,21539.0,95,95,U.S. Cellular Field,41.83,87.6339,Pro Player Stadium,25.9581,80.2389,,,,,,,,,
8,CHW,1910,Chicago White Sox,6988.0,94,94,South Side Park III,41.8244,87.6328,White Sox Park,41.8319,87.6339,,,,,,,,,
9,LAD,1956,Brooklyn Dodgers,15761.0,106,109,Roosevelt Stadium,40.7064,74.105,Ebbets Field,40.665,73.9581,,,,,,,,,


In [24]:
#rebuild the full list of records, exchanging the first and second stadium
#(name, latitude and longitude) on every record that was flagged as misordered
first_keys = ('stadium_name_1', 'latitude_1', 'longitude_1')
second_keys = ('stadium_name_2', 'latitude_2', 'longitude_2')
master_ = []
for path in stadium_files:
    with open(path, 'r') as handle:
        seasons = json.load(handle)
    for season in seasons:
        if season in incorrect_primary_stadiums:
            #read both stadiums completely before writing anything back
            promoted = [season[key] for key in second_keys]
            demoted = [season[key] for key in first_keys]
            season.update(zip(first_keys, promoted))
            season.update(zip(second_keys, demoted))
        #every record is kept, swapped or not
        master_.append(season)

In [25]:
#append all observations into single data frame
#use the full option name: the short pattern 'max.rows' is treated as a regex and is
#ambiguous on pandas versions that also define styler.render.max_rows
pd.set_option('display.max_rows', 3000)
#master_ holds one dict per record from every team file; each dict becomes one row
full_frame = pd.DataFrame(master_)

In [26]:
#list every distinct team name present in the combined frame
full_frame.team.unique()

array(['Tampa Bay Rays', 'Tampa Bay Devil Rays', 'Oakland Athletics',
       'Kansas City Athletics', 'Philadelphia Athletics',
       'San Francisco Giants', 'New York Giants', 'New York Gothams',
       'Chicago Cubs', 'Chicago Orphans', 'Chicago Colts',
       'Chicago White Stockings', 'Seattle Mariners', 'Miami Marlins',
       'Florida Marlins', 'Chicago White Sox', 'Washington Nationals',
       'Montreal Expos', 'Arizona Diamondbacks', 'Los Angeles Dodgers',
       'Brooklyn Dodgers', 'Brooklyn Robins', 'Brooklyn Superbas',
       'Brooklyn Bridegrooms', 'Brooklyn Grooms', 'Brooklyn Grays',
       'Brooklyn Atlantics', 'Pittsburgh Pirates',
       'Pittsburgh Alleghenys', 'Toronto Blue Jays', 'Boston Red Sox',
       'Boston Americans', 'Minnesota Twins', 'Washington Senators',
       'Houston Astros', 'Houston Colt .45s', 'Kansas City Royals',
       'Cincinnati Reds', 'Cincinnati Redlegs',
       'Cincinnati Red Stockings', 'Cleveland Indians', 'Cleveland Naps',
       'Cleve

The next step is to add a column for the state code for each city, which will be used later on in locating weather stations for each stadium.

In [27]:
#lookup from the place name that appears inside a team name to the code stored in state_code
#(the two Canadian cities are tagged 'CANADA' and Washington is tagged 'D.C.')
state_dict = {
    'Tampa Bay': 'FL',
    'Oakland': 'CA',
    'Kansas City': 'MO',
    'Philadelphia': 'PA',
    'San Francisco': 'CA',
    'New York': 'NY',
    'Chicago': 'IL',
    'Seattle': 'WA',
    'Miami': 'FL',
    'Florida': 'FL',
    'Washington': 'D.C.',
    'Montreal': 'CANADA',
    'Arizona': 'AZ',
    'Los Angeles': 'CA',
    'Brooklyn': 'NY',
    'Pittsburgh': 'PA',
    'Toronto': 'CANADA',
    'Boston': 'MA',
    'Minnesota': 'MN',
    'Houston': 'TX',
    'Cincinnati': 'OH',
    'Cleveland': 'OH',
    'Colorado': 'CO',
    'Milwaukee': 'WI',
    'Atlanta': 'GA',
    'Detroit': 'MI',
    'Anaheim': 'CA',
    'California': 'CA',
    'Texas': 'TX',
    'San Diego': 'CA',
    'St. Louis': 'MO',
    'Baltimore': 'MD',
}

In [28]:
#create the state_code column up front, filled with None as a placeholder
full_frame['state_code'] = None

In [29]:
#map each distinct team name to the state code of the first state_dict key
#(in dictionary order) that occurs inside the team name
team_state_dict = {}
#only the distinct names need checking, not every row of the frame
for team_name in full_frame['team'].unique():
    for place, code in state_dict.items():
        if place in team_name:
            team_state_dict[team_name] = code
            #first match wins, so stop looking once the team has a code
            break
            

In [30]:
#fill state_code by looking up each row's team name in team_state_dict
full_frame['state_code'] = full_frame['team'].map(team_state_dict)

Now, it is time to look into which stadiums the scraper was unable to find coordinates for and locate coordinates for these stadiums.

In [31]:
#preview the first rows, including the new state_code column
full_frame.head()

Unnamed: 0,team_code,year,team,attendance/game,pitching_park_factor,batting_park_factor,stadium_name_1,latitude_1,longitude_1,stadium_name_2,latitude_2,longitude_2,stadium_name_3,latitude_3,longitude_3,stadium_name_4,latitude_4,longitude_4,stadium_name_5,latitude_5,longitude_5,state_code
0,TBD,2019,Tampa Bay Rays,14552,96,97,Tropicana Field,27.7683,82.6533,,,,,,,,,,,,,FL
1,TBD,2018,Tampa Bay Rays,14259,96,97,Tropicana Field,27.7683,82.6533,,,,,,,,,,,,,FL
2,TBD,2017,Tampa Bay Rays,15477,96,96,Tropicana Field,27.7683,82.6533,,,,,,,,,,,,,FL
3,TBD,2016,Tampa Bay Rays,15879,95,95,Tropicana Field,27.7683,82.6533,,,,,,,,,,,,,FL
4,TBD,2015,Tampa Bay Rays,15322,97,96,Tropicana Field,27.7683,82.6533,,,,,,,,,,,,,FL


In [32]:
#gather the path of every .csv file of problem stadiums written during scraping
csv_pattern = 'data/stadiums_coordinates/*.csv'
problem_files = glob.glob(csv_pattern)

In [33]:
#print each problem stadium and corresponding team code
for file in problem_files:
    #the team code is the first three characters of the file name; reading it from the
    #base name keeps this correct even if the directory part of the path changes length
    team_code = os.path.basename(file)[:3]
    with open(file, 'r') as f:
        for line in f:
            #each line still carries its trailing newline, so the code prints on the next line
            print(line, team_code)

Sportsman's Park III,
 STL
Association Park,
 STL
Sportsman's Park I,
 STL
Edison Field,
 ANA
Washington Park III,
 LAD
West New York Field Club Grounds,
 LAD
Ridgewood Park II,
 LAD
Ridgewood Park,
 LAD
League Park II,
 CLE
Dunn Field,
 CLE
League Park I,
 CLE
Neil Park I,
 CLE
Fairview Park,
 CLE
Jailhouse Flats,
 CLE
Municipal Stadium,
 KCR
South Side Park II,
 CHC
Lake Front Park II,
 CHC
Lake Front Park I,
 CHC
23rd Street Park,
 CHC
AT&T; Park,
 SFG
Polo Grounds V,
 SFG
West New York Field Club Grounds,
 SFG
Polo Grounds I (Southeast Diamond),
 SFG
Memorial Stadium,
 BAL
Sportsman's Park III,
 BAL
Sportsman's Park II,
 BAL
Polo Grounds V,
 NYM
Ballpark at Disney's Wide World of Sports,
 TBD
Tiger Stadium,
 DET
Bennett Park,
 DET
Neil Park II,
 DET
Burns Park,
 DET
Fort Bragg Park,
 ATL
South End Grounds III,
 ATL
American League Park II,
 MIN
American League Park I,
 MIN
Yankee Stadium IIx,
 NYY
Polo Grounds V,
 NYY
Wiedenmeyer's Park,
 NYY
Sydney Cricket Grounds,
 ARI
Exposition

In [34]:
#convert the feature year from a string value to numeric so it can be compared
#against integer year cutoffs
full_frame['year'] = pd.to_numeric(full_frame['year'])

In [35]:
#keep only seasons after 1903: many of the problem stadiums belong to an earlier era
#that won't be analyzed anyway
after_1903 = full_frame['year'] > 1903
new_era_ballparks = full_frame[after_1903]

In [37]:
#rows whose first-stadium latitude is still the 'ERROR' placeholder
missing_primary_coords = new_era_ballparks['latitude_1'] == 'ERROR'
new_era_issues = new_era_ballparks[missing_primary_coords]

In [39]:
#list of unique problem ball park names
list = [new_era_issues.stadium_name_1.unique()]

In [40]:
print(list)

[array(['Municipal Stadium', 'AT&T; Park', 'Polo Grounds V',
       'Washington Park III', 'Exposition Park III',
       'American League Park II', 'League Park II', 'Dunn Field',
       'League Park I', 'South End Grounds III', 'Tiger Stadium',
       'Bennett Park', 'Yankee Stadium IIx', 'Edison Field',
       "Sportsman's Park III", 'Memorial Stadium', "Sportsman's Park II"],
      dtype=object)]


In [41]:
#function used to convert latitude and longitude from degree format to decimal format
def converter(a, b, c):
    """Return degrees/minutes/seconds as a single decimal-degree value.

    a -- whole degrees
    b -- minutes (1/60 of a degree each)
    c -- seconds (1/3600 of a degree each)
    """
    minutes_in_degrees = b / 60
    seconds_in_degrees = c / 3600
    return a + minutes_in_degrees + seconds_in_degrees

In [42]:
#example: 76 degrees 59 minutes 12 seconds expressed in decimal degrees
converter(76, 59, 12)

76.98666666666666

After determining which stadiums' coordinates were not available through the scraper, I manually imputed coordinates for the following stadiums.

In [43]:
#manually entered decimal coordinates for stadiums the scraper could not resolve,
#kept as stadium name -> (latitude, longitude); longitudes are entered as positive values
trouble_park_coordinates = {
    "Municipal Stadium": (39.086, 94.555),
    "AT&T; Park": (37.77861111111111, 122.38916666666667),
    "Polo Grounds V": (40.83083333333334, 73.9375),
    "Washington Park III": (40.6739, 73.9856),
    "Exposition Park III": (40.44694444444444, 80.01083333333334),
    "American League Park II": (38.901666666666664, 76.98666666666666),
    "League Park II": (41.51138888888889, 81.64416666666668),
    "Dunn Field": (41.51138888888889, 81.64416666666668),
    "League Park I": (41.51138888888889, 81.64416666666668),
    "South End Grounds III": (42.337500000000006, 71.08694444444444),
    "Tiger Stadium": (42.331944444444446, 83.06888888888888),
    "Bennett Park": (42.331944444444446, 83.06888888888888),
    "Neil Park II": (42.331944444444446, 83.06888888888888),
    "Yankee Stadium IIx": (40.82916666666667, 73.9263888888889),
    "Wiedenmeyer's Park": (40.82916666666667, 73.9263888888889),
    "Edison Field": (33.8003, 117.883),
    "Sportsman's Park III": (38.658, 90.220),
    "Sportsman's Park II": (38.658, 90.220),
    "Memorial Stadium": (39.32944444444445, 76.60138888888888),
    "Ballpark at Disney's Wide World of Sports": (28.337083333333332, 81.556),
}

#split the table into the two name -> value lookups used for mapping
trouble_parks_latitude = {park: coords[0] for park, coords in trouble_park_coordinates.items()}

trouble_parks_longitude = {park: coords[1] for park, coords in trouble_park_coordinates.items()}

In [44]:
#look up the manual coordinates by stadium name for both the first and second stadium;
#names without a manual entry are left as missing values by map
for role, name_column in (('primary', 'stadium_name_1'), ('secondary', 'stadium_name_2')):
    park_names = full_frame[name_column]
    full_frame[role + '_latitude'] = park_names.map(trouble_parks_latitude)
    full_frame[role + '_longitude'] = park_names.map(trouble_parks_longitude)

In [45]:
#keep the manually entered latitude where one exists; otherwise fall back to latitude_1
full_frame['primary_latitude'] = full_frame['primary_latitude'].fillna(full_frame['latitude_1'])

In [46]:
#same fallback for the other three coordinate columns: a manual value wins,
#otherwise the matching original coordinate column is used
fallback_columns = (
    ('primary_longitude', 'longitude_1'),
    ('secondary_latitude', 'latitude_2'),
    ('secondary_longitude', 'longitude_2'),
)
for target, fallback in fallback_columns:
    full_frame[target] = full_frame[target].fillna(full_frame[fallback])

In [47]:
#NOTE(review): year was already passed through pd.to_numeric in an earlier cell of this
#notebook, so this repeat conversion is presumably redundant - confirm before removing
full_frame['year'] = pd.to_numeric(full_frame.year)

In [48]:
#keep only the rows for 1900 and later
from_1900_on = full_frame['year'] >= 1900
full_frame = full_frame[from_1900_on]

In [49]:
#the first two stadiums' coordinates now live in the primary_*/secondary_* columns,
#and stadiums 3-5 are not carried forward, so remove those original columns
columns_to_drop = [
    'latitude_1', 'longitude_1',
    'latitude_2', 'longitude_2',
    'stadium_name_3', 'latitude_3', 'longitude_3',
    'stadium_name_4', 'latitude_4', 'longitude_4',
    'stadium_name_5', 'latitude_5', 'longitude_5',
]
full_frame = full_frame.drop(columns=columns_to_drop)

In [50]:
#give the two remaining stadium name columns descriptive names
stadium_column_names = {
    'stadium_name_1': 'primary_stadium',
    'stadium_name_2': 'secondary_stadium',
}
full_frame = full_frame.rename(columns=stadium_column_names)

In [51]:
#check the column names that remain after the drop and rename
full_frame.columns

Index(['team_code', 'year', 'team', 'attendance/game', 'pitching_park_factor',
       'batting_park_factor', 'primary_stadium', 'secondary_stadium',
       'state_code', 'primary_latitude', 'primary_longitude',
       'secondary_latitude', 'secondary_longitude'],
      dtype='object')

In [52]:
#reorder the columns: identifiers first, then each stadium with its coordinates,
#then attendance and the park factors
column_order = [
    'team_code', 'team', 'state_code', 'year',
    'primary_stadium', 'primary_latitude', 'primary_longitude',
    'secondary_stadium', 'secondary_latitude', 'secondary_longitude',
    'attendance/game', 'pitching_park_factor', 'batting_park_factor',
]
full_frame = full_frame[column_order]

In [53]:
#renumber the rows after filtering; drop = True discards the old index instead of keeping it as a column
full_frame = full_frame.reset_index(drop = True)

In [55]:
#show any rows whose primary latitude is still the 'ERROR' placeholder
full_frame[full_frame.primary_latitude == 'ERROR']

Unnamed: 0,team_code,team,state_code,year,primary_stadium,primary_latitude,primary_longitude,secondary_stadium,secondary_latitude,secondary_longitude,attendance/game,pitching_park_factor,batting_park_factor
1161,MIN,Washington Senators,D.C.,1903,American League Park I,ERROR,ERROR,,,,1815,105,101
1162,MIN,Washington Senators,D.C.,1902,American League Park I,ERROR,ERROR,,,,2767,103,100
1163,MIN,Washington Senators,D.C.,1901,American League Park I,ERROR,ERROR,,,,2377,100,99


In [56]:
#fill in coordinates for the American League Park I rows that still hold the 'ERROR'
#placeholder; the rows are selected by content rather than by fixed row numbers, because
#row numbers depend on the order in which glob happened to return the team files
needs_patch = ((full_frame['primary_stadium'] == 'American League Park I')
               & (full_frame['primary_latitude'] == 'ERROR'))
#the mask is computed once, before either column is overwritten
full_frame.loc[needs_patch, 'primary_latitude'] = 38.90166666
full_frame.loc[needs_patch, 'primary_longitude'] = 76.986666

In [57]:
#re-run the check; an empty result means no 'ERROR' primary latitudes remain
full_frame[full_frame.primary_latitude == 'ERROR']

Unnamed: 0,team_code,team,state_code,year,primary_stadium,primary_latitude,primary_longitude,secondary_stadium,secondary_latitude,secondary_longitude,attendance/game,pitching_park_factor,batting_park_factor


In [129]:
#write the cleaned stadium table to disk; in this notebook f is only ever bound to a
#file handle, which has no to_csv method, so the cleaned frame itself is written
full_frame.to_csv('data/complete_stadiums.csv')