# Data Collection and Preparation Notebook

After collecting preparing stadium location data for all 30 active MLB teams from the beginning of the league, the next step is to isolate the unique coordinate pairs to be used for collecting weather data.

In [1]:
import json
import pandas as pd
import glob
import numpy as np
import os

In [2]:
#reading full stadium dataframe created in stadium_data_cleansing.ipynb
complete_stadiums = pd.read_csv('data/complete_stadiums.csv', index_col = 0)

In [3]:
complete_stadiums.head()

Unnamed: 0,team_code,team,state_code,year,primary_stadium,primary_latitude,primary_longitude,secondary_stadium,secondary_latitude,secondary_longitude,attendance/game,pitching_park_factor,batting_park_factor
0,TBD,Tampa Bay Rays,FL,2019,Tropicana Field,27.768333,82.653333,,,,14552,96,97
1,TBD,Tampa Bay Rays,FL,2018,Tropicana Field,27.768333,82.653333,,,,14259,96,97
2,TBD,Tampa Bay Rays,FL,2017,Tropicana Field,27.768333,82.653333,,,,15477,96,96
3,TBD,Tampa Bay Rays,FL,2016,Tropicana Field,27.768333,82.653333,,,,15879,95,95
4,TBD,Tampa Bay Rays,FL,2015,Tropicana Field,27.768333,82.653333,,,,15322,97,96


In [4]:
#create dictionary of coordinate pairs for unique stadiums
unique_coordinates = {}
for j in range(len(complete_stadiums)):
    if complete_stadiums.iloc[j]['primary_stadium'] not in unique_coordinates.keys():
        unique_coordinates[complete_stadiums.iloc[j]['primary_stadium']] = [complete_stadiums.iloc[j]['state_code'],\
                                                                    complete_stadiums.iloc[j]['primary_latitude'],\
                                                                    complete_stadiums.iloc[j]['primary_longitude']]
        try:
            if unique_coordinates[complete_stadiums.iloc[j]['primary_stadium']][1] == \
            unique_coordinates[complete_stadiums.iloc[j-1]['primary_stadium']][1] and \
            unique_coordinates[complete_stadiums.iloc[j]['primary_stadium']][2] == \
            unique_coordinates[complete_stadiums.iloc[j-1]['primary_stadium']][2]:
                del unique_coordinates[complete_stadiums.iloc[j]['primary_stadium']]
        except Exception:
            continue

In [5]:
#removing entries with unique stadium names that have the same coordinate pairs, i.e. stadiums that were rebuilt
#in the same place, just with a different name 
final_coordinates_dict = []
for j in unique_coordinates.keys():
    if unique_coordinates[j] not in final_coordinates_dict:
        final_coordinates_dict.append(unique_coordinates[j])

In [6]:
#Create unique names for each coordinate pair and state code (necessary to for creating unique directories with
#noaa_weather_collection.py scraper file)
unique_dict = {}
counter = 1
for j in range(len(final_coordinates_dict)):
    unique_dict['location_' + str(counter + j)] = final_coordinates_dict[j]

In [7]:
#create dataframe for unique coordinate pairs
locations_ = []
for j in unique_dict:
    dict_iter = {'state': None,
                'latitude' : None,
                'longitude' : None}
    dict_iter['state'] = unique_dict[j][0]
    dict_iter['latitude'] = unique_dict[j][1]
    dict_iter['longitude'] = unique_dict[j][2]
    locations_.append(dict_iter)
for j in locations_:
    if j['state'] == 'D.C.':
        j['state'] = 'DC'
coords_df = pd.DataFrame(locations_)

In [14]:
#store as csv file
coords_df.to_csv('data/unique_coordinates.csv')

Once the DataFrame of unique coordinate pairs and state codes has been created, the next step is to collect altitude. The reason for this is twofold: One, altitude could be a significant feature in the modeling phase. For example, playing in the thin air in Denver is notoriously hitter-friendly and widely hated by opposing pitchers. The second reason has to do with weather collection. The stations being evaluated from the Global Historical Climatology Network have varying elevations, and it's possible that the elevation for the stadium will be much higher or lower than that of the station. This could introduce noise into the data, if weather is collected at the top of a nearby mountain when the stadium itself sits in a valley. 

To collect this data, I have used the bulk point query feature of the USGS TNM National Map service.

In [7]:
#formatting coordinate points in specified format from USGS site: longitude,latitude
with open('data/coordinates_elevation.csv', 'w') as f:
    for j in range(len(coords_df)):
        long = '-' + str(coords_df.iloc[j]['longitude'])
        f.write(long)
        f.write(',')
        lat = str(coords_df.iloc[j]['latitude'])
        f.write(lat)
        f.write('\n')
    

In [9]:
#reading output from USGS site
elevations = pd.read_csv('data/bulk_pqs.csv')

In [10]:
#entering column names for USGS output
elevations.columns = ['ID', 'input_lon', 'input_lat', 'elevation_ft', 'elevation_m']

In [11]:
#removing negative sign from longitudes: all longitudes are West of the Prime Meridian so there will be no confusion,
#this will be necessary to merge with GHCN data later
elevations['input_lon'] = elevations['input_lon'].apply(lambda x: (x * -1))

In [12]:
#concatenate elevatiosn to coordinate dataframe
coordinates_w_elevation = pd.concat([coords_df, elevations], axis = 1)

In [13]:
#remove redundant columns
coordinates_w_elevation = coordinates_w_elevation.drop(columns = ['ID', 'input_lon', 'input_lat'])

In [14]:
#manually input elevations for Montreal stadium: not provided by USGS service
coordinates_w_elevation.at[21, 'elevation_ft'] = 101.706
coordinates_w_elevation.at[21, 'elevation_m'] = 31.0
coordinates_w_elevation.at[22, 'elevation_ft'] = 101.706
coordinates_w_elevation.at[22, 'elevation_m'] = 31.0

In [15]:
#add state codes for Canadian cities
coordinates_w_elevation.at[21, 'state'] = 'QC'
coordinates_w_elevation.at[22, 'state'] = 'QC'
coordinates_w_elevation.at[33, 'state'] = 'ON'
coordinates_w_elevation.at[34, 'state'] = 'ON'

In [35]:
#save as .csv file
coordinates_w_elevation.to_csv('data/coordinates_elevation_final.csv')

In [16]:
coordinates_w_elevation

Unnamed: 0,state,latitude,longitude,elevation_ft,elevation_m
0,FL,27.768333,82.653333,30.24,9.22
1,CA,37.751667,122.200556,0.0,0.0
2,MO,39.086,94.555,912.63,278.17
3,PA,39.996111,75.165,117.89,35.93
4,PA,39.981111,75.182778,100.44,30.61
5,CA,37.778611,122.389167,11.57,3.53
6,CA,37.713611,122.386111,25.24,7.69
7,CA,37.766667,122.409167,55.49,16.91
8,NY,40.830833,73.9375,14.41,4.39
9,NY,40.798056,73.950278,21.26,6.48


With all coordinates and elevations collected and formatted, the next step is to determine which weather stations to collect historical data from. Tne NOAA provides access to weather stations collecting data all the way back to the 1700s (the Global Historical Climatology Network), but there are hundreds of thousands of stations and no clear indication of which stations are which, and it is thus difficult to determine which stations to collect. The cells below narrow the field of search. 

In [17]:
#read .txt file detailing station details and limit stations to United States and Canada stations
all_stations = []
with open('data/ghcnd-stations.txt', 'r') as f:
    for line in f:
        if line[0:2] == 'US' or line[0:2] == 'CA':
            all_stations.append(line)

#function to format station data. Function takes a single line of .txt file as a string containing the stationID, 
#latitude, longitude, elevation, state, and location name, and returns a dictionary with each feature per line 
#corresponding to its index
def line_parser(s):
    dict_iter = {'id' : None,
                'latitude' : None,
                'longitude' : None,
                'elevation' : None,
                'state' : None,
                'name' : None}
    ident = ''
    latitude = ''
    longitude = ''
    elevation = ''
    state = ''
    name = ''
    for i in range(0, 11):
        ident += s[i]
    dict_iter['id'] = ident
    for i in range(12, 20):
        latitude += s[i]
    dict_iter['latitude'] = latitude
    for i in range(21, 30):
        longitude += s[i]
    dict_iter['longitude'] = longitude
    for i in range(31, 37):
        elevation += s[i]
    dict_iter['elevation'] = elevation
    for i in range(38, 40):
        state += s[i]
    dict_iter['state'] = state
    for i in range(41, 71):
        name += s[i]
        name = name.strip()
    dict_iter['name'] = name
    return(dict_iter)

weather_stations = []
for i in all_stations:
    station = line_parser(i)
    weather_stations.append(station)

#creates pandas dataframe from list of dictionaries, converts numeric observations read in as strings to numeric
weather_stations_df = pd.DataFrame(weather_stations)
weather_stations_df['latitude'] = pd.to_numeric(weather_stations_df['latitude'])
weather_stations_df['longitude'] = pd.to_numeric(weather_stations_df['longitude'])
weather_stations_df['elevation'] = pd.to_numeric(weather_stations_df['elevation'])
weather_stations_df['longitude'] = weather_stations_df['longitude'].apply(lambda x: (x * -1))

In [18]:
weather_stations_df

Unnamed: 0,id,latitude,longitude,elevation,state,name
0,CA001010066,48.8667,123.2833,4.0,BC,ACTIVEPASS
1,CA001010235,48.4000,123.4833,17.0,BC,ALBERTHEAD
2,CA001010595,48.5833,123.5167,85.0,BC,BAMBERTONOCEANCEMENT
3,CA001010720,48.5000,124.0000,351.0,BC,BEARCREEK
4,CA001010774,48.5000,123.3500,61.0,BC,BEAVERLAKE
...,...,...,...,...,...,...
70389,USW00096405,60.4731,145.3542,25.3,AK,CORDOVA14ESE
70390,USW00096406,64.5014,154.1297,78.9,AK,RUBY44ESE
70391,USW00096407,66.5620,159.0036,6.7,AK,SELAWIK28E
70392,USW00096408,63.4519,150.8747,678.2,AK,DENALI27N


In [19]:
#function to determine stations to collect data from by location. Arguments taken are the dataframe of unique 
#coordinates and state codes, dataframe of weather stations, and thresholds for distance from stadium and elevation
#difference. Function returns a dictionary of all possible stations within these thresholds for each unique coordinate
#pair
def station_locator(coordinates_df, stations_df, distance_thresh, elevation_thresh):
    stations_by_location = {}
    counter = 1
    for j in range(len(coordinates_df)):
        loc_stations = []
        possible_stations = stations_df[stations_df.state == coordinates_df.iloc[j]['state']]
        try:
            for k in range(len(possible_stations)):
                station_distance = ((possible_stations.iloc[k]['latitude'] - coordinates_df.iloc[j]['latitude']) **2 +\
                                    (possible_stations.iloc[k]['longitude'] - coordinates_df.iloc[j]['longitude']) **2)\
                                   ** 0.5
                elev_difference = abs(possible_stations.iloc[k]['elevation'] - coordinates_df.iloc[j]['elevation_m'])
                if station_distance <= distance_thresh and elev_difference <= elevation_thresh:
                    loc_stations.append([station_distance, possible_stations.iloc[k]['id']])
            sorted_stations = sorted(loc_stations, key = lambda x: x[0])
            closest_stations = [sorted_stations[l][1] for l in range(len(sorted_stations))]
            if len(closest_stations) > 50:
                closest_stations = closest_stations[0:50]
            stations_by_location['location_' + str(counter)] = closest_stations
            counter += 1
        except Exception:
            print('This location {} does not have 50 stations in the range'.format(str(j)))
            break
    return(stations_by_location)
        
            

In [20]:
#station dictionary with distance threshold chosen as 0.25 (Euclidean distance calculated using latitude/longtiude),
#and elevation threshold chosen to be within 100m of the stadium elevation.
s_l = station_locator(coordinates_w_elevation, weather_stations_df, 0.25, 100)

In [75]:
#store station names in json file
with open('data/new_stations.json', 'w') as f:
    json.dump(s_l, f)

With a dictionary of stations indexed by location, weather data will be collected and written to .csv files by passing the JSON file created to the noaa_weather_collection.py script in the repo. To do this, I created a WeatherCollector object that interacted with the NOAA FTP, collected the .dly station files, parsed them and saved them as .csv files.

While the files have been collected and stored locally, the challenge becomes that not all stations are active for all the necessary years and different stations collect different elements and may not contain observations for all the elements being sought. To overcome this, I have created a function to read in all individual station files and create a total picture of weather for the location by keeping only observations containing the most elements for each date and aggregating all the observations into one full dataframe. 

In [22]:
FILE_PATH = 'data/noaa_station_csvs/'
#this function takes a file path where all station .csv files are stored as an argument and returns two lists:
#one is a list of dataframes of full weather data for each location, and a list of directories returned by the 
#glob function to ensure the order files are collected and written in
def create_full_city_weather(file_path = FILE_PATH):
    directory_list = glob.glob(file_path + '*')
    main = []
    for directory in directory_list:
        station_files = glob.glob(directory + '/*.csv')
        master = []
        for station in station_files:
            try:
                df = pd.read_csv(station, index_col = False)
                df = df.replace({-9999 : np.nan})
                df = df.dropna(axis = 1, how = 'all').dropna(axis = 0, thresh = 3)
                df['date'] = pd.to_datetime(df['date'])
                master.append(df)
            except Exception:
                print(station)
                continue
        full_frame = pd.concat(master, sort = False)
        full_frame = full_frame.assign(counts = full_frame.count(axis = 1))
        full_frame = full_frame.sort_values(['date', 'counts']).drop_duplicates('date', keep = 'last').drop('counts', axis = 1)
        full_frame = full_frame.reset_index().drop(columns = ['index'])
        main.append(full_frame)
    print('Successfully created all necessary weather files')
    return(main, directory_list)

In [23]:
m, d = create_full_city_weather()

Successfully created all necessary weather files


In [24]:
#assign latitude and longitude to city weather dataframes for merging purposes with primary dataset
for n in range(len(d)):
    value = int(d[n].split('location_')[1]) - 1
    m[n] = m[n].assign(latitude = coords_df.iloc[value]['latitude'])
    m[n] = m[n].assign(longitude = coords_df.iloc[value]['longitude'])

In [175]:
#write full .csv frames to respective directories
for l in range(len(d)):
    m[l].to_csv(d[l] + '/all_city_weather.csv')

With a full dataframe of weather for each location created, it is now time to merge the weather data with the primary dataset, the mlb_elo dataset provided by FiveThirtyEight. 

In [18]:
#read in mlb_elo dataset
mlb_elo = pd.read_csv('data/mlb_elo_w_stadiums.csv', index_col = [0], low_memory = False)

In [19]:
#view info for mlb_elo dataframe
mlb_elo.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 220010 entries, 0 to 220009
Data columns (total 39 columns):
date                    220010 non-null object
season                  220010 non-null int64
neutral                 220010 non-null int64
playoff                 1617 non-null object
team1                   220010 non-null object
team2                   220010 non-null object
elo1_pre                220010 non-null float64
elo2_pre                220010 non-null float64
elo_prob1               220010 non-null float64
elo_prob2               220010 non-null float64
elo1_post               220010 non-null float64
elo2_post               220010 non-null float64
rating1_pre             220010 non-null float64
rating2_pre             220010 non-null float64
pitcher1                219972 non-null object
pitcher2                219972 non-null object
pitcher1_rgs            186269 non-null float64
pitcher2_rgs            186269 non-null float64
pitcher1_adj            185588 non-nu

In [20]:
mlb_elo[(mlb_elo.season.between(1901, 1902)) & (mlb_elo.team1 == 'NYY')]

Unnamed: 0,date,season,neutral,playoff,team1,team2,elo1_pre,elo2_pre,elo_prob1,elo_prob2,...,year,primary_stadium,primary_latitude,primary_longitude,secondary_stadium,secondary_latitude,secondary_longitude,attendance/game,pitching_park_factor,batting_park_factor
197932,1902-09-29,1902,0,,NYY,BOS,1404.763,1499.646,0.399383,0.600617,...,,,,,,,,,,
197939,1902-09-27,1902,0,,NYY,BOS,1406.043,1498.366,0.402923,0.597077,...,,,,,,,,,,
197944,1902-09-27,1902,0,,NYY,BOS,1407.083,1497.326,0.405807,0.594193,...,,,,,,,,,,
198012,1902-09-15,1902,0,,NYY,MIN,1410.121,1438.396,0.493848,0.506152,...,,,,,,,,,,
198016,1902-09-15,1902,0,,NYY,MIN,1412.230,1436.287,0.499918,0.500082,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200013,1901-05-13,1901,0,,NYY,OAK,1457.904,1437.734,0.563225,0.436775,...,,,,,,,,,,
200022,1901-05-11,1901,0,,NYY,OAK,1459.502,1436.136,0.567746,0.432254,...,,,,,,,,,,
200040,1901-05-08,1901,0,,NYY,MIN,1457.332,1448.139,0.547624,0.452376,...,,,,,,,,,,
200115,1901-04-27,1901,0,,NYY,BOS,1452.252,1447.748,0.540929,0.459071,...,,,,,,,,,,


In [21]:
#drop observations for teams that are not predecessors of active MLB teams (defunct teams before 1900)
mlb_elo = mlb_elo[mlb_elo.primary_stadium.notnull()]

In [22]:
mlb_elo[(mlb_elo.season.between(1901, 1902)) & (mlb_elo.team1 == 'NYY')]

Unnamed: 0,date,season,neutral,playoff,team1,team2,elo1_pre,elo2_pre,elo_prob1,elo_prob2,...,year,primary_stadium,primary_latitude,primary_longitude,secondary_stadium,secondary_latitude,secondary_longitude,attendance/game,pitching_park_factor,batting_park_factor


In [14]:
mlb_elo[(mlb_elo.season.between(2001, 2019)) & (mlb_elo.team1 == 'PIT')]

Unnamed: 0,date,season,neutral,playoff,team1,team2,elo1_pre,elo2_pre,elo_prob1,elo_prob2,...,year,primary_stadium,primary_latitude,primary_longitude,secondary_stadium,secondary_latitude,secondary_longitude,attendance/game,pitching_park_factor,batting_park_factor
49,2019-09-29,2019,0,,PIT,CIN,1463.388434,1481.228324,0.508864,0.491136,...,2019.0,PNC Park,40.446944,80.005833,,,,18413,97.0,96.0
59,2019-09-28,2019,0,,PIT,CIN,1465.328021,1479.288737,0.514444,0.485556,...,2019.0,PNC Park,40.446944,80.005833,,,,18413,97.0,96.0
80,2019-09-27,2019,0,,PIT,CIN,1463.934078,1480.682680,0.510434,0.489566,...,2019.0,PNC Park,40.446944,80.005833,,,,18413,97.0,96.0
86,2019-09-26,2019,0,,PIT,CHC,1460.878972,1528.135780,0.438068,0.561932,...,2019.0,PNC Park,40.446944,80.005833,,,,18413,97.0,96.0
103,2019-09-25,2019,0,,PIT,CHC,1458.685272,1530.329480,0.431861,0.568139,...,2019.0,PNC Park,40.446944,80.005833,,,,18413,97.0,96.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46595,2001-04-18,2001,0,,PIT,HOU,1477.890000,1514.056000,0.482499,0.517501,...,2001.0,PNC Park,40.446944,80.005833,,,,30430,104.0,103.0
46619,2001-04-16,2001,0,,PIT,HOU,1475.573000,1516.374000,0.475840,0.524160,...,2001.0,PNC Park,40.446944,80.005833,,,,30430,104.0,103.0
46672,2001-04-12,2001,0,,PIT,CIN,1483.764000,1519.289000,0.483420,0.516580,...,2001.0,PNC Park,40.446944,80.005833,,,,30430,104.0,103.0
46684,2001-04-11,2001,0,,PIT,CIN,1482.328000,1520.724000,0.479294,0.520706,...,2001.0,PNC Park,40.446944,80.005833,,,,30430,104.0,103.0


In [None]:
base = 

The method I will use to merge the dataframes is to isolate all observations corresponding to a certain latitude and longitude, and then merge these observations with the corresponding weather data for that location, matching on date.

In [29]:
#merge slices of dataframe, storing dataframes in a list
final_master = []
for i in range(len(d)):
    weather_merge = pd.read_csv(d[i]+'/all_city_weather.csv', index_col = [0])
    team_merge = mlb_elo[(mlb_elo.primary_latitude == weather_merge.iloc[0]['latitude']) &\
                        (mlb_elo.primary_longitude == weather_merge.iloc[0]['longitude'])]
    final_merged = team_merge.merge(weather_merge, how = 'left', left_on = ['date'], right_on = ['date'])
    final_master.append(final_merged)    

In [30]:
#concatenating all lists
full_final = pd.concat(final_master, sort = False)

In [31]:
#sort final dataframe by date, reset the index, and drop the old index column
full_final = full_final.sort_values(by = ['date']).reset_index().drop(columns = ['index'])

In [32]:
full_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198286 entries, 0 to 198285
Data columns (total 57 columns):
date                    198286 non-null object
season                  198286 non-null int64
neutral                 198286 non-null int64
playoff                 1608 non-null object
team1                   198286 non-null object
team2                   198286 non-null object
elo1_pre                198286 non-null float64
elo2_pre                198286 non-null float64
elo_prob1               198286 non-null float64
elo_prob2               198286 non-null float64
elo1_post               198286 non-null float64
elo2_post               198286 non-null float64
rating1_pre             198286 non-null float64
rating2_pre             198286 non-null float64
pitcher1                198278 non-null object
pitcher2                198278 non-null object
pitcher1_rgs            184642 non-null float64
pitcher2_rgs            184642 non-null float64
pitcher1_adj            183965 non-nu

While the collection method successfully collected weather for $91.5%$ of the observations from the mlb_elo dataset, there are still over $13,000$ observations missing weather altogether and many observations that are missing several of the desired elements (primarily wind). In order to try and improve on this, I will look into which observations are missing and determine how to fill these in. 

In [33]:
weather_issues = full_final[full_final.TMAX.isnull()]

In [34]:
weather_issues.team.value_counts()

Washington Senators      4162
Chicago Cubs             3957
Detroit Tigers           3151
Pittsburgh Pirates       1784
Washington Nationals     1232
Cincinnati Reds           870
Colorado Rockies          391
Chicago Orphans            77
Toronto Blue Jays          54
Seattle Mariners           42
Kansas City Royals         18
Philadelphia Phillies       1
Tampa Bay Rays              1
Name: team, dtype: int64

It is immediately apparent that 4 cities account for nearly all of the missing weather observations, and the primary offender being Washington D.C. In order to correct this, I did several things. I narrowed the field of stations to select from, which I simply did to increase efficiency as it became apparent in the first data collection effor that I was looking through too many stations. Then, I changed my metric of distance threshold from Euclidean distance to Haversine distance. I initially thought that because I was only looking to round up weather stations nearby each stadium and I was not concerned with exact location that Euclidean distance would be sufficient even though it's a meaningless metric of distance measurement when applied to latitude and longitude (Obviously I'm looking to measure spherical distance not the straight line distance between Cartesian coordinates). However, I decided to change this as I considered how to expand my station search while still ensuring that the stations are close enough to the stadium to provide accurate observations. Using Euclidean distance, I was essentially pulling a number out of a hat as my distance threshold; the actual value of the threshold has no interpretable meaning. Thus, I changed my distance metric to using the Haversine formula to calculate distance over a sphere, and thus I was able to set a radius of Kilometers around the stadium in which to search. Finally, I changed the state code of Washington D.C. stations from DC to MD, as there were many stations that fell in the 25 KM radius around the stadiums I was searching for that were indexed under the MD state code and were not appearing in my code isolating weather stations as it was searching for stations indexed under DC. 

In [36]:
#create weather stations df with narrowed criterion
all_stations = []
with open('data/ghcnd-stations.txt', 'r') as f:
    for line in f:
        if line[0:4] == 'USW0' or line[0:4] == 'USC0' or line[0:2] == 'CA':
            all_stations.append(line)

weather_stations = []
for i in all_stations:
    station = line_parser(i)
    weather_stations.append(station)
    
weather_stations_df = pd.DataFrame(weather_stations)
weather_stations_df['latitude'] = pd.to_numeric(weather_stations_df['latitude'])
weather_stations_df['longitude'] = pd.to_numeric(weather_stations_df['longitude'])
weather_stations_df['elevation'] = pd.to_numeric(weather_stations_df['elevation'])

In [37]:
#function to compute haversine distance for two pairs of coordinates. Takes latitude and longitude of two locations
#in degree format as an input and returns a distance in Kilometers 
def haversine_distance(latitude_1, longitude_1, latitude_2, longitude_2):
    R = 6378.137
    h = np.arcsin( np.sqrt(np.sin( (np.radians(latitude_2) - np.radians(latitude_1))/2)**2 \
                           + np.cos(np.radians(latitude_1))*np.cos(np.radians(latitude_2))*\
                          np.sin( (np.radians(longitude_2) - np.radians(longitude_1))/2)**2))
    return(2 * R * h)

In [38]:
#station locator function used in the first sweep, only difference is changed distance metric from Euclidean to 
#Haversine
def station_locator_haversine(coordinates_df, stations_df, distance_thresh, elevation_thresh):
    stations_by_location = {}
    counter = 1
    for j in range(len(coordinates_df)):
        loc_stations = []
        possible_stations = stations_df[stations_df.state == coordinates_df.iloc[j]['state']]
        latitude_1 = coordinates_df.iloc[j]['latitude']
        longitude_1 = coordinates_df.iloc[j]['longitude']
        try:
            for k in range(len(possible_stations)):
                latitude_2 = possible_stations.iloc[k]['latitude']
                longitude_2 = possible_stations.iloc[k]['longitude']
                station_distance = haversine_distance(latitude_1, longitude_1, latitude_2, longitude_2)
                elev_difference = abs(possible_stations.iloc[k]['elevation'] - coordinates_df.iloc[j]['elevation_m'])
                if station_distance <= distance_thresh and elev_difference <= elevation_thresh:
                    loc_stations.append([station_distance, possible_stations.iloc[k]['id']])
            sorted_stations = sorted(loc_stations, key = lambda x: x[0])
            closest_stations = [sorted_stations[l][1] for l in range(len(sorted_stations))]
            if len(closest_stations) > 50:
                closest_stations = closest_stations[0:50]
            stations_by_location['location_' + str(counter)] = closest_stations
            counter += 1
        except Exception:
            print('This location {} does not have 50 stations in the range'.format(str(j)))
            break
    return(stations_by_location)

In [41]:
#converting Longitude to negative in order to be converted to radians. 
coordinates_w_elevation['longitude'] = coordinates_w_elevation['longitude'].apply(lambda x: x * -1)

In [43]:
#changed state code from DC to MD
coordinates_w_elevation = coordinates_w_elevation.replace({'DC' : 'MD'})

In [44]:
#collect stations 
d = station_locator_haversine(coordinates_w_elevation, weather_stations_df, 25, 250)

In [56]:
#determine if stations had already been collected and return a dictionary of stations still to collect
paths = glob.glob('data/noaa_station_csvs/*/')
for path in paths:
    files = glob.glob(path + '*.csv')
    key = path.split('csvs/')[1].split('/')[0]
    for file in files:
        station_id = file.split(path)[1].split('.')[0]
        if station_id not in d[key]:
            os.remove(file)
        else:
            d[key].pop(d[key].index(station_id))


In [57]:
d

{'location_1': ['USW00012842', 'USC00081632'],
 'location_2': ['USC00047765'],
 'location_3': [],
 'location_4': ['USC00365623',
  'USC00368263',
  'USC00361423',
  'USC00369103',
  'USC00366194',
  'USC00368388'],
 'location_5': ['USC00361423',
  'USC00365368',
  'USC00369103',
  'USW00014793',
  'USC00368263'],
 'location_6': [],
 'location_7': ['USW00093228'],
 'location_8': ['USC00046502'],
 'location_9': ['USC00307587'],
 'location_10': [],
 'location_11': ['USW00014810',
  'USC00117988',
  'USW00014892',
  'USW00014819',
  'USW00014855',
  'USW00094846',
  'USC00111564',
  'USC00111497',
  'USC00114814',
  'USC00112286',
  'USC00114816'],
 'location_12': ['USC00111564',
  'USC00111577',
  'USC00114814',
  'USC00112888',
  'USC00114816',
  'USC00111552',
  'USW00014810',
  'USC00111522',
  'USC00115110',
  'USC00117988'],
 'location_13': ['USC00458802',
  'USC00454169',
  'USC00450826',
  'USC00453985',
  'USC00454187'],
 'location_14': ['USW00024233',
  'USC00458802',
  'USC00454

In [58]:
#save stations to be collected as a .json
with open('data/federal_stations.json', 'w') as f:
    json.dump(d, f)

In [59]:
#return longitude to postive value before appending to dataframe created below
coordinates_w_elevation['longitude'] = coordinates_w_elevation['longitude'].apply(lambda x: x * -1)

After passing the new dictionary to the WeatherCollector and collecting data for those stations, I followed the same format as used previously to create a single .csv file with weather for each location. 

In [78]:
m, d = create_full_city_weather()
#assign latitude and longitude to city weather dataframes for merging purposes with primary dataset
for n in range(len(d)):
    value = int(d[n].split('location_')[1]) - 1
    m[n] = m[n].assign(latitude = coordinates_df.iloc[value]['latitude'])
    m[n] = m[n].assign(longitude = coordinates_df.iloc[value]['longitude'])

In [63]:
#write full .csv frames to respective directories
for l in range(len(d)):
    m[l].to_csv(d[l] + '/all_city_weather.csv')

In [76]:
#merge slices of dataframe, storing dataframes in a list
final_master = []
for i in range(len(d)):
    weather_merge = pd.read_csv(d[i]+'/all_city_weather.csv', index_col = [0])
    team_merge = mlb_elo[(mlb_elo.primary_latitude == weather_merge.iloc[0]['latitude']) &\
                        (mlb_elo.primary_longitude == weather_merge.iloc[0]['longitude'])]
    final_merged = team_merge.merge(weather_merge, how = 'left', left_on = ['date'], right_on = ['date'])
    final_master.append(final_merged)  
    
full_final = pd.concat(final_master, sort = False)
full_final = full_final.sort_values(by = ['date']).reset_index().drop(columns = ['index'])

In [77]:
full_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198286 entries, 0 to 198285
Data columns (total 57 columns):
date                    198286 non-null object
season                  198286 non-null int64
neutral                 198286 non-null int64
playoff                 1608 non-null object
team1                   198286 non-null object
team2                   198286 non-null object
elo1_pre                198286 non-null float64
elo2_pre                198286 non-null float64
elo_prob1               198286 non-null float64
elo_prob2               198286 non-null float64
elo1_post               198286 non-null float64
elo2_post               198286 non-null float64
rating1_pre             198286 non-null float64
rating2_pre             198286 non-null float64
pitcher1                198278 non-null object
pitcher2                198278 non-null object
pitcher1_rgs            184642 non-null float64
pitcher2_rgs            184642 non-null float64
pitcher1_adj            183965 non-nu

In [79]:
full_final.to_csv('data/mlb_final.csv')

The result shows that this method was successful, and now the dataframe has weather observations for $99.8$ percent of observations and it is time to being EDA and modeling. 