In [None]:
# python package geopy is not in conda distribution
# run this code to install it

!pip install geopy

In [None]:
# geopy documentation:
# https://geopy.readthedocs.io/en/stable/
    

In [30]:
import geopy
import pandas as pd


In [60]:
# list the files in the current working directory
!ls

Add_zipcode_to_MTA_station.ipynb geopy_example.ipynb
MTA_station_info.csv             station_info_zipcode.csv


In [None]:
# Load the MTA station information, which includes the GTFS latitude and
# longitude of each station.
#
# Data source: http://web.mta.info/developers/data/nyct/subway/Stations.csv

station_info = pd.read_csv(filepath_or_buffer="MTA_station_info.csv")

In [61]:
# number of rows in the station table
station_info.shape[0]

496

In [None]:
# Each reverse-geocoding request takes about 1 second to return a result,
# so running on the full data set takes about 8 minutes.

# Tested on a sample of 5 stations before running on all 496.

In [34]:
# show the column names available in station_info
station_info.columns

Index(['Station ID', 'Complex ID', 'GTFS Stop ID', 'Division', 'Line',
       'Stop Name', 'Borough', 'Daytime Routes', 'Structure', 'GTFS Latitude',
       'GTFS Longitude', 'North Direction Label', 'South Direction Label',
       'ADA', 'ADA Notes'],
      dtype='object')

In [46]:
# Count missing values per column; the 'GTFS Latitude' and 'GTFS Longitude'
# counts must be 0 for every station to be geocodable.
station_info.isnull().sum()

Station ID                 0
Complex ID                 0
GTFS Stop ID               0
Division                   0
Line                       0
Stop Name                  0
Borough                    0
Daytime Routes             0
Structure                  0
GTFS Latitude              0
GTFS Longitude             0
North Direction Label     18
South Direction Label     15
ADA                        0
ADA Notes                487
zip_code                   0
dtype: int64

In [42]:
def get_zipcode(station_info, geolocator, lat_field, lon_field):
    """Return the postcode of the address found at a row's latitude/longitude.

    Reverse-geocodes the coordinates with ``geolocator.reverse`` (e.g. the
    Nominatim geocoder) and extracts ``['address']['postcode']`` from the raw
    response.

    Parameters
    ----------
    station_info : mapping-like row
        A single record indexable by ``lat_field`` and ``lon_field`` (for
        example the row passed by ``DataFrame.apply(..., axis=1)``).
    geolocator : object
        Any object exposing ``reverse((lat, lon))`` whose result has a
        ``raw`` mapping.
    lat_field, lon_field : hashable
        Keys of the latitude and longitude values in ``station_info``.

    Returns
    -------
    The postcode from the geocoder response (presumably a string for
    Nominatim; confirm against the data), or the integer sentinel 99999
    when the lookup fails for any reason: request error, no result
    (``reverse`` returned None), or a response without a postcode.
    """
    try:
        location = geolocator.reverse((station_info[lat_field], station_info[lon_field]))
        zip_code = location.raw['address']['postcode']
    except Exception:
        # Best-effort lookup: any ordinary failure is flagged with a sentinel
        # so the row can be checked manually. Catching Exception (not a bare
        # except) lets KeyboardInterrupt / SystemExit propagate, so a long
        # run over many rows can still be interrupted.
        zip_code = 99999

    return zip_code

# Nominatim needs an identifying user_agent. geopy's default request timeout
# is 1 second; a request that times out raises inside get_zipcode and is
# recorded as the 99999 sentinel, so allow more time per request.
geolocator = geopy.Nominatim(user_agent='metis_project01', timeout=10)

# Reverse-geocode every row: axis=1 passes each row to get_zipcode, and the
# extra keyword arguments are forwarded to it by DataFrame.apply.
station_info['zip_code'] = station_info.apply(
    get_zipcode,
    axis=1,
    geolocator=geolocator,
    lat_field='GTFS Latitude',
    lon_field='GTFS Longitude',
)

In [45]:
# Show the rows whose zip_code is still the 99999 sentinel
station_info.loc[station_info['zip_code'].eq(99999)]

Unnamed: 0,Station ID,Complex ID,GTFS Stop ID,Division,Line,Stop Name,Borough,Daytime Routes,Structure,GTFS Latitude,GTFS Longitude,North Direction Label,South Direction Label,ADA,ADA Notes,zip_code
42,43,43,D27,BMT,Broadway - Brighton,Parkside Av,Bk,B Q,Open Cut,40.655292,-73.961495,Manhattan,Brighton Beach & Coney Island,0,,99999
313,313,313,123,IRT,Broadway - 7Av,72 St,M,1 2 3,Subway,40.778453,-73.98197,Uptown & The Bronx,Downtown & Brooklyn,1,,99999


In [51]:
# Manual fix for row 42 (Parkside Av): according to Wikipedia, the station
# at this latitude and longitude is in zip code 11225.
# https://en.wikipedia.org/wiki/Parkside_Avenue_station

station_info.at[42, 'zip_code'] = 11225

In [54]:
# Manual fix for row 313 (72 St): according to Wikipedia, the station
# at this latitude and longitude is in zip code 10023.
# https://en.wikipedia.org/wiki/72nd_Street_station_(IRT_Broadway%E2%80%93Seventh_Avenue_Line)

station_info.at[313, 'zip_code'] = 10023

In [57]:
# Export station_info, now including the zip_code column, to a new CSV file.
# index=True is the pandas default: the row index is written as the first,
# unnamed column of the file.

station_info.to_csv(path_or_buf='station_info_zipcode.csv', index=True)

In [59]:
# list the working directory to confirm station_info_zipcode.csv was written
!ls

Add_zipcode_to_MTA_station.ipynb geopy_example.ipynb
MTA_station_info.csv             station_info_zipcode.csv
