#### Initial Setup

In [2]:
# Import Dependencies
import pandas as pd
import numpy as np
import geopy
from geopy import distance
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
import geopandas as gpd
import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import matplotlib.pyplot as plt
import plotly_express as px
from tqdm import tqdm
from tqdm._tqdm_notebook import tqdm_notebook


# Optional: lift pandas' display truncation while exploring the data
#pd.set_option("display.max_rows", None, "display.max_columns", None)

# Relative location of the raw station export
path = "Resources/stations.csv"

# Load the station records into the working frame
df = pd.read_csv(filepath_or_buffer=path)

# Snapshot the (rows, columns) shape so later cleaning steps can be compared against it
original_dimensions = (len(df.index), len(df.columns))
print(f'Original dimensions of the subway stations site dataset (rows/columns): {original_dimensions}')

Original dimensions of the subway stations site dataset (rows/columns): (496, 15)


#### Clean Up

In [3]:
# Columns that are not needed for the rest of the analysis
irrelevant_columns = [
    'Complex ID',
    'Division',
    'GTFS Stop ID',
    'Daytime Routes',
    'North Direction Label',
    'South Direction Label',
    'ADA Notes',
]

# Keep every column that is not listed above
# (the recorded preview further down shows the survivors in alphabetical order)
kept_columns = df.columns.difference(irrelevant_columns)
df = df[kept_columns]

columns_removed = df.shape
print(f'The dimensions of the subway stations site dataset after removing irrelevant columns: {columns_removed}')

The dimensions of the subway stations site dataset after removing irrelevant columns: (496, 8)


In [4]:
# Expand each borough code to its full (lowercase) borough name for consistency with the
# other dataframes. A single exact-match lookup replaces the previous chain of substring
# `str.replace` calls, which depended on call order and would have rewritten any value that
# merely *contained* one of the codes. Values not listed here are left untouched.
borough_names = {
    'Q': 'queens',
    'M': 'manhattan',
    'Bk': 'brooklyn',
    'Bx': 'bronx',
    'SI': 'staten island',
}
df['Borough'] = df['Borough'].replace(borough_names)

In [5]:
# Map the source headers onto the snake_case names used from here on
renamed_columns = {
    "ADA": "ada_access",
    "Borough": "borough",
    "GTFS Latitude": "lat_field",
    "GTFS Longitude": "lon_field",
    "Line": "line",
    "Station ID": "station_id",
    "Stop Name": "station_name",
    "Structure": "structure",
}
df = df.rename(columns=renamed_columns)

# Lowercase the free-text columns
for text_column in ("line", "station_name", "structure"):
    df[text_column] = df[text_column].str.lower()

In [6]:
# Preview the first five rows of the cleaned frame
df.head(n=5)

Unnamed: 0,ada_access,borough,lat_field,lon_field,line,station_id,station_name,structure
0,0,queens,40.775036,-73.912034,astoria,1,astoria-ditmars blvd,elevated
1,1,queens,40.770258,-73.917843,astoria,2,astoria blvd,elevated
2,0,queens,40.766779,-73.921479,astoria,3,30 av,elevated
3,0,queens,40.76182,-73.925508,astoria,4,broadway,elevated
4,0,queens,40.756804,-73.929575,astoria,5,36 av,elevated


#### Add Categorical Encoding & Binary Values

In [7]:
# ADA Accessible is denoted by 1; the counts below show three distinct values (0, 1, 2).
# NOTE(review): presumably 0 = not accessible and 2 = partially accessible - confirm
# against the dataset's data dictionary.

# Re-type the column as a pandas categorical
ada_categorical = df['ada_access'].astype('category')
df['ada_access'] = ada_categorical

# Store each category's integer code in a separate column, then tally the codes
df['ada_level'] = ada_categorical.cat.codes
df['ada_level'].value_counts()

0    361
1    126
2      9
Name: ada_level, dtype: int64

In [8]:
# Generate binary (one-hot) indicator columns for each ADA access value using get_dummies
dum_df = pd.get_dummies(df, columns=["ada_access"], prefix=["access_type_is"] )

# Attach only the newly created indicator columns, aligned on the row index.
# The previous `df.merge(dum_df)` had no `on=` and therefore joined on every shared column
# (including the float coordinates); any duplicated station rows would have been multiplied.
indicator_columns = [column for column in dum_df.columns if column not in df.columns]
df = df.join(dum_df[indicator_columns])
df

Unnamed: 0,ada_access,borough,lat_field,lon_field,line,station_id,station_name,structure,ada_level,access_type_is_0,access_type_is_1,access_type_is_2
0,0,queens,40.775036,-73.912034,astoria,1,astoria-ditmars blvd,elevated,0,1,0,0
1,1,queens,40.770258,-73.917843,astoria,2,astoria blvd,elevated,1,0,1,0
2,0,queens,40.766779,-73.921479,astoria,3,30 av,elevated,0,1,0,0
3,0,queens,40.761820,-73.925508,astoria,4,broadway,elevated,0,1,0,0
4,0,queens,40.756804,-73.929575,astoria,5,36 av,elevated,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
491,0,staten island,40.525507,-74.200064,staten island,517,prince's bay,open cut,0,1,0,0
492,0,staten island,40.522410,-74.217847,staten island,518,pleasant plains,embankment,0,1,0,0
493,0,staten island,40.519631,-74.229141,staten island,519,richmond valley,open cut,0,1,0,0
494,1,staten island,40.512764,-74.251961,staten island,522,tottenville,at grade,1,0,1,0


In [9]:
# Reverse-geocode every station's coordinates into a Nominatim location record.
# tqdm, Nominatim and RateLimiter are already imported in the notebook's first cell,
# so they are not re-imported here.
tqdm.pandas()  # enables DataFrame.progress_apply with a progress bar

# timeout=10 matches the other Nominatim client configured later in this notebook.
geolocator = Nominatim(user_agent="alison.sadel@gmail.com", timeout=10)

# The public Nominatim service permits at most one request per second; the previous
# 0.01 s delay exceeded that limit and risks the client being throttled or blocked.
reverse = RateLimiter(geolocator.reverse, min_delay_seconds=1)

# One lookup per row, passing a (latitude, longitude) tuple
df['location'] = df.progress_apply(lambda row: reverse((row['lat_field'], row['lon_field'])),axis=1)
    
    
def parse_zipcode(location):
    """Return the postcode stored in a geocoding result, or None when there is none.

    `location` is expected to expose a dict-like `raw` attribute whose optional
    'address' entry may hold a 'postcode'. A falsy `location`, a missing or empty
    'address', and a missing or empty 'postcode' all yield None.
    """
    if not location:
        return None
    address = location.raw.get('address')
    if not address:
        return None
    postcode = address.get('postcode')
    return postcode if postcode else None
# Pull the postcode out of every geocoding result into its own column
station_locations = df['location']
df['zipcode'] = station_locations.map(parse_zipcode)

df

  from pandas import Panel
100%|██████████| 496/496 [11:39<00:00,  1.41s/it]  


Unnamed: 0,ada_access,borough,lat_field,lon_field,line,station_id,station_name,structure,ada_level,access_type_is_0,access_type_is_1,access_type_is_2,location,zipcode
0,0,queens,40.775036,-73.912034,astoria,1,astoria-ditmars blvd,elevated,0,1,0,0,"(Astoria-Ditmars Boulevard, 31st Street, Stein...",11101
1,1,queens,40.770258,-73.917843,astoria,2,astoria blvd,elevated,1,0,1,0,"(Astoria Boulevard, Hoyt Avenue South, Queens,...",11102
2,0,queens,40.766779,-73.921479,astoria,3,30 av,elevated,0,1,0,0,"(30th Avenue, 31st Street, Queens, Queens Coun...",11102
3,0,queens,40.761820,-73.925508,astoria,4,broadway,elevated,0,1,0,0,"(Broadway, 31st Street, Queens, Queens County,...",11101
4,0,queens,40.756804,-73.929575,astoria,5,36 av,elevated,0,1,0,0,"(35-53, 31st Street, Queens, Queens County, Ne...",11106
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
491,0,staten island,40.525507,-74.200064,staten island,517,prince's bay,open cut,0,1,0,0,"(Prince's Bay, Holton Avenue, Princes Bay, Sta...",10309
492,0,staten island,40.522410,-74.217847,staten island,518,pleasant plains,embankment,0,1,0,0,"(Pleasant Plains, Station Avenue, Pleasant Pla...",10309
493,0,staten island,40.519631,-74.229141,staten island,519,richmond valley,open cut,0,1,0,0,"(Richmond Valley, Richmond Valley Road, Richmo...",10307
494,1,staten island,40.512764,-74.251961,staten island,522,tottenville,at grade,1,0,1,0,"(Tottenville, Bentley Street, Tottenville, Sta...",10307


In [None]:
# Build one "lat,lon" query string per station
df['geom'] = df['lat_field'].map(str) + ',' + df['lon_field'].map(str)
#df['geom'][0]

locator = Nominatim(user_agent="alison.sadel@gmail.com", timeout=10)

# The public Nominatim service permits at most one request per second; the previous
# 0.001 s delay exceeded that limit and risks the client being throttled or blocked.
rgeocode = RateLimiter(locator.reverse, min_delay_seconds=1)

# Reverse-geocode every query string with a progress bar
tqdm.pandas()
location = df['geom'].progress_apply(rgeocode)

In [None]:
# Inspect the reverse-geocoding results produced from df['geom'] in the preceding cell
location

In [None]:
# Spot-check the rate-limited geocoder on the first station's coordinates.
# `rgeocode` is a RateLimiter wrapping `locator.reverse`: it is called directly and has no
# `.reverse` attribute. The `lat`/`long` names used before are not defined anywhere in the
# visible notebook, so the coordinates are read from the frame instead.
rgeocode((df['lat_field'].iloc[0], df['lon_field'].iloc[0]))

In [None]:
def parse_zipcode(location):
    """Extract the 'postcode' from a geocoding result's raw address details.

    Returns None when `location` is falsy, when its `raw` mapping has no (or an empty)
    'address' entry, or when that address has no (or an empty) 'postcode'.
    """
    address = location.raw.get('address') if location else None
    postcode = address.get('postcode') if address else None
    return postcode or None
        
# Write to the lowercase 'zipcode' column used by the rest of the notebook rather than
# creating a second, case-variant 'Zipcode' column holding the same values.
df['zipcode'] = df['location'].apply(parse_zipcode)

In [None]:
# Debug aid: show the class of the rate-limited geocoder wrapper
type(rgeocode)

In [None]:
# Spot-check: reverse-geocode one hard-coded "lat, lon" string directly with `locator`
# (this call bypasses the RateLimiter wrapper)
locator.reverse('40.516578, -74.242096')

In [None]:
# geopy Location objects expose the provider payload through `.raw`; they have no
# `.postcode` attribute, so the postcode is read from the raw address details instead.
locator.reverse('40.516578, -74.242096').raw.get('address', {}).get('postcode')

In [None]:
tester = location.to_frame()
#tester['geom'].split(',')
# The 'geom' column here holds the objects returned by the geocoder, not strings, so the
# `.str` accessor has nothing to split. Convert each result to its text form first, then
# break it into comma-separated parts (at most 10 splits, one part per column).
tester["geom"].astype(str).str.split(",", n = 10, expand = True)

In [None]:
# `location` is a pandas Series, which has no `.split` method; split each result's text
# form on commas through the string accessor instead.
chunks = location.astype(str).str.split(',')

In [None]:
# Join latitude and longitude into a single "lat,lon" text column
lat_text = df['lat_field'].map(str)
lon_text = df['lon_field'].map(str)
df['smash'] = lat_text.str.cat(lon_text, sep=',')
df
#location = locator.reverse()
#location.raw 

#### Add Zipcodes Column

In [None]:
# df['geom'] = df['lat_field'].map(str) + ',' + df['lon_field'].map(str)
# #df['geom'][0]

# locator = Nominatim(user_agent="alison.sadel@gmail.com", timeout=10)

# rgeocode = RateLimiter(locator.reverse, min_delay_seconds=0.001)

In [None]:
# Register tqdm's progress_apply on pandas objects
tqdm.pandas()

# Reverse-geocode every "lat,lon" string through the rate-limited geocoder
geocoded_fill = df['geom'].progress_apply(rgeocode)
df['fill'] = geocoded_fill
df.head()

In [None]:
# Give the one-hot ADA indicator columns descriptive names. The keys must match the columns
# created by get_dummies with the "access_type_is" prefix; the previous keys
# ("access_type_0", "access_type_1", "ada_access_type_2") matched no column, so the rename
# silently did nothing.
subway_df = df.rename(columns={
    "access_type_is_0": "ada_access_type_no",
    "access_type_is_1": "ada_access_type_yes",
    "access_type_is_2": "ada_access_type_partial",
})
                   

In [None]:
# Display the renamed copy of the station frame
subway_df

In [None]:
#subway_df['zipcode'].tolist()
# Split each zipcode value on commas into twelve positional columns named '1'..'12'.
# The previous version assigned the raw split result straight to twelve columns, which
# fails with a length mismatch unless the values split into exactly twelve parts.
# Reindexing pads missing positions with NaN and drops any part beyond the twelfth.
zip_part_columns = [str(position) for position in range(1, 13)]
zip_parts = (
    subway_df['zipcode']
    .map(str)
    .str.split(",", expand=True)
    .reindex(columns=range(len(zip_part_columns)))
)
zip_parts.columns = zip_part_columns
subway_df[zip_part_columns] = zip_parts

In [None]:
# Show every zipcode value as a plain Python list
zipcode_column = subway_df['zipcode']
zipcode_column.to_list()

In [None]:
# List the column labels currently present on subway_df
subway_df.columns

In [None]:
# Inspect the column labelled '1'
subway_df.loc[:, '1']

In [None]:
# Inspect the column labelled '2'
subway_df.loc[:, '2']