#### Initial Setup

In [2]:
# Import Dependencies
from pathlib import Path

import geopandas as gpd
import geopy
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly_express as px
from geopy import distance
from geopy.exc import GeocoderTimedOut
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim
from tqdm import tqdm
# NOTE: deprecated module path (tqdm suggests `tqdm.notebook.*` instead); kept as-is
from tqdm._tqdm_notebook import tqdm_notebook


# Optional: uncomment to lift pandas' row/column display limits
#pd.set_option("display.max_rows", None, "display.max_columns", None)

# Relative path of the raw stations file
path = "Resources/stations.csv"

# Load the raw station data
df = pd.read_csv(filepath_or_buffer=path)

# Capture the (rows, columns) shape before any cleaning is applied
original_dimensions = (len(df.index), len(df.columns))
print(f'Original dimensions of the subway stations site dataset (rows/columns): {original_dimensions}')

Original dimensions of the subway stations site dataset (rows/columns): (496, 15)


Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  from tqdm._tqdm_notebook import tqdm_notebook


#### Clean Up

In [3]:
# Columns that are not needed downstream
irrelevant_columns = [
    'Complex ID',
    'Structure',
    'Division',
    'GTFS Stop ID',
    'Daytime Routes',
    'North Direction Label',
    'South Direction Label',
    'ADA Notes',
]

# Index.difference returns the remaining labels (in sorted order), which
# therefore also determines the column order of the resulting frame
kept_columns = df.columns.difference(irrelevant_columns)
df = df[kept_columns]

# Shape after the column selection
columns_removed = df.shape
print(f'The dimensions of the subway stations site dataset after removing irrelevant columns: {columns_removed}')

The dimensions of the subway stations site dataset after removing irrelevant columns: (496, 7)


In [4]:
# Change each Borough Designation Initial to the full Borough for consistency with other dataframes.
# An exact-match mapping is used instead of chained substring `str.replace` calls, so a code is
# only rewritten when the whole cell value equals it (no partial/regex matches, no dependence on
# the order of the replacements, and re-running the cell is a no-op).
borough_names = {
    'Q': 'queens',
    'M': 'manhattan',
    'Bk': 'brooklyn',
    'Bx': 'bronx',
    'SI': 'staten island',
}
df['Borough'] = df['Borough'].replace(borough_names)

In [5]:
# Map the source column headers to snake_case names
column_names = {
    "ADA": "ada_access",
    "Borough": "borough",
    "GTFS Latitude": "lat_field",
    "GTFS Longitude": "lon_field",
    "Line": "line",
    "Station ID": "station_id",
    "Stop Name": "station_name",
}
df = df.rename(columns=column_names)

# Lower-case the text columns
for text_column in ("line", "station_name"):
    df[text_column] = df[text_column].str.lower()

In [6]:
# Display first 5 records to sanity-check the renamed columns and lower-cased text
df.head()

Unnamed: 0,ada_access,borough,lat_field,lon_field,line,station_id,station_name
0,0,queens,40.775036,-73.912034,astoria,1,astoria-ditmars blvd
1,1,queens,40.770258,-73.917843,astoria,2,astoria blvd
2,0,queens,40.766779,-73.921479,astoria,3,30 av
3,0,queens,40.76182,-73.925508,astoria,4,broadway
4,0,queens,40.756804,-73.929575,astoria,5,36 av


#### Add Categorical Encoding & Binary Values

In [7]:
# ADA access codes in the data: 0 = not accessible, 1 = accessible, 2 = partially accessible
# (these are the meanings given to the dummy columns created from this field)

# Treat the access code as a categorical variable
ada_categorical = df['ada_access'].astype('category')
df['ada_access'] = ada_categorical

# Keep the integer category codes in a separate column
df['ada_level'] = ada_categorical.cat.codes

# Number of stations per access level
df['ada_level'].value_counts()

0    361
1    126
2      9
Name: ada_level, dtype: int64

In [8]:
# Generate binary (one-hot) indicator columns for the ADA access category.
# dtype is pinned to uint8 so the indicators are 0/1 integers regardless of pandas version.
ada_dummies = pd.get_dummies(df['ada_access'], prefix='access_type_is', dtype='uint8')

# Rename columns to display whether the station is fully, partially or NOT ada-accessible
# (i.e. is there an elevator on both platforms?)
ada_dummies = ada_dummies.rename(columns={
    "access_type_is_0": "ada_access_no",
    "access_type_is_1": "ada_access_yes",
    "access_type_is_2": "ada_access_partial",
})

# Attach the indicators by row index. The previous `df.merge(dum_df)` joined on every shared
# column (including float coordinates), which can multiply rows when duplicates exist.
# Dropping any existing indicator columns first keeps the cell safe to re-run.
df = df.drop(columns=ada_dummies.columns, errors='ignore').join(ada_dummies)

#### Add Zipcodes Column

In [9]:
# tqdm, Nominatim and RateLimiter are already imported in the setup cell at the top of the notebook.

# Register `progress_apply` on pandas objects so the long-running geocoding shows a progress bar
tqdm.pandas()

geolocator = Nominatim(user_agent="alison.sadel@gmail.com")

# The public Nominatim service allows at most 1 request per second; the previous delay of
# 0.01 s permitted ~100 requests/second, which violates the usage policy and risks being blocked.
reverse = RateLimiter(geolocator.reverse, min_delay_seconds=1)

# Reverse-geocode each station's (latitude, longitude) pair into a geopy location
df['location'] = df.progress_apply(
    lambda row: reverse((row['lat_field'], row['lon_field'])),
    axis=1,
)
    
    
def parse_zipcode(location):
    """Return the postcode stored in a geocoding result, or None if unavailable.

    `location` is expected to expose a `raw` dict; the postcode is read from
    `raw['address']['postcode']`. A falsy `location`, a missing/empty
    'address' entry, or a missing/empty 'postcode' all yield None.
    """
    if not location:
        return None

    address = location.raw.get('address')
    if not address:
        return None

    # Normalise any falsy postcode (missing key, empty string) to None
    return address.get('postcode') or None
# Extract the postcode from every geocoded location (None where unavailable)
zipcodes = df['location'].map(parse_zipcode)
df['zipcode'] = zipcodes

# Show the frame with the new location and zipcode columns
df

  from pandas import Panel
100%|██████████| 496/496 [13:39<00:00,  1.65s/it]   


Unnamed: 0,ada_access,borough,lat_field,lon_field,line,station_id,station_name,ada_level,ada_access_no,ada_access_yes,ada_access_partial,location,zipcode
0,0,queens,40.775036,-73.912034,astoria,1,astoria-ditmars blvd,0,1,0,0,"(Astoria-Ditmars Boulevard, 31st Street, Stein...",11101
1,1,queens,40.770258,-73.917843,astoria,2,astoria blvd,1,0,1,0,"(Astoria Boulevard, Hoyt Avenue South, Queens,...",11102
2,0,queens,40.766779,-73.921479,astoria,3,30 av,0,1,0,0,"(30th Avenue, 31st Street, Queens, Queens Coun...",11102
3,0,queens,40.761820,-73.925508,astoria,4,broadway,0,1,0,0,"(Broadway, 31st Street, Queens, Queens County,...",11101
4,0,queens,40.756804,-73.929575,astoria,5,36 av,0,1,0,0,"(35-53, 31st Street, Queens, Queens County, Ne...",11106
...,...,...,...,...,...,...,...,...,...,...,...,...,...
491,0,staten island,40.525507,-74.200064,staten island,517,prince's bay,0,1,0,0,"(Prince's Bay, Holton Avenue, Princes Bay, Sta...",10309
492,0,staten island,40.522410,-74.217847,staten island,518,pleasant plains,0,1,0,0,"(Pleasant Plains, Station Avenue, Pleasant Pla...",10309
493,0,staten island,40.519631,-74.229141,staten island,519,richmond valley,0,1,0,0,"(Richmond Valley, Richmond Valley Road, Richmo...",10307
494,1,staten island,40.512764,-74.251961,staten island,522,tottenville,1,0,1,0,"(Tottenville, Bentley Street, Tottenville, Sta...",10307


In [10]:
# Ensure zipcodes are only 5 characters in length
df['zipcode'] = df['zipcode'].str[:5]

# Remove rows with no zipcode. Missing postcodes are stored as missing values (None/NaN),
# not as the string "None", so drop them explicitly instead of relying on
# `str.contains("None") == False`, which only worked because NaN == False evaluates to False.
df = df.dropna(subset=['zipcode'])

# Check work
df['zipcode'].tolist()

['11101',
 '11102',
 '11102',
 '11101',
 '11106',
 '11101',
 '10022',
 '10153',
 '10019',
 '10019',
 '10036',
 '10019',
 '10001',
 '10010',
 '10003',
 '10003',
 '10003',
 '10003',
 '10013',
 '10003',
 '10012',
 '10006',
 '10004',
 '11201',
 '11201',
 '11201',
 '11217',
 '11217',
 '11209',
 '11215',
 '11232',
 '11209',
 '11220',
 '11220',
 '11204',
 '11220',
 '11209',
 '11209',
 '11209',
 '11217',
 '11217',
 '11225',
 '11225',
 '11235',
 '11226',
 '11226',
 '11226',
 '11230',
 '11234',
 '11229',
 '11234',
 '11235',
 '11235',
 '11235',
 '11235',
 '11235',
 '11224',
 '11224',
 '11220',
 '11219',
 '11219',
 '11214',
 '11214',
 '11214',
 '11214',
 '11214',
 '11223',
 '11214',
 '11214',
 '11224',
 '11220',
 '11204',
 '11204',
 '11204',
 '11204',
 '11223',
 '11223',
 '11223',
 '11223',
 '11418',
 '11418',
 '11418',
 '11421',
 '11421',
 '11421',
 '11208',
 '11208',
 '11208',
 '11208',
 '11207',
 '11207',
 '11207',
 '11207',
 '11207',
 '11207',
 '11207',
 '11237',
 '11206',
 '11207',
 '11211',


#### Export to CSV

In [11]:
# Print final number of rows and columns of the cleaned stations dataset
final_length = len(df)
final_column_length = len(df.columns)

# Message corrected: the rows are subway stations (not sales records), and "colum" -> "column"
print(f'After preprocessing the dataframe, creating classifications and calling an API to create a zipcode column, there are {final_length} station records and {final_column_length} fields.')
      
      
      

After preprocessing the dataframe, creating classifications and calling an API to create a zipcode colum, there are 494 sales records and 13 fields.


In [18]:
# Give the row index the header 'index' (written as the first column header on export)
df.index.set_names('index', inplace=True)

In [19]:
# Export to csv.
# Create the target folder first so the export does not fail when "output/" is missing.
output_path = Path("output/stations_data_clean.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path)

In [20]:
# Display the datatype of every column in the cleaned frame
df.dtypes

ada_access            category
borough                 object
lat_field              float64
lon_field              float64
line                    object
station_id               int64
station_name            object
ada_level                 int8
ada_access_no            uint8
ada_access_yes           uint8
ada_access_partial       uint8
location                object
zipcode                 object
dtype: object

In [21]:
# Display the column names of the cleaned frame
df.columns

Index(['ada_access', 'borough', 'lat_field', 'lon_field', 'line', 'station_id',
       'station_name', 'ada_level', 'ada_access_no', 'ada_access_yes',
       'ada_access_partial', 'location', 'zipcode'],
      dtype='object')

In [22]:
# Display the final cleaned frame (index now labelled 'index')
df

Unnamed: 0_level_0,ada_access,borough,lat_field,lon_field,line,station_id,station_name,ada_level,ada_access_no,ada_access_yes,ada_access_partial,location,zipcode
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,0,queens,40.775036,-73.912034,astoria,1,astoria-ditmars blvd,0,1,0,0,"(Astoria-Ditmars Boulevard, 31st Street, Stein...",11101
1,1,queens,40.770258,-73.917843,astoria,2,astoria blvd,1,0,1,0,"(Astoria Boulevard, Hoyt Avenue South, Queens,...",11102
2,0,queens,40.766779,-73.921479,astoria,3,30 av,0,1,0,0,"(30th Avenue, 31st Street, Queens, Queens Coun...",11102
3,0,queens,40.761820,-73.925508,astoria,4,broadway,0,1,0,0,"(Broadway, 31st Street, Queens, Queens County,...",11101
4,0,queens,40.756804,-73.929575,astoria,5,36 av,0,1,0,0,"(35-53, 31st Street, Queens, Queens County, Ne...",11106
...,...,...,...,...,...,...,...,...,...,...,...,...,...
491,0,staten island,40.525507,-74.200064,staten island,517,prince's bay,0,1,0,0,"(Prince's Bay, Holton Avenue, Princes Bay, Sta...",10309
492,0,staten island,40.522410,-74.217847,staten island,518,pleasant plains,0,1,0,0,"(Pleasant Plains, Station Avenue, Pleasant Pla...",10309
493,0,staten island,40.519631,-74.229141,staten island,519,richmond valley,0,1,0,0,"(Richmond Valley, Richmond Valley Road, Richmo...",10307
494,1,staten island,40.512764,-74.251961,staten island,522,tottenville,1,0,1,0,"(Tottenville, Bentley Street, Tottenville, Sta...",10307
