In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the MTA station reference table (station IDs, names, lines and GTFS latitude/longitude)
# directly from the MTA developer site; this cell needs network access.
station_locations = pd.read_csv('http://web.mta.info/developers/data/nyct/subway/Stations.csv')
station_locations.head()

Unnamed: 0,Station ID,Complex ID,GTFS Stop ID,Division,Line,Stop Name,Borough,Daytime Routes,Structure,GTFS Latitude,GTFS Longitude,North Direction Label,South Direction Label
0,1,1,R01,BMT,Astoria,Astoria - Ditmars Blvd,Q,N W,Elevated,40.775036,-73.912034,,Manhattan
1,2,2,R03,BMT,Astoria,Astoria Blvd,Q,N W,Elevated,40.770258,-73.917843,Ditmars Blvd,Manhattan
2,3,3,R04,BMT,Astoria,30 Av,Q,N W,Elevated,40.766779,-73.921479,Astoria - Ditmars Blvd,Manhattan
3,4,4,R05,BMT,Astoria,Broadway,Q,N W,Elevated,40.76182,-73.925508,Astoria - Ditmars Blvd,Manhattan
4,5,5,R06,BMT,Astoria,36 Av,Q,N W,Elevated,40.756804,-73.929575,Astoria - Ditmars Blvd,Manhattan


In [3]:
station_locations.shape

(496, 13)

In [4]:
station_locations['Stop Name'].unique().shape

(377,)

In [5]:
# Keep only the first row for each stop name so that a name resolves to a single row.
# NOTE(review): distinct stations that share a name are collapsed into one row here —
# confirm that is acceptable for the coordinate lookup done later.
station_locations = station_locations.drop_duplicates('Stop Name')

In [9]:
# Load the per-station, per-day-of-week entry means from a local CSV.
# NOTE(review): the file is read from the working directory; how it is produced is not shown here.
daily_means_weekly = pd.read_csv('daily_means_weekly.csv')

In [10]:
# Discard the index column that was written into the CSV.
# Rebinding (instead of inplace=True) with errors='ignore' keeps this cell safe to re-run:
# a second execution is a no-op rather than a KeyError.
daily_means_weekly = daily_means_weekly.drop(columns='Unnamed: 0', errors='ignore')

In [11]:
daily_means_weekly.head()

Unnamed: 0,STATION,Day of Week,ENTRIES
0,1 AV,0,15817.779661
1,1 AV,1,17275.050847
2,1 AV,2,18004.661017
3,1 AV,3,18011.322034
4,1 AV,4,18247.237288


In [12]:
# Upper-case the MTA stop names so they can be compared with the upper-case turnstile station names.
# The vectorised .str accessor replaces the per-element Python loop.
station_locations['Stop Name'] = station_locations['Stop Name'].str.upper()

In [13]:
# Collapse " - " to "-" in the stop names.
# regex=False makes this a literal substring replacement, like str.replace.
station_locations['Stop Name'] = station_locations['Stop Name'].str.replace(" - ", "-", regex=False)

In [14]:
# Rewrite "/" as " ST-" in the turnstile station names (literal replacement, no regex).
daily_means_weekly['STATION'] = daily_means_weekly['STATION'].str.replace("/", " ST-", regex=False)

In [15]:
# Apply the same "/" -> " ST-" rewrite to the MTA stop names so both tables use one convention
# (literal replacement, no regex).
station_locations['Stop Name'] = station_locations['Stop Name'].str.replace("/", " ST-", regex=False)

In [16]:
# Distinct turnstile station names, in order of first appearance; these are the
# candidates each MTA stop name is matched against.
station_list = daily_means_weekly['STATION'].unique().tolist()

In [17]:
from difflib import SequenceMatcher

def similar(a, b):
    """Return the difflib similarity ratio of two sequences.

    Parameters
    ----------
    a, b : sequences (strings in this notebook)
        The two values to compare.

    Returns
    -------
    float
        ``SequenceMatcher.ratio()`` for the pair: a value in [0, 1],
        where 1.0 means the sequences are identical.
    """
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

In [18]:
similar("hello", "hella")

0.8

In [19]:
# For every MTA stop name, pick the turnstile station name with the highest
# similarity score. Ties keep the earliest candidate in station_list.
replace_list = []

for station1 in station_locations['Stop Name']:
    # Reset the best match for each stop. Without this, a stop that scores 0
    # against every candidate would silently inherit the previous stop's match
    # (or raise NameError on the very first iteration). Such a stop keeps its
    # own name instead.
    station_match = station1
    top_score = 0
    for station2 in station_list:
        score = similar(station1, station2)
        if score > top_score:
            top_score = score
            station_match = station2
    replace_list.append(station_match)

In [20]:
# Overwrite each stop name with the closest turnstile station name found for it
# (replace_list was built in the same row order as this column).
station_locations['Stop Name'] = replace_list

In [21]:
# Distinct names on each side after the fuzzy matching step.
set1 = set(station_locations['Stop Name'].unique())
set2 = set(daily_means_weekly['STATION'].unique())

# How many names the two tables now have in common.
len(set1 & set2)

338

In [22]:
daily_means_weekly.head()

Unnamed: 0,STATION,Day of Week,ENTRIES
0,1 AV,0,15817.779661
1,1 AV,1,17275.050847
2,1 AV,2,18004.661017
3,1 AV,3,18011.322034
4,1 AV,4,18247.237288


In [36]:
# Total the per-day rows of each station and keep the 25 stations with the largest totals.
# .sum() is used directly: passing the builtin `sum` to agg() is deprecated in recent pandas.
top25 = (
    daily_means_weekly[['STATION', 'ENTRIES']]
    .groupby(by='STATION')
    .sum()
    .sort_values('ENTRIES', ascending=False)
    .head(25)
    .reset_index()
)

In [26]:
# Copy the matched names into a 'STATION' column so this frame has the key column
# that the later merge with top25 joins on.
station_locations['STATION'] = list(station_locations['Stop Name'])

In [29]:
# The names now live in 'STATION'; remove the original 'Stop Name' column.
station_locations = station_locations.drop(columns='Stop Name')

In [38]:
station_locations.head()

Unnamed: 0,Station ID,Complex ID,GTFS Stop ID,Division,Line,Borough,Daytime Routes,Structure,GTFS Latitude,GTFS Longitude,North Direction Label,South Direction Label,STATION
0,1,1,R01,BMT,Astoria,Q,N W,Elevated,40.775036,-73.912034,,Manhattan,ASTORIA DITMARS
1,2,2,R03,BMT,Astoria,Q,N W,Elevated,40.770258,-73.917843,Ditmars Blvd,Manhattan,ASTORIA BLVD
2,3,3,R04,BMT,Astoria,Q,N W,Elevated,40.766779,-73.921479,Astoria - Ditmars Blvd,Manhattan,30 AV
3,4,4,R05,BMT,Astoria,Q,N W,Elevated,40.76182,-73.925508,Astoria - Ditmars Blvd,Manhattan,BROADWAY
4,5,5,R06,BMT,Astoria,Q,N W,Elevated,40.756804,-73.929575,Astoria - Ditmars Blvd,Manhattan,36 AV


In [41]:
# Attach location columns to the busiest stations. A left join keeps every top25 row,
# including rows for which station_locations has no matching name.
top25 = top25.merge(station_locations, how='left', on='STATION')

In [46]:
# Need to drop some of the stations from the merged dataframe because there were duplicate names in the station_locations dataframe.
# These stations have names very similar to the ones that are actually in the top25, but they should not be here.
# Also need to add Lat/Lon data manually for Path New WTC.

# Remove the unwanted rows by label, renumber the remaining rows from 0 and keep
# only the columns the map needs.
# NOTE(review): the labels are tied to the row order of the current merge output —
# re-check them whenever either input table changes.
top25 = (
    top25
    .drop(index=[11, 15, 17, 28])
    .reset_index(drop=True)
    .loc[:, ['STATION', 'ENTRIES', 'GTFS Latitude', 'GTFS Longitude']]
)

In [50]:
# Fill in the coordinates by hand for the station at row label 15.
# NOTE(review): presumably this is the Path New WTC row mentioned in the preceding
# cell's comment — confirm label 15 still points at it if the data changes.
top25.at[15, 'GTFS Latitude'] = 40.711835
top25.at[15, 'GTFS Longitude'] = -74.012188

In [148]:
top25.head()

Unnamed: 0,STATION,ENTRIES,GTFS Latitude,GTFS Longitude
0,34 ST-PENN STA,879572.09322,40.752287,-73.993391
1,GRD CNTRL-42 ST,790266.043103,40.751776,-73.976848
2,34 ST-HERALD SQ,656823.525424,40.749567,-73.98795
3,23 ST,578143.059322,40.741303,-73.989344
4,42 ST-PORT AUTH,562587.110169,40.757308,-73.989735


In [101]:
import folium

In [131]:
# Maps the integer used in the 'Day of Week' column to a weekday name (0 -> Monday ... 6 -> Sunday).
day_map = dict(enumerate(
    ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
))

In [107]:
# Average latitude/longitude of the plotted stations; used as the initial centre of the folium map.
lat_mean = top25['GTFS Latitude'].mean()
lon_mean = top25['GTFS Longitude'].mean()

In [133]:
def _in_thousands(value):
    """Round ``value`` to the nearest thousand and return the number of thousands as a string."""
    return str(int(round(value, -3) / 1000))


def create_popup_string(i, row):
    """Build the HTML shown in a station's map popup.

    Parameters
    ----------
    i : int
        Row label of the station in the ranking frame; displayed as rank ``i + 1``.
    row : mapping or pandas.Series
        Must provide ``'STATION'`` (station name) and ``'ENTRIES'`` (weekly total).

    Returns
    -------
    str
        HTML with the station name, its rank, the weekly total and one line per
        weekday. Counts are rounded to the nearest thousand and shown as "<n> K".

    Notes
    -----
    Reads the module-level ``daily_means_weekly`` frame and ``day_map`` dict.
    If ``daily_means_weekly`` has no row for the station on a given day, that day
    is shown as "n/a" instead of raising IndexError.
    """
    station = row['STATION']

    # Filter by station once; the original rescanned the whole frame for every day.
    station_rows = daily_means_weekly[daily_means_weekly['STATION'] == station]

    popup_string = "<h3>" + station + "</h3>"
    popup_string += "Rank: <strong>" + str(i + 1) + "</strong><br>"
    popup_string += "Weekly total: <strong>" + _in_thousands(row['ENTRIES']) + " K</strong><br>"

    for day in range(7):
        day_entries = station_rows.loc[station_rows['Day of Week'] == day, 'ENTRIES']
        if day_entries.empty:
            day_label = "n/a"
        else:
            # If several rows exist for the same day, the first one is used.
            day_label = _in_thousands(day_entries.iloc[0]) + " K"

        popup_string += day_map[day] + ": <strong>" + day_label + "</strong><br>"

    return popup_string

In [142]:
def color_size(row):
    """Choose a marker fill colour and radius from a station's weekly entries.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``'ENTRIES'``.

    Returns
    -------
    tuple
        ``(color, size)``: the first tier whose threshold is strictly exceeded
        wins; anything at or below the lowest threshold is ``('blue', 8)``.
    """
    # (exclusive lower bound, colour, radius), checked from busiest to quietest.
    tiers = (
        (600000, 'red', 14),
        (400000, 'yellow', 12),
        (300000, 'green', 10),
    )
    weekly_entries = row['ENTRIES']
    for lower_bound, tier_color, tier_size in tiers:
        if weekly_entries > lower_bound:
            return tier_color, tier_size
    return 'blue', 8

In [147]:
# Base map centred on the mean coordinates of the plotted stations.
station_map = folium.Map(location=[lat_mean, lon_mean], zoom_start=12, tiles="OpenStreetMap")

# All station markers go into one feature group so the layer control can toggle them together.
fg_stations = folium.FeatureGroup(name="Stations")

for i, row in top25.iterrows():
    lat, lon = row['GTFS Latitude'], row['GTFS Longitude']

    popup_string = create_popup_string(i, row)
    color, size = color_size(row)

    # The popup HTML is rendered inside a fixed-size iframe.
    iframe = folium.IFrame(html=popup_string, width=250, height=250)

    fg_stations.add_child(
        folium.CircleMarker(
            location=[lat, lon],
            radius=size,
            popup=folium.Popup(iframe, max_width=2650),
            fill_color=color,
            fill=True,
            color='grey',
            fill_opacity=0.7,
            tooltip=row['STATION'],
        )
    )

station_map.add_child(fg_stations)
station_map.add_child(folium.LayerControl())

# Write a standalone HTML copy, then leave the map as the cell's displayed value.
station_map.save("station_map.html")
station_map