In [1]:
import pandas as pd
import folium
from ast import literal_eval

# Creating Maps of the Subway System

Here I am using Folium to plot the subway stops on a map of New York City. I have made several versions of this plot, including:
- A basic map of all the stops
- A map showing all the possible transfer points
- A map showing the popularity of the stations through the size of the markers


In [176]:
# Station table. The `lines` column is read as text and parsed back into
# Python objects with `literal_eval`.
stops = pd.read_csv(
    '../Data/stops_updated.txt',
    usecols=['stop_id', 'stop_name', 'stop_lat', 'stop_lon', 'parent_station', 'lines'])
stops['lines'] = stops['lines'].map(literal_eval)

# Stop order along each route.
ordered_stops = pd.read_csv(
    '../Data/ordered_stops.txt',
    usecols=['route_id', 'stop_id', 'stop_sequence'])

In [488]:
# Rows without a `parent_station` value, re-indexed from 0.
parent_stops = stops.loc[pd.isnull(stops['parent_station'])].reset_index(drop=True)

In [178]:
# Route/direction key: the route id followed by the last character of the stop id.
ordered_stops['route_dir_id'] = ordered_stops['route_id'] + ordered_stops['stop_id'].str.get(-1)

# Attach the station attributes, then order every route/direction by stop sequence.
# NOTE(review): this cell rebinds `ordered_stops`, so running it twice merges the
# station columns a second time — re-run the loading cell first.
ordered_stops = (
    ordered_stops
    .merge(stops[['stop_id', 'stop_name', 'stop_lat', 'stop_lon', 'parent_station']],
           on='stop_id')
    .sort_values(['route_dir_id', 'stop_sequence'])
    .reset_index(drop=True)
)

In [13]:
# Marker/line colour for every `route_dir_id`, declared once per colour group.
colors = {
    route_dir: color
    for color, route_dirs in [
        ('red', ['1N', '2N', '3N']),
        ('green', ['4N', '5N', '6N']),
        ('purple', ['7N']),
        ('blue', ['A1N', 'A2N', 'CN', 'EN']),
        ('orange', ['BN', 'DN', 'FN', 'MN']),
        ('grey', ['FSN', 'HN', 'LN', 'GSN']),
        ('lightgreen', ['GN']),
        ('brown', ['JN', 'ZN']),
        ('yellow', ['NN', 'QN', 'RN']),
        ('lightblue', ['SIN']),
    ]
    for route_dir in route_dirs
}

To plot each line correctly, I manually removed the duplicate stops so that the remaining stops are connected in the correct order. This work is not shown in the notebook, but its output is saved to `../Data/stops_for_plot.csv`, which is loaded below.

In [567]:
# Concatenate the five partial selections (a-e) into `inds_2`, in that order.
# NOTE(review): inds_2a ... inds_2e are not defined in the visible part of this
# notebook, so this cell presumably depends on state from cells that were removed.
inds_2 = inds_2a.append(inds_2b).append(inds_2c).append(inds_2d).append(inds_2e)

In [568]:
# Concatenate the ten partial selections into the final `inds`, in numeric order.
# NOTE(review): apart from `inds_2`, none of the inds_N operands is defined in the
# visible part of this notebook — presumably left over from removed cells.
inds = inds_1.append(inds_2).append(inds_3).append(inds_4).append(inds_5).append(inds_6).append(inds_7).append(inds_8).append(inds_9).append(inds_10)

In [11]:
# The plotting table was exported once with the two statements below (kept for
# reference, disabled) and is now simply re-loaded from that CSV.
#   ploting_data = ordered_stops.loc[inds].reset_index(drop=True)
#   ploting_data.to_csv('../Data/stops_for_plot.csv')
ploting_data = pd.read_csv(filepath_or_buffer='../Data/stops_for_plot.csv')

In [14]:
# Basic map: every stop as a small square, with consecutive stops of a route joined by a line.
map1 = folium.Map(location=[40.7528, -73.92], zoom_start=12, tiles='Stamen Terrain')

# Materialise the rows once as plain dicts. Indexing `ploting_data.loc[i][...]`
# for every single field rebuilds a row Series on each access, which is very
# slow in a row-by-row loop. Plain iteration also removes the Python-2-only `xrange`.
stop_rows = ploting_data.to_dict('records')

# One marker per row, coloured by the row's route/direction id.
for stop in stop_rows:
    line_color = colors[stop['route_dir_id']]
    folium.RegularPolygonMarker(location=[stop['stop_lat'], stop['stop_lon']],
                                popup=stop['stop_name'], number_of_sides=4, radius=2,
                                color=line_color,
                                fill_color=line_color).add_to(map1)

# Join a row to the next one only when both share a route/direction id and their
# stop_sequence values are consecutive, so unrelated rows are never connected.
for stop, next_stop in zip(stop_rows, stop_rows[1:]):
    same_route = stop['route_dir_id'] == next_stop['route_dir_id']
    if same_route and stop['stop_sequence'] == next_stop['stop_sequence'] - 1:
        folium.PolyLine([(stop['stop_lat'], stop['stop_lon']),
                         (next_stop['stop_lat'], next_stop['stop_lon'])],
                        opacity=1, color=colors[stop['route_dir_id']], weight=2).add_to(map1)
map1

KeyboardInterrupt: 

In [577]:
# Write the basic stop map out as an HTML file.
map1.save('../Maps/subway_stops.html')

The second plot shows all the transfer stations as the larger squares on the map.

In [397]:
# Transfer pairs: only the origin and destination stop ids are needed.
transfers = pd.read_csv(
    '../Data/transfers_updated.txt',
    usecols=['from_stop_id', 'to_stop_id'])

In [410]:
# Drop rows where a stop is paired with itself.
transfers = transfers.loc[transfers['from_stop_id'].ne(transfers['to_stop_id'])]

# Distinct stop ids appearing on either side of a remaining transfer.
from_stations = transfers.groupby('from_stop_id').size().index
to_stations = transfers.groupby('to_stop_id').size().index

# A stop counts as a transfer station if it appears on either side.
transfer_stations = set(from_stations) | set(to_stations)

In [572]:
# Transfer map: same layout as the basic map, but stops whose parent station is a
# transfer station get a larger square.
map2 = folium.Map(location=[40.7528, -73.92], zoom_start=12, tiles='Stamen Terrain')

# Materialise the rows once as plain dicts. Indexing `ploting_data.loc[i][...]`
# for every single field rebuilds a row Series on each access, which is very
# slow in a row-by-row loop. Plain iteration also removes the Python-2-only `xrange`.
stop_rows = ploting_data.to_dict('records')

for stop in stop_rows:
    # Radius 10 for transfer stations, radius 2 for every other stop.
    if stop['parent_station'] in transfer_stations:
        size = 10
    else:
        size = 2
    line_color = colors[stop['route_dir_id']]
    folium.RegularPolygonMarker(location=[stop['stop_lat'], stop['stop_lon']],
                                popup=stop['stop_name'], number_of_sides=4, radius=size,
                                color=line_color,
                                fill_color=line_color).add_to(map2)

# Join a row to the next one only when both share a route/direction id and their
# stop_sequence values are consecutive, so unrelated rows are never connected.
for stop, next_stop in zip(stop_rows, stop_rows[1:]):
    same_route = stop['route_dir_id'] == next_stop['route_dir_id']
    if same_route and stop['stop_sequence'] == next_stop['stop_sequence'] - 1:
        folium.PolyLine([(stop['stop_lat'], stop['stop_lon']),
                         (next_stop['stop_lat'], next_stop['stop_lon'])],
                        opacity=1, color=colors[stop['route_dir_id']], weight=2).add_to(map2)

map2

In [573]:
# Write the transfer-station map out as an HTML file.
map2.save('../Maps/transfer_stations.html')

Finally, I plot the popularity of the different transfer stations by changing the size of the markers. I scaled the popularities by the number of train lines that serve each station. Before this scaling, Times Square was the most popular transfer point, because almost every train line stops there. After scaling, we can see that 59th St - Columbus Circle is just as popular as Times Square.

In [16]:
import datetime
def sec_to_time(row):
    """Convert a row's ``seconds`` value into a clock time with minute precision.

    Parameters
    ----------
    row : mapping or pandas.Series
        Anything indexable with ``'seconds'`` that yields a number of seconds.

    Returns
    -------
    datetime.time
        The hour and minute of the offset; the seconds field is always 0
        (leftover seconds are truncated, not rounded). Hours are taken
        modulo 24, so an offset of 86400 seconds or more wraps around to a
        valid clock time instead of making ``datetime.time`` raise ValueError.
    """
    whole_hours, leftover_seconds = divmod(row['seconds'], 3600)
    # int() because the inputs may be floats; datetime.time only accepts integers.
    return datetime.time(int(whole_hours) % 24, int(leftover_seconds // 60), 0)

In [18]:
# Full export: every crowdedness record, joined to its station row, written as JSON.
data = pd.read_csv(
    '../Data/Crowdedness/trip_data_crowdedness.csv',
    usecols=['stop_id', 'day', 'adjusted_value', 'seconds'])
data['time'] = data.apply(sec_to_time, axis=1)

# NOTE(review): this rebinds `stops` to a frame without the `parent_station` column.
stops = pd.read_csv(
    '../Data/stops_updated.txt',
    usecols=['stop_id', 'stop_name', 'stop_lat', 'stop_lon', 'lines'])
stops['lines'] = stops['lines'].map(literal_eval)

data_new = pd.merge(data, stops, on='stop_id')
data_json = data_new.to_json(orient='records')
with open('../Data/Crowdedness/crowdedness.json', 'w') as f:
    f.write(data_json)

In [39]:
# Sample export: weekday crowdedness records for three stops only, joined to
# their station rows and written as JSON.
data = pd.read_csv('../Data/Crowdedness/trip_data_crowdedness.csv', usecols=['stop_id','day','adjusted_value','seconds'])
# `.copy()` makes the filtered rows an independent frame, so adding the `time`
# column below does not write into a slice of the original frame (which
# triggers pandas' SettingWithCopyWarning).
data = data[(data['stop_id'].isin(['125','132','116'])) & (data['day'] == 'Weekday')].copy()
data['time'] = data.apply(sec_to_time, axis=1)
# NOTE(review): this rebinds `stops` to a frame without the `parent_station` column.
stops = pd.read_csv('../Data/stops_updated.txt',usecols=['stop_id','stop_name','stop_lat','stop_lon','lines'])
stops['lines'] = stops['lines'].apply(literal_eval)
data_new = data.merge(stops, on='stop_id')
data_json = data_new.to_json(orient='records')
with open('../Data/Crowdedness/sample_crowdedness.json', 'w') as f:
    f.write(data_json)

In [26]:
# Hour of day taken from each row's `time` value. Mapping over the single column
# avoids building a full row object per record (as `apply(axis=1)` does) and
# also yields a correctly shaped, empty column when the frame has no rows.
data_new['hour'] = data_new['time'].map(lambda t: t.hour)

In [27]:
# Preview the first rows of the joined crowdedness/station table.
data_new.head()

Unnamed: 0,stop_id,day,adjusted_value,seconds,time,stop_name,stop_lat,stop_lon,lines,hour
0,103,Sunday,5.0,13200,03:40:00,238 St,40.884667,-73.90087,[1],3
1,103,Weekday,1.0,30000,08:20:00,238 St,40.884667,-73.90087,[1],8
2,103,Saturday,2.0,3000,00:50:00,238 St,40.884667,-73.90087,[1],0
3,103,Saturday,1.0,6000,01:40:00,238 St,40.884667,-73.90087,[1],1
4,103,Saturday,15.0,6600,01:50:00,238 St,40.884667,-73.90087,[1],1


In [35]:
# Total `adjusted_value` for every (stop, day, hour) combination.
group = (
    data_new
    .groupby(['stop_id', 'day', 'hour'])['adjusted_value']
    .sum()
    .reset_index()
)
# Inspect the weekday rows of stop '101'.
group.loc[(group['stop_id'] == '101') & (group['day'] == 'Weekday')]

Unnamed: 0,stop_id,day,hour,adjusted_value
14,101,Weekday,0,1.0
15,101,Weekday,1,7.0
16,101,Weekday,2,1.0
17,101,Weekday,3,2.0
18,101,Weekday,7,1.0
19,101,Weekday,9,3.0
20,101,Weekday,10,3.0
21,101,Weekday,11,1.0
22,101,Weekday,13,1.0
23,101,Weekday,15,2.0


In [5]:
# Total `adjusted_value` per stop over the whole crowdedness file (all columns are read).
crowdedness = (
    pd.read_csv('../Data/Crowdedness/trip_data_crowdedness.csv')
    .groupby('stop_id')['adjusted_value']
    .sum()
)

In [8]:
# Wrap the per-stop totals in a DataFrame so that further columns can be added to it.
crowdedness = pd.DataFrame(crowdedness)

In [18]:
# Scale every stop's total by the largest total, so the busiest stop gets 1.0.
# `Series.max()` is vectorised and ignores missing values, whereas the builtin
# `max()` iterates element by element and gives order-dependent results with NaN.
crowdedness['scaled_value'] = crowdedness['adjusted_value'] / crowdedness['adjusted_value'].max()

In [19]:
# Display the per-stop totals together with their scaled values.
crowdedness

Unnamed: 0_level_0,adjusted_value,scaled_value
stop_id,Unnamed: 1_level_1,Unnamed: 2_level_1
101,73.000000,0.000223
103,134.000000,0.000409
104,202.000000,0.000616
106,127.000000,0.000387
107,311.000000,0.000949
108,211.000000,0.000644
109,396.000000,0.001208
110,564.000000,0.001721
111,1219.000000,0.003719
112,19901.333333,0.060713


In [450]:
def num_lines(row):
    """Return the number of entries in the row's ``lines`` value."""
    return len(row['lines'])

# Same count as `num_lines` gives per row, computed directly on the `lines`
# column: this avoids building a row object per station (as `apply(axis=1)`
# does) and also produces a correctly shaped, empty column for an empty frame.
stops['num_of_lines'] = stops['lines'].map(len)

In [493]:
# Attach the line count of both the start and the end station of every transfer.
# NOTE(review): `popularity` is not created in the visible part of this notebook,
# and this cell rebinds it, so running the cell twice merges the columns again.
popularity = (
    popularity
    .merge(parent_stops[['stop_id', 'num_of_lines']],
           left_on='stop_id_start_transfer', right_on='stop_id')
    .merge(parent_stops[['stop_id', 'num_of_lines']],
           left_on='stop_id_end_transfer', right_on='stop_id')
)

In [505]:
# Popularity summed over the transfers starting at each station, plus that station's line count.
start_popularity = (
    popularity
    .groupby('stop_id_start_transfer')['popularity']
    .sum()
    .reset_index()
    .merge(parent_stops[['stop_id', 'num_of_lines']],
           left_on='stop_id_start_transfer', right_on='stop_id')
)

# The same aggregation for the transfers ending at each station.
end_popularity = (
    popularity
    .groupby('stop_id_end_transfer')['popularity']
    .sum()
    .reset_index()
    .merge(parent_stops[['stop_id', 'num_of_lines']],
           left_on='stop_id_end_transfer', right_on='stop_id')
)

# One row per station; the two `popularity` columns come out suffixed `_x` (start) and `_y` (end).
total_popularity = start_popularity.merge(end_popularity, on=['stop_id', 'num_of_lines'])

In [514]:
# Preview the combined start/end popularity table.
# NOTE(review): the recorded output already shows the `popularity` column and the
# `stop_id` index that the next cell creates, so the cells were run out of order.
total_popularity.head()

Unnamed: 0_level_0,stop_id_start_transfer,popularity_x,num_of_lines,stop_id_end_transfer,popularity_y,popularity
stop_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
112,112,0.096389,1,112,0.087409,0.183798
120,120,0.063825,3,120,0.063825,0.04255
123,123,0.025497,3,123,0.025497,0.016998
125,125,0.718922,1,125,0.776767,1.495689
127,127,2.208676,3,127,2.237051,1.481909


In [512]:
# Combined (start + end) popularity divided by the station's number of lines,
# then the table is keyed by `stop_id`.
total_popularity = (
    total_popularity
    .assign(popularity=lambda df: (df['popularity_x'] + df['popularity_y']) / df['num_of_lines'])
    .set_index('stop_id')
)

In [22]:
# Crowdedness map: stops whose parent station has a crowdedness value are drawn
# as squares whose radius grows with that station's scaled value.
map3 = folium.Map(location=[40.7528, -73.92], zoom_start=12, tiles='Stamen Terrain')

# Materialise the rows once as plain dicts. Indexing `ploting_data.loc[i][...]`
# for every single field rebuilds a row Series on each access, which is very
# slow in a row-by-row loop. Plain iteration also removes the Python-2-only `xrange`.
stop_rows = ploting_data.to_dict('records')

for stop in stop_rows:
    parent = stop['parent_station']
    # Stops whose parent station is missing from `crowdedness` get no marker.
    if parent in crowdedness.index:
        # Single label-based lookup instead of the chained `.loc[...][...]`.
        size = 20 * crowdedness.loc[parent, 'scaled_value']
        folium.RegularPolygonMarker(location=[stop['stop_lat'], stop['stop_lon']],
                                    popup=stop['stop_name'], number_of_sides=4, radius=size,
                                    color=None,
                                    fill_color=colors[stop['route_dir_id']], fill_opacity=0.8).add_to(map3)

# Join a row to the next one only when both share a route/direction id and their
# stop_sequence values are consecutive, so unrelated rows are never connected.
for stop, next_stop in zip(stop_rows, stop_rows[1:]):
    same_route = stop['route_dir_id'] == next_stop['route_dir_id']
    if same_route and stop['stop_sequence'] == next_stop['stop_sequence'] - 1:
        folium.PolyLine([(stop['stop_lat'], stop['stop_lon']),
                         (next_stop['stop_lat'], next_stop['stop_lon'])],
                        opacity=1, color=colors[stop['route_dir_id']], weight=2).add_to(map3)

map3

In [23]:
# Write the crowdedness map out as an HTML file.
map3.save('../Maps/subway_crowdedness.html')