In [1]:
import pandas as pd
from ast import literal_eval
import itertools
import time

# Finding Most Convenient Transfer Station

To find the most convenient transfer station, I look at all the possible transfer stations between two subway lines. 

For each station, I calculate:
- How many stops between the starting station and the transfer on the first line
- How many stops between the transfer and the ending station on the second line

From this I can determine the shortest trip (in terms of number of stops) and therefore the most convenient transfer station.

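For some trips, two or more transfer stations are equally convenient (they give the same total number of stops). I have added a 'value' column equal to 1/(number of equally convenient transfer stations), so that each such station receives an equal fractional weight and the weights for one trip sum to 1.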

In [55]:
# Index of the filtered trip-data file to process; the same index is reused
# for the output file name when the results are written.
j = 1

# Columns kept from the filtered trip file.
trip_columns = ['pickup_datetime','dropoff_datetime','passenger_count','trip_time_in_secs','trip_distance',
                'pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude',
                'stop_id_start','stop_name_start','stop_lat_start','stop_lon_start','lines_start',
                'stop_id_end','stop_name_end','stop_lat_end','stop_lon_end','lines_end',
                'dist_start','dist_end','num_transfers']
trip_data = pd.read_csv('../Data/Filtered/trip_data_filtered%d.csv'%j,
                        usecols=trip_columns,
                        parse_dates=['pickup_datetime','dropoff_datetime'],
                        infer_datetime_format=True)

# The line columns are stored in the CSV as Python literals; parse them back
# into Python objects (presumably lists of line names -- TODO confirm).
for lines_column in ['lines_start','lines_end']:
    trip_data[lines_column] = trip_data[lines_column].apply(literal_eval)

# Possible transfers: (from_stop_id, to_stop_id) pairs per (start_line, end_line).
transfers = pd.read_csv('../Data/transfers_updated2.txt',
                        usecols=['from_stop_id','to_stop_id','start_line','end_line'])

# Stops of each route together with their sequence number on that route.
lines_data = pd.read_csv('../Data/ordered_stops.txt',
                         usecols=['route_id','stop_id','stop_sequence'])

# Mapping from each stop to its parent station.
stops = pd.read_csv('../Data/stops_updated.txt',
                    usecols=['stop_id','parent_station'])

In [36]:
# Attach each stop's parent station to the per-route stop list.
# Start from the three columns loaded from ordered_stops.txt so the cell is
# idempotent: re-running it no longer merges 'parent_station' a second time
# (which would produce parent_station_x / parent_station_y and break the
# 'parent_station' lookups further down).
lines_data = lines_data[['route_id','stop_id','stop_sequence']].merge(stops[['stop_id','parent_station']],on='stop_id')
# Directional route id = route id + last character of the stop id
# (presumably the direction suffix of the stop -- TODO confirm).
lines_data['dir_route_id'] = lines_data['route_id']+lines_data['stop_id'].str[-1]

In [37]:
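# Debugging aid: uncomment to restrict the run to the first rows of trip_data (index labels 0 through 100).
#trip_data = trip_data.loc[:100]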

In [38]:
def find_transfer_stations(start_lines,end_lines):
    '''Find all stations where you can transfer between the start and end train lines.

    Parameters
    ----------
    start_lines : iterable
        Lines available at the starting station.
    end_lines : iterable
        Lines available at the ending station.

    Returns
    -------
    pandas.DataFrame
        One row per distinct (from_stop_id, to_stop_id) pair listed in the
        module-level ``transfers`` table for any (start_line, end_line)
        combination. Empty, with the same two columns, when nothing matches.
    '''
    # Every (start line, end line) combination the rider could use.
    line_pairs = list(itertools.product(start_lines,end_lines))
    transfer_lines = pd.DataFrame(line_pairs,columns=['start_line','end_line'])

    matching = transfers.merge(transfer_lines,on=['start_line','end_line'])

    # Several line combinations can share one station pair; keep each pair once.
    # (drop_duplicates replaces the former groupby(...).groups.keys() detour,
    # whose dict view cannot be fed to DataFrame.from_records on Python 3.)
    station_pairs = matching[['from_stop_id','to_stop_id']].drop_duplicates()
    return station_pairs.reset_index(drop=True)

In [39]:
def convenient_transfers(start_station,transfer_stations,end_station):
    '''Select the transfer option(s) with the fewest total stops.

    Uses the module-level ``lines_data`` table. A station is matched either
    by its own stop id or by its parent station id.

    Parameters
    ----------
    start_station, end_station
        Station ids of the trip's first and last station.
    transfer_stations : pandas.DataFrame
        Candidate transfers with 'from_stop_id' and 'to_stop_id' columns.

    Returns
    -------
    pandas.DataFrame
        The joined leg-1 / transfer / leg-2 rows whose 'num_of_stops'
        (= 'num_of_stops_1' + 'num_of_stops_2') equals the minimum; ties are
        all kept. Empty when no candidate has a positive stop count on both legs.
    '''
    stop_ids = lines_data['stop_id']
    parent_ids = lines_data['parent_station']
    from_ids = transfer_stations['from_stop_id']
    to_ids = transfer_stations['to_stop_id']

    # Leg 1: starting station -> transfer station on the same directional route.
    origin_rows = lines_data[(stop_ids == start_station) | (parent_ids == start_station)]
    boarding_rows = lines_data[stop_ids.isin(from_ids) | parent_ids.isin(from_ids)]
    first_leg = origin_rows.merge(boarding_rows,on='dir_route_id',
                                  suffixes=('_start','_start_transfer'))
    first_leg['num_of_stops'] = (first_leg['stop_sequence_start_transfer']
                                 - first_leg['stop_sequence_start'])
    # Keep only transfers that lie strictly further along the route.
    first_leg = first_leg[first_leg['num_of_stops'] > 0]

    # Leg 2: transfer station -> ending station on the same directional route.
    destination_rows = lines_data[(stop_ids == end_station) | (parent_ids == end_station)]
    alighting_rows = lines_data[stop_ids.isin(to_ids) | parent_ids.isin(to_ids)]
    second_leg = alighting_rows.merge(destination_rows,on='dir_route_id',
                                      suffixes=('_end_transfer','_end'))
    second_leg['num_of_stops'] = (second_leg['stop_sequence_end']
                                  - second_leg['stop_sequence_end_transfer'])
    second_leg = second_leg[second_leg['num_of_stops'] > 0]

    # Chain leg 1 -> transfer pair -> leg 2, then keep the shortest trip(s).
    with_transfer = first_leg.merge(transfer_stations,
                                    left_on='parent_station_start_transfer',
                                    right_on='from_stop_id')
    trips = with_transfer.merge(second_leg,
                                left_on='to_stop_id',
                                right_on='parent_station_end_transfer',
                                suffixes=('_1','_2'))
    trips['num_of_stops'] = trips['num_of_stops_1'] + trips['num_of_stops_2']
    fewest_stops = trips['num_of_stops'].min()
    return trips[trips['num_of_stops'] == fewest_stops]

In [47]:
def find_convenience(start_station,start_lines,end_station,end_lines):
    '''Build the table of most convenient transfer stations for one trip.

    Parameters
    ----------
    start_station, end_station
        Station ids of the trip's first and last station.
    start_lines, end_lines : iterable
        Lines available at the starting / ending station.

    Returns
    -------
    pandas.DataFrame
        One row per equally convenient transfer, with columns
        'stop_id_start', 'stop_id_end', 'stop_id_start_transfer',
        'stop_id_end_transfer', 'num_of_stops_1', 'num_of_stops_2',
        'num_of_stops' and 'value', where 'value' is 1/(number of rows).
        Empty (same columns) when no transfer qualifies.
    '''
    result_columns = ['stop_id_start','stop_id_end',
                      'stop_id_start_transfer','stop_id_end_transfer',
                      'num_of_stops_1','num_of_stops_2','num_of_stops','value']

    transfer_stations = find_transfer_stations(start_lines,end_lines)
    convenient_stations = convenient_transfers(start_station,transfer_stations,end_station)
    convenient_stations = convenient_stations[['from_stop_id','to_stop_id','num_of_stops_1','num_of_stops_2','num_of_stops']].drop_duplicates().reset_index(drop=True)

    # No rows: not actually a transfer (starting or ending station is a transfer station).
    if len(convenient_stations) == 0:
        return pd.DataFrame(columns=result_columns)

    # Build every row at once instead of appending one-row frames in an
    # xrange loop (DataFrame.append was removed in pandas 2.0 and xrange does
    # not exist on Python 3).
    result = convenient_stations.rename(columns={'from_stop_id':'stop_id_start_transfer',
                                                 'to_stop_id':'stop_id_end_transfer'})
    result['stop_id_start'] = start_station
    result['stop_id_end'] = end_station
    # Equally convenient stations share the trip's weight evenly.
    result['value'] = 1.0/float(len(result))
    return result[result_columns]

The data contains many taxi trips that share the same starting and ending stations. I extract the unique (start station, end station) pairs, determine the convenient transfer stations once per pair, and then merge the results back onto the full data set.

In [48]:
# One row per distinct (start station, end station) pair. reset_index() keeps
# the original row label of each retained trip in an 'index' column.
station_columns = ['stop_id_start','lines_start','stop_id_end','lines_end']
station_pair = ['stop_id_start','stop_id_end']
unique_trips = (trip_data[station_columns]
                .drop_duplicates(subset=station_pair)
                .reset_index())

In [49]:
# Compute the convenient transfer station(s) once per unique station pair.
# The per-pair frames are collected in a list and concatenated once, instead
# of growing `convenient` with DataFrame.append inside an xrange loop (which
# is quadratic, and relies on xrange / DataFrame.append that are gone on
# Python 3 / pandas 2.0).
convenient_columns = ['stop_id_start','stop_id_end',
                      'stop_id_start_transfer','stop_id_end_transfer',
                      'num_of_stops_1','num_of_stops_2','num_of_stops','value']
convenient_parts = []
for trip_start, trip_start_lines, trip_end, trip_end_lines in zip(unique_trips['stop_id_start'],
                                                                  unique_trips['lines_start'],
                                                                  unique_trips['stop_id_end'],
                                                                  unique_trips['lines_end']):
    part = find_convenience(trip_start,trip_start_lines,trip_end,trip_end_lines)
    # Pairs without a qualifying transfer contribute no rows.
    if len(part) != 0:
        convenient_parts.append(part)

if convenient_parts:
    convenient = pd.concat(convenient_parts,ignore_index=True)
else:
    # pd.concat raises on an empty list; fall back to an empty, correctly shaped frame.
    convenient = pd.DataFrame(columns=convenient_columns)

In [50]:
# Attach the convenient transfer(s) to every trip by station pair. The join is
# an inner join, so trips whose station pair has no row in `convenient` are
# dropped, and trips with several equally convenient transfers appear once
# per transfer.
trip_data = trip_data.merge(convenient,
                            how='inner',
                            on=['stop_id_start','stop_id_end'])

In [257]:
# Write the enriched trips next to the other outputs, using the same file index j as the input.
convenience_path = '../Data/Convenience/trip_data_convenience%d.csv'%j
trip_data.to_csv(convenience_path)