In [88]:
#Libraries

#Python Libs
import sys
import os
import glob
import traceback
from datetime import datetime
import time
from geopy import distance


#Data Analysis Libs
import pandas as pd
import numpy as np


In [89]:
#Functions
def select_input_files(enh_buste_base_path,init_date,fin_date,suffix):
        selected_files = []
        all_files = glob.glob(os.path.join(enh_buste_base_path,"*"))

        for file_ in all_files:
                try:
                        file_date = pd.to_datetime(file_.split('/')[-1],format=('%Y_%m_%d' + suffix  + '.csv'))
                        if (file_date >= init_date) and (file_date <= fin_date):
                                selected_files.append((file_,file_date))
                except:
                        continue

        return sorted(selected_files)

def get_router_id(query_date):
    INTERMEDIATE_OTP_DATE = pd.to_datetime("2017-06-30", format="%Y-%m-%d")
    router_id = ''

    if (query_date <= INTERMEDIATE_OTP_DATE):
        return 'ctba-2017-1'
    else:
        return 'ctba-2017-2'

In [90]:
def match_terminal_boardings(selected_trips,itineraries_start):
        terminal_boarding_origins = selected_trips[(selected_trips['o_busCode'].str.isdigit()) & (selected_trips['o_route'] != '021')]
        matched_terminal_boardings = terminal_boarding_origins.merge(itineraries_start,
                                                                                                                                left_on=['o_boarding_id','o_stopPointId'],
                                                                                                                                right_on=['user_trip_id','from_parent_station'], how='inner')
        num_matched_terminal_boardings = len(matched_terminal_boardings.drop_duplicates(subset=['cardNum','o_boarding_id']))
        if num_matched_terminal_boardings == 0:
            matched_perc = 0.0
        else:
            matched_perc = 100*(num_matched_terminal_boardings/float(len(terminal_boarding_origins)))
        return (matched_terminal_boardings,num_matched_terminal_boardings,matched_perc)

def match_terminal_021_boardings(selected_trips,itineraries_start):
        terminal_021_origins = selected_trips[(selected_trips['o_busCode'].str.isdigit()) & (selected_trips['o_route'] == '021')]
        matched_021_terminal_boardings = terminal_021_origins.merge(itineraries_start, left_on=['o_boarding_id','o_route','o_stopPointId'], right_on=['user_trip_id','route','from_parent_station'], how='inner')
        num_matched_021_terminal_boardings = len(matched_021_terminal_boardings.drop_duplicates(subset=['cardNum','o_boarding_id']))
        if num_matched_021_terminal_boardings == 0:
                terminal_021_match_perc = 0.0
        else:
                terminal_021_match_perc = 100*(num_matched_021_terminal_boardings/float(len(terminal_021_origins)))
        return (len(terminal_021_origins),matched_021_terminal_boardings,num_matched_021_terminal_boardings,terminal_021_match_perc)


In [94]:
def get_candidate_itineraries_penalty_score(otp_buste_itineraries_filtered):
        otp_buste_itineraries_penalty = otp_buste_itineraries_filtered \
                                                        .assign(penalty = lambda x: 2*x['start_diff'].dt.total_seconds() + x['trip_duration'].dt.total_seconds() + x['match_num_transfers']*10) \
                                                        [['cardNum','user_trip_id','itinerary_id','match_num_transfers','match_vehicle_boarding','next_origin_dist','origin_dist','start_diff','trip_duration','penalty']] \
                                                        .sort_values(['user_trip_id','penalty'], ascending=True)
        return otp_buste_itineraries_penalty


## Main

In [96]:
otp_suggestions_filepath = '/local/tarciso/data/otp_itineraries/2017_05_01_user_trips_100_otp_itineraries.csv'
user_trips_folderpath = '/local/tarciso/data/enhanced-buste/user_trips/'
bus_trips_folderpath = '/local/tarciso/data/enhanced-buste/bus_trips/'
gtfs_base_folderpath = '/local/tarciso/data/gtfs/'
output_folderpath = '/local/tarciso/data/test-odmat/'

In [97]:
file_date_str = otp_suggestions_filepath.split('/')[-1].split('_user_trips_')[0]
file_date = pd.to_datetime(file_date_str,format='%Y_%m_%d')
print "Processing File:", otp_suggestions_filepath

Processing File: /local/tarciso/data/otp_itineraries/2017_05_01_user_trips_100_otp_itineraries.csv


In [98]:
        # Extracting itinerary part name for later use
        itinerary_part_name = otp_suggestions_filepath.split('/')[-1].split('_')[5]
        # Read OTP Suggestions
        otp_suggestions_raw = pd.read_csv(otp_suggestions_filepath, parse_dates=['date','otp_start_time','otp_end_time'])

        if len(otp_suggestions_raw) == 0:
            print "Zero OTP suggestions found."
            print "Skipping next steps..."
            exit(0)


In [99]:
def prepare_otp_data(otp_data):
        #Fixing prefix
        otp_data.columns = otp_data.columns.str.replace('otp_','')
        otp_data = otp_data.add_prefix('otp_')
        
        #Fixing Timezone difference - when needed
        otp_data['otp_start_time'] = otp_data['otp_start_time'] - pd.Timedelta('10800 s')
        otp_data['otp_end_time'] = otp_data['otp_end_time'] - pd.Timedelta('10800 s')
        
        #Adjusting route format to have 3 numbers
        otp_data['otp_route'] = otp_data['otp_route'].astype(str)
        otp_data['otp_route'] = np.where(otp_data['otp_mode'] == 'BUS',
                            otp_data['otp_route'].astype(str).str.replace("\.0",'').str.zfill(3),
                            otp_data['otp_route'])

        return otp_data

In [100]:
        # Prepare OTP data for analysis
        otp_suggestions = prepare_otp_data(otp_suggestions_raw)

        # Read stops data
        stops_filepath = gtfs_base_folderpath + os.sep + get_router_id(file_date) + os.sep + 'stops.txt'
        stops_df = pd.read_csv(stops_filepath)

        # Adding Parent Stop data to OTP Suggestions
        stops_parent_stations = stops_df[['stop_id','parent_station']]
        otp_suggestions = otp_suggestions.merge(stops_parent_stations.add_prefix('from_'),
                                                left_on='otp_from_stop_id',
                                                right_on='from_stop_id',
                                                how='left') \
                                        .merge(stops_parent_stations.add_prefix('to_'),
                                                left_on='otp_to_stop_id',
                                                right_on='to_stop_id',
                                                how='left') \
                                        .drop(['from_stop_id','to_stop_id'], axis=1) \
                                        .rename(index=str, columns={'from_parent_station':'otp_from_parent_station',
                                                                    'to_parent_station':'otp_to_parent_station'})
        
        otp_suggestions_bus_legs = otp_suggestions[otp_suggestions['otp_mode'] == 'BUS']
        otp_suggestions_walk_legs = otp_suggestions[otp_suggestions['otp_mode'] == 'WALK']


In [101]:
otp_suggestions.head()

Unnamed: 0,otp_date,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_start_time,otp_end_time,otp_mode,otp_route,otp_from_stop_id,otp_to_stop_id,otp_duration_mins,otp_from_parent_station,otp_to_parent_station
0,2017-05-01,28150.0,1,1,2017-05-01 13:38:33,2017-05-01 13:45:00,BUS,827.0,33788.0,30993.0,6.45,,14508.0
1,2017-05-01,28150.0,1,2,2017-05-01 13:45:00,2017-05-01 13:45:52,WALK,,,,0.866667,,
2,2017-05-01,28150.0,1,3,2017-05-01 13:51:00,2017-05-01 14:01:59,BUS,303.0,26195.0,25753.0,10.983333,14508.0,41756.0
3,2017-05-01,28150.0,1,4,2017-05-01 14:02:00,2017-05-01 14:09:32,WALK,,,,7.533333,,
4,2017-05-01,28150.0,2,1,2017-05-01 13:42:58,2017-05-01 13:57:00,BUS,822.0,33788.0,30994.0,14.033333,,14508.0


## Match Scheduled Itineraries to Observed Bus Trips

### Read and Prepare Bus Trips data

In [102]:
        # Find OTP Suggested Itineraries in BUSTE Data
        bus_trips_filepath = bus_trips_folderpath + os.sep + file_date_str + '_bus_trips.csv'
        bus_trips = pd.read_csv(bus_trips_filepath, dtype={'route': object},parse_dates=['gps_datetime']) \
                                        .sort_values(['route','busCode','tripNum','gps_datetime']) \
                                        .assign(route = lambda x: x['route'].astype(str).str.replace("\.0",'').str.zfill(3))


In [103]:
bus_trips.head()

Unnamed: 0,route,busCode,shapeId,tripNum,stopPointId,gps_datetime,distanceTraveledShape,stop_lat,stop_lon,parent_station
29435,10,BB303,1708.0,1.0,33157.0,2017-05-01 06:28:07,537.974,-25.410517,-49.276479,
29436,10,BB303,1708.0,1.0,33159.0,2017-05-01 06:30:38,1141.061,-25.411726,-49.270902,
29437,10,BB303,1708.0,1.0,33158.0,2017-05-01 06:31:40,1624.751,-25.415285,-49.270134,
29438,10,BB303,1708.0,1.0,30150.0,2017-05-01 06:32:26,1972.077,-25.416733,-49.267863,
29439,10,BB303,1708.0,1.0,28637.0,2017-05-01 06:33:11,2378.349,-25.414184,-49.265917,


In [104]:
bus_trips.dtypes

route                            object
busCode                          object
shapeId                         float64
tripNum                         float64
stopPointId                     float64
gps_datetime             datetime64[ns]
distanceTraveledShape           float64
stop_lat                        float64
stop_lon                        float64
parent_station                  float64
dtype: object

In [105]:
otp_suggestions_bus_legs.dtypes

otp_date                   datetime64[ns]
otp_user_trip_id                  float64
otp_itinerary_id                    int64
otp_leg_id                          int64
otp_start_time             datetime64[ns]
otp_end_time               datetime64[ns]
otp_mode                           object
otp_route                          object
otp_from_stop_id                  float64
otp_to_stop_id                    float64
otp_duration_mins                 float64
otp_from_parent_station           float64
otp_to_parent_station             float64
dtype: object

In [106]:
scheduled_itin_observed_od = otp_suggestions_bus_legs.merge(bus_trips.add_prefix('bt_from_'),
                                left_on=['otp_route','otp_from_stop_id'],
                                right_on=['bt_from_route','bt_from_stopPointId'],
                                how='inner') \
                                .assign(sched_obs_start_timediff = 
                                        lambda x: np.absolute(x['bt_from_gps_datetime'] - x['otp_start_time']))
scheduled_itin_observed_od = scheduled_itin_observed_od[scheduled_itin_observed_od['sched_obs_start_timediff'] <= pd.Timedelta(minutes=60)]

In [107]:
scheduled_itin_observed_od = scheduled_itin_observed_od.merge(bus_trips.add_prefix('bt_to_'),
                                left_on=['otp_route','bt_from_busCode','bt_from_tripNum','otp_to_stop_id'],
                                right_on=['bt_to_route','bt_to_busCode','bt_to_tripNum','bt_to_stopPointId'],
                                how='inner') \
                                .assign(sched_obs_end_timediff = 
                                        lambda x: np.absolute(x['bt_to_gps_datetime'] - x['otp_end_time'])) \
                                .sort_values(['otp_user_trip_id','otp_itinerary_id','otp_leg_id','sched_obs_start_timediff','sched_obs_end_timediff'])
scheduled_itin_observed_od = scheduled_itin_observed_od[scheduled_itin_observed_od['sched_obs_end_timediff'] <= pd.Timedelta(minutes=60)]

In [108]:
scheduled_itin_observed_od_full = pd.concat([scheduled_itin_observed_od,otp_suggestions_walk_legs])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [109]:
#bus_trips[(bus_trips['route'] == '827') & (bus_trips['stopPointId'] == 33788)].sort_values(['gps_datetime'])

In [110]:
scheduled_itin_observed_od_full.filter(['otp_user_trip_id','otp_itinerary_id','otp_leg_id','otp_route','bt_from_busCode','bt_from_tripNum','bt_from_stopPointId','otp_start_time','bt_from_gps_datetime','sched_obs_start_timediff','bt_to_stopPointId','otp_end_time','bt_to_gps_datetime','sched_obs_end_timediff']) \
                            .sort_values(['otp_user_trip_id','otp_itinerary_id','otp_leg_id'])

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_route,bt_from_busCode,bt_from_tripNum,bt_from_stopPointId,otp_start_time,bt_from_gps_datetime,sched_obs_start_timediff,bt_to_stopPointId,otp_end_time,bt_to_gps_datetime,sched_obs_end_timediff
3,28150.0,1,1,827,BC929,12.0,33788.0,2017-05-01 13:38:33,2017-05-01 13:35:18,00:03:15,30993.0,2017-05-01 13:45:00,2017-05-01 13:44:14,00:00:46
4,28150.0,1,1,827,BC929,12.0,33788.0,2017-05-01 13:38:33,2017-05-01 13:35:18,00:03:15,30993.0,2017-05-01 13:45:00,2017-05-01 13:44:14,00:00:46
5,28150.0,1,1,827,BC929,12.0,33788.0,2017-05-01 13:38:33,2017-05-01 13:35:18,00:03:15,30993.0,2017-05-01 13:45:00,2017-05-01 13:44:14,00:00:46
6,28150.0,1,1,827,BC929,12.0,33788.0,2017-05-01 13:38:33,2017-05-01 13:35:18,00:03:15,30993.0,2017-05-01 13:45:00,2017-05-01 13:44:14,00:00:46
15,28150.0,1,1,827,BC929,13.0,33788.0,2017-05-01 13:38:33,2017-05-01 14:22:34,00:44:01,30993.0,2017-05-01 13:45:00,2017-05-01 14:30:58,00:45:58
0,28150.0,1,1,827,BC929,11.0,33788.0,2017-05-01 13:38:33,2017-05-01 12:47:49,00:50:44,30993.0,2017-05-01 13:45:00,2017-05-01 12:53:04,00:51:56
1,28150.0,1,2,,,,,2017-05-01 13:45:00,NaT,NaT,,2017-05-01 13:45:52,NaT,NaT
110,28150.0,1,3,303,DE722,4.0,26195.0,2017-05-01 13:51:00,2017-05-01 14:02:48,00:11:48,25753.0,2017-05-01 14:01:59,2017-05-01 14:12:10,00:10:11
51,28150.0,1,3,303,DE710,4.0,26195.0,2017-05-01 13:51:00,2017-05-01 13:27:30,00:23:30,25753.0,2017-05-01 14:01:59,2017-05-01 13:35:59,00:26:00
67,28150.0,1,3,303,DE713,4.0,26195.0,2017-05-01 13:51:00,2017-05-01 13:16:34,00:34:26,25753.0,2017-05-01 14:01:59,2017-05-01 13:24:23,00:37:36


In [111]:
#Run choose_legs on scheduled_itin_observed_od

In [112]:
scheduled_itin_observed_od_earliest = scheduled_itin_observed_od_full.sort_values(['otp_user_trip_id','otp_itinerary_id','otp_leg_id','sched_obs_start_timediff','sched_obs_end_timediff']) \
                        .groupby(['otp_user_trip_id','otp_itinerary_id','otp_leg_id']) \
                        .first() \
                        .filter(['otp_route','otp_mode','stopPointId','otp_start_time','bt_from_gps_datetime','sched_obs_start_timediff','sched_obs_end_timediff']) \
                        .reset_index()

In [113]:
scheduled_itin_observed_od_earliest

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_route,otp_mode,otp_start_time,bt_from_gps_datetime,sched_obs_start_timediff,sched_obs_end_timediff
0,28150.0,1,1,827,BUS,2017-05-01 13:38:33,2017-05-01 13:35:18,00:03:15,00:00:46
1,28150.0,1,2,,WALK,2017-05-01 13:45:00,NaT,NaT,NaT
2,28150.0,1,3,303,BUS,2017-05-01 13:51:00,2017-05-01 14:02:48,00:11:48,00:10:11
3,28150.0,1,4,,WALK,2017-05-01 14:02:00,NaT,NaT,NaT
4,28150.0,2,2,,WALK,2017-05-01 13:57:00,NaT,NaT,NaT
5,28150.0,2,3,303,BUS,2017-05-01 14:03:00,2017-05-01 14:02:48,00:00:12,00:03:09
6,28150.0,2,4,,WALK,2017-05-01 14:15:20,NaT,NaT,NaT
7,28150.0,3,1,,WALK,2017-05-01 13:41:55,NaT,NaT,NaT
8,28150.0,3,2,870,BUS,2017-05-01 14:03:39,2017-05-01 13:57:41,00:05:58,00:12:32
9,28150.0,3,3,,WALK,2017-05-01 14:20:06,NaT,NaT,NaT


In [114]:
scheduled_itin_observed_od_earliest[scheduled_itin_observed_od_earliest['otp_mode'] == 'BUS'].sched_obs_start_timediff.quantile(.95)

Timedelta('0 days 00:28:10.500000')

### Read and Prepare Origin/Next-Origin Pairs data

In [115]:
def compatible_dates(otp_data,ticketing_data):
        otp_date = otp_data['otp_date'].iloc[0]
        ticketing_date = pd.to_datetime(ticketing_data['o_boarding_datetime'].dt.strftime('%Y-%m-%d')[0])

        return (otp_date == ticketing_date,otp_date,ticketing_date)


In [116]:
        # Read Origin/Next-Origin Pairs for the same date
        trips_origins_filepath = user_trips_folderpath + os.sep + file_date_str + '_user_trips.csv'
        trips_on_pairs_full = pd.read_csv(trips_origins_filepath,
                                                parse_dates=['o_boarding_datetime','o_gps_datetime','next_o_boarding_datetime','next_o_gps_datetime'])
        # Checking whether OTP and ticketing dates match
        dates_compatibility, otp_date, ticketing_date = compatible_dates(otp_suggestions,trips_on_pairs_full)
        if not dates_compatibility:
                print "ERROR: OTP date", otp_date, "does not match Ticketing data", ticketing_date
                print "Skipping current day"
                exit(1)
        
        trips_on_pairs = trips_on_pairs_full.filter(['o_boarding_id','next_o_boarding_id'])
        trips_origins = trips_on_pairs_full.filter([col for col in trips_on_pairs_full.columns if col.startswith('o_')])
        trips_next_origins = trips_on_pairs_full.filter([col for col in trips_on_pairs_full.columns if col.startswith('next_o_')])


In [117]:
trips_origins.head()

Unnamed: 0,o_boarding_id,o_boarding_datetime,o_gps_datetime,o_route,o_busCode,o_tripNum,o_stopPointId,o_stop_lat,o_stop_lon
0,180.0,2017-05-01 04:48:42,2017-05-01 04:48:17,650,HA030,1.0,36293.0,-25.568989,-49.332253
1,40392.0,2017-05-01 15:52:39,NaT,21,08044,,41791.0,-25.431951,-49.296491
2,181.0,2017-05-01 04:49:13,2017-05-01 04:49:08,650,HA030,1.0,36294.0,-25.56686,-49.3329
3,31560.0,2017-05-01 14:16:55,NaT,0,00038,,26051.0,-25.447479,-49.263816
4,182.0,2017-05-01 04:49:19,2017-05-01 04:49:08,650,HA030,1.0,36294.0,-25.56686,-49.3329


In [118]:
trips_next_origins.head()

Unnamed: 0,next_o_boarding_id,next_o_boarding_datetime,next_o_gps_datetime,next_o_route,next_o_busCode,next_o_tripNum,next_o_stopPointId,next_o_stop_lat,next_o_stop_lon
0,40392.0,2017-05-01 15:52:39,NaT,21,08044,,41791.0,-25.431951,-49.296491
1,180.0,2017-05-01 04:48:42,2017-05-01 04:48:17,650,HA030,1.0,36293.0,-25.568989,-49.332253
2,31560.0,2017-05-01 14:16:55,NaT,0,00038,,26051.0,-25.447479,-49.263816
3,181.0,2017-05-01 04:49:13,2017-05-01 04:49:08,650,HA030,1.0,36294.0,-25.56686,-49.3329
4,61724.0,2017-05-01 19:27:16,2017-05-01 19:26:26,814,LA851,21.0,30284.0,-25.432534,-49.338889


In [119]:
trips_on_pairs.head()

Unnamed: 0,o_boarding_id,next_o_boarding_id
0,180.0,40392.0
1,40392.0,180.0
2,181.0,31560.0
3,31560.0,181.0
4,182.0,61724.0


In [120]:
        # Selecting trips for whom OTP suggestions were found
        selected_trips = trips_origins[trips_origins['o_boarding_id'].isin(otp_suggestions['otp_user_trip_id'])]
        num_selected_trips = len(selected_trips)


In [121]:
        # Matching all kinds of boarding events to valid OTP suggestions
        itins_first_bus_legs = otp_suggestions.query('otp_mode == \'BUS\'') \
                                .groupby(['otp_user_trip_id','otp_itinerary_id']) \
                                .first() \
                                .reset_index()

In [122]:
itins_first_bus_legs.head()

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_date,otp_leg_id,otp_start_time,otp_end_time,otp_mode,otp_route,otp_from_stop_id,otp_to_stop_id,otp_duration_mins,otp_from_parent_station,otp_to_parent_station
0,28150.0,1,2017-05-01,1,2017-05-01 13:38:33,2017-05-01 13:45:00,BUS,827,33788.0,30993.0,6.45,14508.0,14508.0
1,28150.0,2,2017-05-01,1,2017-05-01 13:42:58,2017-05-01 13:57:00,BUS,822,33788.0,30994.0,14.033333,14508.0,14508.0
2,28150.0,3,2017-05-01,2,2017-05-01 14:03:39,2017-05-01 14:20:05,BUS,870,33980.0,35079.0,16.433333,,
3,28150.0,4,2017-05-01,1,2017-05-01 13:57:58,2017-05-01 14:12:00,BUS,822,33788.0,30994.0,14.033333,14508.0,14508.0
4,28150.0,5,2017-05-01,2,2017-05-01 14:10:00,2017-05-01 14:14:00,BUS,821,33973.0,30995.0,4.0,14508.0,14508.0


In [123]:
len(itins_first_bus_legs)

2000

In [124]:
len(pd.concat([itins_first_bus_legs,otp_suggestions]).drop_duplicates(keep=False))

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


7092

In [125]:
itins_rest_bus_legs = otp_suggestions.query('otp_mode == \'BUS\'') \
                                .groupby(['otp_user_trip_id','otp_itinerary_id']) \
                                .apply(lambda group: group.iloc[1:]) #\
                                #.reset_index(level=0)

In [126]:
itins_rest_bus_legs

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,otp_date,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_start_time,otp_end_time,otp_mode,otp_route,otp_from_stop_id,otp_to_stop_id,otp_duration_mins,otp_from_parent_station,otp_to_parent_station
otp_user_trip_id,otp_itinerary_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
28150.0,1,2,2017-05-01,28150.0,1,3,2017-05-01 13:51:00,2017-05-01 14:01:59,BUS,303,26195.0,25753.0,10.983333,14508.0,41756.0
28150.0,2,6,2017-05-01,28150.0,2,3,2017-05-01 14:03:00,2017-05-01 14:15:19,BUS,303,26195.0,25753.0,12.316667,14508.0,41756.0
28150.0,4,13,2017-05-01,28150.0,4,3,2017-05-01 14:15:00,2017-05-01 14:25:59,BUS,303,26195.0,25753.0,10.983333,14508.0,41756.0
28150.0,5,18,2017-05-01,28150.0,5,4,2017-05-01 14:21:00,2017-05-01 14:33:19,BUS,303,26195.0,25753.0,12.316667,14508.0,41756.0
28150.0,7,25,2017-05-01,28150.0,7,3,2017-05-01 14:33:00,2017-05-01 14:43:59,BUS,303,26195.0,25753.0,10.983333,14508.0,41756.0
28150.0,8,29,2017-05-01,28150.0,8,3,2017-05-01 14:39:00,2017-05-01 14:51:19,BUS,303,26195.0,25753.0,12.316667,14508.0,41756.0
28150.0,9,33,2017-05-01,28150.0,9,3,2017-05-01 14:45:00,2017-05-01 14:57:19,BUS,303,26195.0,25753.0,12.316667,14508.0,41756.0
28150.0,10,37,2017-05-01,28150.0,10,3,2017-05-01 15:02:00,2017-05-01 15:14:19,BUS,303,26195.0,25753.0,12.316667,14508.0,41756.0
28154.0,1,71,2017-05-01,28154.0,1,3,2017-05-01 14:22:00,2017-05-01 14:53:48,BUS,603,26296.0,25521.0,31.800000,14494.0,26096.0
28154.0,2,75,2017-05-01,28154.0,2,3,2017-05-01 14:39:00,2017-05-01 15:10:48,BUS,603,26296.0,25521.0,31.800000,14494.0,26096.0


In [127]:
itins_first_bus_legs.otp_leg_id.describe()

count    2000.000000
mean        1.726000
std         0.446121
min         1.000000
25%         1.000000
50%         2.000000
75%         2.000000
max         2.000000
Name: otp_leg_id, dtype: float64

In [128]:
itins_first_bus_legs.dtypes

otp_user_trip_id                  float64
otp_itinerary_id                    int64
otp_date                   datetime64[ns]
otp_leg_id                          int64
otp_start_time             datetime64[ns]
otp_end_time               datetime64[ns]
otp_mode                           object
otp_route                          object
otp_from_stop_id                  float64
otp_to_stop_id                    float64
otp_duration_mins                 float64
otp_from_parent_station           float64
otp_to_parent_station             float64
dtype: object

## Working with Vehicle Boardings

In [129]:
selected_trips.head()

Unnamed: 0,o_boarding_id,o_boarding_datetime,o_gps_datetime,o_route,o_busCode,o_tripNum,o_stopPointId,o_stop_lat,o_stop_lon
19800,28150.0,2017-05-01 13:36:25,2017-05-01 13:35:18,827,BC929,12.0,33788.0,-25.436303,-49.362221
19801,57666.0,2017-05-01 18:46:58,NaT,021,08065,,41778.0,-25.436627,-49.317949
19802,28153.0,2017-05-01 13:36:28,NaT,TCJ,06003,,14478.0,-25.43998,-49.221858
19803,64706.0,2017-05-01 20:22:36,2017-05-01 19:59:34,020,BB607,7.0,28132.0,-25.435878,-49.306888
19804,28156.0,2017-05-01 13:36:28,2017-05-01 13:36:21,650,HR410,5.0,36299.0,-25.565199,-49.333825


In [130]:
itins_first_bus_legs.head()

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_date,otp_leg_id,otp_start_time,otp_end_time,otp_mode,otp_route,otp_from_stop_id,otp_to_stop_id,otp_duration_mins,otp_from_parent_station,otp_to_parent_station
0,28150.0,1,2017-05-01,1,2017-05-01 13:38:33,2017-05-01 13:45:00,BUS,827,33788.0,30993.0,6.45,14508.0,14508.0
1,28150.0,2,2017-05-01,1,2017-05-01 13:42:58,2017-05-01 13:57:00,BUS,822,33788.0,30994.0,14.033333,14508.0,14508.0
2,28150.0,3,2017-05-01,2,2017-05-01 14:03:39,2017-05-01 14:20:05,BUS,870,33980.0,35079.0,16.433333,,
3,28150.0,4,2017-05-01,1,2017-05-01 13:57:58,2017-05-01 14:12:00,BUS,822,33788.0,30994.0,14.033333,14508.0,14508.0
4,28150.0,5,2017-05-01,2,2017-05-01 14:10:00,2017-05-01 14:14:00,BUS,821,33973.0,30995.0,4.0,14508.0,14508.0


In [131]:
#itins_first_bus_legs.groupby(['otp_user_trip_id','otp_route','otp_from_stop_id']) \
#                .drop()

In [132]:
def match_vehicle_boardings(selected_trips,itineraries_st):
        vehicle_boarding_origins = selected_trips[np.logical_not(selected_trips['o_busCode'].str.isdigit())]
        matched_vehicle_boardings = vehicle_boarding_origins.merge(itineraries_st, left_on=['o_boarding_id','o_route','o_stopPointId'], 
                                                                   right_on=['otp_user_trip_id','otp_route','otp_from_stop_id'], how='inner')
        num_matched_vehicle_boardings = len(matched_vehicle_boardings.drop_duplicates(subset=['o_boarding_id']))
        if num_matched_vehicle_boardings == 0:
            match_perc = 0.0
        else:
            match_perc = 100*(num_matched_vehicle_boardings/float(len(vehicle_boarding_origins)))
        return (matched_vehicle_boardings,num_matched_vehicle_boardings,match_perc)

In [133]:
        # Matching vehicle boarding origins
        matched_vehicle_boardings,num_matched_vehicle_boardings,vehicle_match_perc = match_vehicle_boardings(selected_trips,itins_first_bus_legs)
        print "Vehicle boardings with matching OTP suggestions: ", num_matched_vehicle_boardings, "(",vehicle_match_perc, "%)"


        # Matching terminal boarding origins
        #matched_terminal_boardings,num_matched_terminal_boardings,terminal_matched_perc = match_terminal_boardings(selected_trips,itineraries_start)
        #print "Terminal boardings with matching OTP suggestions: ", num_matched_terminal_boardings, "(", terminal_matched_perc, "%)"

        # Matching special case route 021 terminal boarding origins 
        #num_terminal_021_boardings,matched_021_terminal_boardings,num_matched_021_terminal_boardings,terminal_021_match_perc = match_terminal_021_boardings(selected_trips,itineraries_start)
        #if (num_terminal_021_boardings > 0):
        #        print "Line 021 Terminal boardings with matching OTP suggestions: ", num_matched_021_terminal_boardings, "(",terminal_021_match_perc, "%)"
        #else:
        #        print "No Line 021 Terminal boardings found. Skipping matching."

        #boarding_suggestions_matches = pd.concat([matched_vehicle_boardings,matched_021_terminal_boardings,matched_terminal_boardings])

        #total_num_matches = num_matched_vehicle_boardings + num_matched_021_terminal_boardings + num_matched_terminal_boardings
        #print "Total number of matches: ", total_num_matches, "(", 100*(total_num_matches/float(num_selected_trips)), "%)"

        #if total_num_matches == 0:
        #    print "No match was found. Skipping next steps..."
        #    exit(0)


Vehicle boardings with matching OTP suggestions:  92 ( 80.701754386 %)


In [134]:
matched_vehicle_boardings.head()

Unnamed: 0,o_boarding_id,o_boarding_datetime,o_gps_datetime,o_route,o_busCode,o_tripNum,o_stopPointId,o_stop_lat,o_stop_lon,otp_user_trip_id,...,otp_leg_id,otp_start_time,otp_end_time,otp_mode,otp_route,otp_from_stop_id,otp_to_stop_id,otp_duration_mins,otp_from_parent_station,otp_to_parent_station
0,28150.0,2017-05-01 13:36:25,2017-05-01 13:35:18,827,BC929,12.0,33788.0,-25.436303,-49.362221,28150.0,...,1,2017-05-01 13:38:33,2017-05-01 13:45:00,BUS,827,33788.0,30993.0,6.45,14508.0,14508.0
1,28150.0,2017-05-01 13:36:25,2017-05-01 13:35:18,827,BC929,12.0,33788.0,-25.436303,-49.362221,28150.0,...,1,2017-05-01 14:28:17,2017-05-01 14:35:00,BUS,827,33788.0,30993.0,6.716667,14508.0,14508.0
2,28154.0,2017-05-01 13:36:28,2017-05-01 13:35:38,684,HA298,11.0,39378.0,-25.58523,-49.33721,28154.0,...,1,2017-05-01 13:42:13,2017-05-01 14:18:00,BUS,684,39378.0,31053.0,35.783333,14494.0,14494.0
3,28154.0,2017-05-01 13:36:28,2017-05-01 13:35:38,684,HA298,11.0,39378.0,-25.58523,-49.33721,28154.0,...,1,2017-05-01 13:57:13,2017-05-01 14:33:00,BUS,684,39378.0,31053.0,35.783333,14494.0,14494.0
4,28154.0,2017-05-01 13:36:28,2017-05-01 13:35:38,684,HA298,11.0,39378.0,-25.58523,-49.33721,28154.0,...,1,2017-05-01 14:12:13,2017-05-01 14:48:00,BUS,684,39378.0,31053.0,35.783333,14494.0,14494.0


In [147]:
vehic_first_boardings_options = matched_vehicle_boardings.merge(bus_trips, 
                                left_on=['o_route','o_busCode','o_tripNum','otp_to_stop_id'],
                               right_on=['route','busCode','tripNum','stopPointId'],
                               how='inner') \
            #.filter(['otp_user_trip_id','otp_itinerary_id','route','busCode','tripNum','stopPointId','otp_start_time','o_gps_datetime','otp_end_time','gps_datetime'])
            #.filter(selected_trips.columns)
            

In [148]:
vehic_first_boardings_options

Unnamed: 0,o_boarding_id,o_boarding_datetime,o_gps_datetime,o_route,o_busCode,o_tripNum,o_stopPointId,o_stop_lat,o_stop_lon,otp_user_trip_id,...,route,busCode,shapeId,tripNum,stopPointId,gps_datetime,distanceTraveledShape,stop_lat,stop_lon,parent_station
0,28150.0,2017-05-01 13:36:25,2017-05-01 13:35:18,827,BC929,12.0,33788.0,-25.436303,-49.362221,28150.0,...,827,BC929,2895.0,12.0,30993.0,2017-05-01 13:44:14,8568.987,-25.441705,-49.346328,14508.0
1,28150.0,2017-05-01 13:36:25,2017-05-01 13:35:18,827,BC929,12.0,33788.0,-25.436303,-49.362221,28150.0,...,827,BC929,2895.0,12.0,30993.0,2017-05-01 13:44:14,8568.987,-25.441705,-49.346328,14508.0
2,28262.0,2017-05-01 13:37:59,2017-05-01 13:37:57,827,BC929,12.0,33775.0,-25.438158,-49.358207,28262.0,...,827,BC929,2895.0,12.0,30993.0,2017-05-01 13:44:14,8568.987,-25.441705,-49.346328,14508.0
3,28262.0,2017-05-01 13:37:59,2017-05-01 13:37:57,827,BC929,12.0,33775.0,-25.438158,-49.358207,28262.0,...,827,BC929,2895.0,12.0,30993.0,2017-05-01 13:44:14,8568.987,-25.441705,-49.346328,14508.0
4,28424.0,2017-05-01 13:39:51,2017-05-01 13:39:12,827,BC929,12.0,33535.0,-25.440179,-49.355608,28424.0,...,827,BC929,2895.0,12.0,30993.0,2017-05-01 13:44:14,8568.987,-25.441705,-49.346328,14508.0
5,28424.0,2017-05-01 13:39:51,2017-05-01 13:39:12,827,BC929,12.0,33535.0,-25.440179,-49.355608,28424.0,...,827,BC929,2895.0,12.0,30993.0,2017-05-01 13:44:14,8568.987,-25.441705,-49.346328,14508.0
6,28154.0,2017-05-01 13:36:28,2017-05-01 13:35:38,684,HA298,11.0,39378.0,-25.585230,-49.337210,28154.0,...,684,HA298,2148.0,11.0,31053.0,2017-05-01 14:01:41,15448.084,-25.513098,-49.295303,14494.0
7,28154.0,2017-05-01 13:36:28,2017-05-01 13:35:38,684,HA298,11.0,39378.0,-25.585230,-49.337210,28154.0,...,684,HA298,2148.0,11.0,31053.0,2017-05-01 14:01:41,15448.084,-25.513098,-49.295303,14494.0
8,28154.0,2017-05-01 13:36:28,2017-05-01 13:35:38,684,HA298,11.0,39378.0,-25.585230,-49.337210,28154.0,...,684,HA298,2148.0,11.0,31053.0,2017-05-01 14:01:41,15448.084,-25.513098,-49.295303,14494.0
9,28154.0,2017-05-01 13:36:28,2017-05-01 13:35:38,684,HA298,11.0,39378.0,-25.585230,-49.337210,28154.0,...,684,HA298,2148.0,11.0,31053.0,2017-05-01 14:01:41,15448.084,-25.513098,-49.295303,14494.0


### Removing Scheduled trips whose scheduled start time is more than 30 minutes away from the actual boarding time

In [149]:
vehic_first_boardings_options['sched_actual_start_timediff'] = np.abs(vehic_first_boardings_options['otp_start_time']-vehic_first_boardings_options['o_gps_datetime'])

In [150]:
vehic_first_boardings_options \
    .sort_values(['otp_user_trip_id','otp_itinerary_id','otp_leg_id','otp_start_time']) \
    .filter(['otp_user_trip_id','otp_itinerary_id','otp_leg_id','route','busCode','tripNum','stopPointId','otp_start_time','o_gps_datetime','otp_end_time','gps_datetime','sched_actual_start_timediff'])

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,route,busCode,tripNum,stopPointId,otp_start_time,o_gps_datetime,otp_end_time,gps_datetime,sched_actual_start_timediff
0,28150.0,1,1,827,BC929,12.0,30993.0,2017-05-01 13:38:33,2017-05-01 13:35:18,2017-05-01 13:45:00,2017-05-01 13:44:14,00:03:15
1,28150.0,8,1,827,BC929,12.0,30993.0,2017-05-01 14:28:17,2017-05-01 13:35:18,2017-05-01 14:35:00,2017-05-01 13:44:14,00:52:59
6,28154.0,1,1,684,HA298,11.0,31053.0,2017-05-01 13:42:13,2017-05-01 13:35:38,2017-05-01 14:18:00,2017-05-01 14:01:41,00:06:35
7,28154.0,2,1,684,HA298,11.0,31053.0,2017-05-01 13:57:13,2017-05-01 13:35:38,2017-05-01 14:33:00,2017-05-01 14:01:41,00:21:35
8,28154.0,6,1,684,HA298,11.0,31053.0,2017-05-01 14:12:13,2017-05-01 13:35:38,2017-05-01 14:48:00,2017-05-01 14:01:41,00:36:35
9,28154.0,7,1,684,HA298,11.0,31053.0,2017-05-01 14:27:13,2017-05-01 13:35:38,2017-05-01 15:03:00,2017-05-01 14:01:41,00:51:35
10,28154.0,8,1,684,HA298,11.0,31053.0,2017-05-01 14:42:13,2017-05-01 13:35:38,2017-05-01 15:18:00,2017-05-01 14:01:41,01:06:35
17,28154.0,9,1,684,HA298,11.0,34160.0,2017-05-01 14:57:13,2017-05-01 13:35:38,2017-05-01 15:30:02,2017-05-01 13:57:14,01:21:35
18,28154.0,10,1,684,HA298,11.0,34160.0,2017-05-01 15:12:13,2017-05-01 13:35:38,2017-05-01 15:45:02,2017-05-01 13:57:14,01:36:35
26,28161.0,4,1,050,JB301,5.0,32454.0,2017-05-01 13:44:57,2017-05-01 13:35:59,2017-05-01 13:51:00,2017-05-01 13:39:53,00:08:58


In [151]:
vehic_first_boardings_options = vehic_first_boardings_options[vehic_first_boardings_options['sched_actual_start_timediff'] < pd.Timedelta(minutes=30)]

In [153]:
vehic_first_boardings_options  \
    .sort_values(['otp_user_trip_id','otp_itinerary_id','otp_leg_id','otp_start_time']) \
    .filter(['otp_user_trip_id','otp_itinerary_id','otp_leg_id','route','busCode','tripNum','stopPointId','otp_start_time','o_gps_datetime','otp_end_time','gps_datetime','sched_actual_start_timediff'])

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,route,busCode,tripNum,stopPointId,otp_start_time,o_gps_datetime,otp_end_time,gps_datetime,sched_actual_start_timediff
0,28150.0,1,1,827,BC929,12.0,30993.0,2017-05-01 13:38:33,2017-05-01 13:35:18,2017-05-01 13:45:00,2017-05-01 13:44:14,00:03:15
6,28154.0,1,1,684,HA298,11.0,31053.0,2017-05-01 13:42:13,2017-05-01 13:35:38,2017-05-01 14:18:00,2017-05-01 14:01:41,00:06:35
7,28154.0,2,1,684,HA298,11.0,31053.0,2017-05-01 13:57:13,2017-05-01 13:35:38,2017-05-01 14:33:00,2017-05-01 14:01:41,00:21:35
26,28161.0,4,1,050,JB301,5.0,32454.0,2017-05-01 13:44:57,2017-05-01 13:35:59,2017-05-01 13:51:00,2017-05-01 13:39:53,00:08:58
27,28161.0,10,1,050,JB301,5.0,32454.0,2017-05-01 14:03:36,2017-05-01 13:35:59,2017-05-01 14:09:00,2017-05-01 13:39:53,00:27:37
28,28164.0,1,1,812,BA020,9.0,28138.0,2017-05-01 13:46:46,2017-05-01 13:36:32,2017-05-01 14:03:00,2017-05-01 13:47:34,00:10:14
43,28165.0,1,1,462,DC093,7.0,26157.0,2017-05-01 13:38:46,2017-05-01 13:35:42,2017-05-01 14:12:00,2017-05-01 14:00:22,00:03:04
44,28165.0,2,1,462,DC093,7.0,26157.0,2017-05-01 13:52:02,2017-05-01 13:35:42,2017-05-01 14:28:00,2017-05-01 14:00:22,00:16:20
50,28169.0,4,1,545,JA300,6.0,29967.0,2017-05-01 13:50:13,2017-05-01 13:36:30,2017-05-01 14:35:04,2017-05-01 14:14:48,00:13:43
58,28172.0,6,1,334,DA023,20.0,27812.0,2017-05-01 13:55:39,2017-05-01 13:36:33,2017-05-01 14:02:00,2017-05-01 13:20:33,00:19:06


In [154]:
otp_suggestions.head()

Unnamed: 0,otp_date,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_start_time,otp_end_time,otp_mode,otp_route,otp_from_stop_id,otp_to_stop_id,otp_duration_mins,otp_from_parent_station,otp_to_parent_station
0,2017-05-01,28150.0,1,1,2017-05-01 13:38:33,2017-05-01 13:45:00,BUS,827.0,33788.0,30993.0,6.45,,14508.0
1,2017-05-01,28150.0,1,2,2017-05-01 13:45:00,2017-05-01 13:45:52,WALK,,,,0.866667,,
2,2017-05-01,28150.0,1,3,2017-05-01 13:51:00,2017-05-01 14:01:59,BUS,303.0,26195.0,25753.0,10.983333,14508.0,41756.0
3,2017-05-01,28150.0,1,4,2017-05-01 14:02:00,2017-05-01 14:09:32,WALK,,,,7.533333,,
4,2017-05-01,28150.0,2,1,2017-05-01 13:42:58,2017-05-01 13:57:00,BUS,822.0,33788.0,30994.0,14.033333,,14508.0


In [156]:
scheduled_itin_observed_od_full.head()

Unnamed: 0,bt_from_busCode,bt_from_distanceTraveledShape,bt_from_gps_datetime,bt_from_parent_station,bt_from_route,bt_from_shapeId,bt_from_stopPointId,bt_from_stop_lat,bt_from_stop_lon,bt_from_tripNum,...,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,otp_start_time,otp_to_parent_station,otp_to_stop_id,otp_user_trip_id,sched_obs_end_timediff,sched_obs_start_timediff
3,BC929,6303.387,2017-05-01 13:35:18,,827,2895.0,33788.0,-25.436303,-49.362221,12.0,...,1,1,BUS,827,2017-05-01 13:38:33,14508.0,30993.0,28150.0,00:00:46,00:03:15
4,BC929,6303.387,2017-05-01 13:35:18,,827,2895.0,33788.0,-25.436303,-49.362221,12.0,...,1,1,BUS,827,2017-05-01 13:38:33,14508.0,30993.0,28150.0,00:00:46,00:03:15
5,BC929,6303.387,2017-05-01 13:35:18,,827,2895.0,33788.0,-25.436303,-49.362221,12.0,...,1,1,BUS,827,2017-05-01 13:38:33,14508.0,30993.0,28150.0,00:00:46,00:03:15
6,BC929,6303.387,2017-05-01 13:35:18,,827,2895.0,33788.0,-25.436303,-49.362221,12.0,...,1,1,BUS,827,2017-05-01 13:38:33,14508.0,30993.0,28150.0,00:00:46,00:03:15
15,BC929,6303.387,2017-05-01 14:22:34,,827,2895.0,33788.0,-25.436303,-49.362221,13.0,...,1,1,BUS,827,2017-05-01 13:38:33,14508.0,30993.0,28150.0,00:45:58,00:44:01


## Keep only OTP-BusTrips itineraries which figure in first-vehicle-boarding matched itineraries

In [169]:
matched_vehicle_boardings_itins = vehic_first_boardings_options.filter(['otp_user_trip_id','otp_itinerary_id'])

In [170]:
matched_vehicle_boardings_itins.head()

Unnamed: 0,otp_user_trip_id,otp_itinerary_id
0,28150.0,1
2,28262.0,1
4,28424.0,1
6,28154.0,1
7,28154.0,2


In [171]:
scheduled_itin_observed_od_full.head()

Unnamed: 0,bt_from_busCode,bt_from_distanceTraveledShape,bt_from_gps_datetime,bt_from_parent_station,bt_from_route,bt_from_shapeId,bt_from_stopPointId,bt_from_stop_lat,bt_from_stop_lon,bt_from_tripNum,...,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,otp_start_time,otp_to_parent_station,otp_to_stop_id,otp_user_trip_id,sched_obs_end_timediff,sched_obs_start_timediff
3,BC929,6303.387,2017-05-01 13:35:18,,827,2895.0,33788.0,-25.436303,-49.362221,12.0,...,1,1,BUS,827,2017-05-01 13:38:33,14508.0,30993.0,28150.0,00:00:46,00:03:15
4,BC929,6303.387,2017-05-01 13:35:18,,827,2895.0,33788.0,-25.436303,-49.362221,12.0,...,1,1,BUS,827,2017-05-01 13:38:33,14508.0,30993.0,28150.0,00:00:46,00:03:15
5,BC929,6303.387,2017-05-01 13:35:18,,827,2895.0,33788.0,-25.436303,-49.362221,12.0,...,1,1,BUS,827,2017-05-01 13:38:33,14508.0,30993.0,28150.0,00:00:46,00:03:15
6,BC929,6303.387,2017-05-01 13:35:18,,827,2895.0,33788.0,-25.436303,-49.362221,12.0,...,1,1,BUS,827,2017-05-01 13:38:33,14508.0,30993.0,28150.0,00:00:46,00:03:15
15,BC929,6303.387,2017-05-01 14:22:34,,827,2895.0,33788.0,-25.436303,-49.362221,13.0,...,1,1,BUS,827,2017-05-01 13:38:33,14508.0,30993.0,28150.0,00:45:58,00:44:01


In [172]:
len(scheduled_itin_observed_od_full)

19121

In [173]:
scheduled_itin_observed_od.columns

Index([u'otp_date', u'otp_user_trip_id', u'otp_itinerary_id', u'otp_leg_id',
       u'otp_start_time', u'otp_end_time', u'otp_mode', u'otp_route',
       u'otp_from_stop_id', u'otp_to_stop_id', u'otp_duration_mins',
       u'otp_from_parent_station', u'otp_to_parent_station', u'bt_from_route',
       u'bt_from_busCode', u'bt_from_shapeId', u'bt_from_tripNum',
       u'bt_from_stopPointId', u'bt_from_gps_datetime',
       u'bt_from_distanceTraveledShape', u'bt_from_stop_lat',
       u'bt_from_stop_lon', u'bt_from_parent_station',
       u'sched_obs_start_timediff', u'bt_to_route', u'bt_to_busCode',
       u'bt_to_shapeId', u'bt_to_tripNum', u'bt_to_stopPointId',
       u'bt_to_gps_datetime', u'bt_to_distanceTraveledShape',
       u'bt_to_stop_lat', u'bt_to_stop_lon', u'bt_to_parent_station',
       u'sched_obs_end_timediff'],
      dtype='object')

In [174]:
matched_vehicle_boardings_itins.columns

Index([u'otp_user_trip_id', u'otp_itinerary_id'], dtype='object')

In [196]:
vehicle_boardings_obs_sch_itin_legs = scheduled_itin_observed_od_full.merge(matched_vehicle_boardings_itins,how='inner') \
                                        .sort_values(['otp_user_trip_id','otp_itinerary_id','otp_leg_id'])

In [197]:
len(vehicle_boardings_obs_sch_itin_legs)

1864

In [198]:
vehicle_boardings_obs_sch_itin_legs.head()

Unnamed: 0,bt_from_busCode,bt_from_distanceTraveledShape,bt_from_gps_datetime,bt_from_parent_station,bt_from_route,bt_from_shapeId,bt_from_stopPointId,bt_from_stop_lat,bt_from_stop_lon,bt_from_tripNum,...,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,otp_start_time,otp_to_parent_station,otp_to_stop_id,otp_user_trip_id,sched_obs_end_timediff,sched_obs_start_timediff
0,BC929,6303.387,2017-05-01 13:35:18,,827,2895.0,33788.0,-25.436303,-49.362221,12.0,...,1,1,BUS,827,2017-05-01 13:38:33,14508.0,30993.0,28150.0,00:00:46,00:03:15
1,BC929,6303.387,2017-05-01 13:35:18,,827,2895.0,33788.0,-25.436303,-49.362221,12.0,...,1,1,BUS,827,2017-05-01 13:38:33,14508.0,30993.0,28150.0,00:00:46,00:03:15
2,BC929,6303.387,2017-05-01 13:35:18,,827,2895.0,33788.0,-25.436303,-49.362221,12.0,...,1,1,BUS,827,2017-05-01 13:38:33,14508.0,30993.0,28150.0,00:00:46,00:03:15
3,BC929,6303.387,2017-05-01 13:35:18,,827,2895.0,33788.0,-25.436303,-49.362221,12.0,...,1,1,BUS,827,2017-05-01 13:38:33,14508.0,30993.0,28150.0,00:00:46,00:03:15
4,BC929,6303.387,2017-05-01 14:22:34,,827,2895.0,33788.0,-25.436303,-49.362221,13.0,...,1,1,BUS,827,2017-05-01 13:38:33,14508.0,30993.0,28150.0,00:45:58,00:44:01


### Concatenating first boarding legs to other itinerary legs

#### Preparing First Boarding legs data

In [199]:
vehic_first_boardings_options.columns

Index([u'o_boarding_id', u'o_boarding_datetime', u'o_gps_datetime', u'o_route',
       u'o_busCode', u'o_tripNum', u'o_stopPointId', u'o_stop_lat',
       u'o_stop_lon', u'otp_user_trip_id', u'otp_itinerary_id', u'otp_date',
       u'otp_leg_id', u'otp_start_time', u'otp_end_time', u'otp_mode',
       u'otp_route', u'otp_from_stop_id', u'otp_to_stop_id',
       u'otp_duration_mins', u'otp_from_parent_station',
       u'otp_to_parent_station', u'route', u'busCode', u'shapeId', u'tripNum',
       u'stopPointId', u'gps_datetime', u'distanceTraveledShape', u'stop_lat',
       u'stop_lon', u'parent_station', u'sched_actual_start_timediff'],
      dtype='object')

In [200]:
vehic_first_boardings_options_clean = vehic_first_boardings_options \
    .filter(['otp_user_trip_id','otp_itinerary_id','otp_leg_id','otp_mode','otp_route','o_busCode',
             'o_tripNum','otp_from_stop_id','otp_start_time','o_boarding_datetime','otp_to_stop_id',
             'otp_end_time','gps_datetime','otp_duration_mins']) \
    .rename(index=str, columns={'o_busCode':'bt_bus_code','o_tripNum':'bt_trip_num',
                                'o_boarding_datetime':'bt_start_time','gps_datetime':'bt_end_time'}) \
    .assign(bt_duration_mins = lambda x: (x.bt_end_time - x.bt_start_time)/pd.Timedelta(minutes=1)) \
    .assign(considered_duration_mins = lambda x: np.where(np.isnan(x.bt_duration_mins),x.otp_duration_mins,x.bt_duration_mins))

In [201]:
vehic_first_boardings_options_clean.head()

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,otp_to_stop_id,otp_end_time,bt_end_time,otp_duration_mins,bt_duration_mins,considered_duration_mins
0,28150.0,1,1,BUS,827,BC929,12.0,33788.0,2017-05-01 13:38:33,2017-05-01 13:36:25,30993.0,2017-05-01 13:45:00,2017-05-01 13:44:14,6.45,7.816667,7.816667
2,28262.0,1,1,BUS,827,BC929,12.0,33775.0,2017-05-01 13:40:17,2017-05-01 13:37:59,30993.0,2017-05-01 13:45:00,2017-05-01 13:44:14,4.716667,6.25,6.25
4,28424.0,1,1,BUS,827,BC929,12.0,33535.0,2017-05-01 13:41:48,2017-05-01 13:39:51,30993.0,2017-05-01 13:45:00,2017-05-01 13:44:14,3.2,4.383333,4.383333
6,28154.0,1,1,BUS,684,HA298,11.0,39378.0,2017-05-01 13:42:13,2017-05-01 13:36:28,31053.0,2017-05-01 14:18:00,2017-05-01 14:01:41,35.783333,25.216667,25.216667
7,28154.0,2,1,BUS,684,HA298,11.0,39378.0,2017-05-01 13:57:13,2017-05-01 13:36:28,31053.0,2017-05-01 14:33:00,2017-05-01 14:01:41,35.783333,25.216667,25.216667


In [202]:
len(vehic_first_boardings_options_clean)

138

#### Preparing OTP itinerary legs data

In [203]:
vehicle_boardings_obs_sch_itin_legs.columns.values

array(['bt_from_busCode', 'bt_from_distanceTraveledShape',
       'bt_from_gps_datetime', 'bt_from_parent_station', 'bt_from_route',
       'bt_from_shapeId', 'bt_from_stopPointId', 'bt_from_stop_lat',
       'bt_from_stop_lon', 'bt_from_tripNum', 'bt_to_busCode',
       'bt_to_distanceTraveledShape', 'bt_to_gps_datetime',
       'bt_to_parent_station', 'bt_to_route', 'bt_to_shapeId',
       'bt_to_stopPointId', 'bt_to_stop_lat', 'bt_to_stop_lon',
       'bt_to_tripNum', 'otp_date', 'otp_duration_mins', 'otp_end_time',
       'otp_from_parent_station', 'otp_from_stop_id', 'otp_itinerary_id',
       'otp_leg_id', 'otp_mode', 'otp_route', 'otp_start_time',
       'otp_to_parent_station', 'otp_to_stop_id', 'otp_user_trip_id',
       'sched_obs_end_timediff', 'sched_obs_start_timediff'], dtype=object)

In [204]:
vehicle_boardings_obs_sch_itin_legs_clean = vehicle_boardings_obs_sch_itin_legs \
    .filter(['otp_user_trip_id','otp_itinerary_id','otp_leg_id','otp_mode','otp_route','bt_from_busCode',
             'bt_from_tripNum','otp_from_stop_id','otp_start_time','bt_from_gps_datetime','otp_to_stop_id',
             'otp_end_time','bt_to_gps_datetime','otp_duration_mins'])\
    .rename(index=str, columns={'bt_from_busCode':'bt_bus_code','bt_from_tripNum':'bt_trip_num',
                                'bt_from_gps_datetime':'bt_start_time','bt_to_gps_datetime':'bt_end_time'}) \
    .assign(bt_duration_mins = lambda x: (x.bt_end_time - x.bt_start_time)/pd.Timedelta(minutes=1)) \
    .assign(considered_duration_mins = lambda x: np.where(np.isnan(x.bt_duration_mins),x.otp_duration_mins,x.bt_duration_mins))

In [205]:
vehicle_boardings_obs_sch_itin_legs_clean.head()

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,otp_to_stop_id,otp_end_time,bt_end_time,otp_duration_mins,bt_duration_mins,considered_duration_mins
0,28150.0,1,1,BUS,827,BC929,12.0,33788.0,2017-05-01 13:38:33,2017-05-01 13:35:18,30993.0,2017-05-01 13:45:00,2017-05-01 13:44:14,6.45,8.933333,8.933333
1,28150.0,1,1,BUS,827,BC929,12.0,33788.0,2017-05-01 13:38:33,2017-05-01 13:35:18,30993.0,2017-05-01 13:45:00,2017-05-01 13:44:14,6.45,8.933333,8.933333
2,28150.0,1,1,BUS,827,BC929,12.0,33788.0,2017-05-01 13:38:33,2017-05-01 13:35:18,30993.0,2017-05-01 13:45:00,2017-05-01 13:44:14,6.45,8.933333,8.933333
3,28150.0,1,1,BUS,827,BC929,12.0,33788.0,2017-05-01 13:38:33,2017-05-01 13:35:18,30993.0,2017-05-01 13:45:00,2017-05-01 13:44:14,6.45,8.933333,8.933333
4,28150.0,1,1,BUS,827,BC929,13.0,33788.0,2017-05-01 13:38:33,2017-05-01 14:22:34,30993.0,2017-05-01 13:45:00,2017-05-01 14:30:58,6.45,8.4,8.4


In [206]:
len(vehicle_boardings_obs_sch_itin_legs_clean)

1864

#### Concatenating Legs

In [207]:
len(vehic_first_boardings_options_clean)

138

In [208]:
vehic_first_boardings_options_clean_keys = vehic_first_boardings_options_clean.filter(['otp_user_trip_id','otp_itinerary_id','otp_leg_id']).drop_duplicates()

In [209]:
len(vehic_first_boardings_options_clean_keys)

125

In [210]:
vehic_first_boardings_options_clean_keys.head(10)

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id
0,28150.0,1,1
2,28262.0,1,1
4,28424.0,1,1
6,28154.0,1,1
7,28154.0,2,1
11,28229.0,1,1
12,28229.0,2,1
20,57607.0,1,1
22,57607.0,5,1
26,28161.0,4,1


In [211]:
vehicle_legs_merged = vehicle_boardings_obs_sch_itin_legs_clean.merge(vehic_first_boardings_options_clean_keys, how='outer', indicator=True)
vehicle_legs_rest_clean = vehicle_legs_merged[vehicle_legs_merged['_merge'] == 'left_only'].drop('_merge', axis=1)

In [212]:
len(vehicle_legs_rest_clean)

807

In [213]:
vehicle_legs_rest_clean

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,otp_to_stop_id,otp_end_time,bt_end_time,otp_duration_mins,bt_duration_mins,considered_duration_mins
6,28150.0,1,2,WALK,,,,,2017-05-01 13:45:00,NaT,,2017-05-01 13:45:52,NaT,0.866667,,0.866667
7,28150.0,1,3,BUS,303,DE722,4.0,26195.0,2017-05-01 13:51:00,2017-05-01 14:02:48,25753.0,2017-05-01 14:01:59,2017-05-01 14:12:10,10.983333,9.366667,9.366667
8,28150.0,1,3,BUS,303,DE710,4.0,26195.0,2017-05-01 13:51:00,2017-05-01 13:27:30,25753.0,2017-05-01 14:01:59,2017-05-01 13:35:59,10.983333,8.483333,8.483333
9,28150.0,1,3,BUS,303,DE713,4.0,26195.0,2017-05-01 13:51:00,2017-05-01 13:16:34,25753.0,2017-05-01 14:01:59,2017-05-01 13:24:23,10.983333,7.816667,7.816667
10,28150.0,1,3,BUS,303,DE708,5.0,26195.0,2017-05-01 13:51:00,2017-05-01 14:29:24,25753.0,2017-05-01 14:01:59,2017-05-01 14:37:26,10.983333,8.033333,8.033333
11,28150.0,1,3,BUS,303,DE719,4.0,26195.0,2017-05-01 13:51:00,2017-05-01 13:07:22,25753.0,2017-05-01 14:01:59,2017-05-01 13:15:06,10.983333,7.733333,7.733333
12,28150.0,1,3,BUS,303,LE702,5.0,26195.0,2017-05-01 13:51:00,2017-05-01 14:41:47,25753.0,2017-05-01 14:01:59,2017-05-01 14:50:32,10.983333,8.750000,8.750000
13,28150.0,1,4,WALK,,,,,2017-05-01 14:02:00,NaT,,2017-05-01 14:09:32,NaT,7.533333,,7.533333
23,28154.0,1,2,WALK,,,,,2017-05-01 14:18:00,NaT,,2017-05-01 14:19:14,NaT,1.233333,,1.233333
24,28154.0,1,3,BUS,603,HD243,7.0,26296.0,2017-05-01 14:22:00,2017-05-01 14:22:47,25521.0,2017-05-01 14:53:48,2017-05-01 14:49:17,31.800000,26.500000,26.500000


In [214]:
all_vehicle_legs_options = pd.concat([vehic_first_boardings_options_clean,vehicle_legs_rest_clean]) \
    .sort_values(['otp_user_trip_id','otp_itinerary_id','otp_leg_id','bt_start_time'])

In [215]:
len(all_vehicle_legs_options)

945

In [216]:
all_vehicle_legs_options

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,otp_to_stop_id,otp_end_time,bt_end_time,otp_duration_mins,bt_duration_mins,considered_duration_mins
0,28150.0,1,1,BUS,827,BC929,12.0,33788.0,2017-05-01 13:38:33,2017-05-01 13:36:25,30993.0,2017-05-01 13:45:00,2017-05-01 13:44:14,6.450000,7.816667,7.816667
6,28150.0,1,2,WALK,,,,,2017-05-01 13:45:00,NaT,,2017-05-01 13:45:52,NaT,0.866667,,0.866667
11,28150.0,1,3,BUS,303,DE719,4.0,26195.0,2017-05-01 13:51:00,2017-05-01 13:07:22,25753.0,2017-05-01 14:01:59,2017-05-01 13:15:06,10.983333,7.733333,7.733333
9,28150.0,1,3,BUS,303,DE713,4.0,26195.0,2017-05-01 13:51:00,2017-05-01 13:16:34,25753.0,2017-05-01 14:01:59,2017-05-01 13:24:23,10.983333,7.816667,7.816667
8,28150.0,1,3,BUS,303,DE710,4.0,26195.0,2017-05-01 13:51:00,2017-05-01 13:27:30,25753.0,2017-05-01 14:01:59,2017-05-01 13:35:59,10.983333,8.483333,8.483333
7,28150.0,1,3,BUS,303,DE722,4.0,26195.0,2017-05-01 13:51:00,2017-05-01 14:02:48,25753.0,2017-05-01 14:01:59,2017-05-01 14:12:10,10.983333,9.366667,9.366667
10,28150.0,1,3,BUS,303,DE708,5.0,26195.0,2017-05-01 13:51:00,2017-05-01 14:29:24,25753.0,2017-05-01 14:01:59,2017-05-01 14:37:26,10.983333,8.033333,8.033333
12,28150.0,1,3,BUS,303,LE702,5.0,26195.0,2017-05-01 13:51:00,2017-05-01 14:41:47,25753.0,2017-05-01 14:01:59,2017-05-01 14:50:32,10.983333,8.750000,8.750000
13,28150.0,1,4,WALK,,,,,2017-05-01 14:02:00,NaT,,2017-05-01 14:09:32,NaT,7.533333,,7.533333
6,28154.0,1,1,BUS,684,HA298,11.0,39378.0,2017-05-01 13:42:13,2017-05-01 13:36:28,31053.0,2017-05-01 14:18:00,2017-05-01 14:01:41,35.783333,25.216667,25.216667


### Choose best actual leg matches (based on feasibility and start time)

In [240]:
sample_itinerary_options = all_vehicle_legs_options.iloc[0:9,]

In [241]:
sample_itinerary_options

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,otp_to_stop_id,otp_end_time,bt_end_time,otp_duration_mins,bt_duration_mins,considered_duration_mins
0,28150.0,1,1,BUS,827.0,BC929,12.0,33788.0,2017-05-01 13:38:33,2017-05-01 13:36:25,30993.0,2017-05-01 13:45:00,2017-05-01 13:44:14,6.45,7.816667,7.816667
6,28150.0,1,2,WALK,,,,,2017-05-01 13:45:00,NaT,,2017-05-01 13:45:52,NaT,0.866667,,0.866667
11,28150.0,1,3,BUS,303.0,DE719,4.0,26195.0,2017-05-01 13:51:00,2017-05-01 13:07:22,25753.0,2017-05-01 14:01:59,2017-05-01 13:15:06,10.983333,7.733333,7.733333
9,28150.0,1,3,BUS,303.0,DE713,4.0,26195.0,2017-05-01 13:51:00,2017-05-01 13:16:34,25753.0,2017-05-01 14:01:59,2017-05-01 13:24:23,10.983333,7.816667,7.816667
8,28150.0,1,3,BUS,303.0,DE710,4.0,26195.0,2017-05-01 13:51:00,2017-05-01 13:27:30,25753.0,2017-05-01 14:01:59,2017-05-01 13:35:59,10.983333,8.483333,8.483333
7,28150.0,1,3,BUS,303.0,DE722,4.0,26195.0,2017-05-01 13:51:00,2017-05-01 14:02:48,25753.0,2017-05-01 14:01:59,2017-05-01 14:12:10,10.983333,9.366667,9.366667
10,28150.0,1,3,BUS,303.0,DE708,5.0,26195.0,2017-05-01 13:51:00,2017-05-01 14:29:24,25753.0,2017-05-01 14:01:59,2017-05-01 14:37:26,10.983333,8.033333,8.033333
12,28150.0,1,3,BUS,303.0,LE702,5.0,26195.0,2017-05-01 13:51:00,2017-05-01 14:41:47,25753.0,2017-05-01 14:01:59,2017-05-01 14:50:32,10.983333,8.75,8.75
13,28150.0,1,4,WALK,,,,,2017-05-01 14:02:00,NaT,,2017-05-01 14:09:32,NaT,7.533333,,7.533333


In [417]:
def choose_leg_matches(leg_matches_groups,colnames):
        chosen_leg_matches = pd.DataFrame(columns = colnames)
        chosen_leg_matches = pd.DataFrame()
        prev_group_id = ()
        prev_itin_id = -1
        prev_leg_end_time = pd.NaT
        num_groups_not_survived = 0

        for name, group in leg_matches_groups:
            
                #print
                #print "Name:", name
                #print "Group:"
                #print group
                #print
                
                if ((prev_itin_id == -1)):
                        prev_itin_id = group['otp_itinerary_id'].iloc[0]
                        prev_leg_end_time = group['bt_start_time'].dt.floor('d').iloc[0]
                elif (prev_itin_id != group['otp_itinerary_id'].iloc[0]):
                        prev_leg_end_time = group['bt_start_time'].dt.floor('d').iloc[0]
                    

                #if (prev_group_id == ()):
                #        prev_leg_end_time = group['bt_start_time'].dt.floor('d')[0]

                #print
                #print "Previous leg end time:", prev_leg_end_time
                #print
                #print "Original Group"
                #print group.filter(['otp_start_time','bt_start_time','bt_end_time'])
                
                if (group['otp_mode'].iloc[0] == 'WALK'):
                    #print "Walking duration:", filtered_group['otp_duration_mins']
                    filtered_group = group.copy().reset_index()
                    filtered_group.loc[0,'bt_start_time'] = prev_leg_end_time
                    filtered_group.loc[0,'bt_end_time'] = prev_leg_end_time + \
                        pd.Timedelta(minutes=np.rint(filtered_group['otp_duration_mins'].iloc[0]))
                    
                    #print "Filtered Group"
                    #print filtered_group
                else:
                    filtered_group = group[group['bt_start_time'] > prev_leg_end_time]
                
                #print
                #print "Filtered Group"
                #print filtered_group.filter(['otp_start_time','bt_start_time','bt_end_time'])

                if (len(filtered_group) == 0):
                        #print "Group did not survive! =("
                        num_groups_not_survived += 1
                        continue

                chosen_leg_match = filtered_group.sort_values('bt_start_time').iloc[0]
                #print "Chosen Leg"
                #print chosen_leg_match

                chosen_leg_matches = chosen_leg_matches.append(chosen_leg_match)

                #Update variables
                #prev_group_id = name
                prev_itin_id = group['otp_itinerary_id'].iloc[0]
                prev_leg_end_time = chosen_leg_match['bt_end_time']

        #print num_groups_not_survived
        return chosen_leg_matches.filter(colnames)


In [418]:
chosen_legs = choose_leg_matches(all_vehicle_legs_options.groupby(['otp_user_trip_id','otp_itinerary_id','otp_leg_id']),
                                all_vehicle_legs_options.columns)

In [419]:
chosen_legs

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,otp_to_stop_id,otp_end_time,bt_end_time,otp_duration_mins,bt_duration_mins,considered_duration_mins
0,28150.0,1.0,1.0,BUS,827,BC929,12.0,33788.0,2017-05-01 13:38:33,2017-05-01 13:36:25,30993.0,2017-05-01 13:45:00,2017-05-01 13:44:14,6.450000,7.816667,7.816667
0,28150.0,1.0,2.0,WALK,,,,,2017-05-01 13:45:00,2017-05-01 13:44:14,,2017-05-01 13:45:52,2017-05-01 13:45:14,0.866667,,0.866667
7,28150.0,1.0,3.0,BUS,303,DE722,4.0,26195.0,2017-05-01 13:51:00,2017-05-01 14:02:48,25753.0,2017-05-01 14:01:59,2017-05-01 14:12:10,10.983333,9.366667,9.366667
0,28150.0,1.0,4.0,WALK,,,,,2017-05-01 14:02:00,2017-05-01 14:12:10,,2017-05-01 14:09:32,2017-05-01 14:20:10,7.533333,,7.533333
0,28154.0,1.0,2.0,WALK,,,,,2017-05-01 14:18:00,2017-05-01 14:20:10,,2017-05-01 14:19:14,2017-05-01 14:21:10,1.233333,,1.233333
24,28154.0,1.0,3.0,BUS,603,HD243,7.0,26296.0,2017-05-01 14:22:00,2017-05-01 14:22:47,25521.0,2017-05-01 14:53:48,2017-05-01 14:49:17,31.800000,26.500000,26.500000
0,28154.0,1.0,4.0,WALK,,,,,2017-05-01 14:53:49,2017-05-01 14:49:17,,2017-05-01 14:55:49,2017-05-01 14:51:17,2.000000,,2.000000
7,28154.0,2.0,1.0,BUS,684,HA298,11.0,39378.0,2017-05-01 13:57:13,2017-05-01 13:36:28,31053.0,2017-05-01 14:33:00,2017-05-01 14:01:41,35.783333,25.216667,25.216667
0,28154.0,2.0,2.0,WALK,,,,,2017-05-01 14:33:00,2017-05-01 14:01:41,,2017-05-01 14:34:14,2017-05-01 14:02:41,1.233333,,1.233333
47,28154.0,2.0,3.0,BUS,603,HD250,6.0,26296.0,2017-05-01 14:39:00,2017-05-01 14:07:27,25521.0,2017-05-01 15:10:48,2017-05-01 14:32:45,31.800000,25.300000,25.300000


#### Remove last walk legs of itinerary

In [424]:
chosen_legs_clean = chosen_legs.groupby(['otp_user_trip_id','otp_itinerary_id']) \
            .apply(lambda x: x.iloc[:-1] if x.otp_mode.iloc[-1] == 'WALK' else x) \
            .reset_index(drop=True) \
            .sort_values(['otp_user_trip_id','otp_itinerary_id','otp_leg_id'])

In [425]:
chosen_legs_clean

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,otp_to_stop_id,otp_end_time,bt_end_time,otp_duration_mins,bt_duration_mins,considered_duration_mins
0,28150.0,1.0,1.0,BUS,827,BC929,12.0,33788.0,2017-05-01 13:38:33,2017-05-01 13:36:25,30993.0,2017-05-01 13:45:00,2017-05-01 13:44:14,6.450000,7.816667,7.816667
1,28150.0,1.0,2.0,WALK,,,,,2017-05-01 13:45:00,2017-05-01 13:44:14,,2017-05-01 13:45:52,2017-05-01 13:45:14,0.866667,,0.866667
2,28150.0,1.0,3.0,BUS,303,DE722,4.0,26195.0,2017-05-01 13:51:00,2017-05-01 14:02:48,25753.0,2017-05-01 14:01:59,2017-05-01 14:12:10,10.983333,9.366667,9.366667
3,28154.0,1.0,2.0,WALK,,,,,2017-05-01 14:18:00,2017-05-01 14:20:10,,2017-05-01 14:19:14,2017-05-01 14:21:10,1.233333,,1.233333
4,28154.0,1.0,3.0,BUS,603,HD243,7.0,26296.0,2017-05-01 14:22:00,2017-05-01 14:22:47,25521.0,2017-05-01 14:53:48,2017-05-01 14:49:17,31.800000,26.500000,26.500000
5,28154.0,2.0,1.0,BUS,684,HA298,11.0,39378.0,2017-05-01 13:57:13,2017-05-01 13:36:28,31053.0,2017-05-01 14:33:00,2017-05-01 14:01:41,35.783333,25.216667,25.216667
6,28154.0,2.0,2.0,WALK,,,,,2017-05-01 14:33:00,2017-05-01 14:01:41,,2017-05-01 14:34:14,2017-05-01 14:02:41,1.233333,,1.233333
7,28154.0,2.0,3.0,BUS,603,HD250,6.0,26296.0,2017-05-01 14:39:00,2017-05-01 14:07:27,25521.0,2017-05-01 15:10:48,2017-05-01 14:32:45,31.800000,25.300000,25.300000
8,28161.0,4.0,1.0,BUS,050,JB301,5.0,14404.0,2017-05-01 13:44:57,2017-05-01 13:36:33,32454.0,2017-05-01 13:51:00,2017-05-01 13:39:53,6.050000,3.333333,3.333333
9,28161.0,4.0,2.0,WALK,,,,,2017-05-01 13:51:00,2017-05-01 13:39:53,,2017-05-01 13:51:58,2017-05-01 13:40:53,0.966667,,0.966667


#### Add stops data to legs

In [436]:
def add_stops_data_to_legs(itineraries_legs,stops_locs):
    itineraries_legs_stops = itineraries_legs.merge(stops_locs, left_on='otp_from_stop_id', right_on='stop_id', how='left') \
                                                                                .drop('stop_id', axis=1) \
                                                                                .rename(index=str, columns={'stop_lat':'from_stop_lat','stop_lon':'from_stop_lon'}) \
                                                                                .merge(stops_locations, left_on='otp_to_stop_id', right_on='stop_id', how='left') \
                                                                                .drop('stop_id', axis=1) \
                                                                                .rename(index=str, columns={'stop_lat':'to_stop_lat','stop_lon':'to_stop_lon'}) 
    return itineraries_legs_stops


In [437]:
stops_locations = stops_df[['stop_id','stop_lat','stop_lon']]

In [438]:
itineraries_legs = add_stops_data_to_legs(chosen_legs_clean,stops_locations)

In [485]:
itineraries_legs.head()

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,otp_to_stop_id,otp_end_time,bt_end_time,otp_duration_mins,bt_duration_mins,considered_duration_mins,from_stop_lat,from_stop_lon,to_stop_lat,to_stop_lon
0,28150.0,1.0,1.0,BUS,827.0,BC929,12.0,33788.0,2017-05-01 13:38:33,2017-05-01 13:36:25,30993.0,2017-05-01 13:45:00,2017-05-01 13:44:14,6.45,7.816667,7.816667,-25.436303,-49.362221,-25.441705,-49.346328
1,28150.0,1.0,2.0,WALK,,,,,2017-05-01 13:45:00,2017-05-01 13:44:14,,2017-05-01 13:45:52,2017-05-01 13:45:14,0.866667,,0.866667,,,,
2,28150.0,1.0,3.0,BUS,303.0,DE722,4.0,26195.0,2017-05-01 13:51:00,2017-05-01 14:02:48,25753.0,2017-05-01 14:01:59,2017-05-01 14:12:10,10.983333,9.366667,9.366667,-25.44172,-49.346978,-25.437968,-49.314099
3,28154.0,1.0,2.0,WALK,,,,,2017-05-01 14:18:00,2017-05-01 14:20:10,,2017-05-01 14:19:14,2017-05-01 14:21:10,1.233333,,1.233333,,,,
4,28154.0,1.0,3.0,BUS,603.0,HD243,7.0,26296.0,2017-05-01 14:22:00,2017-05-01 14:22:47,25521.0,2017-05-01 14:53:48,2017-05-01 14:49:17,31.8,26.5,26.5,-25.512252,-49.29558,-25.442988,-49.280469


In [503]:
itineraries_legs.columns.values

array(['otp_user_trip_id', 'otp_itinerary_id', 'otp_leg_id', 'otp_mode',
       'otp_route', 'bt_bus_code', 'bt_trip_num', 'otp_from_stop_id',
       'otp_start_time', 'bt_start_time', 'otp_to_stop_id',
       'otp_end_time', 'bt_end_time', 'otp_duration_mins',
       'bt_duration_mins', 'considered_duration_mins', 'from_stop_lat',
       'from_stop_lon', 'to_stop_lat', 'to_stop_lon'], dtype=object)

#### Add back card number

In [512]:
trips_on_pairs_full.columns

Index([u'cardNum', u'o_boarding_id', u'o_boarding_datetime', u'o_gps_datetime',
       u'o_route', u'o_busCode', u'o_tripNum', u'o_stopPointId', u'o_stop_lat',
       u'o_stop_lon', u'next_o_boarding_id', u'next_o_boarding_datetime',
       u'next_o_gps_datetime', u'next_o_route', u'next_o_busCode',
       u'next_o_tripNum', u'next_o_stopPointId', u'next_o_stop_lat',
       u'next_o_stop_lon', u'boardings_timediff', u'dist_between_origins'],
      dtype='object')

In [513]:
passenger_trips_ids = trips_on_pairs_full.filter(['cardNum','o_boarding_id']) \
                            .rename(index=str, columns={'cardNum':'card_num','o_boarding_id':'trip_id'})

In [514]:
passenger_trips_ids.head()

Unnamed: 0,card_num,trip_id
0,1886552.0,180.0
1,1886552.0,40392.0
2,3601076.0,181.0
3,3601076.0,31560.0
4,3282345.0,182.0


In [515]:
pass_itins_legs = itineraries_legs.merge(passenger_trips_ids, 
                       left_on=['otp_user_trip_id'], 
                       right_on=['trip_id'],
                       how='left') \
                .drop('otp_user_trip_id', axis=1) \
                .filter(np.append(['card_num','trip_id'],itineraries_legs.columns.values[1:])) \
                .sort_values(['card_num','trip_id','otp_itinerary_id','otp_leg_id'])

In [510]:
pass_itins_legs

Unnamed: 0,card_num,trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,...,otp_to_stop_id,otp_end_time,bt_end_time,otp_duration_mins,bt_duration_mins,considered_duration_mins,from_stop_lat,from_stop_lon,to_stop_lat,to_stop_lon
57,827182.0,28276.0,1.0,1.0,BUS,652,LA011,11.0,36694.0,2017-05-01 13:46:38,...,28609.0,2017-05-01 14:04:00,2017-05-01 13:55:44,17.366667,17.550000,17.550000,-25.534742,-49.332760,-25.511657,-49.324786
58,827182.0,28276.0,1.0,2.0,WALK,,,,,2017-05-01 14:04:00,...,,2017-05-01 14:04:05,2017-05-01 13:55:44,0.083333,,0.083333,,,,
59,827182.0,28276.0,1.0,4.0,WALK,,,,,2017-05-01 14:20:30,...,,2017-05-01 14:21:55,2017-05-01 13:56:44,1.416667,,1.416667,,,,
60,827182.0,28276.0,1.0,5.0,BUS,203,BD135,4.0,26199.0,2017-05-01 14:24:00,...,25732.0,2017-05-01 14:41:26,2017-05-01 14:23:46,17.433333,20.266667,20.266667,-25.491755,-49.293490,-25.448323,-49.287777
61,827182.0,28276.0,6.0,1.0,BUS,652,LA011,11.0,36694.0,2017-05-01 14:07:11,...,28609.0,2017-05-01 14:26:00,2017-05-01 13:55:44,18.816667,17.550000,17.550000,-25.534742,-49.332760,-25.511657,-49.324786
62,827182.0,28276.0,6.0,2.0,WALK,,,,,2017-05-01 14:26:00,...,,2017-05-01 14:26:05,2017-05-01 13:55:44,0.083333,,0.083333,,,,
63,827182.0,28276.0,6.0,4.0,WALK,,,,,2017-05-01 14:44:30,...,,2017-05-01 14:46:29,2017-05-01 13:57:44,1.983333,,1.983333,,,,
64,827182.0,28276.0,6.0,5.0,BUS,603,HD249,7.0,26294.0,2017-05-01 14:47:37,...,25732.0,2017-05-01 15:07:00,2017-05-01 14:14:50,19.383333,15.016667,15.016667,-25.492347,-49.293421,-25.448323,-49.287777
241,869122.0,52894.0,2.0,1.0,BUS,828,LA012,9.0,34119.0,2017-05-01 18:11:10,...,31000.0,2017-05-01 18:18:00,2017-05-01 16:38:17,6.833333,-80.966667,-80.966667,-25.451180,-49.350724,-25.441536,-49.347001
242,869122.0,52894.0,2.0,2.0,WALK,,,,,2017-05-01 18:18:00,...,,2017-05-01 18:20:42,2017-05-01 16:41:17,2.700000,,2.700000,,,,


In [509]:
pass_itins_legs.columns.values

array(['card_num', 'trip_id', 'otp_itinerary_id', 'otp_leg_id',
       'otp_mode', 'otp_route', 'bt_bus_code', 'bt_trip_num',
       'otp_from_stop_id', 'otp_start_time', 'bt_start_time',
       'otp_to_stop_id', 'otp_end_time', 'bt_end_time',
       'otp_duration_mins', 'bt_duration_mins',
       'considered_duration_mins', 'from_stop_lat', 'from_stop_lon',
       'to_stop_lat', 'to_stop_lon'], dtype=object)

### Summarizing suggested itineraries information

In [563]:
def build_candidate_itineraries_df(chosen_leg_matches_data):
        otp_buste_itineraries = chosen_leg_matches_data \
                                        .query('otp_mode == \'BUS\'') \
                                        .groupby(['card_num','trip_id','otp_itinerary_id']) \
                                        .agg({'otp_from_stop_id': lambda x: x.iloc[0],
                                                  'bt_start_time': lambda x: x.iloc[0],
                                                  'from_stop_lat': lambda x: x.iloc[0],
                                                  'from_stop_lon': lambda x: x.iloc[0],
                                                  'otp_to_stop_id': lambda x: x.iloc[-1],
                                                  'bt_end_time': lambda x: x.iloc[-1],
                                                  'to_stop_lat': lambda x: x.iloc[-1],
                                                  'to_stop_lon': lambda x: x.iloc[-1],
                                                  'otp_mode': lambda x: len(x)}) \
                                        .rename(index=str, columns={'otp_mode':'num_transfers'}) \
                                        .reset_index() \
                                        .reindex(['card_num','trip_id','otp_itinerary_id',
                                                  'otp_from_stop_id','bt_start_time','from_stop_lat',
                                                  'from_stop_lon','otp_to_stop_id','bt_end_time',
                                                  'to_stop_lat','to_stop_lon','num_transfers'], axis=1, copy=False)\
                                        .assign(card_num = lambda x: x['card_num'].astype(float),
                                                trip_id = lambda x: x['trip_id'].astype(float),
                                                otp_itinerary_id = lambda x: x['otp_itinerary_id'].astype(float))
        return otp_buste_itineraries


In [564]:
cand_itineraries_df = build_candidate_itineraries_df(pass_itins_legs)

In [565]:
cand_itineraries_df

Unnamed: 0,card_num,trip_id,otp_itinerary_id,otp_from_stop_id,bt_start_time,from_stop_lat,from_stop_lon,otp_to_stop_id,bt_end_time,to_stop_lat,to_stop_lon,num_transfers
0,827182.0,28276.0,1.0,36694.0,2017-05-01 13:38:11,-25.534742,-49.332760,25732.0,2017-05-01 14:23:46,-25.448323,-49.287777,2
1,827182.0,28276.0,6.0,36694.0,2017-05-01 13:38:11,-25.534742,-49.332760,25732.0,2017-05-01 14:14:50,-25.448323,-49.287777,2
2,869122.0,52894.0,2.0,34119.0,2017-05-01 17:59:15,-25.451180,-49.350724,31000.0,2017-05-01 16:38:17,-25.441536,-49.347001,1
3,1024869.0,28174.0,1.0,33580.0,2017-05-01 13:36:42,-25.506291,-49.224019,26905.0,2017-05-01 14:22:51,-25.438600,-49.268210,2
4,1024869.0,28174.0,5.0,33580.0,2017-05-01 13:36:42,-25.506291,-49.224019,26905.0,2017-05-01 14:22:51,-25.438600,-49.268210,2
5,1168962.0,28455.0,1.0,26296.0,2017-05-01 14:37:36,-25.512252,-49.295580,26197.0,2017-05-01 14:51:05,-25.476372,-49.292607,1
6,1168962.0,28455.0,2.0,37403.0,2017-05-01 13:40:12,-25.557041,-49.283864,26197.0,2017-05-01 14:27:12,-25.476372,-49.292607,2
7,1354462.0,28373.0,4.0,39837.0,2017-05-01 13:39:18,-25.556058,-49.332414,33987.0,2017-05-01 15:12:34,-25.428949,-49.353564,2
8,1354462.0,39353.0,1.0,33745.0,2017-05-01 15:40:50,-25.435506,-49.349914,35179.0,2017-05-01 16:40:50,-25.551285,-49.332021,2
9,1447119.0,58869.0,5.0,25716.0,2017-05-01 19:50:09,-25.422406,-49.291661,36283.0,2017-05-01 20:23:16,-25.573030,-49.334941,3


In [566]:
cand_itineraries_df.dtypes

card_num                   float64
trip_id                    float64
otp_itinerary_id           float64
otp_from_stop_id           float64
bt_start_time       datetime64[ns]
from_stop_lat              float64
from_stop_lon              float64
otp_to_stop_id             float64
bt_end_time         datetime64[ns]
to_stop_lat                float64
to_stop_lon                float64
num_transfers                int64
dtype: object

#### Add origin/next-origin locations

In [567]:
passenger_trips_valid_df = trips_on_pairs_full.filter(['cardNum','o_boarding_id','o_stop_lat','o_stop_lon','o_boarding_datetime','next_o_stop_lat','next_o_stop_lon','next_o_boarding_datetime']) \
                            .rename(index=str, columns={'cardNum':'card_num','o_boarding_id':'trip_id'})

In [568]:
passenger_trips_valid_df.head()

Unnamed: 0,card_num,trip_id,o_stop_lat,o_stop_lon,o_boarding_datetime,next_o_stop_lat,next_o_stop_lon,next_o_boarding_datetime
0,1886552.0,180.0,-25.568989,-49.332253,2017-05-01 04:48:42,-25.431951,-49.296491,2017-05-01 15:52:39
1,1886552.0,40392.0,-25.431951,-49.296491,2017-05-01 15:52:39,-25.568989,-49.332253,2017-05-01 04:48:42
2,3601076.0,181.0,-25.56686,-49.3329,2017-05-01 04:49:13,-25.447479,-49.263816,2017-05-01 14:16:55
3,3601076.0,31560.0,-25.447479,-49.263816,2017-05-01 14:16:55,-25.56686,-49.3329,2017-05-01 04:49:13
4,3282345.0,182.0,-25.56686,-49.3329,2017-05-01 04:49:19,-25.432534,-49.338889,2017-05-01 19:27:16


In [569]:
passenger_trips_locations.dtypes

card_num           float64
trip_id            float64
o_stop_lat         float64
o_stop_lon         float64
next_o_stop_lat    float64
next_o_stop_lon    float64
dtype: object

In [570]:
def dist(p1_lat, p1_lon, p2_lat, p2_lon):
    if(np.isnan([p1_lat, p1_lon, p2_lat, p2_lon]).any()):
        return -1
    else:
        return np.around(distance.geodesic((p1_lat,p1_lon),(p2_lat,p2_lon)).km,decimals=5)

In [571]:
def get_candidate_itineraries_summary(candidate_itineraries,trips_valid):
        otp_buste_itineraries_summary = candidate_itineraries \
                                        .merge(trips_valid,how='inner') \
                                        .assign(start_diff = lambda x: np.absolute(x['bt_start_time'] - x['o_boarding_datetime']),
                                                trip_duration = lambda x: x['bt_end_time'] - x['bt_start_time'],
                                                origin_dist = lambda y: y.apply(lambda x: dist(x['from_stop_lat'], x['from_stop_lon'], x['o_stop_lat'], x['o_stop_lon']),axis=1),
                                                next_origin_dist = lambda y: y.apply(lambda x: dist(x['to_stop_lat'], x['to_stop_lon'], x['next_o_stop_lat'], x['next_o_stop_lon']),axis=1),
                                                next_start_diff = lambda x: np.absolute(x['next_o_boarding_datetime'] - x['bt_end_time'])) \
                                        .sort_values(['card_num','trip_id'])

        #otp_buste_itineraries_summary = otp_buste_itineraries_summary[((otp_buste_itineraries_summary['trip_duration'] > pd.Timedelta('0s')) & 
        #                                                                                                                                (otp_buste_itineraries_summary['trip_duration'] < pd.Timedelta('2h'))) &
        #                                                                                                                        ((otp_buste_itineraries_summary['start_diff'] > pd.Timedelta('0s')) &
        #                                                                                                                                (otp_buste_itineraries_summary['start_diff'] < pd.Timedelta('1.5h')))] \
        #                                                                        .query('origin_dist < 0.1') \
        #                                                                        .query('next_origin_dist < 2.0')
        return otp_buste_itineraries_summary


In [572]:
cand_itineraries_loc_validation = get_candidate_itineraries_summary(cand_itineraries_df,passenger_trips_valid_df)

In [573]:
cand_itineraries_loc_validation

Unnamed: 0,card_num,trip_id,otp_itinerary_id,otp_from_stop_id,bt_start_time,from_stop_lat,from_stop_lon,otp_to_stop_id,bt_end_time,to_stop_lat,...,o_stop_lon,o_boarding_datetime,next_o_stop_lat,next_o_stop_lon,next_o_boarding_datetime,next_origin_dist,next_start_diff,origin_dist,start_diff,trip_duration
0,827182.0,28276.0,1.0,36694.0,2017-05-01 13:38:11,-25.534742,-49.332760,25732.0,2017-05-01 14:23:46,-25.448323,...,-49.332760,2017-05-01 13:38:11,-25.448327,-49.287780,2017-05-01 22:46:43,0.00051,08:22:57,0.00000,00:00:00,00:45:35
1,827182.0,28276.0,6.0,36694.0,2017-05-01 13:38:11,-25.534742,-49.332760,25732.0,2017-05-01 14:14:50,-25.448323,...,-49.332760,2017-05-01 13:38:11,-25.448327,-49.287780,2017-05-01 22:46:43,0.00051,08:31:53,0.00000,00:00:00,00:36:39
2,869122.0,52894.0,2.0,34119.0,2017-05-01 17:59:15,-25.451180,-49.350724,31000.0,2017-05-01 16:38:17,-25.441536,...,-49.350724,2017-05-01 17:59:15,-25.423548,-49.351123,2017-05-01 13:37:00,2.03535,03:01:17,0.00000,00:00:00,-1 days +22:39:02
3,1024869.0,28174.0,1.0,33580.0,2017-05-01 13:36:42,-25.506291,-49.224019,26905.0,2017-05-01 14:22:51,-25.438600,...,-49.224019,2017-05-01 13:36:42,-25.441209,-49.275853,2017-05-01 18:30:23,0.82126,04:07:32,0.00000,00:00:00,00:46:09
4,1024869.0,28174.0,5.0,33580.0,2017-05-01 13:36:42,-25.506291,-49.224019,26905.0,2017-05-01 14:22:51,-25.438600,...,-49.224019,2017-05-01 13:36:42,-25.441209,-49.275853,2017-05-01 18:30:23,0.82126,04:07:32,0.00000,00:00:00,00:46:09
5,1168962.0,28455.0,1.0,26296.0,2017-05-01 14:37:36,-25.512252,-49.295580,26197.0,2017-05-01 14:51:05,-25.476372,...,-49.283864,2017-05-01 13:40:12,-25.476335,-49.292629,2017-05-01 15:21:23,0.00463,00:30:18,5.09962,00:57:24,00:13:29
6,1168962.0,28455.0,2.0,37403.0,2017-05-01 13:40:12,-25.557041,-49.283864,26197.0,2017-05-01 14:27:12,-25.476372,...,-49.283864,2017-05-01 13:40:12,-25.476335,-49.292629,2017-05-01 15:21:23,0.00463,00:54:11,0.00000,00:00:00,00:47:00
7,1354462.0,28373.0,4.0,39837.0,2017-05-01 13:39:18,-25.556058,-49.332414,33987.0,2017-05-01 15:12:34,-25.428949,...,-49.332414,2017-05-01 13:39:18,-25.433111,-49.352019,2017-05-01 15:40:47,0.48663,00:28:13,0.00000,00:00:00,01:33:16
8,1354462.0,39353.0,1.0,33745.0,2017-05-01 15:40:50,-25.435506,-49.349914,35179.0,2017-05-01 16:40:50,-25.551285,...,-49.349914,2017-05-01 15:40:50,-25.556058,-49.332414,2017-05-01 13:39:14,0.53028,03:01:36,0.00000,00:00:00,01:00:00
9,1447119.0,58869.0,5.0,25716.0,2017-05-01 19:50:09,-25.422406,-49.291661,36283.0,2017-05-01 20:23:16,-25.573030,...,-49.301067,2017-05-01 18:59:21,-25.573303,-49.334882,2017-05-01 13:38:02,0.03086,06:45:14,1.37701,00:50:48,00:33:07


In [577]:
valid_candidate_itineraries = cand_itineraries_loc_validation[((cand_itineraries_loc_validation['trip_duration'] > pd.Timedelta('0s')) & 
                                                               (cand_itineraries_loc_validation['trip_duration'] < pd.Timedelta('2h'))) & 
                                                              ((cand_itineraries_loc_validation['start_diff'] >= pd.Timedelta('0s')) & 
                                                               (cand_itineraries_loc_validation['start_diff'] < pd.Timedelta('1.5h')))] \
                                #NEXT_START_DIFF
                                .query('origin_dist < 0.1') \
                                .query('next_origin_dist < 2.0')

In [578]:
valid_candidate_itineraries

Unnamed: 0,card_num,trip_id,otp_itinerary_id,otp_from_stop_id,bt_start_time,from_stop_lat,from_stop_lon,otp_to_stop_id,bt_end_time,to_stop_lat,...,o_stop_lon,o_boarding_datetime,next_o_stop_lat,next_o_stop_lon,next_o_boarding_datetime,next_origin_dist,next_start_diff,origin_dist,start_diff,trip_duration
0,827182.0,28276.0,1.0,36694.0,2017-05-01 13:38:11,-25.534742,-49.332760,25732.0,2017-05-01 14:23:46,-25.448323,...,-49.332760,2017-05-01 13:38:11,-25.448327,-49.287780,2017-05-01 22:46:43,0.00051,08:22:57,0.0,0 days,00:45:35
1,827182.0,28276.0,6.0,36694.0,2017-05-01 13:38:11,-25.534742,-49.332760,25732.0,2017-05-01 14:14:50,-25.448323,...,-49.332760,2017-05-01 13:38:11,-25.448327,-49.287780,2017-05-01 22:46:43,0.00051,08:31:53,0.0,0 days,00:36:39
3,1024869.0,28174.0,1.0,33580.0,2017-05-01 13:36:42,-25.506291,-49.224019,26905.0,2017-05-01 14:22:51,-25.438600,...,-49.224019,2017-05-01 13:36:42,-25.441209,-49.275853,2017-05-01 18:30:23,0.82126,04:07:32,0.0,0 days,00:46:09
4,1024869.0,28174.0,5.0,33580.0,2017-05-01 13:36:42,-25.506291,-49.224019,26905.0,2017-05-01 14:22:51,-25.438600,...,-49.224019,2017-05-01 13:36:42,-25.441209,-49.275853,2017-05-01 18:30:23,0.82126,04:07:32,0.0,0 days,00:46:09
6,1168962.0,28455.0,2.0,37403.0,2017-05-01 13:40:12,-25.557041,-49.283864,26197.0,2017-05-01 14:27:12,-25.476372,...,-49.283864,2017-05-01 13:40:12,-25.476335,-49.292629,2017-05-01 15:21:23,0.00463,00:54:11,0.0,0 days,00:47:00
7,1354462.0,28373.0,4.0,39837.0,2017-05-01 13:39:18,-25.556058,-49.332414,33987.0,2017-05-01 15:12:34,-25.428949,...,-49.332414,2017-05-01 13:39:18,-25.433111,-49.352019,2017-05-01 15:40:47,0.48663,00:28:13,0.0,0 days,01:33:16
8,1354462.0,39353.0,1.0,33745.0,2017-05-01 15:40:50,-25.435506,-49.349914,35179.0,2017-05-01 16:40:50,-25.551285,...,-49.349914,2017-05-01 15:40:50,-25.556058,-49.332414,2017-05-01 13:39:14,0.53028,03:01:36,0.0,0 days,01:00:00
12,1998953.0,52772.0,5.0,35642.0,2017-05-01 17:57:59,-25.423318,-49.268572,29384.0,2017-05-01 18:22:31,-25.366369,...,-49.268572,2017-05-01 17:57:59,-25.368502,-49.270891,2017-05-01 13:36:42,0.28449,04:45:49,0.0,0 days,00:24:32
14,2103372.0,47677.0,8.0,31824.0,2017-05-01 17:08:21,-25.419484,-49.224343,25683.0,2017-05-01 18:14:21,-25.410622,...,-49.224343,2017-05-01 17:08:21,-25.410580,-49.248257,2017-05-01 13:36:54,0.00575,04:37:27,0.0,0 days,01:06:00
15,2168963.0,28252.0,1.0,32459.0,2017-05-01 13:37:55,-25.471072,-49.194606,26045.0,2017-05-01 14:08:27,-25.438111,...,-49.194606,2017-05-01 13:37:55,-25.438187,-49.238717,2017-05-01 18:00:26,0.01067,03:51:59,0.0,0 days,00:30:32


In [581]:
inferred_trip_itineraries = valid_candidate_itineraries.sort_values(['card_num','trip_id','trip_duration']) \
                                .groupby(['card_num','trip_id']) \
                                .first() \
                                .reset_index()
                        

In [582]:
inferred_trip_itineraries

Unnamed: 0,card_num,trip_id,otp_itinerary_id,otp_from_stop_id,bt_start_time,from_stop_lat,from_stop_lon,otp_to_stop_id,bt_end_time,to_stop_lat,...,o_stop_lon,o_boarding_datetime,next_o_stop_lat,next_o_stop_lon,next_o_boarding_datetime,next_origin_dist,next_start_diff,origin_dist,start_diff,trip_duration
0,827182.0,28276.0,6.0,36694.0,2017-05-01 13:38:11,-25.534742,-49.332760,25732.0,2017-05-01 14:14:50,-25.448323,...,-49.332760,2017-05-01 13:38:11,-25.448327,-49.287780,2017-05-01 22:46:43,0.00051,08:31:53,0.0,0 days,00:36:39
1,1024869.0,28174.0,1.0,33580.0,2017-05-01 13:36:42,-25.506291,-49.224019,26905.0,2017-05-01 14:22:51,-25.438600,...,-49.224019,2017-05-01 13:36:42,-25.441209,-49.275853,2017-05-01 18:30:23,0.82126,04:07:32,0.0,0 days,00:46:09
2,1168962.0,28455.0,2.0,37403.0,2017-05-01 13:40:12,-25.557041,-49.283864,26197.0,2017-05-01 14:27:12,-25.476372,...,-49.283864,2017-05-01 13:40:12,-25.476335,-49.292629,2017-05-01 15:21:23,0.00463,00:54:11,0.0,0 days,00:47:00
3,1354462.0,28373.0,4.0,39837.0,2017-05-01 13:39:18,-25.556058,-49.332414,33987.0,2017-05-01 15:12:34,-25.428949,...,-49.332414,2017-05-01 13:39:18,-25.433111,-49.352019,2017-05-01 15:40:47,0.48663,00:28:13,0.0,0 days,01:33:16
4,1354462.0,39353.0,1.0,33745.0,2017-05-01 15:40:50,-25.435506,-49.349914,35179.0,2017-05-01 16:40:50,-25.551285,...,-49.349914,2017-05-01 15:40:50,-25.556058,-49.332414,2017-05-01 13:39:14,0.53028,03:01:36,0.0,0 days,01:00:00
5,1998953.0,52772.0,5.0,35642.0,2017-05-01 17:57:59,-25.423318,-49.268572,29384.0,2017-05-01 18:22:31,-25.366369,...,-49.268572,2017-05-01 17:57:59,-25.368502,-49.270891,2017-05-01 13:36:42,0.28449,04:45:49,0.0,0 days,00:24:32
6,2103372.0,47677.0,8.0,31824.0,2017-05-01 17:08:21,-25.419484,-49.224343,25683.0,2017-05-01 18:14:21,-25.410622,...,-49.224343,2017-05-01 17:08:21,-25.410580,-49.248257,2017-05-01 13:36:54,0.00575,04:37:27,0.0,0 days,01:06:00
7,2168963.0,28252.0,1.0,32459.0,2017-05-01 13:37:55,-25.471072,-49.194606,26045.0,2017-05-01 14:08:27,-25.438111,...,-49.194606,2017-05-01 13:37:55,-25.438187,-49.238717,2017-05-01 18:00:26,0.01067,03:51:59,0.0,0 days,00:30:32
8,2271249.0,28290.0,1.0,8644.0,2017-05-01 13:38:19,-25.576357,-49.322097,26197.0,2017-05-01 14:27:12,-25.476372,...,-49.322097,2017-05-01 13:38:19,-25.476335,-49.292629,2017-05-01 18:35:29,0.00463,04:08:17,0.0,0 days,00:48:53
9,2316370.0,28308.0,2.0,32043.0,2017-05-01 13:38:36,-25.534129,-49.300631,25730.0,2017-05-01 14:19:49,-25.441207,...,-49.300631,2017-05-01 13:38:36,-25.441209,-49.275853,2017-05-01 22:56:59,0.00020,08:37:10,0.0,0 days,00:41:13


In [586]:
inferred_trip_itineraries.filter(['next_origin_dist','next_start_diff','origin_dist','start_diff','trip_duration']).describe()

Unnamed: 0,next_origin_dist,next_start_diff,origin_dist,start_diff,trip_duration
count,69.0,69,69.0,69,69
mean,0.166341,0 days 04:52:47.246376,0.0,0 days 00:00:00,0 days 00:41:34.347826
std,0.317079,0 days 02:51:29.037900,0.0,0 days 00:00:00,0 days 00:25:54.544872
min,0.0,0 days 00:04:38,0.0,0 days 00:00:00,0 days 00:03:28
25%,0.01027,0 days 02:41:44,0.0,0 days 00:00:00,0 days 00:18:41
50%,0.01572,0 days 04:45:49,0.0,0 days 00:00:00,0 days 00:38:41
75%,0.18586,0 days 08:03:17,0.0,0 days 00:00:00,0 days 00:56:10
max,1.70656,0 days 09:41:28,0.0,0 days 00:00:00,0 days 01:41:49


#### Writing Inferred Trips Itineraries dataset to file

In [593]:
inf_trips_itineraries_output_filepath = output_folderpath + os.sep + file_date_str + '_' + itinerary_part_name + '_itins_inf_trips.csv'
inferred_trip_itineraries.to_csv(inf_trips_itineraries_output_filepath,index=False)

#### Getting Itineraries Legs back

In [589]:
inferred_trip_itineraries_legs = pass_itins_legs.merge(inferred_trip_itineraries \
                                                       .filter(['card_num','trip_id','otp_itinerary_id']),
                                                       how='inner') \
                                                .sort_values(['card_num','trip_id','otp_itinerary_id','otp_leg_id'])

In [590]:
inferred_trip_itineraries_legs

Unnamed: 0,card_num,trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,...,otp_to_stop_id,otp_end_time,bt_end_time,otp_duration_mins,bt_duration_mins,considered_duration_mins,from_stop_lat,from_stop_lon,to_stop_lat,to_stop_lon
0,827182.0,28276.0,6.0,1.0,BUS,652,LA011,11.0,36694.0,2017-05-01 14:07:11,...,28609.0,2017-05-01 14:26:00,2017-05-01 13:55:44,18.816667,17.550000,17.550000,-25.534742,-49.332760,-25.511657,-49.324786
1,827182.0,28276.0,6.0,2.0,WALK,,,,,2017-05-01 14:26:00,...,,2017-05-01 14:26:05,2017-05-01 13:55:44,0.083333,,0.083333,,,,
2,827182.0,28276.0,6.0,4.0,WALK,,,,,2017-05-01 14:44:30,...,,2017-05-01 14:46:29,2017-05-01 13:57:44,1.983333,,1.983333,,,,
3,827182.0,28276.0,6.0,5.0,BUS,603,HD249,7.0,26294.0,2017-05-01 14:47:37,...,25732.0,2017-05-01 15:07:00,2017-05-01 14:14:50,19.383333,15.016667,15.016667,-25.492347,-49.293421,-25.448323,-49.287777
4,1024869.0,28174.0,1.0,1.0,BUS,515,EA303,10.0,33580.0,2017-05-01 13:40:27,...,27791.0,2017-05-01 13:56:00,2017-05-01 13:52:12,15.550000,15.500000,15.500000,-25.506291,-49.224019,-25.481843,-49.246769
5,1024869.0,28174.0,1.0,2.0,WALK,,,,,2017-05-01 13:56:00,...,,2017-05-01 13:58:59,2017-05-01 13:55:12,2.983333,,2.983333,,,,
6,1024869.0,28174.0,1.0,3.0,BUS,503,KE842,9.0,27474.0,2017-05-01 14:01:04,...,26905.0,2017-05-01 14:18:06,2017-05-01 14:22:51,17.033333,17.350000,17.350000,-25.481586,-49.247071,-25.438600,-49.268210
7,1168962.0,28455.0,2.0,1.0,BUS,639,GA142,10.0,37403.0,2017-05-01 13:56:48,...,34160.0,2017-05-01 14:14:46,2017-05-01 13:54:13,17.966667,14.016667,14.016667,-25.557041,-49.283864,-25.520732,-49.295383
8,1168962.0,28455.0,2.0,2.0,WALK,,,,,2017-05-01 14:14:46,...,,2017-05-01 14:14:50,2017-05-01 13:54:13,0.066667,,0.066667,,,,
9,1168962.0,28455.0,2.0,3.0,BUS,502,HR028,7.0,25534.0,2017-05-01 14:20:14,...,26197.0,2017-05-01 14:37:00,2017-05-01 14:27:12,16.766667,18.200000,18.200000,-25.520710,-49.295454,-25.476372,-49.292607


#### Writing Inferred Trips Itineraries Legs dataset to file

In [None]:
inf_trips_itineraries_legs_output_filepath = output_folderpath + os.sep + file_date_str + '_' + itinerary_part_name + '_legs_inf_trips_itins.csv'
inferred_trip_itineraries.to_csv(inf_trips_itineraries_output_filepath,index=False)

In [460]:
if len(pass_itins_legs) == 0:
            print "No matches left after matching and selecting feasible bus legs."
            print "Skipping next steps..."
            exit(0)

In [461]:
# Writing suggested itineraries dataset to file
actual_itineraries_output_filepath = output_folderpath + os.sep + file_date_str + '_' + itinerary_part_name + '_actual_itin.csv'
pass_itins_legs.to_csv(actual_itineraries_output_filepath, index=False)