In [1]:
#Libraries

#Python Libs
import sys
import os
import glob
import traceback
from datetime import datetime
import time
from geopy import distance


#Data Analysis Libs
import pandas as pd
import numpy as np

In [2]:
#Functions
def select_input_files(enh_buste_base_path,init_date,fin_date,suffix):
        selected_files = []
        all_files = glob.glob(os.path.join(enh_buste_base_path,"*"))

        for file_ in all_files:
                try:
                        file_date = pd.to_datetime(file_.split('/')[-1],format=('%Y_%m_%d' + suffix  + '.csv'))
                        if (file_date >= init_date) and (file_date <= fin_date):
                                selected_files.append((file_,file_date))
                except:
                        continue

        return sorted(selected_files)

def get_router_id(query_date):
    INTERMEDIATE_OTP_DATE = pd.to_datetime("2017-06-30", format="%Y-%m-%d")
    router_id = ''

    if (query_date <= INTERMEDIATE_OTP_DATE):
        return 'ctba-2017-1'
    else:
        return 'ctba-2017-2'

In [3]:
#def match_terminal_boardings(selected_trips,itineraries_start):
#        terminal_boarding_origins = selected_trips[(selected_trips['o_busCode'].str.isdigit()) & (selected_trips['o_route'] != '021')]
#        matched_terminal_boardings = terminal_boarding_origins.merge(itineraries_start,
#                                                                                                                                left_on=['o_boarding_id','o_stopPointId'],
#                                                                                                                                right_on=['user_trip_id','from_parent_station'], how='inner')
#        num_matched_terminal_boardings = len(matched_terminal_boardings.drop_duplicates(subset=['cardNum','o_boarding_id']))
#        if num_matched_terminal_boardings == 0:
#            matched_perc = 0.0
#        else:
#            matched_perc = 100*(num_matched_terminal_boardings/float(len(terminal_boarding_origins)))
#        return (matched_terminal_boardings,num_matched_terminal_boardings,matched_perc)

#def match_terminal_021_boardings(selected_trips,itineraries_start):
#        terminal_021_origins = selected_trips[(selected_trips['o_busCode'].str.isdigit()) & (selected_trips['o_route'] == '021')]
#        matched_021_terminal_boardings = terminal_021_origins.merge(itineraries_start, left_on=['o_boarding_id','o_route','o_stopPointId'], right_on=['user_trip_id','route','from_parent_station'], how='inner')
#        num_matched_021_terminal_boardings = len(matched_021_terminal_boardings.drop_duplicates(subset=['cardNum','o_boarding_id']))
#        if num_matched_021_terminal_boardings == 0:
#                terminal_021_match_perc = 0.0
#        else:
#                terminal_021_match_perc = 100*(num_matched_021_terminal_boardings/float(len(terminal_021_origins)))
#        return (len(terminal_021_origins),matched_021_terminal_boardings,num_matched_021_terminal_boardings,terminal_021_match_perc)


In [4]:
#def get_candidate_itineraries_penalty_score(otp_buste_itineraries_filtered):
#        otp_buste_itineraries_penalty = otp_buste_itineraries_filtered \
#                                                        .assign(penalty = lambda x: 2*x['start_diff'].dt.total_seconds() + x['trip_duration'].dt.total_seconds() + x['match_num_transfers']*10) \
#                                                        [['cardNum','user_trip_id','itinerary_id','match_num_transfers','match_vehicle_boarding','next_origin_dist','origin_dist','start_diff','trip_duration','penalty']] \
#                                                        .sort_values(['user_trip_id','penalty'], ascending=True)
#        return otp_buste_itineraries_penalty


## Main

In [5]:
otp_suggestions_filepath = '/local/tarciso/data/otp_itineraries/2017_05_01_user_trips_100_otp_itineraries.csv'
user_trips_folderpath = '/local/tarciso/data/enhanced-buste/user_trips/'
bus_trips_folderpath = '/local/tarciso/data/enhanced-buste/bus_trips/'
gtfs_base_folderpath = '/local/tarciso/data/gtfs/'
output_folderpath = '/local/tarciso/data/test-odmat/'

In [6]:
file_date_str = otp_suggestions_filepath.split('/')[-1].split('_user_trips_')[0]
file_date = pd.to_datetime(file_date_str,format='%Y_%m_%d')
print "Processing File:", otp_suggestions_filepath

Processing File: /local/tarciso/data/otp_itineraries/2017_05_01_user_trips_100_otp_itineraries.csv


In [7]:
        # Extracting itinerary part name for later use
        itinerary_part_name = otp_suggestions_filepath.split('/')[-1].split('_')[5]
        # Read OTP Suggestions
        otp_suggestions_raw = pd.read_csv(otp_suggestions_filepath, parse_dates=['date','otp_start_time','otp_end_time'])

        if len(otp_suggestions_raw) == 0:
            print "Zero OTP suggestions found."
            print "Skipping next steps..."
            exit(0)


In [8]:
def prepare_otp_data(otp_data):
        #Fixing prefix
        otp_data.columns = otp_data.columns.str.replace('otp_','')
        otp_data = otp_data.add_prefix('otp_')
        
        #Fixing Timezone difference - when needed
        otp_data['otp_start_time'] = otp_data['otp_start_time'] - pd.Timedelta('10800 s')
        otp_data['otp_end_time'] = otp_data['otp_end_time'] - pd.Timedelta('10800 s')
        
        #Adjusting route format to have 3 numbers
        otp_data['otp_route'] = otp_data['otp_route'].astype(str)
        otp_data['otp_route'] = np.where(otp_data['otp_mode'] == 'BUS',
                            otp_data['otp_route'].astype(str).str.replace("\.0",'').str.zfill(3),
                            otp_data['otp_route'])

        return otp_data

In [9]:
        # Prepare OTP data for analysis
        otp_suggestions = prepare_otp_data(otp_suggestions_raw)

        # Read stops data
        stops_filepath = gtfs_base_folderpath + os.sep + get_router_id(file_date) + os.sep + 'stops.txt'
        stops_df = pd.read_csv(stops_filepath)

        # Adding Parent Stop data to OTP Suggestions
        stops_parent_stations = stops_df[['stop_id','parent_station']]
        otp_suggestions = otp_suggestions.merge(stops_parent_stations.add_prefix('from_'),
                                                left_on='otp_from_stop_id',
                                                right_on='from_stop_id',
                                                how='left') \
                                        .merge(stops_parent_stations.add_prefix('to_'),
                                                left_on='otp_to_stop_id',
                                                right_on='to_stop_id',
                                                how='left') \
                                        .drop(['from_stop_id','to_stop_id'], axis=1) \
                                        .rename(index=str, columns={'from_parent_station':'otp_from_parent_station',
                                                                    'to_parent_station':'otp_to_parent_station'})
        
        #otp_suggestions_bus_legs = otp_suggestions[otp_suggestions['otp_mode'] == 'BUS']
        #otp_suggestions_walk_legs = otp_suggestions[otp_suggestions['otp_mode'] == 'WALK']


In [10]:
otp_suggestions.head()

Unnamed: 0,otp_date,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_start_time,otp_end_time,otp_mode,otp_route,otp_from_stop_id,otp_to_stop_id,otp_duration_mins,otp_from_parent_station,otp_to_parent_station
0,2017-05-01,28150.0,1,1,2017-05-01 13:38:33,2017-05-01 13:45:00,BUS,827.0,33788.0,30993.0,6.45,,14508.0
1,2017-05-01,28150.0,1,2,2017-05-01 13:45:00,2017-05-01 13:45:52,WALK,,,,0.866667,,
2,2017-05-01,28150.0,1,3,2017-05-01 13:51:00,2017-05-01 14:01:59,BUS,303.0,26195.0,25753.0,10.983333,14508.0,41756.0
3,2017-05-01,28150.0,1,4,2017-05-01 14:02:00,2017-05-01 14:09:32,WALK,,,,7.533333,,
4,2017-05-01,28150.0,2,1,2017-05-01 13:42:58,2017-05-01 13:57:00,BUS,822.0,33788.0,30994.0,14.033333,,14508.0


## Match Scheduled Itineraries to Observed Bus Trips

In [11]:
def compatible_dates(otp_data,ticketing_data):
        otp_date = otp_data['otp_date'].iloc[0]
        ticketing_date = pd.to_datetime(ticketing_data['o_boarding_datetime'].dt.strftime('%Y-%m-%d')[0])

        return (otp_date == ticketing_date,otp_date,ticketing_date)


### Read and Prepare Origin/Next-Origin Pairs data

In [12]:
        # Read Origin/Next-Origin Pairs for the same date
        trips_origins_filepath = user_trips_folderpath + os.sep + file_date_str + '_user_trips.csv'
        trips_on_pairs_full = pd.read_csv(trips_origins_filepath,
                                                parse_dates=['o_boarding_datetime','o_gps_datetime','next_o_boarding_datetime','next_o_gps_datetime'])
        # Checking whether OTP and ticketing dates match
        dates_compatibility, otp_date, ticketing_date = compatible_dates(otp_suggestions,trips_on_pairs_full)
        if not dates_compatibility:
                print "ERROR: OTP date", otp_date, "does not match Ticketing data", ticketing_date
                print "Skipping current day"
                exit(1)
        
        trips_on_pairs = trips_on_pairs_full.filter(['o_boarding_id','next_o_boarding_id'])
        trips_origins = trips_on_pairs_full.filter([col for col in trips_on_pairs_full.columns if col.startswith('o_')])

In [13]:
trips_origins.head()

Unnamed: 0,o_boarding_id,o_boarding_datetime,o_gps_datetime,o_route,o_busCode,o_tripNum,o_stopPointId,o_stop_lat,o_stop_lon
0,180.0,2017-05-01 04:48:42,2017-05-01 04:48:17,650,HA030,1.0,36293.0,-25.568989,-49.332253
1,40392.0,2017-05-01 15:52:39,NaT,21,08044,,41791.0,-25.431951,-49.296491
2,181.0,2017-05-01 04:49:13,2017-05-01 04:49:08,650,HA030,1.0,36294.0,-25.56686,-49.3329
3,31560.0,2017-05-01 14:16:55,NaT,0,00038,,26051.0,-25.447479,-49.263816
4,182.0,2017-05-01 04:49:19,2017-05-01 04:49:08,650,HA030,1.0,36294.0,-25.56686,-49.3329


In [14]:
trips_on_pairs.head()

Unnamed: 0,o_boarding_id,next_o_boarding_id
0,180.0,40392.0
1,40392.0,180.0
2,181.0,31560.0
3,31560.0,181.0
4,182.0,61724.0


### Filtering out non-vehicle-boarding itinerary suggestions

In [15]:
vehicle_boarding_trip_ids = trips_origins[np.logical_not(trips_origins['o_busCode'].str.isdigit())].o_boarding_id

In [16]:
otp_suggestions_vehicle = otp_suggestions[otp_suggestions['otp_user_trip_id'].isin(vehicle_boarding_trip_ids)]
otp_suggestions_bus_legs = otp_suggestions_vehicle[otp_suggestions_vehicle['otp_mode'] == 'BUS']
otp_suggestions_walk_legs = otp_suggestions_vehicle[otp_suggestions_vehicle['otp_mode'] == 'WALK']

In [17]:
len(otp_suggestions)

8016

In [18]:
len(otp_suggestions_vehicle)

4452

### Read and Prepare Bus Trips data

In [19]:
        # Find OTP Suggested Itineraries in BUSTE Data
        bus_trips_filepath = bus_trips_folderpath + os.sep + file_date_str + '_bus_trips.csv'
        bus_trips = pd.read_csv(bus_trips_filepath, dtype={'route': object},parse_dates=['gps_datetime']) \
                                        .sort_values(['route','busCode','tripNum','gps_datetime']) \
                                        .assign(route = lambda x: x['route'].astype(str).str.replace("\.0",'').str.zfill(3))  \
                                        .drop_duplicates()


In [20]:
bus_trips.head()

Unnamed: 0,route,busCode,shapeId,tripNum,stopPointId,gps_datetime,distanceTraveledShape,stop_lat,stop_lon,parent_station
29435,10,BB303,1708.0,1.0,33157.0,2017-05-01 06:28:07,537.974,-25.410517,-49.276479,
29436,10,BB303,1708.0,1.0,33159.0,2017-05-01 06:30:38,1141.061,-25.411726,-49.270902,
29437,10,BB303,1708.0,1.0,33158.0,2017-05-01 06:31:40,1624.751,-25.415285,-49.270134,
29438,10,BB303,1708.0,1.0,30150.0,2017-05-01 06:32:26,1972.077,-25.416733,-49.267863,
29439,10,BB303,1708.0,1.0,28637.0,2017-05-01 06:33:11,2378.349,-25.414184,-49.265917,


In [21]:
bus_trips_clean = bus_trips.filter(['route','busCode','tripNum','stopPointId','gps_datetime'])

In [22]:
bus_trips_clean.head()

Unnamed: 0,route,busCode,tripNum,stopPointId,gps_datetime
29435,10,BB303,1.0,33157.0,2017-05-01 06:28:07
29436,10,BB303,1.0,33159.0,2017-05-01 06:30:38
29437,10,BB303,1.0,33158.0,2017-05-01 06:31:40
29438,10,BB303,1.0,30150.0,2017-05-01 06:32:26
29439,10,BB303,1.0,28637.0,2017-05-01 06:33:11


#### Identify Possible Matches between OTP Itineraries and Bus Trips Observed Data

In [23]:
bus_trips.dtypes

route                            object
busCode                          object
shapeId                         float64
tripNum                         float64
stopPointId                     float64
gps_datetime             datetime64[ns]
distanceTraveledShape           float64
stop_lat                        float64
stop_lon                        float64
parent_station                  float64
dtype: object

In [24]:
otp_suggestions_bus_legs.dtypes

otp_date                   datetime64[ns]
otp_user_trip_id                  float64
otp_itinerary_id                    int64
otp_leg_id                          int64
otp_start_time             datetime64[ns]
otp_end_time               datetime64[ns]
otp_mode                           object
otp_route                          object
otp_from_stop_id                  float64
otp_to_stop_id                    float64
otp_duration_mins                 float64
otp_from_parent_station           float64
otp_to_parent_station             float64
dtype: object

In [15]:
# scheduled_itin_observed_od = otp_suggestions_bus_legs.merge(bus_trips.add_prefix('bt_from_'),
#                                 left_on=['otp_route','otp_from_stop_id'],
#                                 right_on=['bt_from_route','bt_from_stopPointId'],
#                                 how='inner') \
#                                 .assign(sched_obs_start_timediff = 
#                                         lambda x: np.absolute(x['bt_from_gps_datetime'] - x['otp_start_time']))
# scheduled_itin_observed_od = scheduled_itin_observed_od[scheduled_itin_observed_od['sched_obs_start_timediff'] <= pd.Timedelta(minutes=60)]

In [16]:
# scheduled_itin_observed_od = scheduled_itin_observed_od.merge(bus_trips.add_prefix('bt_to_'),
#                                 left_on=['otp_route','bt_from_busCode','bt_from_tripNum','otp_to_stop_id'],
#                                 right_on=['bt_to_route','bt_to_busCode','bt_to_tripNum','bt_to_stopPointId'],
#                                 how='inner') \
#                                 .assign(sched_obs_end_timediff = 
#                                         lambda x: np.absolute(x['bt_to_gps_datetime'] - x['otp_end_time'])) \
#                                 .sort_values(['otp_user_trip_id','otp_itinerary_id','otp_leg_id','sched_obs_start_timediff','sched_obs_end_timediff'])
# scheduled_itin_observed_od = scheduled_itin_observed_od[scheduled_itin_observed_od['sched_obs_end_timediff'] <= pd.Timedelta(minutes=60)]

In [30]:
scheduled_itin_observed_o = otp_suggestions_bus_legs.merge(bus_trips_clean.add_prefix('bt_'),
                                left_on=['otp_route','otp_from_stop_id'],
                                right_on=['bt_route','bt_stopPointId'],
                                how='inner') \
                                .drop(['bt_route','bt_stopPointId'], axis=1) \
                                .rename(index=str, columns={'bt_gps_datetime':'bt_start_time',
                                                            'bt_tripNum':'bt_trip_num',
                                                            'bt_busCode':'bt_bus_code'}) \
                                .assign(sched_obs_start_timediff = 
                                        lambda x: np.absolute(x['bt_start_time'] - x['otp_start_time']))
#scheduled_itin_observed_o = scheduled_itin_observed_o[scheduled_itin_observed_o['sched_obs_start_timediff'] <= pd.Timedelta(minutes=60)]

In [31]:
scheduled_itin_observed_od = scheduled_itin_observed_o.merge(bus_trips_clean.add_prefix('bt_'),
                                left_on=['otp_route','bt_bus_code','bt_trip_num','otp_to_stop_id'],
                                right_on=['bt_route','bt_busCode','bt_tripNum','bt_stopPointId'],
                                how='inner') \
                                .drop(['bt_route','bt_stopPointId'], axis=1) \
                                .rename(index=str, columns={'bt_gps_datetime':'bt_end_time'}) \
                                .assign(sched_obs_end_timediff = 
                                        lambda x: np.absolute(x['bt_end_time'] - x['otp_end_time'])) \
                                .sort_values(['otp_user_trip_id','otp_itinerary_id','otp_leg_id','sched_obs_start_timediff','sched_obs_end_timediff'])
#scheduled_itin_observed_od = scheduled_itin_observed_od[scheduled_itin_observed_od['sched_obs_end_timediff'] <= pd.Timedelta(minutes=60)]

In [32]:
len(scheduled_itin_observed_od)

77614

In [33]:
scheduled_itin_observed_od['bt_duration_mins'] = (scheduled_itin_observed_od['bt_end_time'] - scheduled_itin_observed_od['bt_start_time'])/pd.Timedelta(minutes=1)
scheduled_itin_observed_od = scheduled_itin_observed_od[scheduled_itin_observed_od['bt_duration_mins'] > 0]

In [34]:
len(scheduled_itin_observed_od)

68242

In [35]:
scheduled_itin_observed_od_full = pd.concat([scheduled_itin_observed_od,otp_suggestions_walk_legs], sort=False)

In [36]:
#bus_trips[(bus_trips['route'] == '827') & (bus_trips['stopPointId'] == 33788)].sort_values(['gps_datetime'])

In [38]:
scheduled_itin_observed_od_full.head() \
                            .sort_values(['otp_user_trip_id','otp_itinerary_id','otp_leg_id'])

Unnamed: 0,otp_date,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_start_time,otp_end_time,otp_mode,otp_route,otp_from_stop_id,otp_to_stop_id,...,otp_to_parent_station,bt_bus_code,bt_trip_num,bt_start_time,sched_obs_start_timediff,bt_busCode,bt_tripNum,bt_end_time,sched_obs_end_timediff,bt_duration_mins
66,2017-05-01,28150.0,1,1,2017-05-01 13:38:33,2017-05-01 13:45:00,BUS,827,33788.0,30993.0,...,14508.0,BC929,12.0,2017-05-01 13:35:18,00:03:15,BC929,12.0,2017-05-01 13:44:14,00:00:46,8.933333
72,2017-05-01,28150.0,1,1,2017-05-01 13:38:33,2017-05-01 13:45:00,BUS,827,33788.0,30993.0,...,14508.0,BC929,13.0,2017-05-01 14:22:34,00:44:01,BC929,13.0,2017-05-01 14:30:58,00:45:58,8.4
60,2017-05-01,28150.0,1,1,2017-05-01 13:38:33,2017-05-01 13:45:00,BUS,827,33788.0,30993.0,...,14508.0,BC929,11.0,2017-05-01 12:47:49,00:50:44,BC929,11.0,2017-05-01 12:53:04,00:51:56,5.25
78,2017-05-01,28150.0,1,1,2017-05-01 13:38:33,2017-05-01 13:45:00,BUS,827,33788.0,30993.0,...,14508.0,BC929,14.0,2017-05-01 15:11:15,01:32:42,BC929,14.0,2017-05-01 15:18:06,01:33:06,6.85
54,2017-05-01,28150.0,1,1,2017-05-01 13:38:33,2017-05-01 13:45:00,BUS,827,33788.0,30993.0,...,14508.0,BC929,10.0,2017-05-01 11:58:41,01:39:52,BC929,10.0,2017-05-01 12:07:05,01:37:55,8.4


In [39]:
scheduled_itin_observed_od_full_clean = scheduled_itin_observed_od_full \
                            .filter(['otp_user_trip_id','otp_itinerary_id','otp_leg_id','otp_mode','otp_route',
                                     'bt_bus_code','bt_trip_num','otp_from_stop_id','otp_start_time',
                                     'bt_start_time','sched_obs_start_timediff','otp_to_stop_id',
                                     'otp_end_time','bt_end_time','sched_obs_end_timediff','otp_duration_mins']) \
                            .sort_values(['otp_user_trip_id','otp_itinerary_id','otp_leg_id'])

In [40]:
scheduled_itin_observed_od_full_clean

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,sched_obs_start_timediff,otp_to_stop_id,otp_end_time,bt_end_time,sched_obs_end_timediff,otp_duration_mins
66,28150.0,1,1,BUS,827,BC929,12.0,33788.0,2017-05-01 13:38:33,2017-05-01 13:35:18,0 days 00:03:15,30993.0,2017-05-01 13:45:00,2017-05-01 13:44:14,0 days 00:00:46,6.450000
72,28150.0,1,1,BUS,827,BC929,13.0,33788.0,2017-05-01 13:38:33,2017-05-01 14:22:34,0 days 00:44:01,30993.0,2017-05-01 13:45:00,2017-05-01 14:30:58,0 days 00:45:58,6.450000
60,28150.0,1,1,BUS,827,BC929,11.0,33788.0,2017-05-01 13:38:33,2017-05-01 12:47:49,0 days 00:50:44,30993.0,2017-05-01 13:45:00,2017-05-01 12:53:04,0 days 00:51:56,6.450000
78,28150.0,1,1,BUS,827,BC929,14.0,33788.0,2017-05-01 13:38:33,2017-05-01 15:11:15,0 days 01:32:42,30993.0,2017-05-01 13:45:00,2017-05-01 15:18:06,0 days 01:33:06,6.450000
54,28150.0,1,1,BUS,827,BC929,10.0,33788.0,2017-05-01 13:38:33,2017-05-01 11:58:41,0 days 01:39:52,30993.0,2017-05-01 13:45:00,2017-05-01 12:07:05,0 days 01:37:55,6.450000
84,28150.0,1,1,BUS,827,BC929,15.0,33788.0,2017-05-01 13:38:33,2017-05-01 15:59:34,0 days 02:21:01,30993.0,2017-05-01 13:45:00,2017-05-01 16:06:35,0 days 02:21:35,6.450000
48,28150.0,1,1,BUS,827,BC929,9.0,33788.0,2017-05-01 13:38:33,2017-05-01 11:08:39,0 days 02:29:54,30993.0,2017-05-01 13:45:00,2017-05-01 11:19:47,0 days 02:25:13,6.450000
90,28150.0,1,1,BUS,827,BC929,16.0,33788.0,2017-05-01 13:38:33,2017-05-01 16:48:58,0 days 03:10:25,30993.0,2017-05-01 13:45:00,2017-05-01 16:55:23,0 days 03:10:23,6.450000
42,28150.0,1,1,BUS,827,BC929,8.0,33788.0,2017-05-01 13:38:33,2017-05-01 10:23:21,0 days 03:15:12,30993.0,2017-05-01 13:45:00,2017-05-01 10:33:22,0 days 03:11:38,6.450000
96,28150.0,1,1,BUS,827,BC929,17.0,33788.0,2017-05-01 13:38:33,2017-05-01 17:35:37,0 days 03:57:04,30993.0,2017-05-01 13:45:00,2017-05-01 17:41:38,0 days 03:56:38,6.450000


## Working with Vehicle Boardings

In [41]:
        # Selecting trips for whom OTP suggestions were found
        selected_trips = trips_origins[trips_origins['o_boarding_id'].isin(otp_suggestions_vehicle['otp_user_trip_id'])]
        num_selected_trips = len(selected_trips)


In [42]:
        # Matching all kinds of boarding events to valid OTP suggestions
        itins_first_bus_legs = otp_suggestions_bus_legs \
                                .groupby(['otp_user_trip_id','otp_itinerary_id']) \
                                .first() \
                                .reset_index()

In [43]:
itins_first_bus_legs.head()

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_date,otp_leg_id,otp_start_time,otp_end_time,otp_mode,otp_route,otp_from_stop_id,otp_to_stop_id,otp_duration_mins,otp_from_parent_station,otp_to_parent_station
0,28150.0,1,2017-05-01,1,2017-05-01 13:38:33,2017-05-01 13:45:00,BUS,827,33788.0,30993.0,6.45,14508.0,14508.0
1,28150.0,2,2017-05-01,1,2017-05-01 13:42:58,2017-05-01 13:57:00,BUS,822,33788.0,30994.0,14.033333,14508.0,14508.0
2,28150.0,3,2017-05-01,2,2017-05-01 14:03:39,2017-05-01 14:20:05,BUS,870,33980.0,35079.0,16.433333,,
3,28150.0,4,2017-05-01,1,2017-05-01 13:57:58,2017-05-01 14:12:00,BUS,822,33788.0,30994.0,14.033333,14508.0,14508.0
4,28150.0,5,2017-05-01,2,2017-05-01 14:10:00,2017-05-01 14:14:00,BUS,821,33973.0,30995.0,4.0,14508.0,14508.0


In [44]:
len(itins_first_bus_legs)

1140

In [45]:
itins_first_bus_legs.otp_leg_id.describe()

count    1140.000000
mean        1.529825
std         0.499329
min         1.000000
25%         1.000000
50%         2.000000
75%         2.000000
max         2.000000
Name: otp_leg_id, dtype: float64

In [46]:
itins_first_bus_legs.dtypes

otp_user_trip_id                  float64
otp_itinerary_id                    int64
otp_date                   datetime64[ns]
otp_leg_id                          int64
otp_start_time             datetime64[ns]
otp_end_time               datetime64[ns]
otp_mode                           object
otp_route                          object
otp_from_stop_id                  float64
otp_to_stop_id                    float64
otp_duration_mins                 float64
otp_from_parent_station           float64
otp_to_parent_station             float64
dtype: object

In [47]:
selected_trips.head()

Unnamed: 0,o_boarding_id,o_boarding_datetime,o_gps_datetime,o_route,o_busCode,o_tripNum,o_stopPointId,o_stop_lat,o_stop_lon
19800,28150.0,2017-05-01 13:36:25,2017-05-01 13:35:18,827,BC929,12.0,33788.0,-25.436303,-49.362221
19803,64706.0,2017-05-01 20:22:36,2017-05-01 19:59:34,20,BB607,7.0,28132.0,-25.435878,-49.306888
19804,28156.0,2017-05-01 13:36:28,2017-05-01 13:36:21,650,HR410,5.0,36299.0,-25.565199,-49.333825
19806,28154.0,2017-05-01 13:36:28,2017-05-01 13:35:38,684,HA298,11.0,39378.0,-25.58523,-49.33721
19809,57607.0,2017-05-01 18:46:16,2017-05-01 18:46:10,548,GA158,12.0,31582.0,-25.55636,-49.24951


In [48]:
itins_first_bus_legs.head()

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_date,otp_leg_id,otp_start_time,otp_end_time,otp_mode,otp_route,otp_from_stop_id,otp_to_stop_id,otp_duration_mins,otp_from_parent_station,otp_to_parent_station
0,28150.0,1,2017-05-01,1,2017-05-01 13:38:33,2017-05-01 13:45:00,BUS,827,33788.0,30993.0,6.45,14508.0,14508.0
1,28150.0,2,2017-05-01,1,2017-05-01 13:42:58,2017-05-01 13:57:00,BUS,822,33788.0,30994.0,14.033333,14508.0,14508.0
2,28150.0,3,2017-05-01,2,2017-05-01 14:03:39,2017-05-01 14:20:05,BUS,870,33980.0,35079.0,16.433333,,
3,28150.0,4,2017-05-01,1,2017-05-01 13:57:58,2017-05-01 14:12:00,BUS,822,33788.0,30994.0,14.033333,14508.0,14508.0
4,28150.0,5,2017-05-01,2,2017-05-01 14:10:00,2017-05-01 14:14:00,BUS,821,33973.0,30995.0,4.0,14508.0,14508.0


In [49]:
def match_vehicle_boardings(selected_trips,itineraries_st):
        vehicle_boarding_origins = selected_trips[np.logical_not(selected_trips['o_busCode'].str.isdigit())]
        matched_vehicle_boardings = vehicle_boarding_origins.merge(itineraries_st, left_on=['o_boarding_id','o_route','o_stopPointId'], 
                                                                   right_on=['otp_user_trip_id','otp_route','otp_from_stop_id'], how='inner')
        num_matched_vehicle_boardings = len(matched_vehicle_boardings.drop_duplicates(subset=['o_boarding_id']))
        if num_matched_vehicle_boardings == 0:
            match_perc = 0.0
        else:
            match_perc = 100*(num_matched_vehicle_boardings/float(len(vehicle_boarding_origins)))
        return (matched_vehicle_boardings,num_matched_vehicle_boardings,match_perc)

In [50]:
        # Matching vehicle boarding origins
        matched_vehicle_boardings,num_matched_vehicle_boardings,vehicle_match_perc = match_vehicle_boardings(selected_trips,itins_first_bus_legs)
        print "Vehicle boardings with matching OTP suggestions: ", num_matched_vehicle_boardings, "(",vehicle_match_perc, "%)"

        total_num_matches = num_matched_vehicle_boardings
        print "Total number of matches: ", total_num_matches, "(", 100*(total_num_matches/float(num_selected_trips)), "%)"

        if total_num_matches == 0:
            print "No match was found. Skipping next steps..."
            exit(0)


Vehicle boardings with matching OTP suggestions:  92 ( 80.701754386 %)
Total number of matches:  92 ( 80.701754386 %)


In [51]:
matched_vehicle_boardings.head()

Unnamed: 0,o_boarding_id,o_boarding_datetime,o_gps_datetime,o_route,o_busCode,o_tripNum,o_stopPointId,o_stop_lat,o_stop_lon,otp_user_trip_id,...,otp_leg_id,otp_start_time,otp_end_time,otp_mode,otp_route,otp_from_stop_id,otp_to_stop_id,otp_duration_mins,otp_from_parent_station,otp_to_parent_station
0,28150.0,2017-05-01 13:36:25,2017-05-01 13:35:18,827,BC929,12.0,33788.0,-25.436303,-49.362221,28150.0,...,1,2017-05-01 13:38:33,2017-05-01 13:45:00,BUS,827,33788.0,30993.0,6.45,14508.0,14508.0
1,28150.0,2017-05-01 13:36:25,2017-05-01 13:35:18,827,BC929,12.0,33788.0,-25.436303,-49.362221,28150.0,...,1,2017-05-01 14:28:17,2017-05-01 14:35:00,BUS,827,33788.0,30993.0,6.716667,14508.0,14508.0
2,28154.0,2017-05-01 13:36:28,2017-05-01 13:35:38,684,HA298,11.0,39378.0,-25.58523,-49.33721,28154.0,...,1,2017-05-01 13:42:13,2017-05-01 14:18:00,BUS,684,39378.0,31053.0,35.783333,14494.0,14494.0
3,28154.0,2017-05-01 13:36:28,2017-05-01 13:35:38,684,HA298,11.0,39378.0,-25.58523,-49.33721,28154.0,...,1,2017-05-01 13:57:13,2017-05-01 14:33:00,BUS,684,39378.0,31053.0,35.783333,14494.0,14494.0
4,28154.0,2017-05-01 13:36:28,2017-05-01 13:35:38,684,HA298,11.0,39378.0,-25.58523,-49.33721,28154.0,...,1,2017-05-01 14:12:13,2017-05-01 14:48:00,BUS,684,39378.0,31053.0,35.783333,14494.0,14494.0


In [52]:
vehic_first_boardings_options = matched_vehicle_boardings.merge(bus_trips_clean, 
                                left_on=['o_route','o_busCode','o_tripNum','otp_to_stop_id'],
                               right_on=['route','busCode','tripNum','stopPointId'],
                               how='inner') \
            #.filter(['otp_user_trip_id','otp_itinerary_id','route','busCode','tripNum','stopPointId','otp_start_time','o_gps_datetime','otp_end_time','gps_datetime'])
            #.filter(selected_trips.columns)
            

In [53]:
vehic_first_boardings_options

Unnamed: 0,o_boarding_id,o_boarding_datetime,o_gps_datetime,o_route,o_busCode,o_tripNum,o_stopPointId,o_stop_lat,o_stop_lon,otp_user_trip_id,...,otp_from_stop_id,otp_to_stop_id,otp_duration_mins,otp_from_parent_station,otp_to_parent_station,route,busCode,tripNum,stopPointId,gps_datetime
0,28150.0,2017-05-01 13:36:25,2017-05-01 13:35:18,827,BC929,12.0,33788.0,-25.436303,-49.362221,28150.0,...,33788.0,30993.0,6.450000,14508.0,14508.0,827,BC929,12.0,30993.0,2017-05-01 13:44:14
1,28150.0,2017-05-01 13:36:25,2017-05-01 13:35:18,827,BC929,12.0,33788.0,-25.436303,-49.362221,28150.0,...,33788.0,30993.0,6.716667,14508.0,14508.0,827,BC929,12.0,30993.0,2017-05-01 13:44:14
2,28262.0,2017-05-01 13:37:59,2017-05-01 13:37:57,827,BC929,12.0,33775.0,-25.438158,-49.358207,28262.0,...,33775.0,30993.0,4.716667,14508.0,14508.0,827,BC929,12.0,30993.0,2017-05-01 13:44:14
3,28262.0,2017-05-01 13:37:59,2017-05-01 13:37:57,827,BC929,12.0,33775.0,-25.438158,-49.358207,28262.0,...,33775.0,30993.0,4.916667,14508.0,14508.0,827,BC929,12.0,30993.0,2017-05-01 13:44:14
4,28424.0,2017-05-01 13:39:51,2017-05-01 13:39:12,827,BC929,12.0,33535.0,-25.440179,-49.355608,28424.0,...,33535.0,30993.0,3.200000,14508.0,14508.0,827,BC929,12.0,30993.0,2017-05-01 13:44:14
5,28424.0,2017-05-01 13:39:51,2017-05-01 13:39:12,827,BC929,12.0,33535.0,-25.440179,-49.355608,28424.0,...,33535.0,30993.0,3.333333,14508.0,14508.0,827,BC929,12.0,30993.0,2017-05-01 13:44:14
6,28154.0,2017-05-01 13:36:28,2017-05-01 13:35:38,684,HA298,11.0,39378.0,-25.585230,-49.337210,28154.0,...,39378.0,31053.0,35.783333,14494.0,14494.0,684,HA298,11.0,31053.0,2017-05-01 14:01:41
7,28154.0,2017-05-01 13:36:28,2017-05-01 13:35:38,684,HA298,11.0,39378.0,-25.585230,-49.337210,28154.0,...,39378.0,31053.0,35.783333,14494.0,14494.0,684,HA298,11.0,31053.0,2017-05-01 14:01:41
8,28154.0,2017-05-01 13:36:28,2017-05-01 13:35:38,684,HA298,11.0,39378.0,-25.585230,-49.337210,28154.0,...,39378.0,31053.0,35.783333,14494.0,14494.0,684,HA298,11.0,31053.0,2017-05-01 14:01:41
9,28154.0,2017-05-01 13:36:28,2017-05-01 13:35:38,684,HA298,11.0,39378.0,-25.585230,-49.337210,28154.0,...,39378.0,31053.0,35.783333,14494.0,14494.0,684,HA298,11.0,31053.0,2017-05-01 14:01:41


### Removing Scheduled trips whose scheduled start time is more than 30 minutes away from the actual boarding time

In [56]:
#vehic_first_boardings_options['sched_actual_start_timediff'] = np.abs(vehic_first_boardings_options['otp_start_time']-vehic_first_boardings_options['o_gps_datetime'])

In [57]:
# vehic_first_boardings_options \
#     .sort_values(['otp_user_trip_id','otp_itinerary_id','otp_leg_id','otp_start_time']) \
#     .filter(['otp_user_trip_id','otp_itinerary_id','otp_leg_id','route','busCode','tripNum','stopPointId','otp_start_time','o_gps_datetime','otp_end_time','gps_datetime','sched_actual_start_timediff'])

In [58]:
#vehic_first_boardings_options = vehic_first_boardings_options[vehic_first_boardings_options['sched_actual_start_timediff'] < pd.Timedelta(minutes=30)]

In [59]:
# vehic_first_boardings_options  \
#     .sort_values(['otp_user_trip_id','otp_itinerary_id','otp_leg_id','otp_start_time']) \
#     .filter(['otp_user_trip_id','otp_itinerary_id','otp_leg_id','route','busCode','tripNum','stopPointId','otp_start_time','o_gps_datetime','otp_end_time','gps_datetime','sched_actual_start_timediff'])

In [60]:
otp_suggestions_vehicle.head()

Unnamed: 0,otp_date,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_start_time,otp_end_time,otp_mode,otp_route,otp_from_stop_id,otp_to_stop_id,otp_duration_mins,otp_from_parent_station,otp_to_parent_station
0,2017-05-01,28150.0,1,1,2017-05-01 13:38:33,2017-05-01 13:45:00,BUS,827.0,33788.0,30993.0,6.45,,14508.0
1,2017-05-01,28150.0,1,2,2017-05-01 13:45:00,2017-05-01 13:45:52,WALK,,,,0.866667,,
2,2017-05-01,28150.0,1,3,2017-05-01 13:51:00,2017-05-01 14:01:59,BUS,303.0,26195.0,25753.0,10.983333,14508.0,41756.0
3,2017-05-01,28150.0,1,4,2017-05-01 14:02:00,2017-05-01 14:09:32,WALK,,,,7.533333,,
4,2017-05-01,28150.0,2,1,2017-05-01 13:42:58,2017-05-01 13:57:00,BUS,822.0,33788.0,30994.0,14.033333,,14508.0


In [62]:
scheduled_itin_observed_od_full_clean.head()

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,sched_obs_start_timediff,otp_to_stop_id,otp_end_time,bt_end_time,sched_obs_end_timediff,otp_duration_mins
66,28150.0,1,1,BUS,827,BC929,12.0,33788.0,2017-05-01 13:38:33,2017-05-01 13:35:18,00:03:15,30993.0,2017-05-01 13:45:00,2017-05-01 13:44:14,00:00:46,6.45
72,28150.0,1,1,BUS,827,BC929,13.0,33788.0,2017-05-01 13:38:33,2017-05-01 14:22:34,00:44:01,30993.0,2017-05-01 13:45:00,2017-05-01 14:30:58,00:45:58,6.45
60,28150.0,1,1,BUS,827,BC929,11.0,33788.0,2017-05-01 13:38:33,2017-05-01 12:47:49,00:50:44,30993.0,2017-05-01 13:45:00,2017-05-01 12:53:04,00:51:56,6.45
78,28150.0,1,1,BUS,827,BC929,14.0,33788.0,2017-05-01 13:38:33,2017-05-01 15:11:15,01:32:42,30993.0,2017-05-01 13:45:00,2017-05-01 15:18:06,01:33:06,6.45
54,28150.0,1,1,BUS,827,BC929,10.0,33788.0,2017-05-01 13:38:33,2017-05-01 11:58:41,01:39:52,30993.0,2017-05-01 13:45:00,2017-05-01 12:07:05,01:37:55,6.45


## Retain only OTP-BusTrips itineraries which figure in first-vehicle-boarding matched itineraries

In [63]:
matched_vehicle_boardings_itins = vehic_first_boardings_options.filter(['otp_user_trip_id','otp_itinerary_id'])

In [64]:
matched_vehicle_boardings_itins.head()

Unnamed: 0,otp_user_trip_id,otp_itinerary_id
0,28150.0,1
1,28150.0,8
2,28262.0,1
3,28262.0,9
4,28424.0,1


In [68]:
scheduled_itin_observed_od_full_clean.head()

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,sched_obs_start_timediff,otp_to_stop_id,otp_end_time,bt_end_time,sched_obs_end_timediff,otp_duration_mins
66,28150.0,1,1,BUS,827,BC929,12.0,33788.0,2017-05-01 13:38:33,2017-05-01 13:35:18,00:03:15,30993.0,2017-05-01 13:45:00,2017-05-01 13:44:14,00:00:46,6.45
72,28150.0,1,1,BUS,827,BC929,13.0,33788.0,2017-05-01 13:38:33,2017-05-01 14:22:34,00:44:01,30993.0,2017-05-01 13:45:00,2017-05-01 14:30:58,00:45:58,6.45
60,28150.0,1,1,BUS,827,BC929,11.0,33788.0,2017-05-01 13:38:33,2017-05-01 12:47:49,00:50:44,30993.0,2017-05-01 13:45:00,2017-05-01 12:53:04,00:51:56,6.45
78,28150.0,1,1,BUS,827,BC929,14.0,33788.0,2017-05-01 13:38:33,2017-05-01 15:11:15,01:32:42,30993.0,2017-05-01 13:45:00,2017-05-01 15:18:06,01:33:06,6.45
54,28150.0,1,1,BUS,827,BC929,10.0,33788.0,2017-05-01 13:38:33,2017-05-01 11:58:41,01:39:52,30993.0,2017-05-01 13:45:00,2017-05-01 12:07:05,01:37:55,6.45


In [69]:
len(scheduled_itin_observed_od_full_clean)

70661

In [70]:
scheduled_itin_observed_od_full_clean.columns

Index([u'otp_user_trip_id', u'otp_itinerary_id', u'otp_leg_id', u'otp_mode',
       u'otp_route', u'bt_bus_code', u'bt_trip_num', u'otp_from_stop_id',
       u'otp_start_time', u'bt_start_time', u'sched_obs_start_timediff',
       u'otp_to_stop_id', u'otp_end_time', u'bt_end_time',
       u'sched_obs_end_timediff', u'otp_duration_mins'],
      dtype='object')

In [71]:
matched_vehicle_boardings_itins.columns

Index([u'otp_user_trip_id', u'otp_itinerary_id'], dtype='object')

In [72]:
vehicle_boardings_obs_sch_itin_legs = scheduled_itin_observed_od_full_clean.merge(matched_vehicle_boardings_itins,how='inner') \
                                        .sort_values(['otp_user_trip_id','otp_itinerary_id','otp_leg_id'])

In [73]:
len(vehicle_boardings_obs_sch_itin_legs)

22445

In [74]:
vehicle_boardings_obs_sch_itin_legs.head()

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,sched_obs_start_timediff,otp_to_stop_id,otp_end_time,bt_end_time,sched_obs_end_timediff,otp_duration_mins
0,28150.0,1,1,BUS,827,BC929,12.0,33788.0,2017-05-01 13:38:33,2017-05-01 13:35:18,00:03:15,30993.0,2017-05-01 13:45:00,2017-05-01 13:44:14,00:00:46,6.45
1,28150.0,1,1,BUS,827,BC929,13.0,33788.0,2017-05-01 13:38:33,2017-05-01 14:22:34,00:44:01,30993.0,2017-05-01 13:45:00,2017-05-01 14:30:58,00:45:58,6.45
2,28150.0,1,1,BUS,827,BC929,11.0,33788.0,2017-05-01 13:38:33,2017-05-01 12:47:49,00:50:44,30993.0,2017-05-01 13:45:00,2017-05-01 12:53:04,00:51:56,6.45
3,28150.0,1,1,BUS,827,BC929,14.0,33788.0,2017-05-01 13:38:33,2017-05-01 15:11:15,01:32:42,30993.0,2017-05-01 13:45:00,2017-05-01 15:18:06,01:33:06,6.45
4,28150.0,1,1,BUS,827,BC929,10.0,33788.0,2017-05-01 13:38:33,2017-05-01 11:58:41,01:39:52,30993.0,2017-05-01 13:45:00,2017-05-01 12:07:05,01:37:55,6.45


### Concatenating first boarding legs to other itinerary legs

#### Preparing First Boarding legs data

In [75]:
vehic_first_boardings_options.columns

Index([u'o_boarding_id', u'o_boarding_datetime', u'o_gps_datetime', u'o_route',
       u'o_busCode', u'o_tripNum', u'o_stopPointId', u'o_stop_lat',
       u'o_stop_lon', u'otp_user_trip_id', u'otp_itinerary_id', u'otp_date',
       u'otp_leg_id', u'otp_start_time', u'otp_end_time', u'otp_mode',
       u'otp_route', u'otp_from_stop_id', u'otp_to_stop_id',
       u'otp_duration_mins', u'otp_from_parent_station',
       u'otp_to_parent_station', u'route', u'busCode', u'tripNum',
       u'stopPointId', u'gps_datetime', u'sched_actual_start_timediff'],
      dtype='object')

In [84]:
vehic_first_boardings_options_clean = vehic_first_boardings_options \
    .filter(['otp_user_trip_id','otp_itinerary_id','otp_leg_id','otp_mode','otp_route','o_busCode',
             'o_tripNum','otp_from_stop_id','otp_start_time','o_boarding_datetime','otp_to_stop_id',
             'otp_end_time','gps_datetime','otp_duration_mins']) \
    .rename(index=str, columns={'o_busCode':'bt_bus_code','o_tripNum':'bt_trip_num',
                                'o_boarding_datetime':'bt_start_time','gps_datetime':'bt_end_time'}) \
    .assign(bt_duration_mins = lambda x: (x.bt_end_time - x.bt_start_time)/pd.Timedelta(minutes=1))

In [85]:
vehic_first_boardings_options_clean.head()

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,otp_to_stop_id,otp_end_time,bt_end_time,otp_duration_mins,bt_duration_mins
0,28150.0,1,1,BUS,827,BC929,12.0,33788.0,2017-05-01 13:38:33,2017-05-01 13:36:25,30993.0,2017-05-01 13:45:00,2017-05-01 13:44:14,6.45,7.816667
1,28150.0,8,1,BUS,827,BC929,12.0,33788.0,2017-05-01 14:28:17,2017-05-01 13:36:25,30993.0,2017-05-01 14:35:00,2017-05-01 13:44:14,6.716667,7.816667
2,28262.0,1,1,BUS,827,BC929,12.0,33775.0,2017-05-01 13:40:17,2017-05-01 13:37:59,30993.0,2017-05-01 13:45:00,2017-05-01 13:44:14,4.716667,6.25
3,28262.0,9,1,BUS,827,BC929,12.0,33775.0,2017-05-01 14:30:05,2017-05-01 13:37:59,30993.0,2017-05-01 14:35:00,2017-05-01 13:44:14,4.916667,6.25
4,28424.0,1,1,BUS,827,BC929,12.0,33535.0,2017-05-01 13:41:48,2017-05-01 13:39:51,30993.0,2017-05-01 13:45:00,2017-05-01 13:44:14,3.2,4.383333


In [86]:
len(vehic_first_boardings_options_clean)

316

#### Preparing OTP itinerary legs data

In [87]:
vehicle_boardings_obs_sch_itin_legs.columns.values

array(['otp_user_trip_id', 'otp_itinerary_id', 'otp_leg_id', 'otp_mode',
       'otp_route', 'bt_bus_code', 'bt_trip_num', 'otp_from_stop_id',
       'otp_start_time', 'bt_start_time', 'sched_obs_start_timediff',
       'otp_to_stop_id', 'otp_end_time', 'bt_end_time',
       'sched_obs_end_timediff', 'otp_duration_mins'], dtype=object)

In [88]:
vehicle_boardings_obs_sch_itin_legs_clean = vehicle_boardings_obs_sch_itin_legs \
    .filter(['otp_user_trip_id','otp_itinerary_id','otp_leg_id','otp_mode','otp_route','bt_bus_code',
             'bt_trip_num','otp_from_stop_id','otp_start_time','bt_start_time','otp_to_stop_id',
             'otp_end_time','bt_end_time','otp_duration_mins'])\
    .assign(bt_duration_mins = lambda x: (x.bt_end_time - x.bt_start_time)/pd.Timedelta(minutes=1))

In [89]:
vehicle_boardings_obs_sch_itin_legs_clean.head()

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,otp_to_stop_id,otp_end_time,bt_end_time,otp_duration_mins,bt_duration_mins
0,28150.0,1,1,BUS,827,BC929,12.0,33788.0,2017-05-01 13:38:33,2017-05-01 13:35:18,30993.0,2017-05-01 13:45:00,2017-05-01 13:44:14,6.45,8.933333
1,28150.0,1,1,BUS,827,BC929,13.0,33788.0,2017-05-01 13:38:33,2017-05-01 14:22:34,30993.0,2017-05-01 13:45:00,2017-05-01 14:30:58,6.45,8.4
2,28150.0,1,1,BUS,827,BC929,11.0,33788.0,2017-05-01 13:38:33,2017-05-01 12:47:49,30993.0,2017-05-01 13:45:00,2017-05-01 12:53:04,6.45,5.25
3,28150.0,1,1,BUS,827,BC929,14.0,33788.0,2017-05-01 13:38:33,2017-05-01 15:11:15,30993.0,2017-05-01 13:45:00,2017-05-01 15:18:06,6.45,6.85
4,28150.0,1,1,BUS,827,BC929,10.0,33788.0,2017-05-01 13:38:33,2017-05-01 11:58:41,30993.0,2017-05-01 13:45:00,2017-05-01 12:07:05,6.45,8.4


In [90]:
len(vehicle_boardings_obs_sch_itin_legs_clean)

22445

#### Concatenating Legs

In [91]:
len(vehic_first_boardings_options_clean)

316

In [92]:
vehic_first_boardings_options_clean_keys = vehic_first_boardings_options_clean.filter(['otp_user_trip_id','otp_itinerary_id','otp_leg_id']).drop_duplicates()

In [93]:
len(vehic_first_boardings_options_clean_keys)

309

In [96]:
vehic_first_boardings_options_clean_keys.head()

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id
0,28150.0,1,1
1,28150.0,8,1
2,28262.0,1,1
3,28262.0,9,1
4,28424.0,1,1


In [97]:
vehicle_legs_merged = vehicle_boardings_obs_sch_itin_legs_clean.merge(vehic_first_boardings_options_clean_keys, how='outer', indicator=True)
vehicle_legs_rest_clean = vehicle_legs_merged[vehicle_legs_merged['_merge'] == 'left_only'].drop('_merge', axis=1)

In [98]:
len(vehicle_legs_rest_clean)

10358

In [99]:
vehicle_legs_rest_clean

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,otp_to_stop_id,otp_end_time,bt_end_time,otp_duration_mins,bt_duration_mins
24,28150.0,1,2,WALK,,,,,2017-05-01 13:45:00,NaT,,2017-05-01 13:45:52,NaT,0.866667,
25,28150.0,1,3,BUS,303,DE722,4.0,26195.0,2017-05-01 13:51:00,2017-05-01 14:02:48,25753.0,2017-05-01 14:01:59,2017-05-01 14:12:10,10.983333,9.366667
26,28150.0,1,3,BUS,303,DE710,4.0,26195.0,2017-05-01 13:51:00,2017-05-01 13:27:30,25753.0,2017-05-01 14:01:59,2017-05-01 13:35:59,10.983333,8.483333
27,28150.0,1,3,BUS,303,DE713,4.0,26195.0,2017-05-01 13:51:00,2017-05-01 13:16:34,25753.0,2017-05-01 14:01:59,2017-05-01 13:24:23,10.983333,7.816667
28,28150.0,1,3,BUS,303,DE708,5.0,26195.0,2017-05-01 13:51:00,2017-05-01 14:29:24,25753.0,2017-05-01 14:01:59,2017-05-01 14:37:26,10.983333,8.033333
29,28150.0,1,3,BUS,303,DE719,4.0,26195.0,2017-05-01 13:51:00,2017-05-01 13:07:22,25753.0,2017-05-01 14:01:59,2017-05-01 13:15:06,10.983333,7.733333
30,28150.0,1,3,BUS,303,LE702,5.0,26195.0,2017-05-01 13:51:00,2017-05-01 14:41:47,25753.0,2017-05-01 14:01:59,2017-05-01 14:50:32,10.983333,8.750000
31,28150.0,1,3,BUS,303,LE702,4.0,26195.0,2017-05-01 13:51:00,2017-05-01 12:40:58,25753.0,2017-05-01 14:01:59,2017-05-01 12:50:10,10.983333,9.200000
32,28150.0,1,3,BUS,303,DE719,5.0,26195.0,2017-05-01 13:51:00,2017-05-01 15:04:59,25753.0,2017-05-01 14:01:59,2017-05-01 15:13:00,10.983333,8.016667
33,28150.0,1,3,BUS,303,DE708,4.0,26195.0,2017-05-01 13:51:00,2017-05-01 12:28:23,25753.0,2017-05-01 14:01:59,2017-05-01 12:36:19,10.983333,7.933333


In [100]:
all_vehicle_legs_options = pd.concat([vehic_first_boardings_options_clean,vehicle_legs_rest_clean]) \
    .sort_values(['otp_user_trip_id','otp_itinerary_id','otp_leg_id','bt_start_time'])

In [101]:
len(all_vehicle_legs_options)

10674

In [102]:
all_vehicle_legs_options

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,otp_to_stop_id,otp_end_time,bt_end_time,otp_duration_mins,bt_duration_mins
0,28150.0,1,1,BUS,827,BC929,12.0,33788.0,2017-05-01 13:38:33,2017-05-01 13:36:25,30993.0,2017-05-01 13:45:00,2017-05-01 13:44:14,6.450000,7.816667
24,28150.0,1,2,WALK,,,,,2017-05-01 13:45:00,NaT,,2017-05-01 13:45:52,NaT,0.866667,
70,28150.0,1,3,BUS,303,DE708,1.0,26195.0,2017-05-01 13:51:00,2017-05-01 06:29:10,25753.0,2017-05-01 14:01:59,2017-05-01 06:36:54,10.983333,7.733333
69,28150.0,1,3,BUS,303,LE702,1.0,26195.0,2017-05-01 13:51:00,2017-05-01 06:37:11,25753.0,2017-05-01 14:01:59,2017-05-01 06:47:25,10.983333,10.233333
67,28150.0,1,3,BUS,303,DE719,1.0,26195.0,2017-05-01 13:51:00,2017-05-01 07:01:26,25753.0,2017-05-01 14:01:59,2017-05-01 07:13:11,10.983333,11.750000
64,28150.0,1,3,BUS,303,DE713,1.0,26195.0,2017-05-01 13:51:00,2017-05-01 07:15:53,25753.0,2017-05-01 14:01:59,2017-05-01 07:24:14,10.983333,8.350000
63,28150.0,1,3,BUS,303,DE710,1.0,26195.0,2017-05-01 13:51:00,2017-05-01 07:24:41,25753.0,2017-05-01 14:01:59,2017-05-01 07:36:59,10.983333,12.300000
60,28150.0,1,3,BUS,303,DE722,1.0,26195.0,2017-05-01 13:51:00,2017-05-01 08:04:38,25753.0,2017-05-01 14:01:59,2017-05-01 08:12:58,10.983333,8.333333
57,28150.0,1,3,BUS,303,DE708,2.0,26195.0,2017-05-01 13:51:00,2017-05-01 08:27:29,25753.0,2017-05-01 14:01:59,2017-05-01 08:37:09,10.983333,9.666667
55,28150.0,1,3,BUS,303,LE702,2.0,26195.0,2017-05-01 13:51:00,2017-05-01 08:38:35,25753.0,2017-05-01 14:01:59,2017-05-01 08:48:59,10.983333,10.400000


### Filtering out itineraries which lost bus legs along the processing

In [103]:
original_suggested_itins_num_legs = otp_suggestions.groupby(['otp_user_trip_id','otp_itinerary_id']) \
                                    .agg({'otp_leg_id': lambda x: len(x)}) \
                                    .reset_index() \
                                    .rename(index=str, columns={'otp_leg_id':'num_legs'})

In [104]:
original_suggested_itins_num_legs.head()

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,num_legs
0,28150.0,1,4
1,28150.0,2,4
2,28150.0,3,3
3,28150.0,4,4
4,28150.0,5,5


In [105]:
curr_matched_itins_num_legs = all_vehicle_legs_options.groupby(['otp_user_trip_id','otp_itinerary_id']) \
                                    .agg({'otp_leg_id': lambda x: len(np.unique(x))}) \
                                    .reset_index() \
                                    .rename(index=str, columns={'otp_leg_id':'num_legs'})

In [107]:
curr_matched_itins_num_legs.head()

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,num_legs
0,28150.0,1,4
1,28150.0,8,4
2,28154.0,1,4
3,28154.0,2,4
4,28154.0,6,4


In [108]:
complete_matched_itins = original_suggested_itins_num_legs.merge(curr_matched_itins_num_legs, how='inner')

In [109]:
complete_matched_itins.head()

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,num_legs
0,28150.0,1,4
1,28150.0,8,4
2,28154.0,1,4
3,28154.0,2,4
4,28154.0,6,4


In [110]:
all_complete_vehicle_legs_options = all_vehicle_legs_options.merge(complete_matched_itins.drop('num_legs', axis=1), how='inner')

In [111]:
len(all_vehicle_legs_options)

10674

In [113]:
len(all_complete_vehicle_legs_options)

9339

### Choose best actual leg matches (based on feasibility and start time)

In [118]:
sample_itinerary_options = all_vehicle_legs_options.iloc[0:30,]

In [120]:
sample_itinerary_options

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,otp_to_stop_id,otp_end_time,bt_end_time,otp_duration_mins,bt_duration_mins
0,28150.0,1,1,BUS,827.0,BC929,12.0,33788.0,2017-05-01 13:38:33,2017-05-01 13:36:25,30993.0,2017-05-01 13:45:00,2017-05-01 13:44:14,6.45,7.816667
24,28150.0,1,2,WALK,,,,,2017-05-01 13:45:00,NaT,,2017-05-01 13:45:52,NaT,0.866667,
70,28150.0,1,3,BUS,303.0,DE708,1.0,26195.0,2017-05-01 13:51:00,2017-05-01 06:29:10,25753.0,2017-05-01 14:01:59,2017-05-01 06:36:54,10.983333,7.733333
69,28150.0,1,3,BUS,303.0,LE702,1.0,26195.0,2017-05-01 13:51:00,2017-05-01 06:37:11,25753.0,2017-05-01 14:01:59,2017-05-01 06:47:25,10.983333,10.233333
67,28150.0,1,3,BUS,303.0,DE719,1.0,26195.0,2017-05-01 13:51:00,2017-05-01 07:01:26,25753.0,2017-05-01 14:01:59,2017-05-01 07:13:11,10.983333,11.75
64,28150.0,1,3,BUS,303.0,DE713,1.0,26195.0,2017-05-01 13:51:00,2017-05-01 07:15:53,25753.0,2017-05-01 14:01:59,2017-05-01 07:24:14,10.983333,8.35
63,28150.0,1,3,BUS,303.0,DE710,1.0,26195.0,2017-05-01 13:51:00,2017-05-01 07:24:41,25753.0,2017-05-01 14:01:59,2017-05-01 07:36:59,10.983333,12.3
60,28150.0,1,3,BUS,303.0,DE722,1.0,26195.0,2017-05-01 13:51:00,2017-05-01 08:04:38,25753.0,2017-05-01 14:01:59,2017-05-01 08:12:58,10.983333,8.333333
57,28150.0,1,3,BUS,303.0,DE708,2.0,26195.0,2017-05-01 13:51:00,2017-05-01 08:27:29,25753.0,2017-05-01 14:01:59,2017-05-01 08:37:09,10.983333,9.666667
55,28150.0,1,3,BUS,303.0,LE702,2.0,26195.0,2017-05-01 13:51:00,2017-05-01 08:38:35,25753.0,2017-05-01 14:01:59,2017-05-01 08:48:59,10.983333,10.4


In [121]:
# def choose_leg_matches(leg_matches_groups):
#         colnames = leg_matches_groups.obj.columns.values
#         chosen_leg_matches = pd.DataFrame(columns = colnames)
#         prev_group_id = ()
#         prev_itin_id = -1
#         prev_leg_end_time = pd.NaT
#         num_groups_not_survived = 0

#         for name, group in leg_matches_groups:
            
#                 #print
#                 #print "Name:", name
#                 #print "Group:"
#                 #print group
#                 #print
                
#                 if (prev_itin_id == -1):
#                         prev_itin_id = group['otp_itinerary_id'].iloc[0]
#                         prev_leg_end_time = group['bt_start_time'].dt.floor('d').iloc[0]
#                 elif (prev_itin_id != group['otp_itinerary_id'].iloc[0]):
#                         prev_leg_end_time = group['bt_start_time'].dt.floor('d').iloc[0]
                    

#                 #if (prev_group_id == ()):
#                 #        prev_leg_end_time = group['bt_start_time'].dt.floor('d')[0]

#                 #print
#                 #print "Previous leg end time:", prev_leg_end_time
#                 #print
#                 #print "Original Group"
#                 #print group.filter(['otp_start_time','bt_start_time','bt_end_time'])
                
#                 if (group['otp_mode'].iloc[0] == 'WALK'):
#                     #print "Walking duration:", filtered_group['otp_duration_mins']
#                     filtered_group = group.reset_index()
#                     filtered_group.loc[0,'bt_start_time'] = prev_leg_end_time
#                     filtered_group.loc[0,'bt_end_time'] = prev_leg_end_time + \
#                         pd.Timedelta(minutes=np.rint(filtered_group['otp_duration_mins'].iloc[0]))
                    
#                     #print "Filtered Group"
#                     #print filtered_group
#                 else:
#                     filtered_group = group[group['bt_start_time'] > prev_leg_end_time]
                
#                 #print
#                 #print "Filtered Group"
#                 #print filtered_group.filter(['otp_start_time','bt_start_time','bt_end_time'])

#                 if (len(filtered_group) == 0):
#                         #print "Group did not survive! =("
#                         num_groups_not_survived += 1
#                         continue

#                 chosen_leg_match = filtered_group.sort_values('bt_start_time').iloc[0]
#                 #print "Chosen Leg"
#                 #print chosen_leg_match

#                 chosen_leg_matches = chosen_leg_matches.append(chosen_leg_match)

#                 #Update variables
#                 #prev_group_id = name
#                 prev_itin_id = group['otp_itinerary_id'].iloc[0]
#                 prev_leg_end_time = chosen_leg_match['bt_end_time']

#         #print num_groups_not_survived
#         return chosen_leg_matches.filter(colnames)


In [122]:
def is_new_itinerary(prev_trip_id,curr_trip_id,prev_itin_id,curr_itin_id):
    return ((prev_trip_id != curr_trip_id) | (prev_itin_id != curr_itin_id))    

def choose_leg_matches(leg_matches_groups):
        colnames = leg_matches_groups.obj.columns.values
        chosen_leg_matches = pd.DataFrame(columns = colnames)
        prev_trip_id = -1
        prev_itin_id = -1
        prev_leg_mode = ""
        prev_leg_end_time = pd.NaT
        num_groups_not_survived = 0
        new_itinerary = False

        for name, group in leg_matches_groups:
            
                #print
                #print "Name:", name
                #print "Group:"
                #print group
                #print
                
                curr_trip_id = group['otp_user_trip_id'].iloc[0]
                curr_itin_id = group['otp_itinerary_id'].iloc[0]
                curr_leg_id = group['otp_leg_id'].iloc[0]
                curr_leg_mode = group['otp_mode'].iloc[0]
                
                new_itinerary = is_new_itinerary(prev_trip_id,curr_trip_id,prev_itin_id,curr_itin_id)
                if new_itinerary:
                    prev_leg_end_time = group['otp_start_time'].dt.floor('d').iloc[0]

                #if (prev_group_id == ()):
                #        prev_leg_end_time = group['bt_start_time'].dt.floor('d')[0]

                #print
                #print "Previous itinerary id:", prev_itin_id
                #print "Previous leg mode:", prev_leg_mode
                #print "Previous leg end time:", prev_leg_end_time
                #print "Current leg id:", curr_leg_id
                #print "Current leg mode:", curr_leg_mode
                #print
                #print "Original Group"
                #print group.filter(['otp_start_time','bt_start_time','bt_end_time'])
                
                if (curr_leg_mode == 'WALK'):
                    #print "Walking duration:", filtered_group['otp_duration_mins']
                    filtered_group = group.reset_index()
                    if new_itinerary: #first leg is a WALK leg
                        filtered_group.loc[0,'bt_end_time'] = prev_leg_end_time
                    else:
                        filtered_group.loc[0,'bt_start_time'] = prev_leg_end_time
                        filtered_group.loc[0,'bt_end_time'] = prev_leg_end_time + \
                            pd.Timedelta(minutes=np.rint(filtered_group['otp_duration_mins'].iloc[0]))
                    #print "Filtered Group"
                    #print filtered_group
                else:
                    filtered_group = group[group['bt_start_time'] > prev_leg_end_time]
                
                #print
                #print "Filtered Group"
                #print filtered_group.filter(['otp_start_time','bt_start_time','bt_end_time'])

                if (len(filtered_group) == 0):
                        #print "Group did not survive! =("
                        #print
                        #print "Previous itinerary id:", prev_itin_id
                        #print "Previous leg mode:", prev_leg_mode
                        #print "Previous leg end time:", prev_leg_end_time
                        #print "Current leg id:", curr_leg_id
                        #print "Current leg mode:", curr_leg_mode
                        #print
                        #print "Original Group"
                        #print group#.filter(['otp_start_time','bt_start_time','bt_end_time'])
                        num_groups_not_survived += 1
                        continue

                chosen_leg_match = filtered_group.sort_values('bt_start_time').iloc[0]
                
                if ((curr_leg_id == 2) & 
                    ((curr_leg_mode == 'BUS') & (prev_leg_mode == 'WALK'))):
                        #Update previous walk start/end_times
                        #print
                        #print "Chosen Leg Matches"
                        #print chosen_leg_matches.iloc[-1]
                        #print
                        chosen_leg_matches.iloc[-1,chosen_leg_matches.columns.get_loc('bt_start_time')] = chosen_leg_match['bt_start_time'] - \
                            pd.Timedelta(minutes=np.rint(chosen_leg_matches.iloc[-1].otp_duration_mins))
                        chosen_leg_matches.iloc[-1,chosen_leg_matches.columns.get_loc('bt_end_time')] = chosen_leg_match['bt_start_time']
                #print "Chosen Leg"
                #print chosen_leg_match

                chosen_leg_matches = chosen_leg_matches.append(chosen_leg_match)

                #Update variables
                #prev_group_id = name
                prev_trip_id = curr_trip_id
                prev_itin_id = curr_itin_id
                prev_leg_mode = curr_leg_mode
                prev_leg_end_time = chosen_leg_match['bt_end_time']

        #print "Number of groups which did not survive:", num_groups_not_survived
        return chosen_leg_matches.filter(colnames)


In [125]:
legs_groups = all_complete_vehicle_legs_options.groupby(['otp_user_trip_id','otp_itinerary_id','otp_leg_id'])

In [126]:
legs_groups.obj.columns.values

array(['otp_user_trip_id', 'otp_itinerary_id', 'otp_leg_id', 'otp_mode',
       'otp_route', 'bt_bus_code', 'bt_trip_num', 'otp_from_stop_id',
       'otp_start_time', 'bt_start_time', 'otp_to_stop_id',
       'otp_end_time', 'bt_end_time', 'otp_duration_mins',
       'bt_duration_mins'], dtype=object)

In [193]:
import time

start = time.time()
#chosen_legs = choose_leg_matches(sample_itinerary_options.groupby(['otp_user_trip_id','otp_itinerary_id','otp_leg_id']))
feasible_legs = choose_leg_matches(legs_groups)
end = time.time()

print "Execution time in s:", (end-start)

Execution time in s: 6.71631813049


In [194]:
feasible_legs

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,otp_to_stop_id,otp_end_time,bt_end_time,otp_duration_mins,bt_duration_mins
0,28150.0,1,1,BUS,827,BC929,12.0,33788.0,2017-05-01 13:38:33,2017-05-01 13:36:25,30993.0,2017-05-01 13:45:00,2017-05-01 13:44:14,6.450000,7.816667
0,28150.0,1,2,WALK,,,,,2017-05-01 13:45:00,2017-05-01 13:44:14,,2017-05-01 13:45:52,2017-05-01 13:45:14,0.866667,
25,28150.0,1,3,BUS,303,DE722,4.0,26195.0,2017-05-01 13:51:00,2017-05-01 14:02:48,25753.0,2017-05-01 14:01:59,2017-05-01 14:12:10,10.983333,9.366667
0,28150.0,1,4,WALK,,,,,2017-05-01 14:02:00,2017-05-01 14:12:10,,2017-05-01 14:09:32,2017-05-01 14:20:10,7.533333,
58,28150.0,8,1,BUS,827,BC929,12.0,33788.0,2017-05-01 14:28:17,2017-05-01 13:36:25,30993.0,2017-05-01 14:35:00,2017-05-01 13:44:14,6.716667,7.816667
0,28150.0,8,2,WALK,,,,,2017-05-01 14:35:00,2017-05-01 13:44:14,,2017-05-01 14:35:52,2017-05-01 13:45:14,0.866667,
83,28150.0,8,3,BUS,303,DE722,4.0,26195.0,2017-05-01 14:39:00,2017-05-01 14:02:48,25753.0,2017-05-01 14:51:19,2017-05-01 14:12:10,12.316667,9.366667
0,28150.0,8,4,WALK,,,,,2017-05-01 14:51:20,2017-05-01 14:12:10,,2017-05-01 14:58:52,2017-05-01 14:20:10,7.533333,
116,28154.0,1,1,BUS,684,HA298,11.0,39378.0,2017-05-01 13:42:13,2017-05-01 13:36:28,31053.0,2017-05-01 14:18:00,2017-05-01 14:01:41,35.783333,25.216667
0,28154.0,1,2,WALK,,,,,2017-05-01 14:18:00,2017-05-01 14:01:41,,2017-05-01 14:19:14,2017-05-01 14:02:41,1.233333,


### Filtering out itineraries which lost bus legs along the processing

In [196]:
original_suggested_itins_num_legs.head()

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,num_legs
0,28150.0,1,4
1,28150.0,2,4
2,28150.0,3,3
3,28150.0,4,4
4,28150.0,5,5


In [197]:
feasible_itins_num_legs = feasible_legs.groupby(['otp_user_trip_id','otp_itinerary_id']) \
                                    .agg({'otp_leg_id': lambda x: len(x)}) \
                                    .reset_index() \
                                    .rename(index=str, columns={'otp_leg_id':'num_legs'})

In [198]:
feasible_itins_num_legs.head()

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,num_legs
0,28150.0,1,4
1,28150.0,8,4
2,28154.0,1,4
3,28154.0,2,4
4,28154.0,6,4


In [203]:
feasible_complete_itins = feasible_itins_num_legs.merge(original_suggested_itins_num_legs,how='inner')

In [204]:
feasible_complete_itins.head()

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,num_legs
0,28150.0,1,4
1,28150.0,8,4
2,28154.0,1,4
3,28154.0,2,4
4,28154.0,6,4


In [207]:
feasible_complete_itins_legs = feasible_legs.merge(feasible_complete_itins.drop('num_legs', axis=1),how='inner')

In [208]:
len(feasible_legs)

869

In [210]:
len(feasible_complete_itins_legs)

849

## Perform an endogenous validation using location and time features

#### Add stops data to legs

In [256]:
def add_stops_data_to_legs(itineraries_legs,stops_locs):
    itineraries_legs_stops = itineraries_legs.merge(stops_locs, left_on='otp_from_stop_id', right_on='stop_id', how='left') \
                                                                                .drop('stop_id', axis=1) \
                                                                                .rename(index=str, columns={'stop_lat':'from_stop_lat','stop_lon':'from_stop_lon'}) \
                                                                                .merge(stops_locations, left_on='otp_to_stop_id', right_on='stop_id', how='left') \
                                                                                .drop('stop_id', axis=1) \
                                                                                .rename(index=str, columns={'stop_lat':'to_stop_lat','stop_lon':'to_stop_lon'}) 
    return itineraries_legs_stops


In [257]:
stops_locations = stops_df[['stop_id','stop_lat','stop_lon']]

In [258]:
itineraries_legs = add_stops_data_to_legs(feasible_complete_itins_legs,stops_locations)

In [259]:
itineraries_legs.head()

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,otp_to_stop_id,otp_end_time,bt_end_time,otp_duration_mins,bt_duration_mins,from_stop_lat,from_stop_lon,to_stop_lat,to_stop_lon
0,28150.0,1,1,BUS,827.0,BC929,12.0,33788.0,2017-05-01 13:38:33,2017-05-01 13:36:25,30993.0,2017-05-01 13:45:00,2017-05-01 13:44:14,6.45,7.816667,-25.436303,-49.362221,-25.441705,-49.346328
1,28150.0,1,2,WALK,,,,,2017-05-01 13:45:00,2017-05-01 13:44:14,,2017-05-01 13:45:52,2017-05-01 13:45:14,0.866667,,,,,
2,28150.0,1,3,BUS,303.0,DE722,4.0,26195.0,2017-05-01 13:51:00,2017-05-01 14:02:48,25753.0,2017-05-01 14:01:59,2017-05-01 14:12:10,10.983333,9.366667,-25.44172,-49.346978,-25.437968,-49.314099
3,28150.0,1,4,WALK,,,,,2017-05-01 14:02:00,2017-05-01 14:12:10,,2017-05-01 14:09:32,2017-05-01 14:20:10,7.533333,,,,,
4,28150.0,8,1,BUS,827.0,BC929,12.0,33788.0,2017-05-01 14:28:17,2017-05-01 13:36:25,30993.0,2017-05-01 14:35:00,2017-05-01 13:44:14,6.716667,7.816667,-25.436303,-49.362221,-25.441705,-49.346328


In [260]:
itineraries_legs.columns.values

array(['otp_user_trip_id', 'otp_itinerary_id', 'otp_leg_id', 'otp_mode',
       'otp_route', 'bt_bus_code', 'bt_trip_num', 'otp_from_stop_id',
       'otp_start_time', 'bt_start_time', 'otp_to_stop_id',
       'otp_end_time', 'bt_end_time', 'otp_duration_mins',
       'bt_duration_mins', 'from_stop_lat', 'from_stop_lon',
       'to_stop_lat', 'to_stop_lon'], dtype=object)

#### Add card number and date

In [261]:
trips_on_pairs_full.columns

Index([u'cardNum', u'o_boarding_id', u'o_boarding_datetime', u'o_gps_datetime',
       u'o_route', u'o_busCode', u'o_tripNum', u'o_stopPointId', u'o_stop_lat',
       u'o_stop_lon', u'next_o_boarding_id', u'next_o_boarding_datetime',
       u'next_o_gps_datetime', u'next_o_route', u'next_o_busCode',
       u'next_o_tripNum', u'next_o_stopPointId', u'next_o_stop_lat',
       u'next_o_stop_lon', u'boardings_timediff', u'dist_between_origins'],
      dtype='object')

In [262]:
passenger_trips_ids = trips_on_pairs_full.filter(['cardNum','o_boarding_id']) \
                            .rename(index=str, columns={'cardNum':'card_num','o_boarding_id':'trip_id'})

In [263]:
passenger_trips_ids.head()

Unnamed: 0,card_num,trip_id
0,1886552.0,180.0
1,1886552.0,40392.0
2,3601076.0,181.0
3,3601076.0,31560.0
4,3282345.0,182.0


In [264]:
pass_itins_legs = itineraries_legs.merge(passenger_trips_ids, 
                       left_on=['otp_user_trip_id'], 
                       right_on=['trip_id'],
                       how='left') \
                .drop('otp_user_trip_id', axis=1) \
                .assign(date = lambda x: x.otp_start_time.dt.floor('d')) \
                .filter(np.append(['date','card_num','trip_id'],itineraries_legs.columns.values[1:])) \
                .sort_values(['card_num','trip_id','otp_itinerary_id','otp_leg_id'])

In [265]:
pass_itins_legs

Unnamed: 0,date,card_num,trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,...,bt_start_time,otp_to_stop_id,otp_end_time,bt_end_time,otp_duration_mins,bt_duration_mins,from_stop_lat,from_stop_lon,to_stop_lat,to_stop_lon
108,2017-05-01,869122.0,28204.0,2,1,BUS,040,HB604,4.0,34260.0,...,2017-05-01 13:37:05,34123.0,2017-05-01 14:22:38,2017-05-01 13:52:15,15.633333,15.166667,-25.423548,-49.351123,-25.451351,-49.350814
109,2017-05-01,869122.0,28204.0,2,2,WALK,,,,,...,2017-05-01 13:52:15,,2017-05-01 14:22:52,2017-05-01 13:52:15,0.216667,,,,,
110,2017-05-01,869122.0,28204.0,5,1,BUS,040,HB604,4.0,34260.0,...,2017-05-01 13:37:05,34123.0,2017-05-01 14:33:38,2017-05-01 13:52:15,14.050000,15.166667,-25.423548,-49.351123,-25.451351,-49.350814
111,2017-05-01,869122.0,28204.0,5,2,WALK,,,,,...,2017-05-01 13:52:15,,2017-05-01 14:33:52,2017-05-01 13:52:15,0.216667,,,,,
112,2017-05-01,869122.0,28204.0,6,1,BUS,040,HB604,4.0,34260.0,...,2017-05-01 13:37:05,34123.0,2017-05-01 14:47:38,2017-05-01 13:52:15,14.050000,15.166667,-25.423548,-49.351123,-25.451351,-49.350814
113,2017-05-01,869122.0,28204.0,6,2,WALK,,,,,...,2017-05-01 13:52:15,,2017-05-01 14:47:52,2017-05-01 13:52:15,0.216667,,,,,
114,2017-05-01,869122.0,28204.0,8,1,BUS,040,HB604,4.0,34260.0,...,2017-05-01 13:37:05,34123.0,2017-05-01 15:33:38,2017-05-01 13:52:15,15.633333,15.166667,-25.423548,-49.351123,-25.451351,-49.350814
115,2017-05-01,869122.0,28204.0,8,2,WALK,,,,,...,2017-05-01 13:52:15,,2017-05-01 15:33:52,2017-05-01 13:52:15,0.216667,,,,,
116,2017-05-01,869122.0,28204.0,9,1,BUS,040,HB604,4.0,34260.0,...,2017-05-01 13:37:05,34123.0,2017-05-01 15:46:38,2017-05-01 13:52:15,15.283333,15.166667,-25.423548,-49.351123,-25.451351,-49.350814
117,2017-05-01,869122.0,28204.0,9,2,WALK,,,,,...,2017-05-01 13:52:15,,2017-05-01 15:46:52,2017-05-01 13:52:15,0.216667,,,,,


In [266]:
pass_itins_legs.columns.values

array(['date', 'card_num', 'trip_id', 'otp_itinerary_id', 'otp_leg_id',
       'otp_mode', 'otp_route', 'bt_bus_code', 'bt_trip_num',
       'otp_from_stop_id', 'otp_start_time', 'bt_start_time',
       'otp_to_stop_id', 'otp_end_time', 'bt_end_time',
       'otp_duration_mins', 'bt_duration_mins', 'from_stop_lat',
       'from_stop_lon', 'to_stop_lat', 'to_stop_lon'], dtype=object)

### Summarizing suggested itineraries information

In [267]:
def build_candidate_itineraries_df(chosen_leg_matches_data):
        itins_bus_info = chosen_leg_matches_data \
                                        .query('otp_mode == \'BUS\'') \
                                        .groupby(['card_num','trip_id','otp_itinerary_id']) \
                                        .agg({'otp_from_stop_id': lambda x: x.iloc[0],
                                              'from_stop_lat': lambda x: x.iloc[0],
                                              'from_stop_lon': lambda x: x.iloc[0],
                                              'otp_to_stop_id': lambda x: x.iloc[-1],
                                              'to_stop_lat': lambda x: x.iloc[-1],
                                              'to_stop_lon': lambda x: x.iloc[-1],
                                              'otp_mode': lambda x: len(x)}) \
                                        .reset_index() \
                                        .rename(index=str, columns={'otp_mode':'num_transfers',
                                                                    'otp_from_stop_id':'from_stop_id',
                                                                    'otp_to_stop_id':'to_stop_id'})
        itins_time_info = chosen_leg_matches_data \
                                        .groupby(['card_num','trip_id','otp_itinerary_id']) \
                                        .agg({'bt_start_time': lambda x: x.iloc[0],
                                              'bt_end_time': lambda x: x.iloc[-1],
                                              'otp_start_time': lambda x: x.iloc[0],
                                              'otp_end_time': lambda x: x.iloc[-1],
                                              'date': lambda x: x.iloc[0]}) \
                                        .reset_index() \
                                        .rename(index=str, columns={'otp_start_time':'sch_start_time',
                                                                    'otp_end_time':'sch_end_time',
                                                                    'bt_start_time':'obs_start_time',
                                                                    'bt_end_time':'obs_end_time'}) 
        
        otp_buste_itineraries = itins_bus_info.merge(itins_time_info) \
                                        .reindex(['date','card_num','trip_id','otp_itinerary_id',
                                                  'from_stop_id','sch_start_time','obs_start_time',
                                                  'from_stop_lat','from_stop_lon','to_stop_id',
                                                  'sch_end_time','obs_end_time','to_stop_lat',
                                                  'to_stop_lon','num_transfers'], axis=1, copy=False)\
                                        .assign(card_num = lambda x: x['card_num'].astype(float),
                                                trip_id = lambda x: x['trip_id'].astype(float),
                                                otp_itinerary_id = lambda x: x['otp_itinerary_id'].astype(float))
        return otp_buste_itineraries


In [268]:
cand_itineraries_df = build_candidate_itineraries_df(pass_itins_legs)

In [269]:
cand_itineraries_df

Unnamed: 0,date,card_num,trip_id,otp_itinerary_id,from_stop_id,sch_start_time,obs_start_time,from_stop_lat,from_stop_lon,to_stop_id,sch_end_time,obs_end_time,to_stop_lat,to_stop_lon,num_transfers
0,2017-05-01,869122.0,28204.0,2.0,34260.0,2017-05-01 14:07:00,2017-05-01 13:37:05,-25.423548,-49.351123,34123.0,2017-05-01 14:22:52,2017-05-01 13:52:15,-25.451351,-49.350814,1
1,2017-05-01,869122.0,28204.0,5.0,34260.0,2017-05-01 14:19:35,2017-05-01 13:37:05,-25.423548,-49.351123,34123.0,2017-05-01 14:33:52,2017-05-01 13:52:15,-25.451351,-49.350814,1
2,2017-05-01,869122.0,28204.0,6.0,34260.0,2017-05-01 14:33:35,2017-05-01 13:37:05,-25.423548,-49.351123,34123.0,2017-05-01 14:47:52,2017-05-01 13:52:15,-25.451351,-49.350814,1
3,2017-05-01,869122.0,28204.0,8.0,34260.0,2017-05-01 15:18:00,2017-05-01 13:37:05,-25.423548,-49.351123,34123.0,2017-05-01 15:33:52,2017-05-01 13:52:15,-25.451351,-49.350814,1
4,2017-05-01,869122.0,28204.0,9.0,34260.0,2017-05-01 15:31:21,2017-05-01 13:37:05,-25.423548,-49.351123,34123.0,2017-05-01 15:46:52,2017-05-01 13:52:15,-25.451351,-49.350814,1
5,2017-05-01,869122.0,28204.0,10.0,34260.0,2017-05-01 15:46:00,2017-05-01 13:37:05,-25.423548,-49.351123,34123.0,2017-05-01 16:01:36,2017-05-01 13:52:15,-25.451351,-49.350814,1
6,2017-05-01,1024869.0,28174.0,1.0,33580.0,2017-05-01 13:40:27,2017-05-01 13:36:42,-25.506291,-49.224019,26905.0,2017-05-01 14:28:56,2017-05-01 14:33:51,-25.438600,-49.268210,2
7,2017-05-01,1024869.0,28174.0,5.0,33580.0,2017-05-01 13:55:27,2017-05-01 13:36:42,-25.506291,-49.224019,26905.0,2017-05-01 14:50:00,2017-05-01 14:33:51,-25.438600,-49.268210,2
8,2017-05-01,1024869.0,28174.0,9.0,33580.0,2017-05-01 14:10:27,2017-05-01 13:36:42,-25.506291,-49.224019,26905.0,2017-05-01 15:01:56,2017-05-01 14:33:51,-25.438600,-49.268210,2
9,2017-05-01,1024869.0,28174.0,10.0,33580.0,2017-05-01 14:25:27,2017-05-01 13:36:42,-25.506291,-49.224019,26905.0,2017-05-01 15:12:56,2017-05-01 14:33:51,-25.438600,-49.268210,2


In [270]:
cand_itineraries_df.dtypes

date                datetime64[ns]
card_num                   float64
trip_id                    float64
otp_itinerary_id           float64
from_stop_id               float64
sch_start_time      datetime64[ns]
obs_start_time      datetime64[ns]
from_stop_lat              float64
from_stop_lon              float64
to_stop_id                 float64
sch_end_time        datetime64[ns]
obs_end_time        datetime64[ns]
to_stop_lat                float64
to_stop_lon                float64
num_transfers                int64
dtype: object

#### Add origin/next-origin locations

In [271]:
passenger_trips_valid_df = trips_on_pairs_full.filter(['cardNum','o_boarding_id','o_stop_lat','o_stop_lon','o_boarding_datetime','next_o_stop_lat','next_o_stop_lon','next_o_boarding_datetime']) \
                            .rename(index=str, columns={'cardNum':'card_num','o_boarding_id':'trip_id'})

In [272]:
passenger_trips_valid_df.head()

Unnamed: 0,card_num,trip_id,o_stop_lat,o_stop_lon,o_boarding_datetime,next_o_stop_lat,next_o_stop_lon,next_o_boarding_datetime
0,1886552.0,180.0,-25.568989,-49.332253,2017-05-01 04:48:42,-25.431951,-49.296491,2017-05-01 15:52:39
1,1886552.0,40392.0,-25.431951,-49.296491,2017-05-01 15:52:39,-25.568989,-49.332253,2017-05-01 04:48:42
2,3601076.0,181.0,-25.56686,-49.3329,2017-05-01 04:49:13,-25.447479,-49.263816,2017-05-01 14:16:55
3,3601076.0,31560.0,-25.447479,-49.263816,2017-05-01 14:16:55,-25.56686,-49.3329,2017-05-01 04:49:13
4,3282345.0,182.0,-25.56686,-49.3329,2017-05-01 04:49:19,-25.432534,-49.338889,2017-05-01 19:27:16


In [273]:
passenger_trips_valid_df.dtypes

card_num                           float64
trip_id                            float64
o_stop_lat                         float64
o_stop_lon                         float64
o_boarding_datetime         datetime64[ns]
next_o_stop_lat                    float64
next_o_stop_lon                    float64
next_o_boarding_datetime    datetime64[ns]
dtype: object

In [274]:
def dist(p1_lat, p1_lon, p2_lat, p2_lon):
    if(np.isnan([p1_lat, p1_lon, p2_lat, p2_lon]).any()):
        return -1
    else:
        return np.around(distance.geodesic((p1_lat,p1_lon),(p2_lat,p2_lon)).km,decimals=5)

In [275]:
def get_candidate_itineraries_summary(candidate_itineraries,trips_validation):
        otp_buste_itineraries_summary = candidate_itineraries \
                                        .merge(trips_validation,how='inner') \
                                        .assign(start_diff = lambda x: np.absolute(x['obs_start_time'] - x['o_boarding_datetime']),
                                                origin_dist = lambda y: y.apply(lambda x: dist(x['from_stop_lat'], x['from_stop_lon'], x['o_stop_lat'], x['o_stop_lon']),axis=1),
                                                next_origin_dist = lambda y: y.apply(lambda x: dist(x['to_stop_lat'], x['to_stop_lon'], x['next_o_stop_lat'], x['next_o_stop_lon']),axis=1),
                                                next_start_diff = lambda x: np.absolute(x['next_o_boarding_datetime'] - x['obs_end_time']),
                                                sch_duration_mins = lambda x: 
                                                (x.sch_end_time - x.sch_start_time)/pd.Timedelta('1m'),
                                                obs_duration_mins = lambda x:
                                                (x.obs_end_time - x.obs_start_time)/pd.Timedelta('1m')) \
                                        .sort_values(['card_num','trip_id'])
        return otp_buste_itineraries_summary


In [276]:
cand_itineraries_loc_validation = get_candidate_itineraries_summary(cand_itineraries_df,passenger_trips_valid_df)

In [277]:
cand_itineraries_loc_validation

Unnamed: 0,date,card_num,trip_id,otp_itinerary_id,from_stop_id,sch_start_time,obs_start_time,from_stop_lat,from_stop_lon,to_stop_id,...,o_boarding_datetime,next_o_stop_lat,next_o_stop_lon,next_o_boarding_datetime,next_origin_dist,next_start_diff,obs_duration_mins,origin_dist,sch_duration_mins,start_diff
0,2017-05-01,869122.0,28204.0,2.0,34260.0,2017-05-01 14:07:00,2017-05-01 13:37:05,-25.423548,-49.351123,34123.0,...,2017-05-01 13:37:05,-25.451180,-49.350724,2017-05-01 17:59:08,0.02096,04:06:53,15.166667,0.0,15.866667,0 days
1,2017-05-01,869122.0,28204.0,5.0,34260.0,2017-05-01 14:19:35,2017-05-01 13:37:05,-25.423548,-49.351123,34123.0,...,2017-05-01 13:37:05,-25.451180,-49.350724,2017-05-01 17:59:08,0.02096,04:06:53,15.166667,0.0,14.283333,0 days
2,2017-05-01,869122.0,28204.0,6.0,34260.0,2017-05-01 14:33:35,2017-05-01 13:37:05,-25.423548,-49.351123,34123.0,...,2017-05-01 13:37:05,-25.451180,-49.350724,2017-05-01 17:59:08,0.02096,04:06:53,15.166667,0.0,14.283333,0 days
3,2017-05-01,869122.0,28204.0,8.0,34260.0,2017-05-01 15:18:00,2017-05-01 13:37:05,-25.423548,-49.351123,34123.0,...,2017-05-01 13:37:05,-25.451180,-49.350724,2017-05-01 17:59:08,0.02096,04:06:53,15.166667,0.0,15.866667,0 days
4,2017-05-01,869122.0,28204.0,9.0,34260.0,2017-05-01 15:31:21,2017-05-01 13:37:05,-25.423548,-49.351123,34123.0,...,2017-05-01 13:37:05,-25.451180,-49.350724,2017-05-01 17:59:08,0.02096,04:06:53,15.166667,0.0,15.516667,0 days
5,2017-05-01,869122.0,28204.0,10.0,34260.0,2017-05-01 15:46:00,2017-05-01 13:37:05,-25.423548,-49.351123,34123.0,...,2017-05-01 13:37:05,-25.451180,-49.350724,2017-05-01 17:59:08,0.02096,04:06:53,15.166667,0.0,15.600000,0 days
6,2017-05-01,1024869.0,28174.0,1.0,33580.0,2017-05-01 13:40:27,2017-05-01 13:36:42,-25.506291,-49.224019,26905.0,...,2017-05-01 13:36:42,-25.441209,-49.275853,2017-05-01 18:30:23,0.82126,03:56:32,57.150000,0.0,48.483333,0 days
7,2017-05-01,1024869.0,28174.0,5.0,33580.0,2017-05-01 13:55:27,2017-05-01 13:36:42,-25.506291,-49.224019,26905.0,...,2017-05-01 13:36:42,-25.441209,-49.275853,2017-05-01 18:30:23,0.82126,03:56:32,57.150000,0.0,54.550000,0 days
8,2017-05-01,1024869.0,28174.0,9.0,33580.0,2017-05-01 14:10:27,2017-05-01 13:36:42,-25.506291,-49.224019,26905.0,...,2017-05-01 13:36:42,-25.441209,-49.275853,2017-05-01 18:30:23,0.82126,03:56:32,57.150000,0.0,51.483333,0 days
9,2017-05-01,1024869.0,28174.0,10.0,33580.0,2017-05-01 14:25:27,2017-05-01 13:36:42,-25.506291,-49.224019,26905.0,...,2017-05-01 13:36:42,-25.441209,-49.275853,2017-05-01 18:30:23,0.82126,03:56:32,57.150000,0.0,47.483333,0 days


In [278]:
valid_candidate_itineraries = cand_itineraries_loc_validation[((cand_itineraries_loc_validation['obs_duration_mins'] > 0) & 
                                                               (cand_itineraries_loc_validation['obs_duration_mins'] < 120)) & 
                                                              ((cand_itineraries_loc_validation['start_diff'] >= pd.Timedelta('0s')) & 
                                                               (cand_itineraries_loc_validation['start_diff'] < pd.Timedelta('1.5h')))] \
                                    .query('origin_dist < 0.1') \
                                    .query('next_origin_dist < 2.0')
                                    #NEXT_START_DIFF
                                

In [279]:
valid_candidate_itineraries

Unnamed: 0,date,card_num,trip_id,otp_itinerary_id,from_stop_id,sch_start_time,obs_start_time,from_stop_lat,from_stop_lon,to_stop_id,...,o_boarding_datetime,next_o_stop_lat,next_o_stop_lon,next_o_boarding_datetime,next_origin_dist,next_start_diff,obs_duration_mins,origin_dist,sch_duration_mins,start_diff
0,2017-05-01,869122.0,28204.0,2.0,34260.0,2017-05-01 14:07:00,2017-05-01 13:37:05,-25.423548,-49.351123,34123.0,...,2017-05-01 13:37:05,-25.451180,-49.350724,2017-05-01 17:59:08,0.02096,04:06:53,15.166667,0.0,15.866667,0 days
1,2017-05-01,869122.0,28204.0,5.0,34260.0,2017-05-01 14:19:35,2017-05-01 13:37:05,-25.423548,-49.351123,34123.0,...,2017-05-01 13:37:05,-25.451180,-49.350724,2017-05-01 17:59:08,0.02096,04:06:53,15.166667,0.0,14.283333,0 days
2,2017-05-01,869122.0,28204.0,6.0,34260.0,2017-05-01 14:33:35,2017-05-01 13:37:05,-25.423548,-49.351123,34123.0,...,2017-05-01 13:37:05,-25.451180,-49.350724,2017-05-01 17:59:08,0.02096,04:06:53,15.166667,0.0,14.283333,0 days
3,2017-05-01,869122.0,28204.0,8.0,34260.0,2017-05-01 15:18:00,2017-05-01 13:37:05,-25.423548,-49.351123,34123.0,...,2017-05-01 13:37:05,-25.451180,-49.350724,2017-05-01 17:59:08,0.02096,04:06:53,15.166667,0.0,15.866667,0 days
4,2017-05-01,869122.0,28204.0,9.0,34260.0,2017-05-01 15:31:21,2017-05-01 13:37:05,-25.423548,-49.351123,34123.0,...,2017-05-01 13:37:05,-25.451180,-49.350724,2017-05-01 17:59:08,0.02096,04:06:53,15.166667,0.0,15.516667,0 days
5,2017-05-01,869122.0,28204.0,10.0,34260.0,2017-05-01 15:46:00,2017-05-01 13:37:05,-25.423548,-49.351123,34123.0,...,2017-05-01 13:37:05,-25.451180,-49.350724,2017-05-01 17:59:08,0.02096,04:06:53,15.166667,0.0,15.600000,0 days
6,2017-05-01,1024869.0,28174.0,1.0,33580.0,2017-05-01 13:40:27,2017-05-01 13:36:42,-25.506291,-49.224019,26905.0,...,2017-05-01 13:36:42,-25.441209,-49.275853,2017-05-01 18:30:23,0.82126,03:56:32,57.150000,0.0,48.483333,0 days
7,2017-05-01,1024869.0,28174.0,5.0,33580.0,2017-05-01 13:55:27,2017-05-01 13:36:42,-25.506291,-49.224019,26905.0,...,2017-05-01 13:36:42,-25.441209,-49.275853,2017-05-01 18:30:23,0.82126,03:56:32,57.150000,0.0,54.550000,0 days
8,2017-05-01,1024869.0,28174.0,9.0,33580.0,2017-05-01 14:10:27,2017-05-01 13:36:42,-25.506291,-49.224019,26905.0,...,2017-05-01 13:36:42,-25.441209,-49.275853,2017-05-01 18:30:23,0.82126,03:56:32,57.150000,0.0,51.483333,0 days
9,2017-05-01,1024869.0,28174.0,10.0,33580.0,2017-05-01 14:25:27,2017-05-01 13:36:42,-25.506291,-49.224019,26905.0,...,2017-05-01 13:36:42,-25.441209,-49.275853,2017-05-01 18:30:23,0.82126,03:56:32,57.150000,0.0,47.483333,0 days


In [280]:
inferred_trip_itineraries.columns.values

array(['card_num', 'trip_id', 'date', 'otp_itinerary_id', 'from_stop_id',
       'sch_start_time', 'obs_start_time', 'from_stop_lat',
       'from_stop_lon', 'to_stop_id', 'sch_end_time', 'obs_end_time',
       'to_stop_lat', 'to_stop_lon', 'num_transfers', 'o_stop_lat',
       'o_stop_lon', 'o_boarding_datetime', 'next_o_stop_lat',
       'next_o_stop_lon', 'next_o_boarding_datetime', 'next_origin_dist',
       'next_start_diff', 'obs_duration_mins', 'origin_dist',
       'sch_duration_mins', 'start_diff'], dtype=object)

In [281]:
inferred_trip_itineraries = valid_candidate_itineraries.sort_values(['card_num','trip_id','obs_duration_mins']) \
                                .groupby(['card_num','trip_id']) \
                                .first() \
                                .reset_index()
                        

In [282]:
inferred_trip_itineraries

Unnamed: 0,card_num,trip_id,date,otp_itinerary_id,from_stop_id,sch_start_time,obs_start_time,from_stop_lat,from_stop_lon,to_stop_id,...,o_boarding_datetime,next_o_stop_lat,next_o_stop_lon,next_o_boarding_datetime,next_origin_dist,next_start_diff,obs_duration_mins,origin_dist,sch_duration_mins,start_diff
0,869122.0,28204.0,2017-05-01,2.0,34260.0,2017-05-01 14:07:00,2017-05-01 13:37:05,-25.423548,-49.351123,34123.0,...,2017-05-01 13:37:05,-25.451180,-49.350724,2017-05-01 17:59:08,0.02096,04:06:53,15.166667,0.0,15.866667,0 days
1,1024869.0,28174.0,2017-05-01,1.0,33580.0,2017-05-01 13:40:27,2017-05-01 13:36:42,-25.506291,-49.224019,26905.0,...,2017-05-01 13:36:42,-25.441209,-49.275853,2017-05-01 18:30:23,0.82126,03:56:32,57.150000,0.0,48.483333,0 days
2,1168962.0,28455.0,2017-05-01,7.0,37403.0,2017-05-01 14:50:48,2017-05-01 13:40:12,-25.557041,-49.283864,26239.0,...,2017-05-01 13:40:12,-25.476335,-49.292629,2017-05-01 15:21:23,0.04269,01:11:24,29.783333,0.0,40.833333,0 days
3,1354462.0,28373.0,2017-05-01,6.0,39837.0,2017-05-01 14:47:24,2017-05-01 13:39:18,-25.556058,-49.332414,33970.0,...,2017-05-01 13:39:18,-25.433111,-49.352019,2017-05-01 15:40:47,0.01922,01:00:38,60.850000,0.0,80.016667,0 days
4,1354462.0,39353.0,2017-05-01,1.0,33745.0,2017-05-01 15:51:59,2017-05-01 15:40:50,-25.435506,-49.349914,35179.0,...,2017-05-01 15:40:50,-25.556058,-49.332414,2017-05-01 13:39:14,0.53028,03:09:36,68.000000,0.0,72.900000,0 days
5,1998953.0,28180.0,2017-05-01,5.0,31854.0,2017-05-01 14:03:29,2017-05-01 13:36:46,-25.366552,-49.272462,28615.0,...,2017-05-01 13:36:46,-25.423318,-49.268572,2017-05-01 17:57:56,0.15181,03:57:19,23.850000,0.0,31.300000,0 days
6,1998953.0,52772.0,2017-05-01,5.0,35642.0,2017-05-01 18:15:31,2017-05-01 17:57:59,-25.423318,-49.268572,29384.0,...,2017-05-01 17:57:59,-25.368502,-49.270891,2017-05-01 13:36:42,0.28449,04:49:49,28.533333,0.0,45.700000,0 days
7,2100344.0,28332.0,2017-05-01,6.0,31691.0,2017-05-01 14:07:12,2017-05-01 13:38:56,-25.381614,-49.243812,30598.0,...,2017-05-01 13:38:56,-25.397440,-49.256690,2017-05-01 14:24:29,0.21880,00:29:06,16.450000,0.0,20.283333,0 days
8,2100344.0,32136.0,2017-05-01,4.0,30828.0,2017-05-01 14:55:11,2017-05-01 14:24:29,-25.397440,-49.256690,31692.0,...,2017-05-01 14:24:29,-25.381614,-49.243812,2017-05-01 13:38:56,0.05977,00:58:05,12.533333,0.0,17.350000,0 days
9,2168963.0,28252.0,2017-05-01,1.0,32459.0,2017-05-01 13:44:34,2017-05-01 13:37:55,-25.471072,-49.194606,26045.0,...,2017-05-01 13:37:55,-25.438187,-49.238717,2017-05-01 18:00:26,0.01067,03:49:59,32.533333,0.0,34.350000,0 days


In [283]:
inferred_trip_itineraries.filter(['next_origin_dist','next_start_diff','origin_dist','start_diff','trip_duration']).describe()

Unnamed: 0,next_origin_dist,next_start_diff,origin_dist,start_diff
count,77.0,77,77.0,77
mean,0.176025,0 days 04:33:41.194805,0.0,0 days 00:00:00
std,0.305169,0 days 02:49:42.961504,0.0,0 days 00:00:00
min,0.0,0 days 00:01:22,0.0,0 days 00:00:00
25%,0.01068,0 days 02:16:34,0.0,0 days 00:00:00
50%,0.03265,0 days 04:15:54,0.0,0 days 00:00:00
75%,0.2068,0 days 06:37:22,0.0,0 days 00:00:00
max,1.3039,0 days 09:44:28,0.0,0 days 00:00:00


#### Writing Inferred Trips Itineraries dataset to file

In [251]:
inf_trips_itineraries_output_filepath = output_folderpath + os.sep + file_date_str + '_' + itinerary_part_name + '_itins_inf_trips.csv'
inferred_trip_itineraries.to_csv(inf_trips_itineraries_output_filepath,index=False)

#### Getting Itineraries Legs back

In [252]:
    inferred_trip_itineraries_legs = pass_itins_legs.merge(inferred_trip_itineraries \
                                                           .filter(['card_num','trip_id','otp_itinerary_id']),
                                                           how='inner') \
                                                    .sort_values(['card_num','trip_id','otp_itinerary_id','otp_leg_id'])

In [254]:
inferred_trip_itineraries_legs

Unnamed: 0,date,card_num,trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,...,bt_start_time,otp_to_stop_id,otp_end_time,bt_end_time,otp_duration_mins,bt_duration_mins,from_stop_lat,from_stop_lon,to_stop_lat,to_stop_lon
0,2017-05-01,869122.0,28204.0,2,1,BUS,040,HB604,4.0,34260.0,...,2017-05-01 13:37:05,34123.0,2017-05-01 14:22:38,2017-05-01 13:52:15,15.633333,15.166667,-25.423548,-49.351123,-25.451351,-49.350814
1,2017-05-01,869122.0,28204.0,2,2,WALK,,,,,...,2017-05-01 13:52:15,,2017-05-01 14:22:52,2017-05-01 13:52:15,0.216667,,,,,
2,2017-05-01,1024869.0,28174.0,1,1,BUS,515,EA303,10.0,33580.0,...,2017-05-01 13:36:42,27791.0,2017-05-01 13:56:00,2017-05-01 13:52:12,15.550000,15.500000,-25.506291,-49.224019,-25.481843,-49.246769
3,2017-05-01,1024869.0,28174.0,1,2,WALK,,,,,...,2017-05-01 13:52:12,,2017-05-01 13:58:59,2017-05-01 13:55:12,2.983333,,,,,
4,2017-05-01,1024869.0,28174.0,1,3,BUS,503,KE842,9.0,27474.0,...,2017-05-01 14:05:30,26905.0,2017-05-01 14:18:06,2017-05-01 14:22:51,17.033333,17.350000,-25.481586,-49.247071,-25.438600,-49.268210
5,2017-05-01,1024869.0,28174.0,1,4,WALK,,,,,...,2017-05-01 14:22:51,,2017-05-01 14:28:56,2017-05-01 14:33:51,10.816667,,,,,
6,2017-05-01,1168962.0,28455.0,7,1,BUS,639,GA142,10.0,37403.0,...,2017-05-01 13:40:12,31026.0,2017-05-01 15:12:00,2017-05-01 13:58:18,21.200000,18.100000,-25.557041,-49.283864,-25.512969,-49.294833
7,2017-05-01,1168962.0,28455.0,7,2,WALK,,,,,...,2017-05-01 13:58:18,,2017-05-01 15:12:04,2017-05-01 13:58:18,0.066667,,,,,
8,2017-05-01,1168962.0,28455.0,7,3,BUS,204,HL307,2.0,26252.0,...,2017-05-01 13:58:29,26239.0,2017-05-01 15:29:33,2017-05-01 14:07:59,13.550000,9.500000,-25.512983,-49.294505,-25.476439,-49.292220
9,2017-05-01,1168962.0,28455.0,7,4,WALK,,,,,...,2017-05-01 14:07:59,,2017-05-01 15:31:38,2017-05-01 14:09:59,2.066667,,,,,


#### Writing Inferred Trips Itineraries Legs dataset to file

In [255]:
inf_trips_itineraries_legs_output_filepath = output_folderpath + os.sep + file_date_str + '_' + itinerary_part_name + '_legs_inf_trips.csv'
inferred_trip_itineraries_legs.to_csv(inf_trips_itineraries_legs_output_filepath,index=False)

In [108]:
#if len(pass_itins_legs) == 0:
#            print "No matches left after matching and selecting feasible bus legs."
#            print "Skipping next steps..."
#            exit(0)

In [109]:
# Writing suggested itineraries dataset to file
#actual_itineraries_output_filepath = output_folderpath + os.sep + file_date_str + '_' + itinerary_part_name + '_actual_itin.csv'
#pass_itins_legs.to_csv(actual_itineraries_output_filepath, index=False)