In [57]:
#Libraries

#Python Libs
import sys
import os
import glob
import traceback
from datetime import datetime
import time

#Data Analysis Libs
import pandas as pd
import numpy as np

In [2]:
def prepare_otp_data(otp_data):
        #Fixing prefix
        otp_data.columns = otp_data.columns.str.replace('otp_','')
        otp_data = otp_data.add_prefix('otp_')
        
        #Fixing Timezone difference - when needed
        otp_data['otp_start_time'] = otp_data['otp_start_time'] - pd.Timedelta('10800 s')
        otp_data['otp_end_time'] = otp_data['otp_end_time'] - pd.Timedelta('10800 s')
        
        #Adjusting route format to have 3 numbers
        otp_data['otp_route'] = otp_data['otp_route'].astype(str)
        otp_data['otp_route'] = np.where(otp_data['otp_mode'] == 'BUS',
                            otp_data['otp_route'].astype(str).str.replace("\.0",'').str.zfill(3),
                            otp_data['otp_route'])

        return otp_data

def get_router_id(query_date):
    INTERMEDIATE_OTP_DATE = pd.to_datetime("2017-06-30", format="%Y-%m-%d")
    router_id = ''

    if (query_date <= INTERMEDIATE_OTP_DATE):
        return 'ctba-2017-1'
    else:
        return 'ctba-2017-2'

In [3]:
otp_suggestions_filepath = '/local/tarciso/data/otp_itineraries/2017_05_01_user_trips_100_otp_itineraries.csv'
user_trips_folderpath = '/local/tarciso/data/enhanced-buste/user_trips/'
bus_trips_folderpath = '/local/tarciso/data/enhanced-buste/bus_trips/'
gtfs_base_folderpath = '/local/tarciso/data/gtfs/'
output_folderpath = '/local/tarciso/data/test-odmat/'

In [4]:
file_date_str = otp_suggestions_filepath.split('/')[-1].split('_user_trips_')[0]
file_date = pd.to_datetime(file_date_str,format='%Y_%m_%d')
print "Processing File:", otp_suggestions_filepath

Processing File: /local/tarciso/data/otp_itineraries/2017_05_01_user_trips_100_otp_itineraries.csv


### Reading and Preparing OTP Suggestions data

In [5]:
# Extracting itinerary part name for later use
itinerary_part_name = otp_suggestions_filepath.split('/')[-1].split('_')[5]
# Read OTP Suggestions
otp_suggestions_raw = pd.read_csv(otp_suggestions_filepath, parse_dates=['date','otp_start_time','otp_end_time'])

if len(otp_suggestions_raw) == 0:
    print "Zero OTP suggestions found."
    print "Skipping next steps..."
    exit(0)

In [6]:
        # Prepare OTP data for analysis
        otp_suggestions = prepare_otp_data(otp_suggestions_raw)

        # Read stops data
        stops_filepath = gtfs_base_folderpath + os.sep + get_router_id(file_date) + os.sep + 'stops.txt'
        stops_df = pd.read_csv(stops_filepath)

        # Adding Parent Stop data to OTP Suggestions
        stops_parent_stations = stops_df[['stop_id','parent_station']]
        otp_suggestions = otp_suggestions.merge(stops_parent_stations.add_prefix('from_'),
                                                left_on='otp_from_stop_id',
                                                right_on='from_stop_id',
                                                how='left') \
                                        .merge(stops_parent_stations.add_prefix('to_'),
                                                left_on='otp_to_stop_id',
                                                right_on='to_stop_id',
                                                how='left') \
                                        .drop(['from_stop_id','to_stop_id'], axis=1) \
                                        .rename(index=str, columns={'from_parent_station':'otp_from_parent_station',
                                                                    'to_parent_station':'otp_to_parent_station'})
        
        otp_suggestions_bus_legs = otp_suggestions[otp_suggestions['otp_mode'] == 'BUS']
        otp_suggestions_walk_legs = otp_suggestions[otp_suggestions['otp_mode'] == 'WALK']


In [155]:
otp_suggestions.head(20)

Unnamed: 0,otp_date,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_start_time,otp_end_time,otp_mode,otp_route,otp_from_stop_id,otp_to_stop_id,otp_duration_mins,otp_from_parent_station,otp_to_parent_station
0,2017-05-01,28150.0,1,1,2017-05-01 13:38:33,2017-05-01 13:45:00,BUS,827.0,33788.0,30993.0,6.45,,14508.0
1,2017-05-01,28150.0,1,2,2017-05-01 13:45:00,2017-05-01 13:45:52,WALK,,,,0.866667,,
2,2017-05-01,28150.0,1,3,2017-05-01 13:51:00,2017-05-01 14:01:59,BUS,303.0,26195.0,25753.0,10.983333,14508.0,41756.0
3,2017-05-01,28150.0,1,4,2017-05-01 14:02:00,2017-05-01 14:09:32,WALK,,,,7.533333,,
4,2017-05-01,28150.0,2,1,2017-05-01 13:42:58,2017-05-01 13:57:00,BUS,822.0,33788.0,30994.0,14.033333,,14508.0
5,2017-05-01,28150.0,2,2,2017-05-01 13:57:00,2017-05-01 13:57:39,WALK,,,,0.65,,
6,2017-05-01,28150.0,2,3,2017-05-01 14:03:00,2017-05-01 14:15:19,BUS,303.0,26195.0,25753.0,12.316667,14508.0,41756.0
7,2017-05-01,28150.0,2,4,2017-05-01 14:15:20,2017-05-01 14:22:52,WALK,,,,7.533333,,
8,2017-05-01,28150.0,3,1,2017-05-01 13:41:55,2017-05-01 14:03:38,WALK,,,,21.716667,,
9,2017-05-01,28150.0,3,2,2017-05-01 14:03:39,2017-05-01 14:20:05,BUS,870.0,33980.0,35079.0,16.433333,,


### Reading and Preparing User Trips data

In [59]:
def compatible_dates(otp_data,ticketing_data):
        otp_date = otp_data['otp_date'].iloc[0]
        ticketing_date = pd.to_datetime(ticketing_data['o_boarding_datetime'].dt.strftime('%Y-%m-%d')[0])

        return (otp_date == ticketing_date,otp_date,ticketing_date)


In [60]:
        # Read Origin/Next-Origin Pairs for the same date
        trips_origins_filepath = user_trips_folderpath + os.sep + file_date_str + '_user_trips.csv'
        trips_on_pairs_full = pd.read_csv(trips_origins_filepath,
                                                parse_dates=['o_boarding_datetime','o_gps_datetime','next_o_boarding_datetime','next_o_gps_datetime'])
        # Checking whether OTP and ticketing dates match
        dates_compatibility, otp_date, ticketing_date = compatible_dates(otp_suggestions,trips_on_pairs_full)
        if not dates_compatibility:
                print "ERROR: OTP date", otp_date, "does not match Ticketing data", ticketing_date
                print "Skipping current day"
                exit(1)
        
        trips_on_pairs = trips_on_pairs_full.filter(['o_boarding_id','next_o_boarding_id'])
        trips_origins = trips_on_pairs_full.filter([col for col in trips_on_pairs_full.columns if col.startswith('o_')])

In [64]:
vehicle_boarding_trip_ids = trips_origins[np.logical_not(trips_origins['o_busCode'].str.isdigit())].o_boarding_id

In [66]:
vehicle_boarding_trip_ids.head()

0      180.0
2      181.0
4      182.0
5    61724.0
7      187.0
Name: o_boarding_id, dtype: float64

### Filtering out non-vehicle-boarding itinerary suggestions

In [67]:
otp_suggestions_vehicle = otp_suggestions[otp_suggestions['otp_user_trip_id'].isin(vehicle_boarding_trip_ids)]
otp_suggestions_bus_legs = otp_suggestions_vehicle[otp_suggestions_vehicle['otp_mode'] == 'BUS']
otp_suggestions_walk_legs = otp_suggestions_vehicle[otp_suggestions_vehicle['otp_mode'] == 'WALK']

In [68]:
len(otp_suggestions)

8016

In [69]:
len(otp_suggestions_vehicle)

4452

### Reading and Preparing Bus Trips data

In [70]:
        # Find OTP Suggested Itineraries in BUSTE Data
        bus_trips_filepath = bus_trips_folderpath + os.sep + file_date_str + '_bus_trips.csv'
        bus_trips = pd.read_csv(bus_trips_filepath, dtype={'route': object},parse_dates=['gps_datetime']) \
                                        .sort_values(['route','busCode','tripNum','gps_datetime']) \
                                        .assign(route = lambda x: x['route'].astype(str).str.replace("\.0",'').str.zfill(3)) \
                                        .drop_duplicates()


In [71]:
bus_trips.head()

Unnamed: 0,route,busCode,shapeId,tripNum,stopPointId,gps_datetime,distanceTraveledShape,stop_lat,stop_lon,parent_station
29435,10,BB303,1708.0,1.0,33157.0,2017-05-01 06:28:07,537.974,-25.410517,-49.276479,
29436,10,BB303,1708.0,1.0,33159.0,2017-05-01 06:30:38,1141.061,-25.411726,-49.270902,
29437,10,BB303,1708.0,1.0,33158.0,2017-05-01 06:31:40,1624.751,-25.415285,-49.270134,
29438,10,BB303,1708.0,1.0,30150.0,2017-05-01 06:32:26,1972.077,-25.416733,-49.267863,
29439,10,BB303,1708.0,1.0,28637.0,2017-05-01 06:33:11,2378.349,-25.414184,-49.265917,


### Matching OTP Suggestions data to Bus Trips

In [72]:
bus_trips_clean = bus_trips.filter(['route','busCode','tripNum','stopPointId','gps_datetime'])

In [73]:
bus_trips_clean.head()

Unnamed: 0,route,busCode,tripNum,stopPointId,gps_datetime
29435,10,BB303,1.0,33157.0,2017-05-01 06:28:07
29436,10,BB303,1.0,33159.0,2017-05-01 06:30:38
29437,10,BB303,1.0,33158.0,2017-05-01 06:31:40
29438,10,BB303,1.0,30150.0,2017-05-01 06:32:26
29439,10,BB303,1.0,28637.0,2017-05-01 06:33:11


In [74]:
otp_suggestions_bus_legs.sort_values(['otp_user_trip_id','otp_itinerary_id','otp_leg_id']).head()

Unnamed: 0,otp_date,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_start_time,otp_end_time,otp_mode,otp_route,otp_from_stop_id,otp_to_stop_id,otp_duration_mins,otp_from_parent_station,otp_to_parent_station
0,2017-05-01,28150.0,1,1,2017-05-01 13:38:33,2017-05-01 13:45:00,BUS,827,33788.0,30993.0,6.45,,14508.0
2,2017-05-01,28150.0,1,3,2017-05-01 13:51:00,2017-05-01 14:01:59,BUS,303,26195.0,25753.0,10.983333,14508.0,41756.0
4,2017-05-01,28150.0,2,1,2017-05-01 13:42:58,2017-05-01 13:57:00,BUS,822,33788.0,30994.0,14.033333,,14508.0
6,2017-05-01,28150.0,2,3,2017-05-01 14:03:00,2017-05-01 14:15:19,BUS,303,26195.0,25753.0,12.316667,14508.0,41756.0
9,2017-05-01,28150.0,3,2,2017-05-01 14:03:39,2017-05-01 14:20:05,BUS,870,33980.0,35079.0,16.433333,,


In [101]:
scheduled_itin_observed_o = otp_suggestions_bus_legs.merge(bus_trips_clean.add_prefix('bt_'),
                                left_on=['otp_route','otp_from_stop_id'],
                                right_on=['bt_route','bt_stopPointId'],
                                how='inner') \
                                .drop(['bt_route','bt_stopPointId'], axis=1) \
                                .rename(index=str, columns={'bt_gps_datetime':'bt_start_time',
                                                            'bt_tripNum':'bt_trip_num',
                                                            'bt_busCode':'bt_bus_code'}) \
                                .assign(sched_obs_start_timediff = 
                                        lambda x: np.absolute(x['bt_start_time'] - x['otp_start_time']))
#scheduled_itin_observed_o = scheduled_itin_observed_o[scheduled_itin_observed_o['sched_obs_start_timediff'] <= pd.Timedelta(minutes=60)]

In [102]:
scheduled_itin_observed_o.head(10)

Unnamed: 0,otp_date,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_start_time,otp_end_time,otp_mode,otp_route,otp_from_stop_id,otp_to_stop_id,otp_duration_mins,otp_from_parent_station,otp_to_parent_station,bt_bus_code,bt_trip_num,bt_start_time,sched_obs_start_timediff
0,2017-05-01,28150.0,1,1,2017-05-01 13:38:33,2017-05-01 13:45:00,BUS,827,33788.0,30993.0,6.45,,14508.0,BC929,1.0,2017-05-01 05:02:20,08:36:13
1,2017-05-01,28150.0,1,1,2017-05-01 13:38:33,2017-05-01 13:45:00,BUS,827,33788.0,30993.0,6.45,,14508.0,BC929,2.0,2017-05-01 05:51:25,07:47:08
2,2017-05-01,28150.0,1,1,2017-05-01 13:38:33,2017-05-01 13:45:00,BUS,827,33788.0,30993.0,6.45,,14508.0,BC929,3.0,2017-05-01 06:33:45,07:04:48
3,2017-05-01,28150.0,1,1,2017-05-01 13:38:33,2017-05-01 13:45:00,BUS,827,33788.0,30993.0,6.45,,14508.0,BC929,4.0,2017-05-01 07:21:37,06:16:56
4,2017-05-01,28150.0,1,1,2017-05-01 13:38:33,2017-05-01 13:45:00,BUS,827,33788.0,30993.0,6.45,,14508.0,BC929,5.0,2017-05-01 08:07:09,05:31:24
5,2017-05-01,28150.0,1,1,2017-05-01 13:38:33,2017-05-01 13:45:00,BUS,827,33788.0,30993.0,6.45,,14508.0,BC929,6.0,2017-05-01 08:46:39,04:51:54
6,2017-05-01,28150.0,1,1,2017-05-01 13:38:33,2017-05-01 13:45:00,BUS,827,33788.0,30993.0,6.45,,14508.0,BC929,7.0,2017-05-01 09:35:13,04:03:20
7,2017-05-01,28150.0,1,1,2017-05-01 13:38:33,2017-05-01 13:45:00,BUS,827,33788.0,30993.0,6.45,,14508.0,BC929,8.0,2017-05-01 10:23:21,03:15:12
8,2017-05-01,28150.0,1,1,2017-05-01 13:38:33,2017-05-01 13:45:00,BUS,827,33788.0,30993.0,6.45,,14508.0,BC929,9.0,2017-05-01 11:08:39,02:29:54
9,2017-05-01,28150.0,1,1,2017-05-01 13:38:33,2017-05-01 13:45:00,BUS,827,33788.0,30993.0,6.45,,14508.0,BC929,10.0,2017-05-01 11:58:41,01:39:52


In [103]:
scheduled_itin_observed_od = scheduled_itin_observed_o.merge(bus_trips_clean.add_prefix('bt_'),
                                left_on=['otp_route','bt_bus_code','bt_trip_num','otp_to_stop_id'],
                                right_on=['bt_route','bt_busCode','bt_tripNum','bt_stopPointId'],
                                how='inner') \
                                .drop(['bt_route','bt_stopPointId'], axis=1) \
                                .rename(index=str, columns={'bt_gps_datetime':'bt_end_time'}) \
                                .assign(sched_obs_end_timediff = 
                                        lambda x: np.absolute(x['bt_end_time'] - x['otp_end_time'])) \
                                .sort_values(['otp_user_trip_id','otp_itinerary_id','otp_leg_id','sched_obs_start_timediff','sched_obs_end_timediff'])
#scheduled_itin_observed_od = scheduled_itin_observed_od[scheduled_itin_observed_od['sched_obs_end_timediff'] <= pd.Timedelta(minutes=60)]

In [104]:
scheduled_itin_observed_od_full = pd.concat([scheduled_itin_observed_od,otp_suggestions_walk_legs], sort=False)

In [105]:
scheduled_itin_observed_od_full.head()

Unnamed: 0,otp_date,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_start_time,otp_end_time,otp_mode,otp_route,otp_from_stop_id,otp_to_stop_id,...,otp_from_parent_station,otp_to_parent_station,bt_bus_code,bt_trip_num,bt_start_time,sched_obs_start_timediff,bt_busCode,bt_tripNum,bt_end_time,sched_obs_end_timediff
66,2017-05-01,28150.0,1,1,2017-05-01 13:38:33,2017-05-01 13:45:00,BUS,827,33788.0,30993.0,...,,14508.0,BC929,12.0,2017-05-01 13:35:18,00:03:15,BC929,12.0,2017-05-01 13:44:14,00:00:46
72,2017-05-01,28150.0,1,1,2017-05-01 13:38:33,2017-05-01 13:45:00,BUS,827,33788.0,30993.0,...,,14508.0,BC929,13.0,2017-05-01 14:22:34,00:44:01,BC929,13.0,2017-05-01 14:30:58,00:45:58
60,2017-05-01,28150.0,1,1,2017-05-01 13:38:33,2017-05-01 13:45:00,BUS,827,33788.0,30993.0,...,,14508.0,BC929,11.0,2017-05-01 12:47:49,00:50:44,BC929,11.0,2017-05-01 12:53:04,00:51:56
78,2017-05-01,28150.0,1,1,2017-05-01 13:38:33,2017-05-01 13:45:00,BUS,827,33788.0,30993.0,...,,14508.0,BC929,14.0,2017-05-01 15:11:15,01:32:42,BC929,14.0,2017-05-01 15:18:06,01:33:06
54,2017-05-01,28150.0,1,1,2017-05-01 13:38:33,2017-05-01 13:45:00,BUS,827,33788.0,30993.0,...,,14508.0,BC929,10.0,2017-05-01 11:58:41,01:39:52,BC929,10.0,2017-05-01 12:07:05,01:37:55


In [106]:
scheduled_itin_observed_od_full_clean = scheduled_itin_observed_od_full \
                            .filter(['otp_user_trip_id','otp_itinerary_id','otp_leg_id','otp_mode','otp_route',
                                     'bt_bus_code','bt_trip_num','otp_from_stop_id','otp_start_time',
                                     'bt_start_time','sched_obs_start_timediff','otp_to_stop_id',
                                     'otp_end_time','bt_end_time','sched_obs_end_timediff','otp_duration_mins']) \
                            .sort_values(['otp_user_trip_id','otp_itinerary_id','otp_leg_id'])

In [107]:
scheduled_itin_observed_od_full_clean

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,sched_obs_start_timediff,otp_to_stop_id,otp_end_time,bt_end_time,sched_obs_end_timediff,otp_duration_mins
66,28150.0,1,1,BUS,827,BC929,12.0,33788.0,2017-05-01 13:38:33,2017-05-01 13:35:18,0 days 00:03:15,30993.0,2017-05-01 13:45:00,2017-05-01 13:44:14,0 days 00:00:46,6.450000
72,28150.0,1,1,BUS,827,BC929,13.0,33788.0,2017-05-01 13:38:33,2017-05-01 14:22:34,0 days 00:44:01,30993.0,2017-05-01 13:45:00,2017-05-01 14:30:58,0 days 00:45:58,6.450000
60,28150.0,1,1,BUS,827,BC929,11.0,33788.0,2017-05-01 13:38:33,2017-05-01 12:47:49,0 days 00:50:44,30993.0,2017-05-01 13:45:00,2017-05-01 12:53:04,0 days 00:51:56,6.450000
78,28150.0,1,1,BUS,827,BC929,14.0,33788.0,2017-05-01 13:38:33,2017-05-01 15:11:15,0 days 01:32:42,30993.0,2017-05-01 13:45:00,2017-05-01 15:18:06,0 days 01:33:06,6.450000
54,28150.0,1,1,BUS,827,BC929,10.0,33788.0,2017-05-01 13:38:33,2017-05-01 11:58:41,0 days 01:39:52,30993.0,2017-05-01 13:45:00,2017-05-01 12:07:05,0 days 01:37:55,6.450000
84,28150.0,1,1,BUS,827,BC929,15.0,33788.0,2017-05-01 13:38:33,2017-05-01 15:59:34,0 days 02:21:01,30993.0,2017-05-01 13:45:00,2017-05-01 16:06:35,0 days 02:21:35,6.450000
48,28150.0,1,1,BUS,827,BC929,9.0,33788.0,2017-05-01 13:38:33,2017-05-01 11:08:39,0 days 02:29:54,30993.0,2017-05-01 13:45:00,2017-05-01 11:19:47,0 days 02:25:13,6.450000
90,28150.0,1,1,BUS,827,BC929,16.0,33788.0,2017-05-01 13:38:33,2017-05-01 16:48:58,0 days 03:10:25,30993.0,2017-05-01 13:45:00,2017-05-01 16:55:23,0 days 03:10:23,6.450000
42,28150.0,1,1,BUS,827,BC929,8.0,33788.0,2017-05-01 13:38:33,2017-05-01 10:23:21,0 days 03:15:12,30993.0,2017-05-01 13:45:00,2017-05-01 10:33:22,0 days 03:11:38,6.450000
96,28150.0,1,1,BUS,827,BC929,17.0,33788.0,2017-05-01 13:38:33,2017-05-01 17:35:37,0 days 03:57:04,30993.0,2017-05-01 13:45:00,2017-05-01 17:41:38,0 days 03:56:38,6.450000


### Filtering out leg matches whose observed start_time is earlier than the boarding record start_time or are more than 15 minutes later

In [108]:
first_bus_legs = scheduled_itin_observed_od_full_clean.query('otp_mode == \'BUS\'')\
                                .groupby(['otp_user_trip_id','otp_itinerary_id']) \
                                .first() \
                                .reset_index()

In [128]:
first_bus_legs[(first_bus_legs['otp_user_trip_id'] == 28150.0) & (first_bus_legs['otp_itinerary_id'] == 2)]

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,sched_obs_start_timediff,otp_to_stop_id,otp_end_time,bt_end_time,sched_obs_end_timediff,otp_duration_mins
1,28150.0,2,3,BUS,303,DE722,4.0,26195.0,2017-05-01 14:03:00,2017-05-01 14:02:48,00:00:12,25753.0,2017-05-01 14:15:19,2017-05-01 14:12:10,00:03:09,12.316667


In [126]:
first_bus_legs[first_bus_legs['otp_leg_id'] > 2]

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,sched_obs_start_timediff,otp_to_stop_id,otp_end_time,bt_end_time,sched_obs_end_timediff,otp_duration_mins
1,28150.0,2,3,BUS,303,DE722,4.0,26195.0,2017-05-01 14:03:00,2017-05-01 14:02:48,00:00:12,25753.0,2017-05-01 14:15:19,2017-05-01 14:12:10,00:03:09,12.316667
3,28150.0,4,3,BUS,303,LE710,4.0,26195.0,2017-05-01 14:15:00,2017-05-01 14:18:42,00:03:42,25753.0,2017-05-01 14:25:59,2017-05-01 12:25:36,02:00:23,10.983333
6,28150.0,7,3,BUS,303,DE708,5.0,26195.0,2017-05-01 14:33:00,2017-05-01 14:29:24,00:03:36,25753.0,2017-05-01 14:43:59,2017-05-01 14:37:26,00:06:33,10.983333
8,28150.0,9,3,BUS,303,LE702,5.0,26195.0,2017-05-01 14:45:00,2017-05-01 14:41:47,00:03:13,25753.0,2017-05-01 14:57:19,2017-05-01 14:50:32,00:06:47,12.316667
9,28150.0,10,3,BUS,303,DE719,5.0,26195.0,2017-05-01 15:02:00,2017-05-01 15:04:59,00:02:59,25753.0,2017-05-01 15:14:19,2017-05-01 15:13:00,00:01:19,12.316667
34,28161.0,5,3,BUS,303,DE715,4.0,26198.0,2017-05-01 14:03:36,2017-05-01 14:02:26,00:01:10,25646.0,2017-05-01 14:16:02,2017-05-01 14:12:36,00:03:26,12.433333
59,28169.0,1,4,BUS,30,KB605,3.0,27636.0,2017-05-01 13:56:00,2017-05-01 13:43:50,00:12:10,32610.0,2017-05-01 14:11:26,2017-05-01 11:16:16,02:55:10,15.433333
64,28169.0,6,4,BUS,625,GA149,6.0,27639.0,2017-05-01 14:21:00,2017-05-01 14:25:52,00:04:52,36064.0,2017-05-01 14:44:54,2017-05-01 14:47:07,00:02:13,23.9
65,28169.0,7,4,BUS,625,GA164,4.0,27639.0,2017-05-01 14:38:00,2017-05-01 14:46:56,00:08:56,36064.0,2017-05-01 15:01:54,2017-05-01 15:06:51,00:04:57,23.9
71,28172.0,3,4,BUS,30,KB604,4.0,32546.0,2017-05-01 14:07:02,2017-05-01 14:18:17,00:11:15,30860.0,2017-05-01 14:25:46,2017-05-01 14:32:02,00:06:16,18.733333


In [109]:
vehicle_boarding_start_time = trips_origins.filter(['o_boarding_id','o_boarding_datetime'])

In [110]:
vehicle_boarding_start_time.head()

Unnamed: 0,o_boarding_id,o_boarding_datetime
0,180.0,2017-05-01 04:48:42
1,40392.0,2017-05-01 15:52:39
2,181.0,2017-05-01 04:49:13
3,31560.0,2017-05-01 14:16:55
4,182.0,2017-05-01 04:49:19


In [125]:
first_bus_legs_time_validation = first_bus_legs.merge(vehicle_boarding_start_time, 
                                                      left_on='otp_user_trip_id',
                                                      right_on='o_boarding_id',
                                                      how='inner') \
                                                .assign(pass_obs_start_diff = lambda x: x.bt_start_time - x.o_boarding_datetime)

In [123]:
first_bus_legs_time_validation

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,sched_obs_start_timediff,otp_to_stop_id,otp_end_time,bt_end_time,sched_obs_end_timediff,otp_duration_mins,o_boarding_id,o_boarding_datetime,pass_obs_start_diff
0,28150.0,1,1,BUS,827,BC929,12.0,33788.0,2017-05-01 13:38:33,2017-05-01 13:35:18,00:03:15,30993.0,2017-05-01 13:45:00,2017-05-01 13:44:14,00:00:46,6.450000,28150.0,2017-05-01 13:36:25,-1 days +23:58:53
1,28150.0,2,3,BUS,303,DE722,4.0,26195.0,2017-05-01 14:03:00,2017-05-01 14:02:48,00:00:12,25753.0,2017-05-01 14:15:19,2017-05-01 14:12:10,00:03:09,12.316667,28150.0,2017-05-01 13:36:25,00:26:23
2,28150.0,3,2,BUS,870,BC283,8.0,33980.0,2017-05-01 14:03:39,2017-05-01 13:57:41,00:05:58,35079.0,2017-05-01 14:20:05,2017-05-01 14:07:33,00:12:32,16.433333,28150.0,2017-05-01 13:36:25,00:21:16
3,28150.0,4,3,BUS,303,LE710,4.0,26195.0,2017-05-01 14:15:00,2017-05-01 14:18:42,00:03:42,25753.0,2017-05-01 14:25:59,2017-05-01 12:25:36,02:00:23,10.983333,28150.0,2017-05-01 13:36:25,00:42:17
4,28150.0,5,2,BUS,821,MA013,9.0,33973.0,2017-05-01 14:10:00,2017-05-01 13:44:07,00:25:53,30995.0,2017-05-01 14:14:00,2017-05-01 13:46:33,00:27:27,4.000000,28150.0,2017-05-01 13:36:25,00:07:42
5,28150.0,6,2,BUS,870,BC014,8.0,33980.0,2017-05-01 14:18:39,2017-05-01 14:21:56,00:03:17,35079.0,2017-05-01 14:35:05,2017-05-01 14:30:25,00:04:40,16.433333,28150.0,2017-05-01 13:36:25,00:45:31
6,28150.0,7,3,BUS,303,DE708,5.0,26195.0,2017-05-01 14:33:00,2017-05-01 14:29:24,00:03:36,25753.0,2017-05-01 14:43:59,2017-05-01 14:37:26,00:06:33,10.983333,28150.0,2017-05-01 13:36:25,00:52:59
7,28150.0,8,1,BUS,827,BC929,13.0,33788.0,2017-05-01 14:28:17,2017-05-01 14:22:34,00:05:43,30993.0,2017-05-01 14:35:00,2017-05-01 14:30:58,00:04:02,6.716667,28150.0,2017-05-01 13:36:25,00:46:09
8,28150.0,9,3,BUS,303,LE702,5.0,26195.0,2017-05-01 14:45:00,2017-05-01 14:41:47,00:03:13,25753.0,2017-05-01 14:57:19,2017-05-01 14:50:32,00:06:47,12.316667,28150.0,2017-05-01 13:36:25,01:05:22
9,28150.0,10,3,BUS,303,DE719,5.0,26195.0,2017-05-01 15:02:00,2017-05-01 15:04:59,00:02:59,25753.0,2017-05-01 15:14:19,2017-05-01 15:13:00,00:01:19,12.316667,28150.0,2017-05-01 13:36:25,01:28:34


In [121]:
first_bus_legs_time_validation[first_bus_legs_time_validation['otp_leg_id'] > 2]

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,sched_obs_start_timediff,otp_to_stop_id,otp_end_time,bt_end_time,sched_obs_end_timediff,otp_duration_mins,o_boarding_id,o_boarding_datetime,pass_obs_start_diff
1,28150.0,2,3,BUS,303,DE722,4.0,26195.0,2017-05-01 14:03:00,2017-05-01 14:02:48,00:00:12,25753.0,2017-05-01 14:15:19,2017-05-01 14:12:10,00:03:09,12.316667,28150.0,2017-05-01 13:36:25,00:26:23
3,28150.0,4,3,BUS,303,LE710,4.0,26195.0,2017-05-01 14:15:00,2017-05-01 14:18:42,00:03:42,25753.0,2017-05-01 14:25:59,2017-05-01 12:25:36,02:00:23,10.983333,28150.0,2017-05-01 13:36:25,00:42:17
6,28150.0,7,3,BUS,303,DE708,5.0,26195.0,2017-05-01 14:33:00,2017-05-01 14:29:24,00:03:36,25753.0,2017-05-01 14:43:59,2017-05-01 14:37:26,00:06:33,10.983333,28150.0,2017-05-01 13:36:25,00:52:59
8,28150.0,9,3,BUS,303,LE702,5.0,26195.0,2017-05-01 14:45:00,2017-05-01 14:41:47,00:03:13,25753.0,2017-05-01 14:57:19,2017-05-01 14:50:32,00:06:47,12.316667,28150.0,2017-05-01 13:36:25,01:05:22
9,28150.0,10,3,BUS,303,DE719,5.0,26195.0,2017-05-01 15:02:00,2017-05-01 15:04:59,00:02:59,25753.0,2017-05-01 15:14:19,2017-05-01 15:13:00,00:01:19,12.316667,28150.0,2017-05-01 13:36:25,01:28:34
34,28161.0,5,3,BUS,303,DE715,4.0,26198.0,2017-05-01 14:03:36,2017-05-01 14:02:26,00:01:10,25646.0,2017-05-01 14:16:02,2017-05-01 14:12:36,00:03:26,12.433333,28161.0,2017-05-01 13:36:33,00:25:53
59,28169.0,1,4,BUS,30,KB605,3.0,27636.0,2017-05-01 13:56:00,2017-05-01 13:43:50,00:12:10,32610.0,2017-05-01 14:11:26,2017-05-01 11:16:16,02:55:10,15.433333,28169.0,2017-05-01 13:36:36,00:07:14
64,28169.0,6,4,BUS,625,GA149,6.0,27639.0,2017-05-01 14:21:00,2017-05-01 14:25:52,00:04:52,36064.0,2017-05-01 14:44:54,2017-05-01 14:47:07,00:02:13,23.9,28169.0,2017-05-01 13:36:36,00:49:16
65,28169.0,7,4,BUS,625,GA164,4.0,27639.0,2017-05-01 14:38:00,2017-05-01 14:46:56,00:08:56,36064.0,2017-05-01 15:01:54,2017-05-01 15:06:51,00:04:57,23.9,28169.0,2017-05-01 13:36:36,01:10:20
71,28172.0,3,4,BUS,30,KB604,4.0,32546.0,2017-05-01 14:07:02,2017-05-01 14:18:17,00:11:15,30860.0,2017-05-01 14:25:46,2017-05-01 14:32:02,00:06:16,18.733333,28172.0,2017-05-01 13:36:41,00:41:36


In [115]:
valid_first_bus_legs_matches = first_bus_legs_time_validation[
    (first_bus_legs_time_validation['pass_obs_start_diff'] > pd.Timedelta('0s')) &
    (first_bus_legs_time_validation['pass_obs_start_diff'] <= pd.Timedelta('15m'))] \
    .sort_values(['otp_user_trip_id','otp_itinerary_id','otp_leg_id','bt_start_time'])

In [116]:
valid_first_bus_legs_matches

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,sched_obs_start_timediff,otp_to_stop_id,otp_end_time,bt_end_time,sched_obs_end_timediff,otp_duration_mins,o_boarding_id,o_boarding_datetime,pass_obs_start_diff
4,28150.0,5,2,BUS,821,MA013,9.0,33973.0,2017-05-01 14:10:00,2017-05-01 13:44:07,00:25:53,30995.0,2017-05-01 14:14:00,2017-05-01 13:46:33,00:27:27,4.000000,28150.0,2017-05-01 13:36:25,00:07:42
20,28156.0,1,1,BUS,619,HA020,10.0,36299.0,2017-05-01 13:38:40,2017-05-01 13:47:28,00:08:48,30434.0,2017-05-01 14:14:00,2017-05-01 14:09:40,00:04:20,35.333333,28156.0,2017-05-01 13:36:28,00:11:00
22,28156.0,3,1,BUS,619,HA020,10.0,36299.0,2017-05-01 13:55:43,2017-05-01 13:47:28,00:08:15,30434.0,2017-05-01 14:26:00,2017-05-01 14:09:40,00:16:20,30.283333,28156.0,2017-05-01 13:36:28,00:11:00
27,28156.0,8,1,BUS,619,HA020,10.0,36299.0,2017-05-01 14:14:30,2017-05-01 13:47:28,00:27:02,33244.0,2017-05-01 14:34:32,2017-05-01 14:00:05,00:34:27,20.033333,28156.0,2017-05-01 13:36:28,00:11:00
31,28161.0,2,2,BUS,303,LE702,5.0,25544.0,2017-05-01 13:51:32,2017-05-01 13:50:17,00:01:15,25646.0,2017-05-01 14:05:38,2017-05-01 14:04:25,00:01:13,14.100000,28161.0,2017-05-01 13:36:33,00:13:44
36,28161.0,8,2,BUS,462,DC093,7.0,33025.0,2017-05-01 13:50:45,2017-05-01 13:44:34,00:06:11,30746.0,2017-05-01 14:07:57,2017-05-01 13:55:26,00:12:31,17.200000,28161.0,2017-05-01 13:36:33,00:08:01
59,28169.0,1,4,BUS,030,KB605,3.0,27636.0,2017-05-01 13:56:00,2017-05-01 13:43:50,00:12:10,32610.0,2017-05-01 14:11:26,2017-05-01 11:16:16,02:55:10,15.433333,28169.0,2017-05-01 13:36:36,00:07:14
60,28169.0,2,2,BUS,658,HA027,5.0,35911.0,2017-05-01 13:46:18,2017-05-01 13:42:19,00:03:59,27620.0,2017-05-01 14:03:00,2017-05-01 13:51:50,00:11:10,16.700000,28169.0,2017-05-01 13:36:36,00:05:43
70,28172.0,2,1,BUS,332,DA015,22.0,27786.0,2017-05-01 13:35:58,2017-05-01 13:40:21,00:04:23,30123.0,2017-05-01 13:40:00,2017-05-01 13:47:17,00:07:17,4.033333,28172.0,2017-05-01 13:36:41,00:03:40
72,28172.0,4,1,BUS,332,DA015,22.0,27786.0,2017-05-01 13:47:58,2017-05-01 13:40:21,00:07:37,30123.0,2017-05-01 13:52:00,2017-05-01 13:47:17,00:04:43,4.033333,28172.0,2017-05-01 13:36:41,00:03:40


In [None]:
valid_itineraries

#### Filtering out invalid itineraries

In [130]:
scheduled_itin_observed_od_full_clean.head()

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,sched_obs_start_timediff,otp_to_stop_id,otp_end_time,bt_end_time,sched_obs_end_timediff,otp_duration_mins
66,28150.0,1,1,BUS,827,BC929,12.0,33788.0,2017-05-01 13:38:33,2017-05-01 13:35:18,00:03:15,30993.0,2017-05-01 13:45:00,2017-05-01 13:44:14,00:00:46,6.45
72,28150.0,1,1,BUS,827,BC929,13.0,33788.0,2017-05-01 13:38:33,2017-05-01 14:22:34,00:44:01,30993.0,2017-05-01 13:45:00,2017-05-01 14:30:58,00:45:58,6.45
60,28150.0,1,1,BUS,827,BC929,11.0,33788.0,2017-05-01 13:38:33,2017-05-01 12:47:49,00:50:44,30993.0,2017-05-01 13:45:00,2017-05-01 12:53:04,00:51:56,6.45
78,28150.0,1,1,BUS,827,BC929,14.0,33788.0,2017-05-01 13:38:33,2017-05-01 15:11:15,01:32:42,30993.0,2017-05-01 13:45:00,2017-05-01 15:18:06,01:33:06,6.45
54,28150.0,1,1,BUS,827,BC929,10.0,33788.0,2017-05-01 13:38:33,2017-05-01 11:58:41,01:39:52,30993.0,2017-05-01 13:45:00,2017-05-01 12:07:05,01:37:55,6.45


In [132]:
len(scheduled_itin_observed_od_full_clean)

80033

In [187]:
valid_first_bus_itineraris_matches_keys = valid_first_bus_legs_matches.filter(['otp_user_trip_id','otp_itinerary_id']).drop_duplicates()

In [188]:
valid_first_bus_itineraris_matches_keys.head()

Unnamed: 0,otp_user_trip_id,otp_itinerary_id
4,28150.0,5
20,28156.0,1
22,28156.0,3
27,28156.0,8
31,28161.0,2


In [189]:
valid_sched_obs_itins = scheduled_itin_observed_od_full_clean.merge(valid_first_bus_itineraris_matches_keys, how='inner')

In [190]:
len(valid_sched_obs_itins)

14035

In [191]:
valid_sched_obs_itins

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,sched_obs_start_timediff,otp_to_stop_id,otp_end_time,bt_end_time,sched_obs_end_timediff,otp_duration_mins
0,28150.0,5,1,WALK,,,,,2017-05-01 13:53:08,NaT,NaT,,2017-05-01 14:09:59,NaT,NaT,16.850000
1,28150.0,5,2,BUS,821,MA013,9.0,33973.0,2017-05-01 14:10:00,2017-05-01 13:44:07,00:25:53,30995.0,2017-05-01 14:14:00,2017-05-01 13:46:33,00:27:27,4.000000
2,28150.0,5,2,BUS,821,MA013,10.0,33973.0,2017-05-01 14:10:00,2017-05-01 14:42:01,00:32:01,30995.0,2017-05-01 14:14:00,2017-05-01 14:45:51,00:31:51,4.000000
3,28150.0,5,2,BUS,821,MA013,8.0,33973.0,2017-05-01 14:10:00,2017-05-01 12:44:09,01:25:51,30995.0,2017-05-01 14:14:00,2017-05-01 12:48:35,01:25:25,4.000000
4,28150.0,5,2,BUS,821,MA013,11.0,33973.0,2017-05-01 14:10:00,2017-05-01 15:44:37,01:34:37,30995.0,2017-05-01 14:14:00,2017-05-01 15:48:45,01:34:45,4.000000
5,28150.0,5,2,BUS,821,MA013,7.0,33973.0,2017-05-01 14:10:00,2017-05-01 11:44:09,02:25:51,30995.0,2017-05-01 14:14:00,2017-05-01 11:48:50,02:25:10,4.000000
6,28150.0,5,2,BUS,821,MA013,12.0,33973.0,2017-05-01 14:10:00,2017-05-01 16:45:07,02:35:07,30995.0,2017-05-01 14:14:00,2017-05-01 16:48:05,02:34:05,4.000000
7,28150.0,5,2,BUS,821,MA013,6.0,33973.0,2017-05-01 14:10:00,2017-05-01 10:43:17,03:26:43,30995.0,2017-05-01 14:14:00,2017-05-01 10:47:08,03:26:52,4.000000
8,28150.0,5,2,BUS,821,MA013,13.0,33973.0,2017-05-01 14:10:00,2017-05-01 17:44:10,03:34:10,30995.0,2017-05-01 14:14:00,2017-05-01 17:50:36,03:36:36,4.000000
9,28150.0,5,2,BUS,821,MA013,5.0,33973.0,2017-05-01 14:10:00,2017-05-01 09:41:56,04:28:04,30995.0,2017-05-01 14:14:00,2017-05-01 09:45:09,04:28:51,4.000000


### Filtering out itineraries which lost bus legs along the processing

In [192]:
original_suggested_itins_num_legs = otp_suggestions.groupby(['otp_user_trip_id','otp_itinerary_id']) \
                                    .agg({'otp_leg_id': lambda x: len(x)}) \
                                    .reset_index() \
                                    .rename(index=str, columns={'otp_leg_id':'num_legs'})

In [193]:
original_suggested_itins_num_legs.head()

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,num_legs
0,28150.0,1,4
1,28150.0,2,4
2,28150.0,3,3
3,28150.0,4,4
4,28150.0,5,5


In [194]:
final_matched_itins_num_legs = valid_sched_obs_itins.groupby(['otp_user_trip_id','otp_itinerary_id']) \
                                    .agg({'otp_leg_id': lambda x: len(np.unique(x))}) \
                                    .reset_index() \
                                    .rename(index=str, columns={'otp_leg_id':'num_legs'})

In [195]:
final_matched_itins_num_legs.head()

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,num_legs
0,28150.0,5,5
1,28156.0,1,3
2,28156.0,3,3
3,28156.0,8,5
4,28161.0,2,3


In [196]:
complete_matched_itins = original_suggested_itins_num_legs.merge(final_matched_itins_num_legs, how='inner')

In [197]:
complete_matched_itins.head()

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,num_legs
0,28150.0,5,5
1,28161.0,2,3
2,28161.0,8,3
3,28169.0,2,5
4,28172.0,2,6


In [294]:
valid_complete_first_bus_legs_matches = valid_first_bus_legs_matches.merge(complete_matched_itins.drop('num_legs', axis=1),
                                                                  how='inner')

In [295]:
len(valid_first_bus_legs_matches)

184

In [296]:
len(valid_complete_first_bus_legs_matches)

155

In [198]:
complete_matched_itins_legs_options = valid_sched_obs_itins.merge(complete_matched_itins.drop('num_legs', axis=1),
                                                                  how='inner')

In [289]:
len(valid_sched_obs_itins)

14035

In [290]:
len(complete_matched_itins_legs_options)

12317

### Substitute first bus legs per valid first bus legs

In [291]:
complete_matched_itins_legs_options.iloc[50:100,]

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,sched_obs_start_timediff,otp_to_stop_id,otp_end_time,bt_end_time,sched_obs_end_timediff,otp_duration_mins
50,28150.0,5,4,BUS,303,DE713,6.0,26195.0,2017-05-01 14:21:00,2017-05-01 17:19:18,02:58:18,25753.0,2017-05-01 14:33:19,2017-05-01 17:27:59,02:54:40,12.316667
51,28150.0,5,4,BUS,303,DE713,3.0,26195.0,2017-05-01 14:21:00,2017-05-01 11:16:41,03:04:19,25753.0,2017-05-01 14:33:19,2017-05-01 11:23:51,03:09:28,12.316667
52,28150.0,5,4,BUS,303,DE710,6.0,26195.0,2017-05-01 14:21:00,2017-05-01 17:29:54,03:08:54,25753.0,2017-05-01 14:33:19,2017-05-01 17:37:35,03:04:16,12.316667
53,28150.0,5,4,BUS,303,LE700,6.0,26195.0,2017-05-01 14:21:00,2017-05-01 17:36:36,03:15:36,25753.0,2017-05-01 14:33:19,2017-05-01 15:51:39,01:18:20,12.316667
54,28150.0,5,4,BUS,303,DE719,3.0,26195.0,2017-05-01 14:21:00,2017-05-01 11:02:21,03:18:39,25753.0,2017-05-01 14:33:19,2017-05-01 11:12:20,03:20:59,12.316667
55,28150.0,5,4,BUS,303,DE715,2.0,26195.0,2017-05-01 14:21:00,2017-05-01 10:54:57,03:26:03,25753.0,2017-05-01 14:33:19,2017-05-01 09:00:55,05:32:24,12.316667
56,28150.0,5,4,BUS,303,DE716,6.0,26195.0,2017-05-01 14:21:00,2017-05-01 17:52:58,03:31:58,25753.0,2017-05-01 14:33:19,2017-05-01 16:00:03,01:26:44,12.316667
57,28150.0,5,4,BUS,303,LE702,3.0,26195.0,2017-05-01 14:21:00,2017-05-01 10:40:50,03:40:10,25753.0,2017-05-01 14:33:19,2017-05-01 10:49:42,03:43:37,12.316667
58,28150.0,5,4,BUS,303,DE722,6.0,26195.0,2017-05-01 14:21:00,2017-05-01 18:03:01,03:42:01,25753.0,2017-05-01 14:33:19,2017-05-01 18:11:53,03:38:34,12.316667
59,28150.0,5,4,BUS,303,DE708,3.0,26195.0,2017-05-01 14:21:00,2017-05-01 10:27:31,03:53:29,25753.0,2017-05-01 14:33:19,2017-05-01 10:36:31,03:56:48,12.316667


In [297]:
valid_complete_first_bus_legs_matches.head()

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,sched_obs_start_timediff,otp_to_stop_id,otp_end_time,bt_end_time,sched_obs_end_timediff,otp_duration_mins,o_boarding_id,o_boarding_datetime,pass_obs_start_diff
0,28150.0,5,2,BUS,821,MA013,9.0,33973.0,2017-05-01 14:10:00,2017-05-01 13:44:07,00:25:53,30995.0,2017-05-01 14:14:00,2017-05-01 13:46:33,00:27:27,4.0,28150.0,2017-05-01 13:36:25,00:07:42
1,28161.0,2,2,BUS,303,LE702,5.0,25544.0,2017-05-01 13:51:32,2017-05-01 13:50:17,00:01:15,25646.0,2017-05-01 14:05:38,2017-05-01 14:04:25,00:01:13,14.1,28161.0,2017-05-01 13:36:33,00:13:44
2,28161.0,8,2,BUS,462,DC093,7.0,33025.0,2017-05-01 13:50:45,2017-05-01 13:44:34,00:06:11,30746.0,2017-05-01 14:07:57,2017-05-01 13:55:26,00:12:31,17.2,28161.0,2017-05-01 13:36:33,00:08:01
3,28169.0,2,2,BUS,658,HA027,5.0,35911.0,2017-05-01 13:46:18,2017-05-01 13:42:19,00:03:59,27620.0,2017-05-01 14:03:00,2017-05-01 13:51:50,00:11:10,16.7,28169.0,2017-05-01 13:36:36,00:05:43
4,28172.0,2,1,BUS,332,DA015,22.0,27786.0,2017-05-01 13:35:58,2017-05-01 13:40:21,00:04:23,30123.0,2017-05-01 13:40:00,2017-05-01 13:47:17,00:07:17,4.033333,28172.0,2017-05-01 13:36:41,00:03:40


In [308]:
valid_bus_legs_matches_rest = complete_matched_itins_legs_options.merge(valid_complete_first_bus_legs_matches \
                                            .filter(['otp_user_trip_id','otp_itinerary_id','otp_leg_id']),
                                          how='left', 
                                          indicator=True) \
                                    .query('_merge == \'left_only\'') \
                                    .drop('_merge', axis=1)

In [309]:
valid_bus_legs_matches_rest

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,sched_obs_start_timediff,otp_to_stop_id,otp_end_time,bt_end_time,sched_obs_end_timediff,otp_duration_mins
0,28150.0,5,1,WALK,,,,,2017-05-01 13:53:08,NaT,NaT,,2017-05-01 14:09:59,NaT,NaT,16.850000
20,28150.0,5,3,WALK,,,,,2017-05-01 14:14:00,NaT,NaT,,2017-05-01 14:14:26,NaT,NaT,0.433333
21,28150.0,5,4,BUS,303,LE710,4.0,26195.0,2017-05-01 14:21:00,2017-05-01 14:18:42,00:02:18,25753.0,2017-05-01 14:33:19,2017-05-01 12:25:36,02:07:43,12.316667
22,28150.0,5,4,BUS,303,DE708,5.0,26195.0,2017-05-01 14:21:00,2017-05-01 14:29:24,00:08:24,25753.0,2017-05-01 14:33:19,2017-05-01 14:37:26,00:04:07,12.316667
23,28150.0,5,4,BUS,303,DE722,4.0,26195.0,2017-05-01 14:21:00,2017-05-01 14:02:48,00:18:12,25753.0,2017-05-01 14:33:19,2017-05-01 14:12:10,00:21:09,12.316667
24,28150.0,5,4,BUS,303,LE702,5.0,26195.0,2017-05-01 14:21:00,2017-05-01 14:41:47,00:20:47,25753.0,2017-05-01 14:33:19,2017-05-01 14:50:32,00:17:13,12.316667
25,28150.0,5,4,BUS,303,DE716,4.0,26195.0,2017-05-01 14:21:00,2017-05-01 13:50:21,00:30:39,25753.0,2017-05-01 14:33:19,2017-05-01 12:00:07,02:33:12,12.316667
26,28150.0,5,4,BUS,303,DE715,4.0,26195.0,2017-05-01 14:21:00,2017-05-01 14:56:11,00:35:11,25753.0,2017-05-01 14:33:19,2017-05-01 13:05:00,01:28:19,12.316667
27,28150.0,5,4,BUS,303,LE700,4.0,26195.0,2017-05-01 14:21:00,2017-05-01 13:37:03,00:43:57,25753.0,2017-05-01 14:33:19,2017-05-01 11:47:48,02:45:31,12.316667
28,28150.0,5,4,BUS,303,DE719,5.0,26195.0,2017-05-01 14:21:00,2017-05-01 15:04:59,00:43:59,25753.0,2017-05-01 14:33:19,2017-05-01 15:13:00,00:39:41,12.316667


In [310]:
valid_complete_first_bus_legs_matches.head()

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,sched_obs_start_timediff,otp_to_stop_id,otp_end_time,bt_end_time,sched_obs_end_timediff,otp_duration_mins,o_boarding_id,o_boarding_datetime,pass_obs_start_diff
0,28150.0,5,2,BUS,821,MA013,9.0,33973.0,2017-05-01 14:10:00,2017-05-01 13:44:07,00:25:53,30995.0,2017-05-01 14:14:00,2017-05-01 13:46:33,00:27:27,4.0,28150.0,2017-05-01 13:36:25,00:07:42
1,28161.0,2,2,BUS,303,LE702,5.0,25544.0,2017-05-01 13:51:32,2017-05-01 13:50:17,00:01:15,25646.0,2017-05-01 14:05:38,2017-05-01 14:04:25,00:01:13,14.1,28161.0,2017-05-01 13:36:33,00:13:44
2,28161.0,8,2,BUS,462,DC093,7.0,33025.0,2017-05-01 13:50:45,2017-05-01 13:44:34,00:06:11,30746.0,2017-05-01 14:07:57,2017-05-01 13:55:26,00:12:31,17.2,28161.0,2017-05-01 13:36:33,00:08:01
3,28169.0,2,2,BUS,658,HA027,5.0,35911.0,2017-05-01 13:46:18,2017-05-01 13:42:19,00:03:59,27620.0,2017-05-01 14:03:00,2017-05-01 13:51:50,00:11:10,16.7,28169.0,2017-05-01 13:36:36,00:05:43
4,28172.0,2,1,BUS,332,DA015,22.0,27786.0,2017-05-01 13:35:58,2017-05-01 13:40:21,00:04:23,30123.0,2017-05-01 13:40:00,2017-05-01 13:47:17,00:07:17,4.033333,28172.0,2017-05-01 13:36:41,00:03:40


In [311]:
valid_bus_legs_matches_all = pd.concat([valid_complete_first_bus_legs_matches.drop(['o_boarding_id','o_boarding_datetime','pass_obs_start_diff'], axis=1),
                                        valid_bus_legs_matches_rest], sort=False) \
                                .sort_values(['otp_user_trip_id','otp_itinerary_id','otp_leg_id','bt_start_time'])

In [314]:
valid_bus_legs_matches_all

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,sched_obs_start_timediff,otp_to_stop_id,otp_end_time,bt_end_time,sched_obs_end_timediff,otp_duration_mins
0,28150.0,5,1,WALK,,,,,2017-05-01 13:53:08,NaT,NaT,,2017-05-01 14:09:59,NaT,NaT,16.850000
0,28150.0,5,2,BUS,821,MA013,9.0,33973.0,2017-05-01 14:10:00,2017-05-01 13:44:07,00:25:53,30995.0,2017-05-01 14:14:00,2017-05-01 13:46:33,00:27:27,4.000000
20,28150.0,5,3,WALK,,,,,2017-05-01 14:14:00,NaT,NaT,,2017-05-01 14:14:26,NaT,NaT,0.433333
100,28150.0,5,4,BUS,303,DE708,1.0,26195.0,2017-05-01 14:21:00,2017-05-01 06:29:10,07:51:50,25753.0,2017-05-01 14:33:19,2017-05-01 06:36:54,07:56:25,12.316667
98,28150.0,5,4,BUS,303,LE702,1.0,26195.0,2017-05-01 14:21:00,2017-05-01 06:37:11,07:43:49,25753.0,2017-05-01 14:33:19,2017-05-01 06:47:25,07:45:54,12.316667
94,28150.0,5,4,BUS,303,DE719,1.0,26195.0,2017-05-01 14:21:00,2017-05-01 07:01:26,07:19:34,25753.0,2017-05-01 14:33:19,2017-05-01 07:13:11,07:20:08,12.316667
93,28150.0,5,4,BUS,303,DE713,1.0,26195.0,2017-05-01 14:21:00,2017-05-01 07:15:53,07:05:07,25753.0,2017-05-01 14:33:19,2017-05-01 07:24:14,07:09:05,12.316667
91,28150.0,5,4,BUS,303,DE710,1.0,26195.0,2017-05-01 14:21:00,2017-05-01 07:24:41,06:56:19,25753.0,2017-05-01 14:33:19,2017-05-01 07:36:59,06:56:20,12.316667
89,28150.0,5,4,BUS,303,LE700,1.0,26195.0,2017-05-01 14:21:00,2017-05-01 07:39:38,06:41:22,25753.0,2017-05-01 14:33:19,2017-05-01 05:47:45,08:45:34,12.316667
86,28150.0,5,4,BUS,303,DE716,1.0,26195.0,2017-05-01 14:21:00,2017-05-01 07:51:44,06:29:16,25753.0,2017-05-01 14:33:19,2017-05-01 06:02:32,08:30:47,12.316667


In [315]:
len(valid_bus_legs_matches_all)

6638

In [338]:
valid_bus_legs_matches_all[valid_bus_legs_matches_all['otp_user_trip_id'] == 28169.0]

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,sched_obs_start_timediff,otp_to_stop_id,otp_end_time,bt_end_time,sched_obs_end_timediff,otp_duration_mins
215,28169.0,2,1,WALK,,,,,2017-05-01 13:42:03,NaT,NaT,,2017-05-01 13:46:17,NaT,NaT,4.233333
3,28169.0,2,2,BUS,658.0,HA027,5.0,35911.0,2017-05-01 13:46:18,2017-05-01 13:42:19,00:03:59,27620.0,2017-05-01 14:03:00,2017-05-01 13:51:50,00:11:10,16.7
269,28169.0,2,3,WALK,,,,,2017-05-01 14:03:00,NaT,NaT,,2017-05-01 14:03:03,NaT,NaT,0.05
309,28169.0,2,4,BUS,625.0,GA162,1.0,27639.0,2017-05-01 14:05:00,2017-05-01 05:58:59,08:06:01,36064.0,2017-05-01 14:28:54,2017-05-01 06:18:17,08:10:37,23.9
307,28169.0,2,4,BUS,625.0,GA161,1.0,27639.0,2017-05-01 14:05:00,2017-05-01 06:26:07,07:38:53,36064.0,2017-05-01 14:28:54,2017-05-01 06:49:36,07:39:18,23.9
305,28169.0,2,4,BUS,625.0,GA162,2.0,27639.0,2017-05-01 14:05:00,2017-05-01 06:56:07,07:08:53,36064.0,2017-05-01 14:28:54,2017-05-01 07:18:17,07:10:37,23.9
303,28169.0,2,4,BUS,625.0,GA161,2.0,27639.0,2017-05-01 14:05:00,2017-05-01 07:26:07,06:38:53,36064.0,2017-05-01 14:28:54,2017-05-01 07:49:06,06:39:48,23.9
302,28169.0,2,4,BUS,625.0,GA162,3.0,27639.0,2017-05-01 14:05:00,2017-05-01 07:54:30,06:10:30,36064.0,2017-05-01 14:28:54,2017-05-01 08:19:57,06:08:57,23.9
299,28169.0,2,4,BUS,625.0,GA161,3.0,27639.0,2017-05-01 14:05:00,2017-05-01 08:26:05,05:38:55,36064.0,2017-05-01 14:28:54,2017-05-01 08:49:13,05:39:41,23.9
295,28169.0,2,4,BUS,625.0,GA162,4.0,27639.0,2017-05-01 14:05:00,2017-05-01 08:57:30,05:07:30,36064.0,2017-05-01 14:28:54,2017-05-01 09:18:58,05:09:56,23.9


### Choosing Feasible Legs

In [331]:
def choose_leg_matches(leg_matches_groups):
        colnames = leg_matches_groups.obj.columns.values
        chosen_leg_matches = pd.DataFrame(columns = colnames)
        prev_itin_id = -1
        prev_leg_mode = ""
        prev_leg_end_time = pd.NaT
        num_groups_not_survived = 0

        for name, group in leg_matches_groups:
            
                #print
                #print "Name:", name
                #print "Group:"
                #print group
                #print
                curr_itin_id = group['otp_itinerary_id'].iloc[0]
                curr_leg_mode = group['otp_mode'].iloc[0]
                curr_leg_id = group['otp_leg_id'].iloc[0]
                
                if (prev_itin_id != curr_itin_id): #new itinerary
                    prev_leg_end_time = group['otp_start_time'].dt.floor('d').iloc[0]

                #if (prev_group_id == ()):
                #        prev_leg_end_time = group['bt_start_time'].dt.floor('d')[0]

                #print
                #print "Previous itinerary id:", prev_itin_id
                #print "Previous leg mode:", prev_leg_mode
                #print "Previous leg end time:", prev_leg_end_time
                #print "Current leg id:", curr_leg_id
                #print "Current leg mode:", curr_leg_mode
                #print
                #print "Original Group"
                #print group.filter(['otp_start_time','bt_start_time','bt_end_time'])
                
                if (curr_leg_mode == 'WALK'):
                    #print "Walking duration:", filtered_group['otp_duration_mins']
                    filtered_group = group.reset_index()
                    if (prev_itin_id != curr_itin_id): #first leg is a WALK leg
                        filtered_group.loc[0,'bt_end_time'] = prev_leg_end_time
                    else:
                        filtered_group.loc[0,'bt_start_time'] = prev_leg_end_time
                        filtered_group.loc[0,'bt_end_time'] = prev_leg_end_time + \
                            pd.Timedelta(minutes=np.rint(filtered_group['otp_duration_mins'].iloc[0]))
                    #print "Filtered Group"
                    #print filtered_group
                else:
                    filtered_group = group[group['bt_start_time'] > prev_leg_end_time]
                
                #print
                #print "Filtered Group"
                #print filtered_group.filter(['otp_start_time','bt_start_time','bt_end_time'])

                if (len(filtered_group) == 0):
                        #print "Group did not survive! =("
                        num_groups_not_survived += 1
                        continue

                chosen_leg_match = filtered_group.sort_values('bt_start_time').iloc[0]
                
                if ((curr_leg_id == 2) & 
                    ((curr_leg_mode == 'BUS') & (prev_leg_mode == 'WALK'))):
                        #Update previous walk start/end_times
                        #print
                        #print "Chosen Leg Matches"
                        #print chosen_leg_matches.iloc[-1]
                        #print
                        chosen_leg_matches.iloc[-1,chosen_leg_matches.columns.get_loc('bt_start_time')] = chosen_leg_match['bt_start_time'] - \
                            pd.Timedelta(minutes=np.rint(chosen_leg_matches.iloc[-1].otp_duration_mins))
                        chosen_leg_matches.iloc[-1,chosen_leg_matches.columns.get_loc('bt_end_time')] = chosen_leg_match['bt_start_time']
                #print "Chosen Leg"
                #print chosen_leg_match

                chosen_leg_matches = chosen_leg_matches.append(chosen_leg_match)

                #Update variables
                #prev_group_id = name
                prev_itin_id = curr_itin_id
                prev_leg_mode = curr_leg_mode
                prev_leg_end_time = chosen_leg_match['bt_end_time']

        #print num_groups_not_survived
        return chosen_leg_matches.filter(colnames)


In [332]:
sample_leg_matches = valid_bus_legs_matches_all[valid_bus_legs_matches_all['otp_user_trip_id'] == 28169.0]

In [333]:
sample_leg_matches

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,sched_obs_start_timediff,otp_to_stop_id,otp_end_time,bt_end_time,sched_obs_end_timediff,otp_duration_mins
215,28169.0,2,1,WALK,,,,,2017-05-01 13:42:03,NaT,NaT,,2017-05-01 13:46:17,NaT,NaT,4.233333
3,28169.0,2,2,BUS,658.0,HA027,5.0,35911.0,2017-05-01 13:46:18,2017-05-01 13:42:19,00:03:59,27620.0,2017-05-01 14:03:00,2017-05-01 13:51:50,00:11:10,16.7
269,28169.0,2,3,WALK,,,,,2017-05-01 14:03:00,NaT,NaT,,2017-05-01 14:03:03,NaT,NaT,0.05
309,28169.0,2,4,BUS,625.0,GA162,1.0,27639.0,2017-05-01 14:05:00,2017-05-01 05:58:59,08:06:01,36064.0,2017-05-01 14:28:54,2017-05-01 06:18:17,08:10:37,23.9
307,28169.0,2,4,BUS,625.0,GA161,1.0,27639.0,2017-05-01 14:05:00,2017-05-01 06:26:07,07:38:53,36064.0,2017-05-01 14:28:54,2017-05-01 06:49:36,07:39:18,23.9
305,28169.0,2,4,BUS,625.0,GA162,2.0,27639.0,2017-05-01 14:05:00,2017-05-01 06:56:07,07:08:53,36064.0,2017-05-01 14:28:54,2017-05-01 07:18:17,07:10:37,23.9
303,28169.0,2,4,BUS,625.0,GA161,2.0,27639.0,2017-05-01 14:05:00,2017-05-01 07:26:07,06:38:53,36064.0,2017-05-01 14:28:54,2017-05-01 07:49:06,06:39:48,23.9
302,28169.0,2,4,BUS,625.0,GA162,3.0,27639.0,2017-05-01 14:05:00,2017-05-01 07:54:30,06:10:30,36064.0,2017-05-01 14:28:54,2017-05-01 08:19:57,06:08:57,23.9
299,28169.0,2,4,BUS,625.0,GA161,3.0,27639.0,2017-05-01 14:05:00,2017-05-01 08:26:05,05:38:55,36064.0,2017-05-01 14:28:54,2017-05-01 08:49:13,05:39:41,23.9
295,28169.0,2,4,BUS,625.0,GA162,4.0,27639.0,2017-05-01 14:05:00,2017-05-01 08:57:30,05:07:30,36064.0,2017-05-01 14:28:54,2017-05-01 09:18:58,05:09:56,23.9


In [336]:
start = time.time()
feasible_itins_legs = choose_leg_matches(valid_bus_legs_matches_all.groupby(['otp_user_trip_id','otp_itinerary_id','otp_leg_id']))
#feasible_itins_legs = choose_leg_matches(sample_leg_matches.groupby(['otp_user_trip_id','otp_itinerary_id','otp_leg_id']))
end = time.time()

print "Time to get feasible legs:", end-start

Time to get feasible legs: 5.1568570137


In [337]:
feasible_itins_legs

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,sched_obs_start_timediff,otp_to_stop_id,otp_end_time,bt_end_time,sched_obs_end_timediff,otp_duration_mins
0,28150.0,5,1,WALK,,,,,2017-05-01 13:53:08,2017-05-01 13:27:07,NaT,,2017-05-01 14:09:59,2017-05-01 13:44:07,NaT,16.850000
0,28150.0,5,2,BUS,821,MA013,9.0,33973.0,2017-05-01 14:10:00,2017-05-01 13:44:07,0 days 00:25:53,30995.0,2017-05-01 14:14:00,2017-05-01 13:46:33,0 days 00:27:27,4.000000
0,28150.0,5,3,WALK,,,,,2017-05-01 14:14:00,2017-05-01 13:46:33,NaT,,2017-05-01 14:14:26,2017-05-01 13:46:33,NaT,0.433333
25,28150.0,5,4,BUS,303,DE716,4.0,26195.0,2017-05-01 14:21:00,2017-05-01 13:50:21,0 days 00:30:39,25753.0,2017-05-01 14:33:19,2017-05-01 12:00:07,0 days 02:33:12,12.316667
0,28150.0,5,5,WALK,,,,,2017-05-01 14:33:20,2017-05-01 12:00:07,NaT,,2017-05-01 14:40:52,2017-05-01 12:08:07,NaT,7.533333
0,28161.0,2,1,WALK,,,,,2017-05-01 13:37:32,2017-05-01 13:36:17,NaT,,2017-05-01 13:51:31,2017-05-01 13:50:17,NaT,13.983333
1,28161.0,2,2,BUS,303,LE702,5.0,25544.0,2017-05-01 13:51:32,2017-05-01 13:50:17,0 days 00:01:15,25646.0,2017-05-01 14:05:38,2017-05-01 14:04:25,0 days 00:01:13,14.100000
0,28161.0,2,3,WALK,,,,,2017-05-01 14:05:39,2017-05-01 14:04:25,NaT,,2017-05-01 14:05:41,2017-05-01 14:04:25,NaT,0.033333
0,28161.0,8,1,WALK,,,,,2017-05-01 13:45:52,2017-05-01 13:39:34,NaT,,2017-05-01 13:50:44,2017-05-01 13:44:34,NaT,4.866667
2,28161.0,8,2,BUS,462,DC093,7.0,33025.0,2017-05-01 13:50:45,2017-05-01 13:44:34,0 days 00:06:11,30746.0,2017-05-01 14:07:57,2017-05-01 13:55:26,0 days 00:12:31,17.200000


In [221]:
feasible_itins_legs.tail(1)

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,sched_obs_start_timediff,otp_to_stop_id,otp_end_time,bt_end_time,sched_obs_end_timediff,otp_duration_mins,o_boarding_id,o_boarding_datetime
0,71056.0,4,8,WALK,,,,,2017-05-02 00:31:29,NaT,NaT,,2017-05-02 00:34:42,NaT,NaT,3.216667,,NaT


In [224]:
feasible_itins_legs.iloc[-1].otp_leg_id = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value
