In [88]:
#Libraries

#Python Libs
import sys
import os
import glob
import traceback
from datetime import datetime
import time
from geopy import distance


#Data Analysis Libs
import pandas as pd
import numpy as np


In [89]:
#Functions
def select_input_files(enh_buste_base_path,init_date,fin_date,suffix):
        selected_files = []
        all_files = glob.glob(os.path.join(enh_buste_base_path,"*"))

        for file_ in all_files:
                try:
                        file_date = pd.to_datetime(file_.split('/')[-1],format=('%Y_%m_%d' + suffix  + '.csv'))
                        if (file_date >= init_date) and (file_date <= fin_date):
                                selected_files.append((file_,file_date))
                except:
                        continue

        return sorted(selected_files)

def dist(p1_lat, p1_lon, p2_lat, p2_lon):
    return np.around(distance.geodesic((p1_lat,p1_lon),(p2_lat,p2_lon)).km,decimals=5)

def get_router_id(query_date):
    INTERMEDIATE_OTP_DATE = pd.to_datetime("2017-06-30", format="%Y-%m-%d")
    router_id = ''

    if (query_date <= INTERMEDIATE_OTP_DATE):
        return 'ctba-2017-1'
    else:
        return 'ctba-2017-2'

In [90]:
def match_terminal_boardings(selected_trips,itineraries_start):
        terminal_boarding_origins = selected_trips[(selected_trips['o_busCode'].str.isdigit()) & (selected_trips['o_route'] != '021')]
        matched_terminal_boardings = terminal_boarding_origins.merge(itineraries_start,
                                                                                                                                left_on=['o_boarding_id','o_stopPointId'],
                                                                                                                                right_on=['user_trip_id','from_parent_station'], how='inner')
        num_matched_terminal_boardings = len(matched_terminal_boardings.drop_duplicates(subset=['cardNum','o_boarding_id']))
        if num_matched_terminal_boardings == 0:
            matched_perc = 0.0
        else:
            matched_perc = 100*(num_matched_terminal_boardings/float(len(terminal_boarding_origins)))
        return (matched_terminal_boardings,num_matched_terminal_boardings,matched_perc)

def match_terminal_021_boardings(selected_trips,itineraries_start):
        terminal_021_origins = selected_trips[(selected_trips['o_busCode'].str.isdigit()) & (selected_trips['o_route'] == '021')]
        matched_021_terminal_boardings = terminal_021_origins.merge(itineraries_start, left_on=['o_boarding_id','o_route','o_stopPointId'], right_on=['user_trip_id','route','from_parent_station'], how='inner')
        num_matched_021_terminal_boardings = len(matched_021_terminal_boardings.drop_duplicates(subset=['cardNum','o_boarding_id']))
        if num_matched_021_terminal_boardings == 0:
                terminal_021_match_perc = 0.0
        else:
                terminal_021_match_perc = 100*(num_matched_021_terminal_boardings/float(len(terminal_021_origins)))
        return (len(terminal_021_origins),matched_021_terminal_boardings,num_matched_021_terminal_boardings,terminal_021_match_perc)


In [91]:
def add_stops_data_to_leg_matches(chosen_leg_matches,stops_locations):
        chosen_leg_matches_data = chosen_leg_matches.merge(stops_locations, left_on='from_stop_id', right_on='stop_id', how='left') \
                                                                                .drop('stop_id', axis=1) \
                                                                                .rename(index=str, columns={'stop_lat':'from_stop_lat','stop_lon':'from_stop_lon'}) \
                                                                                .merge(stops_locations, left_on='to_stop_id', right_on='stop_id', how='left') \
                                                                                .drop('stop_id', axis=1) \
                                                                                .rename(index=str, columns={'stop_lat':'to_stop_lat','stop_lon':'to_stop_lon'}) \
                                                                                .merge(user_trips_ids, on=['user_trip_id'], how='inner') \
                                                                                [np.append(np.append(['cardNum'],otp_legs_buste.columns.values),['from_stop_lat','from_stop_lon','to_stop_lat','to_stop_lon'])]
        return chosen_leg_matches_data


In [92]:
def build_candidate_itineraries_df(chosen_leg_matches_data):
        otp_buste_itineraries = chosen_leg_matches_data \
                                        .groupby(['cardNum','user_trip_id','itinerary_id']) \
                                        .agg({'from_stop_id': lambda x: x.iloc[0],
                                                  'matched_start_time': lambda x: x.iloc[0],
                                                  'from_stop_lat': lambda x: x.iloc[0],
                                                  'from_stop_lon': lambda x: x.iloc[0],
                                                  'to_stop_id': lambda x: x.iloc[-1],
                                                  'matched_end_time': lambda x: x.iloc[-1],
                                                  'to_stop_lat': lambda x: x.iloc[-1],
                                                  'to_stop_lon': lambda x: x.iloc[-1],
                                                  'leg_id': lambda x: len(x),
                                                  'first_vehicle_boarding' : lambda x: x.any()}) \
                                        .rename(index=str, columns={'leg_id':'num_transfers','first_vehicle_boarding':'vehicle_boarding'}) \
                                        .add_prefix('match_') \
                                        .reset_index() \
                                        .assign(cardNum = lambda x: x['cardNum'].astype(float),
                                                        user_trip_id = lambda x: x['user_trip_id'].astype(int),
                                                        itinerary_id = lambda x: x['itinerary_id'].astype(int))
        return otp_buste_itineraries


In [93]:
def get_candidate_itineraries_summary(candidate_itineraries,boarding_suggestions_matches):
        otp_buste_itineraries_summary = candidate_itineraries \
                                        .merge(boarding_suggestions_matches \
                                                                .drop_duplicates(subset=['cardNum','user_trip_id','itinerary_id']),
                                                        on=['cardNum','user_trip_id','itinerary_id'],
                                                        how='inner') \
                                        [['cardNum', 'user_trip_id', 'itinerary_id',
                                                  'match_from_stop_id', 'match_matched_start_time', 'o_boarding_datetime',
                                                  'match_from_stop_lat', 'match_from_stop_lon', 'o_stop_lat', 'o_stop_lon',
                                                  'match_to_stop_id', 'match_matched_end_time', 'next_o_boarding_datetime',
                                                  'match_to_stop_lat', 'match_to_stop_lon', 'next_o_stop_lat', 'next_o_stop_lon','match_num_transfers', 'match_vehicle_boarding']] \
                                        .assign(start_diff = lambda x: np.absolute(x['match_matched_start_time'] - x['o_boarding_datetime']),
                                                        trip_duration = lambda x: x['match_matched_end_time'] - x['match_matched_start_time'],
                                                        origin_dist = lambda y: y.apply(lambda x: dist(x['match_from_stop_lat'], x['match_from_stop_lon'], x['o_stop_lat'], x['o_stop_lon']),axis=1),
                                                        next_origin_dist = lambda y: y.apply(lambda x: dist(x['match_to_stop_lat'], x['match_to_stop_lon'], x['next_o_stop_lat'], x['next_o_stop_lon']),axis=1)) \
                                        .sort_values(['cardNum','user_trip_id'])

        otp_buste_itineraries_summary = otp_buste_itineraries_summary[((otp_buste_itineraries_summary['trip_duration'] > pd.Timedelta('0s')) & 
                                                                                                                                        (otp_buste_itineraries_summary['trip_duration'] < pd.Timedelta('2h'))) &
                                                                                                                                ((otp_buste_itineraries_summary['start_diff'] > pd.Timedelta('0s')) &
                                                                                                                                        (otp_buste_itineraries_summary['start_diff'] < pd.Timedelta('1.5h')))] \
                                                                                .query('origin_dist < 0.1') \
                                                                                .query('next_origin_dist < 2.0')
        return otp_buste_itineraries_summary


In [94]:
def get_candidate_itineraries_penalty_score(otp_buste_itineraries_filtered):
        otp_buste_itineraries_penalty = otp_buste_itineraries_filtered \
                                                        .assign(penalty = lambda x: 2*x['start_diff'].dt.total_seconds() + x['trip_duration'].dt.total_seconds() + x['match_num_transfers']*10) \
                                                        [['cardNum','user_trip_id','itinerary_id','match_num_transfers','match_vehicle_boarding','next_origin_dist','origin_dist','start_diff','trip_duration','penalty']] \
                                                        .sort_values(['user_trip_id','penalty'], ascending=True)
        return otp_buste_itineraries_penalty


## Main

In [96]:
otp_suggestions_filepath = '/local/tarciso/data/otp_itineraries/2017_05_01_user_trips_100_otp_itineraries.csv'
user_trips_folderpath = '/local/tarciso/data/enhanced-buste/user_trips/'
bus_trips_folderpath = '/local/tarciso/data/enhanced-buste/bus_trips/'
gtfs_base_folderpath = '/local/tarciso/data/gtfs/'
output_folderpath = '/local/tarciso/data/test-odmat/'

In [97]:
file_date_str = otp_suggestions_filepath.split('/')[-1].split('_user_trips_')[0]
file_date = pd.to_datetime(file_date_str,format='%Y_%m_%d')
print "Processing File:", otp_suggestions_filepath

Processing File: /local/tarciso/data/otp_itineraries/2017_05_01_user_trips_100_otp_itineraries.csv


In [98]:
        # Extracting itinerary part name for later use
        itinerary_part_name = otp_suggestions_filepath.split('/')[-1].split('_')[5]
        # Read OTP Suggestions
        otp_suggestions_raw = pd.read_csv(otp_suggestions_filepath, parse_dates=['date','otp_start_time','otp_end_time'])

        if len(otp_suggestions_raw) == 0:
            print "Zero OTP suggestions found."
            print "Skipping next steps..."
            exit(0)


In [99]:
def prepare_otp_data(otp_data):
        #Fixing prefix
        otp_data.columns = otp_data.columns.str.replace('otp_','')
        otp_data = otp_data.add_prefix('otp_')
        
        #Fixing Timezone difference - when needed
        otp_data['otp_start_time'] = otp_data['otp_start_time'] - pd.Timedelta('10800 s')
        otp_data['otp_end_time'] = otp_data['otp_end_time'] - pd.Timedelta('10800 s')
        
        #Adjusting route format to have 3 numbers
        otp_data['otp_route'] = otp_data['otp_route'].astype(str)
        otp_data['otp_route'] = np.where(otp_data['otp_mode'] == 'BUS',
                            otp_data['otp_route'].astype(str).str.replace("\.0",'').str.zfill(3),
                            otp_data['otp_route'])

        return otp_data

In [100]:
        # Prepare OTP data for analysis
        otp_suggestions = prepare_otp_data(otp_suggestions_raw)

        # Read stops data
        stops_filepath = gtfs_base_folderpath + os.sep + get_router_id(file_date) + os.sep + 'stops.txt'
        stops_df = pd.read_csv(stops_filepath)

        # Adding Parent Stop data to OTP Suggestions
        stops_parent_stations = stops_df[['stop_id','parent_station']]
        otp_suggestions = otp_suggestions.merge(stops_parent_stations.add_prefix('from_'),
                                                left_on='otp_from_stop_id',
                                                right_on='from_stop_id',
                                                how='left') \
                                        .merge(stops_parent_stations.add_prefix('to_'),
                                                left_on='otp_to_stop_id',
                                                right_on='to_stop_id',
                                                how='left') \
                                        .drop(['from_stop_id','to_stop_id'], axis=1) \
                                        .rename(index=str, columns={'from_parent_station':'otp_from_parent_station',
                                                                    'to_parent_station':'otp_to_parent_station'})
        
        otp_suggestions_bus_legs = otp_suggestions[otp_suggestions['otp_mode'] == 'BUS']
        otp_suggestions_walk_legs = otp_suggestions[otp_suggestions['otp_mode'] == 'WALK']


In [101]:
otp_suggestions.head()

Unnamed: 0,otp_date,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_start_time,otp_end_time,otp_mode,otp_route,otp_from_stop_id,otp_to_stop_id,otp_duration_mins,otp_from_parent_station,otp_to_parent_station
0,2017-05-01,28150.0,1,1,2017-05-01 13:38:33,2017-05-01 13:45:00,BUS,827.0,33788.0,30993.0,6.45,,14508.0
1,2017-05-01,28150.0,1,2,2017-05-01 13:45:00,2017-05-01 13:45:52,WALK,,,,0.866667,,
2,2017-05-01,28150.0,1,3,2017-05-01 13:51:00,2017-05-01 14:01:59,BUS,303.0,26195.0,25753.0,10.983333,14508.0,41756.0
3,2017-05-01,28150.0,1,4,2017-05-01 14:02:00,2017-05-01 14:09:32,WALK,,,,7.533333,,
4,2017-05-01,28150.0,2,1,2017-05-01 13:42:58,2017-05-01 13:57:00,BUS,822.0,33788.0,30994.0,14.033333,,14508.0


## Match Scheduled Itineraries to Observed Bus Trips

### Read and Prepare Bus Trips data

In [102]:
        # Find OTP Suggested Itineraries in BUSTE Data
        bus_trips_filepath = bus_trips_folderpath + os.sep + file_date_str + '_bus_trips.csv'
        bus_trips = pd.read_csv(bus_trips_filepath, dtype={'route': object},parse_dates=['gps_datetime']) \
                                        .sort_values(['route','busCode','tripNum','gps_datetime']) \
                                        .assign(route = lambda x: x['route'].astype(str).str.replace("\.0",'').str.zfill(3))


In [103]:
bus_trips.head()

Unnamed: 0,route,busCode,shapeId,tripNum,stopPointId,gps_datetime,distanceTraveledShape,stop_lat,stop_lon,parent_station
29435,10,BB303,1708.0,1.0,33157.0,2017-05-01 06:28:07,537.974,-25.410517,-49.276479,
29436,10,BB303,1708.0,1.0,33159.0,2017-05-01 06:30:38,1141.061,-25.411726,-49.270902,
29437,10,BB303,1708.0,1.0,33158.0,2017-05-01 06:31:40,1624.751,-25.415285,-49.270134,
29438,10,BB303,1708.0,1.0,30150.0,2017-05-01 06:32:26,1972.077,-25.416733,-49.267863,
29439,10,BB303,1708.0,1.0,28637.0,2017-05-01 06:33:11,2378.349,-25.414184,-49.265917,


In [104]:
bus_trips.dtypes

route                            object
busCode                          object
shapeId                         float64
tripNum                         float64
stopPointId                     float64
gps_datetime             datetime64[ns]
distanceTraveledShape           float64
stop_lat                        float64
stop_lon                        float64
parent_station                  float64
dtype: object

In [105]:
otp_suggestions_bus_legs.dtypes

otp_date                   datetime64[ns]
otp_user_trip_id                  float64
otp_itinerary_id                    int64
otp_leg_id                          int64
otp_start_time             datetime64[ns]
otp_end_time               datetime64[ns]
otp_mode                           object
otp_route                          object
otp_from_stop_id                  float64
otp_to_stop_id                    float64
otp_duration_mins                 float64
otp_from_parent_station           float64
otp_to_parent_station             float64
dtype: object

In [106]:
scheduled_itin_observed_od = otp_suggestions_bus_legs.merge(bus_trips.add_prefix('bt_from_'),
                                left_on=['otp_route','otp_from_stop_id'],
                                right_on=['bt_from_route','bt_from_stopPointId'],
                                how='inner') \
                                .assign(sched_obs_start_timediff = 
                                        lambda x: np.absolute(x['bt_from_gps_datetime'] - x['otp_start_time']))
scheduled_itin_observed_od = scheduled_itin_observed_od[scheduled_itin_observed_od['sched_obs_start_timediff'] <= pd.Timedelta(minutes=60)]

In [107]:
scheduled_itin_observed_od = scheduled_itin_observed_od.merge(bus_trips.add_prefix('bt_to_'),
                                left_on=['otp_route','bt_from_busCode','bt_from_tripNum','otp_to_stop_id'],
                                right_on=['bt_to_route','bt_to_busCode','bt_to_tripNum','bt_to_stopPointId'],
                                how='inner') \
                                .assign(sched_obs_end_timediff = 
                                        lambda x: np.absolute(x['bt_to_gps_datetime'] - x['otp_end_time'])) \
                                .sort_values(['otp_user_trip_id','otp_itinerary_id','otp_leg_id','sched_obs_start_timediff','sched_obs_end_timediff'])
scheduled_itin_observed_od = scheduled_itin_observed_od[scheduled_itin_observed_od['sched_obs_end_timediff'] <= pd.Timedelta(minutes=60)]

In [108]:
scheduled_itin_observed_od_full = pd.concat([scheduled_itin_observed_od,otp_suggestions_walk_legs])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [109]:
#bus_trips[(bus_trips['route'] == '827') & (bus_trips['stopPointId'] == 33788)].sort_values(['gps_datetime'])

In [110]:
scheduled_itin_observed_od_full.filter(['otp_user_trip_id','otp_itinerary_id','otp_leg_id','otp_route','bt_from_busCode','bt_from_tripNum','bt_from_stopPointId','otp_start_time','bt_from_gps_datetime','sched_obs_start_timediff','bt_to_stopPointId','otp_end_time','bt_to_gps_datetime','sched_obs_end_timediff']) \
                            .sort_values(['otp_user_trip_id','otp_itinerary_id','otp_leg_id'])

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_route,bt_from_busCode,bt_from_tripNum,bt_from_stopPointId,otp_start_time,bt_from_gps_datetime,sched_obs_start_timediff,bt_to_stopPointId,otp_end_time,bt_to_gps_datetime,sched_obs_end_timediff
3,28150.0,1,1,827,BC929,12.0,33788.0,2017-05-01 13:38:33,2017-05-01 13:35:18,00:03:15,30993.0,2017-05-01 13:45:00,2017-05-01 13:44:14,00:00:46
4,28150.0,1,1,827,BC929,12.0,33788.0,2017-05-01 13:38:33,2017-05-01 13:35:18,00:03:15,30993.0,2017-05-01 13:45:00,2017-05-01 13:44:14,00:00:46
5,28150.0,1,1,827,BC929,12.0,33788.0,2017-05-01 13:38:33,2017-05-01 13:35:18,00:03:15,30993.0,2017-05-01 13:45:00,2017-05-01 13:44:14,00:00:46
6,28150.0,1,1,827,BC929,12.0,33788.0,2017-05-01 13:38:33,2017-05-01 13:35:18,00:03:15,30993.0,2017-05-01 13:45:00,2017-05-01 13:44:14,00:00:46
15,28150.0,1,1,827,BC929,13.0,33788.0,2017-05-01 13:38:33,2017-05-01 14:22:34,00:44:01,30993.0,2017-05-01 13:45:00,2017-05-01 14:30:58,00:45:58
0,28150.0,1,1,827,BC929,11.0,33788.0,2017-05-01 13:38:33,2017-05-01 12:47:49,00:50:44,30993.0,2017-05-01 13:45:00,2017-05-01 12:53:04,00:51:56
1,28150.0,1,2,,,,,2017-05-01 13:45:00,NaT,NaT,,2017-05-01 13:45:52,NaT,NaT
110,28150.0,1,3,303,DE722,4.0,26195.0,2017-05-01 13:51:00,2017-05-01 14:02:48,00:11:48,25753.0,2017-05-01 14:01:59,2017-05-01 14:12:10,00:10:11
51,28150.0,1,3,303,DE710,4.0,26195.0,2017-05-01 13:51:00,2017-05-01 13:27:30,00:23:30,25753.0,2017-05-01 14:01:59,2017-05-01 13:35:59,00:26:00
67,28150.0,1,3,303,DE713,4.0,26195.0,2017-05-01 13:51:00,2017-05-01 13:16:34,00:34:26,25753.0,2017-05-01 14:01:59,2017-05-01 13:24:23,00:37:36


In [111]:
#Run choose_legs on scheduled_itin_observed_od

In [112]:
scheduled_itin_observed_od_earliest = scheduled_itin_observed_od_full.sort_values(['otp_user_trip_id','otp_itinerary_id','otp_leg_id','sched_obs_start_timediff','sched_obs_end_timediff']) \
                        .groupby(['otp_user_trip_id','otp_itinerary_id','otp_leg_id']) \
                        .first() \
                        .filter(['otp_route','otp_mode','stopPointId','otp_start_time','bt_from_gps_datetime','sched_obs_start_timediff','sched_obs_end_timediff']) \
                        .reset_index()

In [113]:
scheduled_itin_observed_od_earliest

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_route,otp_mode,otp_start_time,bt_from_gps_datetime,sched_obs_start_timediff,sched_obs_end_timediff
0,28150.0,1,1,827,BUS,2017-05-01 13:38:33,2017-05-01 13:35:18,00:03:15,00:00:46
1,28150.0,1,2,,WALK,2017-05-01 13:45:00,NaT,NaT,NaT
2,28150.0,1,3,303,BUS,2017-05-01 13:51:00,2017-05-01 14:02:48,00:11:48,00:10:11
3,28150.0,1,4,,WALK,2017-05-01 14:02:00,NaT,NaT,NaT
4,28150.0,2,2,,WALK,2017-05-01 13:57:00,NaT,NaT,NaT
5,28150.0,2,3,303,BUS,2017-05-01 14:03:00,2017-05-01 14:02:48,00:00:12,00:03:09
6,28150.0,2,4,,WALK,2017-05-01 14:15:20,NaT,NaT,NaT
7,28150.0,3,1,,WALK,2017-05-01 13:41:55,NaT,NaT,NaT
8,28150.0,3,2,870,BUS,2017-05-01 14:03:39,2017-05-01 13:57:41,00:05:58,00:12:32
9,28150.0,3,3,,WALK,2017-05-01 14:20:06,NaT,NaT,NaT


In [114]:
scheduled_itin_observed_od_earliest[scheduled_itin_observed_od_earliest['otp_mode'] == 'BUS'].sched_obs_start_timediff.quantile(.95)

Timedelta('0 days 00:28:10.500000')

### Read and Prepare Origin/Next-Origin Pairs data

In [115]:
def compatible_dates(otp_data,ticketing_data):
        otp_date = otp_data['otp_date'].iloc[0]
        ticketing_date = pd.to_datetime(ticketing_data['o_boarding_datetime'].dt.strftime('%Y-%m-%d')[0])

        return (otp_date == ticketing_date,otp_date,ticketing_date)


In [116]:
        # Read Origin/Next-Origin Pairs for the same date
        trips_origins_filepath = user_trips_folderpath + os.sep + file_date_str + '_user_trips.csv'
        trips_on_pairs_full = pd.read_csv(trips_origins_filepath,
                                                parse_dates=['o_boarding_datetime','o_gps_datetime','next_o_boarding_datetime','next_o_gps_datetime'])
        # Checking whether OTP and ticketing dates match
        dates_compatibility, otp_date, ticketing_date = compatible_dates(otp_suggestions,trips_on_pairs_full)
        if not dates_compatibility:
                print "ERROR: OTP date", otp_date, "does not match Ticketing data", ticketing_date
                print "Skipping current day"
                exit(1)
        
        trips_on_pairs = trips_on_pairs_full.filter(['o_boarding_id','next_o_boarding_id'])
        trips_origins = trips_on_pairs_full.filter([col for col in trips_on_pairs_full.columns if col.startswith('o_')])
        trips_next_origins = trips_on_pairs_full.filter([col for col in trips_on_pairs_full.columns if col.startswith('next_o_')])


In [117]:
trips_origins.head()

Unnamed: 0,o_boarding_id,o_boarding_datetime,o_gps_datetime,o_route,o_busCode,o_tripNum,o_stopPointId,o_stop_lat,o_stop_lon
0,180.0,2017-05-01 04:48:42,2017-05-01 04:48:17,650,HA030,1.0,36293.0,-25.568989,-49.332253
1,40392.0,2017-05-01 15:52:39,NaT,21,08044,,41791.0,-25.431951,-49.296491
2,181.0,2017-05-01 04:49:13,2017-05-01 04:49:08,650,HA030,1.0,36294.0,-25.56686,-49.3329
3,31560.0,2017-05-01 14:16:55,NaT,0,00038,,26051.0,-25.447479,-49.263816
4,182.0,2017-05-01 04:49:19,2017-05-01 04:49:08,650,HA030,1.0,36294.0,-25.56686,-49.3329


In [118]:
trips_next_origins.head()

Unnamed: 0,next_o_boarding_id,next_o_boarding_datetime,next_o_gps_datetime,next_o_route,next_o_busCode,next_o_tripNum,next_o_stopPointId,next_o_stop_lat,next_o_stop_lon
0,40392.0,2017-05-01 15:52:39,NaT,21,08044,,41791.0,-25.431951,-49.296491
1,180.0,2017-05-01 04:48:42,2017-05-01 04:48:17,650,HA030,1.0,36293.0,-25.568989,-49.332253
2,31560.0,2017-05-01 14:16:55,NaT,0,00038,,26051.0,-25.447479,-49.263816
3,181.0,2017-05-01 04:49:13,2017-05-01 04:49:08,650,HA030,1.0,36294.0,-25.56686,-49.3329
4,61724.0,2017-05-01 19:27:16,2017-05-01 19:26:26,814,LA851,21.0,30284.0,-25.432534,-49.338889


In [119]:
trips_on_pairs.head()

Unnamed: 0,o_boarding_id,next_o_boarding_id
0,180.0,40392.0
1,40392.0,180.0
2,181.0,31560.0
3,31560.0,181.0
4,182.0,61724.0


In [120]:
        # Selecting trips for whom OTP suggestions were found
        selected_trips = trips_origins[trips_origins['o_boarding_id'].isin(otp_suggestions['otp_user_trip_id'])]
        num_selected_trips = len(selected_trips)


In [121]:
        # Matching all kinds of boarding events to valid OTP suggestions
        itins_first_bus_legs = otp_suggestions.query('otp_mode == \'BUS\'') \
                                .groupby(['otp_user_trip_id','otp_itinerary_id']) \
                                .first() \
                                .reset_index()

In [122]:
itins_first_bus_legs.head()

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_date,otp_leg_id,otp_start_time,otp_end_time,otp_mode,otp_route,otp_from_stop_id,otp_to_stop_id,otp_duration_mins,otp_from_parent_station,otp_to_parent_station
0,28150.0,1,2017-05-01,1,2017-05-01 13:38:33,2017-05-01 13:45:00,BUS,827,33788.0,30993.0,6.45,14508.0,14508.0
1,28150.0,2,2017-05-01,1,2017-05-01 13:42:58,2017-05-01 13:57:00,BUS,822,33788.0,30994.0,14.033333,14508.0,14508.0
2,28150.0,3,2017-05-01,2,2017-05-01 14:03:39,2017-05-01 14:20:05,BUS,870,33980.0,35079.0,16.433333,,
3,28150.0,4,2017-05-01,1,2017-05-01 13:57:58,2017-05-01 14:12:00,BUS,822,33788.0,30994.0,14.033333,14508.0,14508.0
4,28150.0,5,2017-05-01,2,2017-05-01 14:10:00,2017-05-01 14:14:00,BUS,821,33973.0,30995.0,4.0,14508.0,14508.0


In [123]:
len(itins_first_bus_legs)

2000

In [124]:
len(pd.concat([itins_first_bus_legs,otp_suggestions]).drop_duplicates(keep=False))

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


7092

In [125]:
itins_rest_bus_legs = otp_suggestions.query('otp_mode == \'BUS\'') \
                                .groupby(['otp_user_trip_id','otp_itinerary_id']) \
                                .apply(lambda group: group.iloc[1:]) #\
                                #.reset_index(level=0)

In [126]:
itins_rest_bus_legs

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,otp_date,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_start_time,otp_end_time,otp_mode,otp_route,otp_from_stop_id,otp_to_stop_id,otp_duration_mins,otp_from_parent_station,otp_to_parent_station
otp_user_trip_id,otp_itinerary_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
28150.0,1,2,2017-05-01,28150.0,1,3,2017-05-01 13:51:00,2017-05-01 14:01:59,BUS,303,26195.0,25753.0,10.983333,14508.0,41756.0
28150.0,2,6,2017-05-01,28150.0,2,3,2017-05-01 14:03:00,2017-05-01 14:15:19,BUS,303,26195.0,25753.0,12.316667,14508.0,41756.0
28150.0,4,13,2017-05-01,28150.0,4,3,2017-05-01 14:15:00,2017-05-01 14:25:59,BUS,303,26195.0,25753.0,10.983333,14508.0,41756.0
28150.0,5,18,2017-05-01,28150.0,5,4,2017-05-01 14:21:00,2017-05-01 14:33:19,BUS,303,26195.0,25753.0,12.316667,14508.0,41756.0
28150.0,7,25,2017-05-01,28150.0,7,3,2017-05-01 14:33:00,2017-05-01 14:43:59,BUS,303,26195.0,25753.0,10.983333,14508.0,41756.0
28150.0,8,29,2017-05-01,28150.0,8,3,2017-05-01 14:39:00,2017-05-01 14:51:19,BUS,303,26195.0,25753.0,12.316667,14508.0,41756.0
28150.0,9,33,2017-05-01,28150.0,9,3,2017-05-01 14:45:00,2017-05-01 14:57:19,BUS,303,26195.0,25753.0,12.316667,14508.0,41756.0
28150.0,10,37,2017-05-01,28150.0,10,3,2017-05-01 15:02:00,2017-05-01 15:14:19,BUS,303,26195.0,25753.0,12.316667,14508.0,41756.0
28154.0,1,71,2017-05-01,28154.0,1,3,2017-05-01 14:22:00,2017-05-01 14:53:48,BUS,603,26296.0,25521.0,31.800000,14494.0,26096.0
28154.0,2,75,2017-05-01,28154.0,2,3,2017-05-01 14:39:00,2017-05-01 15:10:48,BUS,603,26296.0,25521.0,31.800000,14494.0,26096.0


In [127]:
itins_first_bus_legs.otp_leg_id.describe()

count    2000.000000
mean        1.726000
std         0.446121
min         1.000000
25%         1.000000
50%         2.000000
75%         2.000000
max         2.000000
Name: otp_leg_id, dtype: float64

In [128]:
itins_first_bus_legs.dtypes

otp_user_trip_id                  float64
otp_itinerary_id                    int64
otp_date                   datetime64[ns]
otp_leg_id                          int64
otp_start_time             datetime64[ns]
otp_end_time               datetime64[ns]
otp_mode                           object
otp_route                          object
otp_from_stop_id                  float64
otp_to_stop_id                    float64
otp_duration_mins                 float64
otp_from_parent_station           float64
otp_to_parent_station             float64
dtype: object

## Working with Vehicle Boardings

In [129]:
selected_trips.head()

Unnamed: 0,o_boarding_id,o_boarding_datetime,o_gps_datetime,o_route,o_busCode,o_tripNum,o_stopPointId,o_stop_lat,o_stop_lon
19800,28150.0,2017-05-01 13:36:25,2017-05-01 13:35:18,827,BC929,12.0,33788.0,-25.436303,-49.362221
19801,57666.0,2017-05-01 18:46:58,NaT,021,08065,,41778.0,-25.436627,-49.317949
19802,28153.0,2017-05-01 13:36:28,NaT,TCJ,06003,,14478.0,-25.43998,-49.221858
19803,64706.0,2017-05-01 20:22:36,2017-05-01 19:59:34,020,BB607,7.0,28132.0,-25.435878,-49.306888
19804,28156.0,2017-05-01 13:36:28,2017-05-01 13:36:21,650,HR410,5.0,36299.0,-25.565199,-49.333825


In [130]:
itins_first_bus_legs.head()

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_date,otp_leg_id,otp_start_time,otp_end_time,otp_mode,otp_route,otp_from_stop_id,otp_to_stop_id,otp_duration_mins,otp_from_parent_station,otp_to_parent_station
0,28150.0,1,2017-05-01,1,2017-05-01 13:38:33,2017-05-01 13:45:00,BUS,827,33788.0,30993.0,6.45,14508.0,14508.0
1,28150.0,2,2017-05-01,1,2017-05-01 13:42:58,2017-05-01 13:57:00,BUS,822,33788.0,30994.0,14.033333,14508.0,14508.0
2,28150.0,3,2017-05-01,2,2017-05-01 14:03:39,2017-05-01 14:20:05,BUS,870,33980.0,35079.0,16.433333,,
3,28150.0,4,2017-05-01,1,2017-05-01 13:57:58,2017-05-01 14:12:00,BUS,822,33788.0,30994.0,14.033333,14508.0,14508.0
4,28150.0,5,2017-05-01,2,2017-05-01 14:10:00,2017-05-01 14:14:00,BUS,821,33973.0,30995.0,4.0,14508.0,14508.0


In [131]:
#itins_first_bus_legs.groupby(['otp_user_trip_id','otp_route','otp_from_stop_id']) \
#                .drop()

In [132]:
def match_vehicle_boardings(selected_trips,itineraries_st):
        vehicle_boarding_origins = selected_trips[np.logical_not(selected_trips['o_busCode'].str.isdigit())]
        matched_vehicle_boardings = vehicle_boarding_origins.merge(itineraries_st, left_on=['o_boarding_id','o_route','o_stopPointId'], 
                                                                   right_on=['otp_user_trip_id','otp_route','otp_from_stop_id'], how='inner')
        num_matched_vehicle_boardings = len(matched_vehicle_boardings.drop_duplicates(subset=['o_boarding_id']))
        if num_matched_vehicle_boardings == 0:
            match_perc = 0.0
        else:
            match_perc = 100*(num_matched_vehicle_boardings/float(len(vehicle_boarding_origins)))
        return (matched_vehicle_boardings,num_matched_vehicle_boardings,match_perc)

In [133]:
        # Matching vehicle boarding origins
        matched_vehicle_boardings,num_matched_vehicle_boardings,vehicle_match_perc = match_vehicle_boardings(selected_trips,itins_first_bus_legs)
        print "Vehicle boardings with matching OTP suggestions: ", num_matched_vehicle_boardings, "(",vehicle_match_perc, "%)"


        # Matching terminal boarding origins
        #matched_terminal_boardings,num_matched_terminal_boardings,terminal_matched_perc = match_terminal_boardings(selected_trips,itineraries_start)
        #print "Terminal boardings with matching OTP suggestions: ", num_matched_terminal_boardings, "(", terminal_matched_perc, "%)"

        # Matching special case route 021 terminal boarding origins 
        #num_terminal_021_boardings,matched_021_terminal_boardings,num_matched_021_terminal_boardings,terminal_021_match_perc = match_terminal_021_boardings(selected_trips,itineraries_start)
        #if (num_terminal_021_boardings > 0):
        #        print "Line 021 Terminal boardings with matching OTP suggestions: ", num_matched_021_terminal_boardings, "(",terminal_021_match_perc, "%)"
        #else:
        #        print "No Line 021 Terminal boardings found. Skipping matching."

        #boarding_suggestions_matches = pd.concat([matched_vehicle_boardings,matched_021_terminal_boardings,matched_terminal_boardings])

        #total_num_matches = num_matched_vehicle_boardings + num_matched_021_terminal_boardings + num_matched_terminal_boardings
        #print "Total number of matches: ", total_num_matches, "(", 100*(total_num_matches/float(num_selected_trips)), "%)"

        #if total_num_matches == 0:
        #    print "No match was found. Skipping next steps..."
        #    exit(0)


Vehicle boardings with matching OTP suggestions:  92 ( 80.701754386 %)


In [134]:
matched_vehicle_boardings.head()

Unnamed: 0,o_boarding_id,o_boarding_datetime,o_gps_datetime,o_route,o_busCode,o_tripNum,o_stopPointId,o_stop_lat,o_stop_lon,otp_user_trip_id,...,otp_leg_id,otp_start_time,otp_end_time,otp_mode,otp_route,otp_from_stop_id,otp_to_stop_id,otp_duration_mins,otp_from_parent_station,otp_to_parent_station
0,28150.0,2017-05-01 13:36:25,2017-05-01 13:35:18,827,BC929,12.0,33788.0,-25.436303,-49.362221,28150.0,...,1,2017-05-01 13:38:33,2017-05-01 13:45:00,BUS,827,33788.0,30993.0,6.45,14508.0,14508.0
1,28150.0,2017-05-01 13:36:25,2017-05-01 13:35:18,827,BC929,12.0,33788.0,-25.436303,-49.362221,28150.0,...,1,2017-05-01 14:28:17,2017-05-01 14:35:00,BUS,827,33788.0,30993.0,6.716667,14508.0,14508.0
2,28154.0,2017-05-01 13:36:28,2017-05-01 13:35:38,684,HA298,11.0,39378.0,-25.58523,-49.33721,28154.0,...,1,2017-05-01 13:42:13,2017-05-01 14:18:00,BUS,684,39378.0,31053.0,35.783333,14494.0,14494.0
3,28154.0,2017-05-01 13:36:28,2017-05-01 13:35:38,684,HA298,11.0,39378.0,-25.58523,-49.33721,28154.0,...,1,2017-05-01 13:57:13,2017-05-01 14:33:00,BUS,684,39378.0,31053.0,35.783333,14494.0,14494.0
4,28154.0,2017-05-01 13:36:28,2017-05-01 13:35:38,684,HA298,11.0,39378.0,-25.58523,-49.33721,28154.0,...,1,2017-05-01 14:12:13,2017-05-01 14:48:00,BUS,684,39378.0,31053.0,35.783333,14494.0,14494.0


In [147]:
vehic_first_boardings_options = matched_vehicle_boardings.merge(bus_trips, 
                                left_on=['o_route','o_busCode','o_tripNum','otp_to_stop_id'],
                               right_on=['route','busCode','tripNum','stopPointId'],
                               how='inner') \
            #.filter(['otp_user_trip_id','otp_itinerary_id','route','busCode','tripNum','stopPointId','otp_start_time','o_gps_datetime','otp_end_time','gps_datetime'])
            #.filter(selected_trips.columns)
            

In [148]:
vehic_first_boardings_options

Unnamed: 0,o_boarding_id,o_boarding_datetime,o_gps_datetime,o_route,o_busCode,o_tripNum,o_stopPointId,o_stop_lat,o_stop_lon,otp_user_trip_id,...,route,busCode,shapeId,tripNum,stopPointId,gps_datetime,distanceTraveledShape,stop_lat,stop_lon,parent_station
0,28150.0,2017-05-01 13:36:25,2017-05-01 13:35:18,827,BC929,12.0,33788.0,-25.436303,-49.362221,28150.0,...,827,BC929,2895.0,12.0,30993.0,2017-05-01 13:44:14,8568.987,-25.441705,-49.346328,14508.0
1,28150.0,2017-05-01 13:36:25,2017-05-01 13:35:18,827,BC929,12.0,33788.0,-25.436303,-49.362221,28150.0,...,827,BC929,2895.0,12.0,30993.0,2017-05-01 13:44:14,8568.987,-25.441705,-49.346328,14508.0
2,28262.0,2017-05-01 13:37:59,2017-05-01 13:37:57,827,BC929,12.0,33775.0,-25.438158,-49.358207,28262.0,...,827,BC929,2895.0,12.0,30993.0,2017-05-01 13:44:14,8568.987,-25.441705,-49.346328,14508.0
3,28262.0,2017-05-01 13:37:59,2017-05-01 13:37:57,827,BC929,12.0,33775.0,-25.438158,-49.358207,28262.0,...,827,BC929,2895.0,12.0,30993.0,2017-05-01 13:44:14,8568.987,-25.441705,-49.346328,14508.0
4,28424.0,2017-05-01 13:39:51,2017-05-01 13:39:12,827,BC929,12.0,33535.0,-25.440179,-49.355608,28424.0,...,827,BC929,2895.0,12.0,30993.0,2017-05-01 13:44:14,8568.987,-25.441705,-49.346328,14508.0
5,28424.0,2017-05-01 13:39:51,2017-05-01 13:39:12,827,BC929,12.0,33535.0,-25.440179,-49.355608,28424.0,...,827,BC929,2895.0,12.0,30993.0,2017-05-01 13:44:14,8568.987,-25.441705,-49.346328,14508.0
6,28154.0,2017-05-01 13:36:28,2017-05-01 13:35:38,684,HA298,11.0,39378.0,-25.585230,-49.337210,28154.0,...,684,HA298,2148.0,11.0,31053.0,2017-05-01 14:01:41,15448.084,-25.513098,-49.295303,14494.0
7,28154.0,2017-05-01 13:36:28,2017-05-01 13:35:38,684,HA298,11.0,39378.0,-25.585230,-49.337210,28154.0,...,684,HA298,2148.0,11.0,31053.0,2017-05-01 14:01:41,15448.084,-25.513098,-49.295303,14494.0
8,28154.0,2017-05-01 13:36:28,2017-05-01 13:35:38,684,HA298,11.0,39378.0,-25.585230,-49.337210,28154.0,...,684,HA298,2148.0,11.0,31053.0,2017-05-01 14:01:41,15448.084,-25.513098,-49.295303,14494.0
9,28154.0,2017-05-01 13:36:28,2017-05-01 13:35:38,684,HA298,11.0,39378.0,-25.585230,-49.337210,28154.0,...,684,HA298,2148.0,11.0,31053.0,2017-05-01 14:01:41,15448.084,-25.513098,-49.295303,14494.0


### Removing Scheduled trips whose scheduled start time is more than 30 minutes away from the actual boarding time

In [149]:
vehic_first_boardings_options['sched_actual_start_timediff'] = np.abs(vehic_first_boardings_options['otp_start_time']-vehic_first_boardings_options['o_gps_datetime'])

In [150]:
vehic_first_boardings_options \
    .sort_values(['otp_user_trip_id','otp_itinerary_id','otp_leg_id','otp_start_time']) \
    .filter(['otp_user_trip_id','otp_itinerary_id','otp_leg_id','route','busCode','tripNum','stopPointId','otp_start_time','o_gps_datetime','otp_end_time','gps_datetime','sched_actual_start_timediff'])

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,route,busCode,tripNum,stopPointId,otp_start_time,o_gps_datetime,otp_end_time,gps_datetime,sched_actual_start_timediff
0,28150.0,1,1,827,BC929,12.0,30993.0,2017-05-01 13:38:33,2017-05-01 13:35:18,2017-05-01 13:45:00,2017-05-01 13:44:14,00:03:15
1,28150.0,8,1,827,BC929,12.0,30993.0,2017-05-01 14:28:17,2017-05-01 13:35:18,2017-05-01 14:35:00,2017-05-01 13:44:14,00:52:59
6,28154.0,1,1,684,HA298,11.0,31053.0,2017-05-01 13:42:13,2017-05-01 13:35:38,2017-05-01 14:18:00,2017-05-01 14:01:41,00:06:35
7,28154.0,2,1,684,HA298,11.0,31053.0,2017-05-01 13:57:13,2017-05-01 13:35:38,2017-05-01 14:33:00,2017-05-01 14:01:41,00:21:35
8,28154.0,6,1,684,HA298,11.0,31053.0,2017-05-01 14:12:13,2017-05-01 13:35:38,2017-05-01 14:48:00,2017-05-01 14:01:41,00:36:35
9,28154.0,7,1,684,HA298,11.0,31053.0,2017-05-01 14:27:13,2017-05-01 13:35:38,2017-05-01 15:03:00,2017-05-01 14:01:41,00:51:35
10,28154.0,8,1,684,HA298,11.0,31053.0,2017-05-01 14:42:13,2017-05-01 13:35:38,2017-05-01 15:18:00,2017-05-01 14:01:41,01:06:35
17,28154.0,9,1,684,HA298,11.0,34160.0,2017-05-01 14:57:13,2017-05-01 13:35:38,2017-05-01 15:30:02,2017-05-01 13:57:14,01:21:35
18,28154.0,10,1,684,HA298,11.0,34160.0,2017-05-01 15:12:13,2017-05-01 13:35:38,2017-05-01 15:45:02,2017-05-01 13:57:14,01:36:35
26,28161.0,4,1,050,JB301,5.0,32454.0,2017-05-01 13:44:57,2017-05-01 13:35:59,2017-05-01 13:51:00,2017-05-01 13:39:53,00:08:58


In [151]:
vehic_first_boardings_options = vehic_first_boardings_options[vehic_first_boardings_options['sched_actual_start_timediff'] < pd.Timedelta(minutes=30)]

In [153]:
vehic_first_boardings_options  \
    .sort_values(['otp_user_trip_id','otp_itinerary_id','otp_leg_id','otp_start_time']) \
    .filter(['otp_user_trip_id','otp_itinerary_id','otp_leg_id','route','busCode','tripNum','stopPointId','otp_start_time','o_gps_datetime','otp_end_time','gps_datetime','sched_actual_start_timediff'])

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,route,busCode,tripNum,stopPointId,otp_start_time,o_gps_datetime,otp_end_time,gps_datetime,sched_actual_start_timediff
0,28150.0,1,1,827,BC929,12.0,30993.0,2017-05-01 13:38:33,2017-05-01 13:35:18,2017-05-01 13:45:00,2017-05-01 13:44:14,00:03:15
6,28154.0,1,1,684,HA298,11.0,31053.0,2017-05-01 13:42:13,2017-05-01 13:35:38,2017-05-01 14:18:00,2017-05-01 14:01:41,00:06:35
7,28154.0,2,1,684,HA298,11.0,31053.0,2017-05-01 13:57:13,2017-05-01 13:35:38,2017-05-01 14:33:00,2017-05-01 14:01:41,00:21:35
26,28161.0,4,1,050,JB301,5.0,32454.0,2017-05-01 13:44:57,2017-05-01 13:35:59,2017-05-01 13:51:00,2017-05-01 13:39:53,00:08:58
27,28161.0,10,1,050,JB301,5.0,32454.0,2017-05-01 14:03:36,2017-05-01 13:35:59,2017-05-01 14:09:00,2017-05-01 13:39:53,00:27:37
28,28164.0,1,1,812,BA020,9.0,28138.0,2017-05-01 13:46:46,2017-05-01 13:36:32,2017-05-01 14:03:00,2017-05-01 13:47:34,00:10:14
43,28165.0,1,1,462,DC093,7.0,26157.0,2017-05-01 13:38:46,2017-05-01 13:35:42,2017-05-01 14:12:00,2017-05-01 14:00:22,00:03:04
44,28165.0,2,1,462,DC093,7.0,26157.0,2017-05-01 13:52:02,2017-05-01 13:35:42,2017-05-01 14:28:00,2017-05-01 14:00:22,00:16:20
50,28169.0,4,1,545,JA300,6.0,29967.0,2017-05-01 13:50:13,2017-05-01 13:36:30,2017-05-01 14:35:04,2017-05-01 14:14:48,00:13:43
58,28172.0,6,1,334,DA023,20.0,27812.0,2017-05-01 13:55:39,2017-05-01 13:36:33,2017-05-01 14:02:00,2017-05-01 13:20:33,00:19:06


In [154]:
otp_suggestions.head()

Unnamed: 0,otp_date,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_start_time,otp_end_time,otp_mode,otp_route,otp_from_stop_id,otp_to_stop_id,otp_duration_mins,otp_from_parent_station,otp_to_parent_station
0,2017-05-01,28150.0,1,1,2017-05-01 13:38:33,2017-05-01 13:45:00,BUS,827.0,33788.0,30993.0,6.45,,14508.0
1,2017-05-01,28150.0,1,2,2017-05-01 13:45:00,2017-05-01 13:45:52,WALK,,,,0.866667,,
2,2017-05-01,28150.0,1,3,2017-05-01 13:51:00,2017-05-01 14:01:59,BUS,303.0,26195.0,25753.0,10.983333,14508.0,41756.0
3,2017-05-01,28150.0,1,4,2017-05-01 14:02:00,2017-05-01 14:09:32,WALK,,,,7.533333,,
4,2017-05-01,28150.0,2,1,2017-05-01 13:42:58,2017-05-01 13:57:00,BUS,822.0,33788.0,30994.0,14.033333,,14508.0


In [156]:
scheduled_itin_observed_od_full.head()

Unnamed: 0,bt_from_busCode,bt_from_distanceTraveledShape,bt_from_gps_datetime,bt_from_parent_station,bt_from_route,bt_from_shapeId,bt_from_stopPointId,bt_from_stop_lat,bt_from_stop_lon,bt_from_tripNum,...,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,otp_start_time,otp_to_parent_station,otp_to_stop_id,otp_user_trip_id,sched_obs_end_timediff,sched_obs_start_timediff
3,BC929,6303.387,2017-05-01 13:35:18,,827,2895.0,33788.0,-25.436303,-49.362221,12.0,...,1,1,BUS,827,2017-05-01 13:38:33,14508.0,30993.0,28150.0,00:00:46,00:03:15
4,BC929,6303.387,2017-05-01 13:35:18,,827,2895.0,33788.0,-25.436303,-49.362221,12.0,...,1,1,BUS,827,2017-05-01 13:38:33,14508.0,30993.0,28150.0,00:00:46,00:03:15
5,BC929,6303.387,2017-05-01 13:35:18,,827,2895.0,33788.0,-25.436303,-49.362221,12.0,...,1,1,BUS,827,2017-05-01 13:38:33,14508.0,30993.0,28150.0,00:00:46,00:03:15
6,BC929,6303.387,2017-05-01 13:35:18,,827,2895.0,33788.0,-25.436303,-49.362221,12.0,...,1,1,BUS,827,2017-05-01 13:38:33,14508.0,30993.0,28150.0,00:00:46,00:03:15
15,BC929,6303.387,2017-05-01 14:22:34,,827,2895.0,33788.0,-25.436303,-49.362221,13.0,...,1,1,BUS,827,2017-05-01 13:38:33,14508.0,30993.0,28150.0,00:45:58,00:44:01


## Keep only OTP-BusTrips itineraries which figure in first-vehicle-boarding matched itineraries

In [169]:
matched_vehicle_boardings_itins = vehic_first_boardings_options.filter(['otp_user_trip_id','otp_itinerary_id'])

In [170]:
matched_vehicle_boardings_itins.head()

Unnamed: 0,otp_user_trip_id,otp_itinerary_id
0,28150.0,1
2,28262.0,1
4,28424.0,1
6,28154.0,1
7,28154.0,2


In [171]:
scheduled_itin_observed_od_full.head()

Unnamed: 0,bt_from_busCode,bt_from_distanceTraveledShape,bt_from_gps_datetime,bt_from_parent_station,bt_from_route,bt_from_shapeId,bt_from_stopPointId,bt_from_stop_lat,bt_from_stop_lon,bt_from_tripNum,...,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,otp_start_time,otp_to_parent_station,otp_to_stop_id,otp_user_trip_id,sched_obs_end_timediff,sched_obs_start_timediff
3,BC929,6303.387,2017-05-01 13:35:18,,827,2895.0,33788.0,-25.436303,-49.362221,12.0,...,1,1,BUS,827,2017-05-01 13:38:33,14508.0,30993.0,28150.0,00:00:46,00:03:15
4,BC929,6303.387,2017-05-01 13:35:18,,827,2895.0,33788.0,-25.436303,-49.362221,12.0,...,1,1,BUS,827,2017-05-01 13:38:33,14508.0,30993.0,28150.0,00:00:46,00:03:15
5,BC929,6303.387,2017-05-01 13:35:18,,827,2895.0,33788.0,-25.436303,-49.362221,12.0,...,1,1,BUS,827,2017-05-01 13:38:33,14508.0,30993.0,28150.0,00:00:46,00:03:15
6,BC929,6303.387,2017-05-01 13:35:18,,827,2895.0,33788.0,-25.436303,-49.362221,12.0,...,1,1,BUS,827,2017-05-01 13:38:33,14508.0,30993.0,28150.0,00:00:46,00:03:15
15,BC929,6303.387,2017-05-01 14:22:34,,827,2895.0,33788.0,-25.436303,-49.362221,13.0,...,1,1,BUS,827,2017-05-01 13:38:33,14508.0,30993.0,28150.0,00:45:58,00:44:01


In [172]:
len(scheduled_itin_observed_od_full)

19121

In [173]:
scheduled_itin_observed_od.columns

Index([u'otp_date', u'otp_user_trip_id', u'otp_itinerary_id', u'otp_leg_id',
       u'otp_start_time', u'otp_end_time', u'otp_mode', u'otp_route',
       u'otp_from_stop_id', u'otp_to_stop_id', u'otp_duration_mins',
       u'otp_from_parent_station', u'otp_to_parent_station', u'bt_from_route',
       u'bt_from_busCode', u'bt_from_shapeId', u'bt_from_tripNum',
       u'bt_from_stopPointId', u'bt_from_gps_datetime',
       u'bt_from_distanceTraveledShape', u'bt_from_stop_lat',
       u'bt_from_stop_lon', u'bt_from_parent_station',
       u'sched_obs_start_timediff', u'bt_to_route', u'bt_to_busCode',
       u'bt_to_shapeId', u'bt_to_tripNum', u'bt_to_stopPointId',
       u'bt_to_gps_datetime', u'bt_to_distanceTraveledShape',
       u'bt_to_stop_lat', u'bt_to_stop_lon', u'bt_to_parent_station',
       u'sched_obs_end_timediff'],
      dtype='object')

In [174]:
matched_vehicle_boardings_itins.columns

Index([u'otp_user_trip_id', u'otp_itinerary_id'], dtype='object')

In [196]:
vehicle_boardings_obs_sch_itin_legs = scheduled_itin_observed_od_full.merge(matched_vehicle_boardings_itins,how='inner') \
                                        .sort_values(['otp_user_trip_id','otp_itinerary_id','otp_leg_id'])

In [197]:
len(vehicle_boardings_obs_sch_itin_legs)

1864

In [198]:
vehicle_boardings_obs_sch_itin_legs.head()

Unnamed: 0,bt_from_busCode,bt_from_distanceTraveledShape,bt_from_gps_datetime,bt_from_parent_station,bt_from_route,bt_from_shapeId,bt_from_stopPointId,bt_from_stop_lat,bt_from_stop_lon,bt_from_tripNum,...,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,otp_start_time,otp_to_parent_station,otp_to_stop_id,otp_user_trip_id,sched_obs_end_timediff,sched_obs_start_timediff
0,BC929,6303.387,2017-05-01 13:35:18,,827,2895.0,33788.0,-25.436303,-49.362221,12.0,...,1,1,BUS,827,2017-05-01 13:38:33,14508.0,30993.0,28150.0,00:00:46,00:03:15
1,BC929,6303.387,2017-05-01 13:35:18,,827,2895.0,33788.0,-25.436303,-49.362221,12.0,...,1,1,BUS,827,2017-05-01 13:38:33,14508.0,30993.0,28150.0,00:00:46,00:03:15
2,BC929,6303.387,2017-05-01 13:35:18,,827,2895.0,33788.0,-25.436303,-49.362221,12.0,...,1,1,BUS,827,2017-05-01 13:38:33,14508.0,30993.0,28150.0,00:00:46,00:03:15
3,BC929,6303.387,2017-05-01 13:35:18,,827,2895.0,33788.0,-25.436303,-49.362221,12.0,...,1,1,BUS,827,2017-05-01 13:38:33,14508.0,30993.0,28150.0,00:00:46,00:03:15
4,BC929,6303.387,2017-05-01 14:22:34,,827,2895.0,33788.0,-25.436303,-49.362221,13.0,...,1,1,BUS,827,2017-05-01 13:38:33,14508.0,30993.0,28150.0,00:45:58,00:44:01


### Concatenating first boarding legs to other itinerary legs

#### Preparing First Boarding legs data

In [199]:
vehic_first_boardings_options.columns

Index([u'o_boarding_id', u'o_boarding_datetime', u'o_gps_datetime', u'o_route',
       u'o_busCode', u'o_tripNum', u'o_stopPointId', u'o_stop_lat',
       u'o_stop_lon', u'otp_user_trip_id', u'otp_itinerary_id', u'otp_date',
       u'otp_leg_id', u'otp_start_time', u'otp_end_time', u'otp_mode',
       u'otp_route', u'otp_from_stop_id', u'otp_to_stop_id',
       u'otp_duration_mins', u'otp_from_parent_station',
       u'otp_to_parent_station', u'route', u'busCode', u'shapeId', u'tripNum',
       u'stopPointId', u'gps_datetime', u'distanceTraveledShape', u'stop_lat',
       u'stop_lon', u'parent_station', u'sched_actual_start_timediff'],
      dtype='object')

In [200]:
vehic_first_boardings_options_clean = vehic_first_boardings_options \
    .filter(['otp_user_trip_id','otp_itinerary_id','otp_leg_id','otp_mode','otp_route','o_busCode',
             'o_tripNum','otp_from_stop_id','otp_start_time','o_boarding_datetime','otp_to_stop_id',
             'otp_end_time','gps_datetime','otp_duration_mins']) \
    .rename(index=str, columns={'o_busCode':'bt_bus_code','o_tripNum':'bt_trip_num',
                                'o_boarding_datetime':'bt_start_time','gps_datetime':'bt_end_time'}) \
    .assign(bt_duration_mins = lambda x: (x.bt_end_time - x.bt_start_time)/pd.Timedelta(minutes=1)) \
    .assign(considered_duration_mins = lambda x: np.where(np.isnan(x.bt_duration_mins),x.otp_duration_mins,x.bt_duration_mins))

In [201]:
vehic_first_boardings_options_clean.head()

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,otp_to_stop_id,otp_end_time,bt_end_time,otp_duration_mins,bt_duration_mins,considered_duration_mins
0,28150.0,1,1,BUS,827,BC929,12.0,33788.0,2017-05-01 13:38:33,2017-05-01 13:36:25,30993.0,2017-05-01 13:45:00,2017-05-01 13:44:14,6.45,7.816667,7.816667
2,28262.0,1,1,BUS,827,BC929,12.0,33775.0,2017-05-01 13:40:17,2017-05-01 13:37:59,30993.0,2017-05-01 13:45:00,2017-05-01 13:44:14,4.716667,6.25,6.25
4,28424.0,1,1,BUS,827,BC929,12.0,33535.0,2017-05-01 13:41:48,2017-05-01 13:39:51,30993.0,2017-05-01 13:45:00,2017-05-01 13:44:14,3.2,4.383333,4.383333
6,28154.0,1,1,BUS,684,HA298,11.0,39378.0,2017-05-01 13:42:13,2017-05-01 13:36:28,31053.0,2017-05-01 14:18:00,2017-05-01 14:01:41,35.783333,25.216667,25.216667
7,28154.0,2,1,BUS,684,HA298,11.0,39378.0,2017-05-01 13:57:13,2017-05-01 13:36:28,31053.0,2017-05-01 14:33:00,2017-05-01 14:01:41,35.783333,25.216667,25.216667


In [202]:
len(vehic_first_boardings_options_clean)

138

#### Preparing OTP itinerary legs data

In [203]:
vehicle_boardings_obs_sch_itin_legs.columns.values

array(['bt_from_busCode', 'bt_from_distanceTraveledShape',
       'bt_from_gps_datetime', 'bt_from_parent_station', 'bt_from_route',
       'bt_from_shapeId', 'bt_from_stopPointId', 'bt_from_stop_lat',
       'bt_from_stop_lon', 'bt_from_tripNum', 'bt_to_busCode',
       'bt_to_distanceTraveledShape', 'bt_to_gps_datetime',
       'bt_to_parent_station', 'bt_to_route', 'bt_to_shapeId',
       'bt_to_stopPointId', 'bt_to_stop_lat', 'bt_to_stop_lon',
       'bt_to_tripNum', 'otp_date', 'otp_duration_mins', 'otp_end_time',
       'otp_from_parent_station', 'otp_from_stop_id', 'otp_itinerary_id',
       'otp_leg_id', 'otp_mode', 'otp_route', 'otp_start_time',
       'otp_to_parent_station', 'otp_to_stop_id', 'otp_user_trip_id',
       'sched_obs_end_timediff', 'sched_obs_start_timediff'], dtype=object)

In [204]:
vehicle_boardings_obs_sch_itin_legs_clean = vehicle_boardings_obs_sch_itin_legs \
    .filter(['otp_user_trip_id','otp_itinerary_id','otp_leg_id','otp_mode','otp_route','bt_from_busCode',
             'bt_from_tripNum','otp_from_stop_id','otp_start_time','bt_from_gps_datetime','otp_to_stop_id',
             'otp_end_time','bt_to_gps_datetime','otp_duration_mins'])\
    .rename(index=str, columns={'bt_from_busCode':'bt_bus_code','bt_from_tripNum':'bt_trip_num',
                                'bt_from_gps_datetime':'bt_start_time','bt_to_gps_datetime':'bt_end_time'}) \
    .assign(bt_duration_mins = lambda x: (x.bt_end_time - x.bt_start_time)/pd.Timedelta(minutes=1)) \
    .assign(considered_duration_mins = lambda x: np.where(np.isnan(x.bt_duration_mins),x.otp_duration_mins,x.bt_duration_mins))

In [205]:
vehicle_boardings_obs_sch_itin_legs_clean.head()

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,otp_to_stop_id,otp_end_time,bt_end_time,otp_duration_mins,bt_duration_mins,considered_duration_mins
0,28150.0,1,1,BUS,827,BC929,12.0,33788.0,2017-05-01 13:38:33,2017-05-01 13:35:18,30993.0,2017-05-01 13:45:00,2017-05-01 13:44:14,6.45,8.933333,8.933333
1,28150.0,1,1,BUS,827,BC929,12.0,33788.0,2017-05-01 13:38:33,2017-05-01 13:35:18,30993.0,2017-05-01 13:45:00,2017-05-01 13:44:14,6.45,8.933333,8.933333
2,28150.0,1,1,BUS,827,BC929,12.0,33788.0,2017-05-01 13:38:33,2017-05-01 13:35:18,30993.0,2017-05-01 13:45:00,2017-05-01 13:44:14,6.45,8.933333,8.933333
3,28150.0,1,1,BUS,827,BC929,12.0,33788.0,2017-05-01 13:38:33,2017-05-01 13:35:18,30993.0,2017-05-01 13:45:00,2017-05-01 13:44:14,6.45,8.933333,8.933333
4,28150.0,1,1,BUS,827,BC929,13.0,33788.0,2017-05-01 13:38:33,2017-05-01 14:22:34,30993.0,2017-05-01 13:45:00,2017-05-01 14:30:58,6.45,8.4,8.4


In [206]:
len(vehicle_boardings_obs_sch_itin_legs_clean)

1864

#### Concatenating Legs

In [207]:
len(vehic_first_boardings_options_clean)

138

In [208]:
vehic_first_boardings_options_clean_keys = vehic_first_boardings_options_clean.filter(['otp_user_trip_id','otp_itinerary_id','otp_leg_id']).drop_duplicates()

In [209]:
len(vehic_first_boardings_options_clean_keys)

125

In [210]:
vehic_first_boardings_options_clean_keys.head(10)

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id
0,28150.0,1,1
2,28262.0,1,1
4,28424.0,1,1
6,28154.0,1,1
7,28154.0,2,1
11,28229.0,1,1
12,28229.0,2,1
20,57607.0,1,1
22,57607.0,5,1
26,28161.0,4,1


In [211]:
vehicle_legs_merged = vehicle_boardings_obs_sch_itin_legs_clean.merge(vehic_first_boardings_options_clean_keys, how='outer', indicator=True)
vehicle_legs_rest_clean = vehicle_legs_merged[vehicle_legs_merged['_merge'] == 'left_only'].drop('_merge', axis=1)

In [212]:
len(vehicle_legs_rest_clean)

807

In [213]:
vehicle_legs_rest_clean

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,otp_to_stop_id,otp_end_time,bt_end_time,otp_duration_mins,bt_duration_mins,considered_duration_mins
6,28150.0,1,2,WALK,,,,,2017-05-01 13:45:00,NaT,,2017-05-01 13:45:52,NaT,0.866667,,0.866667
7,28150.0,1,3,BUS,303,DE722,4.0,26195.0,2017-05-01 13:51:00,2017-05-01 14:02:48,25753.0,2017-05-01 14:01:59,2017-05-01 14:12:10,10.983333,9.366667,9.366667
8,28150.0,1,3,BUS,303,DE710,4.0,26195.0,2017-05-01 13:51:00,2017-05-01 13:27:30,25753.0,2017-05-01 14:01:59,2017-05-01 13:35:59,10.983333,8.483333,8.483333
9,28150.0,1,3,BUS,303,DE713,4.0,26195.0,2017-05-01 13:51:00,2017-05-01 13:16:34,25753.0,2017-05-01 14:01:59,2017-05-01 13:24:23,10.983333,7.816667,7.816667
10,28150.0,1,3,BUS,303,DE708,5.0,26195.0,2017-05-01 13:51:00,2017-05-01 14:29:24,25753.0,2017-05-01 14:01:59,2017-05-01 14:37:26,10.983333,8.033333,8.033333
11,28150.0,1,3,BUS,303,DE719,4.0,26195.0,2017-05-01 13:51:00,2017-05-01 13:07:22,25753.0,2017-05-01 14:01:59,2017-05-01 13:15:06,10.983333,7.733333,7.733333
12,28150.0,1,3,BUS,303,LE702,5.0,26195.0,2017-05-01 13:51:00,2017-05-01 14:41:47,25753.0,2017-05-01 14:01:59,2017-05-01 14:50:32,10.983333,8.750000,8.750000
13,28150.0,1,4,WALK,,,,,2017-05-01 14:02:00,NaT,,2017-05-01 14:09:32,NaT,7.533333,,7.533333
23,28154.0,1,2,WALK,,,,,2017-05-01 14:18:00,NaT,,2017-05-01 14:19:14,NaT,1.233333,,1.233333
24,28154.0,1,3,BUS,603,HD243,7.0,26296.0,2017-05-01 14:22:00,2017-05-01 14:22:47,25521.0,2017-05-01 14:53:48,2017-05-01 14:49:17,31.800000,26.500000,26.500000


In [214]:
all_vehicle_legs_options = pd.concat([vehic_first_boardings_options_clean,vehicle_legs_rest_clean]) \
    .sort_values(['otp_user_trip_id','otp_itinerary_id','otp_leg_id','bt_start_time'])

In [215]:
len(all_vehicle_legs_options)

945

In [216]:
all_vehicle_legs_options

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,otp_to_stop_id,otp_end_time,bt_end_time,otp_duration_mins,bt_duration_mins,considered_duration_mins
0,28150.0,1,1,BUS,827,BC929,12.0,33788.0,2017-05-01 13:38:33,2017-05-01 13:36:25,30993.0,2017-05-01 13:45:00,2017-05-01 13:44:14,6.450000,7.816667,7.816667
6,28150.0,1,2,WALK,,,,,2017-05-01 13:45:00,NaT,,2017-05-01 13:45:52,NaT,0.866667,,0.866667
11,28150.0,1,3,BUS,303,DE719,4.0,26195.0,2017-05-01 13:51:00,2017-05-01 13:07:22,25753.0,2017-05-01 14:01:59,2017-05-01 13:15:06,10.983333,7.733333,7.733333
9,28150.0,1,3,BUS,303,DE713,4.0,26195.0,2017-05-01 13:51:00,2017-05-01 13:16:34,25753.0,2017-05-01 14:01:59,2017-05-01 13:24:23,10.983333,7.816667,7.816667
8,28150.0,1,3,BUS,303,DE710,4.0,26195.0,2017-05-01 13:51:00,2017-05-01 13:27:30,25753.0,2017-05-01 14:01:59,2017-05-01 13:35:59,10.983333,8.483333,8.483333
7,28150.0,1,3,BUS,303,DE722,4.0,26195.0,2017-05-01 13:51:00,2017-05-01 14:02:48,25753.0,2017-05-01 14:01:59,2017-05-01 14:12:10,10.983333,9.366667,9.366667
10,28150.0,1,3,BUS,303,DE708,5.0,26195.0,2017-05-01 13:51:00,2017-05-01 14:29:24,25753.0,2017-05-01 14:01:59,2017-05-01 14:37:26,10.983333,8.033333,8.033333
12,28150.0,1,3,BUS,303,LE702,5.0,26195.0,2017-05-01 13:51:00,2017-05-01 14:41:47,25753.0,2017-05-01 14:01:59,2017-05-01 14:50:32,10.983333,8.750000,8.750000
13,28150.0,1,4,WALK,,,,,2017-05-01 14:02:00,NaT,,2017-05-01 14:09:32,NaT,7.533333,,7.533333
6,28154.0,1,1,BUS,684,HA298,11.0,39378.0,2017-05-01 13:42:13,2017-05-01 13:36:28,31053.0,2017-05-01 14:18:00,2017-05-01 14:01:41,35.783333,25.216667,25.216667


### Choose best actual leg matches (based on feasibility and start time)

In [240]:
sample_itinerary_options = all_vehicle_legs_options.iloc[0:9,]

In [241]:
sample_itinerary_options

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,otp_to_stop_id,otp_end_time,bt_end_time,otp_duration_mins,bt_duration_mins,considered_duration_mins
0,28150.0,1,1,BUS,827.0,BC929,12.0,33788.0,2017-05-01 13:38:33,2017-05-01 13:36:25,30993.0,2017-05-01 13:45:00,2017-05-01 13:44:14,6.45,7.816667,7.816667
6,28150.0,1,2,WALK,,,,,2017-05-01 13:45:00,NaT,,2017-05-01 13:45:52,NaT,0.866667,,0.866667
11,28150.0,1,3,BUS,303.0,DE719,4.0,26195.0,2017-05-01 13:51:00,2017-05-01 13:07:22,25753.0,2017-05-01 14:01:59,2017-05-01 13:15:06,10.983333,7.733333,7.733333
9,28150.0,1,3,BUS,303.0,DE713,4.0,26195.0,2017-05-01 13:51:00,2017-05-01 13:16:34,25753.0,2017-05-01 14:01:59,2017-05-01 13:24:23,10.983333,7.816667,7.816667
8,28150.0,1,3,BUS,303.0,DE710,4.0,26195.0,2017-05-01 13:51:00,2017-05-01 13:27:30,25753.0,2017-05-01 14:01:59,2017-05-01 13:35:59,10.983333,8.483333,8.483333
7,28150.0,1,3,BUS,303.0,DE722,4.0,26195.0,2017-05-01 13:51:00,2017-05-01 14:02:48,25753.0,2017-05-01 14:01:59,2017-05-01 14:12:10,10.983333,9.366667,9.366667
10,28150.0,1,3,BUS,303.0,DE708,5.0,26195.0,2017-05-01 13:51:00,2017-05-01 14:29:24,25753.0,2017-05-01 14:01:59,2017-05-01 14:37:26,10.983333,8.033333,8.033333
12,28150.0,1,3,BUS,303.0,LE702,5.0,26195.0,2017-05-01 13:51:00,2017-05-01 14:41:47,25753.0,2017-05-01 14:01:59,2017-05-01 14:50:32,10.983333,8.75,8.75
13,28150.0,1,4,WALK,,,,,2017-05-01 14:02:00,NaT,,2017-05-01 14:09:32,NaT,7.533333,,7.533333


In [408]:
def choose_leg_matches(leg_matches_groups,colnames):
        chosen_leg_matches = pd.DataFrame(columns = colnames)
        chosen_leg_matches = pd.DataFrame()
        prev_group_id = ()
        prev_itin_id = -1
        prev_leg_end_time = pd.NaT
        num_groups_not_survived = 0

        for name, group in leg_matches_groups:
            
                #print
                #print "Name:", name
                #print "Group:"
                #print group
                #print
                
                if ((prev_itin_id == -1)):
                        prev_itin_id = group['otp_itinerary_id'][0]
                        prev_leg_end_time = group['bt_start_time'].dt.floor('d')[0]
                elif (prev_itin_id != group['otp_itinerary_id'][0]):
                        prev_leg_end_time = group['bt_start_time'].dt.floor('d')[0]
                    

                #if (prev_group_id == ()):
                #        prev_leg_end_time = group['bt_start_time'].dt.floor('d')[0]

                #print
                #print "Previous leg end time:", prev_leg_end_time
                #print
                #print "Original Group"
                #print group.filter(['otp_start_time','bt_start_time','bt_end_time'])
                
                if (group['otp_mode'].iloc[0] == 'WALK'):
                    #print "Walking duration:", filtered_group['otp_duration_mins']
                    filtered_group = group.copy().reset_index()
                    filtered_group.loc[0,'bt_start_time'] = prev_leg_end_time
                    filtered_group.loc[0,'bt_end_time'] = prev_leg_end_time + \
                        pd.Timedelta(minutes=np.rint(filtered_group['otp_duration_mins'].iloc[0]))
                    
                    #print "Filtered Group"
                    #print filtered_group
                else:
                    filtered_group = group[group['bt_start_time'] > prev_leg_end_time]
                
                #print
                #print "Filtered Group"
                #print filtered_group.filter(['otp_start_time','bt_start_time','bt_end_time'])

                if (len(filtered_group) == 0):
                        #print "Group did not survive! =("
                        num_groups_not_survived += 1
                        continue

                chosen_leg_match = filtered_group.sort_values('bt_start_time').iloc[0]
                #print "Chosen Leg"
                #print chosen_leg_match

                chosen_leg_matches = chosen_leg_matches.append(chosen_leg_match)

                #Update variables
                prev_group_id = name
                prev_leg_end_time = chosen_leg_match['bt_end_time']

        #print num_groups_not_survived
        return chosen_leg_matches.filter(colnames)


In [411]:
chosen_legs = choose_leg_matches(all_vehicle_legs_options.groupby(['otp_user_trip_id','otp_itinerary_id','otp_leg_id']),
                                all_vehicle_legs_options.columns)

In [412]:
chosen_legs

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,otp_to_stop_id,otp_end_time,bt_end_time,otp_duration_mins,bt_duration_mins,considered_duration_mins
0,28150.0,1.0,1.0,BUS,827,BC929,12.0,33788.0,2017-05-01 13:38:33,2017-05-01 13:36:25,30993.0,2017-05-01 13:45:00,2017-05-01 13:44:14,6.450000,7.816667,7.816667
0,28150.0,1.0,2.0,WALK,,,,,2017-05-01 13:45:00,2017-05-01 13:44:14,,2017-05-01 13:45:52,2017-05-01 13:45:14,0.866667,,0.866667
7,28150.0,1.0,3.0,BUS,303,DE722,4.0,26195.0,2017-05-01 13:51:00,2017-05-01 14:02:48,25753.0,2017-05-01 14:01:59,2017-05-01 14:12:10,10.983333,9.366667,9.366667
0,28150.0,1.0,4.0,WALK,,,,,2017-05-01 14:02:00,2017-05-01 14:12:10,,2017-05-01 14:09:32,2017-05-01 14:20:10,7.533333,,7.533333
0,28154.0,1.0,2.0,WALK,,,,,2017-05-01 14:18:00,2017-05-01 14:20:10,,2017-05-01 14:19:14,2017-05-01 14:21:10,1.233333,,1.233333
24,28154.0,1.0,3.0,BUS,603,HD243,7.0,26296.0,2017-05-01 14:22:00,2017-05-01 14:22:47,25521.0,2017-05-01 14:53:48,2017-05-01 14:49:17,31.800000,26.500000,26.500000
0,28154.0,1.0,4.0,WALK,,,,,2017-05-01 14:53:49,2017-05-01 14:49:17,,2017-05-01 14:55:49,2017-05-01 14:51:17,2.000000,,2.000000
0,28154.0,2.0,2.0,WALK,,,,,2017-05-01 14:33:00,2017-05-01 14:51:17,,2017-05-01 14:34:14,2017-05-01 14:52:17,1.233333,,1.233333
46,28154.0,2.0,3.0,BUS,603,HD249,8.0,26296.0,2017-05-01 14:39:00,2017-05-01 15:06:34,25521.0,2017-05-01 15:10:48,2017-05-01 15:31:11,31.800000,24.616667,24.616667
0,28154.0,2.0,4.0,WALK,,,,,2017-05-01 15:10:49,2017-05-01 15:31:11,,2017-05-01 15:12:49,2017-05-01 15:33:11,2.000000,,2.000000


## Find OTP Vehicle Boarding Itineraries first alighting in BUSTE Data

In [185]:
bus_trips.head()

Unnamed: 0,route,busCode,shapeId,tripNum,stopPointId,gps_datetime,distanceTraveledShape,stop_lat,stop_lon,parent_station
29435,10,BB303,1708.0,1.0,33157.0,2017-05-01 06:28:07,537.974,-25.410517,-49.276479,
29436,10,BB303,1708.0,1.0,33159.0,2017-05-01 06:30:38,1141.061,-25.411726,-49.270902,
29437,10,BB303,1708.0,1.0,33158.0,2017-05-01 06:31:40,1624.751,-25.415285,-49.270134,
29438,10,BB303,1708.0,1.0,30150.0,2017-05-01 06:32:26,1972.077,-25.416733,-49.267863,
29439,10,BB303,1708.0,1.0,28637.0,2017-05-01 06:33:11,2378.349,-25.414184,-49.265917,


In [228]:
vehicle_first_boardings = matched_vehicle_boardings.merge(bus_trips,
                                left_on=['o_route','o_busCode','o_tripNum','otp_to_stop_id'],
                                right_on=['route','busCode','tripNum','stopPointId'],
                                how='inner') \
                                .assign(otp_buste_end_timediff = 
                                        lambda x: np.absolute(x['gps_datetime'] - x['otp_end_time']),
                                        actual_duration = lambda x: x.gps_datetime - x.o_boarding_datetime) \
                                .sort_values(['otp_user_trip_id','otp_itinerary_id','actual_duration','otp_buste_end_timediff'])

vehicle_first_boardings = vehicle_first_boardings[vehicle_first_boardings['actual_duration'] > pd.Timedelta(minutes=0)]

In [229]:
vehicle_first_boardings.filter(['o_boarding_id','otp_itinerary_id','route','busCode','tripNum','otp_from_stop_id','o_stopPointId','otp_start_time','o_boarding_datetime','otp_to_stop_id','stopPointId','otp_end_time','gps_datetime','otp_buste_end_timediff','actual_duration']) #\
                        #.sort_values(['o_boarding_id','otp_itinerary_id'])

Unnamed: 0,o_boarding_id,otp_itinerary_id,route,busCode,tripNum,otp_from_stop_id,o_stopPointId,otp_start_time,o_boarding_datetime,otp_to_stop_id,stopPointId,otp_end_time,gps_datetime,otp_buste_end_timediff,actual_duration
0,28150.0,1,827,BC929,12.0,33788.0,33788.0,2017-05-01 13:38:33,2017-05-01 13:36:25,30993.0,30993.0,2017-05-01 13:45:00,2017-05-01 13:44:14,00:00:46,00:07:49
1,28150.0,8,827,BC929,12.0,33788.0,33788.0,2017-05-01 14:28:17,2017-05-01 13:36:25,30993.0,30993.0,2017-05-01 14:35:00,2017-05-01 13:44:14,00:50:46,00:07:49
6,28154.0,1,684,HA298,11.0,39378.0,39378.0,2017-05-01 13:42:13,2017-05-01 13:36:28,31053.0,31053.0,2017-05-01 14:18:00,2017-05-01 14:01:41,00:16:19,00:25:13
7,28154.0,2,684,HA298,11.0,39378.0,39378.0,2017-05-01 13:57:13,2017-05-01 13:36:28,31053.0,31053.0,2017-05-01 14:33:00,2017-05-01 14:01:41,00:31:19,00:25:13
8,28154.0,6,684,HA298,11.0,39378.0,39378.0,2017-05-01 14:12:13,2017-05-01 13:36:28,31053.0,31053.0,2017-05-01 14:48:00,2017-05-01 14:01:41,00:46:19,00:25:13
9,28154.0,7,684,HA298,11.0,39378.0,39378.0,2017-05-01 14:27:13,2017-05-01 13:36:28,31053.0,31053.0,2017-05-01 15:03:00,2017-05-01 14:01:41,01:01:19,00:25:13
10,28154.0,8,684,HA298,11.0,39378.0,39378.0,2017-05-01 14:42:13,2017-05-01 13:36:28,31053.0,31053.0,2017-05-01 15:18:00,2017-05-01 14:01:41,01:16:19,00:25:13
17,28154.0,9,684,HA298,11.0,39378.0,39378.0,2017-05-01 14:57:13,2017-05-01 13:36:28,34160.0,34160.0,2017-05-01 15:30:02,2017-05-01 13:57:14,01:32:48,00:20:46
18,28154.0,10,684,HA298,11.0,39378.0,39378.0,2017-05-01 15:12:13,2017-05-01 13:36:28,34160.0,34160.0,2017-05-01 15:45:02,2017-05-01 13:57:14,01:47:48,00:20:46
26,28161.0,4,050,JB301,5.0,14404.0,14404.0,2017-05-01 13:44:57,2017-05-01 13:36:33,32454.0,32454.0,2017-05-01 13:51:00,2017-05-01 13:39:53,00:11:07,00:03:20


In [230]:
vehicle_first_boardings_filtered = vehicle_first_boardings.sort_values(['actual_duration','otp_buste_end_timediff']) \
                                        .groupby(['o_boarding_id']) \
                                        .first()

In [231]:
vehicle_first_boardings_filtered.filter(['o_boarding_id','otp_itinerary_id','route','busCode','tripNum','otp_from_stop_id','o_stopPointId','otp_start_time','o_boarding_datetime','otp_to_stop_id','stopPointId','otp_end_time','gps_datetime','otp_buste_end_timediff','actual_duration'])

Unnamed: 0_level_0,otp_itinerary_id,route,busCode,tripNum,otp_from_stop_id,o_stopPointId,otp_start_time,o_boarding_datetime,otp_to_stop_id,stopPointId,otp_end_time,gps_datetime,otp_buste_end_timediff,actual_duration
o_boarding_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
28150.0,1,827,BC929,12.0,33788.0,33788.0,2017-05-01 13:38:33,2017-05-01 13:36:25,30993.0,30993.0,2017-05-01 13:45:00,2017-05-01 13:44:14,00:00:46,00:07:49
28154.0,9,684,HA298,11.0,39378.0,39378.0,2017-05-01 14:57:13,2017-05-01 13:36:28,34160.0,34160.0,2017-05-01 15:30:02,2017-05-01 13:57:14,01:32:48,00:20:46
28161.0,4,050,JB301,5.0,14404.0,14404.0,2017-05-01 13:44:57,2017-05-01 13:36:33,32454.0,32454.0,2017-05-01 13:51:00,2017-05-01 13:39:53,00:11:07,00:03:20
28164.0,10,812,BA020,9.0,30320.0,30320.0,2017-05-01 15:35:48,2017-05-01 13:36:35,30270.0,30270.0,2017-05-01 15:48:42,2017-05-01 13:45:17,02:03:25,00:08:42
28165.0,1,462,DC093,7.0,30798.0,30798.0,2017-05-01 13:38:46,2017-05-01 13:36:35,26157.0,26157.0,2017-05-01 14:12:00,2017-05-01 14:00:22,00:11:38,00:23:47
28169.0,9,545,JA300,6.0,33835.0,33835.0,2017-05-01 14:14:28,2017-05-01 13:36:36,5452.0,5452.0,2017-05-01 14:54:51,2017-05-01 14:10:54,00:43:57,00:34:18
28174.0,1,515,EA303,10.0,33580.0,33580.0,2017-05-01 13:40:27,2017-05-01 13:36:42,27791.0,27791.0,2017-05-01 13:56:00,2017-05-01 13:52:12,00:03:48,00:15:30
28177.0,5,658,JA026,8.0,35212.0,35212.0,2017-05-01 13:45:18,2017-05-01 13:36:44,27620.0,27620.0,2017-05-01 13:48:00,2017-05-01 13:38:12,00:09:48,00:01:28
28180.0,5,182,BC002,8.0,31854.0,31854.0,2017-05-01 14:03:29,2017-05-01 13:36:46,28615.0,28615.0,2017-05-01 14:29:58,2017-05-01 13:55:37,00:34:21,00:18:51
28204.0,2,040,HB604,4.0,34260.0,34260.0,2017-05-01 14:07:00,2017-05-01 13:37:05,34123.0,34123.0,2017-05-01 14:22:38,2017-05-01 13:52:15,00:30:23,00:15:10


In [222]:
vehicle_first_boardings_filtered.otp_buste_end_timediff.describe()

count                        83
mean     0 days 00:19:36.337349
std      0 days 00:44:32.422767
min             0 days 00:00:36
25%             0 days 00:06:48
50%             0 days 00:12:02
75%             0 days 00:21:27
max             0 days 06:46:55
Name: otp_buste_end_timediff, dtype: object

In [59]:
def get_otp_matched_legs(ebuste_otp_matches_itin,otp_suggestions):
        otp_legs_suggestions_matches = ebuste_otp_matches_itin.merge(otp_suggestions,
                                                                     left_on=['o_boarding_id','itinerary_id'],
                                                                     right_on=['user_trip_id','itinerary_id'],
                                                                     how='inner')
        #otp_legs_suggestions_matches['first_vehicle_boarding'] = np.where((otp_legs_suggestions_matches['mode'] == "BUS") & 
        #                                                                  (np.logical_not(otp_legs_suggestions_matches['o_busCode'].str.isdigit()) & 
        #                                                                   (otp_legs_suggestions_matches['o_route'] == otp_legs_suggestions_matches['route'])),
        #                                                                  True,
        #                                                                  False) #\
        #                                                                #.filter(np.append(otp_suggestions.columns.values,['first_vehicle_boarding','o_busCode','o_tripNum','o_boarding_datetime']))
        return otp_legs_suggestions_matches


In [64]:
        # Add OTP extra origin/next-origin pairs to final dataset
        #vehicle_boarding_matched_itins = matched_vehicle_boardings.filter(np.append(trips_origins.columns.values,['itinerary_id']))
        vehicle_otp_filtered_legs = get_otp_matched_legs(vehicle_boarding_matched_itins,otp_suggestions)
        vehicle_otp_filtered_bus_legs = get_otp_matched_legs(vehicle_boarding_matched_itins,otp_suggestions).query('mode == \'BUS\'')
        vehicle_otp_filtered_walk_legs = get_otp_matched_legs(vehicle_boarding_matched_itins,otp_suggestions).query('mode == \'WALK\'')


In [65]:
vehicle_otp_filtered_legs.head()

Unnamed: 0,cardNum,o_boarding_id,o_boarding_datetime,o_gps_datetime,o_route,o_busCode,o_tripNum,o_stopPointId,o_stop_lat,o_stop_lon,...,otp_start_time,otp_end_time,mode,route,from_stop_id,to_stop_id,otp_duration_mins,from_parent_station,to_parent_station,first_vehicle_boarding
0,3823073.0,28150.0,2017-05-01 13:36:25,2017-05-01 13:35:18,827,BC929,12.0,33788.0,-25.436303,-49.362221,...,2017-05-01 13:38:33,2017-05-01 13:45:00,BUS,827.0,33788.0,30993.0,6.45,,14508.0,True
1,3823073.0,28150.0,2017-05-01 13:36:25,2017-05-01 13:35:18,827,BC929,12.0,33788.0,-25.436303,-49.362221,...,2017-05-01 13:45:00,2017-05-01 13:45:52,WALK,,,,0.866667,,,False
2,3823073.0,28150.0,2017-05-01 13:36:25,2017-05-01 13:35:18,827,BC929,12.0,33788.0,-25.436303,-49.362221,...,2017-05-01 13:51:00,2017-05-01 14:01:59,BUS,303.0,26195.0,25753.0,10.983333,14508.0,41756.0,False
3,3823073.0,28150.0,2017-05-01 13:36:25,2017-05-01 13:35:18,827,BC929,12.0,33788.0,-25.436303,-49.362221,...,2017-05-01 14:02:00,2017-05-01 14:09:32,WALK,,,,7.533333,,,False
4,3823073.0,28150.0,2017-05-01 13:36:25,2017-05-01 13:35:18,827,BC929,12.0,33788.0,-25.436303,-49.362221,...,2017-05-01 14:28:17,2017-05-01 14:35:00,BUS,827.0,33788.0,30993.0,6.716667,,14508.0,True


In [66]:
vehicle_otp_filtered_legs.dtypes

cardNum                            float64
o_boarding_id                      float64
o_boarding_datetime         datetime64[ns]
o_gps_datetime              datetime64[ns]
o_route                             object
o_busCode                           object
o_tripNum                          float64
o_stopPointId                      float64
o_stop_lat                         float64
o_stop_lon                         float64
next_o_boarding_id                 float64
next_o_boarding_datetime    datetime64[ns]
next_o_gps_datetime         datetime64[ns]
next_o_route                        object
next_o_busCode                      object
next_o_tripNum                     float64
next_o_stopPointId                 float64
next_o_stop_lat                    float64
next_o_stop_lon                    float64
boardings_timediff                  object
dist_between_origins               float64
itinerary_id                         int64
date                        datetime64[ns]
user_trip_i

In [72]:
bus_trips.head()

Unnamed: 0,route,busCode,shapeId,tripNum,stopPointId,gps_datetime,distanceTraveledShape,stop_lat,stop_lon,parent_station
29435,10,BB303,1708.0,1.0,33157.0,2017-05-01 06:28:07,537.974,-25.410517,-49.276479,
29436,10,BB303,1708.0,1.0,33159.0,2017-05-01 06:30:38,1141.061,-25.411726,-49.270902,
29437,10,BB303,1708.0,1.0,33158.0,2017-05-01 06:31:40,1624.751,-25.415285,-49.270134,
29438,10,BB303,1708.0,1.0,30150.0,2017-05-01 06:32:26,1972.077,-25.416733,-49.267863,
29439,10,BB303,1708.0,1.0,28637.0,2017-05-01 06:33:11,2378.349,-25.414184,-49.265917,


In [79]:
def match_vehicle_boardings_otp_legs_start_to_buste(otp_buste_legs,bus_trips):
        otp_legs_buste_start = otp_filtered_legs.merge(bus_trips,
                                                       left_on=['route','o_busCode','o_tripNum','from_stop_id'],
                                                       right_on=['route','busCode','tripNum','stopPointId'],
                                                       how='inner') \
                                                .assign(otp_buste_start_timediff = 
                                                        lambda x: np.absolute(x['gps_datetime'] - x['otp_start_time'])) \
                                                .sort_values(['user_trip_id','itinerary_id','leg_id','otp_buste_start_timediff']) \
                                                .rename(index=str, columns={'to_stop_id':'stopPointId', 'gps_datetime':'matched_start_time'})

        #otp_legs_buste_start = otp_legs_buste_start[otp_legs_buste_start['otp_buste_start_timediff'] < pd.Timedelta('60min')]
        return otp_legs_buste_start


In [74]:
        # Find legs candidate match start point
        otp_legs_buste_start = match_vehicle_boardings_otp_legs_start_to_buste(vehicle_otp_filtered_bus_legs,bus_trips)


In [77]:
otp_legs_buste_start.filter(np.append(bus_trips.columns,otp_suggestions.columns)).head()

Unnamed: 0,route,busCode,tripNum,stopPointId,user_trip_id,itinerary_id,leg_id,otp_start_time,otp_end_time,mode,route.1,from_stop_id,otp_duration_mins
0,827,BC929,12.0,30993.0,28150.0,1,1,2017-05-01 13:38:33,2017-05-01 13:45:00,BUS,827,33788.0,6.45
1,827,BC929,12.0,30993.0,28150.0,1,1,2017-05-01 13:38:33,2017-05-01 13:45:00,BUS,827,33788.0,6.45
2,827,BC929,12.0,30993.0,28150.0,1,1,2017-05-01 13:38:33,2017-05-01 13:45:00,BUS,827,33788.0,6.45
3,827,BC929,12.0,30993.0,28150.0,1,1,2017-05-01 13:38:33,2017-05-01 13:45:00,BUS,827,33788.0,6.45
4,827,BC929,12.0,30993.0,28150.0,8,1,2017-05-01 14:28:17,2017-05-01 14:35:00,BUS,827,33788.0,6.716667


In [8]:
def match_otp_legs_end_to_buste(otp_filtered_legs,bus_trips):
        otp_legs_buste = otp_legs_buste_start \
                                .merge(bus_trips,
                                                 on=['route','busCode','tripNum','stopPointId'],
                                                 how='inner') \
                                .assign(otp_buste_end_timediff =
                                                        lambda x: np.absolute(x['gps_datetime'] - x['otp_end_time'])) \
                                .rename(index=str, columns={'stopPointId':'to_stop_id', 'gps_datetime':'matched_end_time'}) \
                                .assign(matched_leg_duration_mins = lambda x: x['matched_end_time'] - x['matched_start_time'],
                                                boarding_matched_start_timediff =
                                                        lambda x: np.absolute(x['o_boarding_datetime'] - x['matched_start_time'])) \
                                .query('matched_end_time > matched_start_time') \
                                .filter(['user_trip_id','first_vehicle_boarding','itinerary_id','leg_id','mode','route','busCode',
                                                 'o_busCode','tripNum','o_tripNum','from_stop_id','otp_start_time',
                                                 'matched_start_time','o_boarding_datetime','otp_buste_start_timediff',
                                                 'to_stop_id','otp_end_time','matched_end_time','otp_buste_end_timediff',
                                                 'boarding_matched_start_timediff', 'otp_duration_mins','matched_leg_duration_mins']) \
                                .sort_values(['user_trip_id','itinerary_id','leg_id','otp_buste_end_timediff'])

        otp_legs_buste = otp_legs_buste[otp_legs_buste['otp_buste_end_timediff'] < pd.Timedelta('60min')]
        return otp_legs_buste


In [None]:
itineraries_rest = pd.concat([itineraries_start,otp_suggestions.query('otp_mode == \'BUS\'').sort_values(['otp_user_trip_id','otp_itinerary_id','otp_leg_id'])]) \
                        .reset_index() \
                        .drop_duplicates(keep=False)