In [1]:
#Libraries

#Python Libs
import sys
import os
import glob
from datetime import datetime
import time
from geopy import distance

#Data Analysis Libs
import pandas as pd
import numpy as np

#### Functions

In [112]:
def printUsage():
    print "Usage: " + sys.argv[0] + " <otp-suggestions-filepath> <enhanced-buste-folderpath> <gtfs-base-folderpath> <output-folderpath>"

def select_input_files(enh_buste_base_path,init_date,fin_date,suffix):
    selected_files = []
    all_files = glob.glob(os.path.join(enh_buste_base_path,"*"))

    for file_ in all_files:
        try:
            file_date = pd.to_datetime(file_.split('/')[-1],format=('%Y_%m_%d' + suffix  + '.csv'))
            if (file_date >= init_date) and (file_date <= fin_date):
                selected_files.append((file_,file_date))
        except:
            continue

    return sorted(selected_files)

def dist(p1_lat, p1_lon, p2_lat, p2_lon):
    return np.around(distance.geodesic((p1_lat,p1_lon),(p2_lat,p2_lon)).km,decimals=5)

def get_router_id(query_date):
    INTERMEDIATE_OTP_DATE = pd.to_datetime("2017-06-30", format="%Y-%m-%d")
    router_id = ''

    if (query_date <= INTERMEDIATE_OTP_DATE):
        return 'ctba-2017-1'
    else:
        return 'ctba-2017-2'

def choose_leg_matches(leg_matches_groups):
    chosen_leg_matches = pd.DataFrame(columns = otp_legs_buste.columns.values)
    prev_group_id = ()
    num_groups_not_survived = 0

    for name, group in legs_matches_groups:

        if (prev_group_id != name[0:2]):
            prev_leg_end_time = otp_suggestions['date'][0]

        #print
        #print prev_leg_end_time
        #print
        #print "Original Group"
        #print group.filter(['otp_start_time','matched_start_time'])

        filtered_group = group[group['matched_start_time'] > prev_leg_end_time]
        #print
        #print "Filtered Group"
        #print filtered_group.filter(['otp_start_time','matched_start_time'])

        if (len(filtered_group) == 0):
            num_groups_not_survived += 1
            continue

        chosen_leg_match = filtered_group.sort_values('boarding_otp_match_start_timediff').iloc[0]
        #print "Chosen Leg"
        #print chosen_leg_match

        chosen_leg_matches = chosen_leg_matches.append(chosen_leg_match)

        #Update variables
        prev_group_id = name[0:2]
        prev_leg_end_time = chosen_leg_match['matched_end_time']
        
    chosen_leg_matches = chosen_leg_matches.filter(otp_legs_buste.columns.values) \
                                            .assign(user_trip_id = lambda x: x['user_trip_id'].astype(int),
                                                    itinerary_id = lambda x: x['itinerary_id'].astype(int),
                                                    leg_id = lambda x: x['leg_id'].astype(int))


    #print num_groups_not_survived
    return chosen_leg_matches

def prepare_otp_data(otp_data):
    otp_data['otp_start_time'] = otp_data['otp_start_time'] - pd.Timedelta('10800 s')
    otp_data['otp_end_time'] = otp_data['otp_end_time'] - pd.Timedelta('10800 s')
    otp_data['route'] = otp_data['route'].astype(str)
    otp_data['route'] = np.where(otp_data['mode'] == 'BUS',
                            otp_data['route'].astype(str).str.replace("\.0",'').str.zfill(3),
                            otp_data['route'])

    return otp_data

def match_vehicle_boardings(selected_trips,itineraries_start):
    vehicle_boarding_origins = selected_trips[np.logical_not(selected_trips['o_busCode'].str.isdigit())]
    matched_vehicle_boardings = vehicle_boarding_origins.merge(itineraries_start, left_on=['o_boarding_id','o_route','o_stopPointId'], right_on=['user_trip_id','route','from_stop_id'], how='inner')
    num_matched_vehicle_boardings = len(matched_vehicle_boardings.drop_duplicates(subset=['cardNum','o_boarding_id']))
    match_perc = 100*(num_matched_vehicle_boardings/float(len(vehicle_boarding_origins)))
    return (matched_vehicle_boardings,num_matched_vehicle_boardings,match_perc)

def compatible_dates(otp_data,ticketing_data):
    otp_date = otp_data['date'].iloc[0]
    ticketing_date = pd.to_datetime(ticketing_data['o_boarding_datetime'].dt.strftime('%Y-%m-%d')[0])

    return (otp_date == ticketing_date,otp_date,ticketing_date)

def match_terminal_boardings(selected_trips,itineraries_start):
    terminal_boarding_origins = selected_trips[(selected_trips['o_busCode'].str.isdigit()) & (selected_trips['o_route'] != '021')]
    matched_terminal_boardings = terminal_boarding_origins.merge(itineraries_start,
                                                                left_on=['o_boarding_id','o_stopPointId'],
                                                                right_on=['user_trip_id','parent_station'], how='inner')
    num_matched_terminal_boardings = len(matched_terminal_boardings.drop_duplicates(subset=['cardNum','o_boarding_id']))
    matched_perc = 100*(num_matched_terminal_boardings/float(len(terminal_boarding_origins)))
    return (matched_terminal_boardings,num_matched_terminal_boardings,matched_perc)

def match_terminal_021_boardings(selected_trips,itineraries_start):
    terminal_021_origins = selected_trips[(selected_trips['o_busCode'].str.isdigit()) & (selected_trips['o_route'] == '021')]
    matched_021_terminal_boardings = terminal_021_origins.merge(itineraries_start, left_on=['o_boarding_id','o_route','o_stopPointId'], right_on=['user_trip_id','route','parent_station'], how='inner')
    num_matched_021_terminal_boardings = len(matched_021_terminal_boardings.drop_duplicates(subset=['cardNum','o_boarding_id']))
    terminal_021_match_perc = 100*(num_matched_021_terminal_boardings/float(len(terminal_021_origins)))
    return (len(terminal_021_origins),matched_021_terminal_boardings,num_matched_021_terminal_boardings,terminal_021_match_perc)

def get_otp_matched_legs(boarding_suggestions_matches,otp_suggestions):
    otp_legs_suggestions_matches = boarding_suggestions_matches \
                                    .filter(np.append(trips_origins.columns.values,['itinerary_id'])) \
                                    .merge(otp_suggestions,
                                            left_on=['o_boarding_id','itinerary_id'],
                                            right_on=['user_trip_id','itinerary_id'],
                                            how='inner') \
                                    .query('mode == \'BUS\'') \
                                    .assign(first_vehicle_boarding = lambda x: np.where(np.logical_not(x['o_busCode'].str.isdigit()) & (
                                                                                        x['o_route'] == x['route']),
                                                                                          True,
                                                                                          False)) \
                                    .filter(np.append(otp_suggestions.columns.values,['first_vehicle_boarding','o_busCode','o_tripNum','o_boarding_datetime']))
    return otp_legs_suggestions_matches

def match_otp_legs_start_to_buste(otp_filtered_legs,bus_trips):
    otp_legs_buste_start = otp_filtered_legs \
                            .merge(bus_trips,
                                     left_on=['route','from_stop_id'],
                                     right_on=['route','stopPointId'],
                                     how='inner') \
                            .assign(otp_buste_start_timediff =
                                lambda x: np.absolute(x['gps_datetime'] - x['otp_start_time'])) \
                            .filter(['user_trip_id','first_vehicle_boarding','itinerary_id','leg_id','route','busCode',
                                 'o_busCode','tripNum','o_tripNum','from_stop_id','otp_start_time','gps_datetime',
                                 'o_boarding_datetime','otp_buste_start_timediff','to_stop_id','otp_end_time']) \
                            .sort_values(['user_trip_id','itinerary_id','leg_id','otp_buste_start_timediff']) \
                            .rename(index=str, columns={'to_stop_id':'stopPointId', 'gps_datetime':'matched_start_time'})

    otp_legs_buste_start = otp_legs_buste_start[otp_legs_buste_start['otp_buste_start_timediff'] < pd.Timedelta('60min')]
    return otp_legs_buste_start

def match_otp_legs_end_to_buste(otp_filtered_legs,bus_trips):
    otp_legs_buste = otp_legs_buste_start \
                .merge(bus_trips,
                         on=['route','busCode','tripNum','stopPointId'],
                         how='inner') \
                .assign(otp_buste_end_timediff =
                            lambda x: np.absolute(x['gps_datetime'] - x['otp_end_time'])) \
                .rename(index=str, columns={'stopPointId':'to_stop_id', 'gps_datetime':'matched_end_time'}) \
                .assign(leg_duration = lambda x: x['matched_end_time'] - x['matched_start_time'],
                        boarding_otp_match_start_timediff =
                            lambda x: np.absolute(x['o_boarding_datetime'] - x['matched_start_time'])) \
                .query('matched_end_time > matched_start_time') \
                .filter(['user_trip_id','first_vehicle_boarding','itinerary_id','leg_id','route','busCode',
                         'o_busCode','tripNum','o_tripNum','from_stop_id','otp_start_time',
                         'matched_start_time','o_boarding_datetime','otp_buste_start_timediff',
                         'to_stop_id','otp_end_time','matched_end_time','otp_buste_end_timediff',
                         'boarding_otp_match_start_timediff', 'leg_duration']) \
                .sort_values(['user_trip_id','itinerary_id','leg_id','otp_buste_end_timediff'])

    otp_legs_buste = otp_legs_buste[otp_legs_buste['otp_buste_end_timediff'] < pd.Timedelta('60min')]
    return otp_legs_buste

def add_stops_data_to_leg_matches(chosen_leg_matches,stops_locations):
    chosen_leg_matches_data = chosen_leg_matches \
                                        .merge(stops_locations, left_on='from_stop_id', right_on='stop_id', how='left') \
                                        .drop('stop_id', axis=1) \
                                        .rename(index=str, columns={'stop_lat':'from_stop_lat','stop_lon':'from_stop_lon'}) \
                                        .merge(stops_locations, left_on='to_stop_id', right_on='stop_id', how='left') \
                                        .drop('stop_id', axis=1) \
                                        .rename(index=str, columns={'stop_lat':'to_stop_lat','stop_lon':'to_stop_lon'}) \
                                        .merge(user_trips_ids, on=['user_trip_id'], how='inner') \
                                        [np.append(np.append(['cardNum'],otp_legs_buste.columns.values),['from_stop_lat','from_stop_lon','to_stop_lat','to_stop_lon'])]
    return chosen_leg_matches_data

def build_candidate_itineraries_df(chosen_leg_matches_data):
    otp_buste_itineraries = chosen_leg_matches_data \
                    .groupby(['cardNum','user_trip_id','itinerary_id']) \
                    .agg({'from_stop_id': lambda x: x.iloc[0],
                          'matched_start_time': lambda x: x.iloc[0],
                          'from_stop_lat': lambda x: x.iloc[0],
                          'from_stop_lon': lambda x: x.iloc[0],
                          'to_stop_id': lambda x: x.iloc[-1],
                          'matched_end_time': lambda x: x.iloc[-1],
                          'to_stop_lat': lambda x: x.iloc[-1],
                          'to_stop_lon': lambda x: x.iloc[-1],
                          'leg_id': lambda x: len(x),
                          'first_vehicle_boarding' : lambda x: x.any()}) \
                    .rename(index=str, columns={'leg_id':'num_transfers','first_vehicle_boarding':'vehicle_boarding'}) \
                    .add_prefix('match_') \
                    .reset_index() \
                    .assign(cardNum = lambda x: x['cardNum'].astype(float),
                            user_trip_id = lambda x: x['user_trip_id'].astype(int),
                            itinerary_id = lambda x: x['itinerary_id'].astype(int))
    return otp_buste_itineraries

def get_candidate_itineraries_summary(candidate_itineraries,boarding_suggestions_matches):
    otp_buste_itineraries_summary = candidate_itineraries \
                    .merge(boarding_suggestions_matches \
                                .drop_duplicates(subset=['cardNum','user_trip_id','itinerary_id']),
                            on=['cardNum','user_trip_id','itinerary_id'],
                            how='inner') \
                    [['cardNum', 'user_trip_id', 'itinerary_id',
                          'match_from_stop_id', 'match_matched_start_time', 'o_boarding_datetime',
                          'match_from_stop_lat', 'match_from_stop_lon', 'o_stop_lat', 'o_stop_lon',
                          'match_to_stop_id', 'match_matched_end_time', 'next_o_boarding_datetime',
                          'match_to_stop_lat', 'match_to_stop_lon', 'next_o_stop_lat', 'next_o_stop_lon','match_num_transfers', 'match_vehicle_boarding']] \
                    .assign(start_diff = lambda x: np.absolute(x['match_matched_start_time'] - x['o_boarding_datetime']),
                            trip_duration = lambda x: x['match_matched_end_time'] - x['match_matched_start_time'],
                            origin_dist = lambda y: y.apply(lambda x: dist(x['match_from_stop_lat'], x['match_from_stop_lon'], x['o_stop_lat'], x['o_stop_lon']),axis=1),
                            next_origin_dist = lambda y: y.apply(lambda x: dist(x['match_to_stop_lat'], x['match_to_stop_lon'], x['next_o_stop_lat'], x['next_o_stop_lon']),axis=1)) \
                    .sort_values(['cardNum','user_trip_id'])

    otp_buste_itineraries_summary = otp_buste_itineraries_summary[((otp_buste_itineraries_summary['trip_duration'] > pd.Timedelta('0s')) &
                                                                    (otp_buste_itineraries_summary['trip_duration'] < pd.Timedelta('2h'))) &
                                                                ((otp_buste_itineraries_summary['start_diff'] > pd.Timedelta('0s')) &
                                                                    (otp_buste_itineraries_summary['start_diff'] < pd.Timedelta('1.5h')))] \
                                        .query('origin_dist < 0.1') \
                                        .query('next_origin_dist < 2.0')
    return otp_buste_itineraries_summary

def get_candidate_itineraries_penalty_score(otp_buste_itineraries_filtered):
    otp_buste_itineraries_penalty = otp_buste_itineraries_filtered \
                            .assign(penalty = lambda x: 2*x['start_diff'].dt.total_seconds() + x['trip_duration'].dt.total_seconds() + x['match_num_transfers']*10) \
                            [['cardNum','user_trip_id','itinerary_id','match_num_transfers','match_vehicle_boarding','next_origin_dist','origin_dist','start_diff','trip_duration','penalty']] \
                            .sort_values(['user_trip_id','penalty'], ascending=True)
    return otp_buste_itineraries_penalty


#### Read OTP Suggestions

In [3]:
base_data_folderpath = '/home/tarciso/data/'

In [153]:
otp_suggestions_filepath = base_data_folderpath + '/otp-itineraries/test/2017_06_14_user_trips_aa_otp_itineraries.csv'
otp_suggestions_raw = pd.read_csv(otp_suggestions_filepath, parse_dates=['date','otp_start_time','otp_end_time'])

In [154]:
otp_suggestions = prepare_otp_data(otp_suggestions_raw)

In [155]:
len(otp_suggestions.drop_duplicates(subset=['user_trip_id']))

200

In [156]:
otp_suggestions.head()

Unnamed: 0,date,user_trip_id,itinerary_id,leg_id,otp_start_time,otp_end_time,mode,route,from_stop_id,to_stop_id,otp_duration_mins
0,2017-06-14,456,1,1,2017-06-14 18:29:58,2017-06-14 18:29:59,WALK,,,,0.016667
1,2017-06-14,456,1,2,2017-06-14 18:30:00,2017-06-14 19:02:09,BUS,304.0,2654.0,25915.0,32.15
2,2017-06-14,456,1,3,2017-06-14 19:02:09,2017-06-14 19:03:29,WALK,,,,1.333333
3,2017-06-14,456,1,4,2017-06-14 19:04:20,2017-06-14 19:38:00,BUS,303.0,26030.0,26192.0,33.666667
4,2017-06-14,456,1,5,2017-06-14 19:38:01,2017-06-14 19:38:17,WALK,,,,0.266667


In [157]:
len(otp_suggestions)

8055

#### Adding Parent Stop data to OTP Suggestions

In [158]:
gtfs_base_folderpath = '/home/tarciso/data/gtfs/'
file_date = otp_suggestions['date'][0]
stops_filepath = gtfs_base_folderpath + os.sep + get_router_id(file_date) + os.sep + 'stops.txt'
stops_df = pd.read_csv(stops_filepath)

In [159]:
stops_df.head()

Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station,stop_timezone,wheelchair_boarding
0,70,104505.0,Terminal Campina do Siqueira - 303 - Centenári...,Terminal Campina do Siqueira - Campo Comprido,-25.435724,-49.306998,,,0,14506.0,,
1,270,104905.0,Terminal Carmo - 030 - Interbairros III,Terminal Carmo - 030 - Interbairros III (Senti...,-25.501341,-49.237597,,,0,14485.0,,
2,276,105606.0,Terminal Oficinas - 030 - Interbairros III,Terminal Oficinas - 030 - Interbairros III (Se...,-25.45155,-49.214917,,,0,14481.0,,
3,299,105603.0,Terminal Oficinas - 030 - Interbairros III,Terminal Oficinas - 030 - Interbairros III (Se...,-25.451665,-49.215086,,,0,14481.0,,
4,308,104907.0,Terminal Carmo - 030 - Interbairros III,Terminal Carmo - 030 - Interbairros III (Senti...,-25.501311,-49.237825,,,0,14485.0,,


In [160]:
stops_parent_stations = stops_df[['stop_id','parent_station']]

In [161]:
otp_suggestions = otp_suggestions.merge(stops_parent_stations, left_on='from_stop_id', right_on='stop_id', how='left') \
                                    .drop(['stop_id'], axis=1)

In [162]:
otp_suggestions.head()

Unnamed: 0,date,user_trip_id,itinerary_id,leg_id,otp_start_time,otp_end_time,mode,route,from_stop_id,to_stop_id,otp_duration_mins,parent_station
0,2017-06-14,456,1,1,2017-06-14 18:29:58,2017-06-14 18:29:59,WALK,,,,0.016667,
1,2017-06-14,456,1,2,2017-06-14 18:30:00,2017-06-14 19:02:09,BUS,304.0,2654.0,25915.0,32.15,41752.0
2,2017-06-14,456,1,3,2017-06-14 19:02:09,2017-06-14 19:03:29,WALK,,,,1.333333,
3,2017-06-14,456,1,4,2017-06-14 19:04:20,2017-06-14 19:38:00,BUS,303.0,26030.0,26192.0,33.666667,26032.0
4,2017-06-14,456,1,5,2017-06-14 19:38:01,2017-06-14 19:38:17,WALK,,,,0.266667,


#### Read Origin/Next-Origin Pairs

In [163]:
trips_origins = pd.read_csv(base_data_folderpath + '/enhanced-buste/2017_06_14_user_trips.csv', parse_dates=['o_boarding_datetime','o_gps_datetime','next_o_boarding_datetime','next_o_gps_datetime'])

In [164]:
len(trips_origins)

73469

In [165]:
trips_origins.sort_values(['cardNum','o_boarding_id']).head()

Unnamed: 0,cardNum,o_boarding_id,o_boarding_datetime,o_route,o_busCode,o_tripNum,o_stopPointId,o_gps_datetime,o_stop_lat,o_stop_lon,...,next_o_boarding_datetime,next_o_route,next_o_busCode,next_o_tripNum,next_o_stopPointId,next_o_gps_datetime,next_o_stop_lat,next_o_stop_lon,boardings_timediff,dist_between_origins
12649,228696.0,0,2017-06-14 06:55:38,000,8024,,41752,NaT,-25.449641,-49.353854,...,2017-06-14 14:05:47,000,7030,,41920,NaT,-25.429398,-49.272319,0 days 07:10:09.000000000,8.50251
12650,228696.0,991,2017-06-14 14:05:47,000,7030,,41920,NaT,-25.429398,-49.272319,...,2017-06-14 06:55:38,000,8024,,41752,NaT,-25.449641,-49.353854,0 days 07:10:09.000000000,8.50251
29236,230146.0,7670,2017-06-14 07:50:45,000,57,,43328,NaT,-25.52542,-49.230897,...,2017-06-14 16:13:00,TCA,72,,14485,NaT,-25.501303,-49.237642,0 days 08:22:15.000000000,2.75643
29237,230146.0,8039,2017-06-14 16:13:00,TCA,72,,14485,NaT,-25.501303,-49.237642,...,2017-06-14 07:50:45,000,57,,43328,NaT,-25.52542,-49.230897,0 days 08:22:15.000000000,2.75643
47723,305601.0,19840,2017-06-14 10:59:47,000,5007,,14517,NaT,-25.377432,-49.262446,...,2017-06-14 13:25:52,000,44,,43723,NaT,-25.433715,-49.270258,0 days 02:26:05.000000000,6.28427


In [166]:
trips_origins.dist_between_origins.describe()

count    73469.000000
mean         6.007518
std          3.053211
min          1.500850
25%          3.489690
50%          5.648710
75%          8.018600
max         20.098560
Name: dist_between_origins, dtype: float64

In [167]:
trips_origins.boardings_timediff.describe()

count                         73469
unique                        29224
top       0 days 10:08:12.000000000
freq                             13
Name: boardings_timediff, dtype: object

In [168]:
len(trips_origins)

73469

In [169]:
trips_origins.dtypes

cardNum                            float64
o_boarding_id                        int64
o_boarding_datetime         datetime64[ns]
o_route                             object
o_busCode                           object
o_tripNum                          float64
o_stopPointId                        int64
o_gps_datetime              datetime64[ns]
o_stop_lat                         float64
o_stop_lon                         float64
next_o_boarding_id                   int64
next_o_boarding_datetime    datetime64[ns]
next_o_route                        object
next_o_busCode                      object
next_o_tripNum                     float64
next_o_stopPointId                   int64
next_o_gps_datetime         datetime64[ns]
next_o_stop_lat                    float64
next_o_stop_lon                    float64
boardings_timediff                  object
dist_between_origins               float64
dtype: object

#### Selecting trips for whom OTP suggestions were found

In [170]:
selected_trips = trips_origins[trips_origins['o_boarding_id'].isin(otp_suggestions['user_trip_id'])]
num_selected_trips = len(selected_trips)

In [171]:
len(selected_trips)

200

In [172]:
selected_trips.head()

Unnamed: 0,cardNum,o_boarding_id,o_boarding_datetime,o_route,o_busCode,o_tripNum,o_stopPointId,o_gps_datetime,o_stop_lat,o_stop_lon,...,next_o_boarding_datetime,next_o_route,next_o_busCode,next_o_tripNum,next_o_stopPointId,next_o_gps_datetime,next_o_stop_lat,next_o_stop_lon,boardings_timediff,dist_between_origins
0,3777381.0,150496,2017-06-14 00:23:47,226,BA118,9.0,1899,2017-06-14 12:18:25,-25.368544,-49.245159,...,2017-06-15 00:04:25,0,05022,,14474,NaT,-25.406659,-49.252791,0 days 23:40:38.000000000,4.29156
1,3777381.0,49257,2017-06-15 00:04:25,0,05022,,14474,NaT,-25.406659,-49.252791,...,2017-06-14 00:23:47,226,BA118,9.0,1899,2017-06-14 12:18:25,-25.368544,-49.245159,0 days 23:40:38.000000000,4.29156
2,3563770.0,133872,2017-06-14 04:41:39,0,00010,,14492,NaT,-25.534177,-49.26778,...,2017-06-14 17:07:28,0,06054,,26039,NaT,-25.435546,-49.250567,0 days 12:25:49.000000000,11.06266
3,3563770.0,163584,2017-06-14 17:07:28,0,06054,,26039,NaT,-25.435546,-49.250567,...,2017-06-14 04:41:39,0,00010,,14492,NaT,-25.534177,-49.26778,0 days 12:25:49.000000000,11.06266
4,1799518.0,132335,2017-06-14 04:41:45,0,00010,,14492,NaT,-25.534177,-49.26778,...,2017-06-14 17:31:21,0,00042,,26044,NaT,-25.43886,-49.268205,0 days 12:49:36.000000000,10.55934


In [173]:
itineraries_start = otp_suggestions.query('mode == \'BUS\'') \
                    .groupby(['user_trip_id','itinerary_id']) \
                    .first() \
                    .reset_index()

In [174]:
itineraries_start.head()

Unnamed: 0,user_trip_id,itinerary_id,date,leg_id,otp_start_time,otp_end_time,mode,route,from_stop_id,to_stop_id,otp_duration_mins,parent_station
0,456,1,2017-06-14,2,2017-06-14 18:30:00,2017-06-14 19:02:09,BUS,304,2654.0,25915.0,32.15,41752.0
1,456,2,2017-06-14,2,2017-06-14 18:35:00,2017-06-14 18:38:51,BUS,304,2654.0,26287.0,3.85,41752.0
2,456,3,2017-06-14,2,2017-06-14 18:48:00,2017-06-14 19:22:08,BUS,304,2654.0,25915.0,34.133333,41752.0
3,456,4,2017-06-14,2,2017-06-14 18:55:00,2017-06-14 18:58:43,BUS,304,2654.0,26287.0,3.716667,41752.0
4,456,5,2017-06-14,2,2017-06-14 18:40:00,2017-06-14 19:15:08,BUS,304,2654.0,25915.0,35.133333,41752.0


In [175]:
matched_vehicle_boardings,num_matched_vehicle_boardings,vehicle_match_perc = match_vehicle_boardings(selected_trips,itineraries_start)

In [176]:
print "Vehicle boardings with matching OTP suggestions: ", num_matched_vehicle_boardings, "(",vehicle_match_perc, "%)"

Vehicle boardings with matching OTP suggestions:  4 ( 80.0 %)


In [177]:
matched_terminal_boardings,num_matched_terminal_boardings,terminal_matched_perc = match_terminal_boardings(selected_trips,itineraries_start)

In [178]:
print "Terminal boardings with matching OTP suggestions: ", num_matched_terminal_boardings, "(", terminal_matched_perc, "%)"

Terminal boardings with matching OTP suggestions:  141 ( 78.3333333333 %)


In [179]:
num_terminal_021_boardings,matched_021_terminal_boardings,num_matched_021_terminal_boardings,terminal_021_match_perc = match_terminal_021_boardings(selected_trips,itineraries_start)

In [180]:
if (num_terminal_021_boardings > 0):
    print "Line 021 Terminal boardings with matching OTP suggestions: ", num_matched_021_terminal_boardings, "(",terminal_021_match_perc, "%)"
else:
    print "No Line 021 Terminal boardings found. Skipping matching."


Line 021 Terminal boardings with matching OTP suggestions:  0 ( 0.0 %)


In [181]:
boarding_suggestions_matches = pd.concat([matched_vehicle_boardings,matched_021_terminal_boardings,matched_terminal_boardings])
total_num_matches = num_matched_vehicle_boardings + num_matched_021_terminal_boardings + num_matched_terminal_boardings


In [182]:
print "Total number of matches: ", total_num_matches, "(", 100*(total_num_matches/float(num_selected_trips)), "%)"

Total number of matches:  145 ( 72.5 %)


In [183]:
boarding_suggestions_matches.head()

Unnamed: 0,cardNum,o_boarding_id,o_boarding_datetime,o_route,o_busCode,o_tripNum,o_stopPointId,o_gps_datetime,o_stop_lat,o_stop_lon,...,date,leg_id,otp_start_time,otp_end_time,mode,route,from_stop_id,to_stop_id,otp_duration_mins,parent_station
0,3777381.0,150496,2017-06-14 00:23:47,226,BA118,9.0,1899,2017-06-14 12:18:25,-25.368544,-49.245159,...,2017-06-14,1,2017-06-14 05:00:00,2017-06-14 05:20:00,BUS,226,1899.0,30452.0,20.0,14471.0
1,3777381.0,150496,2017-06-14 00:23:47,226,BA118,9.0,1899,2017-06-14 12:18:25,-25.368544,-49.245159,...,2017-06-14,1,2017-06-14 05:25:00,2017-06-14 05:44:00,BUS,226,1899.0,30452.0,19.0,14471.0
2,3777381.0,150496,2017-06-14 00:23:47,226,BA118,9.0,1899,2017-06-14 12:18:25,-25.368544,-49.245159,...,2017-06-14,1,2017-06-14 05:43:00,2017-06-14 06:04:00,BUS,226,1899.0,30452.0,21.0,14471.0
3,3777381.0,150496,2017-06-14 00:23:47,226,BA118,9.0,1899,2017-06-14 12:18:25,-25.368544,-49.245159,...,2017-06-14,1,2017-06-14 06:00:00,2017-06-14 06:20:00,BUS,226,1899.0,30452.0,20.0,14471.0
4,3777381.0,150496,2017-06-14 00:23:47,226,BA118,9.0,1899,2017-06-14 12:18:25,-25.368544,-49.245159,...,2017-06-14,1,2017-06-14 06:15:00,2017-06-14 06:34:00,BUS,226,1899.0,30452.0,19.0,14471.0


#### Add OTP extra origin/next-origin pairs to final dataset

In [184]:
boarding_suggestions_matches.dtypes

cardNum                            float64
o_boarding_id                        int64
o_boarding_datetime         datetime64[ns]
o_route                             object
o_busCode                           object
o_tripNum                          float64
o_stopPointId                        int64
o_gps_datetime              datetime64[ns]
o_stop_lat                         float64
o_stop_lon                         float64
next_o_boarding_id                   int64
next_o_boarding_datetime    datetime64[ns]
next_o_route                        object
next_o_busCode                      object
next_o_tripNum                     float64
next_o_stopPointId                   int64
next_o_gps_datetime         datetime64[ns]
next_o_stop_lat                    float64
next_o_stop_lon                    float64
boardings_timediff                  object
dist_between_origins               float64
user_trip_id                         int64
itinerary_id                         int64
date       

In [185]:
trips_origins.dtypes

cardNum                            float64
o_boarding_id                        int64
o_boarding_datetime         datetime64[ns]
o_route                             object
o_busCode                           object
o_tripNum                          float64
o_stopPointId                        int64
o_gps_datetime              datetime64[ns]
o_stop_lat                         float64
o_stop_lon                         float64
next_o_boarding_id                   int64
next_o_boarding_datetime    datetime64[ns]
next_o_route                        object
next_o_busCode                      object
next_o_tripNum                     float64
next_o_stopPointId                   int64
next_o_gps_datetime         datetime64[ns]
next_o_stop_lat                    float64
next_o_stop_lon                    float64
boardings_timediff                  object
dist_between_origins               float64
dtype: object

In [186]:
otp_filtered_legs = get_otp_matched_legs(boarding_suggestions_matches,otp_suggestions)

In [187]:
otp_filtered_legs

Unnamed: 0,date,user_trip_id,itinerary_id,leg_id,otp_start_time,otp_end_time,mode,route,from_stop_id,to_stop_id,otp_duration_mins,parent_station,first_vehicle_boarding,o_busCode,o_tripNum,o_boarding_datetime
0,2017-06-14,150496,1,1,2017-06-14 05:00:00,2017-06-14 05:20:00,BUS,226,1899.0,30452.0,20.000000,,True,BA118,9.0,2017-06-14 00:23:47
2,2017-06-14,150496,1,3,2017-06-14 05:25:46,2017-06-14 05:31:00,BUS,203,26190.0,26187.0,5.233333,14471.0,False,BA118,9.0,2017-06-14 00:23:47
4,2017-06-14,150496,4,1,2017-06-14 05:25:00,2017-06-14 05:44:00,BUS,226,1899.0,30452.0,19.000000,,True,BA118,9.0,2017-06-14 00:23:47
6,2017-06-14,150496,4,3,2017-06-14 05:48:46,2017-06-14 05:54:00,BUS,203,26190.0,26187.0,5.233333,14471.0,False,BA118,9.0,2017-06-14 00:23:47
8,2017-06-14,150496,7,1,2017-06-14 05:43:00,2017-06-14 06:04:00,BUS,226,1899.0,30452.0,21.000000,,True,BA118,9.0,2017-06-14 00:23:47
10,2017-06-14,150496,7,3,2017-06-14 06:05:46,2017-06-14 06:11:00,BUS,203,26190.0,26187.0,5.233333,14471.0,False,BA118,9.0,2017-06-14 00:23:47
12,2017-06-14,150496,9,1,2017-06-14 06:00:00,2017-06-14 06:20:00,BUS,226,1899.0,30452.0,20.000000,,True,BA118,9.0,2017-06-14 00:23:47
14,2017-06-14,150496,9,3,2017-06-14 06:22:46,2017-06-14 06:28:00,BUS,203,26190.0,26187.0,5.233333,14471.0,False,BA118,9.0,2017-06-14 00:23:47
16,2017-06-14,150496,10,1,2017-06-14 06:15:00,2017-06-14 06:34:00,BUS,226,1899.0,30452.0,19.000000,,True,BA118,9.0,2017-06-14 00:23:47
18,2017-06-14,150496,10,3,2017-06-14 06:38:05,2017-06-14 06:45:05,BUS,203,26190.0,26187.0,7.000000,14471.0,False,BA118,9.0,2017-06-14 00:23:47


In [188]:
otp_filtered_legs.dtypes

date                      datetime64[ns]
user_trip_id                       int64
itinerary_id                       int64
leg_id                             int64
otp_start_time            datetime64[ns]
otp_end_time              datetime64[ns]
mode                              object
route                             object
from_stop_id                     float64
to_stop_id                       float64
otp_duration_mins                float64
parent_station                   float64
first_vehicle_boarding              bool
o_busCode                         object
o_tripNum                        float64
o_boarding_datetime       datetime64[ns]
dtype: object

In [189]:
len(otp_filtered_legs)

1820

#### Find OTP Suggested Itineraries in BUSTE Data

In [255]:
bus_trips_filepath = base_data_folderpath + '/enhanced-buste/2017_06_14_bus_trips.csv'
bus_trips = pd.read_csv(bus_trips_filepath, dtype={'route': object},parse_dates=['gps_datetime']) \
                .sort_values(['route','busCode','tripNum','gps_datetime']) \
                .assign(route = lambda x: x['route'].astype(str).str.replace("\.0",'').str.zfill(3))

In [256]:
bus_trips.head()

Unnamed: 0,route,busCode,shapeId,tripNum,stopPointId,gps_datetime,distanceTraveledShape,stop_lat,stop_lon,parent_station
13482,10,BB302,1708,1.0,33157,2017-06-14 06:02:57,537.974,-25.410517,-49.276479,
13483,10,BB302,1708,1.0,33159,2017-06-14 06:04:49,1141.061,-25.411726,-49.270902,
13484,10,BB302,1708,1.0,33158,2017-06-14 06:05:11,1624.751,-25.415285,-49.270134,
13485,10,BB302,1708,1.0,30150,2017-06-14 06:05:24,1972.077,-25.416733,-49.267863,
13486,10,BB302,1708,1.0,28637,2017-06-14 06:05:40,2378.349,-25.414184,-49.265917,


In [257]:
len(bus_trips)

50863

In [201]:
otp_filtered_legs.head()

Unnamed: 0,date,user_trip_id,itinerary_id,leg_id,otp_start_time,otp_end_time,mode,route,from_stop_id,to_stop_id,otp_duration_mins,parent_station,first_vehicle_boarding,o_busCode,o_tripNum,o_boarding_datetime
0,2017-06-14,150496,1,1,2017-06-14 05:00:00,2017-06-14 05:20:00,BUS,226,1899.0,30452.0,20.0,,True,BA118,9.0,2017-06-14 00:23:47
2,2017-06-14,150496,1,3,2017-06-14 05:25:46,2017-06-14 05:31:00,BUS,203,26190.0,26187.0,5.233333,14471.0,False,BA118,9.0,2017-06-14 00:23:47
4,2017-06-14,150496,4,1,2017-06-14 05:25:00,2017-06-14 05:44:00,BUS,226,1899.0,30452.0,19.0,,True,BA118,9.0,2017-06-14 00:23:47
6,2017-06-14,150496,4,3,2017-06-14 05:48:46,2017-06-14 05:54:00,BUS,203,26190.0,26187.0,5.233333,14471.0,False,BA118,9.0,2017-06-14 00:23:47
8,2017-06-14,150496,7,1,2017-06-14 05:43:00,2017-06-14 06:04:00,BUS,226,1899.0,30452.0,21.0,,True,BA118,9.0,2017-06-14 00:23:47


#### Find candidate matches in BUSTE data

In [202]:
otp_legs_buste_start = match_otp_legs_start_to_buste(otp_filtered_legs,bus_trips)

In [203]:
len(otp_legs_buste_start)

544

In [204]:
otp_legs_buste_start.head()

Unnamed: 0,user_trip_id,first_vehicle_boarding,itinerary_id,leg_id,route,busCode,o_busCode,tripNum,o_tripNum,from_stop_id,otp_start_time,matched_start_time,o_boarding_datetime,otp_buste_start_timediff,stopPointId,otp_end_time
5320,7303,False,2,2,22,MR801,5062,5.0,,25683.0,2017-06-14 15:29:01,2017-06-14 14:31:13,2017-06-14 15:24:47,00:57:48,25684.0,2017-06-14 15:35:05
5329,7303,False,5,2,22,MR801,5062,6.0,,25683.0,2017-06-14 15:39:01,2017-06-14 16:29:49,2017-06-14 15:24:47,00:50:48,25684.0,2017-06-14 15:45:05
5337,7303,False,7,2,22,MR801,5062,6.0,,25683.0,2017-06-14 15:49:01,2017-06-14 16:29:49,2017-06-14 15:24:47,00:40:48,25684.0,2017-06-14 15:55:05
5345,7303,False,8,2,22,MR801,5062,6.0,,25683.0,2017-06-14 15:58:01,2017-06-14 16:29:49,2017-06-14 15:24:47,00:31:48,25684.0,2017-06-14 16:04:05
5273,7354,False,1,2,22,MR801,5062,6.0,,25683.0,2017-06-14 15:49:01,2017-06-14 16:29:49,2017-06-14 15:43:40,00:40:48,26276.0,2017-06-14 16:11:00


In [205]:
otp_legs_buste_start.otp_buste_start_timediff.describe()

count                       544
mean     0 days 00:30:03.040441
std      0 days 00:17:50.113641
min             0 days 00:00:03
25%      0 days 00:14:35.750000
50%             0 days 00:30:24
75%      0 days 00:43:46.750000
max             0 days 00:59:15
Name: otp_buste_start_timediff, dtype: object

In [206]:
otp_legs_buste = match_otp_legs_end_to_buste(otp_legs_buste_start,bus_trips)

In [207]:
len(otp_legs_buste)

487

In [208]:
otp_legs_buste.head(50)

Unnamed: 0,user_trip_id,first_vehicle_boarding,itinerary_id,leg_id,route,busCode,o_busCode,tripNum,o_tripNum,from_stop_id,otp_start_time,matched_start_time,o_boarding_datetime,otp_buste_start_timediff,to_stop_id,otp_end_time,matched_end_time,otp_buste_end_timediff,boarding_otp_match_start_timediff,leg_duration
0,7303,False,2,2,22,MR801,5062,5.0,,25683.0,2017-06-14 15:29:01,2017-06-14 14:31:13,2017-06-14 15:24:47,00:57:48,25684.0,2017-06-14 15:35:05,2017-06-14 14:37:04,00:58:01,00:53:34,00:05:51
1,7303,False,5,2,22,MR801,5062,6.0,,25683.0,2017-06-14 15:39:01,2017-06-14 16:29:49,2017-06-14 15:24:47,00:50:48,25684.0,2017-06-14 15:45:05,2017-06-14 16:36:38,00:51:33,01:05:02,00:06:49
2,7303,False,7,2,22,MR801,5062,6.0,,25683.0,2017-06-14 15:49:01,2017-06-14 16:29:49,2017-06-14 15:24:47,00:40:48,25684.0,2017-06-14 15:55:05,2017-06-14 16:36:38,00:41:33,01:05:02,00:06:49
3,7303,False,8,2,22,MR801,5062,6.0,,25683.0,2017-06-14 15:58:01,2017-06-14 16:29:49,2017-06-14 15:24:47,00:31:48,25684.0,2017-06-14 16:04:05,2017-06-14 16:36:38,00:32:33,01:05:02,00:06:49
4,7354,False,1,2,22,MR801,5062,6.0,,25683.0,2017-06-14 15:49:01,2017-06-14 16:29:49,2017-06-14 15:43:40,00:40:48,26276.0,2017-06-14 16:11:00,2017-06-14 16:52:30,00:41:30,00:46:09,00:22:41
10,7354,False,3,2,23,ML309,5062,5.0,,25683.0,2017-06-14 15:42:46,2017-06-14 14:43:41,2017-06-14 15:43:40,00:59:05,26237.0,2017-06-14 15:45:00,2017-06-14 14:49:12,00:55:48,00:59:59,00:05:31
5,7354,False,4,2,22,MR801,5062,6.0,,25683.0,2017-06-14 15:58:01,2017-06-14 16:29:49,2017-06-14 15:43:40,00:31:48,26276.0,2017-06-14 16:20:00,2017-06-14 16:52:30,00:32:30,00:46:09,00:22:41
6,7354,False,5,2,22,MR801,5062,6.0,,25683.0,2017-06-14 16:07:01,2017-06-14 16:29:49,2017-06-14 15:43:40,00:22:48,26276.0,2017-06-14 16:29:00,2017-06-14 16:52:30,00:23:30,00:46:09,00:22:41
7,7354,False,7,2,22,MR801,5062,6.0,,25683.0,2017-06-14 16:15:06,2017-06-14 16:29:49,2017-06-14 15:43:40,00:14:43,26276.0,2017-06-14 16:38:00,2017-06-14 16:52:30,00:14:30,00:46:09,00:22:41
8,7354,False,9,2,22,MR801,5062,6.0,,25683.0,2017-06-14 16:25:11,2017-06-14 16:29:49,2017-06-14 15:43:40,00:04:38,26276.0,2017-06-14 16:49:00,2017-06-14 16:52:30,00:03:30,00:46:09,00:22:41


In [209]:
otp_legs_buste.otp_buste_end_timediff.describe()

count                       487
mean     0 days 00:29:17.036960
std      0 days 00:17:48.158531
min             0 days 00:00:02
25%      0 days 00:13:16.500000
50%             0 days 00:29:17
75%             0 days 00:44:08
max             0 days 00:59:18
Name: otp_buste_end_timediff, dtype: object

In [210]:
otp_legs_buste.boarding_otp_match_start_timediff.describe()

count                       487
mean     0 days 01:17:22.661190
std      0 days 01:13:54.199650
min             0 days 00:00:09
25%             0 days 00:38:36
50%             0 days 01:09:36
75%             0 days 01:37:37
max             0 days 06:33:28
Name: boarding_otp_match_start_timediff, dtype: object

In [211]:
otp_legs_buste.leg_duration.describe()

count                       487
mean     0 days 00:19:36.410677
std      0 days 00:13:24.797551
min             0 days 00:01:44
25%             0 days 00:09:34
50%             0 days 00:17:17
75%             0 days 00:24:12
max             0 days 00:56:44
Name: leg_duration, dtype: object

#### Choosing best leg match using current and previous leg information

In [212]:
legs_matches_groups = otp_legs_buste.groupby(['user_trip_id','itinerary_id','leg_id'])

In [213]:
len(legs_matches_groups)

158

In [214]:
len(otp_legs_buste)

487

In [215]:
chosen_leg_matches = choose_leg_matches(legs_matches_groups)

In [216]:
chosen_leg_matches.head()

Unnamed: 0,user_trip_id,first_vehicle_boarding,itinerary_id,leg_id,route,busCode,o_busCode,tripNum,o_tripNum,from_stop_id,otp_start_time,matched_start_time,o_boarding_datetime,otp_buste_start_timediff,to_stop_id,otp_end_time,matched_end_time,otp_buste_end_timediff,boarding_otp_match_start_timediff,leg_duration
0,7303,False,2,2,22,MR801,5062,5.0,,25683.0,2017-06-14 15:29:01,2017-06-14 14:31:13,2017-06-14 15:24:47,00:57:48,25684.0,2017-06-14 15:35:05,2017-06-14 14:37:04,00:58:01,00:53:34,00:05:51
1,7303,False,5,2,22,MR801,5062,6.0,,25683.0,2017-06-14 15:39:01,2017-06-14 16:29:49,2017-06-14 15:24:47,00:50:48,25684.0,2017-06-14 15:45:05,2017-06-14 16:36:38,00:51:33,01:05:02,00:06:49
2,7303,False,7,2,22,MR801,5062,6.0,,25683.0,2017-06-14 15:49:01,2017-06-14 16:29:49,2017-06-14 15:24:47,00:40:48,25684.0,2017-06-14 15:55:05,2017-06-14 16:36:38,00:41:33,01:05:02,00:06:49
3,7303,False,8,2,22,MR801,5062,6.0,,25683.0,2017-06-14 15:58:01,2017-06-14 16:29:49,2017-06-14 15:24:47,00:31:48,25684.0,2017-06-14 16:04:05,2017-06-14 16:36:38,00:32:33,01:05:02,00:06:49
4,7354,False,1,2,22,MR801,5062,6.0,,25683.0,2017-06-14 15:49:01,2017-06-14 16:29:49,2017-06-14 15:43:40,00:40:48,26276.0,2017-06-14 16:11:00,2017-06-14 16:52:30,00:41:30,00:46:09,00:22:41


In [217]:
len(chosen_leg_matches)

154

#### Choosing itinerary

#### Adding stops location data

In [218]:
stops_locations = stops_df[['stop_id','stop_lat','stop_lon']]

In [219]:
user_trips_ids = boarding_suggestions_matches[['cardNum','user_trip_id']] \
                    .drop_duplicates() \
                    .sort_values(['cardNum','user_trip_id'])


In [220]:
user_trips_ids.dtypes

cardNum         float64
user_trip_id      int64
dtype: object

In [221]:
stops_locations.dtypes

stop_id       int64
stop_lat    float64
stop_lon    float64
dtype: object

In [222]:
chosen_leg_matches.dtypes

user_trip_id                                   int64
first_vehicle_boarding                        object
itinerary_id                                   int64
leg_id                                         int64
route                                         object
busCode                                       object
o_busCode                                     object
tripNum                                      float64
o_tripNum                                    float64
from_stop_id                                 float64
otp_start_time                        datetime64[ns]
matched_start_time                    datetime64[ns]
o_boarding_datetime                   datetime64[ns]
otp_buste_start_timediff             timedelta64[ns]
to_stop_id                                   float64
otp_end_time                          datetime64[ns]
matched_end_time                      datetime64[ns]
otp_buste_end_timediff               timedelta64[ns]
boarding_otp_match_start_timediff    timedelta

In [223]:
chosen_leg_matches_data = add_stops_data_to_leg_matches(chosen_leg_matches,stops_locations)

In [224]:
chosen_leg_matches_data.head()

Unnamed: 0,cardNum,user_trip_id,first_vehicle_boarding,itinerary_id,leg_id,route,busCode,o_busCode,tripNum,o_tripNum,...,to_stop_id,otp_end_time,matched_end_time,otp_buste_end_timediff,boarding_otp_match_start_timediff,leg_duration,from_stop_lat,from_stop_lon,to_stop_lat,to_stop_lon
0,2349164.0,7303,False,2,2,22,MR801,5062,5.0,,...,25684.0,2017-06-14 15:35:05,2017-06-14 14:37:04,00:58:01,00:53:34,00:05:51,-25.410622,-49.248224,-25.425924,-49.242351
1,2349164.0,7303,False,5,2,22,MR801,5062,6.0,,...,25684.0,2017-06-14 15:45:05,2017-06-14 16:36:38,00:51:33,01:05:02,00:06:49,-25.410622,-49.248224,-25.425924,-49.242351
2,2349164.0,7303,False,7,2,22,MR801,5062,6.0,,...,25684.0,2017-06-14 15:55:05,2017-06-14 16:36:38,00:41:33,01:05:02,00:06:49,-25.410622,-49.248224,-25.425924,-49.242351
3,2349164.0,7303,False,8,2,22,MR801,5062,6.0,,...,25684.0,2017-06-14 16:04:05,2017-06-14 16:36:38,00:32:33,01:05:02,00:06:49,-25.410622,-49.248224,-25.425924,-49.242351
4,2773624.0,7354,False,1,2,22,MR801,5062,6.0,,...,26276.0,2017-06-14 16:11:00,2017-06-14 16:52:30,00:41:30,00:46:09,00:22:41,-25.410622,-49.248224,-25.439824,-49.222281


In [225]:
len(chosen_leg_matches_data)

154

In [226]:
candidate_itineraries = build_candidate_itineraries_df(chosen_leg_matches_data)

In [227]:
candidate_itineraries.head()

Unnamed: 0,cardNum,user_trip_id,itinerary_id,match_from_stop_lon,match_matched_start_time,match_num_transfers,match_from_stop_lat,match_from_stop_id,match_to_stop_lat,match_vehicle_boarding,match_matched_end_time,match_to_stop_lon,match_to_stop_id
0,1041080.0,58106,3,-49.247395,2017-06-14 15:09:40,1,-25.48145,26267.0,-25.491786,False,2017-06-14 15:27:11,-49.293089,26246.0
1,1656668.0,113801,2,-49.307134,2017-06-14 18:53:12,1,-25.435959,28138.0,-25.419008,False,2017-06-14 19:09:40,-49.349336,48230.0
2,1656668.0,113801,4,-49.346051,2017-06-14 20:03:04,1,-25.441699,30991.0,-25.416593,False,2017-06-14 20:12:59,-49.349767,34169.0
3,1656668.0,113801,8,-49.307134,2017-06-14 20:10:49,1,-25.435959,28138.0,-25.419008,False,2017-06-14 20:30:22,-49.349336,48230.0
4,1832132.0,132352,2,-49.246963,2017-06-14 06:29:07,1,-25.481453,27746.0,-25.428618,False,2017-06-14 07:03:53,-49.226474,30194.0


In [228]:
boarding_suggestions_matches.dtypes

cardNum                            float64
o_boarding_id                        int64
o_boarding_datetime         datetime64[ns]
o_route                             object
o_busCode                           object
o_tripNum                          float64
o_stopPointId                        int64
o_gps_datetime              datetime64[ns]
o_stop_lat                         float64
o_stop_lon                         float64
next_o_boarding_id                   int64
next_o_boarding_datetime    datetime64[ns]
next_o_route                        object
next_o_busCode                      object
next_o_tripNum                     float64
next_o_stopPointId                   int64
next_o_gps_datetime         datetime64[ns]
next_o_stop_lat                    float64
next_o_stop_lon                    float64
boardings_timediff                  object
dist_between_origins               float64
user_trip_id                         int64
itinerary_id                         int64
date       

In [229]:
candidate_itineraries_filtered = get_candidate_itineraries_summary(candidate_itineraries,boarding_suggestions_matches)

In [230]:
candidate_itineraries_filtered.head()

Unnamed: 0,cardNum,user_trip_id,itinerary_id,match_from_stop_id,match_matched_start_time,o_boarding_datetime,match_from_stop_lat,match_from_stop_lon,o_stop_lat,o_stop_lon,...,match_to_stop_lat,match_to_stop_lon,next_o_stop_lat,next_o_stop_lon,match_num_transfers,match_vehicle_boarding,next_origin_dist,origin_dist,start_diff,trip_duration
39,2349164.0,7303,2,25683.0,2017-06-14 14:31:13,2017-06-14 15:24:47,-25.410622,-49.248224,-25.41058,-49.248257,...,-25.425924,-49.242351,-25.438187,-49.238717,1,False,1.40684,0.00575,00:53:34,00:05:51
40,2349164.0,7303,5,25683.0,2017-06-14 16:29:49,2017-06-14 15:24:47,-25.410622,-49.248224,-25.41058,-49.248257,...,-25.425924,-49.242351,-25.438187,-49.238717,1,False,1.40684,0.00575,01:05:02,00:06:49
41,2349164.0,7303,7,25683.0,2017-06-14 16:29:49,2017-06-14 15:24:47,-25.410622,-49.248224,-25.41058,-49.248257,...,-25.425924,-49.242351,-25.438187,-49.238717,1,False,1.40684,0.00575,01:05:02,00:06:49
42,2349164.0,7303,8,25683.0,2017-06-14 16:29:49,2017-06-14 15:24:47,-25.410622,-49.248224,-25.41058,-49.248257,...,-25.425924,-49.242351,-25.438187,-49.238717,1,False,1.40684,0.00575,01:05:02,00:06:49
56,2397992.0,166665,2,30438.0,2017-06-14 06:44:57,2017-06-14 06:38:19,-25.511471,-49.324923,-25.511654,-49.324539,...,-25.513203,-49.295115,-25.503476,-49.295922,1,False,1.08061,0.0436,00:06:38,00:11:34


In [231]:
candidate_itineraries_filtered.origin_dist.describe(percentiles=[.25,.5,.75,.9,.95,.99])

count    23.000000
mean      0.032947
std       0.027622
min       0.000000
25%       0.005750
50%       0.038520
75%       0.055920
90%       0.070112
95%       0.070580
99%       0.070580
max       0.070580
Name: origin_dist, dtype: float64

In [232]:
candidate_itineraries_filtered.next_origin_dist.describe(percentiles=[.25,.5,.75,.9,.95,.99])

count    23.000000
mean      0.495832
std       0.584097
min       0.000000
25%       0.015680
50%       0.253130
75%       1.106645
90%       1.406840
95%       1.406840
99%       1.406840
max       1.406840
Name: next_origin_dist, dtype: float64

In [233]:
len(candidate_itineraries_filtered)

23

In [234]:
candidate_itineraries_filtered.match_vehicle_boarding.describe()

count        23
unique        2
top       False
freq         18
Name: match_vehicle_boarding, dtype: object

In [235]:
candidate_itineraries_filtered.drop_duplicates('user_trip_id').match_vehicle_boarding.value_counts()

False    6
True     1
Name: match_vehicle_boarding, dtype: int64

In [236]:
otp_buste_itineraries_penalty = get_candidate_itineraries_penalty_score(candidate_itineraries_filtered)

In [237]:
otp_buste_itineraries_penalty.head()

Unnamed: 0,cardNum,user_trip_id,itinerary_id,match_num_transfers,match_vehicle_boarding,next_origin_dist,origin_dist,start_diff,trip_duration,penalty
39,2349164.0,7303,2,1,False,1.40684,0.00575,00:53:34,00:05:51,6789.0
40,2349164.0,7303,5,1,False,1.40684,0.00575,01:05:02,00:06:49,8223.0
41,2349164.0,7303,7,1,False,1.40684,0.00575,01:05:02,00:06:49,8223.0
42,2349164.0,7303,8,1,False,1.40684,0.00575,01:05:02,00:06:49,8223.0
119,3472598.0,70244,2,1,False,1.13268,0.06824,00:03:12,00:23:51,1825.0


In [238]:
chosen_itineraries = otp_buste_itineraries_penalty.groupby(['user_trip_id']).first().reset_index()

In [239]:
len(chosen_itineraries)

7

In [240]:
chosen_itineraries.head()

Unnamed: 0,user_trip_id,cardNum,itinerary_id,match_num_transfers,match_vehicle_boarding,next_origin_dist,origin_dist,start_diff,trip_duration,penalty
0,7303,2349164.0,2,1,False,1.40684,0.00575,00:53:34,00:05:51,6789.0
1,70244,3472598.0,2,1,False,1.13268,0.06824,00:03:12,00:23:51,1825.0
2,96768,3191622.0,7,2,False,0.0,0.03852,01:27:43,01:04:50,14436.0
3,97233,3678065.0,1,1,False,0.06986,0.03852,00:07:28,00:23:48,2334.0
4,123405,2801134.0,2,1,False,0.25313,0.07058,00:36:11,00:54:54,7646.0


In [241]:
chosen_itineraries = chosen_itineraries[(np.logical_not(chosen_itineraries['match_vehicle_boarding'])) |
                                        ((chosen_itineraries['match_vehicle_boarding'])
                                        & (chosen_itineraries['start_diff'] < pd.Timedelta('20 min')))]
num_chosen_itineraries = len(chosen_itineraries)

In [242]:
len(chosen_itineraries)

7

In [243]:
chosen_itineraries.describe(percentiles=[.25,.5,.75,.9,.95,.99])

Unnamed: 0,user_trip_id,cardNum,itinerary_id,match_num_transfers,next_origin_dist,origin_dist,start_diff,trip_duration,penalty
count,7.0,7.0,7.0,7.0,7.0,7.0,7,7,7.0
mean,111633.857143,2970507.0,2.428571,1.142857,0.565543,0.037887,0 days 00:27:56.285714,0 days 00:30:05.428571,5169.428571
std,68187.777552,508054.1,2.070197,0.377964,0.61376,0.027361,0 days 00:32:57.833217,0 days 00:21:47.657684,4824.58129
min,7303.0,2349164.0,1.0,1.0,0.0,0.0,0 days 00:00:48,0 days 00:05:51,1500.0
25%,83506.0,2599563.0,1.5,1.0,0.04277,0.022135,0 days 00:04:55,0 days 00:17:41,1740.5
50%,97233.0,2902977.0,2.0,1.0,0.25313,0.03852,0 days 00:07:28,0 days 00:23:51,2334.0
75%,145035.0,3332110.0,2.0,1.0,1.106645,0.05592,0 days 00:44:52.500000,0 days 00:40:22,7217.5
90%,187926.6,3554785.0,4.0,1.4,1.242344,0.069176,0 days 01:07:13.600000,0 days 00:58:52.400000,10362.0
95%,203872.8,3616425.0,5.5,1.7,1.324592,0.069878,0 days 01:17:28.299999,0 days 01:01:51.199999,12399.0
99%,216629.76,3665737.0,6.7,1.94,1.39039,0.07044,0 days 01:25:40.059999,0 days 01:04:14.239999,14028.6


In [244]:
chosen_itineraries.head()

Unnamed: 0,user_trip_id,cardNum,itinerary_id,match_num_transfers,match_vehicle_boarding,next_origin_dist,origin_dist,start_diff,trip_duration,penalty
0,7303,2349164.0,2,1,False,1.40684,0.00575,00:53:34,00:05:51,6789.0
1,70244,3472598.0,2,1,False,1.13268,0.06824,00:03:12,00:23:51,1825.0
2,96768,3191622.0,7,2,False,0.0,0.03852,01:27:43,01:04:50,14436.0
3,97233,3678065.0,1,1,False,0.06986,0.03852,00:07:28,00:23:48,2334.0
4,123405,2801134.0,2,1,False,0.25313,0.07058,00:36:11,00:54:54,7646.0


In [245]:
chosen_itineraries.dtypes

user_trip_id                        int64
cardNum                           float64
itinerary_id                        int64
match_num_transfers                 int64
match_vehicle_boarding               bool
next_origin_dist                  float64
origin_dist                       float64
start_diff                timedelta64[ns]
trip_duration             timedelta64[ns]
penalty                           float64
dtype: object

In [246]:
print "Final number of matches (after processing): ", len(chosen_itineraries) , "(", 100*(num_chosen_itineraries/float(num_selected_trips)), "%)"

Final number of matches (after processing):  7 ( 3.5 %)


In [247]:
chosen_itineraries.dtypes

user_trip_id                        int64
cardNum                           float64
itinerary_id                        int64
match_num_transfers                 int64
match_vehicle_boarding               bool
next_origin_dist                  float64
origin_dist                       float64
start_diff                timedelta64[ns]
trip_duration             timedelta64[ns]
penalty                           float64
dtype: object

In [248]:
chosen_leg_matches_data.dtypes

cardNum                                      float64
user_trip_id                                   int64
first_vehicle_boarding                        object
itinerary_id                                   int64
leg_id                                         int64
route                                         object
busCode                                       object
o_busCode                                     object
tripNum                                      float64
o_tripNum                                    float64
from_stop_id                                 float64
otp_start_time                        datetime64[ns]
matched_start_time                    datetime64[ns]
o_boarding_datetime                   datetime64[ns]
otp_buste_start_timediff             timedelta64[ns]
to_stop_id                                   float64
otp_end_time                          datetime64[ns]
matched_end_time                      datetime64[ns]
otp_buste_end_timediff               timedelta

In [249]:
od_trips = chosen_itineraries.merge(chosen_leg_matches_data, on=['cardNum','user_trip_id','itinerary_id'], how='inner') \
                                .filter(['cardNum','user_trip_id','itinerary_id','leg_id','route','busCode','tripNum',
                                        'from_stop_id','matched_start_time','from_stop_lat','from_stop_lon','to_stop_id',
                                        'matched_end_time','to_stop_lat','to_stop_lon','leg_duration']) \
                                .rename(index=str, columns={'matched_start_time':'start_time','matched_end_time':'end_time'})

In [250]:
od_trips.head()

Unnamed: 0,cardNum,user_trip_id,itinerary_id,leg_id,route,busCode,tripNum,from_stop_id,start_time,from_stop_lat,from_stop_lon,to_stop_id,end_time,to_stop_lat,to_stop_lon,leg_duration
0,2349164.0,7303,2,2,22,MR801,5.0,25683.0,2017-06-14 14:31:13,-25.410622,-49.248224,25684.0,2017-06-14 14:37:04,-25.425924,-49.242351,00:05:51
1,3472598.0,70244,2,2,20,BB612,1.0,26228.0,2017-06-14 10:50:46,-25.492283,-49.293083,28132.0,2017-06-14 11:14:37,-25.435878,-49.306888,00:23:51
2,3191622.0,96768,7,2,40,MB605,1.0,31633.0,2017-06-14 06:30:22,-25.477397,-49.32666,33119.0,2017-06-14 07:17:58,-25.400519,-49.330766,00:47:36
3,3191622.0,96768,7,4,901,MC076,3.0,33116.0,2017-06-14 07:24:30,-25.401112,-49.329953,33665.0,2017-06-14 07:35:12,-25.413641,-49.313042,00:10:42
4,3678065.0,97233,1,2,40,MB605,4.0,31633.0,2017-06-14 14:14:05,-25.477397,-49.32666,30991.0,2017-06-14 14:37:53,-25.441699,-49.346051,00:23:48


In [251]:
od_trips.dtypes

cardNum                  float64
user_trip_id               int64
itinerary_id               int64
leg_id                     int64
route                     object
busCode                   object
tripNum                  float64
from_stop_id             float64
start_time        datetime64[ns]
from_stop_lat            float64
from_stop_lon            float64
to_stop_id               float64
end_time          datetime64[ns]
to_stop_lat              float64
to_stop_lon              float64
leg_duration     timedelta64[ns]
dtype: object

In [252]:
od_trips.head(10).filter(['cardNum','user_trip_id','start_time','end_time','leg_duration'])

Unnamed: 0,cardNum,user_trip_id,start_time,end_time,leg_duration
0,2349164.0,7303,2017-06-14 14:31:13,2017-06-14 14:37:04,00:05:51
1,3472598.0,70244,2017-06-14 10:50:46,2017-06-14 11:14:37,00:23:51
2,3191622.0,96768,2017-06-14 06:30:22,2017-06-14 07:17:58,00:47:36
3,3191622.0,96768,2017-06-14 07:24:30,2017-06-14 07:35:12,00:10:42
4,3678065.0,97233,2017-06-14 14:14:05,2017-06-14 14:37:53,00:23:48
5,2801134.0,123405,2017-06-14 05:45:41,2017-06-14 06:40:35,00:54:54
6,2397992.0,166665,2017-06-14 06:44:57,2017-06-14 06:56:31,00:11:34
7,2902977.0,219819,2017-06-14 05:10:50,2017-06-14 05:36:40,00:25:50


In [253]:
od_trips.tail(10).filter(['cardNum','user_trip_id','start_time','end_time','leg_duration'])

Unnamed: 0,cardNum,user_trip_id,start_time,end_time,leg_duration
0,2349164.0,7303,2017-06-14 14:31:13,2017-06-14 14:37:04,00:05:51
1,3472598.0,70244,2017-06-14 10:50:46,2017-06-14 11:14:37,00:23:51
2,3191622.0,96768,2017-06-14 06:30:22,2017-06-14 07:17:58,00:47:36
3,3191622.0,96768,2017-06-14 07:24:30,2017-06-14 07:35:12,00:10:42
4,3678065.0,97233,2017-06-14 14:14:05,2017-06-14 14:37:53,00:23:48
5,2801134.0,123405,2017-06-14 05:45:41,2017-06-14 06:40:35,00:54:54
6,2397992.0,166665,2017-06-14 06:44:57,2017-06-14 06:56:31,00:11:34
7,2902977.0,219819,2017-06-14 05:10:50,2017-06-14 05:36:40,00:25:50


In [254]:
len(od_trips)

8