In [332]:
#Libraries

#Python Libs
import sys
import os
import glob
from datetime import datetime
import time
from geopy import distance

#Data Analysis Libs
import pandas as pd
import numpy as np

#### Functions

In [334]:
def printUsage():
    """Print the command-line usage string for this script to stdout.

    The program name shown in the message is taken from ``sys.argv[0]``.
    """
    # Single-argument call form: behaves the same under the Python 2 print
    # statement and the Python 3 print function.
    print("Usage: " + sys.argv[0] + " <otp-suggestions-filepath> <enhanced-buste-folderpath> <gtfs-base-folderpath> <output-folderpath>")

def select_input_files(enh_buste_base_path,init_date,fin_date,suffix):
    """List the dated input files of a folder that fall inside a date range.

    Every entry directly under ``enh_buste_base_path`` whose base name matches
    ``<YYYY>_<MM>_<DD><suffix>.csv`` is parsed into a date; entries whose name
    does not match that pattern are skipped.

    Parameters:
        enh_buste_base_path: folder to scan (non-recursive).
        init_date: first accepted date (inclusive).
        fin_date: last accepted date (inclusive).
        suffix: text expected between the date and the ``.csv`` extension.

    Returns:
        Sorted list of ``(file_path, file_date)`` tuples.
    """
    file_format = '%Y_%m_%d' + suffix + '.csv'
    selected_files = []

    for file_ in glob.glob(os.path.join(enh_buste_base_path,"*")):
        try:
            # basename instead of split('/') so the platform separator is honoured.
            file_date = pd.to_datetime(os.path.basename(file_),format=file_format)
        except ValueError:
            # Name does not follow the expected pattern: not an input file.
            # Only the parse failure is swallowed; other errors still surface.
            continue

        if (file_date >= init_date) and (file_date <= fin_date):
            selected_files.append((file_,file_date))

    return sorted(selected_files)

def dist(p1_lat, p1_lon, p2_lat, p2_lon):
    """Return the geodesic distance between two (lat, lon) points.

    The value is the ``.km`` attribute of geopy's ``distance.geodesic`` result,
    rounded to 5 decimal places.
    """
    origin = (p1_lat, p1_lon)
    destination = (p2_lat, p2_lon)
    kilometres = distance.geodesic(origin, destination).km
    return np.around(kilometres, decimals=5)

def get_router_id(query_date):
    """Return the OTP router id to use for a given date.

    Dates up to and including 2017-06-30 map to ``'ctba-2017-1'``; any later
    date maps to ``'ctba-2017-2'``.

    Parameters:
        query_date: a value comparable with a pandas Timestamp.
    """
    INTERMEDIATE_OTP_DATE = pd.to_datetime("2017-06-30", format="%Y-%m-%d")

    if query_date <= INTERMEDIATE_OTP_DATE:
        return 'ctba-2017-1'
    return 'ctba-2017-2'

def choose_leg_matches(leg_matches_groups):
    """Pick one candidate match per leg group, keeping legs in chronological order.

    For each group, candidates that start at or before the end of the
    previously chosen leg of the same itinerary are discarded, and among the
    remaining ones the row with the smallest
    ``boarding_otp_match_start_timediff`` is kept. Groups with no remaining
    candidate contribute no row.

    Reads two module-level frames: ``otp_suggestions`` (its ``date`` value at
    label 0 is the lower time bound used at the start of every itinerary) and
    ``otp_legs_buste`` (its columns define the output columns).

    Parameters:
        leg_matches_groups: iterable of ``(name, group)`` pairs, where
            ``name[0:2]`` identifies the itinerary and ``group`` is a DataFrame
            of candidates with ``matched_start_time``, ``matched_end_time`` and
            ``boarding_otp_match_start_timediff`` columns.
            # presumably a groupby on (user_trip_id, itinerary_id, leg_id) - confirm against caller

    Returns:
        DataFrame with one row per group that had a usable candidate,
        restricted to the columns of ``otp_legs_buste``.
    """
    output_columns = otp_legs_buste.columns.values
    itinerary_start_bound = otp_suggestions['date'][0]

    chosen_rows = []
    prev_group_id = ()
    prev_leg_end_time = itinerary_start_bound

    # Iterate the argument (the original looped over a differently spelled,
    # undefined name and only worked when a global of that name existed).
    for name, group in leg_matches_groups:

        # A new itinerary starts: forget the end time of the previous one.
        if (prev_group_id != name[0:2]):
            prev_leg_end_time = itinerary_start_bound

        filtered_group = group[group['matched_start_time'] > prev_leg_end_time]
        if (len(filtered_group) == 0):
            continue

        chosen_leg_match = filtered_group.sort_values('boarding_otp_match_start_timediff').iloc[0]
        chosen_rows.append(chosen_leg_match)

        #Update variables
        prev_group_id = name[0:2]
        prev_leg_end_time = chosen_leg_match['matched_end_time']

    if not chosen_rows:
        return pd.DataFrame(columns = output_columns)

    # Build the frame once instead of appending row by row (quadratic, and
    # DataFrame.append no longer exists in recent pandas releases).
    return pd.DataFrame(chosen_rows).filter(output_columns)

def prepare_otp_data(otp_data):
    """Return a cleaned copy of the raw OTP suggestions.

    * ``otp_start_time`` and ``otp_end_time`` are shifted back by 10800 s (3 h).
      # presumably a UTC -> local time correction - confirm
    * ``route`` is converted to string; for rows whose ``mode`` is ``'BUS'`` a
      trailing ``.0`` (left by float parsing) is removed and the code is
      zero-padded to 3 characters.

    The input frame is not modified, so calling this again on the same raw
    data gives the same result instead of shifting the times a second time.

    Parameters:
        otp_data: DataFrame with ``otp_start_time``, ``otp_end_time``,
            ``route`` and ``mode`` columns.

    Returns:
        A new DataFrame with the adjusted columns.
    """
    prepared = otp_data.copy()
    time_shift = pd.Timedelta('10800 s')

    prepared['otp_start_time'] = prepared['otp_start_time'] - time_shift
    prepared['otp_end_time'] = prepared['otp_end_time'] - time_shift

    route = prepared['route'].astype(str)
    # Strip only a trailing '.0' with plain string operations: this does not
    # depend on the default of str.replace's `regex` argument, and cannot
    # remove a '.0' from the middle of a code.
    bus_route = route.where(~route.str.endswith('.0'), route.str[:-2]).str.zfill(3)
    prepared['route'] = np.where(prepared['mode'] == 'BUS', bus_route, route)

    return prepared

def match_vehicle_boardings(selected_trips,itineraries_start):
    """Match boardings made on a vehicle to the first leg of OTP itineraries.

    A trip counts as a vehicle boarding when its ``o_busCode`` is not made only
    of digits. Such trips are inner-joined with ``itineraries_start`` on
    boarding id / route / stop.

    Parameters:
        selected_trips: trips with ``cardNum``, ``o_boarding_id``, ``o_route``,
            ``o_busCode`` and ``o_stopPointId`` columns.
        itineraries_start: itinerary first legs with ``user_trip_id``,
            ``route`` and ``from_stop_id`` columns.

    Returns:
        Tuple ``(matches, num_matched, match_perc)``: the joined rows, the
        number of distinct ``(cardNum, o_boarding_id)`` pairs matched, and that
        number as a percentage of all vehicle boardings (``0.0`` when there are
        no vehicle boardings).
    """
    is_vehicle_boarding = np.logical_not(selected_trips['o_busCode'].str.isdigit())
    vehicle_boarding_origins = selected_trips[is_vehicle_boarding]
    matched_vehicle_boardings = vehicle_boarding_origins.merge(itineraries_start,
                                                               left_on=['o_boarding_id','o_route','o_stopPointId'],
                                                               right_on=['user_trip_id','route','from_stop_id'],
                                                               how='inner')
    num_matched_vehicle_boardings = len(matched_vehicle_boardings.drop_duplicates(subset=['cardNum','o_boarding_id']))

    # Guard the ratio: with no vehicle boardings the division would raise.
    num_vehicle_boardings = len(vehicle_boarding_origins)
    if num_vehicle_boardings > 0:
        match_perc = 100*(num_matched_vehicle_boardings/float(num_vehicle_boardings))
    else:
        match_perc = 0.0
    return (matched_vehicle_boardings,num_matched_vehicle_boardings,match_perc)

def compatible_dates(otp_data,ticketing_data):
    """Check that the OTP data and the ticketing data refer to the same day.

    The first row of each frame is used: ``otp_data['date']`` as is, and
    ``ticketing_data['o_boarding_datetime']`` truncated to midnight.

    Returns:
        Tuple ``(same_day, otp_date, ticketing_date)``.
    """
    otp_date = otp_data['date'].iloc[0]
    # Positional access, like the OTP side: a label lookup of 0 fails on frames
    # whose index does not contain 0 (e.g. a filtered subset of trips).
    ticketing_date = ticketing_data['o_boarding_datetime'].iloc[0].normalize()

    return (otp_date == ticketing_date,otp_date,ticketing_date)

def match_terminal_boardings(selected_trips,itineraries_start):
    """Match terminal boardings (other than route '021') to OTP itinerary starts.

    A trip counts as a terminal boarding when its ``o_busCode`` is made only of
    digits. Such trips, excluding those with ``o_route == '021'``, are
    inner-joined with ``itineraries_start`` on boarding id and on the boarding
    stop being the itinerary's ``parent_station``.

    Returns:
        Tuple ``(matches, num_matched, matched_perc)``: the joined rows, the
        number of distinct ``(cardNum, o_boarding_id)`` pairs matched, and that
        number as a percentage of the terminal boardings considered (``0.0``
        when there are none).
    """
    is_terminal_boarding = selected_trips['o_busCode'].str.isdigit()
    terminal_boarding_origins = selected_trips[is_terminal_boarding & (selected_trips['o_route'] != '021')]
    matched_terminal_boardings = terminal_boarding_origins.merge(itineraries_start,
                                                                left_on=['o_boarding_id','o_stopPointId'],
                                                                right_on=['user_trip_id','parent_station'], how='inner')
    num_matched_terminal_boardings = len(matched_terminal_boardings.drop_duplicates(subset=['cardNum','o_boarding_id']))

    # Guard the ratio: with no terminal boardings the division would raise.
    num_terminal_boardings = len(terminal_boarding_origins)
    if num_terminal_boardings > 0:
        matched_perc = 100*(num_matched_terminal_boardings/float(num_terminal_boardings))
    else:
        matched_perc = 0.0
    return (matched_terminal_boardings,num_matched_terminal_boardings,matched_perc)

def match_terminal_021_boardings(selected_trips,itineraries_start):
    """Match terminal boardings of route '021' to OTP itinerary starts.

    Considers trips whose ``o_busCode`` is made only of digits and whose
    ``o_route`` is ``'021'``, and inner-joins them with ``itineraries_start``
    on boarding id, route, and the boarding stop being the itinerary's
    ``parent_station``.

    Returns:
        Tuple ``(num_origins, matches, num_matched, match_perc)``: the number
        of route-021 terminal boardings, the joined rows, the number of
        distinct ``(cardNum, o_boarding_id)`` pairs matched, and that number as
        a percentage of ``num_origins`` (``0.0`` when ``num_origins`` is 0, so
        callers can test ``num_origins`` instead of getting a division error).
    """
    is_terminal_boarding = selected_trips['o_busCode'].str.isdigit()
    terminal_021_origins = selected_trips[is_terminal_boarding & (selected_trips['o_route'] == '021')]
    matched_021_terminal_boardings = terminal_021_origins.merge(itineraries_start,
                                                                left_on=['o_boarding_id','o_route','o_stopPointId'],
                                                                right_on=['user_trip_id','route','parent_station'],
                                                                how='inner')
    num_matched_021_terminal_boardings = len(matched_021_terminal_boardings.drop_duplicates(subset=['cardNum','o_boarding_id']))

    num_terminal_021_origins = len(terminal_021_origins)
    if num_terminal_021_origins > 0:
        terminal_021_match_perc = 100*(num_matched_021_terminal_boardings/float(num_terminal_021_origins))
    else:
        terminal_021_match_perc = 0.0
    return (num_terminal_021_origins,matched_021_terminal_boardings,num_matched_021_terminal_boardings,terminal_021_match_perc)

def get_otp_matched_legs(boarding_suggestions_matches,otp_suggestions):
    """Expand matched boardings into the BUS legs of their matched itineraries.

    The matches are reduced to the columns of the module-level
    ``trips_origins`` frame plus ``itinerary_id``, joined with
    ``otp_suggestions`` on (boarding id, itinerary id) and restricted to rows
    whose ``mode`` is ``'BUS'``.

    A boolean ``first_vehicle_boarding`` column is added: True when the
    boarding's ``o_busCode`` is not all digits and the boarding route equals
    the leg route.

    Returns:
        DataFrame with the columns of ``otp_suggestions`` plus
        ``first_vehicle_boarding``, ``o_busCode``, ``o_tripNum`` and
        ``o_boarding_datetime``.
    """
    origin_columns = np.append(trips_origins.columns.values,['itinerary_id'])
    output_columns = np.append(otp_suggestions.columns.values,
                               ['first_vehicle_boarding','o_busCode','o_tripNum','o_boarding_datetime'])

    def flag_vehicle_boarding(legs):
        # Boarded on a vehicle (non-numeric bus code) of the same route as the leg.
        on_vehicle = np.logical_not(legs['o_busCode'].str.isdigit())
        same_route = legs['o_route'] == legs['route']
        return np.where(on_vehicle & same_route, True, False)

    matched_origins = boarding_suggestions_matches.filter(origin_columns)
    itinerary_legs = matched_origins.merge(otp_suggestions,
                                           left_on=['o_boarding_id','itinerary_id'],
                                           right_on=['user_trip_id','itinerary_id'],
                                           how='inner')
    bus_legs = itinerary_legs.query("mode == 'BUS'")

    otp_legs_suggestions_matches = bus_legs \
                                    .assign(first_vehicle_boarding = flag_vehicle_boarding) \
                                    .filter(output_columns)
    return otp_legs_suggestions_matches

def match_otp_legs_start_to_buste(otp_filtered_legs,bus_trips):
    """Find, for every OTP leg, the observed bus passages at its boarding stop.

    Legs are inner-joined with ``bus_trips`` on route and boarding stop
    (``from_stop_id`` = ``stopPointId``). The absolute gap between the observed
    ``gps_datetime`` and the leg's ``otp_start_time`` is stored in
    ``otp_buste_start_timediff`` and only candidates with a gap under 60
    minutes are returned.

    In the result ``gps_datetime`` is renamed to ``matched_start_time`` and the
    leg's ``to_stop_id`` is renamed to ``stopPointId``; index labels are
    converted to strings.

    Returns:
        DataFrame of candidates sorted by user trip, itinerary, leg and gap.
    """
    kept_columns = ['user_trip_id','first_vehicle_boarding','itinerary_id','leg_id','route','busCode',
                    'o_busCode','tripNum','o_tripNum','from_stop_id','otp_start_time','gps_datetime',
                    'o_boarding_datetime','otp_buste_start_timediff','to_stop_id','otp_end_time']
    max_start_gap = pd.Timedelta('60min')

    candidates = otp_filtered_legs.merge(bus_trips,
                                         left_on=['route','from_stop_id'],
                                         right_on=['route','stopPointId'],
                                         how='inner')
    start_gap = np.absolute(candidates['gps_datetime'] - candidates['otp_start_time'])
    candidates = candidates.assign(otp_buste_start_timediff = start_gap)

    otp_legs_buste_start = candidates.filter(kept_columns)
    otp_legs_buste_start = otp_legs_buste_start.sort_values(['user_trip_id','itinerary_id','leg_id','otp_buste_start_timediff'])
    otp_legs_buste_start = otp_legs_buste_start.rename(index=str,
                                                       columns={'to_stop_id':'stopPointId', 'gps_datetime':'matched_start_time'})

    within_gap = otp_legs_buste_start['otp_buste_start_timediff'] < max_start_gap
    return otp_legs_buste_start[within_gap]

def match_otp_legs_end_to_buste(otp_filtered_legs,bus_trips):
    """Complete start-matched legs with the observed arrival at the alighting stop.

    Each start match is inner-joined with ``bus_trips`` on route, bus, trip and
    ``stopPointId``, i.e. the same vehicle trip is looked up at the stop stored
    in ``stopPointId``. The observed time becomes ``matched_end_time``. Only
    rows ending after they start, and whose end is within 60 minutes of
    ``otp_end_time``, are kept.

    Added columns: ``otp_buste_end_timediff``, ``leg_duration`` and
    ``boarding_otp_match_start_timediff`` (absolute gap between
    ``o_boarding_datetime`` and ``matched_start_time``).

    Parameters:
        otp_filtered_legs: the start-matched legs (a frame with ``route``,
            ``busCode``, ``tripNum``, ``stopPointId`` and
            ``matched_start_time`` columns). For backward compatibility, when
            the argument lacks those columns the module-level
            ``otp_legs_buste_start`` frame is used instead, which is what the
            previous implementation always did.
        bus_trips: observed bus passages with ``route``, ``busCode``,
            ``tripNum``, ``stopPointId`` and ``gps_datetime`` columns.

    Returns:
        DataFrame of candidates sorted by user trip, itinerary, leg and end gap.
    """
    merge_keys = ['route','busCode','tripNum','stopPointId']

    # The argument used to be ignored in favour of a global. Prefer it whenever
    # it has the shape this function needs.
    if set(merge_keys + ['matched_start_time']).issubset(otp_filtered_legs.columns):
        start_matches = otp_filtered_legs
    else:
        start_matches = otp_legs_buste_start

    otp_legs_buste = start_matches \
                .merge(bus_trips,
                         on=merge_keys,
                         how='inner') \
                .assign(otp_buste_end_timediff =
                            lambda x: np.absolute(x['gps_datetime'] - x['otp_end_time'])) \
                .rename(index=str, columns={'stopPointId':'to_stop_id', 'gps_datetime':'matched_end_time'}) \
                .assign(leg_duration = lambda x: x['matched_end_time'] - x['matched_start_time'],
                        boarding_otp_match_start_timediff =
                            lambda x: np.absolute(x['o_boarding_datetime'] - x['matched_start_time'])) \
                .query('matched_end_time > matched_start_time') \
                .filter(['user_trip_id','first_vehicle_boarding','itinerary_id','leg_id','route','busCode',
                         'o_busCode','tripNum','o_tripNum','from_stop_id','otp_start_time',
                         'matched_start_time','o_boarding_datetime','otp_buste_start_timediff',
                         'to_stop_id','otp_end_time','matched_end_time','otp_buste_end_timediff',
                         'boarding_otp_match_start_timediff', 'leg_duration']) \
                .sort_values(['user_trip_id','itinerary_id','leg_id','otp_buste_end_timediff'])

    otp_legs_buste = otp_legs_buste[otp_legs_buste['otp_buste_end_timediff'] < pd.Timedelta('60min')]
    return otp_legs_buste

def add_stops_data_to_leg_matches(chosen_leg_matches,stops_locations):
    """Attach stop coordinates and the card number to the chosen leg matches.

    ``stops_locations`` is left-joined twice, once on ``from_stop_id`` and once
    on ``to_stop_id``, producing ``from_stop_lat/lon`` and ``to_stop_lat/lon``.
    The result is then inner-joined with the module-level ``user_trips_ids``
    frame on ``user_trip_id``.

    Returns:
        DataFrame with ``cardNum``, the columns of the module-level
        ``otp_legs_buste`` frame, and the four coordinate columns, in that
        order.
    """
    def with_stop_coordinates(legs, stop_column, prefix):
        # Look up the stop referenced by `stop_column` and keep its coordinates
        # under prefixed names.
        located = legs.merge(stops_locations, left_on=stop_column, right_on='stop_id', how='left')
        located = located.drop('stop_id', axis=1)
        return located.rename(index=str, columns={'stop_lat': prefix + 'stop_lat',
                                                  'stop_lon': prefix + 'stop_lon'})

    legs_with_origin = with_stop_coordinates(chosen_leg_matches, 'from_stop_id', 'from_')
    legs_with_both = with_stop_coordinates(legs_with_origin, 'to_stop_id', 'to_')
    legs_with_card = legs_with_both.merge(user_trips_ids, on=['user_trip_id'], how='inner')

    output_columns = np.append(np.append(['cardNum'],otp_legs_buste.columns.values),
                               ['from_stop_lat','from_stop_lon','to_stop_lat','to_stop_lon'])
    chosen_leg_matches_data = legs_with_card[output_columns]
    return chosen_leg_matches_data

def build_candidate_itineraries_df(chosen_leg_matches_data):
    """Collapse matched legs into one row per (cardNum, user trip, itinerary).

    For every group the row keeps the boarding stop, start time and
    coordinates of its first leg, the alighting stop, end time and coordinates
    of its last leg, the number of legs (reported as ``match_num_transfers``)
    and whether any leg has ``first_vehicle_boarding`` set (reported as
    ``match_vehicle_boarding``). All aggregated columns carry a ``match_``
    prefix.

    Returns:
        DataFrame with ``cardNum`` (float), ``user_trip_id`` (int),
        ``itinerary_id`` (int) and the ``match_*`` columns.
    """
    def first_value(values):
        return values.iloc[0]

    def last_value(values):
        return values.iloc[-1]

    def count_rows(values):
        return len(values)

    def any_true(values):
        return values.any()

    aggregations = {'from_stop_id': first_value,
                    'matched_start_time': first_value,
                    'from_stop_lat': first_value,
                    'from_stop_lon': first_value,
                    'to_stop_id': last_value,
                    'matched_end_time': last_value,
                    'to_stop_lat': last_value,
                    'to_stop_lon': last_value,
                    'leg_id': count_rows,
                    'first_vehicle_boarding' : any_true}

    per_itinerary = chosen_leg_matches_data \
                    .groupby(['cardNum','user_trip_id','itinerary_id']) \
                    .agg(aggregations)

    renamed = per_itinerary.rename(index=str,
                                   columns={'leg_id':'num_transfers','first_vehicle_boarding':'vehicle_boarding'})
    flat = renamed.add_prefix('match_').reset_index()

    # rename(index=str) turned the group keys into strings; restore numeric types.
    otp_buste_itineraries = flat.assign(cardNum = flat['cardNum'].astype(float),
                                        user_trip_id = flat['user_trip_id'].astype(int),
                                        itinerary_id = flat['itinerary_id'].astype(int))
    return otp_buste_itineraries

def get_candidate_itineraries_summary(candidate_itineraries,boarding_suggestions_matches):
    """Compare candidate itineraries with the observed trip and drop implausible ones.

    Candidates are inner-joined with one row per (cardNum, user trip,
    itinerary) of ``boarding_suggestions_matches``. Four measures are derived:

    * ``start_diff``: absolute gap between matched start and actual boarding;
    * ``trip_duration``: matched end minus matched start;
    * ``origin_dist``: ``dist()`` between the matched first stop and the
      boarding stop;
    * ``next_origin_dist``: ``dist()`` between the matched last stop and the
      next boarding stop.

    Kept rows satisfy 0 < trip_duration < 2h, 0 < start_diff < 1.5h,
    origin_dist < 0.1 and next_origin_dist < 2.0 (all bounds exclusive).

    Returns:
        The filtered summary, sorted by ``cardNum`` and ``user_trip_id``.
    """
    itinerary_key = ['cardNum','user_trip_id','itinerary_id']
    summary_columns = ['cardNum', 'user_trip_id', 'itinerary_id',
                       'match_from_stop_id', 'match_matched_start_time', 'o_boarding_datetime',
                       'match_from_stop_lat', 'match_from_stop_lon', 'o_stop_lat', 'o_stop_lon',
                       'match_to_stop_id', 'match_matched_end_time', 'next_o_boarding_datetime',
                       'match_to_stop_lat', 'match_to_stop_lon', 'next_o_stop_lat', 'next_o_stop_lon',
                       'match_num_transfers', 'match_vehicle_boarding']

    def distance_to_origin(row):
        return dist(row['match_from_stop_lat'], row['match_from_stop_lon'], row['o_stop_lat'], row['o_stop_lon'])

    def distance_to_next_origin(row):
        return dist(row['match_to_stop_lat'], row['match_to_stop_lon'], row['next_o_stop_lat'], row['next_o_stop_lon'])

    unique_matches = boarding_suggestions_matches.drop_duplicates(subset=itinerary_key)
    joined = candidate_itineraries.merge(unique_matches, on=itinerary_key, how='inner')[summary_columns]

    summary = joined \
                .assign(start_diff = lambda x: np.absolute(x['match_matched_start_time'] - x['o_boarding_datetime']),
                        trip_duration = lambda x: x['match_matched_end_time'] - x['match_matched_start_time'],
                        origin_dist = lambda y: y.apply(distance_to_origin,axis=1),
                        next_origin_dist = lambda y: y.apply(distance_to_next_origin,axis=1)) \
                .sort_values(['cardNum','user_trip_id'])

    plausible_duration = (summary['trip_duration'] > pd.Timedelta('0s')) & \
                         (summary['trip_duration'] < pd.Timedelta('2h'))
    plausible_start = (summary['start_diff'] > pd.Timedelta('0s')) & \
                      (summary['start_diff'] < pd.Timedelta('1.5h'))

    otp_buste_itineraries_summary = summary[plausible_duration & plausible_start] \
                                        .query('origin_dist < 0.1') \
                                        .query('next_origin_dist < 2.0')
    return otp_buste_itineraries_summary

def get_candidate_itineraries_penalty_score(otp_buste_itineraries_filtered):
    """Score candidate itineraries; a lower penalty is a better candidate.

    ``penalty = 2 * start_diff (s) + trip_duration (s) + 10 * match_num_transfers``

    Returns:
        DataFrame with the identifying columns, the measures used and the
        ``penalty``, sorted by ``user_trip_id`` and ascending penalty.
    """
    output_columns = ['cardNum','user_trip_id','itinerary_id','match_num_transfers','match_vehicle_boarding',
                      'next_origin_dist','origin_dist','start_diff','trip_duration','penalty']

    candidates = otp_buste_itineraries_filtered
    start_penalty = 2*candidates['start_diff'].dt.total_seconds()
    duration_penalty = candidates['trip_duration'].dt.total_seconds()
    transfer_penalty = candidates['match_num_transfers']*10

    scored = candidates.assign(penalty = start_penalty + duration_penalty + transfer_penalty)
    otp_buste_itineraries_penalty = scored[output_columns].sort_values(['user_trip_id','penalty'], ascending=True)
    return otp_buste_itineraries_penalty


#### Read OTP Suggestions

In [369]:
# Load the OTP itinerary suggestions, parsing the date and the leg start/end
# columns as datetimes.
# NOTE(review): absolute, machine-local path - consider a configurable data folder.
otp_suggestions_filepath = '/local/tarciso/masters/data/bus_trips/latest/otp-itineraries/it-junho/2017_06_16_user_trips_aa_otp_itineraries.csv'
otp_suggestions_raw = pd.read_csv(otp_suggestions_filepath, parse_dates=['date','otp_start_time','otp_end_time'])

In [370]:
otp_suggestions = prepare_otp_data(otp_suggestions_raw)

In [371]:
len(otp_suggestions.drop_duplicates(subset=['user_trip_id']))

199

In [372]:
otp_suggestions.head()

Unnamed: 0,date,user_trip_id,itinerary_id,leg_id,otp_start_time,otp_end_time,mode,route,from_stop_id,to_stop_id,otp_duration_mins
0,2017-06-16,4699,1,1,2017-06-16 05:00:25,2017-06-16 05:01:59,WALK,,,,1.566667
1,2017-06-16,4699,1,2,2017-06-16 05:02:00,2017-06-16 05:18:21,BUS,342.0,30448.0,34916.0,16.35
2,2017-06-16,4699,1,3,2017-06-16 05:18:22,2017-06-16 05:18:37,WALK,,,,0.25
3,2017-06-16,4699,2,1,2017-06-16 05:15:25,2017-06-16 05:16:59,WALK,,,,1.566667
4,2017-06-16,4699,2,2,2017-06-16 05:17:00,2017-06-16 05:33:21,BUS,342.0,30448.0,34916.0,16.35


In [373]:
len(otp_suggestions)

7839

#### Adding Parent Stop data to OTP Suggestions

In [374]:
# Load the stops table of the GTFS feed (router) that matches the date of the
# OTP suggestions.
# NOTE(review): absolute, machine-local path - consider a configurable data folder.
gtfs_base_folderpath = '/local/tarciso/data/gtfs/'
# Positional access: does not depend on the frame having an index label 0.
file_date = otp_suggestions['date'].iloc[0]
# os.path.join avoids the doubled separator produced by concatenating os.sep
# onto a base path that already ends with one.
stops_filepath = os.path.join(gtfs_base_folderpath, get_router_id(file_date), 'stops.txt')
stops_df = pd.read_csv(stops_filepath)

In [375]:
stops_df.head()

Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station,stop_timezone,wheelchair_boarding
0,70,104505,Terminal Campina do Siqueira - 303 - Centenári...,Terminal Campina do Siqueira - Campo Comprido,-25.435724,-49.306998,,,0,14506.0,,
1,270,104905,Terminal Carmo - 030 - Interbairros III,Terminal Carmo - 030 - Interbairros III (Senti...,-25.501341,-49.237597,,,0,14485.0,,
2,276,105606,Terminal Oficinas - 030 - Interbairros III,Terminal Oficinas - 030 - Interbairros III (Se...,-25.45155,-49.214917,,,0,14481.0,,
3,299,105603,Terminal Oficinas - 030 - Interbairros III,Terminal Oficinas - 030 - Interbairros III (Se...,-25.451665,-49.215086,,,0,14481.0,,
4,308,104907,Terminal Carmo - 030 - Interbairros III,Terminal Carmo - 030 - Interbairros III (Senti...,-25.501311,-49.237825,,,0,14485.0,,


In [376]:
# Keep only the stop -> parent station mapping columns of the GTFS stops table.
stops_parent_stations = stops_df[['stop_id','parent_station']]

In [377]:
# Attach to every leg the parent station of its boarding stop (from_stop_id).
# A parent_station column left by a previous run of this cell is dropped first,
# so re-running the cell does not produce parent_station_x / parent_station_y.
otp_suggestions = otp_suggestions.drop('parent_station', axis=1, errors='ignore') \
                                    .merge(stops_parent_stations, left_on='from_stop_id', right_on='stop_id', how='left') \
                                    .drop(['stop_id'], axis=1)

In [378]:
otp_suggestions.head()

Unnamed: 0,date,user_trip_id,itinerary_id,leg_id,otp_start_time,otp_end_time,mode,route,from_stop_id,to_stop_id,otp_duration_mins,parent_station
0,2017-06-16,4699,1,1,2017-06-16 05:00:25,2017-06-16 05:01:59,WALK,,,,1.566667,
1,2017-06-16,4699,1,2,2017-06-16 05:02:00,2017-06-16 05:18:21,BUS,342.0,30448.0,34916.0,16.35,14471.0
2,2017-06-16,4699,1,3,2017-06-16 05:18:22,2017-06-16 05:18:37,WALK,,,,0.25,
3,2017-06-16,4699,2,1,2017-06-16 05:15:25,2017-06-16 05:16:59,WALK,,,,1.566667,
4,2017-06-16,4699,2,2,2017-06-16 05:17:00,2017-06-16 05:33:21,BUS,342.0,30448.0,34916.0,16.35,14471.0


#### Read Origin/Next-Origin Pairs

In [379]:
# Load the user trips (origin / next-origin boarding pairs), parsing the
# boarding and GPS timestamp columns of both boardings as datetimes.
# NOTE(review): absolute, machine-local path - consider a configurable data folder.
trips_origins = pd.read_csv('/local/tarciso/masters/data/bus_trips/latest/enhanced-buste/2017_06_16_user_trips.csv', parse_dates=['o_boarding_datetime','o_gps_datetime','next_o_boarding_datetime','next_o_gps_datetime'])

In [380]:
len(trips_origins)

139422

In [381]:
trips_origins.sort_values(['cardNum','o_boarding_id']).head()

Unnamed: 0,cardNum,o_boarding_id,o_boarding_datetime,o_route,o_busCode,o_tripNum,o_stopPointId,o_gps_datetime,o_stop_lat,o_stop_lon,...,next_o_boarding_datetime,next_o_route,next_o_busCode,next_o_tripNum,next_o_stopPointId,next_o_gps_datetime,next_o_stop_lat,next_o_stop_lon,boardings_timediff,dist_between_origins
133300,229948.0,2,2017-06-16 15:50:03,828,JA018,7.0,35386,2017-06-16 15:48:58,-25.481932,-49.323108,...,2017-06-16 18:17:11,0,03014,,14499,NaT,-25.476335,-49.292629,0 days 02:27:08.000000000,3.12215
133301,229948.0,4,2017-06-16 18:17:13,0,03014,,14499,NaT,-25.476335,-49.292629,...,2017-06-16 15:50:01,828,JA018,7.0,35386,2017-06-16 15:48:58,-25.481932,-49.323108,0 days 02:27:12.000000000,3.12215
79691,230146.0,5,2017-06-16 08:41:53,0,00057,,43328,NaT,-25.52542,-49.230897,...,2017-06-16 13:13:27,0,00049,,43723,NaT,-25.433715,-49.270258,0 days 04:31:34.000000000,10.935813
79692,230146.0,6,2017-06-16 13:13:27,0,00049,,43723,NaT,-25.433715,-49.270258,...,2017-06-16 08:41:53,0,00057,,43328,NaT,-25.52542,-49.230897,0 days 04:31:34.000000000,10.935813
116647,273085.0,8,2017-06-16 12:58:53,828,JA019,6.0,31000,2017-06-16 12:57:02,-25.441536,-49.347001,...,2017-06-16 15:05:52,611,JA003,14.0,28392,2017-06-16 15:03:46,-25.476564,-49.29235,0 days 02:06:59.000000000,6.728725


In [382]:
trips_origins.dist_between_origins.describe()

count    139422.000000
mean          6.988813
std           3.567462
min           1.500231
25%           4.249495
50%           6.517735
75%           9.069419
max          27.613992
Name: dist_between_origins, dtype: float64

In [383]:
trips_origins.boardings_timediff.describe()

count                        139422
unique                        39142
top       0 days 09:14:19.000000000
freq                             21
Name: boardings_timediff, dtype: object

In [384]:
len(trips_origins)

139422

In [385]:
trips_origins.dtypes

cardNum                            float64
o_boarding_id                        int64
o_boarding_datetime         datetime64[ns]
o_route                             object
o_busCode                           object
o_tripNum                          float64
o_stopPointId                        int64
o_gps_datetime              datetime64[ns]
o_stop_lat                         float64
o_stop_lon                         float64
next_o_boarding_id                   int64
next_o_boarding_datetime    datetime64[ns]
next_o_route                        object
next_o_busCode                      object
next_o_tripNum                     float64
next_o_stopPointId                   int64
next_o_gps_datetime         datetime64[ns]
next_o_stop_lat                    float64
next_o_stop_lon                    float64
boardings_timediff                  object
dist_between_origins               float64
dtype: object

#### Selecting trips for which OTP suggestions were found

In [386]:
# Keep only the trips whose boarding id appears as a user_trip_id in the OTP suggestions.
selected_trips = trips_origins[trips_origins['o_boarding_id'].isin(otp_suggestions['user_trip_id'])]
# Used further down as the denominator of the overall match percentage.
num_selected_trips = len(selected_trips)

In [387]:
len(selected_trips)

199

In [388]:
selected_trips.head()

Unnamed: 0,cardNum,o_boarding_id,o_boarding_datetime,o_route,o_busCode,o_tripNum,o_stopPointId,o_gps_datetime,o_stop_lat,o_stop_lon,...,next_o_boarding_datetime,next_o_route,next_o_busCode,next_o_tripNum,next_o_stopPointId,next_o_gps_datetime,next_o_stop_lat,next_o_stop_lon,boardings_timediff,dist_between_origins
1,3651329.0,180188,2017-06-16 08:59:13,860,LC029,3.0,34130,2017-06-16 08:58:41,-25.455609,-49.348111,...,2017-06-16 00:00:18,860,LC029,5.0,30162,2017-06-16 12:00:07,-25.449347,-49.299662,0 days 08:58:55.000000000,4.914025
2,1712296.0,27414,2017-06-16 05:27:39,166,BC012,9.0,26740,2017-06-16 12:59:12,-25.391601,-49.300962,...,2017-06-16 05:48:02,0,05068,,41920,NaT,-25.429398,-49.272319,0 days 00:20:23.000000000,5.093096
3,1712296.0,27415,2017-06-16 05:48:02,0,05068,,41920,NaT,-25.429398,-49.272319,...,2017-06-16 18:31:55,0,00008,,14487,NaT,-25.481927,-49.246999,0 days 12:43:53.000000000,6.370171
4,1712296.0,27416,2017-06-16 18:31:55,0,00008,,14487,NaT,-25.481927,-49.246999,...,2017-06-16 00:00:58,166,BC016,8.0,26754,2017-06-16 12:00:07,-25.391386,-49.297205,0 days 18:30:57.000000000,11.259416
5,3854484.0,244272,2017-06-16 00:03:29,372,BC318,6.0,30249,2017-06-16 12:01:44,-25.427524,-49.256166,...,2017-06-16 14:29:13,0,01000,,14476,NaT,-25.413047,-49.20548,0 days 14:25:44.000000000,5.338798


In [390]:
# One row per (user_trip_id, itinerary_id): the first BUS leg of the itinerary,
# in the row order of otp_suggestions.
# NOTE(review): groupby().first() takes the first non-null value of each column
# independently, so a missing value in the first BUS leg would be filled from a
# later leg - confirm this cannot happen for the columns used downstream.
itineraries_start = otp_suggestions.query('mode == \'BUS\'') \
                    .groupby(['user_trip_id','itinerary_id']) \
                    .first() \
                    .reset_index()

In [391]:
itineraries_start.head()

Unnamed: 0,user_trip_id,itinerary_id,date,leg_id,otp_start_time,otp_end_time,mode,route,from_stop_id,to_stop_id,otp_duration_mins,parent_station
0,4699,1,2017-06-16,2,2017-06-16 05:02:00,2017-06-16 05:18:21,BUS,342,30448.0,34916.0,16.35,14471.0
1,4699,2,2017-06-16,2,2017-06-16 05:17:00,2017-06-16 05:33:21,BUS,342,30448.0,34916.0,16.35,14471.0
2,4699,3,2017-06-16,2,2017-06-16 05:32:00,2017-06-16 05:50:30,BUS,342,30448.0,34916.0,18.5,14471.0
3,4699,4,2017-06-16,2,2017-06-16 05:29:00,2017-06-16 05:43:53,BUS,222,30450.0,32246.0,14.883333,14471.0
4,4699,5,2017-06-16,2,2017-06-16 05:50:00,2017-06-16 06:08:30,BUS,342,30448.0,34916.0,18.5,14471.0


In [392]:
matched_vehicle_boardings,num_matched_vehicle_boardings,vehicle_match_perc = match_vehicle_boardings(selected_trips,itineraries_start)

In [393]:
# Single-argument print call: same output under the Python 2 print statement
# and valid under the Python 3 print function.
print("Vehicle boardings with matching OTP suggestions:  %s ( %s %%)" % (num_matched_vehicle_boardings, vehicle_match_perc))

Vehicle boardings with matching OTP suggestions:  82 ( 65.0793650794 %)


In [394]:
matched_terminal_boardings,num_matched_terminal_boardings,terminal_matched_perc = match_terminal_boardings(selected_trips,itineraries_start)

In [395]:
# Single-argument print call: same output under the Python 2 print statement
# and valid under the Python 3 print function.
print("Terminal boardings with matching OTP suggestions:  %s ( %s %%)" % (num_matched_terminal_boardings, terminal_matched_perc))

Terminal boardings with matching OTP suggestions:  53 ( 75.7142857143 %)


In [396]:
num_terminal_021_boardings,matched_021_terminal_boardings,num_matched_021_terminal_boardings,terminal_021_match_perc = match_terminal_021_boardings(selected_trips,itineraries_start)

In [397]:
# Report the route-021 terminal matches, or say that there was nothing to match.
# Single-argument print calls: same output under the Python 2 print statement
# and valid under the Python 3 print function.
if (num_terminal_021_boardings > 0):
    print("Line 021 Terminal boardings with matching OTP suggestions:  %s ( %s %%)" % (num_matched_021_terminal_boardings, terminal_021_match_perc))
else:
    print("No Line 021 Terminal boardings found. Skipping matching.")


Line 021 Terminal boardings with matching OTP suggestions:  0 ( 0.0 %)


In [398]:
# Stack the three kinds of boarding matches (vehicle, route-021 terminal, other
# terminal) into one frame and add up their distinct-boarding counts.
boarding_suggestions_matches = pd.concat([matched_vehicle_boardings,matched_021_terminal_boardings,matched_terminal_boardings])
total_num_matches = num_matched_vehicle_boardings + num_matched_021_terminal_boardings + num_matched_terminal_boardings


In [399]:
# Single-argument print call: same output under the Python 2 print statement
# and valid under the Python 3 print function.
print("Total number of matches:  %s ( %s %%)" % (total_num_matches, 100*(total_num_matches/float(num_selected_trips))))

Total number of matches:  135 ( 67.8391959799 %)


In [400]:
boarding_suggestions_matches.head()

Unnamed: 0,cardNum,o_boarding_id,o_boarding_datetime,o_route,o_busCode,o_tripNum,o_stopPointId,o_gps_datetime,o_stop_lat,o_stop_lon,...,date,leg_id,otp_start_time,otp_end_time,mode,route,from_stop_id,to_stop_id,otp_duration_mins,parent_station
0,3651329.0,180188,2017-06-16 08:59:13,860,LC029,3.0,34130,2017-06-16 08:58:41,-25.455609,-49.348111,...,2017-06-16,1,2017-06-16 08:57:21,2017-06-16 09:24:28,BUS,860,34130.0,30162.0,27.116667,
1,3651329.0,180188,2017-06-16 08:59:13,860,LC029,3.0,34130,2017-06-16 08:58:41,-25.455609,-49.348111,...,2017-06-16,1,2017-06-16 09:14:21,2017-06-16 09:41:28,BUS,860,34130.0,30162.0,27.116667,
2,3651329.0,180188,2017-06-16 08:59:13,860,LC029,3.0,34130,2017-06-16 08:58:41,-25.455609,-49.348111,...,2017-06-16,1,2017-06-16 09:31:21,2017-06-16 09:58:28,BUS,860,34130.0,30162.0,27.116667,
3,3651329.0,180188,2017-06-16 08:59:13,860,LC029,3.0,34130,2017-06-16 08:58:41,-25.455609,-49.348111,...,2017-06-16,1,2017-06-16 09:48:21,2017-06-16 10:15:28,BUS,860,34130.0,30162.0,27.116667,
4,3651329.0,180188,2017-06-16 08:59:13,860,LC029,3.0,34130,2017-06-16 08:58:41,-25.455609,-49.348111,...,2017-06-16,1,2017-06-16 10:05:21,2017-06-16 10:32:28,BUS,860,34130.0,30162.0,27.116667,


#### Add OTP extra origin/next-origin pairs to final dataset

In [401]:
boarding_suggestions_matches.dtypes

cardNum                            float64
o_boarding_id                        int64
o_boarding_datetime         datetime64[ns]
o_route                             object
o_busCode                           object
o_tripNum                          float64
o_stopPointId                       object
o_gps_datetime              datetime64[ns]
o_stop_lat                         float64
o_stop_lon                         float64
next_o_boarding_id                   int64
next_o_boarding_datetime    datetime64[ns]
next_o_route                        object
next_o_busCode                      object
next_o_tripNum                     float64
next_o_stopPointId                   int64
next_o_gps_datetime         datetime64[ns]
next_o_stop_lat                    float64
next_o_stop_lon                    float64
boardings_timediff                  object
dist_between_origins               float64
user_trip_id                         int64
itinerary_id                         int64
date       

In [402]:
trips_origins.dtypes

cardNum                            float64
o_boarding_id                        int64
o_boarding_datetime         datetime64[ns]
o_route                             object
o_busCode                           object
o_tripNum                          float64
o_stopPointId                        int64
o_gps_datetime              datetime64[ns]
o_stop_lat                         float64
o_stop_lon                         float64
next_o_boarding_id                   int64
next_o_boarding_datetime    datetime64[ns]
next_o_route                        object
next_o_busCode                      object
next_o_tripNum                     float64
next_o_stopPointId                   int64
next_o_gps_datetime         datetime64[ns]
next_o_stop_lat                    float64
next_o_stop_lon                    float64
boardings_timediff                  object
dist_between_origins               float64
dtype: object

In [404]:
otp_filtered_legs = get_otp_matched_legs(boarding_suggestions_matches,otp_suggestions)

In [405]:
otp_filtered_legs

Unnamed: 0,date,user_trip_id,itinerary_id,leg_id,otp_start_time,otp_end_time,mode,route,from_stop_id,to_stop_id,otp_duration_mins,parent_station,first_vehicle_boarding,o_busCode,o_tripNum,o_boarding_datetime
0,2017-06-16,180188,1,1,2017-06-16 08:57:21,2017-06-16 09:24:28,BUS,860,34130,30162.0,27.116667,,True,LC029,3.0,2017-06-16 08:59:13
1,2017-06-16,180188,3,1,2017-06-16 09:14:21,2017-06-16 09:41:28,BUS,860,34130,30162.0,27.116667,,True,LC029,3.0,2017-06-16 08:59:13
2,2017-06-16,180188,4,1,2017-06-16 09:31:21,2017-06-16 09:58:28,BUS,860,34130,30162.0,27.116667,,True,LC029,3.0,2017-06-16 08:59:13
3,2017-06-16,180188,6,1,2017-06-16 09:48:21,2017-06-16 10:15:28,BUS,860,34130,30162.0,27.116667,,True,LC029,3.0,2017-06-16 08:59:13
4,2017-06-16,180188,7,1,2017-06-16 10:05:21,2017-06-16 10:32:28,BUS,860,34130,30162.0,27.116667,,True,LC029,3.0,2017-06-16 08:59:13
5,2017-06-16,180188,9,1,2017-06-16 10:22:21,2017-06-16 10:49:28,BUS,860,34130,30162.0,27.116667,,True,LC029,3.0,2017-06-16 08:59:13
6,2017-06-16,180188,10,1,2017-06-16 10:39:21,2017-06-16 11:06:28,BUS,860,34130,30162.0,27.116667,,True,LC029,3.0,2017-06-16 08:59:13
7,2017-06-16,244272,2,1,2017-06-16 06:02:49,2017-06-16 06:26:31,BUS,372,30249,30763.0,23.700000,,True,BC318,6.0,2017-06-16 00:03:29
9,2017-06-16,244272,4,1,2017-06-16 06:13:29,2017-06-16 06:35:33,BUS,372,30249,30763.0,22.066667,,True,BC318,6.0,2017-06-16 00:03:29
11,2017-06-16,244272,8,1,2017-06-16 06:25:19,2017-06-16 06:51:28,BUS,372,30249,30763.0,26.150000,,True,BC318,6.0,2017-06-16 00:03:29


In [406]:
otp_filtered_legs.dtypes

date                      datetime64[ns]
user_trip_id                       int64
itinerary_id                       int64
leg_id                             int64
otp_start_time            datetime64[ns]
otp_end_time              datetime64[ns]
mode                              object
route                             object
from_stop_id                      object
to_stop_id                       float64
otp_duration_mins                float64
parent_station                   float64
first_vehicle_boarding              bool
o_busCode                         object
o_tripNum                        float64
o_boarding_datetime       datetime64[ns]
dtype: object

In [407]:
len(otp_filtered_legs)

1394

#### Find OTP Suggested Itineraries in BUSTE Data

In [409]:
# Load the observed bus passages, sorted by route, vehicle, trip and time.
# NOTE(review): absolute, machine-local path - consider a configurable data folder.
bus_trips_filepath = '/local/tarciso/masters/data/bus_trips/latest/enhanced-buste/2017_06_16_bus_trips.csv'
bus_trips = pd.read_csv(bus_trips_filepath, dtype={'route': object},parse_dates=['gps_datetime']) \
                .sort_values(['route','busCode','tripNum','gps_datetime'])

# Normalise route codes to zero-padded 3-character strings. A trailing '.0' is
# stripped with plain string operations, so the result does not depend on the
# default of str.replace's `regex` argument and a '.0' in the middle of a code
# is never removed.
bus_route_codes = bus_trips['route'].astype(str)
bus_trips = bus_trips.assign(route = bus_route_codes.where(~bus_route_codes.str.endswith('.0'),
                                                           bus_route_codes.str[:-2]).str.zfill(3))

In [410]:
bus_trips.head()

Unnamed: 0,route,busCode,tripNum,stopPointId,gps_datetime,stop_lat,stop_lon,parent_station
121790,1,BN998,1.0,29887,2017-06-16 07:01:26,-25.428171,-49.264558,
121791,1,BN998,1.0,31453,2017-06-16 07:02:04,-25.430388,-49.263602,
121792,1,BN998,1.0,31454,2017-06-16 07:02:38,-25.433503,-49.262257,
121793,1,BN998,1.0,30748,2017-06-16 07:05:09,-25.435187,-49.264933,
121794,1,BN998,1.0,30749,2017-06-16 07:05:22,-25.435959,-49.266889,


In [411]:
len(bus_trips)

578484

In [412]:
otp_filtered_legs.head()

Unnamed: 0,date,user_trip_id,itinerary_id,leg_id,otp_start_time,otp_end_time,mode,route,from_stop_id,to_stop_id,otp_duration_mins,parent_station,first_vehicle_boarding,o_busCode,o_tripNum,o_boarding_datetime
0,2017-06-16,180188,1,1,2017-06-16 08:57:21,2017-06-16 09:24:28,BUS,860,34130,30162.0,27.116667,,True,LC029,3.0,2017-06-16 08:59:13
1,2017-06-16,180188,3,1,2017-06-16 09:14:21,2017-06-16 09:41:28,BUS,860,34130,30162.0,27.116667,,True,LC029,3.0,2017-06-16 08:59:13
2,2017-06-16,180188,4,1,2017-06-16 09:31:21,2017-06-16 09:58:28,BUS,860,34130,30162.0,27.116667,,True,LC029,3.0,2017-06-16 08:59:13
3,2017-06-16,180188,6,1,2017-06-16 09:48:21,2017-06-16 10:15:28,BUS,860,34130,30162.0,27.116667,,True,LC029,3.0,2017-06-16 08:59:13
4,2017-06-16,180188,7,1,2017-06-16 10:05:21,2017-06-16 10:32:28,BUS,860,34130,30162.0,27.116667,,True,LC029,3.0,2017-06-16 08:59:13


#### Find candidate matches in BUSTE data

In [413]:
otp_legs_buste_start = match_otp_legs_start_to_buste(otp_filtered_legs,bus_trips)

In [414]:
len(otp_legs_buste_start)

15372

In [415]:
otp_legs_buste_start.head()

Unnamed: 0,user_trip_id,first_vehicle_boarding,itinerary_id,leg_id,route,busCode,o_busCode,tripNum,o_tripNum,from_stop_id,otp_start_time,matched_start_time,o_boarding_datetime,otp_buste_start_timediff,stopPointId,otp_end_time
105320,4699,False,2,2,342,BA120,5006,1.0,,30448,2017-06-16 05:17:00,2017-06-16 06:07:06,2017-06-16 04:43:52,00:50:06,34916.0,2017-06-16 05:33:21
105351,4699,False,3,2,342,BA120,5006,1.0,,30448,2017-06-16 05:32:00,2017-06-16 06:07:06,2017-06-16 04:43:52,00:35:06,34916.0,2017-06-16 05:50:30
105335,4699,False,3,2,342,BA116,5006,1.0,,30448,2017-06-16 05:32:00,2017-06-16 06:20:51,2017-06-16 04:43:52,00:48:51,34916.0,2017-06-16 05:50:30
105336,4699,False,3,2,342,BA116,5006,2.0,,30448,2017-06-16 05:32:00,2017-06-16 06:20:51,2017-06-16 04:43:52,00:48:51,34916.0,2017-06-16 05:50:30
105548,4699,False,4,2,222,BA037,5006,1.0,,30450,2017-06-16 05:29:00,2017-06-16 05:56:59,2017-06-16 04:43:52,00:27:59,32246.0,2017-06-16 05:43:53


In [416]:
otp_legs_buste_start.otp_buste_start_timediff.describe()

count                     15372
mean     0 days 00:30:00.625618
std      0 days 00:17:31.106519
min             0 days 00:00:00
25%             0 days 00:15:17
50%      0 days 00:30:07.500000
75%             0 days 00:44:59
max             0 days 00:59:59
Name: otp_buste_start_timediff, dtype: object

In [418]:
otp_legs_buste = match_otp_legs_end_to_buste(otp_legs_buste_start,bus_trips)

In [419]:
len(otp_legs_buste)

13493

In [420]:
otp_legs_buste.head(50)

Unnamed: 0,user_trip_id,first_vehicle_boarding,itinerary_id,leg_id,route,busCode,o_busCode,tripNum,o_tripNum,from_stop_id,otp_start_time,matched_start_time,o_boarding_datetime,otp_buste_start_timediff,to_stop_id,otp_end_time,matched_end_time,otp_buste_end_timediff,boarding_otp_match_start_timediff,leg_duration
19,4699,False,3,2,342,BA116,5006,2.0,,30448,2017-06-16 05:32:00,2017-06-16 06:20:51,2017-06-16 04:43:52,00:48:51,34916,2017-06-16 05:50:30,2017-06-16 06:43:22,00:52:52,01:36:59,00:22:31
20,4699,False,3,2,342,BA116,5006,2.0,,30448,2017-06-16 05:32:00,2017-06-16 06:20:51,2017-06-16 04:43:52,00:48:51,34916,2017-06-16 05:50:30,2017-06-16 06:43:22,00:52:52,01:36:59,00:22:31
21,4699,False,3,2,342,BA116,5006,2.0,,30448,2017-06-16 05:32:00,2017-06-16 06:20:51,2017-06-16 04:43:52,00:48:51,34916,2017-06-16 05:50:30,2017-06-16 06:43:22,00:52:52,01:36:59,00:22:31
22,4699,False,3,2,342,BA116,5006,2.0,,30448,2017-06-16 05:32:00,2017-06-16 06:20:51,2017-06-16 04:43:52,00:48:51,34916,2017-06-16 05:50:30,2017-06-16 06:43:22,00:52:52,01:36:59,00:22:31
23,4699,False,3,2,342,BA116,5006,2.0,,30448,2017-06-16 05:32:00,2017-06-16 06:20:51,2017-06-16 04:43:52,00:48:51,34916,2017-06-16 05:50:30,2017-06-16 06:43:22,00:52:52,01:36:59,00:22:31
49,4699,False,4,4,30,CB697,5006,1.0,,32495,2017-06-16 05:46:13,2017-06-16 05:45:53,2017-06-16 04:43:52,00:00:20,32500,2017-06-16 05:50:04,2017-06-16 05:48:54,00:01:10,01:02:01,00:03:01
50,4699,False,4,4,30,BB602,5006,1.0,,32495,2017-06-16 05:46:13,2017-06-16 05:54:02,2017-06-16 04:43:52,00:07:49,32500,2017-06-16 05:50:04,2017-06-16 05:57:17,00:07:13,01:10:10,00:03:15
51,4699,False,4,4,30,BB602,5006,1.0,,32495,2017-06-16 05:46:13,2017-06-16 05:54:02,2017-06-16 04:43:52,00:07:49,32500,2017-06-16 05:50:04,2017-06-16 05:57:17,00:07:13,01:10:10,00:03:15
52,4699,False,4,4,30,BB610,5006,1.0,,32495,2017-06-16 05:46:13,2017-06-16 05:36:32,2017-06-16 04:43:52,00:09:41,32500,2017-06-16 05:50:04,2017-06-16 05:39:03,00:11:01,00:52:40,00:02:31
53,4699,False,4,4,30,BB608,5006,1.0,,32495,2017-06-16 05:46:13,2017-06-16 06:02:46,2017-06-16 04:43:52,00:16:33,32500,2017-06-16 05:50:04,2017-06-16 06:04:46,00:14:42,01:18:54,00:02:00


In [421]:
otp_legs_buste.otp_buste_end_timediff.describe()

count                     13493
mean     0 days 00:29:13.215519
std      0 days 00:16:59.030899
min             0 days 00:00:00
25%             0 days 00:15:23
50%             0 days 00:29:55
75%             0 days 00:43:30
max             0 days 00:59:59
Name: otp_buste_end_timediff, dtype: object

In [422]:
otp_legs_buste.boarding_otp_match_start_timediff.describe()

count                     13493
mean     0 days 01:25:59.742384
std      0 days 01:37:40.566903
min             0 days 00:00:00
25%             0 days 00:27:08
50%             0 days 01:00:47
75%             0 days 01:40:31
max             0 days 09:02:30
Name: boarding_otp_match_start_timediff, dtype: object

In [423]:
otp_legs_buste.leg_duration.describe()

count                     13493
mean     0 days 00:22:30.554657
std      0 days 00:12:15.193986
min             0 days 00:00:55
25%             0 days 00:12:20
50%             0 days 00:21:02
75%             0 days 00:30:41
max             0 days 01:46:29
Name: leg_duration, dtype: object

#### Choosing best leg match using current and previous leg information

In [424]:
legs_matches_groups = otp_legs_buste.groupby(['user_trip_id','itinerary_id','leg_id'])

In [425]:
len(legs_matches_groups)

1255

In [426]:
len(otp_legs_buste)

13493

In [427]:
# NOTE(review): choose_leg_matches declares its parameter as `leg_matches_groups`, but its loop
# iterates the notebook-level name `legs_matches_groups`. The argument passed here is therefore
# ignored, and the call only works while that global exists under exactly this name.
# The function also reads the globals `otp_legs_buste` and `otp_suggestions`.
chosen_leg_matches = choose_leg_matches(legs_matches_groups)

In [428]:
chosen_leg_matches.head()

Unnamed: 0,user_trip_id,first_vehicle_boarding,itinerary_id,leg_id,route,busCode,o_busCode,tripNum,o_tripNum,from_stop_id,otp_start_time,matched_start_time,o_boarding_datetime,otp_buste_start_timediff,to_stop_id,otp_end_time,matched_end_time,otp_buste_end_timediff,boarding_otp_match_start_timediff,leg_duration
19,4699,False,3,2,342,BA116,5006,2.0,,30448.0,2017-06-16 05:32:00,2017-06-16 06:20:51,2017-06-16 04:43:52,00:48:51,34916.0,2017-06-16 05:50:30,2017-06-16 06:43:22,00:52:52,01:36:59,00:22:31
63,4699,False,4,4,30,DR106,5006,1.0,,32495.0,2017-06-16 05:46:13,2017-06-16 05:06:26,2017-06-16 04:43:52,00:39:47,32500.0,2017-06-16 05:50:04,2017-06-16 05:08:36,00:41:28,00:22:34,00:02:10
24,4699,False,5,2,342,BA116,5006,2.0,,30448.0,2017-06-16 05:50:00,2017-06-16 06:20:51,2017-06-16 04:43:52,00:30:51,34916.0,2017-06-16 06:08:30,2017-06-16 06:43:22,00:34:52,01:36:59,00:22:31
78,4699,False,6,4,341,CA001,5006,1.0,,35281.0,2017-06-16 06:07:17,2017-06-16 05:13:00,2017-06-16 04:43:52,00:54:17,32133.0,2017-06-16 06:11:29,2017-06-16 05:15:37,00:55:52,00:29:08,00:02:37
29,4699,False,7,2,342,BA116,5006,2.0,,30448.0,2017-06-16 06:08:00,2017-06-16 06:20:51,2017-06-16 04:43:52,00:12:51,34916.0,2017-06-16 06:26:30,2017-06-16 06:43:22,00:16:52,01:36:59,00:22:31


In [429]:
len(chosen_leg_matches)

1237

#### Choosing itinerary

#### Adding stops location data

In [430]:
stops_locations = stops_df[['stop_id','stop_lat','stop_lon']]

In [431]:
# Distinct (cardNum, user_trip_id) pairs, ordered by card number and then by trip id.
user_trips_ids = (
    boarding_suggestions_matches[['cardNum','user_trip_id']]
    .drop_duplicates()
    .sort_values(['cardNum','user_trip_id'])
)


In [432]:
chosen_leg_matches_data = add_stops_data_to_leg_matches(chosen_leg_matches,stops_locations)

In [433]:
chosen_leg_matches_data.head()

Unnamed: 0,cardNum,user_trip_id,first_vehicle_boarding,itinerary_id,leg_id,route,busCode,o_busCode,tripNum,o_tripNum,...,to_stop_id,otp_end_time,matched_end_time,otp_buste_end_timediff,boarding_otp_match_start_timediff,leg_duration,from_stop_lat,from_stop_lon,to_stop_lat,to_stop_lon
0,844324.0,4699,False,3,2,342,BA116,5006,2.0,,...,34916,2017-06-16 05:50:30,2017-06-16 06:43:22,00:52:52,01:36:59,00:22:31,-25.393374,-49.240917,-25.396551,-49.202225
1,844324.0,4699,False,4,4,30,DR106,5006,1.0,,...,32500,2017-06-16 05:50:04,2017-06-16 05:08:36,00:41:28,00:22:34,00:02:10,-25.388687,-49.211563,-25.398503,-49.207786
2,844324.0,4699,False,5,2,342,BA116,5006,2.0,,...,34916,2017-06-16 06:08:30,2017-06-16 06:43:22,00:34:52,01:36:59,00:22:31,-25.393374,-49.240917,-25.396551,-49.202225
3,844324.0,4699,False,6,4,341,CA001,5006,1.0,,...,32133,2017-06-16 06:11:29,2017-06-16 05:15:37,00:55:52,00:29:08,00:02:37,-25.387422,-49.210284,-25.392631,-49.204064
4,844324.0,4699,False,7,2,342,BA116,5006,2.0,,...,34916,2017-06-16 06:26:30,2017-06-16 06:43:22,00:16:52,01:36:59,00:22:31,-25.393374,-49.240917,-25.396551,-49.202225


In [434]:
len(chosen_leg_matches_data)

1237

In [435]:
candidate_itineraries = build_candidate_itineraries_df(chosen_leg_matches_data)

In [436]:
candidate_itineraries.head()

Unnamed: 0,cardNum,user_trip_id,itinerary_id,match_from_stop_lon,match_matched_start_time,match_num_transfers,match_from_stop_lat,match_from_stop_id,match_to_stop_lat,match_vehicle_boarding,match_matched_end_time,match_to_stop_lon,match_to_stop_id
0,844324.0,4699,3,-49.240917,2017-06-16 06:20:51,1,-25.393374,30448.0,-25.396551,False,2017-06-16 06:43:22,-49.202225,34916.0
1,844324.0,4699,4,-49.211563,2017-06-16 05:06:26,1,-25.388687,32495.0,-25.398503,False,2017-06-16 05:08:36,-49.207786,32500.0
2,844324.0,4699,5,-49.240917,2017-06-16 06:20:51,1,-25.393374,30448.0,-25.396551,False,2017-06-16 06:43:22,-49.202225,34916.0
3,844324.0,4699,6,-49.210284,2017-06-16 05:13:00,1,-25.387422,35281.0,-25.392631,False,2017-06-16 05:15:37,-49.204064,32133.0
4,844324.0,4699,7,-49.240917,2017-06-16 06:20:51,1,-25.393374,30448.0,-25.396551,False,2017-06-16 06:43:22,-49.202225,34916.0


In [437]:
boarding_suggestions_matches.dtypes

cardNum                            float64
o_boarding_id                        int64
o_boarding_datetime         datetime64[ns]
o_route                             object
o_busCode                           object
o_tripNum                          float64
o_stopPointId                       object
o_gps_datetime              datetime64[ns]
o_stop_lat                         float64
o_stop_lon                         float64
next_o_boarding_id                   int64
next_o_boarding_datetime    datetime64[ns]
next_o_route                        object
next_o_busCode                      object
next_o_tripNum                     float64
next_o_stopPointId                   int64
next_o_gps_datetime         datetime64[ns]
next_o_stop_lat                    float64
next_o_stop_lon                    float64
boardings_timediff                  object
dist_between_origins               float64
user_trip_id                         int64
itinerary_id                         int64
date       

In [439]:
candidate_itineraries_filtered = get_candidate_itineraries_summary(candidate_itineraries,boarding_suggestions_matches)

In [440]:
candidate_itineraries_filtered.head()

Unnamed: 0,cardNum,user_trip_id,itinerary_id,match_from_stop_id,match_matched_start_time,o_boarding_datetime,match_from_stop_lat,match_from_stop_lon,o_stop_lat,o_stop_lon,...,match_to_stop_lat,match_to_stop_lon,next_o_stop_lat,next_o_stop_lon,match_num_transfers,match_vehicle_boarding,next_origin_dist,origin_dist,start_diff,trip_duration
8,844324.0,4700,1,34915.0,2017-06-16 14:50:47,2017-06-16 14:50:57,-25.39634,-49.20217,-25.39634,-49.20217,...,-25.393374,-49.240917,-25.393295,-49.241051,1,True,0.01613,0.0,00:00:10,00:19:00
9,844324.0,4700,3,34915.0,2017-06-16 14:50:47,2017-06-16 14:50:57,-25.39634,-49.20217,-25.39634,-49.20217,...,-25.393374,-49.240917,-25.393295,-49.241051,1,True,0.01613,0.0,00:00:10,00:19:00
10,844324.0,4700,6,34915.0,2017-06-16 14:50:47,2017-06-16 14:50:57,-25.39634,-49.20217,-25.39634,-49.20217,...,-25.393374,-49.240917,-25.393295,-49.241051,1,True,0.01613,0.0,00:00:10,00:19:00
11,844324.0,4700,8,34915.0,2017-06-16 15:08:58,2017-06-16 14:50:57,-25.39634,-49.20217,-25.39634,-49.20217,...,-25.393374,-49.240917,-25.393295,-49.241051,1,True,0.01613,0.0,00:18:01,00:18:10
12,844324.0,4700,9,34915.0,2017-06-16 15:48:41,2017-06-16 14:50:57,-25.39634,-49.20217,-25.39634,-49.20217,...,-25.393374,-49.240917,-25.393295,-49.241051,1,True,0.01613,0.0,00:57:44,00:17:13


In [441]:
candidate_itineraries_filtered.origin_dist.describe(percentiles=[.25,.5,.75,.9,.95,.99])

count    527.000000
mean       0.018176
std        0.028503
min        0.000000
25%        0.000000
50%        0.000230
75%        0.035070
90%        0.068240
95%        0.078483
99%        0.086670
max        0.086840
Name: origin_dist, dtype: float64

In [442]:
candidate_itineraries_filtered.next_origin_dist.describe(percentiles=[.25,.5,.75,.9,.95,.99])

count    527.000000
mean       0.201685
std        0.322845
min        0.000000
25%        0.009400
50%        0.042690
75%        0.261040
90%        0.659440
95%        0.825018
99%        1.525195
max        1.936180
Name: next_origin_dist, dtype: float64

In [443]:
len(candidate_itineraries_filtered)

527

In [444]:
candidate_itineraries_filtered.match_vehicle_boarding.describe()

count       527
unique        2
top       False
freq        285
Name: match_vehicle_boarding, dtype: object

In [445]:
candidate_itineraries_filtered.drop_duplicates('user_trip_id').match_vehicle_boarding.value_counts()

True     57
False    49
Name: match_vehicle_boarding, dtype: int64

In [446]:
otp_buste_itineraries_penalty = get_candidate_itineraries_penalty_score(candidate_itineraries_filtered)

In [447]:
otp_buste_itineraries_penalty.head()

Unnamed: 0,cardNum,user_trip_id,itinerary_id,match_num_transfers,match_vehicle_boarding,next_origin_dist,origin_dist,start_diff,trip_duration,penalty
8,844324.0,4700,1,1,True,0.01613,0.0,00:00:10,00:19:00,1170.0
9,844324.0,4700,3,1,True,0.01613,0.0,00:00:10,00:19:00,1170.0
10,844324.0,4700,6,1,True,0.01613,0.0,00:00:10,00:19:00,1170.0
11,844324.0,4700,8,1,True,0.01613,0.0,00:18:01,00:18:10,3262.0
12,844324.0,4700,9,1,True,0.01613,0.0,00:57:44,00:17:13,7971.0


In [448]:
# Keep a single candidate itinerary per user trip.
# NOTE(review): GroupBy.first() takes the first non-null value of each column independently, so
# this equals "first row per trip" only when the columns contain no missing values.
# NOTE(review): it also presumes the rows already arrive ordered by ascending penalty within each
# trip — confirm in get_candidate_itineraries_penalty_score, which is defined outside this section.
chosen_itineraries = otp_buste_itineraries_penalty.groupby(['user_trip_id']).first().reset_index()

In [449]:
len(chosen_itineraries)

106

In [450]:
chosen_itineraries.head()

Unnamed: 0,user_trip_id,cardNum,itinerary_id,match_num_transfers,match_vehicle_boarding,next_origin_dist,origin_dist,start_diff,trip_duration,penalty
0,4700,844324.0,1,1,True,0.01613,0.0,00:00:10,00:19:00,1170.0
1,6671,948116.0,2,3,True,0.0,0.0,00:00:12,01:38:44,5978.0
2,6672,948116.0,5,3,True,0.21093,0.0,00:00:46,01:26:31,5313.0
3,7397,983885.0,3,2,False,0.04114,0.00138,00:04:38,00:52:09,3705.0
4,10498,1150536.0,3,3,False,0.0,0.00024,00:00:35,01:12:22,4442.0


In [451]:
# Rows whose match_vehicle_boarding flag is unset are always kept; rows with the flag set are
# kept only when their start_diff is below 20 minutes.
boarding_flag = chosen_itineraries['match_vehicle_boarding']
starts_within_limit = chosen_itineraries['start_diff'] < pd.Timedelta('20 min')
chosen_itineraries = chosen_itineraries[~boarding_flag | (boarding_flag & starts_within_limit)]
num_chosen_itineraries = len(chosen_itineraries)

In [452]:
len(chosen_itineraries)

101

In [453]:
chosen_itineraries.describe(percentiles=[.25,.5,.75,.9,.95,.99])

Unnamed: 0,user_trip_id,cardNum,itinerary_id,match_num_transfers,next_origin_dist,origin_dist,start_diff,trip_duration,penalty
count,101.0,101.0,101.0,101.0,101.0,101.0,101,101,101.0
mean,117938.613861,2854524.0,2.871287,1.861386,0.275187,0.012865,0 days 00:06:08.722772,0 days 00:52:15.980198,3892.039604
std,73609.582367,904657.2,2.11501,0.678671,0.399333,0.025505,0 days 00:12:43.266403,0 days 00:26:40.917137,2035.846055
min,4700.0,844324.0,1.0,1.0,0.0,0.0,0 days 00:00:01,0 days 00:04:08,496.0
25%,54685.0,2276009.0,1.0,1.0,0.01058,0.0,0 days 00:00:25,0 days 00:33:48,2320.0
50%,106999.0,2992415.0,2.0,2.0,0.08667,0.0,0 days 00:01:08,0 days 00:47:35,3745.0
75%,180188.0,3651329.0,4.0,2.0,0.37192,0.00454,0 days 00:04:37,0 days 01:10:56,5208.0
90%,217926.0,3791238.0,6.0,3.0,0.79194,0.06216,0 days 00:16:53,0 days 01:29:55,6287.0
95%,226332.0,3814740.0,7.0,3.0,1.13705,0.07744,0 days 00:39:52,0 days 01:38:44,7557.0
99%,236447.0,3837449.0,9.0,3.0,1.72979,0.08684,0 days 00:59:33,0 days 01:41:34,9228.0


In [454]:
chosen_itineraries.head()

Unnamed: 0,user_trip_id,cardNum,itinerary_id,match_num_transfers,match_vehicle_boarding,next_origin_dist,origin_dist,start_diff,trip_duration,penalty
0,4700,844324.0,1,1,True,0.01613,0.0,00:00:10,00:19:00,1170.0
1,6671,948116.0,2,3,True,0.0,0.0,00:00:12,01:38:44,5978.0
2,6672,948116.0,5,3,True,0.21093,0.0,00:00:46,01:26:31,5313.0
3,7397,983885.0,3,2,False,0.04114,0.00138,00:04:38,00:52:09,3705.0
4,10498,1150536.0,3,3,False,0.0,0.00024,00:00:35,01:12:22,4442.0


In [455]:
chosen_itineraries.dtypes

user_trip_id                        int64
cardNum                           float64
itinerary_id                        int64
match_num_transfers                 int64
match_vehicle_boarding               bool
next_origin_dist                  float64
origin_dist                       float64
start_diff                timedelta64[ns]
trip_duration             timedelta64[ns]
penalty                           float64
dtype: object

In [456]:
# Reports the number of chosen itineraries and its percentage of num_selected_trips
# (defined outside this section). float() forces true division under Python 2.
print "Final number of matches (after processing): ", len(chosen_itineraries) , "(", 100*(num_chosen_itineraries/float(num_selected_trips)), "%)"

Final number of matches (after processing):  101 ( 50.7537688442 %)


In [457]:
# Attach the leg-level match data to every chosen itinerary and keep only the columns that
# describe each leg.
od_trips_columns = ['cardNum','user_trip_id','itinerary_id','leg_id','route','busCode','tripNum',
                    'from_stop_id','matched_start_time','from_stop_lat','from_stop_lon','to_stop_id',
                    'matched_end_time','to_stop_lat','to_stop_lon','leg_duration']
od_trips = (
    chosen_itineraries
    .merge(chosen_leg_matches_data, how='inner', on=['cardNum','user_trip_id','itinerary_id'])
    .filter(od_trips_columns)
    # index=str converts the row labels to strings; the two matched_* columns get shorter names.
    .rename(index=str, columns={'matched_start_time':'start_time','matched_end_time':'end_time'})
)

In [458]:
od_trips.head()

Unnamed: 0,cardNum,user_trip_id,itinerary_id,leg_id,route,busCode,tripNum,from_stop_id,start_time,from_stop_lat,from_stop_lon,to_stop_id,end_time,to_stop_lat,to_stop_lon,leg_duration
0,844324.0,4700,1,1,342,BA120,10.0,34915,2017-06-16 14:50:47,-25.39634,-49.20217,30448,2017-06-16 15:09:47,-25.393374,-49.240917,00:19:00
1,948116.0,6671,2,1,684,HA610,1.0,39372,2017-06-16 04:48:26,-25.59345,-49.33388,31053,2017-06-16 05:12:47,-25.513098,-49.295303,00:24:21
2,948116.0,6671,2,3,204,HL309,1.0,26252,2017-06-16 05:29:56,-25.512983,-49.294505,25917,2017-06-16 05:57:00,-25.426995,-49.264729,00:27:04
3,948116.0,6671,2,5,361,BC906,1.0,29889,2017-06-16 06:21:06,-25.42691,-49.26308,28979,2017-06-16 06:27:10,-25.417287,-49.249938,00:06:04
4,948116.0,6672,5,1,216,BA603,7.0,28979,2017-06-16 15:25:37,-25.417287,-49.249938,28332,2017-06-16 16:00:52,-25.476127,-49.292362,00:35:15


In [462]:
od_trips.dtypes

cardNum                  float64
user_trip_id              object
itinerary_id              object
leg_id                    object
route                     object
busCode                   object
tripNum                  float64
from_stop_id              object
start_time        datetime64[ns]
from_stop_lat            float64
from_stop_lon            float64
to_stop_id                object
end_time          datetime64[ns]
to_stop_lat              float64
to_stop_lon              float64
leg_duration     timedelta64[ns]
dtype: object

In [459]:
od_trips.head(10).filter(['cardNum','user_trip_id','start_time','end_time','leg_duration'])

Unnamed: 0,cardNum,user_trip_id,start_time,end_time,leg_duration
0,844324.0,4700,2017-06-16 14:50:47,2017-06-16 15:09:47,00:19:00
1,948116.0,6671,2017-06-16 04:48:26,2017-06-16 05:12:47,00:24:21
2,948116.0,6671,2017-06-16 05:29:56,2017-06-16 05:57:00,00:27:04
3,948116.0,6671,2017-06-16 06:21:06,2017-06-16 06:27:10,00:06:04
4,948116.0,6672,2017-06-16 15:25:37,2017-06-16 16:00:52,00:35:15
5,948116.0,6672,2017-06-16 16:08:15,2017-06-16 16:19:48,00:11:33
6,948116.0,6672,2017-06-16 16:22:04,2017-06-16 16:52:08,00:30:04
7,983885.0,7397,2017-06-16 15:09:52,2017-06-16 15:14:31,00:04:39
8,983885.0,7397,2017-06-16 15:22:47,2017-06-16 16:02:01,00:39:14
9,1150536.0,10498,2017-06-16 14:54:12,2017-06-16 14:57:42,00:03:30


In [460]:
od_trips.tail(10).filter(['cardNum','user_trip_id','start_time','end_time','leg_duration'])

Unnamed: 0,cardNum,user_trip_id,start_time,end_time,leg_duration
178,3825303.0,230801,2017-06-16 04:43:36,2017-06-16 05:12:47,00:29:11
179,3825303.0,230801,2017-06-16 05:29:56,2017-06-16 05:59:12,00:29:16
180,3825303.0,230801,2017-06-16 06:03:06,2017-06-16 06:08:36,00:05:30
181,3825303.0,230802,2017-06-16 13:35:25,2017-06-16 13:49:33,00:14:08
182,3825303.0,230802,2017-06-16 14:01:02,2017-06-16 14:20:59,00:19:57
183,3825303.0,230802,2017-06-16 14:28:42,2017-06-16 15:00:03,00:31:21
184,3837449.0,236447,2017-06-16 04:43:38,2017-06-16 05:49:30,01:05:52
185,3837449.0,236447,2017-06-16 05:49:35,2017-06-16 06:01:18,00:11:43
186,3837449.0,236448,2017-06-16 18:18:38,2017-06-16 18:30:34,00:11:56
187,3837449.0,236448,2017-06-16 18:36:27,2017-06-16 19:29:26,00:52:59


In [461]:
len(od_trips)

188