In [277]:
import pandas as pd
import numpy as np
from geopy import distance

from datetime import datetime
import os

#### Functions

In [278]:
def dist(p1_lat, p1_lon, p2_lat, p2_lon):
    return np.around(distance.geodesic((p1_lat,p1_lon),(p2_lat,p2_lon)).km,decimals=5)

def get_router_id(query_date):
    INTERMEDIATE_OTP_DATE = pd.to_datetime("2017-06-30", format="%Y-%m-%d")
    router_id = ''

    if (query_date <= INTERMEDIATE_OTP_DATE):
        return 'ctba-2017-1'
    else:
        return 'ctba-2017-2'

#### Read Origin/Next-Origin Pairs

In [86]:
trips_origins = pd.read_csv('/local/tarciso/masters/data/bus_trips/latest/enhanced-buste/2017_06_16_user_trips.csv', parse_dates=['o_boarding_datetime','o_gps_datetime','next_o_boarding_datetime','next_o_gps_datetime'])

In [87]:
len(trips_origins)

139422

In [88]:
trips_origins.sort_values(['cardNum','o_boarding_id']).head(20)

Unnamed: 0,cardNum,o_boarding_id,o_boarding_datetime,o_route,o_busCode,o_tripNum,o_stopPointId,o_gps_datetime,o_stop_lat,o_stop_lon,...,next_o_boarding_datetime,next_o_route,next_o_busCode,next_o_tripNum,next_o_stopPointId,next_o_gps_datetime,next_o_stop_lat,next_o_stop_lon,boardings_timediff,dist_between_origins
133300,229948.0,2,2017-06-16 15:50:03,828,JA018,7.0,35386,2017-06-16 15:48:58,-25.481932,-49.323108,...,2017-06-16 18:17:11,0,03014,,14499,NaT,-25.476335,-49.292629,0 days 02:27:08.000000000,3.12215
133301,229948.0,4,2017-06-16 18:17:13,0,03014,,14499,NaT,-25.476335,-49.292629,...,2017-06-16 15:50:01,828,JA018,7.0,35386,2017-06-16 15:48:58,-25.481932,-49.323108,0 days 02:27:12.000000000,3.12215
79691,230146.0,5,2017-06-16 08:41:53,0,00057,,43328,NaT,-25.52542,-49.230897,...,2017-06-16 13:13:27,0,00049,,43723,NaT,-25.433715,-49.270258,0 days 04:31:34.000000000,10.935813
79692,230146.0,6,2017-06-16 13:13:27,0,00049,,43723,NaT,-25.433715,-49.270258,...,2017-06-16 08:41:53,0,00057,,43328,NaT,-25.52542,-49.230897,0 days 04:31:34.000000000,10.935813
116647,273085.0,8,2017-06-16 12:58:53,828,JA019,6.0,31000,2017-06-16 12:57:02,-25.441536,-49.347001,...,2017-06-16 15:05:52,611,JA003,14.0,28392,2017-06-16 15:03:46,-25.476564,-49.29235,0 days 02:06:59.000000000,6.728725
116648,273085.0,9,2017-06-16 15:05:52,611,JA003,14.0,28392,2017-06-16 15:03:46,-25.476564,-49.29235,...,2017-06-16 12:58:53,828,JA019,6.0,31000,2017-06-16 12:57:02,-25.441536,-49.347001,0 days 02:06:59.000000000,6.728725
114530,305601.0,12,2017-06-16 12:46:44,184,BN622,7.0,31906,2017-06-16 12:45:01,-25.354392,-49.281171,...,2017-06-16 13:30:58,0,00044,,43723,NaT,-25.433715,-49.270258,0 days 00:44:14.000000000,8.888211
114531,305601.0,13,2017-06-16 13:30:58,0,00044,,43723,NaT,-25.433715,-49.270258,...,2017-06-16 12:46:44,184,BN622,7.0,31906,2017-06-16 12:45:01,-25.354392,-49.281171,0 days 00:44:14.000000000,8.888211
126012,305823.0,14,2017-06-16 14:07:02,342,BA116,9.0,34932,2017-06-16 14:06:37,-25.40891,-49.20073,...,2017-06-16 16:00:25,0,03032,,26094,NaT,-25.456942,-49.28861,0 days 01:53:23.000000000,10.315149
126013,305823.0,15,2017-06-16 16:00:25,0,03032,,26094,NaT,-25.456942,-49.28861,...,2017-06-16 14:07:02,342,BA116,9.0,34932,2017-06-16 14:06:37,-25.40891,-49.20073,0 days 01:53:23.000000000,10.315149


In [89]:
trips_origins.dist_between_origins.describe()

count    139422.000000
mean          6.988813
std           3.567462
min           1.500231
25%           4.249495
50%           6.517735
75%           9.069419
max          27.613992
Name: dist_between_origins, dtype: float64

In [90]:
trips_origins.boardings_timediff.describe()

count                        139422
unique                        39142
top       0 days 09:14:19.000000000
freq                             21
Name: boardings_timediff, dtype: object

In [91]:
len(trips_origins)

139422

In [92]:
trips_origins.dtypes

cardNum                            float64
o_boarding_id                        int64
o_boarding_datetime         datetime64[ns]
o_route                             object
o_busCode                           object
o_tripNum                          float64
o_stopPointId                        int64
o_gps_datetime              datetime64[ns]
o_stop_lat                         float64
o_stop_lon                         float64
next_o_boarding_id                   int64
next_o_boarding_datetime    datetime64[ns]
next_o_route                        object
next_o_busCode                      object
next_o_tripNum                     float64
next_o_stopPointId                   int64
next_o_gps_datetime         datetime64[ns]
next_o_stop_lat                    float64
next_o_stop_lon                    float64
boardings_timediff                  object
dist_between_origins               float64
dtype: object

#### Read OTP Suggestions

In [93]:
otp_suggestions = pd.read_csv('/local/tarciso/masters/data/bus_trips/latest/otp-itineraries/it-junho/2017_06_16_user_trips_aa_otp_itineraries.csv', parse_dates=['date','otp_start_time','otp_end_time'])
otp_suggestions['otp_start_time'] = otp_suggestions['otp_start_time'] - pd.Timedelta('10800 s')
otp_suggestions['otp_end_time'] = otp_suggestions['otp_end_time'] - pd.Timedelta('10800 s')
otp_suggestions['route'] = otp_suggestions['route'].astype(str)
otp_suggestions['route'] = np.where(otp_suggestions['mode'] == 'BUS',
                                    otp_suggestions['route'].astype(str).str.replace("\.0",'').str.zfill(3),
                                    otp_suggestions['route'])

In [94]:
otp_suggestions['date'][0] == pd.to_datetime(trips_origins['o_boarding_datetime'].dt.strftime('%Y-%m-%d')[0])

True

In [95]:
len(otp_suggestions.drop_duplicates(subset=['user_trip_id']))

199

In [96]:
otp_suggestions.head()

Unnamed: 0,date,user_trip_id,itinerary_id,leg_id,otp_start_time,otp_end_time,mode,route,from_stop_id,to_stop_id,otp_duration_mins
0,2017-06-16,4699,1,1,2017-06-16 05:00:25,2017-06-16 05:01:59,WALK,,,,1.566667
1,2017-06-16,4699,1,2,2017-06-16 05:02:00,2017-06-16 05:18:21,BUS,342.0,30448.0,34916.0,16.35
2,2017-06-16,4699,1,3,2017-06-16 05:18:22,2017-06-16 05:18:37,WALK,,,,0.25
3,2017-06-16,4699,2,1,2017-06-16 05:15:25,2017-06-16 05:16:59,WALK,,,,1.566667
4,2017-06-16,4699,2,2,2017-06-16 05:17:00,2017-06-16 05:33:21,BUS,342.0,30448.0,34916.0,16.35


In [97]:
len(otp_suggestions)

7839

#### Adding Parent Stop data to OTP Suggestions

In [98]:
base_gtfs_folder = '/local/tarciso/data/gtfs/'
router_id = get_router_id(otp_suggestions['date'][0])
stops_df = pd.read_csv(base_gtfs_folder + os.sep + router_id + os.sep + 'stops.txt')

In [99]:
stops_df.head()

Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station,stop_timezone,wheelchair_boarding
0,70,104505,Terminal Campina do Siqueira - 303 - Centenári...,Terminal Campina do Siqueira - Campo Comprido,-25.435724,-49.306998,,,0,14506.0,,
1,270,104905,Terminal Carmo - 030 - Interbairros III,Terminal Carmo - 030 - Interbairros III (Senti...,-25.501341,-49.237597,,,0,14485.0,,
2,276,105606,Terminal Oficinas - 030 - Interbairros III,Terminal Oficinas - 030 - Interbairros III (Se...,-25.45155,-49.214917,,,0,14481.0,,
3,299,105603,Terminal Oficinas - 030 - Interbairros III,Terminal Oficinas - 030 - Interbairros III (Se...,-25.451665,-49.215086,,,0,14481.0,,
4,308,104907,Terminal Carmo - 030 - Interbairros III,Terminal Carmo - 030 - Interbairros III (Senti...,-25.501311,-49.237825,,,0,14485.0,,


In [100]:
stops_parent_stations = stops_df[['stop_id','parent_station']]

In [101]:
otp_suggestions = otp_suggestions.merge(stops_parent_stations, left_on='from_stop_id', right_on='stop_id', how='left') \
                                    .drop(['stop_id'], axis=1)

In [102]:
otp_suggestions.head()

Unnamed: 0,date,user_trip_id,itinerary_id,leg_id,otp_start_time,otp_end_time,mode,route,from_stop_id,to_stop_id,otp_duration_mins,parent_station
0,2017-06-16,4699,1,1,2017-06-16 05:00:25,2017-06-16 05:01:59,WALK,,,,1.566667,
1,2017-06-16,4699,1,2,2017-06-16 05:02:00,2017-06-16 05:18:21,BUS,342.0,30448.0,34916.0,16.35,14471.0
2,2017-06-16,4699,1,3,2017-06-16 05:18:22,2017-06-16 05:18:37,WALK,,,,0.25,
3,2017-06-16,4699,2,1,2017-06-16 05:15:25,2017-06-16 05:16:59,WALK,,,,1.566667,
4,2017-06-16,4699,2,2,2017-06-16 05:17:00,2017-06-16 05:33:21,BUS,342.0,30448.0,34916.0,16.35,14471.0


#### Selecting trips for whom OTP suggestions were found

In [103]:
selected_trips = trips_origins[trips_origins['o_boarding_id'].isin(otp_suggestions['user_trip_id'])]

In [104]:
len(selected_trips)

199

In [105]:
selected_trips.head()

Unnamed: 0,cardNum,o_boarding_id,o_boarding_datetime,o_route,o_busCode,o_tripNum,o_stopPointId,o_gps_datetime,o_stop_lat,o_stop_lon,...,next_o_boarding_datetime,next_o_route,next_o_busCode,next_o_tripNum,next_o_stopPointId,next_o_gps_datetime,next_o_stop_lat,next_o_stop_lon,boardings_timediff,dist_between_origins
1,3651329.0,180188,2017-06-16 08:59:13,860,LC029,3.0,34130,2017-06-16 08:58:41,-25.455609,-49.348111,...,2017-06-16 00:00:18,860,LC029,5.0,30162,2017-06-16 12:00:07,-25.449347,-49.299662,0 days 08:58:55.000000000,4.914025
2,1712296.0,27414,2017-06-16 05:27:39,166,BC012,9.0,26740,2017-06-16 12:59:12,-25.391601,-49.300962,...,2017-06-16 05:48:02,0,05068,,41920,NaT,-25.429398,-49.272319,0 days 00:20:23.000000000,5.093096
3,1712296.0,27415,2017-06-16 05:48:02,0,05068,,41920,NaT,-25.429398,-49.272319,...,2017-06-16 18:31:55,0,00008,,14487,NaT,-25.481927,-49.246999,0 days 12:43:53.000000000,6.370171
4,1712296.0,27416,2017-06-16 18:31:55,0,00008,,14487,NaT,-25.481927,-49.246999,...,2017-06-16 00:00:58,166,BC016,8.0,26754,2017-06-16 12:00:07,-25.391386,-49.297205,0 days 18:30:57.000000000,11.259416
5,3854484.0,244272,2017-06-16 00:03:29,372,BC318,6.0,30249,2017-06-16 12:01:44,-25.427524,-49.256166,...,2017-06-16 14:29:13,0,01000,,14476,NaT,-25.413047,-49.20548,0 days 14:25:44.000000000,5.338798


In [106]:
itineraries_start = otp_suggestions.query('mode == \'BUS\'') \
                    .groupby(['user_trip_id','itinerary_id']) \
                    .first() \
                    .reset_index()

In [107]:
itineraries_start.head()

Unnamed: 0,user_trip_id,itinerary_id,date,leg_id,otp_start_time,otp_end_time,mode,route,from_stop_id,to_stop_id,otp_duration_mins,parent_station
0,4699,1,2017-06-16,2,2017-06-16 05:02:00,2017-06-16 05:18:21,BUS,342,30448.0,34916.0,16.35,14471.0
1,4699,2,2017-06-16,2,2017-06-16 05:17:00,2017-06-16 05:33:21,BUS,342,30448.0,34916.0,16.35,14471.0
2,4699,3,2017-06-16,2,2017-06-16 05:32:00,2017-06-16 05:50:30,BUS,342,30448.0,34916.0,18.5,14471.0
3,4699,4,2017-06-16,2,2017-06-16 05:29:00,2017-06-16 05:43:53,BUS,222,30450.0,32246.0,14.883333,14471.0
4,4699,5,2017-06-16,2,2017-06-16 05:50:00,2017-06-16 06:08:30,BUS,342,30448.0,34916.0,18.5,14471.0


In [108]:
vehicle_boarding_origins = selected_trips[np.logical_not(selected_trips['o_busCode'].str.isdigit())]

In [109]:
len(vehicle_boarding_origins)

126

In [110]:
vehicle_boarding_origins.head()

Unnamed: 0,cardNum,o_boarding_id,o_boarding_datetime,o_route,o_busCode,o_tripNum,o_stopPointId,o_gps_datetime,o_stop_lat,o_stop_lon,...,next_o_boarding_datetime,next_o_route,next_o_busCode,next_o_tripNum,next_o_stopPointId,next_o_gps_datetime,next_o_stop_lat,next_o_stop_lon,boardings_timediff,dist_between_origins
1,3651329.0,180188,2017-06-16 08:59:13,860,LC029,3.0,34130,2017-06-16 08:58:41,-25.455609,-49.348111,...,2017-06-16 00:00:18,860,LC029,5.0,30162,2017-06-16 12:00:07,-25.449347,-49.299662,0 days 08:58:55.000000000,4.914025
2,1712296.0,27414,2017-06-16 05:27:39,166,BC012,9.0,26740,2017-06-16 12:59:12,-25.391601,-49.300962,...,2017-06-16 05:48:02,0,05068,,41920,NaT,-25.429398,-49.272319,0 days 00:20:23.000000000,5.093096
5,3854484.0,244272,2017-06-16 00:03:29,372,BC318,6.0,30249,2017-06-16 12:01:44,-25.427524,-49.256166,...,2017-06-16 14:29:13,0,01000,,14476,NaT,-25.413047,-49.20548,0 days 14:25:44.000000000,5.338798
7,2976743.0,105813,2017-06-16 00:03:39,372,BC318,6.0,30177,2017-06-16 12:03:38,-25.42732,-49.25479,...,2017-06-16 13:42:12,372,BC318,8.0,30767,2017-06-16 13:42:01,-25.415895,-49.202182,0 days 13:38:33.000000000,5.433909
8,2976743.0,105814,2017-06-16 13:42:12,372,BC318,8.0,30767,2017-06-16 13:42:01,-25.415895,-49.202182,...,2017-06-16 15:50:52,371,BC317,12.0,31860,2017-06-16 15:50:39,-25.416299,-49.223309,0 days 02:08:40.000000000,2.122279


In [111]:
vehicle_boarding_origins.dtypes

cardNum                            float64
o_boarding_id                        int64
o_boarding_datetime         datetime64[ns]
o_route                             object
o_busCode                           object
o_tripNum                          float64
o_stopPointId                        int64
o_gps_datetime              datetime64[ns]
o_stop_lat                         float64
o_stop_lon                         float64
next_o_boarding_id                   int64
next_o_boarding_datetime    datetime64[ns]
next_o_route                        object
next_o_busCode                      object
next_o_tripNum                     float64
next_o_stopPointId                   int64
next_o_gps_datetime         datetime64[ns]
next_o_stop_lat                    float64
next_o_stop_lon                    float64
boardings_timediff                  object
dist_between_origins               float64
dtype: object

In [112]:
itineraries_start.dtypes

user_trip_id                  int64
itinerary_id                  int64
date                 datetime64[ns]
leg_id                        int64
otp_start_time       datetime64[ns]
otp_end_time         datetime64[ns]
mode                         object
route                        object
from_stop_id                float64
to_stop_id                  float64
otp_duration_mins           float64
parent_station              float64
dtype: object

In [113]:
matched_vehicle_boardings = vehicle_boarding_origins.merge(itineraries_start, left_on=['o_boarding_id','o_route','o_stopPointId'], right_on=['user_trip_id','route','from_stop_id'], how='inner')

In [114]:
num_matched_vehicle_boardings = len(matched_vehicle_boardings.drop_duplicates(subset=['cardNum','o_boarding_id']))

print "Vehicle boardings with matching OTP suggestions: ", num_matched_vehicle_boardings, "(", \
                                                            100*(num_matched_vehicle_boardings/float(len(vehicle_boarding_origins))), "%)"

Vehicle boardings with matching OTP suggestions:  82 ( 65.0793650794 %)


In [115]:
matched_vehicle_boardings.head()

Unnamed: 0,cardNum,o_boarding_id,o_boarding_datetime,o_route,o_busCode,o_tripNum,o_stopPointId,o_gps_datetime,o_stop_lat,o_stop_lon,...,date,leg_id,otp_start_time,otp_end_time,mode,route,from_stop_id,to_stop_id,otp_duration_mins,parent_station
0,3651329.0,180188,2017-06-16 08:59:13,860,LC029,3.0,34130,2017-06-16 08:58:41,-25.455609,-49.348111,...,2017-06-16,1,2017-06-16 08:57:21,2017-06-16 09:24:28,BUS,860,34130.0,30162.0,27.116667,
1,3651329.0,180188,2017-06-16 08:59:13,860,LC029,3.0,34130,2017-06-16 08:58:41,-25.455609,-49.348111,...,2017-06-16,1,2017-06-16 09:14:21,2017-06-16 09:41:28,BUS,860,34130.0,30162.0,27.116667,
2,3651329.0,180188,2017-06-16 08:59:13,860,LC029,3.0,34130,2017-06-16 08:58:41,-25.455609,-49.348111,...,2017-06-16,1,2017-06-16 09:31:21,2017-06-16 09:58:28,BUS,860,34130.0,30162.0,27.116667,
3,3651329.0,180188,2017-06-16 08:59:13,860,LC029,3.0,34130,2017-06-16 08:58:41,-25.455609,-49.348111,...,2017-06-16,1,2017-06-16 09:48:21,2017-06-16 10:15:28,BUS,860,34130.0,30162.0,27.116667,
4,3651329.0,180188,2017-06-16 08:59:13,860,LC029,3.0,34130,2017-06-16 08:58:41,-25.455609,-49.348111,...,2017-06-16,1,2017-06-16 10:05:21,2017-06-16 10:32:28,BUS,860,34130.0,30162.0,27.116667,


In [116]:
pd.concat([matched_vehicle_boardings[vehicle_boarding_origins.columns],vehicle_boarding_origins]).drop_duplicates(keep=False)

Unnamed: 0,cardNum,o_boarding_id,o_boarding_datetime,o_route,o_busCode,o_tripNum,o_stopPointId,o_gps_datetime,o_stop_lat,o_stop_lon,...,next_o_boarding_datetime,next_o_route,next_o_busCode,next_o_tripNum,next_o_stopPointId,next_o_gps_datetime,next_o_stop_lat,next_o_stop_lon,boardings_timediff,dist_between_origins
2,1712296.0,27414,2017-06-16 05:27:39,166,BC012,9.0,26740,2017-06-16 12:59:12,-25.391601,-49.300962,...,2017-06-16 05:48:02,000,05068,,41920,NaT,-25.429398,-49.272319,0 days 00:20:23.000000000,5.093096
8,2976743.0,105814,2017-06-16 13:42:12,372,BC318,8.0,30767,2017-06-16 13:42:01,-25.415895,-49.202182,...,2017-06-16 15:50:52,371,BC317,12.0,31860,2017-06-16 15:50:39,-25.416299,-49.223309,0 days 02:08:40.000000000,2.122279
17,3432341.0,145872,2017-06-16 19:12:14,547,GA181,11.0,30013,2017-06-16 19:08:34,-25.534222,-49.267621,...,2017-06-16 00:06:26,535,EA077,8.0,31217,2017-06-16 12:06:12,-25.527039,-49.239771,0 days 19:05:48.000000000,2.906326
19,3765804.0,209025,2017-06-16 23:45:51,372,BC318,16.0,30763,2017-06-16 23:45:36,-25.413104,-49.203865,...,2017-06-16 00:08:07,021,DR105,4.0,27746,2017-06-16 12:06:52,-25.481453,-49.246963,0 days 23:37:44.000000000,8.745649
20,10112431.0,247634,2017-06-16 08:55:13,176,BC850,3.0,28722,2017-06-16 08:51:23,-25.378273,-49.280727,...,2017-06-16 16:50:30,176,BC939,14.0,26550,2017-06-16 16:50:23,-25.424296,-49.274191,0 days 07:55:17.000000000,5.159383
21,10112431.0,247635,2017-06-16 16:50:30,176,BC939,14.0,26550,2017-06-16 16:50:23,-25.424296,-49.274191,...,2017-06-16 00:08:22,176,BC850,7.0,28721,2017-06-16 12:08:01,-25.380019,-49.279938,0 days 16:42:08.000000000,4.957126
23,2550936.0,77043,2017-06-16 14:11:01,183,BC500,7.0,34529,2017-06-16 14:07:22,-25.35634,-49.27331,...,2017-06-16 00:08:36,183,BC001,5.0,28626,2017-06-16 12:08:29,-25.388397,-49.267449,0 days 14:02:25.000000000,3.612847
24,3577178.0,167057,2017-06-16 00:08:59,515,EA303,10.0,33579,2017-06-16 12:08:03,-25.508756,-49.222974,...,2017-06-16 14:39:38,338,EA025,3.0,32805,2017-06-16 14:38:51,-25.482231,-49.224018,0 days 14:30:39.000000000,2.951342
29,3722382.0,196107,2017-06-16 00:09:39,232,BA026,9.0,30455,2017-06-16 12:03:42,-25.377205,-49.224568,...,2017-06-16 13:56:28,822,BA017,2.0,30994,2017-06-16 13:41:19,-25.441708,-49.346497,0 days 13:46:49.000000000,14.192168
30,3722382.0,196108,2017-06-16 13:56:28,822,BA017,2.0,30994,2017-06-16 13:41:19,-25.441708,-49.346497,...,2017-06-16 22:23:24,000,05033,,26083,NaT,-25.426983,-49.264765,0 days 08:26:56.000000000,8.369077


In [117]:
itineraries_start[itineraries_start['user_trip_id'] == 60146]

Unnamed: 0,user_trip_id,itinerary_id,date,leg_id,otp_start_time,otp_end_time,mode,route,from_stop_id,to_stop_id,otp_duration_mins,parent_station


In [118]:
terminal_boarding_origins = selected_trips[selected_trips['o_busCode'].str.isdigit()]

In [119]:
terminal_boarding_origins['o_route'].value_counts()

000    63
021     3
TPH     2
TCB     2
TMA     2
TCR     1
Name: o_route, dtype: int64

In [120]:
terminal_021_origins = selected_trips[(selected_trips['o_busCode'].str.isdigit()) & (selected_trips['o_route'] == '021')]

In [121]:
terminal_021_origins.head()

Unnamed: 0,cardNum,o_boarding_id,o_boarding_datetime,o_route,o_busCode,o_tripNum,o_stopPointId,o_gps_datetime,o_stop_lat,o_stop_lon,...,next_o_boarding_datetime,next_o_route,next_o_busCode,next_o_tripNum,next_o_stopPointId,next_o_gps_datetime,next_o_stop_lat,next_o_stop_lon,boardings_timediff,dist_between_origins
115,2716546.0,87168,2017-06-16 04:45:38,21,4020,,14508,NaT,-25.441614,-49.34674,...,2017-06-16 16:15:26,60,HB606,6.0,33282,2017-06-16 16:14:33,-25.529799,-49.299592,0 days 11:29:48.000000000,10.887952
137,3657515.0,181409,2017-06-16 15:35:21,21,8042,,41765,NaT,-25.433218,-49.299969,...,2017-06-16 04:46:54,652,LA007,1.0,36704,2017-06-16 04:46:46,-25.529424,-49.338524,0 days 10:48:27.000000000,11.376201
187,3463750.0,149820,2017-06-16 12:41:48,21,8045,,41796,NaT,-25.430289,-49.292615,...,2017-06-16 04:49:56,0,00008,,14487,NaT,-25.481927,-49.246999,0 days 07:51:52.000000000,7.344637


In [122]:
matched_021_terminal_boardings = terminal_021_origins.merge(itineraries_start, left_on=['o_boarding_id','o_route','o_stopPointId'], right_on=['user_trip_id','route','parent_station'], how='inner')

In [123]:
num_matched_021_terminal_boardings = len(matched_021_terminal_boardings.drop_duplicates(subset=['cardNum','o_boarding_id']))

print "Line 021 Terminal boardings with matching OTP suggestions: ", num_matched_021_terminal_boardings, "(", \
                                                            100*(num_matched_021_terminal_boardings/float(len(terminal_021_origins))), "%)"

Line 021 Terminal boardings with matching OTP suggestions:  0 ( 0.0 %)


In [124]:
matched_021_terminal_boardings.head()

Unnamed: 0,cardNum,o_boarding_id,o_boarding_datetime,o_route,o_busCode,o_tripNum,o_stopPointId,o_gps_datetime,o_stop_lat,o_stop_lon,...,date,leg_id,otp_start_time,otp_end_time,mode,route,from_stop_id,to_stop_id,otp_duration_mins,parent_station


In [125]:
terminal_boarding_origins = selected_trips[(selected_trips['o_busCode'].str.isdigit()) & (selected_trips['o_route'] != '021')]

In [126]:
terminal_boarding_origins.head()

Unnamed: 0,cardNum,o_boarding_id,o_boarding_datetime,o_route,o_busCode,o_tripNum,o_stopPointId,o_gps_datetime,o_stop_lat,o_stop_lon,...,next_o_boarding_datetime,next_o_route,next_o_busCode,next_o_tripNum,next_o_stopPointId,next_o_gps_datetime,next_o_stop_lat,next_o_stop_lon,boardings_timediff,dist_between_origins
3,1712296.0,27415,2017-06-16 05:48:02,0,5068,,41920,NaT,-25.429398,-49.272319,...,2017-06-16 18:31:55,0,00008,,14487,NaT,-25.481927,-49.246999,0 days 12:43:53.000000000,6.370171
4,1712296.0,27416,2017-06-16 18:31:55,0,8,,14487,NaT,-25.481927,-49.246999,...,2017-06-16 00:00:58,166,BC016,8.0,26754,2017-06-16 12:00:07,-25.391386,-49.297205,0 days 18:30:57.000000000,11.259416
6,3854484.0,244273,2017-06-16 14:29:13,0,1000,,14476,NaT,-25.413047,-49.20548,...,2017-06-16 00:03:29,372,BC318,6.0,30249,2017-06-16 12:01:44,-25.427524,-49.256166,0 days 14:25:44.000000000,5.338798
10,2976743.0,105816,2017-06-16 16:06:58,0,4022,,25427,NaT,-25.433658,-49.263464,...,2017-06-16 22:46:59,0,09004,,14515,NaT,-25.483724,-49.35016,0 days 06:40:01.000000000,10.332124
11,2976743.0,105817,2017-06-16 22:46:59,0,9004,,14515,NaT,-25.483724,-49.35016,...,2017-06-16 00:03:39,372,BC318,6.0,30177,2017-06-16 12:03:38,-25.42732,-49.25479,0 days 22:43:20.000000000,11.446409


In [127]:
matched_terminal_boardings = terminal_boarding_origins.merge(itineraries_start, left_on=['o_boarding_id','o_stopPointId'], right_on=['user_trip_id','parent_station'], how='inner') \
                .drop_duplicates(subset=['cardNum','o_boarding_id'])

In [128]:
num_matched_terminal_boardings = len(matched_terminal_boardings.drop_duplicates(subset=['cardNum','o_boarding_id']))

print "Terminal boardings with matching OTP suggestions: ", num_matched_terminal_boardings, "(", \
                                                            100*(num_matched_terminal_boardings/float(len(terminal_boarding_origins))), "%)"

Terminal boardings with matching OTP suggestions:  53 ( 75.7142857143 %)


In [129]:
matched_terminal_boardings.head()

Unnamed: 0,cardNum,o_boarding_id,o_boarding_datetime,o_route,o_busCode,o_tripNum,o_stopPointId,o_gps_datetime,o_stop_lat,o_stop_lon,...,date,leg_id,otp_start_time,otp_end_time,mode,route,from_stop_id,to_stop_id,otp_duration_mins,parent_station
0,1712296.0,27416,2017-06-16 18:31:55,000,8,,14487,NaT,-25.481927,-49.246999,...,2017-06-16,2,2017-06-16 18:34:04,2017-06-16 18:53:00,BUS,503,27474.0,27560.0,18.933333,14487.0
10,3854484.0,244273,2017-06-16 14:29:13,000,1000,,14476,NaT,-25.413047,-49.20548,...,2017-06-16,2,2017-06-16 14:42:00,2017-06-16 15:06:44,BUS,371,26216.0,30176.0,24.733333,14476.0
16,2976743.0,105817,2017-06-16 22:46:59,000,9004,,14515,NaT,-25.483724,-49.35016,...,2017-06-17,1,2017-06-16 22:45:00,2017-06-16 23:12:19,BUS,703,32966.0,32102.0,27.316667,14515.0
25,3814740.0,226330,2017-06-16 16:29:16,TCB,6003,,14478,NaT,-25.43998,-49.221858,...,2017-06-16,2,2017-06-16 16:32:25,2017-06-16 16:38:27,BUS,303,26191.0,26203.0,6.033333,14478.0
34,3722382.0,196109,2017-06-16 22:23:24,000,5033,,26083,NaT,-25.426983,-49.264765,...,2017-06-17,2,2017-06-16 22:31:40,2017-06-16 22:54:00,BUS,204,25917.0,30454.0,22.333333,26083.0


In [130]:
total_num_matches = num_matched_vehicle_boardings + num_matched_021_terminal_boardings + num_matched_terminal_boardings

print "Total number of matches: ", total_num_matches, "(", \
                                    100*(total_num_matches/float(len(selected_trips))), "%)"

Total number of matches:  135 ( 67.8391959799 %)


In [131]:
boarding_suggestions_matches = pd.concat([matched_vehicle_boardings,matched_021_terminal_boardings,matched_terminal_boardings])

In [132]:
len(boarding_suggestions_matches)

427

In [133]:
boarding_suggestions_matches.head()

Unnamed: 0,cardNum,o_boarding_id,o_boarding_datetime,o_route,o_busCode,o_tripNum,o_stopPointId,o_gps_datetime,o_stop_lat,o_stop_lon,...,date,leg_id,otp_start_time,otp_end_time,mode,route,from_stop_id,to_stop_id,otp_duration_mins,parent_station
0,3651329.0,180188,2017-06-16 08:59:13,860,LC029,3.0,34130,2017-06-16 08:58:41,-25.455609,-49.348111,...,2017-06-16,1,2017-06-16 08:57:21,2017-06-16 09:24:28,BUS,860,34130.0,30162.0,27.116667,
1,3651329.0,180188,2017-06-16 08:59:13,860,LC029,3.0,34130,2017-06-16 08:58:41,-25.455609,-49.348111,...,2017-06-16,1,2017-06-16 09:14:21,2017-06-16 09:41:28,BUS,860,34130.0,30162.0,27.116667,
2,3651329.0,180188,2017-06-16 08:59:13,860,LC029,3.0,34130,2017-06-16 08:58:41,-25.455609,-49.348111,...,2017-06-16,1,2017-06-16 09:31:21,2017-06-16 09:58:28,BUS,860,34130.0,30162.0,27.116667,
3,3651329.0,180188,2017-06-16 08:59:13,860,LC029,3.0,34130,2017-06-16 08:58:41,-25.455609,-49.348111,...,2017-06-16,1,2017-06-16 09:48:21,2017-06-16 10:15:28,BUS,860,34130.0,30162.0,27.116667,
4,3651329.0,180188,2017-06-16 08:59:13,860,LC029,3.0,34130,2017-06-16 08:58:41,-25.455609,-49.348111,...,2017-06-16,1,2017-06-16 10:05:21,2017-06-16 10:32:28,BUS,860,34130.0,30162.0,27.116667,


#### Add OTP extra origin/next-origin pairs to final dataset

In [134]:
boarding_suggestions_matches.dtypes

cardNum                            float64
o_boarding_id                        int64
o_boarding_datetime         datetime64[ns]
o_route                             object
o_busCode                           object
o_tripNum                          float64
o_stopPointId                       object
o_gps_datetime              datetime64[ns]
o_stop_lat                         float64
o_stop_lon                         float64
next_o_boarding_id                   int64
next_o_boarding_datetime    datetime64[ns]
next_o_route                        object
next_o_busCode                      object
next_o_tripNum                     float64
next_o_stopPointId                   int64
next_o_gps_datetime         datetime64[ns]
next_o_stop_lat                    float64
next_o_stop_lon                    float64
boardings_timediff                  object
dist_between_origins               float64
user_trip_id                         int64
itinerary_id                         int64
date       

In [135]:
otp_legs_suggestions_matches = boarding_suggestions_matches[np.append(trips_origins.columns.values,['itinerary_id'])] \
                    .merge(otp_suggestions, left_on=['o_boarding_id','itinerary_id'], right_on=['user_trip_id','itinerary_id'], how='inner') \
                    .query('mode == \'BUS\'')

In [136]:
len(otp_legs_suggestions_matches)

780

In [137]:
otp_legs_suggestions_matches

Unnamed: 0,cardNum,o_boarding_id,o_boarding_datetime,o_route,o_busCode,o_tripNum,o_stopPointId,o_gps_datetime,o_stop_lat,o_stop_lon,...,user_trip_id,leg_id,otp_start_time,otp_end_time,mode,route,from_stop_id,to_stop_id,otp_duration_mins,parent_station
0,3651329.0,180188,2017-06-16 08:59:13,860,LC029,3.0,34130,2017-06-16 08:58:41,-25.455609,-49.348111,...,180188,1,2017-06-16 08:57:21,2017-06-16 09:24:28,BUS,860,34130,30162.0,27.116667,
1,3651329.0,180188,2017-06-16 08:59:13,860,LC029,3.0,34130,2017-06-16 08:58:41,-25.455609,-49.348111,...,180188,1,2017-06-16 09:14:21,2017-06-16 09:41:28,BUS,860,34130,30162.0,27.116667,
2,3651329.0,180188,2017-06-16 08:59:13,860,LC029,3.0,34130,2017-06-16 08:58:41,-25.455609,-49.348111,...,180188,1,2017-06-16 09:31:21,2017-06-16 09:58:28,BUS,860,34130,30162.0,27.116667,
3,3651329.0,180188,2017-06-16 08:59:13,860,LC029,3.0,34130,2017-06-16 08:58:41,-25.455609,-49.348111,...,180188,1,2017-06-16 09:48:21,2017-06-16 10:15:28,BUS,860,34130,30162.0,27.116667,
4,3651329.0,180188,2017-06-16 08:59:13,860,LC029,3.0,34130,2017-06-16 08:58:41,-25.455609,-49.348111,...,180188,1,2017-06-16 10:05:21,2017-06-16 10:32:28,BUS,860,34130,30162.0,27.116667,
5,3651329.0,180188,2017-06-16 08:59:13,860,LC029,3.0,34130,2017-06-16 08:58:41,-25.455609,-49.348111,...,180188,1,2017-06-16 10:22:21,2017-06-16 10:49:28,BUS,860,34130,30162.0,27.116667,
6,3651329.0,180188,2017-06-16 08:59:13,860,LC029,3.0,34130,2017-06-16 08:58:41,-25.455609,-49.348111,...,180188,1,2017-06-16 10:39:21,2017-06-16 11:06:28,BUS,860,34130,30162.0,27.116667,
7,3854484.0,244272,2017-06-16 00:03:29,372,BC318,6.0,30249,2017-06-16 12:01:44,-25.427524,-49.256166,...,244272,1,2017-06-16 06:02:49,2017-06-16 06:26:31,BUS,372,30249,30763.0,23.700000,
9,3854484.0,244272,2017-06-16 00:03:29,372,BC318,6.0,30249,2017-06-16 12:01:44,-25.427524,-49.256166,...,244272,1,2017-06-16 06:13:29,2017-06-16 06:35:33,BUS,372,30249,30763.0,22.066667,
11,3854484.0,244272,2017-06-16 00:03:29,372,BC318,6.0,30249,2017-06-16 12:01:44,-25.427524,-49.256166,...,244272,1,2017-06-16 06:25:19,2017-06-16 06:51:28,BUS,372,30249,30763.0,26.150000,


In [138]:
otp_legs_suggestions_matches['first_vehicle_boarding'] = np.where(np.logical_not(otp_legs_suggestions_matches['o_busCode'].str.isdigit()) & 
                                                                      (otp_legs_suggestions_matches['o_route'] == otp_legs_suggestions_matches['route']),
                                                                      True,
                                                                      False)

In [139]:
otp_legs_suggestions_matches

Unnamed: 0,cardNum,o_boarding_id,o_boarding_datetime,o_route,o_busCode,o_tripNum,o_stopPointId,o_gps_datetime,o_stop_lat,o_stop_lon,...,leg_id,otp_start_time,otp_end_time,mode,route,from_stop_id,to_stop_id,otp_duration_mins,parent_station,first_vehicle_boarding
0,3651329.0,180188,2017-06-16 08:59:13,860,LC029,3.0,34130,2017-06-16 08:58:41,-25.455609,-49.348111,...,1,2017-06-16 08:57:21,2017-06-16 09:24:28,BUS,860,34130,30162.0,27.116667,,True
1,3651329.0,180188,2017-06-16 08:59:13,860,LC029,3.0,34130,2017-06-16 08:58:41,-25.455609,-49.348111,...,1,2017-06-16 09:14:21,2017-06-16 09:41:28,BUS,860,34130,30162.0,27.116667,,True
2,3651329.0,180188,2017-06-16 08:59:13,860,LC029,3.0,34130,2017-06-16 08:58:41,-25.455609,-49.348111,...,1,2017-06-16 09:31:21,2017-06-16 09:58:28,BUS,860,34130,30162.0,27.116667,,True
3,3651329.0,180188,2017-06-16 08:59:13,860,LC029,3.0,34130,2017-06-16 08:58:41,-25.455609,-49.348111,...,1,2017-06-16 09:48:21,2017-06-16 10:15:28,BUS,860,34130,30162.0,27.116667,,True
4,3651329.0,180188,2017-06-16 08:59:13,860,LC029,3.0,34130,2017-06-16 08:58:41,-25.455609,-49.348111,...,1,2017-06-16 10:05:21,2017-06-16 10:32:28,BUS,860,34130,30162.0,27.116667,,True
5,3651329.0,180188,2017-06-16 08:59:13,860,LC029,3.0,34130,2017-06-16 08:58:41,-25.455609,-49.348111,...,1,2017-06-16 10:22:21,2017-06-16 10:49:28,BUS,860,34130,30162.0,27.116667,,True
6,3651329.0,180188,2017-06-16 08:59:13,860,LC029,3.0,34130,2017-06-16 08:58:41,-25.455609,-49.348111,...,1,2017-06-16 10:39:21,2017-06-16 11:06:28,BUS,860,34130,30162.0,27.116667,,True
7,3854484.0,244272,2017-06-16 00:03:29,372,BC318,6.0,30249,2017-06-16 12:01:44,-25.427524,-49.256166,...,1,2017-06-16 06:02:49,2017-06-16 06:26:31,BUS,372,30249,30763.0,23.700000,,True
9,3854484.0,244272,2017-06-16 00:03:29,372,BC318,6.0,30249,2017-06-16 12:01:44,-25.427524,-49.256166,...,1,2017-06-16 06:13:29,2017-06-16 06:35:33,BUS,372,30249,30763.0,22.066667,,True
11,3854484.0,244272,2017-06-16 00:03:29,372,BC318,6.0,30249,2017-06-16 12:01:44,-25.427524,-49.256166,...,1,2017-06-16 06:25:19,2017-06-16 06:51:28,BUS,372,30249,30763.0,26.150000,,True


In [140]:
otp_filtered_legs = otp_legs_suggestions_matches.filter(items=np.append(otp_suggestions.columns.values,['first_vehicle_boarding','o_busCode','o_tripNum','o_boarding_datetime'])) \
                                                .sort_values(['user_trip_id','itinerary_id','leg_id'])

In [141]:
otp_filtered_legs

Unnamed: 0,date,user_trip_id,itinerary_id,leg_id,otp_start_time,otp_end_time,mode,route,from_stop_id,to_stop_id,otp_duration_mins,parent_station,first_vehicle_boarding,o_busCode,o_tripNum,o_boarding_datetime
1266,2017-06-16,4699,1,2,2017-06-16 05:02:00,2017-06-16 05:18:21,BUS,342,30448,34916.0,16.350000,14471.0,False,05006,,2017-06-16 04:43:52
465,2017-06-16,4700,1,1,2017-06-16 14:49:05,2017-06-16 15:10:00,BUS,342,34915,30448.0,20.916667,,True,BA120,10.0,2017-06-16 14:50:57
467,2017-06-16,4700,3,1,2017-06-16 15:09:05,2017-06-16 15:30:00,BUS,342,34915,30448.0,20.916667,,True,BA120,10.0,2017-06-16 14:50:57
469,2017-06-16,4700,6,1,2017-06-16 15:29:05,2017-06-16 15:50:00,BUS,342,34915,30448.0,20.916667,,True,BA120,10.0,2017-06-16 14:50:57
471,2017-06-16,4700,8,1,2017-06-16 15:49:05,2017-06-16 16:10:00,BUS,342,34915,30448.0,20.916667,,True,BA120,10.0,2017-06-16 14:50:57
473,2017-06-16,4700,9,1,2017-06-16 16:09:23,2017-06-16 16:31:00,BUS,342,34915,30448.0,21.616667,,True,BA120,10.0,2017-06-16 14:50:57
475,2017-06-16,4700,10,1,2017-06-16 16:29:41,2017-06-16 16:52:00,BUS,342,34915,30448.0,22.316667,,True,BA120,10.0,2017-06-16 14:50:57
968,2017-06-16,6671,1,1,2017-06-16 04:47:15,2017-06-16 05:18:00,BUS,684,39372,31053.0,30.750000,,True,HA610,1.0,2017-06-16 04:48:38
970,2017-06-16,6671,1,3,2017-06-16 05:30:00,2017-06-16 06:08:27,BUS,204,26252,26234.0,38.450000,14494.0,False,HA610,1.0,2017-06-16 04:48:38
972,2017-06-16,6671,1,5,2017-06-16 06:10:00,2017-06-16 06:15:52,BUS,216,28128,28979.0,5.866667,14474.0,False,HA610,1.0,2017-06-16 04:48:38


In [142]:
len(otp_filtered_legs)

780

#### Find OTP Suggested Itineraries in BUSTE Data

In [144]:
bus_trips = pd.read_csv('/local/tarciso/masters/data/bus_trips/latest/enhanced-buste/2017_06_16_bus_trips.csv', parse_dates=['gps_datetime'])

In [145]:
bus_trips = bus_trips.sort_values(['route','busCode','tripNum','gps_datetime'])
bus_trips['route'] = bus_trips['route'].astype(str).str.replace("\.0",'').str.zfill(3)
bus_trips = bus_trips.drop_duplicates()

In [146]:
bus_trips.head()

Unnamed: 0,route,busCode,tripNum,stopPointId,gps_datetime,stop_lat,stop_lon,parent_station
131072,21,BB606,7.0,33060,2017-06-16 21:39:10,-25.44456,-49.30532,
131073,21,BB606,7.0,33059,2017-06-16 21:40:01,-25.45028,-49.30469,
131074,21,BB606,7.0,33058,2017-06-16 21:41:15,-25.45441,-49.30424,
131075,21,BB606,7.0,33057,2017-06-16 21:41:41,-25.45671,-49.30349,
131076,21,BB606,7.0,33054,2017-06-16 21:43:48,-25.46601,-49.30076,


In [147]:
len(bus_trips)

520063

In [148]:
otp_filtered_legs.head()

Unnamed: 0,date,user_trip_id,itinerary_id,leg_id,otp_start_time,otp_end_time,mode,route,from_stop_id,to_stop_id,otp_duration_mins,parent_station,first_vehicle_boarding,o_busCode,o_tripNum,o_boarding_datetime
1266,2017-06-16,4699,1,2,2017-06-16 05:02:00,2017-06-16 05:18:21,BUS,342,30448,34916.0,16.35,14471.0,False,05006,,2017-06-16 04:43:52
465,2017-06-16,4700,1,1,2017-06-16 14:49:05,2017-06-16 15:10:00,BUS,342,34915,30448.0,20.916667,,True,BA120,10.0,2017-06-16 14:50:57
467,2017-06-16,4700,3,1,2017-06-16 15:09:05,2017-06-16 15:30:00,BUS,342,34915,30448.0,20.916667,,True,BA120,10.0,2017-06-16 14:50:57
469,2017-06-16,4700,6,1,2017-06-16 15:29:05,2017-06-16 15:50:00,BUS,342,34915,30448.0,20.916667,,True,BA120,10.0,2017-06-16 14:50:57
471,2017-06-16,4700,8,1,2017-06-16 15:49:05,2017-06-16 16:10:00,BUS,342,34915,30448.0,20.916667,,True,BA120,10.0,2017-06-16 14:50:57


#### Find candidate matches in BUSTE data

In [149]:
otp_legs_buste_start = otp_filtered_legs.merge(bus_trips, 
                                 left_on=['route','from_stop_id'], 
                                 right_on=['route','stopPointId'], 
                                 how='inner') \
                        .assign(otp_buste_start_timediff = 
                                lambda x: np.absolute(x['gps_datetime'] - x['otp_start_time'])) \
                        .filter(['user_trip_id','first_vehicle_boarding','itinerary_id','leg_id','route','busCode',
                                 'o_busCode','tripNum','o_tripNum','from_stop_id','otp_start_time','gps_datetime',
                                 'o_boarding_datetime','otp_buste_start_timediff','to_stop_id','otp_end_time']) \
                        .sort_values(['user_trip_id','itinerary_id','leg_id','otp_buste_start_timediff']) \
                        .rename(index=str, columns={'to_stop_id':'stopPointId', 'gps_datetime':'matched_start_time'})
                        
            
otp_legs_buste_start = otp_legs_buste_start[otp_legs_buste_start['otp_buste_start_timediff'] < pd.Timedelta('60min')]

In [150]:
len(otp_legs_buste_start)

6432

In [151]:
otp_legs_buste_start.head()

Unnamed: 0,user_trip_id,first_vehicle_boarding,itinerary_id,leg_id,route,busCode,o_busCode,tripNum,o_tripNum,from_stop_id,otp_start_time,matched_start_time,o_boarding_datetime,otp_buste_start_timediff,stopPointId,otp_end_time
63,4700,True,1,1,342,BA120,BA120,10.0,10.0,34915,2017-06-16 14:49:05,2017-06-16 14:50:47,2017-06-16 14:50:57,00:01:42,30448.0,2017-06-16 15:10:00
40,4700,True,1,1,342,BA116,BA120,10.0,10.0,34915,2017-06-16 14:49:05,2017-06-16 15:08:58,2017-06-16 14:50:57,00:19:53,30448.0,2017-06-16 15:10:00
39,4700,True,1,1,342,BA116,BA120,9.0,10.0,34915,2017-06-16 14:49:05,2017-06-16 14:12:26,2017-06-16 14:50:57,00:36:39,30448.0,2017-06-16 15:10:00
58,4700,True,1,1,342,BA120,BA120,11.0,10.0,34915,2017-06-16 14:49:05,2017-06-16 15:48:41,2017-06-16 14:50:57,00:59:36,30448.0,2017-06-16 15:10:00
74,4700,True,3,1,342,BA116,BA120,10.0,10.0,34915,2017-06-16 15:09:05,2017-06-16 15:08:58,2017-06-16 14:50:57,00:00:07,30448.0,2017-06-16 15:30:00


In [152]:
otp_legs_buste_start.otp_buste_start_timediff.describe()

count                      6432
mean     0 days 00:29:31.652052
std      0 days 00:17:33.072020
min             0 days 00:00:00
25%             0 days 00:14:36
50%      0 days 00:29:29.500000
75%      0 days 00:44:44.250000
max             0 days 00:59:59
Name: otp_buste_start_timediff, dtype: object

In [179]:
otp_legs_buste = otp_legs_buste_start \
                        .merge(bus_trips, 
                                 on=['route','busCode','tripNum','stopPointId'], 
                                 how='inner') \
                        .assign(otp_buste_end_timediff = 
                                    lambda x: np.absolute(x['gps_datetime'] - x['otp_end_time'])) \
                        .rename(index=str, columns={'stopPointId':'to_stop_id', 'gps_datetime':'matched_end_time'}) \
                        .assign(leg_duration = lambda x: x['matched_end_time'] - x['matched_start_time'],
                                boarding_otp_match_start_timediff = 
                                    lambda x: np.absolute(x['o_boarding_datetime'] - x['matched_start_time'])) \
                        .query('matched_end_time > matched_start_time') \
                        .filter(['user_trip_id','first_vehicle_boarding','itinerary_id','leg_id','route','busCode',
                                 'o_busCode','tripNum','o_tripNum','from_stop_id','otp_start_time',
                                 'matched_start_time','o_boarding_datetime','otp_buste_start_timediff',
                                 'to_stop_id','otp_end_time','matched_end_time','otp_buste_end_timediff',
                                 'boarding_otp_match_start_timediff', 'leg_duration']) \
                        .sort_values(['user_trip_id','itinerary_id','leg_id','otp_buste_end_timediff'])

otp_legs_buste = otp_legs_buste[otp_legs_buste['otp_buste_end_timediff'] < pd.Timedelta('60min')]

In [180]:
len(otp_legs_buste)

4804

In [186]:
otp_legs_buste.head(50)

Unnamed: 0,user_trip_id,first_vehicle_boarding,itinerary_id,leg_id,route,busCode,o_busCode,tripNum,o_tripNum,from_stop_id,otp_start_time,matched_start_time,o_boarding_datetime,otp_buste_start_timediff,to_stop_id,otp_end_time,matched_end_time,otp_buste_end_timediff,boarding_otp_match_start_timediff,leg_duration
0,4700,True,1,1,342,BA120,BA120,10.0,10.0,34915,2017-06-16 14:49:05,2017-06-16 14:50:47,2017-06-16 14:50:57,00:01:42,30448,2017-06-16 15:10:00,2017-06-16 15:09:47,00:00:13,00:00:10,00:19:00
4,4700,True,1,1,342,BA116,BA120,10.0,10.0,34915,2017-06-16 14:49:05,2017-06-16 15:08:58,2017-06-16 14:50:57,00:19:53,30448,2017-06-16 15:10:00,2017-06-16 15:27:08,00:17:08,00:18:01,00:18:10
8,4700,True,1,1,342,BA116,BA120,9.0,10.0,34915,2017-06-16 14:49:05,2017-06-16 14:12:26,2017-06-16 14:50:57,00:36:39,30448,2017-06-16 15:10:00,2017-06-16 14:31:26,00:38:34,00:38:31,00:19:00
10,4700,True,1,1,342,BA120,BA120,11.0,10.0,34915,2017-06-16 14:49:05,2017-06-16 15:48:41,2017-06-16 14:50:57,00:59:36,30448,2017-06-16 15:10:00,2017-06-16 16:05:54,00:55:54,00:57:44,00:17:13
5,4700,True,3,1,342,BA116,BA120,10.0,10.0,34915,2017-06-16 15:09:05,2017-06-16 15:08:58,2017-06-16 14:50:57,00:00:07,30448,2017-06-16 15:30:00,2017-06-16 15:27:08,00:02:52,00:18:01,00:18:10
1,4700,True,3,1,342,BA120,BA120,10.0,10.0,34915,2017-06-16 15:09:05,2017-06-16 14:50:47,2017-06-16 14:50:57,00:18:18,30448,2017-06-16 15:30:00,2017-06-16 15:09:47,00:20:13,00:00:10,00:19:00
11,4700,True,3,1,342,BA120,BA120,11.0,10.0,34915,2017-06-16 15:09:05,2017-06-16 15:48:41,2017-06-16 14:50:57,00:39:36,30448,2017-06-16 15:30:00,2017-06-16 16:05:54,00:35:54,00:57:44,00:17:13
9,4700,True,3,1,342,BA116,BA120,9.0,10.0,34915,2017-06-16 15:09:05,2017-06-16 14:12:26,2017-06-16 14:50:57,00:56:39,30448,2017-06-16 15:30:00,2017-06-16 14:31:26,00:58:34,00:38:31,00:19:00
12,4700,True,6,1,342,BA120,BA120,11.0,10.0,34915,2017-06-16 15:29:05,2017-06-16 15:48:41,2017-06-16 14:50:57,00:19:36,30448,2017-06-16 15:50:00,2017-06-16 16:05:54,00:15:54,00:57:44,00:17:13
6,4700,True,6,1,342,BA116,BA120,10.0,10.0,34915,2017-06-16 15:29:05,2017-06-16 15:08:58,2017-06-16 14:50:57,00:20:07,30448,2017-06-16 15:50:00,2017-06-16 15:27:08,00:22:52,00:18:01,00:18:10


In [187]:
otp_legs_buste.otp_buste_end_timediff.describe()

count                      4804
mean     0 days 00:28:21.276644
std      0 days 00:17:03.920909
min             0 days 00:00:00
25%             0 days 00:13:43
50%             0 days 00:27:49
75%      0 days 00:42:50.250000
max             0 days 00:59:59
Name: otp_buste_end_timediff, dtype: object

In [188]:
otp_legs_buste.boarding_otp_match_start_timediff.describe()

count                      4804
mean     0 days 01:42:45.091590
std      0 days 01:42:32.528292
min             0 days 00:00:00
25%      0 days 00:39:16.500000
50%      0 days 01:15:56.500000
75%             0 days 01:59:14
max             0 days 09:02:30
Name: boarding_otp_match_start_timediff, dtype: object

In [189]:
otp_legs_buste.leg_duration.describe()

count                      4804
mean     0 days 00:21:47.367818
std      0 days 00:11:34.293653
min             0 days 00:01:08
25%             0 days 00:14:06
50%             0 days 00:19:43
75%      0 days 00:27:47.750000
max             0 days 01:33:52
Name: leg_duration, dtype: object

#### Choosing best leg match using current and previous leg information

In [190]:
legs_matches_groups = otp_legs_buste.groupby(['user_trip_id','itinerary_id','leg_id'])

In [191]:
len(legs_matches_groups)

698

In [192]:
len(otp_legs_buste)

4804

In [193]:
chosen_leg_matches = pd.DataFrame(columns = otp_legs_buste.columns.values)
prev_group_id = ()
num_groups_not_survived = 0

for name, group in legs_matches_groups:
    
    if (prev_group_id != name[0:2]):
        prev_leg_end_time = otp_suggestions['date'][0]
    
    #print
    #print prev_leg_end_time
    #print
    #print "Original Group"
    #print group.filter(['otp_start_time','matched_start_time'])
    
    filtered_group = group[group['matched_start_time'] > prev_leg_end_time]
    #print
    #print "Filtered Group"
    #print filtered_group.filter(['otp_start_time','matched_start_time'])
    
    if (len(filtered_group) == 0):
        num_groups_not_survived += 1
        continue
    
    chosen_leg_match = filtered_group.sort_values('boarding_otp_match_start_timediff').iloc[0]
    #print "Chosen Leg"
    #print chosen_leg_match
    
    chosen_leg_matches = chosen_leg_matches.append(chosen_leg_match)
    
    #Update variables
    prev_group_id = name[0:2]
    prev_leg_end_time = chosen_leg_match['matched_end_time']


print num_groups_not_survived
chosen_leg_matches = chosen_leg_matches.filter(otp_legs_buste.columns.values)

5


In [194]:
chosen_leg_matches.head()

Unnamed: 0,user_trip_id,first_vehicle_boarding,itinerary_id,leg_id,route,busCode,o_busCode,tripNum,o_tripNum,from_stop_id,otp_start_time,matched_start_time,o_boarding_datetime,otp_buste_start_timediff,to_stop_id,otp_end_time,matched_end_time,otp_buste_end_timediff,boarding_otp_match_start_timediff,leg_duration
0,4700,True,1,1,342,BA120,BA120,10.0,10.0,34915.0,2017-06-16 14:49:05,2017-06-16 14:50:47,2017-06-16 14:50:57,00:01:42,30448.0,2017-06-16 15:10:00,2017-06-16 15:09:47,00:00:13,00:00:10,00:19:00
1,4700,True,3,1,342,BA120,BA120,10.0,10.0,34915.0,2017-06-16 15:09:05,2017-06-16 14:50:47,2017-06-16 14:50:57,00:18:18,30448.0,2017-06-16 15:30:00,2017-06-16 15:09:47,00:20:13,00:00:10,00:19:00
2,4700,True,6,1,342,BA120,BA120,10.0,10.0,34915.0,2017-06-16 15:29:05,2017-06-16 14:50:47,2017-06-16 14:50:57,00:38:18,30448.0,2017-06-16 15:50:00,2017-06-16 15:09:47,00:40:13,00:00:10,00:19:00
7,4700,True,8,1,342,BA116,BA120,10.0,10.0,34915.0,2017-06-16 15:49:05,2017-06-16 15:08:58,2017-06-16 14:50:57,00:40:07,30448.0,2017-06-16 16:10:00,2017-06-16 15:27:08,00:42:52,00:18:01,00:18:10
14,4700,True,9,1,342,BA120,BA120,11.0,10.0,34915.0,2017-06-16 16:09:23,2017-06-16 15:48:41,2017-06-16 14:50:57,00:20:42,30448.0,2017-06-16 16:31:00,2017-06-16 16:05:54,00:25:06,00:57:44,00:17:13


In [195]:
len(chosen_leg_matches)

693

#### Choosing itinerary

#### Adding stops location data

In [196]:
stops_locations = stops_df[['stop_id','stop_lat','stop_lon']]

In [197]:
user_trips_ids = otp_legs_suggestions_matches[['cardNum','user_trip_id']].drop_duplicates().sort_values(['cardNum','user_trip_id'])

In [198]:
otp_legs_buste_data = chosen_leg_matches.merge(stops_locations, left_on='from_stop_id', right_on='stop_id', how='left') \
        .drop('stop_id', axis=1) \
        .rename(index=str, columns={'stop_lat':'from_stop_lat','stop_lon':'from_stop_lon'}) \
        .merge(stops_locations, left_on='to_stop_id', right_on='stop_id', how='left') \
        .drop('stop_id', axis=1) \
        .rename(index=str, columns={'stop_lat':'to_stop_lat','stop_lon':'to_stop_lon'}) \
        .merge(user_trips_ids, on=['user_trip_id'], how='inner') \
                            [np.append(np.append(['cardNum'],otp_legs_buste.columns.values),['from_stop_lat','from_stop_lon','to_stop_lat','to_stop_lon'])]

In [199]:
otp_legs_buste_data.head()

Unnamed: 0,cardNum,user_trip_id,first_vehicle_boarding,itinerary_id,leg_id,route,busCode,o_busCode,tripNum,o_tripNum,...,to_stop_id,otp_end_time,matched_end_time,otp_buste_end_timediff,boarding_otp_match_start_timediff,leg_duration,from_stop_lat,from_stop_lon,to_stop_lat,to_stop_lon
0,844324.0,4700,True,1,1,342,BA120,BA120,10.0,10.0,...,30448,2017-06-16 15:10:00,2017-06-16 15:09:47,00:00:13,00:00:10,00:19:00,-25.39634,-49.20217,-25.393374,-49.240917
1,844324.0,4700,True,3,1,342,BA120,BA120,10.0,10.0,...,30448,2017-06-16 15:30:00,2017-06-16 15:09:47,00:20:13,00:00:10,00:19:00,-25.39634,-49.20217,-25.393374,-49.240917
2,844324.0,4700,True,6,1,342,BA120,BA120,10.0,10.0,...,30448,2017-06-16 15:50:00,2017-06-16 15:09:47,00:40:13,00:00:10,00:19:00,-25.39634,-49.20217,-25.393374,-49.240917
3,844324.0,4700,True,8,1,342,BA116,BA120,10.0,10.0,...,30448,2017-06-16 16:10:00,2017-06-16 15:27:08,00:42:52,00:18:01,00:18:10,-25.39634,-49.20217,-25.393374,-49.240917
4,844324.0,4700,True,9,1,342,BA120,BA120,11.0,10.0,...,30448,2017-06-16 16:31:00,2017-06-16 16:05:54,00:25:06,00:57:44,00:17:13,-25.39634,-49.20217,-25.393374,-49.240917


In [200]:
len(otp_legs_buste_data)

693

In [201]:
otp_buste_itineraries = otp_legs_buste_data \
                            .groupby(['cardNum','user_trip_id','itinerary_id']) \
                            .agg({'from_stop_id': lambda x: x.iloc[0], 
                                  'matched_start_time': lambda x: x.iloc[0], 
                                  'from_stop_lat': lambda x: x.iloc[0], 
                                  'from_stop_lon': lambda x: x.iloc[0],
                                  'to_stop_id': lambda x: x.iloc[-1], 
                                  'matched_end_time': lambda x: x.iloc[-1], 
                                  'to_stop_lat': lambda x: x.iloc[-1], 
                                  'to_stop_lon': lambda x: x.iloc[-1],
                                  'leg_id': lambda x: len(x),
                                  'first_vehicle_boarding' : lambda x: x.any()}) \
                            .rename(index=str, columns={'leg_id':'num_transfers','first_vehicle_boarding':'vehicle_boarding'}) \
                            .add_prefix('match_') \
                            .reset_index() \
                            .assign(cardNum = lambda x: x['cardNum'].astype(float),
                                    user_trip_id = lambda x: x['user_trip_id'].astype(int),
                                    itinerary_id = lambda x: x['itinerary_id'].astype(int))

In [202]:
otp_buste_itineraries.head()

Unnamed: 0,cardNum,user_trip_id,itinerary_id,match_from_stop_lon,match_matched_start_time,match_num_transfers,match_from_stop_lat,match_from_stop_id,match_to_stop_lat,match_vehicle_boarding,match_matched_end_time,match_to_stop_lon,match_to_stop_id
0,844324.0,4700,1,-49.20217,2017-06-16 14:50:47,1,-25.39634,34915.0,-25.393374,True,2017-06-16 15:09:47,-49.240917,30448.0
1,844324.0,4700,3,-49.20217,2017-06-16 14:50:47,1,-25.39634,34915.0,-25.393374,True,2017-06-16 15:09:47,-49.240917,30448.0
2,844324.0,4700,6,-49.20217,2017-06-16 14:50:47,1,-25.39634,34915.0,-25.393374,True,2017-06-16 15:09:47,-49.240917,30448.0
3,844324.0,4700,8,-49.20217,2017-06-16 15:08:58,1,-25.39634,34915.0,-25.393374,True,2017-06-16 15:27:08,-49.240917,30448.0
4,844324.0,4700,9,-49.20217,2017-06-16 15:48:41,1,-25.39634,34915.0,-25.393374,True,2017-06-16 16:05:54,-49.240917,30448.0


In [279]:
otp_buste_itineraries_summary = otp_buste_itineraries \
                            .merge(otp_legs_suggestions_matches
                                       .drop_duplicates(subset=['cardNum','user_trip_id','itinerary_id']), 
                                   on=['cardNum','user_trip_id','itinerary_id'], how='inner') \
                            [['cardNum', 'user_trip_id', 'itinerary_id',
                                  'match_from_stop_id', 'match_matched_start_time', 'o_boarding_datetime',
                                  'match_from_stop_lat', 'match_from_stop_lon', 'o_stop_lat', 'o_stop_lon',
                                  'match_to_stop_id', 'match_matched_end_time', 'next_o_boarding_datetime',
                                  'match_to_stop_lat', 'match_to_stop_lon', 'next_o_stop_lat', 'next_o_stop_lon','match_num_transfers', 'match_vehicle_boarding']] \
                            .assign(start_diff = lambda x: np.absolute(x['match_matched_start_time'] - x['o_boarding_datetime']),
                                    trip_duration = lambda x: x['match_matched_end_time'] - x['match_matched_start_time'],
                                    origin_dist = lambda x: x.apply(lambda y: dist(y['match_from_stop_lat'], y['match_from_stop_lon'], y['o_stop_lat'], y['o_stop_lon']), axis=1),
                                    next_origin_dist = lambda x: x.apply(lambda y: dist(y['match_to_stop_lat'], y['match_to_stop_lon'], y['next_o_stop_lat'], y['next_o_stop_lon']),axis=1)) \
                            .sort_values(['cardNum','user_trip_id'])

In [280]:
otp_buste_itineraries_summary.head()

Unnamed: 0,cardNum,user_trip_id,itinerary_id,match_from_stop_id,match_matched_start_time,o_boarding_datetime,match_from_stop_lat,match_from_stop_lon,o_stop_lat,o_stop_lon,...,match_to_stop_lat,match_to_stop_lon,next_o_stop_lat,next_o_stop_lon,match_num_transfers,match_vehicle_boarding,next_origin_dist,origin_dist,start_diff,trip_duration
0,844324.0,4700,1,34915.0,2017-06-16 14:50:47,2017-06-16 14:50:57,-25.39634,-49.20217,-25.39634,-49.20217,...,-25.393374,-49.240917,-25.393295,-49.241051,1,True,0.01613,0.0,00:00:10,00:19:00
1,844324.0,4700,3,34915.0,2017-06-16 14:50:47,2017-06-16 14:50:57,-25.39634,-49.20217,-25.39634,-49.20217,...,-25.393374,-49.240917,-25.393295,-49.241051,1,True,0.01613,0.0,00:00:10,00:19:00
2,844324.0,4700,6,34915.0,2017-06-16 14:50:47,2017-06-16 14:50:57,-25.39634,-49.20217,-25.39634,-49.20217,...,-25.393374,-49.240917,-25.393295,-49.241051,1,True,0.01613,0.0,00:00:10,00:19:00
3,844324.0,4700,8,34915.0,2017-06-16 15:08:58,2017-06-16 14:50:57,-25.39634,-49.20217,-25.39634,-49.20217,...,-25.393374,-49.240917,-25.393295,-49.241051,1,True,0.01613,0.0,00:18:01,00:18:10
4,844324.0,4700,9,34915.0,2017-06-16 15:48:41,2017-06-16 14:50:57,-25.39634,-49.20217,-25.39634,-49.20217,...,-25.393374,-49.240917,-25.393295,-49.241051,1,True,0.01613,0.0,00:57:44,00:17:13


In [268]:
otp_buste_itineraries_summary.origin_dist.describe(percentiles=[.25,.5,.75,.9,.95,.99])

count    411.000000
mean       0.144938
std        1.016015
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
90%        0.001380
95%        0.043790
99%        5.966916
max       10.436160
Name: origin_dist, dtype: float64

In [269]:
otp_buste_itineraries_summary.next_origin_dist.describe(percentiles=[.25,.5,.75,.9,.95,.99])

count    411.000000
mean       0.724793
std        2.048477
min        0.000000
25%        0.001490
50%        0.029180
75%        0.231690
90%        1.277540
95%        6.746240
99%        9.147580
max       11.906770
Name: next_origin_dist, dtype: float64

In [270]:
len(otp_buste_itineraries_summary)

411

In [208]:
otp_buste_itineraries_summary.match_vehicle_boarding.describe()

count      411
unique       2
top       True
freq       358
Name: match_vehicle_boarding, dtype: object

In [209]:
otp_buste_itineraries_summary.drop_duplicates('user_trip_id').match_vehicle_boarding.value_counts()

True     79
False    50
Name: match_vehicle_boarding, dtype: int64

In [210]:
otp_buste_itineraries_filtered = otp_buste_itineraries_summary[((otp_buste_itineraries_summary['trip_duration'] > pd.Timedelta('0s')) & (otp_buste_itineraries_summary['trip_duration'] < pd.Timedelta('2h'))) &
                                                               ((otp_buste_itineraries_summary['start_diff'] > pd.Timedelta('0s')) & (otp_buste_itineraries_summary['start_diff'] < pd.Timedelta('1.5h')))] \
                                    .query('origin_dist < 0.1 and next_origin_dist < 2.0')                                        

In [211]:
otp_buste_itineraries_penalty = otp_buste_itineraries_filtered \
                                    .assign(penalty = lambda x: 2*x['start_diff'].dt.total_seconds() + x['trip_duration'].dt.total_seconds() + x['match_num_transfers']*10) \
                                    [['cardNum','user_trip_id','itinerary_id','match_num_transfers','match_vehicle_boarding','next_origin_dist','origin_dist','start_diff','trip_duration','penalty']] \
                                    .sort_values(['user_trip_id','penalty'], ascending=True)

In [212]:
otp_buste_itineraries_penalty.head()

Unnamed: 0,cardNum,user_trip_id,itinerary_id,match_num_transfers,match_vehicle_boarding,next_origin_dist,origin_dist,start_diff,trip_duration,penalty
0,844324.0,4700,1,1,True,0.016131,0.0,00:00:10,00:19:00,1170.0
1,844324.0,4700,3,1,True,0.016131,0.0,00:00:10,00:19:00,1170.0
2,844324.0,4700,6,1,True,0.016131,0.0,00:00:10,00:19:00,1170.0
3,844324.0,4700,8,1,True,0.016131,0.0,00:18:01,00:18:10,3262.0
4,844324.0,4700,9,1,True,0.016131,0.0,00:57:44,00:17:13,7971.0


In [213]:
chosen_itineraries = otp_buste_itineraries_penalty.groupby(['user_trip_id']).first().reset_index()

In [214]:
len(chosen_itineraries)

96

In [215]:
chosen_itineraries.head()

Unnamed: 0,user_trip_id,cardNum,itinerary_id,match_num_transfers,match_vehicle_boarding,next_origin_dist,origin_dist,start_diff,trip_duration,penalty
0,4700,844324.0,1,1,True,0.016131,0.0,00:00:10,00:19:00,1170.0
1,6671,948116.0,2,3,True,0.0,0.0,00:00:12,01:38:44,5978.0
2,6672,948116.0,5,3,True,0.210594,0.0,00:00:46,01:26:31,5313.0
3,7397,983885.0,3,2,False,0.041164,0.001382,00:04:38,00:52:09,3705.0
4,10498,1150536.0,3,3,False,0.0,0.000233,00:00:35,01:12:22,4442.0


In [216]:
chosen_itineraries = chosen_itineraries[(np.logical_not(chosen_itineraries['match_vehicle_boarding'])) | ((chosen_itineraries['match_vehicle_boarding']) & (chosen_itineraries['start_diff'] < pd.Timedelta('20 min')))]

In [217]:
len(chosen_itineraries)

91

In [218]:
chosen_itineraries.describe(percentiles=[.25,.5,.75,.9,.95,.99])

Unnamed: 0,user_trip_id,cardNum,itinerary_id,match_num_transfers,next_origin_dist,origin_dist,start_diff,trip_duration,penalty
count,91.0,91.0,91.0,91.0,91.0,91.0,91,91,91.0
mean,122830.846154,2908163.0,2.032967,1.912088,0.178993,0.010714,0 days 00:07:55.846153,0 days 00:54:59.978021,4270.791209
std,73577.101394,897949.9,1.663,0.66078,0.315497,0.023154,0 days 00:15:44.025045,0 days 00:28:14.651999,2494.277328
min,4700.0,844324.0,1.0,1.0,0.0,0.0,0 days 00:00:01,0 days 00:04:08,496.0
25%,59190.5,2312998.0,1.0,1.0,0.006909,0.0,0 days 00:00:25,0 days 00:33:19.500000,2251.5
50%,133387.0,3326767.0,1.0,2.0,0.035127,0.0,0 days 00:01:18,0 days 00:51:24,3986.0
75%,186232.5,3680733.0,3.0,2.0,0.229149,0.004005,0 days 00:05:52.500000,0 days 01:18:53,5744.0
90%,217927.0,3791238.0,5.0,3.0,0.580171,0.058099,0 days 00:23:36,0 days 01:36:29,6988.0
95%,226341.0,3814760.0,5.0,3.0,0.764088,0.065141,0 days 00:54:24.500000,0 days 01:40:30,9489.5
99%,236447.1,3837449.0,7.2,3.0,1.452678,0.084842,0 days 01:00:29.499999,0 days 01:43:58.899999,11128.0


In [219]:
chosen_itineraries.head()

Unnamed: 0,user_trip_id,cardNum,itinerary_id,match_num_transfers,match_vehicle_boarding,next_origin_dist,origin_dist,start_diff,trip_duration,penalty
0,4700,844324.0,1,1,True,0.016131,0.0,00:00:10,00:19:00,1170.0
1,6671,948116.0,2,3,True,0.0,0.0,00:00:12,01:38:44,5978.0
2,6672,948116.0,5,3,True,0.210594,0.0,00:00:46,01:26:31,5313.0
3,7397,983885.0,3,2,False,0.041164,0.001382,00:04:38,00:52:09,3705.0
4,10498,1150536.0,3,3,False,0.0,0.000233,00:00:35,01:12:22,4442.0


In [220]:
chosen_itineraries.dtypes

user_trip_id                        int64
cardNum                           float64
itinerary_id                        int64
match_num_transfers                 int64
match_vehicle_boarding               bool
next_origin_dist                  float64
origin_dist                       float64
start_diff                timedelta64[ns]
trip_duration             timedelta64[ns]
penalty                           float64
dtype: object

In [221]:
otp_legs_buste_data.head()

Unnamed: 0,cardNum,user_trip_id,first_vehicle_boarding,itinerary_id,leg_id,route,busCode,o_busCode,tripNum,o_tripNum,...,to_stop_id,otp_end_time,matched_end_time,otp_buste_end_timediff,boarding_otp_match_start_timediff,leg_duration,from_stop_lat,from_stop_lon,to_stop_lat,to_stop_lon
0,844324.0,4700,True,1,1,342,BA120,BA120,10.0,10.0,...,30448,2017-06-16 15:10:00,2017-06-16 15:09:47,00:00:13,00:00:10,00:19:00,-25.39634,-49.20217,-25.393374,-49.240917
1,844324.0,4700,True,3,1,342,BA120,BA120,10.0,10.0,...,30448,2017-06-16 15:30:00,2017-06-16 15:09:47,00:20:13,00:00:10,00:19:00,-25.39634,-49.20217,-25.393374,-49.240917
2,844324.0,4700,True,6,1,342,BA120,BA120,10.0,10.0,...,30448,2017-06-16 15:50:00,2017-06-16 15:09:47,00:40:13,00:00:10,00:19:00,-25.39634,-49.20217,-25.393374,-49.240917
3,844324.0,4700,True,8,1,342,BA116,BA120,10.0,10.0,...,30448,2017-06-16 16:10:00,2017-06-16 15:27:08,00:42:52,00:18:01,00:18:10,-25.39634,-49.20217,-25.393374,-49.240917
4,844324.0,4700,True,9,1,342,BA120,BA120,11.0,10.0,...,30448,2017-06-16 16:31:00,2017-06-16 16:05:54,00:25:06,00:57:44,00:17:13,-25.39634,-49.20217,-25.393374,-49.240917


In [222]:
otp_legs_buste_data.dtypes

cardNum                                      float64
user_trip_id                                  object
first_vehicle_boarding                        object
itinerary_id                                  object
leg_id                                        object
route                                         object
busCode                                       object
o_busCode                                     object
tripNum                                      float64
o_tripNum                                    float64
from_stop_id                                  object
otp_start_time                        datetime64[ns]
matched_start_time                    datetime64[ns]
o_boarding_datetime                   datetime64[ns]
otp_buste_start_timediff             timedelta64[ns]
to_stop_id                                    object
otp_end_time                          datetime64[ns]
matched_end_time                      datetime64[ns]
otp_buste_end_timediff               timedelta

In [223]:
otp_legs_buste_data.head()

Unnamed: 0,cardNum,user_trip_id,first_vehicle_boarding,itinerary_id,leg_id,route,busCode,o_busCode,tripNum,o_tripNum,...,to_stop_id,otp_end_time,matched_end_time,otp_buste_end_timediff,boarding_otp_match_start_timediff,leg_duration,from_stop_lat,from_stop_lon,to_stop_lat,to_stop_lon
0,844324.0,4700,True,1,1,342,BA120,BA120,10.0,10.0,...,30448,2017-06-16 15:10:00,2017-06-16 15:09:47,00:00:13,00:00:10,00:19:00,-25.39634,-49.20217,-25.393374,-49.240917
1,844324.0,4700,True,3,1,342,BA120,BA120,10.0,10.0,...,30448,2017-06-16 15:30:00,2017-06-16 15:09:47,00:20:13,00:00:10,00:19:00,-25.39634,-49.20217,-25.393374,-49.240917
2,844324.0,4700,True,6,1,342,BA120,BA120,10.0,10.0,...,30448,2017-06-16 15:50:00,2017-06-16 15:09:47,00:40:13,00:00:10,00:19:00,-25.39634,-49.20217,-25.393374,-49.240917
3,844324.0,4700,True,8,1,342,BA116,BA120,10.0,10.0,...,30448,2017-06-16 16:10:00,2017-06-16 15:27:08,00:42:52,00:18:01,00:18:10,-25.39634,-49.20217,-25.393374,-49.240917
4,844324.0,4700,True,9,1,342,BA120,BA120,11.0,10.0,...,30448,2017-06-16 16:31:00,2017-06-16 16:05:54,00:25:06,00:57:44,00:17:13,-25.39634,-49.20217,-25.393374,-49.240917


In [224]:
od_trips = chosen_itineraries.merge(otp_legs_buste_data, on=['cardNum','user_trip_id','itinerary_id'], how='inner') \
        [['cardNum','user_trip_id','itinerary_id','leg_id','route','busCode','tripNum','from_stop_id','matched_start_time','from_stop_lat','from_stop_lon','to_stop_id','matched_end_time','to_stop_lat','to_stop_lon','leg_duration']] \
        .rename(index=str, columns={'matched_start_time':'start_time','matched_end_time':'end_time'})

In [230]:
od_trips.head()

Unnamed: 0,cardNum,user_trip_id,itinerary_id,leg_id,route,busCode,tripNum,from_stop_id,start_time,from_stop_lat,from_stop_lon,to_stop_id,end_time,to_stop_lat,to_stop_lon,leg_duration
0,844324.0,4700,1,1,342,BA120,10.0,34915,2017-06-16 14:50:47,-25.39634,-49.20217,30448,2017-06-16 15:09:47,-25.393374,-49.240917,00:19:00
1,948116.0,6671,2,1,684,HA610,1.0,39372,2017-06-16 04:48:26,-25.59345,-49.33388,31053,2017-06-16 05:12:47,-25.513098,-49.295303,00:24:21
2,948116.0,6671,2,3,204,HL309,1.0,26252,2017-06-16 05:29:56,-25.512983,-49.294505,25917,2017-06-16 05:57:00,-25.426995,-49.264729,00:27:04
3,948116.0,6671,2,5,361,BC906,1.0,29889,2017-06-16 06:21:06,-25.42691,-49.26308,28979,2017-06-16 06:27:10,-25.417287,-49.249938,00:06:04
4,948116.0,6672,5,1,216,BA603,7.0,28979,2017-06-16 15:25:37,-25.417287,-49.249938,28332,2017-06-16 16:00:52,-25.476127,-49.292362,00:35:15


In [231]:
od_trips.head(10).filter(['cardNum','user_trip_id','start_time','end_time','leg_duration'])

Unnamed: 0,cardNum,user_trip_id,start_time,end_time,leg_duration
0,844324.0,4700,2017-06-16 14:50:47,2017-06-16 15:09:47,00:19:00
1,948116.0,6671,2017-06-16 04:48:26,2017-06-16 05:12:47,00:24:21
2,948116.0,6671,2017-06-16 05:29:56,2017-06-16 05:57:00,00:27:04
3,948116.0,6671,2017-06-16 06:21:06,2017-06-16 06:27:10,00:06:04
4,948116.0,6672,2017-06-16 15:25:37,2017-06-16 16:00:52,00:35:15
5,948116.0,6672,2017-06-16 16:08:15,2017-06-16 16:19:48,00:11:33
6,948116.0,6672,2017-06-16 16:22:04,2017-06-16 16:52:08,00:30:04
7,983885.0,7397,2017-06-16 15:09:52,2017-06-16 15:14:31,00:04:39
8,983885.0,7397,2017-06-16 15:22:47,2017-06-16 16:02:01,00:39:14
9,1150536.0,10498,2017-06-16 14:54:12,2017-06-16 14:57:42,00:03:30


In [233]:
od_trips.tail(10).filter(['cardNum','user_trip_id','start_time','end_time','leg_duration'])

Unnamed: 0,cardNum,user_trip_id,start_time,end_time,leg_duration
164,3825303.0,230801,2017-06-16 04:43:36,2017-06-16 05:12:47,00:29:11
165,3825303.0,230801,2017-06-16 05:29:56,2017-06-16 05:59:12,00:29:16
166,3825303.0,230801,2017-06-16 06:03:06,2017-06-16 06:08:36,00:05:30
167,3825303.0,230802,2017-06-16 13:35:25,2017-06-16 13:49:33,00:14:08
168,3825303.0,230802,2017-06-16 14:01:02,2017-06-16 14:20:59,00:19:57
169,3825303.0,230802,2017-06-16 14:28:42,2017-06-16 15:00:03,00:31:21
170,3837449.0,236447,2017-06-16 04:43:38,2017-06-16 05:49:30,01:05:52
171,3837449.0,236447,2017-06-16 05:49:35,2017-06-16 06:01:18,00:11:43
172,3837449.0,236448,2017-06-16 18:18:38,2017-06-16 18:30:34,00:11:56
173,3837449.0,236448,2017-06-16 18:36:27,2017-06-16 19:29:26,00:52:59


In [226]:
len(od_trips)

174

In [234]:
np.arccos(90)

  """Entry point for launching an IPython kernel.


nan