In [29]:
#Libraries

#Python Libs
import sys
import os
import glob
import traceback
from datetime import datetime
import time
#from geopy import distance


#Data Analysis Libs
import pandas as pd
import numpy as np

In [30]:
#Functions
def select_input_files(enh_buste_base_path,init_date,fin_date,suffix):
    """Return the dated CSV files of a folder whose date lies in a range.

    A file is a candidate when its base name matches
    ``%Y_%m_%d<suffix>.csv``; every other entry of the folder is ignored.

    Parameters
    ----------
    enh_buste_base_path : str
        Folder to scan (not recursive).
    init_date, fin_date : datetime-like or str
        Inclusive lower and upper bounds for the date encoded in the
        file name. Both are passed through ``pd.to_datetime``.
    suffix : str
        Text expected between the date and the ``.csv`` extension.

    Returns
    -------
    list of (str, pandas.Timestamp)
        ``(file_path, file_date)`` tuples, sorted by path and then date.
    """
    name_format = '%Y_%m_%d' + suffix + '.csv'
    # Coerce the bounds once so that a wrongly typed bound fails loudly here
    # instead of silently producing an empty selection.
    first_day = pd.to_datetime(init_date)
    last_day = pd.to_datetime(fin_date)

    selected_files = []
    for file_path in glob.glob(os.path.join(enh_buste_base_path, "*")):
        try:
            # basename() copes with whatever separator glob returns on this OS.
            file_date = pd.to_datetime(os.path.basename(file_path), format=name_format)
        except (ValueError, TypeError):
            # The name does not follow the dated pattern: not an input file.
            continue

        if first_day <= file_date <= last_day:
            selected_files.append((file_path, file_date))

    return sorted(selected_files)

def get_gtfs_path(query_date):
    """Return the name of the GTFS feed folder to use for a given date.

    Parameters
    ----------
    query_date : datetime-like or str
        Date being processed. It is passed through ``pd.to_datetime``.

    Returns
    -------
    str
        ``'campina-gtfs-2019'`` for dates up to and including 2019-02-02,
        ``'campina-gtfs-2017'`` for later dates.
    """
    # NOTE(review): earlier dates map to the 2019 feed and later dates to the
    # 2017 feed, which looks inverted. Behavior is kept as-is; confirm intent.
    INTERMEDIATE_OTP_DATE = pd.to_datetime("2019-02-02", format="%Y-%m-%d")

    if pd.to_datetime(query_date) <= INTERMEDIATE_OTP_DATE:
        return 'campina-gtfs-2019'
    return 'campina-gtfs-2017'

## Main

In [31]:
# OTP itinerary suggestions to process. The text before '_bus_trips_' in the
# file name is parsed further down as the file date (format %Y_%m_%d).
otp_suggestions_filepath = 'data/output/2019_02_02_bus_trips_otp_itineraries.csv'
# Folder that holds the observed '<date>_bus_trips.csv' files.
bus_trips_folderpath = 'data/input'
# Folder that holds the GTFS feed folders; 'stops.txt' is read from one of them.
gtfs_base_folderpath = 'data/input'
# Output folder (not referenced by the cells visible in this part of the notebook).
output_folderpath = 'data/output'

In [32]:
#gtfs_base_folderpath + os.sep + get_gtfs_path(file_date) + os.sep

In [33]:
# The file name starts with the processing date: <YYYY_MM_DD>_bus_trips_...
file_date_str = os.path.basename(otp_suggestions_filepath).split('_bus_trips_')[0]
file_date = pd.to_datetime(file_date_str,format='%Y_%m_%d')
# A single pre-formatted argument prints the same text on Python 2 and 3
# (the bare print statement is a syntax error on Python 3).
print("Processing File: %s" % otp_suggestions_filepath)

Processing File: data/output/2019_02_02_bus_trips_otp_itineraries.csv


In [34]:
    # Extracting itinerary part name for later use
    # (the 6th '_'-separated token of the file name)
    itinerary_part_name = otp_suggestions_filepath.split('/')[-1].split('_')[5]
    # Read OTP Suggestions
    otp_suggestions_raw = pd.read_csv(otp_suggestions_filepath, parse_dates=['date','otp_start_time','otp_end_time'])

    if len(otp_suggestions_raw) == 0:
        # Parenthesised single-argument prints behave the same on Python 2 and 3.
        print("Zero OTP suggestions found.")
        print("Skipping next steps...")
        # sys.exit raises SystemExit: a script still ends with status 0, while
        # in a notebook the cell stops without shutting the kernel down
        # (the interactive-only `exit(0)` helper terminates the kernel).
        sys.exit(0)


In [35]:
def prepare_otp_data(otp_data):
    """Normalise raw OTP itinerary legs for matching against bus trips.

    Steps:
      1. give every column exactly one ``otp_`` prefix;
      2. shift ``otp_start_time`` / ``otp_end_time`` back by 3 hours
         (10800 s) to compensate for the timezone difference;
      3. turn ``otp_route`` into text and, for ``BUS`` legs only, strip a
         trailing ``.0`` and left-pad with zeros to 3 characters.

    Parameters
    ----------
    otp_data : pandas.DataFrame
        Raw legs. Must provide (with or without the ``otp_`` prefix) the
        columns ``start_time``, ``end_time``, ``route`` and ``mode``.
        The frame passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the normalised columns.
    """
    prefix = 'otp_'

    def with_single_prefix(column_name):
        # Strip only a *leading* prefix so names merely containing 'otp_'
        # elsewhere are left intact, then add the prefix back exactly once.
        column_name = str(column_name)
        if column_name.startswith(prefix):
            column_name = column_name[len(prefix):]
        return prefix + column_name

    # rename() returns a new frame, so the caller's columns stay untouched.
    prepared = otp_data.rename(columns=with_single_prefix)

    #Fixing Timezone difference - when needed
    tz_offset = pd.Timedelta('10800 s')
    prepared['otp_start_time'] = prepared['otp_start_time'] - tz_offset
    prepared['otp_end_time'] = prepared['otp_end_time'] - tz_offset

    #Adjusting route format to have 3 numbers
    route_text = prepared['otp_route'].astype(str)
    # Anchored raw-string pattern with an explicit regex=True: it removes only
    # a trailing '.0' and does not depend on the pandas default for `regex`.
    bus_route_text = route_text.str.replace(r'\.0$', '', regex=True).str.zfill(3)
    prepared['otp_route'] = route_text.mask(prepared['otp_mode'] == 'BUS', bus_route_text)

    return prepared

In [36]:
    # Normalise the raw OTP legs (prefix, timezone shift, route format)
    otp_suggestions = prepare_otp_data(otp_suggestions_raw)

    # Load the stops table from the GTFS feed selected for this file's date
    stops_filepath = os.sep.join([gtfs_base_folderpath, get_gtfs_path(file_date), 'stops.txt'])
    stops_df = pd.read_csv(stops_filepath)

    # Attach the parent station of the origin and of the destination stop to
    # every leg (left joins: legs without a stop keep missing values) TODO
    stops_parent_stations = stops_df[['stop_id','parent_station']]
    otp_suggestions = (
        otp_suggestions
        .merge(stops_parent_stations.add_prefix('from_'),
               how='left',
               left_on='otp_from_stop_id',
               right_on='from_stop_id')
        .merge(stops_parent_stations.add_prefix('to_'),
               how='left',
               left_on='otp_to_stop_id',
               right_on='to_stop_id')
        .drop(['from_stop_id','to_stop_id'], axis=1)
        .rename(index=str,
                columns={'from_parent_station':'otp_from_parent_station',
                         'to_parent_station':'otp_to_parent_station'})
    )

    # Split the legs by transport mode
    otp_suggestions_bus_legs = otp_suggestions[otp_suggestions['otp_mode'].eq('BUS')]
    otp_suggestions_walk_legs = otp_suggestions[otp_suggestions['otp_mode'].eq('WALK')]

In [37]:
#otp_suggestions.drop_duplicates(subset=['otp_leg_id','otp_end_time','otp_mode','otp_duration_mins'],inplace=True)
#otp_suggestions.duplicated(subset=['otp_leg_id','otp_end_time','otp_mode','otp_duration_mins'])
otp_suggestions.tail(10)

Unnamed: 0,otp_date,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_start_time,otp_end_time,otp_mode,otp_route,otp_from_stop_id,otp_to_stop_id,otp_duration_mins,otp_from_parent_station,otp_to_parent_station
30075,2019-02-02,497897.0,66,3,2019-02-02 15:57:05,2019-02-02 15:59:58,WALK,,,,2.883333,,
30076,2019-02-02,497897.0,67,1,2019-02-02 16:00:17,2019-02-02 16:01:01,WALK,,,,0.733333,,
30077,2019-02-02,497897.0,67,2,2019-02-02 16:01:02,2019-02-02 16:03:03,BUS,909.0,497897.0,385901.0,2.016667,,
30078,2019-02-02,497897.0,67,3,2019-02-02 16:03:04,2019-02-02 16:05:57,WALK,,,,2.883333,,
30079,2019-02-02,497897.0,68,1,2019-02-02 16:07:17,2019-02-02 16:08:01,WALK,,,,0.733333,,
30080,2019-02-02,497897.0,68,2,2019-02-02 16:08:02,2019-02-02 16:10:04,BUS,909.0,497897.0,385901.0,2.033333,,
30081,2019-02-02,497897.0,68,3,2019-02-02 16:10:05,2019-02-02 16:12:58,WALK,,,,2.883333,,
30082,2019-02-02,497897.0,69,1,2019-02-02 16:20:17,2019-02-02 16:21:01,WALK,,,,0.733333,,
30083,2019-02-02,497897.0,69,2,2019-02-02 16:21:02,2019-02-02 16:23:04,BUS,909.0,497897.0,385901.0,2.033333,,
30084,2019-02-02,497897.0,69,3,2019-02-02 16:23:05,2019-02-02 16:25:58,WALK,,,,2.883333,,


## Read and Prepare Bus Trip Data

In [38]:
bus_trips_folderpath + os.sep + file_date_str + '_bus_trips.csv'

'data/input/2019_02_02_bus_trips.csv'

In [39]:
# Observed bus trips for the same date as the OTP suggestions file.
bus_trips_filepath = bus_trips_folderpath + os.sep + file_date_str + '_bus_trips.csv'
bus_trips = (
    pd.read_csv(bus_trips_filepath, dtype={'route': object}, parse_dates=['gps_datetime'])
    .sort_values(['route','busCode','tripNum','gps_datetime'])
    # Route codes are normalised to 3 characters. The pattern is an anchored
    # raw string with regex=True so only a trailing '.0' is removed and the
    # result does not depend on the pandas default for `regex`.
    .assign(route=lambda trips: trips['route'].astype(str)
                                               .str.replace(r"\.0$", '', regex=True)
                                               .str.zfill(3))
    .drop_duplicates()
)

In [42]:
bus_trips.head()

Unnamed: 0,route,tripNum,shapeId,routeFrequency,shapeSequence,shapeLat,shapeLon,distanceTraveledShape,busCode,gpsPointId,gpsLat,gpsLon,distanceToShapePoint,gps_datetime,stopPointId,streetName,problem
8438,3,-,140170,low_frequency,1,-7.21417,-35.85934,0.0,-,-,-,-,-,-,491437,-,BETWEEN
8439,3,-,140170,low_frequency,6,-7.21478,-35.86141,262.0,-,-,-,-,-,-,385711,-,BETWEEN
8447,3,-,140170,low_frequency,8,-7.21538,-35.86281,430.0,-,-,-,-,-,-,385713,-,BETWEEN
8440,3,1,140170,low_frequency,142,-7.21568,-35.8581,9232.0,-,-,-,-,-,-,497970,-,BETWEEN
8441,3,1,140170,low_frequency,148,-7.21439,-35.85596,9509.0,-,-,-,-,-,-,385963,-,BETWEEN


In [160]:
#bus_trips[(bus_trips.route == "944") & (bus_trips.stopPointId == 491551.0)].head()
#bus_trips['gps_datetime'] = pd.to_datetime(bus_trips['gps_datetime'], format='%Y_%m_%d %H:%M:%S')

In [161]:
#trezentos = bus_trips[(bus_trips.route == "333")]
#output = trezentos.to_csv("data/output/trezentos.csv",index=False)


In [43]:
# Keep only the columns used to match OTP legs against observed trips.
# DataFrame.filter ignores any listed label that is missing from bus_trips.
bus_trips_clean = bus_trips.filter(['route','busCode','tripNum','stopPointId','gps_datetime'])

In [45]:
bus_trips_clean.head()

Unnamed: 0,route,busCode,tripNum,stopPointId,gps_datetime
8438,3,-,-,491437,-
8439,3,-,-,385711,-
8447,3,-,-,385713,-
8440,3,-,1,497970,-
8441,3,-,1,385963,-


In [164]:
#bus_trips_clean[bus_trips_clean.route == "944"].head()

In [46]:
bus_trips_clean.dtypes

route           object
busCode         object
tripNum         object
stopPointId      int64
gps_datetime    object
dtype: object

In [47]:
otp_suggestions_bus_legs.dtypes

otp_date                   datetime64[ns]
otp_user_trip_id                  float64
otp_itinerary_id                    int64
otp_leg_id                          int64
otp_start_time             datetime64[ns]
otp_end_time               datetime64[ns]
otp_mode                           object
otp_route                          object
otp_from_stop_id                  float64
otp_to_stop_id                    float64
otp_duration_mins                 float64
otp_from_parent_station           float64
otp_to_parent_station             float64
dtype: object

## Identify Possible Matches between OTP Itineraries and Bus Trips Observed Data

In [48]:
# Rows carrying '-' in gps_datetime have no GPS observation and cannot be
# matched in time, so they are removed. A boolean filter followed by copy()
# replaces the in-place, index-label-based drop: it gives an independent frame
# (no chained-assignment warnings later) and the cell can be re-run safely.
bus_trips_clean = bus_trips_clean[bus_trips_clean['gps_datetime'] != '-'].copy()
bus_trips_clean.head()

Unnamed: 0,route,busCode,tripNum,stopPointId,gps_datetime
2262,3,1062,1,385713,02-02-2019 06:03:45
2263,3,1062,1,385707,02-02-2019 06:05:17
2264,3,1062,1,386062,02-02-2019 06:05:53
2265,3,1062,1,386063,02-02-2019 06:06:27
2266,3,1062,1,497826,02-02-2019 06:11:02


In [49]:
# Parse the day-first timestamps (e.g. '02-02-2019 06:03:45'). assign() builds
# a new frame instead of writing a column into a frame derived from an earlier
# selection, which avoids SettingWithCopyWarning and keeps the cell idempotent.
bus_trips_clean = bus_trips_clean.assign(
    gps_datetime=lambda trips: pd.to_datetime(trips['gps_datetime'], format='%d-%m-%Y %H:%M:%S'))
bus_trips_clean.head()

Unnamed: 0,route,busCode,tripNum,stopPointId,gps_datetime
2262,3,1062,1,385713,2019-02-02 06:03:45
2263,3,1062,1,385707,2019-02-02 06:05:17
2264,3,1062,1,386062,2019-02-02 06:05:53
2265,3,1062,1,386063,2019-02-02 06:06:27
2266,3,1062,1,497826,2019-02-02 06:11:02


In [50]:
# Pair every suggested bus leg with each observed pass of the same route at
# the leg's origin stop, and measure how far the observed time is from the
# scheduled start (absolute difference).
scheduled_itin_observed_o = (
    otp_suggestions_bus_legs
    .merge(bus_trips_clean.add_prefix('bt_'),
           how='inner',
           left_on=['otp_route','otp_from_stop_id'],
           right_on=['bt_route','bt_stopPointId'])
    .drop(['bt_route','bt_stopPointId'], axis=1)
    .rename(index=str,
            columns={'bt_gps_datetime':'bt_start_time',
                     'bt_tripNum':'bt_trip_num',
                     'bt_busCode':'bt_bus_code'})
    .assign(sched_obs_start_timediff=lambda legs:
            (pd.to_datetime(legs['bt_start_time']) - legs['otp_start_time']).abs())
)

In [170]:
#bus_trips_clean[(bus_trips_clean.route == "944") & (bus_trips_clean.stopPointId == 491551.0)].sort_values(by=['gps_datetime']).head()

In [51]:
scheduled_itin_observed_o.head()

Unnamed: 0,otp_date,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_start_time,otp_end_time,otp_mode,otp_route,otp_from_stop_id,otp_to_stop_id,otp_duration_mins,otp_from_parent_station,otp_to_parent_station,bt_bus_code,bt_trip_num,bt_start_time,sched_obs_start_timediff
0,2019-02-02,385703.0,2,2,2019-02-02 05:58:29,2019-02-02 05:59:52,BUS,090B,385703.0,386613.0,1.383333,,,4002,1,2019-02-02 06:58:20,00:59:51
1,2019-02-02,385703.0,2,2,2019-02-02 05:58:29,2019-02-02 05:59:52,BUS,090B,385703.0,386613.0,1.383333,,,4002,2,2019-02-02 08:29:02,02:30:33
2,2019-02-02,385703.0,3,2,2019-02-02 06:16:29,2019-02-02 06:17:52,BUS,090B,385703.0,386613.0,1.383333,,,4002,1,2019-02-02 06:58:20,00:41:51
3,2019-02-02,385703.0,3,2,2019-02-02 06:16:29,2019-02-02 06:17:52,BUS,090B,385703.0,386613.0,1.383333,,,4002,2,2019-02-02 08:29:02,02:12:33
4,2019-02-02,385703.0,4,2,2019-02-02 06:33:29,2019-02-02 06:34:52,BUS,090B,385703.0,386613.0,1.383333,,,4002,1,2019-02-02 06:58:20,00:24:51


In [172]:
#scheduled_itin_observed_o[(scheduled_itin_observed_o.otp_route == "944") & (scheduled_itin_observed_o.otp_start_time == '2019-05-13 14:05:47')][['otp_itinerary_id',"otp_start_time","otp_route","otp_from_stop_id",
                                                                           #"otp_from_stop_id", "bt_start_time", "sched_obs_start_timediff"]].sort_values(by=['sched_obs_start_timediff']).head()
#scheduled_itin_observed_o[(scheduled_itin_observed_o.otp_itinerary_id == 112)]

In [52]:
# Bus legs whose scheduled/observed start difference is at least zero.
# An upper bound was tried and is currently disabled:
#   & (scheduled_itin_observed_o.sched_obs_start_timediff < pd.Timedelta('1.5h'))
scheduled_itin_observed_o_1 = scheduled_itin_observed_o[
    (scheduled_itin_observed_o['otp_mode'] == "BUS")
    & (scheduled_itin_observed_o['sched_obs_start_timediff'] >= pd.Timedelta('0s'))
]
scheduled_itin_observed_o_1

Unnamed: 0,otp_date,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_start_time,otp_end_time,otp_mode,otp_route,otp_from_stop_id,otp_to_stop_id,otp_duration_mins,otp_from_parent_station,otp_to_parent_station,bt_bus_code,bt_trip_num,bt_start_time,sched_obs_start_timediff
0,2019-02-02,385703.0,2,2,2019-02-02 05:58:29,2019-02-02 05:59:52,BUS,090B,385703.0,386613.0,1.383333,,,4002,1,2019-02-02 06:58:20,00:59:51
1,2019-02-02,385703.0,2,2,2019-02-02 05:58:29,2019-02-02 05:59:52,BUS,090B,385703.0,386613.0,1.383333,,,4002,2,2019-02-02 08:29:02,02:30:33
2,2019-02-02,385703.0,3,2,2019-02-02 06:16:29,2019-02-02 06:17:52,BUS,090B,385703.0,386613.0,1.383333,,,4002,1,2019-02-02 06:58:20,00:41:51
3,2019-02-02,385703.0,3,2,2019-02-02 06:16:29,2019-02-02 06:17:52,BUS,090B,385703.0,386613.0,1.383333,,,4002,2,2019-02-02 08:29:02,02:12:33
4,2019-02-02,385703.0,4,2,2019-02-02 06:33:29,2019-02-02 06:34:52,BUS,090B,385703.0,386613.0,1.383333,,,4002,1,2019-02-02 06:58:20,00:24:51
5,2019-02-02,385703.0,4,2,2019-02-02 06:33:29,2019-02-02 06:34:52,BUS,090B,385703.0,386613.0,1.383333,,,4002,2,2019-02-02 08:29:02,01:55:33
6,2019-02-02,385703.0,5,2,2019-02-02 06:49:29,2019-02-02 06:50:52,BUS,090B,385703.0,386613.0,1.383333,,,4002,1,2019-02-02 06:58:20,00:08:51
7,2019-02-02,385703.0,5,2,2019-02-02 06:49:29,2019-02-02 06:50:52,BUS,090B,385703.0,386613.0,1.383333,,,4002,2,2019-02-02 08:29:02,01:39:33
8,2019-02-02,385703.0,6,2,2019-02-02 07:05:29,2019-02-02 07:06:52,BUS,090B,385703.0,386613.0,1.383333,,,4002,1,2019-02-02 06:58:20,00:07:09
9,2019-02-02,385703.0,6,2,2019-02-02 07:05:29,2019-02-02 07:06:52,BUS,090B,385703.0,386613.0,1.383333,,,4002,2,2019-02-02 08:29:02,01:23:33


In [53]:
# For each origin match, look up the same vehicle and trip at the leg's
# destination stop, measure the absolute difference to the scheduled end time,
# and order the candidates of every leg from closest to farthest.
# NOTE(review): bt_busCode / bt_tripNum stay in the result although they repeat
# bt_bus_code / bt_trip_num (they are the right-hand join keys).
scheduled_itin_observed_od = (
    scheduled_itin_observed_o
    .merge(bus_trips_clean.add_prefix('bt_'),
           how='inner',
           left_on=['otp_route','bt_bus_code','bt_trip_num','otp_to_stop_id'],
           right_on=['bt_route','bt_busCode','bt_tripNum','bt_stopPointId'])
    .drop(['bt_route','bt_stopPointId'], axis=1)
    .rename(index=str, columns={'bt_gps_datetime':'bt_end_time'})
    .assign(sched_obs_end_timediff=lambda legs:
            (legs['bt_end_time'] - legs['otp_end_time']).abs())
    .sort_values(['otp_user_trip_id','otp_itinerary_id','otp_leg_id',
                  'sched_obs_start_timediff','sched_obs_end_timediff'])
)

In [54]:
scheduled_itin_observed_od.head()

Unnamed: 0,otp_date,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_start_time,otp_end_time,otp_mode,otp_route,otp_from_stop_id,otp_to_stop_id,...,otp_from_parent_station,otp_to_parent_station,bt_bus_code,bt_trip_num,bt_start_time,sched_obs_start_timediff,bt_busCode,bt_tripNum,bt_end_time,sched_obs_end_timediff
0,2019-02-02,385703.0,2,2,2019-02-02 05:58:29,2019-02-02 05:59:52,BUS,090B,385703.0,386613.0,...,,,4002,1,2019-02-02 06:58:20,00:59:51,4002,1,2019-02-02 07:00:01,01:00:09
1116,2019-02-02,385703.0,2,2,2019-02-02 05:58:29,2019-02-02 05:59:52,BUS,090B,385703.0,386613.0,...,,,4002,2,2019-02-02 08:29:02,02:30:33,4002,2,2019-02-02 08:30:16,02:30:24
1,2019-02-02,385703.0,3,2,2019-02-02 06:16:29,2019-02-02 06:17:52,BUS,090B,385703.0,386613.0,...,,,4002,1,2019-02-02 06:58:20,00:41:51,4002,1,2019-02-02 07:00:01,00:42:09
1117,2019-02-02,385703.0,3,2,2019-02-02 06:16:29,2019-02-02 06:17:52,BUS,090B,385703.0,386613.0,...,,,4002,2,2019-02-02 08:29:02,02:12:33,4002,2,2019-02-02 08:30:16,02:12:24
2,2019-02-02,385703.0,4,2,2019-02-02 06:33:29,2019-02-02 06:34:52,BUS,090B,385703.0,386613.0,...,,,4002,1,2019-02-02 06:58:20,00:24:51,4002,1,2019-02-02 07:00:01,00:25:09


In [55]:
# Observed leg duration in minutes; candidates where the bus reaches the
# destination stop at or before the boarding time are discarded.
# Written as one chain (no in-place column assignment) so re-running the cell
# on the already filtered frame does not raise SettingWithCopyWarning.
scheduled_itin_observed_od = (
    scheduled_itin_observed_od
    .assign(bt_duration_mins=lambda legs:
            (legs['bt_end_time'] - legs['bt_start_time']) / pd.Timedelta(minutes=1))
    .loc[lambda legs: legs['bt_duration_mins'] > 0]
)

In [56]:
scheduled_itin_observed_od

Unnamed: 0,otp_date,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_start_time,otp_end_time,otp_mode,otp_route,otp_from_stop_id,otp_to_stop_id,...,otp_to_parent_station,bt_bus_code,bt_trip_num,bt_start_time,sched_obs_start_timediff,bt_busCode,bt_tripNum,bt_end_time,sched_obs_end_timediff,bt_duration_mins
0,2019-02-02,385703.0,2,2,2019-02-02 05:58:29,2019-02-02 05:59:52,BUS,090B,385703.0,386613.0,...,,4002,1,2019-02-02 06:58:20,00:59:51,4002,1,2019-02-02 07:00:01,01:00:09,1.683333
1116,2019-02-02,385703.0,2,2,2019-02-02 05:58:29,2019-02-02 05:59:52,BUS,090B,385703.0,386613.0,...,,4002,2,2019-02-02 08:29:02,02:30:33,4002,2,2019-02-02 08:30:16,02:30:24,1.233333
1,2019-02-02,385703.0,3,2,2019-02-02 06:16:29,2019-02-02 06:17:52,BUS,090B,385703.0,386613.0,...,,4002,1,2019-02-02 06:58:20,00:41:51,4002,1,2019-02-02 07:00:01,00:42:09,1.683333
1117,2019-02-02,385703.0,3,2,2019-02-02 06:16:29,2019-02-02 06:17:52,BUS,090B,385703.0,386613.0,...,,4002,2,2019-02-02 08:29:02,02:12:33,4002,2,2019-02-02 08:30:16,02:12:24,1.233333
2,2019-02-02,385703.0,4,2,2019-02-02 06:33:29,2019-02-02 06:34:52,BUS,090B,385703.0,386613.0,...,,4002,1,2019-02-02 06:58:20,00:24:51,4002,1,2019-02-02 07:00:01,00:25:09,1.683333
1118,2019-02-02,385703.0,4,2,2019-02-02 06:33:29,2019-02-02 06:34:52,BUS,090B,385703.0,386613.0,...,,4002,2,2019-02-02 08:29:02,01:55:33,4002,2,2019-02-02 08:30:16,01:55:24,1.233333
3,2019-02-02,385703.0,5,2,2019-02-02 06:49:29,2019-02-02 06:50:52,BUS,090B,385703.0,386613.0,...,,4002,1,2019-02-02 06:58:20,00:08:51,4002,1,2019-02-02 07:00:01,00:09:09,1.683333
1119,2019-02-02,385703.0,5,2,2019-02-02 06:49:29,2019-02-02 06:50:52,BUS,090B,385703.0,386613.0,...,,4002,2,2019-02-02 08:29:02,01:39:33,4002,2,2019-02-02 08:30:16,01:39:24,1.233333
4,2019-02-02,385703.0,6,2,2019-02-02 07:05:29,2019-02-02 07:06:52,BUS,090B,385703.0,386613.0,...,,4002,1,2019-02-02 06:58:20,00:07:09,4002,1,2019-02-02 07:00:01,00:06:51,1.683333
1120,2019-02-02,385703.0,6,2,2019-02-02 07:05:29,2019-02-02 07:06:52,BUS,090B,385703.0,386613.0,...,,4002,2,2019-02-02 08:29:02,01:23:33,4002,2,2019-02-02 08:30:16,01:23:24,1.233333


In [57]:
# Stack the matched bus legs with the walk legs so all legs of an itinerary
# live in one frame. Walk legs do not have the bt_* / sched_obs_* columns, so
# those values are missing for them after the concat.
scheduled_itin_observed_od_full = pd.concat([scheduled_itin_observed_od,otp_suggestions_walk_legs], sort=False)
scheduled_itin_observed_od_full

Unnamed: 0,otp_date,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_start_time,otp_end_time,otp_mode,otp_route,otp_from_stop_id,otp_to_stop_id,...,otp_to_parent_station,bt_bus_code,bt_trip_num,bt_start_time,sched_obs_start_timediff,bt_busCode,bt_tripNum,bt_end_time,sched_obs_end_timediff,bt_duration_mins
0,2019-02-02,385703.0,2,2,2019-02-02 05:58:29,2019-02-02 05:59:52,BUS,090B,385703.0,386613.0,...,,4002,1,2019-02-02 06:58:20,00:59:51,4002,1,2019-02-02 07:00:01,01:00:09,1.683333
1116,2019-02-02,385703.0,2,2,2019-02-02 05:58:29,2019-02-02 05:59:52,BUS,090B,385703.0,386613.0,...,,4002,2,2019-02-02 08:29:02,02:30:33,4002,2,2019-02-02 08:30:16,02:30:24,1.233333
1,2019-02-02,385703.0,3,2,2019-02-02 06:16:29,2019-02-02 06:17:52,BUS,090B,385703.0,386613.0,...,,4002,1,2019-02-02 06:58:20,00:41:51,4002,1,2019-02-02 07:00:01,00:42:09,1.683333
1117,2019-02-02,385703.0,3,2,2019-02-02 06:16:29,2019-02-02 06:17:52,BUS,090B,385703.0,386613.0,...,,4002,2,2019-02-02 08:29:02,02:12:33,4002,2,2019-02-02 08:30:16,02:12:24,1.233333
2,2019-02-02,385703.0,4,2,2019-02-02 06:33:29,2019-02-02 06:34:52,BUS,090B,385703.0,386613.0,...,,4002,1,2019-02-02 06:58:20,00:24:51,4002,1,2019-02-02 07:00:01,00:25:09,1.683333
1118,2019-02-02,385703.0,4,2,2019-02-02 06:33:29,2019-02-02 06:34:52,BUS,090B,385703.0,386613.0,...,,4002,2,2019-02-02 08:29:02,01:55:33,4002,2,2019-02-02 08:30:16,01:55:24,1.233333
3,2019-02-02,385703.0,5,2,2019-02-02 06:49:29,2019-02-02 06:50:52,BUS,090B,385703.0,386613.0,...,,4002,1,2019-02-02 06:58:20,00:08:51,4002,1,2019-02-02 07:00:01,00:09:09,1.683333
1119,2019-02-02,385703.0,5,2,2019-02-02 06:49:29,2019-02-02 06:50:52,BUS,090B,385703.0,386613.0,...,,4002,2,2019-02-02 08:29:02,01:39:33,4002,2,2019-02-02 08:30:16,01:39:24,1.233333
4,2019-02-02,385703.0,6,2,2019-02-02 07:05:29,2019-02-02 07:06:52,BUS,090B,385703.0,386613.0,...,,4002,1,2019-02-02 06:58:20,00:07:09,4002,1,2019-02-02 07:00:01,00:06:51,1.683333
1120,2019-02-02,385703.0,6,2,2019-02-02 07:05:29,2019-02-02 07:06:52,BUS,090B,385703.0,386613.0,...,,4002,2,2019-02-02 08:29:02,01:23:33,4002,2,2019-02-02 08:30:16,01:23:24,1.233333


In [58]:
otp_suggestions_walk_legs

Unnamed: 0,otp_date,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_start_time,otp_end_time,otp_mode,otp_route,otp_from_stop_id,otp_to_stop_id,otp_duration_mins,otp_from_parent_station,otp_to_parent_station
0,2019-02-02,385703.0,1,1,2019-02-02 01:31:17,2019-02-02 01:41:05,WALK,,,,9.800000,,
1,2019-02-02,385703.0,2,1,2019-02-02 05:58:24,2019-02-02 05:58:28,WALK,,,,0.066667,,
3,2019-02-02,385703.0,2,3,2019-02-02 05:59:53,2019-02-02 06:05:12,WALK,,,,5.316667,,
4,2019-02-02,385703.0,3,1,2019-02-02 06:16:24,2019-02-02 06:16:28,WALK,,,,0.066667,,
6,2019-02-02,385703.0,3,3,2019-02-02 06:17:53,2019-02-02 06:23:12,WALK,,,,5.316667,,
7,2019-02-02,385703.0,4,1,2019-02-02 06:33:24,2019-02-02 06:33:28,WALK,,,,0.066667,,
9,2019-02-02,385703.0,4,3,2019-02-02 06:34:53,2019-02-02 06:40:12,WALK,,,,5.316667,,
10,2019-02-02,385703.0,5,1,2019-02-02 06:49:24,2019-02-02 06:49:28,WALK,,,,0.066667,,
12,2019-02-02,385703.0,5,3,2019-02-02 06:50:53,2019-02-02 06:56:12,WALK,,,,5.316667,,
13,2019-02-02,385703.0,6,1,2019-02-02 07:05:24,2019-02-02 07:05:28,WALK,,,,0.066667,,


In [180]:
#scheduled_itin_observed_od_full[scheduled_itin_observed_od_full.otp_itinerary_id == 296]

Unnamed: 0,otp_date,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_start_time,otp_end_time,otp_mode,otp_route,otp_from_stop_id,otp_to_stop_id,...,otp_to_parent_station,bt_bus_code,bt_trip_num,bt_start_time,sched_obs_start_timediff,bt_busCode,bt_tripNum,bt_end_time,sched_obs_end_timediff,bt_duration_mins


In [181]:
#bus_trips_clean[(bus_trips['route'] == '944') & (bus_trips['stopPointId'] == 491551)].sort_values(['gps_datetime'])

  """Entry point for launching an IPython kernel.


Unnamed: 0,route,busCode,tripNum,stopPointId,gps_datetime
43243,944,1064,2,491551,2019-02-10 09:08:14
43281,944,1064,3,491551,2019-02-10 09:31:55
4232,944,1083,2,491551,2019-02-10 11:58:10
8953,944,1078,3,491551,2019-02-10 12:17:56
8993,944,1078,4,491551,2019-02-10 12:42:29
4353,944,1083,3,491551,2019-02-10 13:03:59
9065,944,1078,5,491551,2019-02-10 13:43:49
4528,944,1083,8,491551,2019-02-10 16:16:23
4593,944,1083,9,491551,2019-02-10 17:09:10
4656,944,1083,10,491551,2019-02-10 18:09:36


In [59]:
# Keep only the columns needed downstream, ordered by user trip, itinerary and
# leg. The previously listed 'minimun_obs_start_time' is never created in this
# notebook; DataFrame.filter silently ignored it, so it is no longer requested.
scheduled_itin_observed_od_full_clean = (
    scheduled_itin_observed_od_full
    .filter(['otp_user_trip_id','otp_itinerary_id','otp_leg_id','otp_mode','otp_route',
             'bt_bus_code','bt_trip_num','otp_from_stop_id','otp_start_time',
             'bt_start_time','sched_obs_start_timediff','otp_to_stop_id',
             'otp_end_time','bt_end_time','sched_obs_end_timediff',
             'otp_duration_mins','bt_duration_mins'])
    .sort_values(['otp_user_trip_id','otp_itinerary_id','otp_leg_id'])
)

In [60]:
scheduled_itin_observed_od_full.head() \
                            .sort_values(['otp_user_trip_id','otp_itinerary_id','otp_leg_id'])

Unnamed: 0,otp_date,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_start_time,otp_end_time,otp_mode,otp_route,otp_from_stop_id,otp_to_stop_id,...,otp_to_parent_station,bt_bus_code,bt_trip_num,bt_start_time,sched_obs_start_timediff,bt_busCode,bt_tripNum,bt_end_time,sched_obs_end_timediff,bt_duration_mins
0,2019-02-02,385703.0,2,2,2019-02-02 05:58:29,2019-02-02 05:59:52,BUS,090B,385703.0,386613.0,...,,4002,1,2019-02-02 06:58:20,00:59:51,4002,1,2019-02-02 07:00:01,01:00:09,1.683333
1116,2019-02-02,385703.0,2,2,2019-02-02 05:58:29,2019-02-02 05:59:52,BUS,090B,385703.0,386613.0,...,,4002,2,2019-02-02 08:29:02,02:30:33,4002,2,2019-02-02 08:30:16,02:30:24,1.233333
1,2019-02-02,385703.0,3,2,2019-02-02 06:16:29,2019-02-02 06:17:52,BUS,090B,385703.0,386613.0,...,,4002,1,2019-02-02 06:58:20,00:41:51,4002,1,2019-02-02 07:00:01,00:42:09,1.683333
1117,2019-02-02,385703.0,3,2,2019-02-02 06:16:29,2019-02-02 06:17:52,BUS,090B,385703.0,386613.0,...,,4002,2,2019-02-02 08:29:02,02:12:33,4002,2,2019-02-02 08:30:16,02:12:24,1.233333
2,2019-02-02,385703.0,4,2,2019-02-02 06:33:29,2019-02-02 06:34:52,BUS,090B,385703.0,386613.0,...,,4002,1,2019-02-02 06:58:20,00:24:51,4002,1,2019-02-02 07:00:01,00:25:09,1.683333


In [61]:
# coisa = scheduled_itin_observed_od_full_clean[(scheduled_itin_observed_od_full_clean.otp_route == "944") & (scheduled_itin_observed_od_full_clean.otp_start_time == '2019-05-13 15:05:47')][['otp_itinerary_id',"otp_start_time","otp_route","otp_from_stop_id",
#                                                                            "otp_from_stop_id", "bt_start_time", "sched_obs_start_timediff"]].sort_values(by=['sched_obs_start_timediff'])

# coisa

In [62]:
# NOTE(review): this cell recomputes the same column projection as an earlier
# cell of this notebook; one of the two can be removed.
# The previously listed 'minimun_obs_start_time' is never created in this
# notebook; DataFrame.filter silently ignored it, so it is no longer requested.
scheduled_itin_observed_od_full_clean = (
    scheduled_itin_observed_od_full
    .filter(['otp_user_trip_id','otp_itinerary_id','otp_leg_id','otp_mode','otp_route',
             'bt_bus_code','bt_trip_num','otp_from_stop_id','otp_start_time',
             'bt_start_time','sched_obs_start_timediff','otp_to_stop_id',
             'otp_end_time','bt_end_time','sched_obs_end_timediff',
             'otp_duration_mins','bt_duration_mins'])
    .sort_values(['otp_user_trip_id','otp_itinerary_id','otp_leg_id'])
)


In [63]:
scheduled_itin_observed_od_full_clean

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,sched_obs_start_timediff,otp_to_stop_id,otp_end_time,bt_end_time,sched_obs_end_timediff,otp_duration_mins,bt_duration_mins
0,385703.0,1,1,WALK,,,,,2019-02-02 01:31:17,NaT,NaT,,2019-02-02 01:41:05,NaT,NaT,9.800000,
1,385703.0,2,1,WALK,,,,,2019-02-02 05:58:24,NaT,NaT,,2019-02-02 05:58:28,NaT,NaT,0.066667,
0,385703.0,2,2,BUS,090B,4002,1,385703.0,2019-02-02 05:58:29,2019-02-02 06:58:20,00:59:51,386613.0,2019-02-02 05:59:52,2019-02-02 07:00:01,01:00:09,1.383333,1.683333
1116,385703.0,2,2,BUS,090B,4002,2,385703.0,2019-02-02 05:58:29,2019-02-02 08:29:02,02:30:33,386613.0,2019-02-02 05:59:52,2019-02-02 08:30:16,02:30:24,1.383333,1.233333
3,385703.0,2,3,WALK,,,,,2019-02-02 05:59:53,NaT,NaT,,2019-02-02 06:05:12,NaT,NaT,5.316667,
4,385703.0,3,1,WALK,,,,,2019-02-02 06:16:24,NaT,NaT,,2019-02-02 06:16:28,NaT,NaT,0.066667,
1,385703.0,3,2,BUS,090B,4002,1,385703.0,2019-02-02 06:16:29,2019-02-02 06:58:20,00:41:51,386613.0,2019-02-02 06:17:52,2019-02-02 07:00:01,00:42:09,1.383333,1.683333
1117,385703.0,3,2,BUS,090B,4002,2,385703.0,2019-02-02 06:16:29,2019-02-02 08:29:02,02:12:33,386613.0,2019-02-02 06:17:52,2019-02-02 08:30:16,02:12:24,1.383333,1.233333
6,385703.0,3,3,WALK,,,,,2019-02-02 06:17:53,NaT,NaT,,2019-02-02 06:23:12,NaT,NaT,5.316667,
7,385703.0,4,1,WALK,,,,,2019-02-02 06:33:24,NaT,NaT,,2019-02-02 06:33:28,NaT,NaT,0.066667,


In [64]:
# Inspection only: shows the rows of every (itinerary, leg) group ordered by
# start-time difference. The result is not assigned, so
# scheduled_itin_observed_od_full_clean itself keeps its current order.
scheduled_itin_observed_od_full_clean \
                        .groupby(['otp_itinerary_id', 'otp_leg_id']) \
                        .apply(lambda x: x.sort_values(["sched_obs_start_timediff"]))


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,sched_obs_start_timediff,otp_to_stop_id,otp_end_time,bt_end_time,sched_obs_end_timediff,otp_duration_mins,bt_duration_mins
otp_itinerary_id,otp_leg_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,1,0,385703.0,1,1,WALK,,,,,2019-02-02 01:31:17,NaT,NaT,,2019-02-02 01:41:05,NaT,NaT,9.800000,
1,1,103,385704.0,1,1,WALK,,,,,2019-02-02 01:31:22,NaT,NaT,,2019-02-02 01:38:22,NaT,NaT,7.000000,
1,1,209,385705.0,1,1,WALK,,,,,2019-02-02 01:30:54,NaT,NaT,,2019-02-02 01:45:22,NaT,NaT,14.466667,
1,1,522,385706.0,1,1,WALK,,,,,2019-02-02 01:31:28,NaT,NaT,,2019-02-02 01:38:59,NaT,NaT,7.516667,
1,1,718,385708.0,1,1,WALK,,,,,2019-02-02 01:32:02,NaT,NaT,,2019-02-02 01:38:44,NaT,NaT,6.700000,
1,1,920,385710.0,1,1,WALK,,,,,2019-02-02 01:31:24,NaT,NaT,,2019-02-02 01:37:31,NaT,NaT,6.116667,
1,1,921,385712.0,1,1,WALK,,,,,2019-02-02 01:31:26,NaT,NaT,,2019-02-02 01:37:36,NaT,NaT,6.166667,
1,1,922,385717.0,1,1,WALK,,,,,2019-02-02 01:27:31,NaT,NaT,,2019-02-02 01:36:55,NaT,NaT,9.400000,
1,1,1124,385744.0,1,1,WALK,,,,,2019-02-02 05:28:42,NaT,NaT,,2019-02-02 05:32:32,NaT,NaT,3.833333,
1,1,1953,385746.0,1,1,WALK,,,,,2019-02-02 05:31:01,NaT,NaT,,2019-02-02 05:33:26,NaT,NaT,2.416667,


In [188]:
#scheduled_itin_observed_od_full_clean[(scheduled_itin_observed_od_full_clean.otp_route == "944") & (scheduled_itin_observed_od_full_clean.otp_start_time == '2019-05-13 14:05:47') & (scheduled_itin_observed_od_full_clean.otp_itinerary_id == 230)][['otp_itinerary_id','otp_leg_id',"otp_start_time","otp_route","otp_from_stop_id",
                                                                           #"otp_from_stop_id", "bt_start_time", "sched_obs_start_timediff"]].sort_values(by=['sched_obs_start_timediff']).head()

In [65]:
# Keep one observed candidate per leg: the one closest to the schedule.
# - Itinerary ids repeat across user trips (itinerary 1 / leg 1 shows up for
#   many otp_user_trip_id values in the grouped preview above), so the user
#   trip id has to be part of the key; without it the legs of every user but
#   the first one are thrown away.
# - The candidates are ordered explicitly by time difference first, so the row
#   kept by drop_duplicates is the best match rather than whichever row the
#   earlier sort happened to leave on top. Walk legs (missing differences)
#   have a single row each and are unaffected.
scheduled_itin_observed_od_full_clean = (
    scheduled_itin_observed_od_full_clean
    .sort_values(['otp_user_trip_id','otp_itinerary_id','otp_leg_id',
                  'sched_obs_start_timediff','sched_obs_end_timediff'])
    .drop_duplicates(subset=['otp_user_trip_id','otp_itinerary_id','otp_leg_id'])
)

In [66]:
scheduled_itin_observed_od_full_clean

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,sched_obs_start_timediff,otp_to_stop_id,otp_end_time,bt_end_time,sched_obs_end_timediff,otp_duration_mins,bt_duration_mins
0,385703.0,1,1,WALK,,,,,2019-02-02 01:31:17,NaT,NaT,,2019-02-02 01:41:05,NaT,NaT,9.800000,
1,385703.0,2,1,WALK,,,,,2019-02-02 05:58:24,NaT,NaT,,2019-02-02 05:58:28,NaT,NaT,0.066667,
0,385703.0,2,2,BUS,090B,4002,1,385703.0,2019-02-02 05:58:29,2019-02-02 06:58:20,00:59:51,386613.0,2019-02-02 05:59:52,2019-02-02 07:00:01,01:00:09,1.383333,1.683333
3,385703.0,2,3,WALK,,,,,2019-02-02 05:59:53,NaT,NaT,,2019-02-02 06:05:12,NaT,NaT,5.316667,
4,385703.0,3,1,WALK,,,,,2019-02-02 06:16:24,NaT,NaT,,2019-02-02 06:16:28,NaT,NaT,0.066667,
1,385703.0,3,2,BUS,090B,4002,1,385703.0,2019-02-02 06:16:29,2019-02-02 06:58:20,00:41:51,386613.0,2019-02-02 06:17:52,2019-02-02 07:00:01,00:42:09,1.383333,1.683333
6,385703.0,3,3,WALK,,,,,2019-02-02 06:17:53,NaT,NaT,,2019-02-02 06:23:12,NaT,NaT,5.316667,
7,385703.0,4,1,WALK,,,,,2019-02-02 06:33:24,NaT,NaT,,2019-02-02 06:33:28,NaT,NaT,0.066667,
2,385703.0,4,2,BUS,090B,4002,1,385703.0,2019-02-02 06:33:29,2019-02-02 06:58:20,00:24:51,386613.0,2019-02-02 06:34:52,2019-02-02 07:00:01,00:25:09,1.383333,1.683333
9,385703.0,4,3,WALK,,,,,2019-02-02 06:34:53,NaT,NaT,,2019-02-02 06:40:12,NaT,NaT,5.316667,


In [191]:
#scheduled_itin_observed_od_full_clean[["otp_itinerary_id","otp_leg_id","otp_mode","otp_route","bt_start_time","bt_end_time"]]

In [67]:
# Count the distinct leg ids still present for each (user trip, itinerary) pair
curr_matched_itins_num_legs = (
    scheduled_itin_observed_od_full_clean
    .groupby(['otp_user_trip_id','otp_itinerary_id'])
    .agg({'otp_leg_id': lambda leg_ids: len(np.unique(leg_ids))})
    .reset_index()
    .rename(index=str, columns={'otp_leg_id':'num_legs'})
)

In [68]:
# Count the leg rows OTP suggested for each (user trip, itinerary) pair
original_suggested_itins_num_legs = (
    otp_suggestions
    .groupby(['otp_user_trip_id','otp_itinerary_id'])
    .agg({'otp_leg_id': lambda leg_ids: len(leg_ids)})
    .reset_index()
    .rename(index=str, columns={'otp_leg_id':'num_legs'})
)

In [69]:
# This is a plain alias (no copy, no filtering) of the per-itinerary leg counts computed
# from the original OTP suggestions.
# NOTE(review): curr_matched_itins_num_legs is computed above but not used here, so every
# suggested itinerary is treated as "complete" at this stage - confirm this is intended.
complete_matched_itins = original_suggested_itins_num_legs

In [70]:
# Preview the first rows of the per-itinerary leg counts.
complete_matched_itins.head()

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,num_legs
0,385703.0,1,1
1,385703.0,2,3
2,385703.0,3,3
3,385703.0,4,3
4,385703.0,5,3


In [71]:
# Inner-join the candidate legs with the itinerary list. No `on=` is given, so pandas joins
# on the columns both frames share once num_legs has been dropped from the right side.
all_complete_vehicle_legs_options = scheduled_itin_observed_od_full_clean.merge(complete_matched_itins.drop('num_legs', axis=1), how='inner')

In [72]:
# Display-only check: the de-duplicated frame is shown as the cell output but is not
# assigned, so all_complete_vehicle_legs_options itself is left unchanged.
all_complete_vehicle_legs_options.drop_duplicates(keep='first',subset=['otp_mode','otp_route','bt_bus_code','bt_trip_num', 'otp_from_stop_id','otp_start_time','bt_start_time','sched_obs_start_timediff','otp_to_stop_id','otp_end_time','bt_end_time','sched_obs_end_timediff','otp_duration_mins'])


Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,sched_obs_start_timediff,otp_to_stop_id,otp_end_time,bt_end_time,sched_obs_end_timediff,otp_duration_mins,bt_duration_mins
0,385703.0,1,1,WALK,,,,,2019-02-02 01:31:17,NaT,NaT,,2019-02-02 01:41:05,NaT,NaT,9.800000,
1,385703.0,2,1,WALK,,,,,2019-02-02 05:58:24,NaT,NaT,,2019-02-02 05:58:28,NaT,NaT,0.066667,
2,385703.0,2,2,BUS,090B,4002,1,385703.0,2019-02-02 05:58:29,2019-02-02 06:58:20,00:59:51,386613.0,2019-02-02 05:59:52,2019-02-02 07:00:01,01:00:09,1.383333,1.683333
3,385703.0,2,3,WALK,,,,,2019-02-02 05:59:53,NaT,NaT,,2019-02-02 06:05:12,NaT,NaT,5.316667,
4,385703.0,3,1,WALK,,,,,2019-02-02 06:16:24,NaT,NaT,,2019-02-02 06:16:28,NaT,NaT,0.066667,
5,385703.0,3,2,BUS,090B,4002,1,385703.0,2019-02-02 06:16:29,2019-02-02 06:58:20,00:41:51,386613.0,2019-02-02 06:17:52,2019-02-02 07:00:01,00:42:09,1.383333,1.683333
6,385703.0,3,3,WALK,,,,,2019-02-02 06:17:53,NaT,NaT,,2019-02-02 06:23:12,NaT,NaT,5.316667,
7,385703.0,4,1,WALK,,,,,2019-02-02 06:33:24,NaT,NaT,,2019-02-02 06:33:28,NaT,NaT,0.066667,
8,385703.0,4,2,BUS,090B,4002,1,385703.0,2019-02-02 06:33:29,2019-02-02 06:58:20,00:24:51,386613.0,2019-02-02 06:34:52,2019-02-02 07:00:01,00:25:09,1.383333,1.683333
9,385703.0,4,3,WALK,,,,,2019-02-02 06:34:53,NaT,NaT,,2019-02-02 06:40:12,NaT,NaT,5.316667,


In [73]:
def is_new_itinerary(prev_trip_id,curr_trip_id,prev_itin_id,curr_itin_id):
    """Tell whether the current leg starts a different itinerary than the previous leg.

    An itinerary is identified by the (user trip id, itinerary id) pair, so a change in
    either id means a new itinerary.
    """
    return ((prev_trip_id != curr_trip_id) | (prev_itin_id != curr_itin_id))


def choose_leg_matches(leg_matches_groups):
    """Pick one row per itinerary leg, walking the leg groups in iteration order.

    Parameters
    ----------
    leg_matches_groups : pandas GroupBy
        Candidate rows grouped per leg. The columns read here are otp_user_trip_id,
        otp_itinerary_id, otp_leg_id, otp_mode, otp_start_time, bt_start_time,
        bt_end_time and otp_duration_mins.

    Returns
    -------
    pandas.DataFrame
        The chosen row of every leg that survived, restricted to the columns of the
        grouped frame. Rows keep the index label they had when chosen (WALK rows are
        re-indexed from 0 first). Column dtypes are inferred from the collected rows.

    Rules
    -----
    * WALK legs always survive. On the first leg of an itinerary only bt_end_time is set
      (to midnight of the otp_start_time day); otherwise the walk starts when the previous
      leg ended and lasts otp_duration_mins rounded to whole minutes.
    * Any other leg keeps only candidates whose bt_start_time is strictly after the
      previous leg's end, and the earliest of those is chosen. With no candidate left
      the leg is skipped.
    * When leg 2 is a BUS leg right after a WALK leg of the same itinerary, that walk row
      is re-timed so that it ends exactly when the bus leg starts.
    """
    colnames = leg_matches_groups.obj.columns.values
    # Rows are collected in a list and turned into a frame once at the end; growing a
    # DataFrame row by row is quadratic and DataFrame.append no longer exists in pandas 2.
    chosen_rows = []
    prev_trip_id = -1
    prev_itin_id = -1
    prev_leg_mode = ""
    prev_leg_end_time = pd.NaT

    for _, group in leg_matches_groups:
        curr_trip_id = group['otp_user_trip_id'].iloc[0]
        curr_itin_id = group['otp_itinerary_id'].iloc[0]
        curr_leg_id = group['otp_leg_id'].iloc[0]
        curr_leg_mode = group['otp_mode'].iloc[0]

        new_itinerary = is_new_itinerary(prev_trip_id,curr_trip_id,prev_itin_id,curr_itin_id)
        if new_itinerary:
            # Start the itinerary clock at midnight of the scheduled day.
            prev_leg_end_time = group['otp_start_time'].dt.floor('D').iloc[0]
            # Forget the last leg mode of the previous itinerary: otherwise a BUS leg 2
            # whose leg 1 was skipped would re-time a walk row of another itinerary.
            prev_leg_mode = ""

        if curr_leg_mode == 'WALK':
            # Only the first row is re-timed; its label becomes 0 after the re-index.
            filtered_group = group.reset_index(drop=True)
            if new_itinerary: # first leg is a WALK leg
                filtered_group.loc[0,'bt_end_time'] = prev_leg_end_time
            else:
                walk_minutes = np.rint(filtered_group['otp_duration_mins'].iloc[0])
                filtered_group.loc[0,'bt_start_time'] = prev_leg_end_time
                filtered_group.loc[0,'bt_end_time'] = prev_leg_end_time + pd.Timedelta(minutes=walk_minutes)
        else:
            # Feasible candidates start strictly after the previous leg ended
            # (comparisons against NaT are False, so untimed candidates are dropped).
            filtered_group = group[group['bt_start_time'] > prev_leg_end_time]

        if len(filtered_group) == 0:
            # The leg did not survive; the prev_* state is intentionally left untouched.
            continue

        # Earliest feasible candidate; copy so later edits never touch the source frame.
        chosen_leg_match = filtered_group.sort_values('bt_start_time').iloc[0].copy()

        if (curr_leg_id == 2) and (curr_leg_mode == 'BUS') and (prev_leg_mode == 'WALK'):
            # Re-time the access walk so that it ends when the bus leg starts.
            access_walk = chosen_rows[-1]
            access_walk['bt_start_time'] = chosen_leg_match['bt_start_time'] - \
                pd.Timedelta(minutes=np.rint(access_walk['otp_duration_mins']))
            access_walk['bt_end_time'] = chosen_leg_match['bt_start_time']

        chosen_rows.append(chosen_leg_match)

        # Update state for the next leg
        prev_trip_id = curr_trip_id
        prev_itin_id = curr_itin_id
        prev_leg_mode = curr_leg_mode
        prev_leg_end_time = chosen_leg_match['bt_end_time']

    if not chosen_rows:
        return pd.DataFrame(columns=colnames)
    return pd.DataFrame(chosen_rows).filter(colnames)

In [74]:
# One group per (user trip, itinerary, leg); choose_leg_matches iterates these groups and
# picks one row from each.
legs_groups = all_complete_vehicle_legs_options.groupby(['otp_user_trip_id','otp_itinerary_id','otp_leg_id'])

In [75]:
# Columns of the frame behind the groupby (`.obj`); choose_leg_matches restricts its
# result to exactly these columns.
legs_groups.obj.columns.values

array(['otp_user_trip_id', 'otp_itinerary_id', 'otp_leg_id', 'otp_mode',
       'otp_route', 'bt_bus_code', 'bt_trip_num', 'otp_from_stop_id',
       'otp_start_time', 'bt_start_time', 'sched_obs_start_timediff',
       'otp_to_stop_id', 'otp_end_time', 'bt_end_time',
       'sched_obs_end_timediff', 'otp_duration_mins', 'bt_duration_mins'],
      dtype=object)

In [76]:
# Choose best actual leg matches (based on feasibility and start time)
import time

start = time.time()
#chosen_legs = choose_leg_matches(sample_itinerary_options.groupby(['otp_user_trip_id','otp_itinerary_id','otp_leg_id']))
feasible_legs = choose_leg_matches(legs_groups)
end = time.time()

print "Execution time in s:", (end-start)
feasible_legs

Execution time in s: 12.8390760422


Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,sched_obs_start_timediff,otp_to_stop_id,otp_end_time,bt_end_time,sched_obs_end_timediff,otp_duration_mins,bt_duration_mins
0,385703.0,1,1,WALK,,,,,2019-02-02 01:31:17,NaT,NaT,,2019-02-02 01:41:05,2019-02-02 00:00:00,NaT,9.800000,
0,385703.0,2,1,WALK,,,,,2019-02-02 05:58:24,2019-02-02 06:58:20,NaT,,2019-02-02 05:58:28,2019-02-02 06:58:20,NaT,0.066667,
2,385703.0,2,2,BUS,090B,4002,1,385703.0,2019-02-02 05:58:29,2019-02-02 06:58:20,0 days 00:59:51,386613.0,2019-02-02 05:59:52,2019-02-02 07:00:01,0 days 01:00:09,1.383333,1.683333
0,385703.0,2,3,WALK,,,,,2019-02-02 05:59:53,2019-02-02 07:00:01,NaT,,2019-02-02 06:05:12,2019-02-02 07:05:01,NaT,5.316667,
0,385703.0,3,1,WALK,,,,,2019-02-02 06:16:24,2019-02-02 06:58:20,NaT,,2019-02-02 06:16:28,2019-02-02 06:58:20,NaT,0.066667,
5,385703.0,3,2,BUS,090B,4002,1,385703.0,2019-02-02 06:16:29,2019-02-02 06:58:20,0 days 00:41:51,386613.0,2019-02-02 06:17:52,2019-02-02 07:00:01,0 days 00:42:09,1.383333,1.683333
0,385703.0,3,3,WALK,,,,,2019-02-02 06:17:53,2019-02-02 07:00:01,NaT,,2019-02-02 06:23:12,2019-02-02 07:05:01,NaT,5.316667,
0,385703.0,4,1,WALK,,,,,2019-02-02 06:33:24,2019-02-02 06:58:20,NaT,,2019-02-02 06:33:28,2019-02-02 06:58:20,NaT,0.066667,
8,385703.0,4,2,BUS,090B,4002,1,385703.0,2019-02-02 06:33:29,2019-02-02 06:58:20,0 days 00:24:51,386613.0,2019-02-02 06:34:52,2019-02-02 07:00:01,0 days 00:25:09,1.383333,1.683333
0,385703.0,4,3,WALK,,,,,2019-02-02 06:34:53,2019-02-02 07:00:01,NaT,,2019-02-02 06:40:12,2019-02-02 07:05:01,NaT,5.316667,


In [77]:
# Stop early when the leg selection left nothing to work with.
# NOTE(review): exit(0) looks carried over from a script; inside a notebook kernel it may
# end the session rather than just skip the remaining cells - confirm intended.
if len(feasible_legs) == 0:
    print "No matches left after matching and selecting feasible bus legs."
    print "Skipping next steps..."
    exit(0)

# Count the legs that survived the feasible-leg selection for each (user trip, itinerary);
# the count is compared against the originally suggested leg count in a later cell.
feasible_itins_num_legs = feasible_legs.groupby(['otp_user_trip_id','otp_itinerary_id']) \
                                    .agg({'otp_leg_id': lambda x: len(x)}) \
                                    .reset_index() \
                                    .rename(index=str, columns={'otp_leg_id':'num_legs'})
feasible_itins_num_legs.head(20)


Unnamed: 0,otp_user_trip_id,otp_itinerary_id,num_legs
0,385703.0,1,1
1,385703.0,2,3
2,385703.0,3,3
3,385703.0,4,3
4,385703.0,5,3
5,385703.0,6,3
6,385703.0,7,3
7,385703.0,8,3
8,385703.0,9,3
9,385703.0,10,3


In [78]:
# No `on=` is given, so the inner merge joins on every column the two frames share
# (otp_user_trip_id, otp_itinerary_id and num_legs). Only itineraries whose surviving leg
# count equals the originally suggested leg count are therefore kept.
feasible_complete_itins = feasible_itins_num_legs.merge(original_suggested_itins_num_legs,how='inner')

feasible_complete_itins.head(20)

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,num_legs
0,385703.0,1,1
1,385703.0,2,3
2,385703.0,3,3
3,385703.0,4,3
4,385703.0,5,3
5,385703.0,6,3
6,385703.0,7,3
7,385703.0,8,3
8,385703.0,9,3
9,385703.0,10,3


In [79]:
# Keep only the legs of the complete itineraries: inner merge on the columns shared with
# feasible_complete_itins once num_legs has been dropped from it.
feasible_complete_itins_legs = feasible_legs.merge(feasible_complete_itins.drop('num_legs', axis=1),how='inner')

feasible_complete_itins_legs.head()

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,sched_obs_start_timediff,otp_to_stop_id,otp_end_time,bt_end_time,sched_obs_end_timediff,otp_duration_mins,bt_duration_mins
0,385703.0,1,1,WALK,,,,,2019-02-02 01:31:17,NaT,NaT,,2019-02-02 01:41:05,2019-02-02 00:00:00,NaT,9.8,
1,385703.0,2,1,WALK,,,,,2019-02-02 05:58:24,2019-02-02 06:58:20,NaT,,2019-02-02 05:58:28,2019-02-02 06:58:20,NaT,0.066667,
2,385703.0,2,2,BUS,090B,4002.0,1.0,385703.0,2019-02-02 05:58:29,2019-02-02 06:58:20,0 days 00:59:51,386613.0,2019-02-02 05:59:52,2019-02-02 07:00:01,0 days 01:00:09,1.383333,1.683333
3,385703.0,2,3,WALK,,,,,2019-02-02 05:59:53,2019-02-02 07:00:01,NaT,,2019-02-02 06:05:12,2019-02-02 07:05:01,NaT,5.316667,
4,385703.0,3,1,WALK,,,,,2019-02-02 06:16:24,2019-02-02 06:58:20,NaT,,2019-02-02 06:16:28,2019-02-02 06:58:20,NaT,0.066667,


In [80]:
def add_stops_data_to_legs(itineraries_legs,stops_locs):
    """Attach the origin and destination stop coordinates to each itinerary leg.

    Parameters
    ----------
    itineraries_legs : pandas.DataFrame
        Legs with the columns otp_from_stop_id and otp_to_stop_id.
    stops_locs : pandas.DataFrame
        Stop locations with the columns stop_id, stop_lat and stop_lon.

    Returns
    -------
    pandas.DataFrame
        The legs, in their original order, with from_stop_lat, from_stop_lon, to_stop_lat
        and to_stop_lon added. Legs whose stop id has no match (left join) get missing
        coordinates. The index labels are converted to strings, as before.
    """
    legs_with_origin = (
        itineraries_legs
        .merge(stops_locs, left_on='otp_from_stop_id', right_on='stop_id', how='left')
        .drop('stop_id', axis=1)
        .rename(index=str, columns={'stop_lat':'from_stop_lat','stop_lon':'from_stop_lon'})
    )
    # The destination join must use the stops passed in by the caller as well; it used to
    # read a notebook-level variable, silently ignoring the stops_locs argument.
    legs_with_both_ends = (
        legs_with_origin
        .merge(stops_locs, left_on='otp_to_stop_id', right_on='stop_id', how='left')
        .drop('stop_id', axis=1)
        .rename(index=str, columns={'stop_lat':'to_stop_lat','stop_lon':'to_stop_lon'})
    )
    return legs_with_both_ends

In [81]:
# Keep only the stop id and coordinate columns, then attach the coordinates of each leg's
# origin and destination stops.
stops_locations = stops_df[['stop_id','stop_lat','stop_lon']]
itineraries_legs = add_stops_data_to_legs(feasible_complete_itins_legs,stops_locations)

itineraries_legs.head()

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,...,otp_to_stop_id,otp_end_time,bt_end_time,sched_obs_end_timediff,otp_duration_mins,bt_duration_mins,from_stop_lat,from_stop_lon,to_stop_lat,to_stop_lon
0,385703.0,1,1,WALK,,,,,2019-02-02 01:31:17,NaT,...,,2019-02-02 01:41:05,2019-02-02 00:00:00,NaT,9.8,,,,,
1,385703.0,2,1,WALK,,,,,2019-02-02 05:58:24,2019-02-02 06:58:20,...,,2019-02-02 05:58:28,2019-02-02 06:58:20,NaT,0.066667,,,,,
2,385703.0,2,2,BUS,090B,4002.0,1.0,385703.0,2019-02-02 05:58:29,2019-02-02 06:58:20,...,386613.0,2019-02-02 05:59:52,2019-02-02 07:00:01,0 days 01:00:09,1.383333,1.683333,-7.247367,-35.913655,-7.251194,-35.915513
3,385703.0,2,3,WALK,,,,,2019-02-02 05:59:53,2019-02-02 07:00:01,...,,2019-02-02 06:05:12,2019-02-02 07:05:01,NaT,5.316667,,,,,
4,385703.0,3,1,WALK,,,,,2019-02-02 06:16:24,2019-02-02 06:58:20,...,,2019-02-02 06:16:28,2019-02-02 06:58:20,NaT,0.066667,,,,,


In [82]:
# Persist the legs of the feasible, complete itineraries.
# The destination is built from the notebook configuration (output_folderpath and the date
# string of the file being processed) instead of a hard-coded path, so it follows the
# configured input. DataFrame.to_csv returns None when given a path, so its result is not
# bound to a name.
output_filepath = os.path.join(output_folderpath, 'output_' + file_date_str + '.csv')
feasible_complete_itins_legs.to_csv(output_filepath, index=False)

In [208]:
# Drop rows that repeat an earlier row on every listed leg attribute (the trip and
# itinerary ids are not part of the key), keeping the first occurrence.
# NOTE(review): the rows displayed under this cell are dated 2019-02-10 while the input
# configured at the top of the notebook is the 2019_02_02 file - the cell appears to have
# been run against a different kernel state; confirm before relying on that output.
teste = feasible_complete_itins_legs.drop_duplicates(keep='first',subset=['otp_leg_id','otp_mode','otp_route','bt_bus_code','bt_trip_num', 'otp_from_stop_id','otp_start_time','bt_start_time',
                'sched_obs_start_timediff','otp_to_stop_id','otp_end_time','bt_end_time','sched_obs_end_timediff'])
#feasible_complete_itins_legs_drop_duplicates = choose_leg_matches_drop_duplicates(feasible_complete_itins_legs)
#output = teste.to_csv("data/output/output.csv",index=False)
teste


Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,sched_obs_start_timediff,otp_to_stop_id,otp_end_time,bt_end_time,sched_obs_end_timediff,otp_duration_mins
0,385702.0,1,1,WALK,,,,,2019-02-10 19:38:24,2019-02-10 14:44:46,NaT,,2019-02-10 19:56:37,2019-02-10 15:02:46,NaT,18.216667
1,385702.0,1,2,BUS,092,2042,6,386176.0,2019-02-10 19:56:38,2019-02-10 15:02:46,0 days 04:53:52,386142.0,2019-02-10 20:20:10,2019-02-10 15:30:07,0 days 04:50:03,23.533333
2,385702.0,1,3,BUS,555,2033,11,386142.0,2019-02-10 20:29:31,2019-02-10 22:23:34,0 days 01:54:03,385702.0,2019-02-10 20:34:42,2019-02-10 22:28:44,0 days 01:54:02,5.183333
3,385702.0,1,4,WALK,,,,,2019-02-10 20:34:43,2019-02-10 22:28:44,NaT,,2019-02-10 20:34:46,2019-02-10 22:28:44,NaT,0.050000
4,385702.0,2,1,WALK,,,,,2019-02-10 19:30:00,2019-02-10 16:50:55,NaT,,2019-02-10 19:55:32,2019-02-10 17:16:55,NaT,25.533333
5,385702.0,2,2,BUS,245,1027,6,386397.0,2019-02-10 19:55:33,2019-02-10 17:16:55,0 days 02:38:38,385820.0,2019-02-10 20:30:39,2019-02-10 17:47:24,0 days 02:43:15,35.100000
6,385702.0,2,3,WALK,,,,,2019-02-10 20:30:40,2019-02-10 17:47:24,NaT,,2019-02-10 20:43:45,2019-02-10 18:00:24,NaT,13.083333
7,385702.0,8,1,WALK,,,,,2019-02-10 20:08:24,2019-02-10 16:40:35,NaT,,2019-02-10 20:26:37,2019-02-10 16:58:35,NaT,18.216667
8,385702.0,8,2,BUS,092,2053,9,386176.0,2019-02-10 20:26:38,2019-02-10 16:58:35,0 days 03:28:03,386160.0,2019-02-10 20:56:06,2019-02-10 17:07:54,0 days 03:48:12,29.466667
9,385702.0,8,3,WALK,,,,,2019-02-10 20:56:07,2019-02-10 17:07:54,NaT,,2019-02-10 21:05:49,2019-02-10 17:17:54,NaT,9.700000


In [209]:
# Write the de-duplicated legs to a hard-coded CSV path. to_csv returns None when a path is
# given, so output2 holds None rather than the written data.
output2 = teste.to_csv("data/output/output_2019_02_10.csv",index=False)

In [135]:
# duplicated(...) flags rows that repeat an earlier row on the listed columns;
# groupby(otp_itinerary_id).transform('any') spreads that flag to every row sharing the
# itinerary id; `~` then keeps only the itineraries with no flagged row.
# The cell reassigns teste from itself, so re-running it filters the already-filtered frame.
teste = teste[~teste.duplicated(['otp_leg_id','otp_mode','otp_route','bt_bus_code','bt_trip_num', 'otp_from_stop_id','otp_start_time','bt_start_time',
                'sched_obs_start_timediff','otp_to_stop_id','otp_end_time','bt_end_time','sched_obs_end_timediff'])
              .groupby(teste['otp_itinerary_id']).transform('any')]
# The bare expression below is not the last statement of the cell, so it displays nothing.
teste
# Writes to the same path as the In [209] cell, replacing that file; `output` is None.
output = teste.to_csv("data/output/output_2019_02_10.csv",index=False)


In [None]:
# For each (user trip, itinerary): True when it has fewer than 5 leg rows
grouped = (
    teste
    .groupby(['otp_user_trip_id','otp_itinerary_id'])
    .agg({'otp_leg_id': lambda leg_ids: len(leg_ids) < 5})
    .reset_index()
)


In [None]:
# Keep only the rows of the (user trip, itinerary) groups that have exactly 5 leg rows.
grouped2 = teste.groupby(['otp_user_trip_id','otp_itinerary_id']).filter(lambda g: len(g.otp_leg_id) == 5)
grouped2



In [None]:
# Same "fewer than 5 leg rows" flag, computed on the 5-leg subset (index left as group keys)
grouped3 = (
    grouped2
    .groupby(['otp_user_trip_id','otp_itinerary_id'])
    .agg({'otp_leg_id': lambda leg_ids: len(leg_ids) < 5})
)
