In [1]:
#Libraries

#Python Libs
import sys
import os
import glob
import traceback
from datetime import datetime
import time
from geopy import distance


#Data Analysis Libs
import pandas as pd
import numpy as np

In [3]:
#Functions
def select_input_files(enh_buste_base_path,init_date,fin_date,suffix):
        selected_files = []
        all_files = glob.glob(os.path.join(enh_buste_base_path,"*"))

        for file_ in all_files:
                try:
                        file_date = pd.to_datetime(file_.split('/')[-1],format=('%Y_%m_%d' + suffix  + '.csv'))
                        if (file_date >= init_date) and (file_date <= fin_date):
                                selected_files.append((file_,file_date))
                except:
                        continue

        return sorted(selected_files)

def get_gtfs_path(query_date):
    INTERMEDIATE_OTP_DATE = pd.to_datetime("2019-05-13", format="%Y-%m-%d")
    router_id = ''

    if (query_date <= INTERMEDIATE_OTP_DATE):
        return 'campina-gtfs-2019'
    else:
        return 'campina-gtfs-2017'

## Main

In [66]:
otp_suggestions_filepath = 'data/output/2019_02_01_bus_trips_otp_itineraries.csv'
bus_trips_folderpath = 'data/input'
gtfs_base_folderpath = 'data/input'
output_folderpath = 'data/output'

In [67]:
file_date_str = otp_suggestions_filepath.split('/')[-1].split('_bus_trips_')[0]
file_date = pd.to_datetime(file_date_str,format='%Y_%m_%d')
print "Processing File:", otp_suggestions_filepath

Processing File: data/output/2019_02_01_bus_trips_otp_itineraries.csv


In [68]:
    # Extracting itinerary part name for later use
    itinerary_part_name = otp_suggestions_filepath.split('/')[-1].split('_')[5]
    # Read OTP Suggestions
    otp_suggestions_raw = pd.read_csv(otp_suggestions_filepath, parse_dates=['date','otp_start_time','otp_end_time'])

    if len(otp_suggestions_raw) == 0:
        print "Zero OTP suggestions found."
        print "Skipping next steps..."
        exit(0)


In [69]:
def prepare_otp_data(otp_data):
        #Fixing prefix
        otp_data.columns = otp_data.columns.str.replace('otp_','')
        otp_data = otp_data.add_prefix('otp_')
        
        #Fixing Timezone difference - when needed
        otp_data['otp_start_time'] = otp_data['otp_start_time'] - pd.Timedelta('10800 s')
        otp_data['otp_end_time'] = otp_data['otp_end_time'] - pd.Timedelta('10800 s')
        
        #Adjusting route format to have 3 numbers
        otp_data['otp_route'] = otp_data['otp_route'].astype(str)
        otp_data['otp_route'] = np.where(otp_data['otp_mode'] == 'BUS',
                            otp_data['otp_route'].astype(str).str.replace("\.0",'').str.zfill(3),
                            otp_data['otp_route'])

        return otp_data

In [70]:
    # Prepare OTP data for analysis
    otp_suggestions = prepare_otp_data(otp_suggestions_raw)    
    
    # Read stops data
    stops_filepath = gtfs_base_folderpath + os.sep + get_gtfs_path(file_date) + os.sep + 'stops.txt'
    stops_df = pd.read_csv(stops_filepath)
    
    # Adding Parent Stop data to OTP Suggestions TODO
    stops_parent_stations = stops_df[['stop_id','parent_station']]
    otp_suggestions = otp_suggestions.merge(stops_parent_stations.add_prefix('from_'),
                                                left_on='otp_from_stop_id',
                                                right_on='from_stop_id',
                                                how='left') \
                                        .merge(stops_parent_stations.add_prefix('to_'),
                                                left_on='otp_to_stop_id',
                                                right_on='to_stop_id',
                                                how='left') \
                                        .drop(['from_stop_id','to_stop_id'], axis=1) \
                                        .rename(index=str, columns={'from_parent_station':'otp_from_parent_station',
                                                                    'to_parent_station':'otp_to_parent_station'})
        
    otp_suggestions_bus_legs = otp_suggestions[otp_suggestions['otp_mode'] == 'BUS']
    otp_suggestions_walk_legs = otp_suggestions[otp_suggestions['otp_mode'] == 'WALK']

In [8]:
#otp_suggestions.drop_duplicates(subset=['otp_leg_id','otp_end_time','otp_mode','otp_duration_mins'],inplace=True)
#otp_suggestions.duplicated(subset=['otp_leg_id','otp_end_time','otp_mode','otp_duration_mins'])
otp_suggestions.head()

Unnamed: 0,otp_date,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_start_time,otp_end_time,otp_mode,otp_route,otp_from_stop_id,otp_to_stop_id,otp_duration_mins,otp_from_parent_station,otp_to_parent_station
0,2019-02-01,491551.0,1,1,2019-02-02 05:18:45,2019-02-02 05:18:46,WALK,,,,0.016667,,
1,2019-02-01,491551.0,1,2,2019-02-02 05:18:47,2019-02-02 05:47:02,BUS,944,491551.0,386255.0,28.25,,
2,2019-02-01,491551.0,1,3,2019-02-02 05:47:02,2019-02-02 05:49:21,WALK,,,,2.316667,,
3,2019-02-01,491551.0,1,4,2019-02-02 05:51:05,2019-02-02 05:58:04,BUS,903A,386338.0,386291.0,6.983333,,
4,2019-02-01,491551.0,1,5,2019-02-02 05:58:05,2019-02-02 05:58:06,WALK,,,,0.016667,,


## Read and Prepare Bus Trip Data

In [71]:
bus_trips_filepath = bus_trips_folderpath + os.sep + file_date_str + '_bus_trips.csv'
bus_trips = pd.read_csv(bus_trips_filepath, dtype={'route': object},parse_dates=['gps_datetime']) \
                                        .sort_values(['route','busCode','tripNum','gps_datetime']) \
                                        .assign(route = lambda x: x['route'].astype(str).str.replace("\.0",'').str.zfill(3))  \
                                        .drop_duplicates()

In [72]:
bus_trips.head()

Unnamed: 0,route,tripNum,shapeId,routeFrequency,shapeSequence,shapeLat,shapeLon,distanceTraveledShape,busCode,gpsPointId,gpsLat,gpsLon,distanceToShapePoint,gps_datetime,stopPointId,streetName,problem
16416,3,1,140170,low_frequency,109,-7.2163,-35.8648,7990.0,1062,3309,-7.216141,-35.86465,24.21548,01-02-2019 06:01:01,497900,-,NO_PROBLEM
16417,3,1,140170,low_frequency,88,-7.21439,-35.87282,6659.0,1062,3793,-7.214154,-35.87266,31.62558,01-02-2019 06:05:06,497906,-,NO_PROBLEM
16418,3,1,140170,low_frequency,47,-7.21845,-35.8818,3128.0,1062,4168,-7.218444,-35.88185,5.555885,01-02-2019 06:07:39,386328,-,NO_PROBLEM
16419,3,1,140170,low_frequency,23,-7.21453,-35.87003,1534.0,1062,-,-,-,-,01-02-2019 06:09:26,386064,-,BETWEEN
16420,3,1,140170,low_frequency,64,-7.22076,-35.88904,4633.0,1062,4653,-7.220669,-35.88926,26.293863,01-02-2019 06:11:13,386340,-,NO_PROBLEM


In [11]:
#bus_trips[(bus_trips.route == "944") & (bus_trips.stopPointId == 491551.0)].head()

In [12]:
#trezentos = bus_trips[(bus_trips.route == "333")]
#output = trezentos.to_csv("data/output/trezentos.csv",index=False)


In [73]:
bus_trips_clean = bus_trips.filter(['route','busCode','tripNum','stopPointId','gps_datetime'])

In [43]:
bus_trips_clean.head()

Unnamed: 0,route,busCode,tripNum,stopPointId,gps_datetime
16416,3,1062,1,497900,01-02-2019 06:01:01
16417,3,1062,1,497906,01-02-2019 06:05:06
16418,3,1062,1,386328,01-02-2019 06:07:39
16419,3,1062,1,386064,01-02-2019 06:09:26
16420,3,1062,1,386340,01-02-2019 06:11:13


In [15]:
#bus_trips_clean[bus_trips_clean.route == "944"].head()

In [15]:
bus_trips_clean.dtypes

route           object
busCode         object
tripNum         object
stopPointId      int64
gps_datetime    object
dtype: object

In [12]:
otp_suggestions_bus_legs.dtypes

otp_date                   datetime64[ns]
otp_user_trip_id                  float64
otp_itinerary_id                    int64
otp_leg_id                          int64
otp_start_time             datetime64[ns]
otp_end_time               datetime64[ns]
otp_mode                           object
otp_route                          object
otp_from_stop_id                  float64
otp_to_stop_id                    float64
otp_duration_mins                 float64
otp_from_parent_station           float64
otp_to_parent_station             float64
dtype: object

## Identify Possible Matches between OTP Itineraries and Bus Trips Observed Data

In [74]:
#bus_trips_clean.gps_datetime = bus_trips_clean.drop(bus_trips_clean.gps_datetime[bus_trips_clean.gps_datetime == "-"].index, inplace=True)

bus_trips_clean.drop(bus_trips_clean.loc[bus_trips_clean['gps_datetime']=='-'].index, inplace=True)


#bus_trips_clean = bus_trips_clean[bus_trips_clean.gps_datetime == "-"]
bus_trips_clean.dtypes
#bus_trips_clean.gps_datetime = pd.to_datetime(bus_trips_clean.gps_datetime)
#pd.to_datetime('01-02-2019 06:01:01')

route           object
busCode         object
tripNum         object
stopPointId      int64
gps_datetime    object
dtype: object

In [75]:
scheduled_itin_observed_o = otp_suggestions_bus_legs.merge(bus_trips_clean.add_prefix('bt_'),
                                left_on=['otp_route','otp_from_stop_id'],
                                right_on=['bt_route','bt_stopPointId'],
                                how='inner') \
                                .drop(['bt_route','bt_stopPointId'], axis=1) \
                                .rename(index=str, columns={'bt_gps_datetime':'bt_start_time',
                                                            'bt_tripNum':'bt_trip_num',
                                                            'bt_busCode':'bt_bus_code'}) \
                                .assign(sched_obs_start_timediff = 
                                        lambda x: np.absolute(pd.to_datetime(x['bt_start_time']) - x['otp_start_time']))

In [None]:
#bus_trips_clean[(bus_trips_clean.route == "944") & (bus_trips_clean.stopPointId == 491551.0)].sort_values(by=['gps_datetime']).head()

In [76]:
scheduled_itin_observed_o.head()

Unnamed: 0,otp_date,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_start_time,otp_end_time,otp_mode,otp_route,otp_from_stop_id,otp_to_stop_id,otp_duration_mins,otp_from_parent_station,otp_to_parent_station,bt_bus_code,bt_trip_num,bt_start_time,sched_obs_start_timediff
0,2019-02-01,491551.0,1,2,2019-02-02 05:18:47,2019-02-02 05:47:02,BUS,944,491551.0,386255.0,28.25,,,1002,2,01-02-2019 06:27:52,30 days 22:50:55
1,2019-02-01,491551.0,1,2,2019-02-02 05:18:47,2019-02-02 05:47:02,BUS,944,491551.0,386255.0,28.25,,,1002,3,01-02-2019 09:05:35,30 days 20:13:12
2,2019-02-01,491551.0,1,2,2019-02-02 05:18:47,2019-02-02 05:47:02,BUS,944,491551.0,386255.0,28.25,,,1002,5,01-02-2019 11:24:45,30 days 17:54:02
3,2019-02-01,491551.0,1,2,2019-02-02 05:18:47,2019-02-02 05:47:02,BUS,944,491551.0,386255.0,28.25,,,1002,7,01-02-2019 13:50:17,30 days 15:28:30
4,2019-02-01,491551.0,1,2,2019-02-02 05:18:47,2019-02-02 05:47:02,BUS,944,491551.0,386255.0,28.25,,,1002,8,01-02-2019 15:12:00,30 days 14:06:47


In [None]:
#scheduled_itin_observed_o[(scheduled_itin_observed_o.otp_route == "944") & (scheduled_itin_observed_o.otp_start_time == '2019-05-13 14:05:47')][['otp_itinerary_id',"otp_start_time","otp_route","otp_from_stop_id",
                                                                           #"otp_from_stop_id", "bt_start_time", "sched_obs_start_timediff"]].sort_values(by=['sched_obs_start_timediff']).head()
scheduled_itin_observed_o[(scheduled_itin_observed_o.otp_itinerary_id == 112)]

In [77]:
scheduled_itin_observed_o = scheduled_itin_observed_o[(scheduled_itin_observed_o.sched_obs_start_timediff >= pd.Timedelta('0s'))  & (scheduled_itin_observed_o.sched_obs_start_timediff < pd.Timedelta('1.5h'))]

In [78]:
scheduled_itin_observed_od = scheduled_itin_observed_o.merge(bus_trips_clean.add_prefix('bt_'),
                                left_on=['otp_route','bt_bus_code','bt_trip_num','otp_to_stop_id'],
                                right_on=['bt_route','bt_busCode','bt_tripNum','bt_stopPointId'],
                                how='inner') \
                                .drop(['bt_route','bt_stopPointId'], axis=1) \
                                .rename(index=str, columns={'bt_gps_datetime':'bt_end_time'}) \
                                .assign(sched_obs_end_timediff = 
                                        lambda x: np.absolute(x['bt_end_time'] - x['otp_end_time'])) \
                                .sort_values(['otp_user_trip_id','otp_itinerary_id','otp_leg_id','sched_obs_start_timediff','sched_obs_end_timediff'])

In [None]:
len(scheduled_itin_observed_od)

In [79]:
scheduled_itin_observed_od['bt_duration_mins'] = (scheduled_itin_observed_od['bt_end_time'] - scheduled_itin_observed_od['bt_start_time'])/pd.Timedelta(minutes=1)
scheduled_itin_observed_od = scheduled_itin_observed_od[scheduled_itin_observed_od['bt_duration_mins'] > 0]

In [None]:
scheduled_itin_observed_od

In [80]:
scheduled_itin_observed_od_full = pd.concat([scheduled_itin_observed_od,otp_suggestions_walk_legs], sort=False)

In [None]:
otp_suggestions_walk_legs

In [None]:
scheduled_itin_observed_od_full[scheduled_itin_observed_od_full.otp_itinerary_id == 296]

In [None]:
bus_trips_clean[(bus_trips['route'] == '944') & (bus_trips['stopPointId'] == 491551)].sort_values(['gps_datetime'])

In [81]:
scheduled_itin_observed_od_full_clean = scheduled_itin_observed_od_full \
                            .filter(['otp_user_trip_id','otp_itinerary_id','otp_leg_id','otp_mode','otp_route',
                                     'bt_bus_code','bt_trip_num','otp_from_stop_id','otp_start_time',
                                     'bt_start_time','sched_obs_start_timediff','otp_to_stop_id',
                                     'otp_end_time','bt_end_time','sched_obs_end_timediff','otp_duration_mins','minimun_obs_start_time','bt_duration_mins']) \
                            .sort_values(['otp_user_trip_id','otp_itinerary_id','otp_leg_id'])

In [82]:
scheduled_itin_observed_od_full.head() \
                            .sort_values(['otp_user_trip_id','otp_itinerary_id','otp_leg_id'])

Unnamed: 0,otp_date,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_start_time,otp_end_time,otp_mode,otp_route,otp_from_stop_id,otp_to_stop_id,...,otp_to_parent_station,bt_bus_code,bt_trip_num,bt_start_time,sched_obs_start_timediff,bt_busCode,bt_tripNum,bt_end_time,sched_obs_end_timediff,bt_duration_mins
0,2019-02-01,491551.0,1,1,2019-02-02 05:18:45,2019-02-02 05:18:46,WALK,,,,...,,,,,NaT,,,,,
2,2019-02-01,491551.0,1,3,2019-02-02 05:47:02,2019-02-02 05:49:21,WALK,,,,...,,,,,NaT,,,,,
4,2019-02-01,491551.0,1,5,2019-02-02 05:58:05,2019-02-02 05:58:06,WALK,,,,...,,,,,NaT,,,,,
5,2019-02-01,491551.0,2,1,2019-02-02 05:53:45,2019-02-02 05:53:46,WALK,,,,...,,,,,NaT,,,,,
7,2019-02-01,491551.0,2,3,2019-02-02 06:23:31,2019-02-02 06:25:08,WALK,,,,...,,,,,NaT,,,,,


In [None]:
coisa = scheduled_itin_observed_od_full_clean[(scheduled_itin_observed_od_full_clean.otp_route == "944") & (scheduled_itin_observed_od_full_clean.otp_start_time == '2019-05-13 15:05:47')][['otp_itinerary_id',"otp_start_time","otp_route","otp_from_stop_id",
                                                                           "otp_from_stop_id", "bt_start_time", "sched_obs_start_timediff"]].sort_values(by=['sched_obs_start_timediff'])

coisa

In [83]:
scheduled_itin_observed_od_full_clean = scheduled_itin_observed_od_full \
                            .filter(['otp_user_trip_id','otp_itinerary_id','otp_leg_id','otp_mode','otp_route',
                                     'bt_bus_code','bt_trip_num','otp_from_stop_id','otp_start_time',
                                     'bt_start_time','sched_obs_start_timediff','otp_to_stop_id',
                                     'otp_end_time','bt_end_time','sched_obs_end_timediff','otp_duration_mins','minimun_obs_start_time']) \
                            .sort_values(['otp_user_trip_id','otp_itinerary_id','otp_leg_id'])


In [None]:
scheduled_itin_observed_od_full_clean

In [84]:
scheduled_itin_observed_od_full_clean \
                        .groupby(['otp_itinerary_id', 'otp_leg_id']) \
                        .apply(lambda x: x.sort_values(["sched_obs_start_timediff"]))


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,sched_obs_start_timediff,otp_to_stop_id,otp_end_time,bt_end_time,sched_obs_end_timediff,otp_duration_mins
otp_itinerary_id,otp_leg_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,1,0,491551.0,1,1,WALK,,,,,2019-02-02 05:18:45,,NaT,,2019-02-02 05:18:46,,,0.016667
1,3,2,491551.0,1,3,WALK,,,,,2019-02-02 05:47:02,,NaT,,2019-02-02 05:49:21,,,2.316667
1,5,4,491551.0,1,5,WALK,,,,,2019-02-02 05:58:05,,NaT,,2019-02-02 05:58:06,,,0.016667
2,1,5,491551.0,2,1,WALK,,,,,2019-02-02 05:53:45,,NaT,,2019-02-02 05:53:46,,,0.016667
2,3,7,491551.0,2,3,WALK,,,,,2019-02-02 06:23:31,,NaT,,2019-02-02 06:25:08,,,1.616667
2,5,9,491551.0,2,5,WALK,,,,,2019-02-02 06:39:28,,NaT,,2019-02-02 06:39:29,,,0.016667
3,1,10,491551.0,3,1,WALK,,,,,2019-02-02 06:11:45,,NaT,,2019-02-02 06:11:46,,,0.016667
3,3,12,491551.0,3,3,WALK,,,,,2019-02-02 06:40:02,,NaT,,2019-02-02 06:42:21,,,2.316667
3,5,14,491551.0,3,5,WALK,,,,,2019-02-02 06:50:05,,NaT,,2019-02-02 06:50:06,,,0.016667
4,1,15,491551.0,4,1,WALK,,,,,2019-02-02 06:29:45,,NaT,,2019-02-02 06:29:46,,,0.016667


In [None]:
#scheduled_itin_observed_od_full_clean[(scheduled_itin_observed_od_full_clean.otp_route == "944") & (scheduled_itin_observed_od_full_clean.otp_start_time == '2019-05-13 14:05:47') & (scheduled_itin_observed_od_full_clean.otp_itinerary_id == 230)][['otp_itinerary_id','otp_leg_id',"otp_start_time","otp_route","otp_from_stop_id",
                                                                           #"otp_from_stop_id", "bt_start_time", "sched_obs_start_timediff"]].sort_values(by=['sched_obs_start_timediff']).head()

In [85]:
scheduled_itin_observed_od_full_clean = scheduled_itin_observed_od_full_clean.drop_duplicates(subset=['otp_itinerary_id','otp_leg_id'])

In [None]:
scheduled_itin_observed_od_full_clean

In [None]:
#scheduled_itin_observed_od_full_clean[["otp_itinerary_id","otp_leg_id","otp_mode","otp_route","bt_start_time","bt_end_time"]]

In [86]:
# Filtering out itineraries which lost bus legs along the processing


curr_matched_itins_num_legs = scheduled_itin_observed_od_full_clean.groupby(['otp_user_trip_id','otp_itinerary_id']) \
                                    .agg({'otp_leg_id': lambda x: len(np.unique(x))}) \
                                    .reset_index() \
                                    .rename(index=str, columns={'otp_leg_id':'num_legs'})

In [87]:
original_suggested_itins_num_legs = otp_suggestions.groupby(['otp_user_trip_id','otp_itinerary_id']) \
                                    .agg({'otp_leg_id': lambda x: len(x)}) \
                                    .reset_index() \
                                    .rename(index=str, columns={'otp_leg_id':'num_legs'})

In [88]:
complete_matched_itins = original_suggested_itins_num_legs

In [89]:
complete_matched_itins.head()

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,num_legs
0,491551.0,1,5
1,491551.0,2,5
2,491551.0,3,5
3,491551.0,4,5
4,491551.0,5,5


In [90]:
all_complete_vehicle_legs_options = scheduled_itin_observed_od_full_clean.merge(complete_matched_itins.drop('num_legs', axis=1), how='inner')

In [91]:
all_complete_vehicle_legs_options.drop_duplicates(keep='first',subset=['otp_mode','otp_route','bt_bus_code','bt_trip_num', 'otp_from_stop_id','otp_start_time','bt_start_time','sched_obs_start_timediff','otp_to_stop_id','otp_end_time','bt_end_time','sched_obs_end_timediff','otp_duration_mins'])


Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,sched_obs_start_timediff,otp_to_stop_id,otp_end_time,bt_end_time,sched_obs_end_timediff,otp_duration_mins
0,491551.0,1,1,WALK,,,,,2019-02-02 05:18:45,,NaT,,2019-02-02 05:18:46,,,0.016667
1,491551.0,1,3,WALK,,,,,2019-02-02 05:47:02,,NaT,,2019-02-02 05:49:21,,,2.316667
2,491551.0,1,5,WALK,,,,,2019-02-02 05:58:05,,NaT,,2019-02-02 05:58:06,,,0.016667
3,491551.0,2,1,WALK,,,,,2019-02-02 05:53:45,,NaT,,2019-02-02 05:53:46,,,0.016667
4,491551.0,2,3,WALK,,,,,2019-02-02 06:23:31,,NaT,,2019-02-02 06:25:08,,,1.616667
5,491551.0,2,5,WALK,,,,,2019-02-02 06:39:28,,NaT,,2019-02-02 06:39:29,,,0.016667
6,491551.0,3,1,WALK,,,,,2019-02-02 06:11:45,,NaT,,2019-02-02 06:11:46,,,0.016667
7,491551.0,3,3,WALK,,,,,2019-02-02 06:40:02,,NaT,,2019-02-02 06:42:21,,,2.316667
8,491551.0,3,5,WALK,,,,,2019-02-02 06:50:05,,NaT,,2019-02-02 06:50:06,,,0.016667
9,491551.0,4,1,WALK,,,,,2019-02-02 06:29:45,,NaT,,2019-02-02 06:29:46,,,0.016667


In [92]:
def is_new_itinerary(prev_trip_id,curr_trip_id,prev_itin_id,curr_itin_id):
    return ((prev_trip_id != curr_trip_id) | (prev_itin_id != curr_itin_id))

def choose_leg_matches(leg_matches_groups):
        colnames = leg_matches_groups.obj.columns.values
        chosen_leg_matches = pd.DataFrame(columns = colnames)
        prev_trip_id = -1
        prev_itin_id = -1
        prev_leg_mode = ""
        prev_leg_end_time = pd.NaT
        num_groups_not_survived = 0
        new_itinerary = False

        for name, group in leg_matches_groups:
            
                #print
                #print "Name:", name
                #print "Group:"
                #print group
                #print
                
                curr_trip_id = group['otp_user_trip_id'].iloc[0]
                curr_itin_id = group['otp_itinerary_id'].iloc[0]
                curr_leg_id = group['otp_leg_id'].iloc[0]
                curr_leg_mode = group['otp_mode'].iloc[0]
                
                new_itinerary = is_new_itinerary(prev_trip_id,curr_trip_id,prev_itin_id,curr_itin_id)
                if new_itinerary:
                    prev_leg_end_time = group['otp_start_time'].dt.floor('d').iloc[0]

                #if (prev_group_id == ()):
                #        prev_leg_end_time = group['bt_start_time'].dt.floor('d')[0]

                #print
                #print "Previous itinerary id:", prev_itin_id
                #print "Previous leg mode:", prev_leg_mode
                #print "Previous leg end time:", prev_leg_end_time
                #print "Current leg id:", curr_leg_id
                #print "Current leg mode:", curr_leg_mode
                #print
                #print "Original Group"
                #print group.filter(['otp_start_time','bt_start_time','bt_end_time'])
                
                if (curr_leg_mode == 'WALK'):
                    #print "Walking duration:", filtered_group['otp_duration_mins']
                    filtered_group = group.reset_index()
                    if new_itinerary: #first leg is a WALK leg
                        filtered_group.loc[0,'bt_end_time'] = prev_leg_end_time
                    else:
                        filtered_group.loc[0,'bt_start_time'] = prev_leg_end_time
                        filtered_group.loc[0,'bt_end_time'] = prev_leg_end_time + \
                            pd.Timedelta(minutes=np.rint(filtered_group['otp_duration_mins'].iloc[0]))
                    #print "Filtered Group"
                    #print filtered_group
                else:
                    filtered_group = group[group['bt_start_time'] > prev_leg_end_time]
                
                #print
                #print "Filtered Group"
                #print filtered_group.filter(['otp_start_time','bt_start_time','bt_end_time'])

                if (len(filtered_group) == 0):
                        #print "Group did not survive! =("
                        #print
                        #print "Previous itinerary id:", prev_itin_id
                        #print "Previous leg mode:", prev_leg_mode
                        #print "Previous leg end time:", prev_leg_end_time
                        #print "Current leg id:", curr_leg_id
                        #print "Current leg mode:", curr_leg_mode
                        #print
                        #print "Original Group"
                        #print group#.filter(['otp_start_time','bt_start_time','bt_end_time'])
                        num_groups_not_survived += 1
                        continue

                chosen_leg_match = filtered_group.sort_values('bt_start_time').iloc[0]
                
                if ((curr_leg_id == 2) & 
                    ((curr_leg_mode == 'BUS') & (prev_leg_mode == 'WALK'))):
                        #Update previous walk start/end_times
                        #print
                        #print "Chosen Leg Matches"
                        #print chosen_leg_matches.iloc[-1]
                        #print
                        chosen_leg_matches.iloc[-1,chosen_leg_matches.columns.get_loc('bt_start_time')] = chosen_leg_match['bt_start_time'] - \
                            pd.Timedelta(minutes=np.rint(chosen_leg_matches.iloc[-1].otp_duration_mins))
                        chosen_leg_matches.iloc[-1,chosen_leg_matches.columns.get_loc('bt_end_time')] = chosen_leg_match['bt_start_time']
                #print "Chosen Leg"
                #print chosen_leg_match

                chosen_leg_matches = chosen_leg_matches.append(chosen_leg_match)

                #Update variables
                #prev_group_id = name
                prev_trip_id = curr_trip_id
                prev_itin_id = curr_itin_id
                prev_leg_mode = curr_leg_mode
                prev_leg_end_time = chosen_leg_match['bt_end_time']

        #print "Number of groups which did not survive:", num_groups_not_survived
        return chosen_leg_matches.filter(colnames)

In [93]:
legs_groups = all_complete_vehicle_legs_options.groupby(['otp_user_trip_id','otp_itinerary_id','otp_leg_id'])

In [None]:
legs_groups.obj.columns.values

In [94]:
# Choose best actual leg matches (based on feasibility and start time)
import time

start = time.time()
#chosen_legs = choose_leg_matches(sample_itinerary_options.groupby(['otp_user_trip_id','otp_itinerary_id','otp_leg_id']))
feasible_legs = choose_leg_matches(legs_groups)
end = time.time()

print "Execution time in s:", (end-start)
feasible_legs

Execution time in s: 1.26161789894


Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,sched_obs_start_timediff,otp_to_stop_id,otp_end_time,bt_end_time,sched_obs_end_timediff,otp_duration_mins
0,491551.0,1,1,WALK,,,,,2019-02-02 05:18:45,,NaT,,2019-02-02 05:18:46,2019-02-02 00:00:00,,0.016667
0,491551.0,1,3,WALK,,,,,2019-02-02 05:47:02,2019-02-02 00:00:00,NaT,,2019-02-02 05:49:21,2019-02-02 00:02:00,,2.316667
0,491551.0,1,5,WALK,,,,,2019-02-02 05:58:05,2019-02-02 00:02:00,NaT,,2019-02-02 05:58:06,2019-02-02 00:02:00,,0.016667
0,491551.0,2,1,WALK,,,,,2019-02-02 05:53:45,,NaT,,2019-02-02 05:53:46,2019-02-02 00:00:00,,0.016667
0,491551.0,2,3,WALK,,,,,2019-02-02 06:23:31,2019-02-02 00:00:00,NaT,,2019-02-02 06:25:08,2019-02-02 00:02:00,,1.616667
0,491551.0,2,5,WALK,,,,,2019-02-02 06:39:28,2019-02-02 00:02:00,NaT,,2019-02-02 06:39:29,2019-02-02 00:02:00,,0.016667
0,491551.0,3,1,WALK,,,,,2019-02-02 06:11:45,,NaT,,2019-02-02 06:11:46,2019-02-02 00:00:00,,0.016667
0,491551.0,3,3,WALK,,,,,2019-02-02 06:40:02,2019-02-02 00:00:00,NaT,,2019-02-02 06:42:21,2019-02-02 00:02:00,,2.316667
0,491551.0,3,5,WALK,,,,,2019-02-02 06:50:05,2019-02-02 00:02:00,NaT,,2019-02-02 06:50:06,2019-02-02 00:02:00,,0.016667
0,491551.0,4,1,WALK,,,,,2019-02-02 06:29:45,,NaT,,2019-02-02 06:29:46,2019-02-02 00:00:00,,0.016667


In [95]:
if len(feasible_legs) == 0:
    print "No matches left after matching and selecting feasible bus legs."
    print "Skipping next steps..."
    exit(0)

# Filtering out itineraries which lost bus legs after feasible legs choice processing
feasible_itins_num_legs = feasible_legs.groupby(['otp_user_trip_id','otp_itinerary_id']) \
                                    .agg({'otp_leg_id': lambda x: len(x)}) \
                                    .reset_index() \
                                    .rename(index=str, columns={'otp_leg_id':'num_legs'})
feasible_itins_num_legs.head(20)


Unnamed: 0,otp_user_trip_id,otp_itinerary_id,num_legs
0,491551.0,1,3
1,491551.0,2,3
2,491551.0,3,3
3,491551.0,4,3
4,491551.0,5,3
5,491551.0,6,2
6,491551.0,7,3
7,491551.0,8,2
8,491551.0,9,3
9,491551.0,10,3


In [96]:
feasible_complete_itins = feasible_itins_num_legs.merge(original_suggested_itins_num_legs,how='inner')

feasible_complete_itins.head(20)

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,num_legs


In [97]:
feasible_complete_itins_legs = feasible_legs.merge(feasible_complete_itins.drop('num_legs', axis=1),how='inner')

feasible_complete_itins_legs.head()

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,sched_obs_start_timediff,otp_to_stop_id,otp_end_time,bt_end_time,sched_obs_end_timediff,otp_duration_mins


In [98]:
def add_stops_data_to_legs(itineraries_legs,stops_locs):
    itineraries_legs_stops = itineraries_legs.merge(stops_locs, left_on='otp_from_stop_id', right_on='stop_id', how='left') \
                                                                                .drop('stop_id', axis=1) \
                                                                                .rename(index=str, columns={'stop_lat':'from_stop_lat','stop_lon':'from_stop_lon'}) \
                                                                                .merge(stops_locations, left_on='otp_to_stop_id', right_on='stop_id', how='left') \
                                                                                .drop('stop_id', axis=1) \
                                                                                .rename(index=str, columns={'stop_lat':'to_stop_lat','stop_lon':'to_stop_lon'}) 
    return itineraries_legs_stops

In [99]:
stops_locations = stops_df[['stop_id','stop_lat','stop_lon']]
itineraries_legs = add_stops_data_to_legs(feasible_complete_itins_legs,stops_locations)

itineraries_legs.head()

Unnamed: 0,otp_user_trip_id,otp_itinerary_id,otp_leg_id,otp_mode,otp_route,bt_bus_code,bt_trip_num,otp_from_stop_id,otp_start_time,bt_start_time,sched_obs_start_timediff,otp_to_stop_id,otp_end_time,bt_end_time,sched_obs_end_timediff,otp_duration_mins,from_stop_lat,from_stop_lon,to_stop_lat,to_stop_lon


In [None]:
#feasible_complete_itins_legs.head()
output = feasible_complete_itins_legs.to_csv("data/output/output_2019_02_01.csv",index=False)
#feasible_complete_itins_legs[["otp_itinerary_id","otp_leg_id","otp_mode","otp_route","bt_start_time","bt_end_time"]]

In [None]:
teste = feasible_complete_itins_legs.drop_duplicates(keep='first',subset=['otp_leg_id','otp_mode','otp_route','bt_bus_code','bt_trip_num', 'otp_from_stop_id','otp_start_time','bt_start_time',
                'sched_obs_start_timediff','otp_to_stop_id','otp_end_time','bt_end_time','sched_obs_end_timediff'])
#feasible_complete_itins_legs_drop_duplicates = choose_leg_matches_drop_duplicates(feasible_complete_itins_legs)
#output = teste.to_csv("data/output/output.csv",index=False)
teste

In [None]:
teste = teste[~teste.duplicated(['otp_leg_id','otp_mode','otp_route','bt_bus_code','bt_trip_num', 'otp_from_stop_id','otp_start_time','bt_start_time',
                'sched_obs_start_timediff','otp_to_stop_id','otp_end_time','bt_end_time','sched_obs_end_timediff'])
              .groupby(teste['otp_itinerary_id']).transform('any')]
teste
#output = teste.to_csv("data/output/output.csv",index=False)


In [None]:
grouped = teste.groupby(['otp_user_trip_id','otp_itinerary_id']) \
                                    .agg({'otp_leg_id': lambda x: len(x) < 5}).reset_index()


In [None]:
grouped2 = teste.groupby(['otp_user_trip_id','otp_itinerary_id']).filter(lambda g: len(g.otp_leg_id) == 5)
grouped2



In [None]:
grouped3 = grouped2.groupby(['otp_user_trip_id','otp_itinerary_id']) \
                                    .agg({'otp_leg_id': lambda x: len(x) < 5})
