In [1]:
# Author: Yun Ma
# update: add more genetic features, 5/20/25
# update: add managed_lane_use attribute, 6/8/25
# update: add driver attribute for managed lane use modification, 6/25/25

# approach:
# 1. Sort the leg-based trip table by person_id and trip_num.
# 2. Loop by person_id; when a leg's destination purpose is not 'change mode', the linked trip is closed at that leg.
# 3. Moreover, when the trip day changes, the linked trip is closed as well. In this case, the dest_purpose is updated from the original misimputed "change-mode" to the real reported purpose.
# 4. Use the loop's first leg's origin-purpose, depart-time, o-lat, o-lon as the linked trip's origin attributes.
# 5. Use the loop's last leg's dest-purpose, arrive-time, d-lat, d-lon as the linked trip's destination attributes.
# 6. The mode type with the highest priority among all legs is picked as the linked trip's mode type. The priority of mode_type can be customized in the correspondence table.
# 7. The mode_x with the highest priority among all legs is picked as the linked trip's mode_x. The priority of mode_x can be customized in the correspondence table.
# 8. The transit access/egress with the highest priority among all legs is picked as the linked trip's transit access/egress. The priority of transit access/egress can be customized in the correspondence table.
# 9. The weight of the linked trip is taken from the leg that carries the main mode_type.

In [2]:
import os
import pandas as pd
import numpy as np
import warnings
# Silence every FutureWarning for the rest of the notebook (applies to all libraries, not only pandas).
warnings.filterwarnings(action='ignore', category=FutureWarning)
# Turn off pandas' chained-assignment (SettingWithCopy) warning globally; later cells assign into
# frames that were produced by filtering other frames.
pd.options.mode.chained_assignment = None

In [3]:
# 0.1 read correspondence tables, which are mostly used to determine linked priority.
# Each lookup is read exactly once (the transit egress table was previously read three times).
lookup_o_purpose_reported = pd.read_csv(r'..\corres\lookup_o_purpose_reported.csv')
lookup_d_purpose_reported = pd.read_csv(r'..\corres\lookup_d_purpose_reported.csv')
lookup_mode = pd.read_csv(r'..\corres\lookup_mode.csv')
lookup_mode_type = pd.read_csv(r'..\corres\lookup_mode_type.csv')
lookup_transit_access = pd.read_csv(r'..\corres\lookup_transit_access.csv')
lookup_transit_egress = pd.read_csv(r'..\corres\lookup_transit_egress.csv')

# 0.2 read raw leg-based trip dataset from latest delivery
trip = pd.read_csv(r'..\dataset\WeightedDataset_02212025\trip.csv')               # 365830, 107

In [4]:
# 1.1 attach priority ranks to every leg
# Each entry is (lookup table, columns to bring in). The merge key is whichever column(s)
# the selected lookup columns share with `trip`; merges run in the listed order.
priority_lookups = [
    # mode
    (lookup_mode, ['mode_1', 'mode_1_priority']),
    (lookup_mode, ['mode_2', 'mode_2_priority']),
    (lookup_mode, ['mode_3', 'mode_3_priority']),
    (lookup_mode, ['mode_4', 'mode_4_priority']),
    (lookup_mode_type, ['mode_type', 'mode_type_priority']),
    # access/egress
    (lookup_transit_access, ['transit_access', 'transit_access_priority']),
    (lookup_transit_egress, ['transit_egress', 'transit_egress_priority']),
    # purpose
    (lookup_d_purpose_reported, ['d_purpose_reported', 'd_purpose_reported_priority']),
    (lookup_o_purpose_reported, ['o_purpose_reported', 'o_purpose_reported_priority']),
]
for lookup_table, lookup_cols in priority_lookups:
    trip = trip.merge(lookup_table[lookup_cols], how='left')

# 1.2 build full departure / arrival timestamps from the separate date and time-part columns
depart_text = (
    trip['depart_date'].astype(str) + ' '
    + trip['depart_hour'].astype(str) + ':'
    + trip['depart_minute'].astype(str) + ':'
    + trip['depart_seconds'].astype(str)
)
trip['depart_datetime'] = pd.to_datetime(depart_text, format='%Y-%m-%d %H:%M:%S')

arrive_text = (
    trip['arrive_date'].astype(str) + ' '
    + trip['arrive_hour'].astype(str) + ':'
    + trip['arrive_minute'].astype(str) + ':'
    + trip['arrive_second'].astype(str)
)
trip['arrive_datetime'] = pd.to_datetime(arrive_text, format='%Y-%m-%d %H:%M:%S')

# disabled recodes, kept for reference:
# trip['num_travelers'] = np.where(trip['num_travelers']==995,1,trip['num_travelers'])
# trip['managed_lane_use'] = np.where(trip['managed_lane_use']==995,2,trip['managed_lane_use'])

In [5]:
# 2. split trips into 2 parts to save processing time:
#    rows touching purpose category 11 at either end are legs that need linking,
#    everything else is already a stand-alone trip.
origin_is_cat11 = trip['o_purpose_category'] == 11
dest_is_cat11 = trip['d_purpose_category'] == 11

trip_s0 = trip[~origin_is_cat11 & ~dest_is_cat11]   # 306147 individual trip
trip_s1 = trip[origin_is_cat11 | dest_is_cat11]     # 59683 legs to be linked

In [6]:
# 3. process trip_s1 : 59683 transfer legs need to be linked

In [7]:
# 3.0 define key variables
# Column groups used by the linking loop. Every "link_<col>" column holds the value of
# <col> for the linked trip; the dict* mappings go from a leg column to its link_ column.

C_id = ['hh_id', 'person_id', 'trip_id', 'person_num', 'trip_num', 'day_num']
C_flag = ['link_day', 'link_num', 'leg_num', 'leg_delete']
C_purp = [
    'o_purpose_category', 'd_purpose_category',
    'o_purpose', 'd_purpose',
    'o_purpose_reported_priority', 'd_purpose_reported_priority',
]
C_time = ['depart_datetime', 'arrive_datetime']
C_mode = [
    'mode_type_priority',
    'mode_1_priority', 'mode_2_priority', 'mode_3_priority', 'mode_4_priority',
]
C_access = ['transit_access_priority', 'transit_egress_priority']
C_od = [
    'o_lat', 'o_lon', 'd_lat', 'd_lon',
    'o_county', 'd_county',
    'o_state', 'd_state',
    'o_puma_2022', 'd_puma_2022',
]
C_wt = ['trip_weight', 'trip_weight_rmove_only', 'num_travelers', 'managed_lane_use', 'driver']
C_other = ['distance_miles', 'duration_minutes', 'dwell_mins']
# destination-side attributes that are taken from the last leg of a link
C_last = [
    'd_purpose_category', 'd_purpose', 'arrive_datetime',
    'd_lat', 'd_lon', 'd_county', 'd_state', 'd_puma_2022',
]
C_ppr = ['o_purpose_reported_priority', 'd_purpose_reported_priority']

# columns resolved by "smallest priority value wins"
C_pri = C_ppr + C_mode + C_access
# every column that gets a link_ counterpart
C_key = C_purp + C_mode + C_access + C_time + C_od + C_wt + C_other

# leg column -> prefixed column mappings
dictKey = {name: "link_" + name for name in C_key}
dictLast = {name: "link_" + name for name in C_last}
dictPri = {name: "link_" + name for name in C_pri}
dictWt = {name: "link_" + name for name in C_wt}
dictSum = {name: "sum_" + name for name in C_other}

# prefixed column names, in the same order as their source lists
C_key_link = list(dictKey.values())
C_last_link = list(dictLast.values())
C_pri_link = list(dictPri.values())
C_wt_link = list(dictWt.values())
C_other_sum = list(dictSum.values())

In [8]:
# 3.1 fill missing values with 0, then order legs by household, person and trip number
#     (the linking loop walks consecutive rows, so trips must be sorted before linking!)

leg_sort_keys = ['hh_id', 'person_id', 'trip_num']
trip_s1 = trip_s1.fillna(0).sort_values(by=leg_sort_keys)

In [9]:
# 3.2 merge transfer legs into linked trips; a link never spans two persons or two travel days
#
# The frame is walked row by row (it must already be sorted by person and trip number).
# Row i is the head of a link: its own values are copied into the link_* columns.
# If the head's destination purpose category is 11, the following rows of the same person
# and the same day_num are folded into the head until a leg whose destination purpose
# category is not 11 is consumed, the person/day changes, or the frame ends:
#   * destination-side attributes (dictLast) are overwritten by each consumed leg,
#     so the last consumed leg wins;
#   * priority columns (dictPri) keep the smallest value seen;
#   * weight columns (dictWt) are taken from a consumed leg whose mode_type_priority
#     equals the link's current mode_type_priority;
#   * C_other columns are summed over all legs of the link.
# Consumed legs are flagged with leg_delete = 1 and numbered with leg_num.
#
# NOTE(review): link_* columns start as float 0.0 but later receive values of other
# dtypes (e.g. datetimes); newer pandas versions may warn or raise on this - confirm.

# initial
for item in C_key_link:
    trip_s1.loc[:,item] = 0.0
for item in C_flag:
    trip_s1.loc[:,item] = 0

# Column positions do not change once the link_/flag columns exist, so look them up once
# instead of calling columns.get_loc for every cell access.
col = {name: trip_s1.columns.get_loc(name) for name in trip_s1.columns}
n_rows = len(trip_s1)

i = 0
link_pid = 0
link_day = 0
link_num = 0

# loop
while i < n_rows:

    person_id = trip_s1.iloc[i, col['person_id']]
    day_num = trip_s1.iloc[i, col['day_num']]
    link_day = day_num

    # link_num counts the links of one person, restarting at 1 for each new person
    if person_id == link_pid:
        link_num += 1
    else:
        link_num = 1
        link_pid = person_id

    # initial for all trips including independent trips with one leg only
    for item in dictKey:
        trip_s1.iloc[i, col[dictKey[item]]] = trip_s1.iloc[i, col[item]]
    trip_s1.iloc[i, col['link_num']] = link_num
    trip_s1.iloc[i, col['link_day']] = link_day

    # head does not end in category 11: it is a complete trip on its own
    if trip_s1.iloc[i, col['d_purpose_category']] != 11:
        i += 1
        continue

    j = 1
    # running totals of the additive attributes, seeded with the head leg
    # (a plain dict replaces the former exec-built sum_* variables)
    running_sum = {item: trip_s1.iloc[i, col[item]] for item in C_other}

    # The bounds check must come first: without it a change-mode chain that reaches the
    # last row of the frame reads row i+j past the end and raises IndexError.
    while (i + j < n_rows
           and trip_s1.iloc[i + j, col['person_id']] == person_id
           and trip_s1.iloc[i + j, col['day_num']] == day_num):

        # use last leg's 'd_purpose_category','d_purpose'... as real purpose
        for item in dictLast:
            trip_s1.iloc[i, col[dictLast[item]]] = trip_s1.iloc[i + j, col[item]]

        # use leg of priority (smaller value wins)
        for item in dictPri:
            if trip_s1.iloc[i + j, col[item]] < trip_s1.iloc[i, col[dictPri[item]]]:
                trip_s1.iloc[i, col[dictPri[item]]] = trip_s1.iloc[i + j, col[item]]

        # use leg of main mode_type for trip_weight; the comparison does not depend on
        # the weight column being copied, so it is evaluated once per leg
        if trip_s1.iloc[i + j, col['mode_type_priority']] == trip_s1.iloc[i, col['link_mode_type_priority']]:
            for item in dictWt:
                trip_s1.iloc[i, col[dictWt[item]]] = trip_s1.iloc[i + j, col[item]]

        # sum_other
        for item in C_other:
            running_sum[item] = running_sum[item] + trip_s1.iloc[i + j, col[item]]
            trip_s1.iloc[i, col['link_' + item]] = running_sum[item]

        # add flags
        trip_s1.iloc[i + j, col['leg_delete']] = 1
        trip_s1.iloc[i + j, col['link_num']] = link_num
        trip_s1.iloc[i + j, col['leg_num']] = j
        trip_s1.iloc[i + j, col['link_day']] = link_day

        # a leg that does not end in category 11 closes the link
        if trip_s1.iloc[i + j, col['d_purpose_category']] != 11:
            j += 1
            break
        j += 1

    # jump to the first row after the link
    i = i + j



In [10]:
# 3.3. post-process

# 3.3.1 - 3.3.3 translate the link_*_priority values back to their codes.
# Each entry is (lookup table, columns to bring in); merges run in the listed order and
# join on whichever of the selected columns already exist in the frame.
link_lookups = [
    # 3.3.1 link_mode_type_priority -> link_mode_type (and link_mode_1..4)
    (lookup_mode_type, ['link_mode_type_priority', 'link_mode_type']),
    (lookup_mode, ['link_mode_1_priority', 'link_mode_1']),
    (lookup_mode, ['link_mode_2_priority', 'link_mode_2']),
    (lookup_mode, ['link_mode_3_priority', 'link_mode_3']),
    (lookup_mode, ['link_mode_4_priority', 'link_mode_4']),
    # 3.3.2 link_transit_access_priority -> link_transit_access
    (lookup_transit_access, ['link_transit_access_priority', 'link_transit_access']),
    (lookup_transit_egress, ['link_transit_egress_priority', 'link_transit_egress']),
    # 3.3.3 modify link_x_purpose, link_x_purpose_category
    (lookup_o_purpose_reported, ['link_o_purpose_reported_priority',
                                 'link_o_purpose_category_new',
                                 'link_o_purpose_new',
                                 'link_o_purpose_reported']),
    (lookup_d_purpose_reported, ['link_d_purpose_reported_priority',
                                 'link_d_purpose_category_new',
                                 'link_d_purpose_new',
                                 'link_d_purpose_reported']),
]

trip_s1_m = trip_s1
for lookup_table, lookup_cols in link_lookups:
    trip_s1_m = trip_s1_m.merge(lookup_table[lookup_cols], how='left')

# if link_od_purpose_category == 11 (or link_od_purpose == 60), replace it with the
# "<column>_new" value looked up from link_od_purpose_reported_priority
purpose_rewrites = [
    ('link_o_purpose_category', 11),
    ('link_d_purpose_category', 11),
    ('link_o_purpose', 60),
    ('link_d_purpose', 60),
]
for purpose_col, placeholder_code in purpose_rewrites:
    current_values = trip_s1_m[purpose_col]
    trip_s1_m[purpose_col] = np.where(current_values == placeholder_code,
                                      trip_s1_m[purpose_col + '_new'],
                                      current_values)


In [11]:
# 4. output trip_s1

# define output columns
# (C_purp / C_mode / C_access are rebound here from the priority columns used for linking
#  to the code columns that are written out)
C_purp = ['o_purpose_category','o_purpose','o_purpose_reported','d_purpose_category','d_purpose','d_purpose_reported']
C_mode = ['mode_type','mode_1','mode_2','mode_3','mode_4']
C_access = ['transit_access','transit_egress']
C_filter_unlink = C_time + C_purp + C_mode + C_access + C_wt + C_od + C_other
C_filter_link = ["link_" + key for key in C_filter_unlink]
C_filter_link_unlink = dict(zip(C_filter_link,C_filter_unlink))
C_filter = C_id + C_filter_link + C_flag

# make sure the output folder exists; to_csv does not create missing directories
os.makedirs(r'..\output\0625', exist_ok=True)

# linked trip_s1 with all legs and flags
# (trip_s1 is rebound to the filtered frame, which no longer carries the link_*_priority columns)
trip_s1 = trip_s1_m.filter(items=C_filter)
trip_s1.to_csv(r'..\output\0625\combine_from_legs_to_links_59683x42.csv',index=False)

# create distinct trip_s1_linked, by removing ['leg_delete'] == 1
# explicit copy: the frame is written to right after being sliced
trip_s1_linked = trip_s1[trip_s1['leg_delete'] == 0].copy()
trip_s1_linked.loc[:,'link_flag'] = "Y"
trip_s1_linked.to_csv(r'..\output\0625\linked_trips_only_17958x43.csv',index=False)

# rename attributes by removing "link_" (rename returns a new frame)
trip_s1_linked_rename = trip_s1_linked.rename(columns=C_filter_link_unlink)

# 5.0 process trip_s0 by filter columns
# keep only the columns that also exist in the renamed linked-trip frame, then add the flags
# NOTE(review): link_day is not assigned for trip_s0 rows, so it is missing for them
# after the concat below - confirm this is intended.
trip_s0_filter = trip_s0.filter(items=trip_s1_linked_rename.columns.tolist()).copy()
trip_s0_filter.loc[:,'link_flag'] = "N"
trip_s0_filter.loc[:,'link_num'] = 0
trip_s0_filter.loc[:,'leg_num'] = 999
trip_s0_filter.loc[:,'leg_delete'] = "NA"

# 6.0 append trip_s1 linked trips with trip_s0 filter
trip_s01 = pd.concat([trip_s1_linked_rename,trip_s0_filter])
trip_s01.to_csv(r'..\output\0625\final_output_with_all_linked_n_individual_trips_324105x43.csv',index=False)


In [12]:
### end of trip-linkage ###