# Generate a Route Segment Length Table

In [1]:
import sys
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import gtfs_kit

In [2]:
# load the feed 
gtfs_path = 'data/bus_data.zip'
feed = gtfs_kit.read_feed(gtfs_path, dist_units='mi')

# functionized new algorithm

In [3]:
## function ##
#
# Generate stop list for a single route, including the longest trip in both directions
    
def make_route_distance_table(feed, gtfs_route_id, dates):
    
    # generate a timetable for this route
    timetable = gtfs_kit.routes.build_route_timetable(feed, gtfs_route_id, dates)
    
    results = []

    # loop to do both directions
    for direction_id, timetable_grouped_by_direction in timetable.groupby('direction_id'):
        
        # print(f"Direction {direction_id}")

        timetable_grouped_by_direction = timetable.groupby('direction_id')
        group_direction_0 = timetable_grouped_by_direction.get_group(direction_id)

        # Group by 'trip_id' within the selected group and calculate trip lengths
        result_group = group_direction_0.groupby('trip_id').size().reset_index(name='trip_length')

        # Perform aggregation to count occurrences and create a list of trip_ids
        aggregated_result = result_group.groupby('trip_length')['trip_id'].agg(['count', list]).reset_index()

        # pick a trip randomly from the list of trips that have the most stops for this direction
        max_count_row = aggregated_result[aggregated_result['count'] == aggregated_result['count'].max()]
        
        selected_value = np.random.choice(max_count_row['list'].iloc[0])

        # print(f"trip_id {selected_value}")
        
        # get timetable for that trip
        timetable_grouped_by_trip_id = timetable.groupby('trip_id')
        random_group = timetable_grouped_by_trip_id.get_group(selected_value)

        # lookup route_short_name in feed.routes and fill it down in the table
        condition = feed.routes['route_id'] == gtfs_route_id
        value_to_assign = feed.routes.loc[condition, 'route_short_name'].iloc[0]
        random_group.loc[:, 'route_short_name'] = value_to_assign

        # drop the trip_id
        random_group.drop('trip_id', axis=1, inplace=True)

        # Specify the desired order of columns
        desired_order = ['route_short_name','route_id','stop_id','stop_sequence','shape_dist_traveled']

        # Reorder columns
        results.append(random_group[desired_order])
        
        sys.stdout.write('.')
    
    # concatenate and return results
    
    return pd.concat(results, ignore_index=True)
 

### main loop

In [4]:
## main loop ##
#
# Loop over routes and generate stop list with distances

# pick date
week = feed.get_first_week()
dates = [week[2]]  # First Wednesday

# create empty list to hold route by route results
dfs = []

# context to make things less annoying
import warnings
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=FutureWarning)
    
    # loop every every route in GTFS feed
    for index, row in feed.routes.iterrows():
        
        # print(f"Route {row['route_id']}" )

        # call function to generate list of stops with distances
        df = make_route_distance_table(feed, row['route_id'], dates)


        # append to results list
        dfs.append(df)

# create empty df and concatenate results
route_distance_table = pd.DataFrame()
route_distance_table=pd.concat(dfs, ignore_index=True)
route_distance_table


...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

Unnamed: 0,route_short_name,route_id,stop_id,stop_sequence,shape_dist_traveled
0,1,1,2204,1,0.0000
1,1,1,27662,2,0.1886
2,1,1,27663,3,0.2991
3,1,1,27664,4,0.4629
4,1,1,27665,5,0.5725
...,...,...,...,...,...
25389,GO28,259,16176,17,11.3848
25390,GO28,259,40639,18,11.9958
25391,GO28,259,16184,19,12.4377
25392,GO28,259,1837,20,13.3799


# Add NJTransit Stop Codes

In [5]:
# # left — uses GTFS stop_id
# route_distance_table[route_distance_table["route_short_name"] == "119"]

# # right
# feed.stops


In [None]:
# join on stop_id
merged = pd \
    .merge(route_distance_table, feed.stops, on=['stop_id'], how='left') \
    .drop(['route_id','stop_desc','stop_lat','stop_lon','zone_id'], axis=1)
merged=merged.drop(['stop_id'], axis=1)
merged=merged.rename(columns={"stop_code" : "stop_id"})

# Save DataFrame to a pickle file
merged.to_pickle('data/njtransit_stop_distances.pkl')

In [7]:
# check one route
pd.set_option('display.max_rows', None)
display(merged[merged.route_short_name == '119']) 

Unnamed: 0,route_short_name,stop_sequence,shape_dist_traveled,stop_id,stop_name
1323,119,1,0.0,20383,JFK BLVD AT 3RD ST
1324,119,2,0.1242,20384,JFK BLVD AT 4TH ST
1325,119,3,0.3286,31594,JFK BLVD AT W 7TH STREET
1326,119,4,0.6252,20385,JFK BLVD AT 8TH ST
1327,119,5,0.7331,20386,JFK BLVD AT 10TH ST
1328,119,6,0.8284,20387,JFK BLVD AT 12TH ST
1329,119,7,0.9231,20388,JFK BLVD AT 14TH ST
1330,119,8,1.0473,20389,JFK BLVD AT 16TH ST
1331,119,9,1.1509,20390,JFK BLVD AT ANDREW ST
1332,119,10,1.2383,20391,JFK BLVD AT 19TH ST
