# Modelling of trips
Noticed that a bus stop will not only serve one route and there needs a more accurate representation of the tap in and tap out volume. Hence we will try to use the frequency to model a trip data here.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
bus_routes = pd.read_json('../data/cleaned/BusRoutes.json', lines=True)
bus_ridership_quarter = pd.read_csv('../data/cleaned/BusRideVolume_2024_070809.csv', dtype={'PT_CODE': int})

In [3]:
bus_routes.head()

Unnamed: 0,ServiceNo,Operator,Direction,StopSequence,BusStopCode,Distance,WD_FirstBus,WD_LastBus,SAT_FirstBus,SAT_LastBus,SUN_FirstBus,SUN_LastBus
0,10,SBST,1,1,75009,0.0,500,2300,500,2300,500,2300
1,10,SBST,1,2,76059,0.6,502,2302,502,2302,502,2302
2,10,SBST,1,3,76069,1.1,504,2304,504,2304,503,2304
3,10,SBST,1,4,96289,2.3,508,2308,508,2309,507,2308
4,10,SBST,1,5,96109,2.7,509,2310,509,2311,508,2309


In [4]:
bus_ridership_quarter.head()

Unnamed: 0,YEAR_MONTH,DAY_TYPE,TIME_PER_HOUR,PT_TYPE,PT_CODE,TOTAL_TAP_IN_VOLUME,TOTAL_TAP_OUT_VOLUME
0,2024-07,WEEKDAY,19.0,BUS,46359,279,138
1,2024-07,WEEKDAY,6.0,BUS,64551,570,26
2,2024-07,WEEKDAY,9.0,BUS,92099,2009,866
3,2024-07,WEEKDAY,6.0,BUS,12201,51,1405
4,2024-07,WEEKDAY,17.0,BUS,77101,94,110


### Understanding the ridership data in general

# Looking at Total Trips in a Bus Stop Data
Now that we know each bus stops tap in and out, we are interested to look at the total number of trips that run through this bus stops and get a better representation of the ridership. 

In [6]:
# read the bus trips data

bus_trips = pd.read_csv("../data/raw/origin_destination_bus_202408.csv")
bus_trips

Unnamed: 0,YEAR_MONTH,DAY_TYPE,TIME_PER_HOUR,PT_TYPE,ORIGIN_PT_CODE,DESTINATION_PT_CODE,TOTAL_TRIPS
0,2024-08,WEEKENDS/HOLIDAY,18,BUS,76201,76079,6
1,2024-08,WEEKENDS/HOLIDAY,7,BUS,10351,13201,7
2,2024-08,WEEKENDS/HOLIDAY,19,BUS,76061,75371,1
3,2024-08,WEEKENDS/HOLIDAY,9,BUS,14271,7021,2
4,2024-08,WEEKDAY,5,BUS,54581,66471,1
...,...,...,...,...,...,...,...
5760076,2024-08,WEEKENDS/HOLIDAY,20,BUS,60069,46009,13
5760077,2024-08,WEEKDAY,8,BUS,76141,75311,58
5760078,2024-08,WEEKDAY,10,BUS,11369,28221,1
5760079,2024-08,WEEKDAY,18,BUS,75419,66381,1


In [7]:
# Example: 
bus_trips[(bus_trips['ORIGIN_PT_CODE'] == 22009) & (bus_trips['DESTINATION_PT_CODE'] == 22441)  & (bus_trips['TIME_PER_HOUR'] == 10)  ]
bus_trips[(bus_trips['ORIGIN_PT_CODE'] == 22441) & (bus_trips['DESTINATION_PT_CODE'] == 22591)  & (bus_trips['TIME_PER_HOUR'] == 10)  ]

Unnamed: 0,YEAR_MONTH,DAY_TYPE,TIME_PER_HOUR,PT_TYPE,ORIGIN_PT_CODE,DESTINATION_PT_CODE,TOTAL_TRIPS
850872,2024-08,WEEKENDS/HOLIDAY,10,BUS,22441,22591,3
1271668,2024-08,WEEKDAY,10,BUS,22441,22591,6


We are unable to group by origin or destination PT_Code due to the over laps

In [None]:
# TODO: create a df to create origin destination pair for each bus route sequences

# Sort by ServiceNo and StopSequence to ensure proper order
bus_routes_grouped = bus_routes.sort_values(by=['ServiceNo', 'Direction', 'StopSequence']).reset_index(drop=True)

# Initialize list to store origin-destination pairs
origin_destination_pairs = []

# Sliding window approach for each ServiceNo
for (service_no, direction), group in bus_routes_grouped.groupby(['ServiceNo', 'Direction']):
    group = group.reset_index(drop=True)
    
    for i in range(len(group) - 1):
        # Set current stop as origin and next stop as destination
        origin_stop = group.iloc[i]
        destination_stop = group.iloc[i + 1]
        
        origin_destination_pairs.append({
            'ServiceNo': service_no,
            'Direction': direction,
            'Origin_Stop': origin_stop['BusStopCode'],
            'Destination_Stop': destination_stop['BusStopCode'],
            'Origin_StopSequence': origin_stop['StopSequence'],
            'Destination_StopSequence': destination_stop['StopSequence']
        })

# Convert the result to DataFrame
bus_routes_od_df = pd.DataFrame(origin_destination_pairs)

bus_routes_od_df

Unnamed: 0,ServiceNo,Direction,Origin_Stop,Destination_Stop,Origin_StopSequence,Destination_StopSequence
0,10,1,75009,76059,1,2
1,10,1,76059,76069,2,3
2,10,1,76069,96289,3,4
3,10,1,96289,96109,4,5
4,10,1,96109,85079,5,6
...,...,...,...,...,...,...
24773,9B,1,95081,95091,24,25
24774,9B,1,95091,95131,25,26
24775,9B,1,95131,95141,26,27
24776,9B,1,95141,95061,27,28


In [14]:
# Merge the bus_routes_od_df with bus trips
bus_routes_trips = pd.merge(
    bus_routes_od_df,
    bus_trips,
    left_on=['Origin_Stop', 'Destination_Stop'],
    right_on=['ORIGIN_PT_CODE', 'DESTINATION_PT_CODE'],
    how = 'inner'
)


bus_routes_trips = bus_routes_trips[[
    'ServiceNo', 'Direction', 'Origin_Stop', 'Destination_Stop', 'Origin_StopSequence', 'Destination_StopSequence',
    'DAY_TYPE', 'TIME_PER_HOUR', 'PT_TYPE', 'TOTAL_TRIPS'
]]

bus_routes_trips


Unnamed: 0,ServiceNo,Direction,Origin_Stop,Destination_Stop,Origin_StopSequence,Destination_StopSequence,DAY_TYPE,TIME_PER_HOUR,PT_TYPE,TOTAL_TRIPS
0,10,1,75009,76059,1,2,WEEKDAY,9,BUS,196
1,10,1,75009,76059,1,2,WEEKDAY,22,BUS,26
2,10,1,75009,76059,1,2,WEEKENDS/HOLIDAY,20,BUS,41
3,10,1,75009,76059,1,2,WEEKDAY,15,BUS,301
4,10,1,75009,76059,1,2,WEEKDAY,5,BUS,4
...,...,...,...,...,...,...,...,...,...,...
713494,993,1,40329,43453,18,19,WEEKDAY,9,BUS,2
713495,993,1,40329,43453,18,19,WEEKENDS/HOLIDAY,9,BUS,2
713496,993,1,40329,43453,18,19,WEEKDAY,8,BUS,5
713497,993,1,40329,43453,18,19,WEEKDAY,18,BUS,3


# Model potential bus trips per hour.
Given that we have the bus frequency data, we can clean up the file to estimate the potential number of trips per hour in each stop.

In [15]:
bus_frequency = pd.read_json("../data/BusServices.json")
bus_frequency[(bus_frequency['Category'] == 'TRUNK') & (bus_frequency['OriginCode'] == bus_frequency['DestinationCode'])]

Unnamed: 0,ServiceNo,Operator,Direction,Category,OriginCode,DestinationCode,AM_Peak_Freq,AM_Offpeak_Freq,PM_Peak_Freq,PM_Offpeak_Freq,LoopDesc
4,119,GAS,1,TRUNK,65009,65009,09-13,12-18,12-15,15-17,Hougang St 21
11,15,GAS,1,TRUNK,77009,77009,04-09,04-13,07-15,13-16,Marine Parade Rd
21,34,GAS,1,TRUNK,65009,65009,4-9,4-15,9-13,11-14,PTB2 Basement
28,36,GAS,1,TRUNK,95129,95129,08-08,08-11,07-09,07-12,Tomlinson Rd
41,403,GAS,1,TRUNK,77009,77009,13-13,13-19,14-19,14-25,Pasir Ris Rd
...,...,...,...,...,...,...,...,...,...,...,...
702,965,TTS,1,TRUNK,47009,47009,09-12,09-13,10-13,11-13,Sengkang Sq
705,966,TTS,1,TRUNK,46009,46009,06-07,06-12,07-13,12-13,Marine Parade Rd
716,98,TTS,1,TRUNK,28009,28009,06-10,08-16,08-08,08-20,Jurong Pier Way
723,98M,TTS,1,TRUNK,28009,28009,-,17-18,-,12-17,Corporation Rd


In [16]:
# We want the category of service in bus_frequnecy in bus_route_trips
bus_routes_trips = bus_routes_trips.merge(bus_frequency[['ServiceNo', 'Direction', 'Category']], 
                     on=['ServiceNo', 'Direction'], how='right')

bus_routes_trips

Unnamed: 0,ServiceNo,Direction,Origin_Stop,Destination_Stop,Origin_StopSequence,Destination_StopSequence,DAY_TYPE,TIME_PER_HOUR,PT_TYPE,TOTAL_TRIPS,Category
0,118,1,96289.0,96109.0,19.0,20.0,WEEKDAY,8.0,BUS,29.0,TRUNK
1,118,1,96289.0,96109.0,19.0,20.0,WEEKENDS/HOLIDAY,22.0,BUS,1.0,TRUNK
2,118,1,96289.0,96109.0,19.0,20.0,WEEKENDS/HOLIDAY,20.0,BUS,7.0,TRUNK
3,118,1,96289.0,96109.0,19.0,20.0,WEEKDAY,11.0,BUS,21.0,TRUNK
4,118,1,96289.0,96109.0,19.0,20.0,WEEKDAY,13.0,BUS,16.0,TRUNK
...,...,...,...,...,...,...,...,...,...,...,...
713499,992,2,40309.0,43409.0,12.0,13.0,WEEKDAY,13.0,BUS,4.0,TRUNK
713500,992,2,40309.0,43409.0,12.0,13.0,WEEKDAY,17.0,BUS,4.0,TRUNK
713501,992,2,40309.0,43409.0,12.0,13.0,WEEKENDS/HOLIDAY,10.0,BUS,5.0,TRUNK
713502,992,2,40309.0,43409.0,12.0,13.0,WEEKENDS/HOLIDAY,16.0,BUS,1.0,TRUNK


In [17]:
# Function to calculate average frequency from frequency range
def mean_frequency(freq):
    if freq == '-' or pd.isna(freq):
        return None  # Handle missing or empty frequencies
    # Split frequency range and convert to integers
    range_vals = list(map(int, freq.split('-')))
    return sum(range_vals) / 2  # Calculate the mean

# Apply mean calculation to each frequency column
bus_frequency['AM_Peak_Freq'] = bus_frequency['AM_Peak_Freq'].apply(mean_frequency)
bus_frequency['AM_Offpeak_Freq'] = bus_frequency['AM_Offpeak_Freq'].apply(mean_frequency)
bus_frequency['PM_Peak_Freq'] = bus_frequency['PM_Peak_Freq'].apply(mean_frequency)
bus_frequency['PM_Offpeak_Freq'] = bus_frequency['PM_Offpeak_Freq'].apply(mean_frequency)

bus_frequency = bus_frequency.fillna(0)

bus_frequency.head()

Unnamed: 0,ServiceNo,Operator,Direction,Category,OriginCode,DestinationCode,AM_Peak_Freq,AM_Offpeak_Freq,PM_Peak_Freq,PM_Offpeak_Freq,LoopDesc
0,118,GAS,1,TRUNK,65009,97009,6.5,10.0,9.0,11.5,
1,118,GAS,2,TRUNK,97009,65009,10.0,9.5,6.0,10.5,
2,118A,GAS,1,TRUNK,65009,96119,36.0,0.0,0.0,0.0,
3,118B,GAS,1,TRUNK,96111,65191,0.0,0.0,40.5,0.0,
4,119,GAS,1,TRUNK,65009,65009,11.0,15.0,13.5,16.0,Hougang St 21


In [18]:
# calculate estimated trips per hour based on the peak frequencies

import math

# Step 1: Define the time ranges for peak periods
def get_frequency_type(time_hour):
    if 7 <= time_hour <= 9:
        return 'AM_Peak_Freq'
    elif 10 <= time_hour <= 16:
        return 'AM_Offpeak_Freq'
    elif 17 <= time_hour <= 19:
        return 'PM_Peak_Freq'
    elif 20 <= time_hour <= 23:
        return 'PM_Offpeak_Freq'
    else:
        return None

# Step 2: Function to calculate trips per hour based on frequency in minutes
def calculate_trips_per_hour(frequency_minutes):
    if frequency_minutes > 0:
        trips_per_hour = 60 / frequency_minutes
        return math.floor(trips_per_hour)
    return 0 


# step 2: Map the frequency to each row
def calculate_estimated_trips(row):
    frequency_df = bus_frequency
    freq_type = get_frequency_type(row['TIME_PER_HOUR'])
    
    if freq_type:
        service_info = frequency_df[(frequency_df['ServiceNo'] == row['ServiceNo']) &
                                    (frequency_df['Direction'] == row['Direction'])]
        
        if not service_info.empty:
            frequency = service_info.iloc[0][freq_type]
            return calculate_trips_per_hour(frequency)
    return 0


In [19]:
# Apply the function to calculate estimated trips per hour
bus_routes_trips['Estimated_Trips'] = bus_routes_trips.apply(calculate_estimated_trips, axis=1)


In [20]:
bus_routes_trips

Unnamed: 0,ServiceNo,Direction,Origin_Stop,Destination_Stop,Origin_StopSequence,Destination_StopSequence,DAY_TYPE,TIME_PER_HOUR,PT_TYPE,TOTAL_TRIPS,Category,Estimated_Trips
0,118,1,96289.0,96109.0,19.0,20.0,WEEKDAY,8.0,BUS,29.0,TRUNK,9
1,118,1,96289.0,96109.0,19.0,20.0,WEEKENDS/HOLIDAY,22.0,BUS,1.0,TRUNK,5
2,118,1,96289.0,96109.0,19.0,20.0,WEEKENDS/HOLIDAY,20.0,BUS,7.0,TRUNK,5
3,118,1,96289.0,96109.0,19.0,20.0,WEEKDAY,11.0,BUS,21.0,TRUNK,6
4,118,1,96289.0,96109.0,19.0,20.0,WEEKDAY,13.0,BUS,16.0,TRUNK,6
...,...,...,...,...,...,...,...,...,...,...,...,...
713499,992,2,40309.0,43409.0,12.0,13.0,WEEKDAY,13.0,BUS,4.0,TRUNK,6
713500,992,2,40309.0,43409.0,12.0,13.0,WEEKDAY,17.0,BUS,4.0,TRUNK,12
713501,992,2,40309.0,43409.0,12.0,13.0,WEEKENDS/HOLIDAY,10.0,BUS,5.0,TRUNK,6
713502,992,2,40309.0,43409.0,12.0,13.0,WEEKENDS/HOLIDAY,16.0,BUS,1.0,TRUNK,6


In [21]:
# Group by ServiceNo and Direction, then find the maximum Destination_StopSequence for each group
max_stop_sequence = bus_routes_trips.groupby(['ServiceNo', 'Direction'])['Destination_StopSequence'].max().reset_index()

# Rename the column for clarity
max_stop_sequence = max_stop_sequence.rename(columns={'Destination_StopSequence': 'Max_StopSequence'})

# Merge the max stop sequence information back into the original dataframe if desired
bus_routes_trips = bus_routes_trips.merge(max_stop_sequence, on=['ServiceNo', 'Direction'], how='left')


bus_routes_trips

Unnamed: 0,ServiceNo,Direction,Origin_Stop,Destination_Stop,Origin_StopSequence,Destination_StopSequence,DAY_TYPE,TIME_PER_HOUR,PT_TYPE,TOTAL_TRIPS,Category,Estimated_Trips,Max_StopSequence
0,118,1,96289.0,96109.0,19.0,20.0,WEEKDAY,8.0,BUS,29.0,TRUNK,9,26.0
1,118,1,96289.0,96109.0,19.0,20.0,WEEKENDS/HOLIDAY,22.0,BUS,1.0,TRUNK,5,26.0
2,118,1,96289.0,96109.0,19.0,20.0,WEEKENDS/HOLIDAY,20.0,BUS,7.0,TRUNK,5,26.0
3,118,1,96289.0,96109.0,19.0,20.0,WEEKDAY,11.0,BUS,21.0,TRUNK,6,26.0
4,118,1,96289.0,96109.0,19.0,20.0,WEEKDAY,13.0,BUS,16.0,TRUNK,6,26.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
713499,992,2,40309.0,43409.0,12.0,13.0,WEEKDAY,13.0,BUS,4.0,TRUNK,6,15.0
713500,992,2,40309.0,43409.0,12.0,13.0,WEEKDAY,17.0,BUS,4.0,TRUNK,12,15.0
713501,992,2,40309.0,43409.0,12.0,13.0,WEEKENDS/HOLIDAY,10.0,BUS,5.0,TRUNK,6,15.0
713502,992,2,40309.0,43409.0,12.0,13.0,WEEKENDS/HOLIDAY,16.0,BUS,1.0,TRUNK,6,15.0


In [22]:
# Adjust the estimated trips by ensuring estimated trips i s <= total est column
bus_routes_trips['Adj_Estimated_Trips'] = bus_routes_trips[['TOTAL_TRIPS', 'Estimated_Trips']].min(axis=1)

# Drop the 'Estimated_Trips' column
bus_routes_trips = bus_routes_trips.drop(columns=['Estimated_Trips'])

bus_routes_trips

Unnamed: 0,ServiceNo,Direction,Origin_Stop,Destination_Stop,Origin_StopSequence,Destination_StopSequence,DAY_TYPE,TIME_PER_HOUR,PT_TYPE,TOTAL_TRIPS,Category,Max_StopSequence,Adj_Estimated_Trips
0,118,1,96289.0,96109.0,19.0,20.0,WEEKDAY,8.0,BUS,29.0,TRUNK,26.0,9.0
1,118,1,96289.0,96109.0,19.0,20.0,WEEKENDS/HOLIDAY,22.0,BUS,1.0,TRUNK,26.0,1.0
2,118,1,96289.0,96109.0,19.0,20.0,WEEKENDS/HOLIDAY,20.0,BUS,7.0,TRUNK,26.0,5.0
3,118,1,96289.0,96109.0,19.0,20.0,WEEKDAY,11.0,BUS,21.0,TRUNK,26.0,6.0
4,118,1,96289.0,96109.0,19.0,20.0,WEEKDAY,13.0,BUS,16.0,TRUNK,26.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
713499,992,2,40309.0,43409.0,12.0,13.0,WEEKDAY,13.0,BUS,4.0,TRUNK,15.0,4.0
713500,992,2,40309.0,43409.0,12.0,13.0,WEEKDAY,17.0,BUS,4.0,TRUNK,15.0,4.0
713501,992,2,40309.0,43409.0,12.0,13.0,WEEKENDS/HOLIDAY,10.0,BUS,5.0,TRUNK,15.0,5.0
713502,992,2,40309.0,43409.0,12.0,13.0,WEEKENDS/HOLIDAY,16.0,BUS,1.0,TRUNK,15.0,1.0


In [105]:
# save this dataframe 
bus_routes_trips.to_csv('../data/cleaned/bus_route_trips.csv', index=False)