In [None]:
import os
# Set an environment variable read by the pydevd debugger (values in os.environ must be strings).
# NOTE(review): presumably this is the threshold, in seconds, after which pydevd warns that
# resolving a variable for display is slow — confirm against the pydevd documentation.
os.environ['PYDEVD_WARN_SLOW_RESOLVE_TIMEOUT'] = '2'

In [None]:
# Import libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Set the option to prevent the FutureWarning
pd.set_option('future.no_silent_downcasting', True)
import datetime as dt

# Column name -> pandas dtype mapping.
# NOTE(review): nothing in the visible part of the notebook reads `colTypes`
# (the loaders below use `adt_dtype_map`) — confirm it is still needed.
colTypes = dict(
    route_id='string',
    direction_id='category',
    half_trip_id='string',
    stop_id=pd.Int32Dtype(),
    time_point_order=pd.Int8Dtype(),
    point_type='category',
    standard_type='category',
)

def split_multiple_block_id(df):
    """Give each row that carries several candidate block ids exactly one of them.

    Rows whose ``block_id`` is a comma-separated string (``'A, B, C'``) are
    grouped by ``service_date``, ``direction_id`` and ``departure_time``.
    Within a group, the n-th row (in frame order) receives the n-th id of its
    own list.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``block_id`` (strings), ``service_date``,
        ``direction_id`` and ``departure_time``. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Notes
    -----
    * A missing ``block_id`` is treated as "no comma" instead of producing a
      mask with NA values that cannot be used for indexing.
    * When a group has more rows than candidate ids, the surplus rows keep
      their original comma-separated string instead of raising ``IndexError``.
    * Rows with a missing grouping key belong to no group and are left as-is.
    """
    group_keys = ['service_date', 'direction_id', 'departure_time']

    # 1. Rows with more than one candidate id (na=False: missing ids are not candidates)
    has_many = df['block_id'].str.contains(',', na=False)
    if not has_many.any():
        return df

    # 2. Work on a copy of just the needed columns; turn each string into a list of ids
    candidates = df.loc[has_many, group_keys + ['block_id']].copy()
    candidates['block_id'] = candidates['block_id'].str.split(', ')

    # 3. Pick the n-th id for the n-th row of every group
    chosen = {}
    for _, group in candidates.groupby(group_keys, observed=True):
        for position, (row_label, block_ids) in enumerate(group['block_id'].items()):
            if position < len(block_ids):
                chosen[row_label] = block_ids[position]

    # 4. Write all resolved ids back in a single, index-aligned assignment
    if chosen:
        resolved = pd.Series(chosen)
        df.loc[resolved.index, 'block_id'] = resolved
    return df

# From the folder, import calendar_csv
def import_calendar_csv(foldername, filename):
    """Read the calendar CSV and convert its columns to usable Python/pandas types.

    Parameters
    ----------
    foldername : str
        Directory containing the file.
    filename : str
        Name of the CSV file; it must provide the columns ``date``
        (``YYYY-MM-DD``), ``day_of_week`` and ``service_ids``.

    Returns
    -------
    pandas.DataFrame
        ``date`` as datetime64, ``day_of_week`` as category and ``service_ids``
        as the Python object (set, list, dict, ...) written in each cell.

    Raises
    ------
    ValueError, SyntaxError
        If a ``service_ids`` cell is not a plain Python literal.
    """
    import ast  # local import: only this function needs it

    calendar_csv_path = os.path.join(foldername, filename)
    calendar_df = pd.read_csv(calendar_csv_path, sep=',')
    # Convert the date column to datetime
    calendar_df['date'] = pd.to_datetime(calendar_df['date'], format='%Y-%m-%d')
    # Convert day_of_week to category
    calendar_df['day_of_week'] = calendar_df['day_of_week'].astype('category')
    # Turn the textual literal back into a Python object. ast.literal_eval only
    # accepts literals, whereas eval() would execute any expression stored in the file.
    calendar_df['service_ids'] = calendar_df['service_ids'].apply(ast.literal_eval)
    return calendar_df

def polish_arrival_departure_time_df(ArrDepDF, route_list):
    """Reduce the arrival/departure table to trip start and end points and tidy it.

    Parameters
    ----------
    ArrDepDF : pandas.DataFrame
        Must contain ``point_type``, ``route_id``, ``direction_id``,
        ``scheduled`` and ``actual``. The frame itself is not modified.
    route_list : list
        Route ids to keep; an empty list keeps every route.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index in which

        * only ``Startpoint`` / ``Endpoint`` rows remain,
        * missing ``actual`` values are replaced by ``scheduled``,
        * ``scheduled`` is renamed to ``departure_time``,
        * ``direction_id`` categories ``Inbound`` / ``Outbound`` become 1 / 0,
        * ``scheduled_headway``, ``headway`` and ``route_short_name`` are
          removed when present,
        * ``service_id`` and ``block_id`` are added as empty strings.
    """
    # Keep trip start/end points, optionally restricted to the requested routes
    start_end_points_mask = ArrDepDF['point_type'].isin(['Startpoint', 'Endpoint'])
    if route_list:
        start_end_points_mask &= ArrDepDF['route_id'].isin(route_list)

    # Explicit copy: the assignments below must neither touch the caller's frame
    # nor write into a slice (which triggers SettingWithCopyWarning)
    polished = ArrDepDF.loc[start_end_points_mask].copy()

    # If there are any missing values in the actual column, use the scheduled value
    polished['actual'] = polished['actual'].fillna(polished['scheduled'])

    # Drop the columns that are not needed for the analysis.
    # errors='ignore': route_short_name only exists once the routes table has been merged in.
    drop_columns = ['scheduled_headway', 'headway', 'route_short_name']
    polished = polished.drop(columns=drop_columns, errors='ignore')
    polished = polished.reset_index(drop=True)

    # Rename scheduled to departure_time (the observed time stays in `actual`)
    polished = polished.rename(columns={'scheduled': 'departure_time'})

    # Replace 'Inbound' entries with 1 and 'Outbound' entries with 0.
    # astype('category') is a no-op for categorical input and restores the .cat
    # accessor when concatenation has degraded the column to object dtype.
    polished['direction_id'] = (
        polished['direction_id']
        .astype('category')
        .cat.rename_categories({'Inbound': 1, 'Outbound': 0})
    )

    # Placeholder columns (empty strings) that are filled in by later steps
    polished['service_id'] = ''
    polished['block_id'] = ''
    return polished

def handle_24h_time(series):
    """Wrap leading two-digit hours of 24 or more back into the 00-23 range.

    Strings such as ``'25:10:00'`` become ``'01:10:00'``. Any leading
    two-digit hour is reduced modulo 24, so ``'29:15:00'`` is handled the same
    way as ``'24:..'`` to ``'28:..'``; hours below 24 are returned unchanged.

    Parameters
    ----------
    series : pandas.Series
        Strings that start with a two-digit hour (e.g. ``HH:MM:SS``).

    Returns
    -------
    pandas.Series
        A new Series of strings; missing values stay missing.

    Notes
    -----
    The information that the time belongs to the following calendar day is
    lost by this conversion.
    """
    def _wrap_hour(match):
        # keep the two-digit, zero-padded layout of the input
        return f'{int(match.group(0)) % 24:02d}'

    return series.str.replace(r'^\d{2}', _wrap_hour, regex=True)

# Parse datetime strings to datetime objects
def parse_datetime_strings(df):
    """Convert the ``arrival_time`` and ``departure_time`` columns to datetimes.

    Hours of 24 or more are first wrapped by ``handle_24h_time``; the strings
    are then parsed with the ``%H:%M:%S`` format.

    A column that already has a datetime dtype is left untouched, so calling
    the function again on an already-parsed frame is a no-op rather than an
    error on the ``.str`` accessor.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``arrival_time`` and ``departure_time`` columns. It is
        modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    for column in ('arrival_time', 'departure_time'):
        if pd.api.types.is_datetime64_any_dtype(df[column]):
            continue  # already parsed
        df[column] = pd.to_datetime(handle_24h_time(df[column]), format='%H:%M:%S')
    return df


### Column dictionaries

In [None]:
# dtype maps for the GTFS schedule tables, keyed by table name (file name without
# '.txt'). Each inner dict is passed as `dtype=` to pd.read_csv when the table is loaded.
# Date columns are read as strings and converted to datetimes after loading.
gtfs_cols = {
    'calendar': {
        'service_id': 'category',
        'monday': bool,
        'tuesday': bool,
        'wednesday': bool,
        'thursday': bool,
        'friday': bool,
        'saturday': bool,
        'sunday': bool,
        'start_date': 'string',
        'end_date': 'string'
    },

    'calendar_attributes': {
        'service_id': 'category',
        'service_description': 'category',
        'service_schedule_name': 'category',
        'service_schedule_type': 'category',
        'service_schedule_typicality': pd.Int8Dtype(),
        'rating_start_date': 'string',
        'rating_end_date': 'string',
        'rating_description': 'category'
    },

    'calendar_dates': {
        'service_id': 'category',
        'date': 'string',
        'exception_type': pd.Int8Dtype(),
        'holiday_name': 'category'
    },

    'feed_info': {
        'feed_publisher_name': 'category',
        'feed_publisher_url': 'category',
        'feed_lang': 'category',
        'feed_start_date': 'string',
        'feed_end_date': 'string',
        'feed_version': 'category',
        'feed_contact_email': 'category',
        'feed_contact_url': 'category'
    },

    'routes': {
        'route_id': 'string',
        'agency_id': 'category',
        'route_short_name': 'category',
        'route_long_name': 'category',
        'route_desc': 'category',
        'route_type': 'category',
        'route_url': 'category',
        'route_color': 'category',
        'route_text_color': 'category',
        'route_sort_order': pd.Int16Dtype(),
        'route_fare': 'category',
        'line_id': 'category',
        'listed_route': 'category'
    },

    'stop_times': {
        'trip_id': 'string',
        'arrival_time': 'string',
        'departure_time': 'string',
        'stop_id': 'category',
        'stop_sequence': pd.Int16Dtype(),
        'stop_headsign': 'category',
        'pickup_type': 'category',
        'drop_off_type': 'category',
        'timepoint': pd.Int16Dtype(),
        'checkpoint_id': 'category',
        'continuous_pickup': 'category',
        'continuous_drop_off': 'category'
    },

    'stops': {
        'stop_id': 'string',
        'stop_code': 'category',
        'stop_name': 'category',
        'stop_desc': 'category',
        'stop_lat': pd.Float32Dtype(),
        'stop_lon': pd.Float32Dtype(),
        'zone_id': 'category',
        'stop_url': 'category',  # listed once; a repeated key in a dict literal silently overrides the first
        'level_id': 'category',
        'location_type': 'category',
        'municipality': 'category',
        'on_street': 'category',
        'at_street': 'category',
        'parent_station': 'category',
        'stop_timezone': 'category',
        'wheelchair_boarding': 'category',
        'platform_code': 'category',
        'platform_name': 'category',  # was misspelled 'platofrm_name', so the dtype never applied
        'stop_address': 'category',
        'stop_city': 'category',
        'stop_region': 'category',
        'stop_postal_code': 'category',
        'stop_country': 'category',
        'stop_phone': 'category',
        'stop_contact_name': 'category',
        'stop_contact_phone': 'category',
        'stop_contact_url': 'category',
        'stop_contact_email': 'category',
        'vehicle_type': 'category'
    },

    'trips': {
        'route_id': 'string',
        'service_id': 'string',
        'trip_id': 'category',
        'trip_headsign': 'category',
        'trip_short_name': 'category',
        'direction_id': 'category',
        'block_id': 'category',
        'shape_id': 'category',
        'wheelchair_accessible': 'category',
        'trip_route_type': 'category',
        'route_pattern_id': 'category',
        'bikes_allowed': 'category'
    }
}

# Column name -> dtype for the merged schedule table (trip, stop-time and rating
# columns). Dates and times are kept as strings at this stage.
schedule_cols = dict(
    trip_id='string',
    arrival_time='string',
    departure_time='string',
    stop_id='string',
    stop_sequence=pd.Int16Dtype(),
    timepoint=pd.Int16Dtype(),
    checkpoint_id='category',
    route_id='string',
    service_id='string',
    direction_id='category',
    block_id='string',
    rating_start_date='string',
    rating_end_date='string',
)

# Arrival and departure time columns: dtype used when reading the
# arrival/departure CSV files and when re-casting the concatenated table.
adt_dtype_map = dict(
    service_date="string",
    route_id="string",
    direction_id="category",
    half_trip_id="string",
    stop_id="string",
    time_point_id="category",
    time_point_order=pd.Int16Dtype(),
    point_type="category",
    standard_type="category",
    scheduled="string",  # converted to datetime after loading
    actual="string",  # converted to datetime after loading
    scheduled_headway=pd.Int32Dtype(),
    headway=pd.Int32Dtype(),
)

# Columns that are not needed for the analysis and get dropped from the schedule.
# (The variable keeps its existing spelling because a later cell refers to it.)
drop_colums = [
    # keys listed under gtfs_cols['trips']
    'trip_headsign',
    'trip_short_name',
    'shape_id',
    'wheelchair_accessible',
    'trip_route_type',
    'route_pattern_id',
    'bikes_allowed',
    # keys listed under gtfs_cols['stop_times']
    'stop_headsign',
    'pickup_type',
    'drop_off_type',
    'continuous_pickup',
    'continuous_drop_off',
]

In [18]:
# Import the GTFS schedule tables needed later and parse them as dataframes
txt_path = os.path.join('gtfsSchedule','gtfs_2022-12-18_2023-03-11_Winter2023PostRecap')
# List only the files needed later, avoid importing the whole folder
txt_list = ['calendar.txt', 'calendar_attributes.txt', 'calendar_dates.txt', 'feed_info.txt', 'routes.txt', 'stop_times.txt', 'stops.txt', 'trips.txt']
# Dataframe names: file names without the extension.
# os.path.splitext removes the extension as a unit; str.rstrip('.txt') instead strips
# every trailing '.', 't' or 'x' character and would mangle a name ending in 't' or 'x'.
df_names = [os.path.splitext(txt_file)[0] for txt_file in txt_list]
# Read each file with the dtypes stored in gtfs_cols under the dataframe's name
dfs = [pd.read_csv(os.path.join(txt_path, gtfs_file), sep=',', low_memory=False, dtype=gtfs_cols[df_name]) for df_name, gtfs_file in zip(df_names, txt_list)]
# Dictionary of dataframes keyed by table name
gtfsSchedule = dict(zip(df_names, dfs))
# Assign the dataframes to variables
calendar = gtfsSchedule['calendar']
calendar_attributes = gtfsSchedule['calendar_attributes']
calendar_dates = gtfsSchedule['calendar_dates']
feed_info = gtfsSchedule['feed_info']
routes = gtfsSchedule['routes']
stop_times = gtfsSchedule['stop_times']
# Highest stop_sequence of each trip, aligned row by row with stop_times
# (transform keeps the original row index)
end_points_df = (stop_times.groupby
                     (['trip_id'], observed=True, as_index=False)
                    ['stop_sequence']
                    .transform('max')
                )
# Keep only the first and last stop of every trip.
# NOTE(review): the first stop is identified by stop_sequence == 1; trips whose
# sequence starts at another value would lose their start row — confirm for this feed.
stop_times = stop_times.loc[(stop_times.loc[:,'stop_sequence'] == end_points_df)|(stop_times.loc[:,'stop_sequence'] == 1)]
stops = gtfsSchedule['stops']
trips = gtfsSchedule['trips']

# Filter routes and trips to only include bus routes, i.e., those whose route_id is a string of digits
routes, trips = routes[routes['route_id'].str.isdigit()], trips[trips['route_id'].str.isdigit()]

# Convert date strings to proper datetime objects
calendar['start_date'] = pd.to_datetime(calendar['start_date'], format='%Y%m%d')
calendar['end_date'] = pd.to_datetime(calendar['end_date'], format='%Y%m%d')
calendar_attributes['rating_start_date'] = pd.to_datetime(calendar_attributes['rating_start_date'], format='%Y%m%d')
calendar_attributes['rating_end_date'] = pd.to_datetime(calendar_attributes['rating_end_date'], format='%Y%m%d')
feed_info['feed_start_date'] = pd.to_datetime(feed_info['feed_start_date'], format='%Y%m%d')
feed_info['feed_end_date'] = pd.to_datetime(feed_info['feed_end_date'], format='%Y%m%d')
calendar_dates['date'] = pd.to_datetime(calendar_dates['date'], format='%Y%m%d')
# Keep only the trips whose service_id appears in calendar_attributes (inner join on service_id)
trips = pd.merge(trips[['service_id','trip_id', 'route_id', 'direction_id', 'block_id']], calendar_attributes[['service_id']], on='service_id')
# Merge stop_times with trips
schedule = pd.merge(trips,stop_times[['trip_id','arrival_time','departure_time','stop_id','stop_sequence']], on='trip_id', how='left')
# Bind the result explicitly instead of relying on the function mutating its argument
schedule = parse_datetime_strings(schedule)
schedule

Unnamed: 0,service_id,trip_id,route_id,direction_id,block_id,arrival_time,departure_time,stop_id,stop_sequence
0,BUS123-3-Wdy-02,54811655,1,0,C01-1,04:37:00,04:37:00,64,1
1,BUS123-3-Wdy-02,54811655,1,0,C01-1,05:04:00,05:04:00,110,24
2,BUS123-3-Wdy-02,54811657,1,0,C01-3,04:51:00,04:51:00,64,1
3,BUS123-3-Wdy-02,54811657,1,0,C01-3,05:18:00,05:18:00,110,24
4,BUS123-3-Wdy-02,54811660,1,0,C01-7,05:05:00,05:05:00,64,1
...,...,...,...,...,...,...,...,...,...
133481,WinterWeekday,55004493,99,1,G110-124,16:47:00,16:47:00,5271,38
133482,WinterWeekday,55004494,99,1,G110-118,17:40:00,17:40:00,15058,1
133483,WinterWeekday,55004494,99,1,G110-118,18:12:00,18:12:00,5271,38
133484,WinterWeekday,55004495,99,1,G110-118,19:05:00,19:05:00,15058,1


Unnamed: 0,service_id,trip_id,route_id,direction_id,block_id,arrival_time,departure_time,stop_id,stop_sequence
0,BUS123-3-Wdy-02,54811655,1,0,C01-1,1900-01-01 04:37:00,1900-01-01 04:37:00,64,1
1,BUS123-3-Wdy-02,54811655,1,0,C01-1,1900-01-01 05:04:00,1900-01-01 05:04:00,110,24
2,BUS123-3-Wdy-02,54811657,1,0,C01-3,1900-01-01 04:51:00,1900-01-01 04:51:00,64,1
3,BUS123-3-Wdy-02,54811657,1,0,C01-3,1900-01-01 05:18:00,1900-01-01 05:18:00,110,24
4,BUS123-3-Wdy-02,54811660,1,0,C01-7,1900-01-01 05:05:00,1900-01-01 05:05:00,64,1
...,...,...,...,...,...,...,...,...,...
133481,WinterWeekday,55004493,99,1,G110-124,1900-01-01 16:47:00,1900-01-01 16:47:00,5271,38
133482,WinterWeekday,55004494,99,1,G110-118,1900-01-01 17:40:00,1900-01-01 17:40:00,15058,1
133483,WinterWeekday,55004494,99,1,G110-118,1900-01-01 18:12:00,1900-01-01 18:12:00,5271,38
133484,WinterWeekday,55004495,99,1,G110-118,1900-01-01 19:05:00,1900-01-01 19:05:00,15058,1


In [None]:
# Inspect stops whose arrival time is written with hour 25.
# `schedule.arrival_time` has already been converted to datetimes by
# parse_datetime_strings, where the .str accessor is not available, so the raw
# strings are looked up in `stop_times` (first/last stop of every trip).
# NOTE(review): unlike `schedule`, `stop_times` is not restricted to bus trips.
stop_times.loc[stop_times['arrival_time'].str.startswith('25', na=False)]

In [None]:
# Assign to every row in trips the corresponding rating_start_date and rating_end_date
# contained in calendar_attributes, matching the two tables by service_id (inner join)
trips = pd.merge(trips[['service_id','trip_id', 'route_id', 'direction_id', 'block_id']], calendar_attributes[['service_id', 'rating_start_date', 'rating_end_date']], on='service_id')
# Merge stop_times with trips, then convert the time strings.
# parse_datetime_strings operates on whole columns, so it is called on the merged
# frame; DataFrame.apply(..., axis=1) would hand it one row at a time, where the
# time fields are plain strings without a .str accessor.
schedule = parse_datetime_strings(
    pd.merge(trips, stop_times[['trip_id','arrival_time','departure_time','stop_id','stop_sequence']], on='trip_id', how='left')
)

In [None]:

# Parse the time columns only if they are still strings: the cells that build
# `schedule` already convert them, and parsing datetimes again fails on `.str`.
if not pd.api.types.is_datetime64_any_dtype(schedule['departure_time']):
    schedule = parse_datetime_strings(schedule)
# errors='ignore': `schedule` is built from an explicit subset of trips/stop_times
# columns, so the names in drop_colums may be absent and a strict drop raises KeyError.
schedule = schedule.drop(columns=drop_colums, errors='ignore')

### Import files with arrival and departure times

In [None]:
ArrDepFolder = 'MBTA_ArrivalDepartureTimes'
# sorted(): os.listdir returns entries in arbitrary order; sorting makes the row
# order of the concatenated table reproducible
yearlyFolders = sorted(os.listdir(ArrDepFolder))
numYears = len(yearlyFolders)

# Collects every chunk of every file
adt_list = []
chunksize = 10**5

for year in range(numYears):
    files_path = os.path.join(ArrDepFolder, yearlyFolders[year])
    monthly_files = sorted(os.listdir(files_path))
    num_files = len(monthly_files)

    for month in range(num_files):
        filename = os.path.join(files_path, monthly_files[month])
        print(f'filename: {filename}')
        # dtype= already casts each chunk, so no per-column astype is needed here
        for chunk in pd.read_csv(filename, chunksize=chunksize, dtype=adt_dtype_map, low_memory=False):
            # Append inside the loop: appending after it keeps only the last chunk of each file
            adt_list.append(chunk)

print('Concatenating...')
adt_df = pd.concat(adt_list, axis=0)
# Concatenating categorical columns whose categories differ between chunks
# yields object dtype, so the intended dtypes are applied again on the result
adt_df = adt_df.astype(adt_dtype_map)
adt_df.to_csv('050224_adt_df.csv', index=False)

In [None]:
# Normalise route_id: drop trailing underscores and leading zeros
adt_df['route_id'] = adt_df['route_id'].str.rstrip('_').str.lstrip('0')
# Remove a trailing '.0' from half_trip_id (left over when the ids were read as floats).
# A suffix regex is required: str.rstrip('.0') strips every trailing '.' and '0'
# character, which also turns an id such as '54811650' into '5481165'.
adt_df['half_trip_id'] = adt_df['half_trip_id'].str.replace(r'\.0$', '', regex=True)

# Convert service_date, scheduled and actual columns to datetime objects
adt_df['service_date'] = pd.to_datetime(adt_df['service_date'], format='%Y-%m-%d')
adt_df['scheduled'] = pd.to_datetime(adt_df['scheduled'], format='ISO8601')
adt_df['actual'] = pd.to_datetime(adt_df['actual'], format='ISO8601')

# Attach route_short_name from the routes table (inner join: rows whose route_id
# is not in `routes` are dropped). The guard keeps the cell re-runnable without
# producing route_short_name_x / route_short_name_y columns.
if 'route_short_name' not in adt_df.columns:
    adt_df = pd.merge(adt_df, routes[['route_id', 'route_short_name']], on='route_id')

In [None]:
# Import csv files from 2022 and 2023, cast them into a single dataframe, and keep
# the bus records whose service_date falls within the dates of the schedule feed
csv_path = 'MBTA_ArrivalDepartureTimes'
foldername = 'MBTA_Bus_Arrival_Departure_Times'

adt2022_list = []
adt2023_list = []
# 1: rebuild adt_df from the raw yearly folders; 0: reload a previously saved csv
import_process = 1


def _clean_adt_ids(frame):
    """Normalise route_id and half_trip_id in place and return the frame."""
    # Drop trailing underscores and leading zeros from route_id
    frame['route_id'] = frame['route_id'].str.rstrip('_').str.lstrip('0')
    # Remove only a trailing '.0' suffix; str.rstrip('.0') would also eat
    # legitimate trailing zeros of the id ('54811650' -> '5481165')
    frame['half_trip_id'] = frame['half_trip_id'].str.replace(r'\.0$', '', regex=True)
    return frame


def _parse_adt_datetimes(frame):
    """Convert service_date, scheduled and actual to datetimes in place and return the frame."""
    frame['service_date'] = pd.to_datetime(frame['service_date'], format='%Y-%m-%d')
    frame['scheduled'] = pd.to_datetime(frame['scheduled'], format='ISO8601')
    frame['actual'] = pd.to_datetime(frame['actual'], format='ISO8601')
    return frame


if import_process:
    # Arrival/Departure times 2022
    adt_2022 = os.path.join(csv_path, (foldername + '_' + '2022'))
    csv2022_files = sorted(os.listdir(adt_2022))
    # Arrival/Departure times 2023
    adt_2023 = os.path.join(csv_path, (foldername + '_' + '2023'))
    csv2023_files = sorted(os.listdir(adt_2023))

    # Read every file of each year: indexing both listings with range(12) fails
    # when a folder holds fewer than 12 files and ignores any beyond the twelfth
    for csv_file in csv2022_files:
        print(os.path.join(adt_2022, csv_file))
        adt2022_list.append(pd.read_csv(os.path.join(adt_2022, csv_file), sep=','))
    for csv_file in csv2023_files:
        print(os.path.join(adt_2023, csv_file))
        adt2023_list.append(pd.read_csv(os.path.join(adt_2023, csv_file), sep=','))

    # Build a single dataframe (2023 first, then 2022)
    adt_df = pd.concat(adt2023_list + adt2022_list, axis=0)

    print('Step 1')
    # Cast the columns listed in adt_dtype_map to their corresponding dtypes
    adt_df = adt_df.astype(adt_dtype_map)

    print('Step 2')
    adt_df = _clean_adt_ids(adt_df)
    # Attach route_short_name from the routes table (inner join on route_id)
    adt_df = pd.merge(adt_df, routes[['route_id', 'route_short_name']], on='route_id')

    print('Step 3')
    adt_df = _parse_adt_datetimes(adt_df)

    print('Step 4')
    # Keep only the rows whose service_date is within the range of the schedule feed
    feed_info = gtfsSchedule['feed_info']
    start_date = pd.to_datetime(feed_info.feed_start_date.values, format='%Y%m%d')
    end_date = pd.to_datetime(feed_info.feed_end_date.values, format='%Y%m%d')
    adt_df = adt_df[(adt_df['service_date'] >= start_date[0]) & (adt_df['service_date'] <= end_date[0])]

    print('Step 5')
    # reset_index returns a new frame; the result has to be bound to take effect
    adt_df = adt_df.reset_index(drop=True)

else:
    # Read the saved file in separate chunks and concatenate them
    # NOTE(review): the import cell above writes '050224_adt_df.csv', not 'adt_df.csv' —
    # confirm which file this branch is meant to load.
    chunk_size = 10**6
    chunks = []

    for chunk in pd.read_csv('adt_df.csv', dtype=adt_dtype_map, chunksize=chunk_size):
        chunks.append(chunk)
    adt_df = pd.concat(chunks, axis=0, ignore_index=True)

    adt_df = _parse_adt_datetimes(adt_df)
    adt_df = _clean_adt_ids(adt_df)
    # Attach route_short_name from the routes table (inner join on route_id)
    # NOTE(review): this rebinds `routes` to the full routes table stored in gtfsSchedule.
    routes = gtfsSchedule['routes']
    adt_df = pd.merge(adt_df, routes[['route_id', 'route_short_name']], on='route_id')

### Combine arrival and departure times with scheduled information

In [None]:
# Calendar table: one row per date with the service ids listed for that day
calendar_df = import_calendar_csv(foldername='CalendarDates', filename='calendar_df.csv')
# Start/end point records of every route (an empty route_list applies no route filter)
adt_2 = polish_arrival_departure_time_df(adt_df, route_list=[])

### Assign service_id and block_id to arrival and departure times

In [None]:
# Match observed route-10 departures with the schedule to fill service_id and block_id

# NOTE(review): `adt_route10` is not created anywhere in the visible notebook. It is
# derived here from `adt_2` on the assumption that it is the route-10 subset of the
# polished arrival/departure table — confirm.
adt_route10 = adt_2.loc[adt_2['route_id'] == '10'].copy()

# Scheduled first stops (stop_sequence == 1) of route 10
schedule_route10 = schedule.loc[(schedule.route_id == '10') & (schedule.stop_sequence == 1)].copy()
# direction_id is read from trips.txt with dtype 'category', whose categories are
# parsed as strings, while polish_arrival_departure_time_df codes it as the
# integers 1/0. Convert to numbers so the group keys of both tables can be equal.
schedule_route10['direction_id'] = pd.to_numeric(schedule_route10['direction_id'].astype('string'))

grouping_var = ['route_id', 'direction_id', 'departure_time'] # these are the same for both adt_route10 and schedule_route10
# Work with only Startpoint records in arrival departure times, will merge with Endpoint records later
startpoint_df = adt_route10.loc[adt_route10.point_type == 'Startpoint']
adt_route10_grouped = startpoint_df.groupby(grouping_var, observed=True)
schedule_route10_grouped = schedule_route10.groupby(grouping_var, observed=True)

# Observed departures for which the schedule has no matching group
unmatched_names = []
unmatched_groups = []

for name, group in adt_route10_grouped:
    if name not in schedule_route10_grouped.groups:
        unmatched_names.append(name)
        unmatched_groups.append(group)
        continue

    # Scheduled trips sharing this route, direction and departure time
    schedule_group = schedule_route10_grouped.get_group(name)
    schedule_services = set(schedule_group['service_id'])
    # service_id -> list of block_ids scheduled for it
    schedule_service_block_ids = schedule_group.groupby(['service_id'])['block_id'].apply(list)

    # Calendar rows for the dates on which this departure was observed
    service_days = calendar_df.loc[calendar_df.date.isin(group.service_date)]
    for _, day in service_days.iterrows():
        # Only the observations made on this calendar day; writing to the whole
        # group would let the last processed day overwrite all the others
        day_rows = group.index[group['service_date'] == day['date']]

        # Services that are both scheduled for this departure and listed for the day
        adt_service_ids = schedule_services.intersection(day['service_ids'])
        # sorted(): set iteration order is arbitrary, sorting keeps the string reproducible
        adt_service_ids_str = ', '.join(sorted(adt_service_ids))
        adt_route10.loc[day_rows, 'service_id'] = adt_service_ids_str

        # Block ids are looked up by the joined string, so they are found only
        # when exactly one service matched
        block_list = schedule_service_block_ids.get(adt_service_ids_str, [])
        block_list = [str(block) for block in block_list if pd.notna(block)]
        if block_list:
            # Comma-separated candidates; split_multiple_block_id picks one per row below
            adt_route10.loc[day_rows, 'block_id'] = ', '.join(block_list)

adt_route10 = split_multiple_block_id(adt_route10)

In [None]:
# Assign the same values of block_id and service_id to Endpoint records with the same half_trip_id
trips_grouped = adt_route10.groupby(['half_trip_id'], observed=True)
for name, group in trips_grouped:
    # Locate the rows by point_type instead of by position: group.index[1] raises
    # IndexError for a trip with a single record and assumes the Startpoint comes first
    start_rows = group.index[group['point_type'] == 'Startpoint']
    end_rows = group.index[group['point_type'] == 'Endpoint']
    if len(start_rows) == 0 or len(end_rows) == 0:
        continue  # nothing to copy from, or nothing to copy to
    for column in ['block_id', 'service_id']:
        adt_route10.loc[end_rows, column] = group.at[start_rows[0], column]

### Compute layover

In [None]:
# Create layover_df as a sorted copy with a fresh index
layover_df = adt_route10.copy()
# NOTE(review): half_trip_id is sorted before departure_time, so the row-to-row
# differences below are chronological only if half_trip_id order follows time
# order within a block — confirm.
layover_df = layover_df.sort_values(by=['block_id','service_date','half_trip_id','departure_time'])
layover_df = layover_df.reset_index(drop=True)

# Group by 'block_id' and 'service_date', skipping rows without a block.
# block_id is initialised to '' (not NaN) upstream, so the empty string must be
# excluded explicitly; notna() alone keeps those rows as one large group.
has_block = layover_df['block_id'].notna() & (layover_df['block_id'] != '')
grouped = layover_df.loc[has_block].groupby(['block_id', 'service_date'])

# Minutes elapsed since the previous record of the same block and day,
# from the scheduled (departure_time) and the observed (actual) times
layover_df['theoretical_layover'] = grouped['departure_time'].diff().dt.total_seconds() / 60
layover_df['actual_layover'] = grouped['actual'].diff().dt.total_seconds() / 60

# The first record of each group has no predecessor: set its layover to 0
layover_df.loc[grouped.head(1).index, ['theoretical_layover', 'actual_layover']] = 0
# Keep values only on records with time_point_order == 1
layover_df.loc[layover_df.time_point_order != 1, ['theoretical_layover', 'actual_layover']] = np.nan
layover_df = layover_df.drop(columns=['time_point_order', 'point_type', 'standard_type'])
# Summary per block and day. The aggregation runs on the full frame: selecting
# only the two layover columns first makes the 'stop_id' entry raise KeyError.
(layover_df.groupby(['block_id', 'service_date'])
            .agg({'theoretical_layover': ['mean', 'max', 'count'],
                  'actual_layover': ['mean', 'max', 'count'],
                  'stop_id': 'first'})
)

In [None]:
# Scratch cell: group the route-10 records by half_trip_id.
test = adt_route10.groupby(['half_trip_id'])
# NOTE(review): `get_group` is referenced without being called, so `startpoint_df`
# is rebound to the bound method itself rather than to a DataFrame. This also
# replaces the `startpoint_df` frame created in an earlier cell. Presumably a key
# was meant to be passed, e.g. test.get_group(<half_trip_id>) — confirm or remove the cell.
startpoint_df = test.get_group