In [55]:
# Import libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import datetime as dt
import os

# Target dtype for each column that gets re-cast after loading, keyed by column name.
# Identifiers stay as strings, low-cardinality labels become categoricals, and the
# numeric fields use pandas' nullable integer dtypes.
colTypes = dict(
    route_id='string',
    direction_id='category',
    half_trip_id='string',
    stop_id=pd.Int32Dtype(),
    time_point_order=pd.Int8Dtype(),
    point_type='category',
    standard_type='category',
)

In [2]:
# Load every *.txt table found in the GTFS schedule folder into a dictionary of
# dataframes keyed by the file name without its extension.
# The path is assembled with os.path.join so it resolves on any operating system
# (the separator is not hard-coded).
txt_path = os.path.join('gtfsSchedule', 'gtfs_2022-12-18_2023-03-11_Winter2023PostRecap')
# List the folder once, keep only the .txt files and sort them: names and frames are
# derived from the same list (so they cannot fall out of step) and the load order is
# deterministic.
files = sorted(f for f in os.listdir(txt_path) if f.lower().endswith('.txt'))
# Dataframe names: file names with the extension removed
df_names = [os.path.splitext(f)[0] for f in files]
# low_memory=False makes pandas infer each column's dtype from the whole file rather
# than chunk by chunk, which avoids columns holding a mix of types (DtypeWarning).
dfs = [pd.read_csv(os.path.join(txt_path, f), sep=',', low_memory=False) for f in files]
# Dictionary of dataframes: table name -> dataframe
gtfsSchedule = dict(zip(df_names, dfs))

  dfs = [pd.read_csv(os.path.join(txt_path, f), sep=',') for f in files]
  dfs = [pd.read_csv(os.path.join(txt_path, f), sep=',') for f in files]


In [131]:
# Build `schedule`, the base dataframe for the analysis: every stop_times row joined
# with the information of its trip (route, service, direction, block) and with the
# rating period of the trip's service.
trips = gtfsSchedule['trips']
calendar = gtfsSchedule['calendar']
calendar_attributes = gtfsSchedule['calendar_attributes']
calendar_dates = gtfsSchedule['calendar_dates']
# Attach rating_start_date and rating_end_date from calendar_attributes to each trip,
# matching the two tables on service_id (no other calendar_attributes field is added).
# This is an inner join: trips whose service_id is missing from calendar_attributes
# are dropped here.
trips = pd.merge(
    trips,
    calendar_attributes[['service_id', 'rating_start_date', 'rating_end_date']],
    on='service_id',
    how='inner',
)

# Extract the stop_times df from the dict
stop_times = gtfsSchedule['stop_times']
# Give trip_id the same dtype in both dataframes so that no match is missed in the join
trips['trip_id'] = trips['trip_id'].astype(str)
stop_times['trip_id'] = stop_times['trip_id'].astype(str)

# Merge stop_times with trips; the left join keeps every stop_times row, so rows
# without a matching trip get NaN in the trip columns
schedule = pd.merge(stop_times, trips, on='trip_id', how='left')

# Convert date values to datetime objects
schedule['rating_start_date'] = pd.to_datetime(schedule['rating_start_date'], format='%Y%m%d')
schedule['rating_end_date'] = pd.to_datetime(schedule['rating_end_date'], format='%Y%m%d')
# Keep only the records whose route_id is made of digits, i.e., bus routes.
# The column is cast to str first: a missing route_id (unmatched trip) then evaluates
# to False instead of NaN, which would make the boolean mask invalid and raise.
is_bus_route = schedule['route_id'].astype(str).str.isnumeric()
schedule = schedule[is_bus_route]
# Drop the columns that are not needed for the analysis
drop_colums = ['trip_headsign', 'trip_short_name',
        'shape_id', 'wheelchair_accessible',
       'trip_route_type', 'route_pattern_id', 'bikes_allowed', 'stop_headsign',
       'pickup_type', 'drop_off_type', 
       'continuous_pickup', 'continuous_drop_off']
schedule = schedule.drop(columns=drop_colums)
schedule.head()

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,timepoint,checkpoint_id,route_id,service_id,direction_id,block_id,rating_start_date,rating_end_date
32832,54241723,08:00:00,08:00:00,2519,1,1.0,sprhl,85,WinterWeekday,1,T88-161,2022-12-18,2023-03-11
32833,54241723,08:02:00,08:02:00,2507,2,0.0,,85,WinterWeekday,1,T88-161,2022-12-18,2023-03-11
32834,54241723,08:03:00,08:03:00,2508,3,0.0,,85,WinterWeekday,1,T88-161,2022-12-18,2023-03-11
32835,54241723,08:03:00,08:03:00,2574,4,0.0,,85,WinterWeekday,1,T88-161,2022-12-18,2023-03-11
32836,54241723,08:05:00,08:05:00,2510,5,0.0,unsqu,85,WinterWeekday,1,T88-161,2022-12-18,2023-03-11


In [126]:
# Sanity check of the stop_times/trips join on a single trip.
# Both key columns are cast to str so the join keys (and the lookup below) compare equal.
for frame in (trips, stop_times):
    frame['trip_id'] = frame['trip_id'].astype(str)
# Bring only the trip-level identifiers onto every stop_times row
trip_columns = ['trip_id', 'route_id', 'service_id', 'direction_id', 'block_id']
merged_df = stop_times.merge(trips[trip_columns], on='trip_id', how='left')
# trip_id is a string after the cast, so the lookup value is quoted
merged_df.loc[merged_df['trip_id'] == '54415522']

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,timepoint,checkpoint_id,continuous_pickup,continuous_drop_off,route_id,service_id,direction_id,block_id
585966,54415522,12:04:00,12:04:00,64,1,,0,1,1.0,nubn,,,1,WinterSunday,0,C01-1
585967,54415522,12:05:00,12:05:00,1,2,,0,0,0.0,,,,1,WinterSunday,0,C01-1
585968,54415522,12:06:00,12:06:00,2,3,,0,0,0.0,melwa,,,1,WinterSunday,0,C01-1
585969,54415522,12:07:00,12:07:00,6,4,,0,0,0.0,,,,1,WinterSunday,0,C01-1
585970,54415522,12:10:00,12:10:00,10003,5,,0,0,0.0,,,,1,WinterSunday,0,C01-1
585971,54415522,12:12:00,12:12:00,57,6,,0,0,0.0,,,,1,WinterSunday,0,C01-1
585972,54415522,12:13:00,12:13:00,58,7,,0,0,0.0,,,,1,WinterSunday,0,C01-1
585973,54415522,12:15:00,12:15:00,10590,8,,0,0,0.0,wasma,,,1,WinterSunday,0,C01-1
585974,54415522,12:18:00,12:18:00,87,9,,0,0,0.0,,,,1,WinterSunday,0,C01-1
585975,54415522,12:20:00,12:20:00,188,10,,0,0,0.0,masta,,,1,WinterSunday,0,C01-1


### Import files with arrival and departure times

In [None]:
# Build `adt_df`, the bus arrival/departure times restricted to the dates covered by
# the GTFS schedule feed. The dataframe is either rebuilt from the raw 2022 and 2023
# csv files or loaded from the previously saved 'adt.csv'.
csv_path = 'MBTA_ArrivalDepartureTimes'
foldername = 'MBTA_Bus_Arrival_Departure_Times'
# Folders holding the arrival/departure times of 2022 and 2023
adt_2022 = os.path.join(csv_path, (foldername + '_' + '2022'))
adt_2023 = os.path.join(csv_path, (foldername + '_' + '2023'))

# Set to 1 to rebuild adt_df from the raw csv files; 0 loads the saved 'adt.csv'
import_process = 0

# The raw folders are only listed when they are actually needed, so loading the saved
# file does not require them to exist. Only .csv entries are kept and they are sorted
# to make the load order deterministic.
csv2022_files = sorted(f for f in os.listdir(adt_2022) if f.lower().endswith('.csv')) if import_process else []
csv2023_files = sorted(f for f in os.listdir(adt_2023) if f.lower().endswith('.csv')) if import_process else []

adt2022_list = []
adt2023_list = []
if import_process:
    # Read every available file of each year; the two years are independent, so a
    # year with fewer files than the other is not a problem
    adt2022_list = [pd.read_csv(os.path.join(adt_2022, f), sep=',') for f in csv2022_files]
    adt2023_list = [pd.read_csv(os.path.join(adt_2023, f), sep=',') for f in csv2023_files]

    # Build a single dataframe (2023 rows first, then 2022)
    adt_df = pd.concat(adt2023_list + adt2022_list, axis=0)
    # Keep only the rows whose service_date is within the range of the scheduled df
    # (both bounds included); the first feed_info row defines the range
    feed_info = gtfsSchedule['feed_info']
    start_date = pd.to_datetime(feed_info.feed_start_date.values, format='%Y%m%d')
    end_date = pd.to_datetime(feed_info.feed_end_date.values, format='%Y%m%d')
    adt_df['service_date'] = pd.to_datetime(adt_df['service_date'], format='%Y-%m-%d')
    in_feed_period = adt_df['service_date'].between(start_date[0], end_date[0])
    # reset_index returns a new frame, so it has to be assigned; drop=True discards the
    # per-file row labels, which repeat after the concatenation
    adt_df = adt_df.loc[in_feed_period].reset_index(drop=True)
    # Replace wrong SL3 id with the correct one
    adt_df.loc[adt_df.route_id=='746_', 'route_id'] = '746'
    # Use the routes gtfs file to add route_short_name to the rows of the adt dataframe,
    # matching on route_id (inner join: rows with an unknown route_id are dropped)
    routes = gtfsSchedule['routes']
    adt_df = pd.merge(adt_df, routes[['route_id', 'route_short_name']], on='route_id')
    # Change the dtype of the columns included in colTypes to their corresponding values
    adt_df = adt_df.astype(colTypes)
else:
    # Apply the same dtypes and date parsing as the rebuild branch so that both branches
    # return a comparable dataframe.
    # NOTE(review): assumes 'adt.csv' was saved from the dataframe built above — confirm
    adt_df = pd.read_csv('adt.csv', index_col=0, dtype=colTypes, parse_dates=['service_date'])

In [None]:
# Split the arrival/departure records by service date, route and direction
groups = adt_df.groupby(['service_date', 'route_id', 'direction_id'])

# Preview only the first group, ordered by its 'scheduled' column
first_group = next(iter(groups), None)
if first_group is not None:
    name, group = first_group
    group = group.sort_values(by='scheduled', ascending=True)
    print(group.head())

In [86]:
# Distinct service_ids under which route '1' has trips in the schedule
trips_table = gtfsSchedule['trips']
trips_table.loc[trips_table['route_id'] == '1', 'service_id'].unique()

array(['BUS123-3-Wdy-02', 'BUS123-4-Wdy-02', 'BUS123-5-Wdy-02',
       'BUS123-6-Wdy-02', 'ChristmasDay(Observed)-1', 'ChristmasDay-1',
       'MartinLutherKingDay-1', 'WinterSaturday', 'WinterSunday'],
      dtype=object)