In [1]:
# Import libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import datetime as dt
import os

colTypes = {
    'route_id': 'string',
    'direction_id': 'category',
    'half_trip_id': 'string',
    'stop_id': pd.Int32Dtype(),
    'time_point_order': pd.Int8Dtype(),
    'point_type': 'category', 
    'standard_type': 'category'
}

In [2]:
# Import all *.txt files in the gtfsSchedule folder and parse them as dataframes
# add the gtfsSchedule folder and subfolders to the current path
txt_path = ('gtfsSchedule\\gtfs_2022-12-18_2023-03-11_Winter2023PostRecap')
# List of dataframe names: remove the '.txt' extension from the filenames
df_names = [filename[:-4] for filename in os.listdir(txt_path)] 
# Read txt files into dataframes and assign them the names in df_names
# create a list of filenames
files = os.listdir(txt_path)
# create a list of dataframes
dfs = [pd.read_csv(os.path.join(txt_path, f), sep=',') for f in files]
# create a dictionary of dataframes
gtfsSchedule = dict(zip(df_names, dfs))

  dfs = [pd.read_csv(os.path.join(txt_path, f), sep=',') for f in files]
  dfs = [pd.read_csv(os.path.join(txt_path, f), sep=',') for f in files]


In [3]:
# Create a new df starting from gtfsScehdule['trips] where trip_ids are matched to service_ids
# This will be the base df for the analysis
trips = gtfsSchedule['trips']
calendar = gtfsSchedule['calendar']
calendar_attributes = gtfsSchedule['calendar_attributes']
calendar_dates = gtfsSchedule['calendar_dates']
# Assign to every row in trips the corresponding rating_start_date and rating_end_date contained in calendar_attributes matching the two datasets by service_id
# Add the service_id field from the calendar dataframe to the trips dataframe, without including the other fields
trips = pd.merge(trips, calendar_attributes[['service_id', 'rating_start_date', 'rating_end_date']], on='service_id')

# Extract the stop_times df from the dict
stop_times = gtfsSchedule['stop_times']
# Make sure trip_id column from both the df has the same dtype not to miss any correspondence
trips['trip_id'] = trips['trip_id'].astype(str)
stop_times['trip_id'] = stop_times['trip_id'].astype(str)

# Merge stop_times with trips
schedule = pd.merge(stop_times, trips, on='trip_id', how='left')

# Convert date values to datetime objects
schedule['rating_start_date'] = pd.to_datetime(schedule['rating_start_date'], format='%Y%m%d')
schedule['rating_end_date'] = pd.to_datetime(schedule['rating_end_date'], format='%Y%m%d')
# Extract only records whose route_id is an integer, i.e., bus routes
schedule = schedule[schedule.route_id.str.isnumeric()]
# Drop the columns that are not needed for the analysis
drop_colums = ['trip_headsign', 'trip_short_name',
        'shape_id', 'wheelchair_accessible',
       'trip_route_type', 'route_pattern_id', 'bikes_allowed', 'stop_headsign',
       'pickup_type', 'drop_off_type', 
       'continuous_pickup', 'continuous_drop_off']
schedule = schedule.drop(columns=drop_colums)

In [133]:
schedule.loc[schedule.route_id=='1'].to_csv('test_route1.csv')

### Import files with arrival and departure times

In [None]:
# Import csv files from 2022 and 2023, cast them into a single dataframe, and filter out the bus routes included within the dates of the scheduled df
# Import the csv files
csv_path = 'MBTA_ArrivalDepartureTimes'
foldername = 'MBTA_Bus_Arrival_Departure_Times'

adt2022_list = []
adt2023_list = []
import_process = 0
if import_process:
    # Arrival/Departure times 2022
    adt_2022 = os.path.join(csv_path, (foldername + '_' + '2022'))
    csv2022_files = os.listdir(adt_2022)
    # Arrival/Departure times 2023
    adt_2023 = os.path.join(csv_path, (foldername + '_' + '2023'))
    csv2023_files = os.listdir(adt_2023)
    
    for i in range(12):
        print(i)
        adt2022_list.append(pd.read_csv(os.path.join(adt_2022, csv2022_files[i]), sep=','))
        adt2023_list.append(pd.read_csv(os.path.join(adt_2023, csv2023_files[i]), sep=','))

    # Build a single dataframe
    adt_df = pd.concat((pd.concat(adt2023_list, axis = 0), pd.concat(adt2022_list, axis = 0)), axis = 0)
    # Keep only the rows whose service_date is within the range of the scheduled df
    feed_info = gtfsSchedule['feed_info']
    start_date = pd.to_datetime(feed_info.feed_start_date.values, format='%Y%m%d')
    end_date = pd.to_datetime(feed_info.feed_end_date.values, format='%Y%m%d')
    adt_df['service_date'] = pd.to_datetime(adt_df['service_date'], format='%Y-%m-%d')
    adt_df = adt_df.loc[adt_df.service_date >= start_date[0]]
    adt_df = adt_df.loc[adt_df.service_date <= end_date[0]]
    adt_df.reset_index()
    # Replace wrong SL3 id with the correct one
    adt_df.loc[adt_df.route_id=='746_', 'route_id'] = '746'
    # Use the routes gtfs file to match route_ids in the adt dataframe with their univocal identifier
    routes = gtfsSchedule['routes']
    adt_df = pd.merge(adt_df, routes[['route_id', 'route_short_name']], on='route_id')
    # Change the dtype of the columns included in the colTypes to their corresponding values
    for key, value in colTypes.items():
        adt_df[key] = adt_df[key].astype(value)
else:
    # Read the file in separate chunks and concatenate them
    chunk_size = 10**6
    chunks = []
    dtype_map = {
    "service_date": "string",
    "route_id": "string",
    "direction_id": "category",
    "half_trip_id": "string",
    "stop_id": "string",
    "time_point_id": "category", 
    "time_point_order": pd.Int16Dtype(),
    "point_type": "category", 
    "standard_type": "category",  
    "scheduled": "string",  # Consider converting to datetime later
    "actual": "string",  # Consider converting to datetime later
    "scheduled_headway": pd.Int32Dtype(),
    "headway": pd.Int32Dtype(),
    "route_short_name": "category"
    }
    for chunk in pd.read_csv('adt_df.csv', dtype=dtype_map, chunksize=chunk_size):
        chunks.append(chunk)
    adt_df = pd.concat(chunks, axis=0, ignore_index=True)
    # Convert service_date, scheduled and actual columns to datetime objects
    adt_df['scheduled'] = pd.to_datetime(adt_df['scheduled'], format='ISO8601')
    adt_df['actual'] = pd.to_datetime(adt_df['actual'], format='ISO8601')
    adt_df["scheduled"] = adt_df["scheduled"].dt.strftime("%H:%M:%S")
    adt_df["actual"] = adt_df["actual"].dt.strftime("%H:%M:%S")
    # If half_trip_id endswith '.0', trim this piece
    adt_df['half_trip_id'] = adt_df['half_trip_id'].str.replace('.0', '', regex=False)


In [None]:
groups = adt_df.groupby(
    ['service_date','route_id', 'direction_id']
)

# Print first group and then break the for loop
for name, group in groups:
    group = group.sort_values(by='scheduled', ascending = True)
    print(group.head())
    break

In [86]:
gtfsSchedule['trips'].loc[gtfsSchedule['trips'].route_id == '1'].service_id.unique()    

array(['BUS123-3-Wdy-02', 'BUS123-4-Wdy-02', 'BUS123-5-Wdy-02',
       'BUS123-6-Wdy-02', 'ChristmasDay(Observed)-1', 'ChristmasDay-1',
       'MartinLutherKingDay-1', 'WinterSaturday', 'WinterSunday'],
      dtype=object)