In [None]:
import os
# NOTE(review): presumably raises the threshold (in seconds) after which the pydevd debugger
# warns about slow variable resolution - confirm against the pydevd documentation.
os.environ['PYDEVD_WARN_SLOW_RESOLVE_TIMEOUT'] = '2'

In [41]:
# Import libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Opt in to the future pandas behavior (no silent downcasting) to prevent the FutureWarning
pd.set_option('future.no_silent_downcasting', True)
import datetime as dt
# NOTE(review): the two star imports below hide where names come from. The helpers used later
# in this notebook but not defined in it (parse_datetime_strings, reduce_df_size,
# adjust_adt_df_settings, import_calendar_csv, split_multiple_block_id) presumably come from
# `functions`, and the dtype maps (gtfs_cols, adt_dtype_map) presumably from
# `dtype_dictionaries` - confirm, and consider importing them explicitly.
from functions import *
from dtype_dictionaries import *

### Import and process the GTFS schedule

In [None]:
# Locate the GTFS schedule feed and list the tables that will be parsed as dataframes
# Folder (inside gtfsSchedule) that holds the *.txt tables of the feed
txt_path = os.path.join('gtfsSchedule','gtfs_2022-12-18_2023-03-11_Winter2023PostRecap')
# List only the files needed later, to avoid importing the whole folder
txt_list = (['calendar.txt', 'calendar_attributes.txt', 'calendar_dates.txt', 'feed_info.txt', 'routes.txt', 'stop_times.txt', 'stops.txt', 'trips.txt'])
# List of dataframe names: the filenames without their extension.
# os.path.splitext removes exactly the extension, whereas str.rstrip('.txt') strips every
# trailing '.', 't' or 'x' character (e.g. 'timepoint.txt' would become 'timepoin').
df_names = [os.path.splitext(txt_file)[0] for txt_file in txt_list]
# Parse every listed table into a dataframe keyed by its name; the dtypes of each table are
# looked up in gtfs_cols under the same name as the dataframe
gtfsSchedule = {
    df_name: pd.read_csv(os.path.join(txt_path, gtfs_file), sep=',', low_memory=False, dtype=gtfs_cols[df_name])
    for df_name, gtfs_file in zip(df_names, txt_list)
}
# Keep the plain list of dataframes as well, in the same order as df_names
dfs = list(gtfsSchedule.values())

# Expose the tables as individual variables
calendar = gtfsSchedule['calendar']
calendar_attributes = gtfsSchedule['calendar_attributes']
calendar_dates = gtfsSchedule['calendar_dates']
feed_info = gtfsSchedule['feed_info']
routes = gtfsSchedule['routes']
stops = gtfsSchedule['stops']
trips = gtfsSchedule['trips']
stop_times = gtfsSchedule['stop_times']

# Highest stop_sequence of each trip, repeated on every row of that trip:
# transform keeps the original row index, so the result aligns with stop_times
end_points_df = stop_times.groupby(['trip_id'], observed=True, as_index=False)['stop_sequence'].transform('max')
# Keep only the rows at the last stop of a trip or with stop_sequence equal to 1
is_last_stop = stop_times['stop_sequence'] == end_points_df
is_first_stop = stop_times['stop_sequence'] == 1
stop_times = stop_times.loc[is_last_stop | is_first_stop]
# Drop arrival_time and rename departure_time to scheduled
stop_times = (
    stop_times
    .drop(columns=['arrival_time'])
    .rename(columns={'departure_time': 'scheduled'})
)

# Restrict routes and trips to bus routes, i.e., those whose route_id is a string of digits
bus_routes_mask = routes['route_id'].str.isdigit()
bus_trips_mask = trips['route_id'].str.isdigit()
routes = routes[bus_routes_mask]
trips = trips[bus_trips_mask]

# Keep the unconverted feed start/end values before the columns are turned into datetimes
start_date_str = feed_info['feed_start_date'].values[0]
end_date_str = feed_info['feed_end_date'].values[0]

# Convert the YYYYMMDD date columns of the GTFS tables to proper datetime objects
gtfs_date_columns = [
    (calendar, 'start_date'),
    (calendar, 'end_date'),
    (calendar_attributes, 'rating_start_date'),
    (calendar_attributes, 'rating_end_date'),
    (feed_info, 'feed_start_date'),
    (feed_info, 'feed_end_date'),
    (calendar_dates, 'date'),
]
for gtfs_table, date_column in gtfs_date_columns:
    gtfs_table[date_column] = pd.to_datetime(gtfs_table[date_column], format='%Y%m%d')
# Inner-merge trips with the service_id column of calendar_attributes: only the key is taken
# from calendar_attributes, so no other field of that table is added to trips
trip_columns = ['service_id','trip_id', 'route_id', 'direction_id', 'block_id']
trips = trips[trip_columns].merge(calendar_attributes[['service_id']], on='service_id')
# Attach the retained stop_times rows to every trip (left merge keeps all trips)
stop_time_columns = ['trip_id','scheduled','stop_id','stop_sequence']
schedule = trips.merge(stop_times[stop_time_columns], on='trip_id', how='left')
schedule = parse_datetime_strings(schedule)

In [None]:
# Build the list of all the files included in the subfolders of ArrDepFolder.
# The next step filters it down to the months compatible with feed_start_date and feed_end_date.
ArrDepFolder = 'MBTA_ArrivalDepartureTimes'
# Treat only directories as yearly folders (a stray file would make os.listdir fail on it) and
# sort the names, because os.listdir returns entries in arbitrary order
yearlyFolders = sorted(
    entry for entry in os.listdir(ArrDepFolder)
    if os.path.isdir(os.path.join(ArrDepFolder, entry))
)
numYears = len(yearlyFolders)
file_list = []
for year_folder in yearlyFolders:
    files_path = os.path.join(ArrDepFolder, year_folder)
    # List each folder once, instead of once per file, and in a reproducible order
    for month_file in sorted(os.listdir(files_path)):
        file_list.append(os.path.join(files_path, month_file))

# This list will be used to filter the files to be imported: it keeps only the files whose
# year and month (taken from the file name) lie within the feed validity period
compatibleFiles = []
start_date = feed_info['feed_start_date'][0]
end_date = feed_info['feed_end_date'][0]
# Apply the control only to year and month by comparing (year, month) pairs. Building
# dt.date(..., day=31) for the end month raises ValueError whenever the feed ends in a month
# with fewer than 31 days.
first_month = (start_date.year, start_date.month)
last_month = (end_date.year, end_date.month)

for file in file_list:
    # Extract the date from the filename: the last '_'-separated token, without extension
    # (assumes names ending in '_YYYY-MM.<ext>'). The base name is used so that underscores
    # in the folder names cannot be mistaken for the separator.
    date_str = os.path.basename(file).split('_')[-1].split('.')[0]
    try:
        # Convert the date to a datetime object
        date = dt.datetime.strptime(date_str, '%Y-%m')
    except ValueError:
        # Not a file named after a month (e.g. a hidden system file): skip it, but say so
        print(f'Skipping {file}: no YYYY-MM date found in its name')
        continue
    if first_month <= (date.year, date.month) <= last_month:
        compatibleFiles.append(file)

### Import files with arrival and departure times

In [38]:
# Chunks read from disk are collected here and concatenated once at the end
adt_list = []
chunk_size = 10**5

# Read files or import table?
# True: rebuild the table from the separate files in compatibleFiles and save it as a csv.
# False: load the csv saved by a previous run.
import_separate_files = False

# The saved table is named '<feed_start_date>_<feed_end_date>.csv'
filename = start_date_str + '_' + end_date_str + '.csv'
filepath = os.path.join('parsed_ArrivalDepartureFiles', filename)

if import_separate_files:
    # A dedicated loop variable keeps `filename`/`filepath` pointing at the output csv
    for raw_file in compatibleFiles:
        print(f'Importing {raw_file}...')
        for chunk in pd.read_csv(raw_file, chunksize=chunk_size, dtype=adt_dtype_map, low_memory=False):
            # Carry out here any filtering, drop or cutting down operation
            adt_list.append(reduce_df_size(chunk))

    print('Concatenating...')
    # ignore_index: the row labels restart in every file, and the later steps write back
    # through adt_df.loc[<labels>], which needs unique labels (same as the else branch)
    adt_df = pd.concat(adt_list, axis=0, ignore_index=True)
    # Per the original note, this filters out the records outside the feed_start_date /
    # feed_end_date range (helper defined outside this notebook - confirm its other effects)
    adt_df = adjust_adt_df_settings(adt_df, routes, start_date, end_date)
    # Save the dataframe to the csv file, creating the output folder if needed
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    adt_df.to_csv(filepath, index=False)
else:
    adt_list.extend(pd.read_csv(filepath, chunksize=chunk_size, dtype=adt_dtype_map, low_memory=False))
    adt_df = pd.concat(adt_list, axis=0, ignore_index=True)
    # Convert service_date, scheduled and actual columns to datetime objects
    adt_df['service_date'] = pd.to_datetime(adt_df['service_date'], format='%Y-%m-%d')
    adt_df['scheduled'] = pd.to_datetime(adt_df['scheduled'], format='%H:%M:%S')
    adt_df['actual'] = pd.to_datetime(adt_df['actual'], format='%H:%M:%S')

### Combine arrival and departure times with scheduled information

In [None]:
# Import the calendar_df with only the feasible service_ids for every service date
calendar_df = import_calendar_csv('CalendarDates', 'calendar_df.csv')

# Work with only Startpoint records in arrival departure times, will merge with Endpoint records later.
# A group is matched to the schedule only when its (route_id, direction_id, actual) key is
# exactly equal to a (route_id, direction_id, scheduled) key of a first-stop schedule record.
adt_grouped = adt_df.loc[adt_df.point_type == 'Startpoint'].groupby(['route_id', 'direction_id', 'actual'], observed=True)
schedule_grouped = schedule.loc[schedule.stop_sequence == 1].groupby(['route_id', 'direction_id', 'scheduled'], observed=True)
schedule_group_keys = schedule_grouped.groups

# Create two lists to store the unmatched names and groups
unmatched_names = []
unmatched_groups = []

for name, group in adt_grouped:
    if name not in schedule_group_keys:
        unmatched_names.append(name)
        unmatched_groups.append(group)
        continue

    # extract the corresponding group from schedule_grouped
    schedule_group = schedule_grouped.get_group(name)
    schedule_services = set(schedule_group['service_id'])
    schedule_service_block_ids = schedule_group.groupby(['service_id'])['block_id'].apply(list)

    # extract the subset of the calendar_df that matches the service dates of the group
    service_days = calendar_df.loc[calendar_df.date.isin(group.service_date)]
    for _, row in service_days.iterrows():
        # Only the records observed on this service date receive its service ids. Writing to
        # the whole group would let every day overwrite the previous one, so all the records
        # would end up with the values of the last day.
        day_index = group.index[group.service_date == row['date']]
        # intersection between the scheduled services and the services feasible on this date
        # (assumes row['service_ids'] is an iterable of service ids - confirm)
        adt_service_ids = schedule_services.intersection(row['service_ids'])
        # sorted: a set has no stable order, so the joined string would differ between runs
        adt_service_ids_str = ', '.join(sorted(adt_service_ids))
        adt_df.loc[day_index, 'service_id'] = adt_service_ids_str
        # Get the block_id list associated to adt_service_ids_str (found only when the
        # string is a single service_id present in the schedule group)
        block_list = schedule_service_block_ids[schedule_service_block_ids.index == adt_service_ids_str]
        block_ids = block_list.iloc[0] if not block_list.empty else []
        if block_ids:
            # Store the block ids as one comma-separated string
            adt_df.loc[day_index, 'block_id'] = ', '.join(block_ids)

print('Split Blocks')
adt_df = split_multiple_block_id(adt_df)
print('Assign Blocks to endpoints')


In [None]:
# Copy block_id and service_id from the first record of every half_trip_id onto its second record
# NOTE(review): this relies on row order - presumably the Startpoint comes first and the
# Endpoint second; confirm
copied_columns = ['block_id', 'service_id']
trips_grouped = adt_df.groupby(['half_trip_id'], observed=True)
for name, group in trips_grouped:
    if len(group.index) <= 1:
        # A single record has no counterpart to copy to: show it for inspection
        print(name)
        print(group)
        continue
    first_label, second_label = group.index[0], group.index[1]
    adt_df.loc[second_label, copied_columns] = group.loc[first_label, copied_columns].values

In [None]:
# Same propagation as the preceding cell: within every half_trip_id, the block_id and
# service_id of the first record are written onto the second record
trips_grouped = adt_df.groupby(['half_trip_id'], observed=True)
for name, group in trips_grouped:
    row_labels = group.index
    has_pair = len(row_labels) > 1
    if not has_pair:
        # Nothing to copy to: print the half trip and its only record
        print(name)
        print(group)
    else:
        source_values = group.loc[row_labels[0], ['block_id', 'service_id']].values
        adt_df.loc[row_labels[1], ['block_id', 'service_id']] = source_values

In [None]:
# Assign the same values of block_id and service_id to Endpoint records with the same half_trip_id
trips_grouped = adt_df.groupby(['half_trip_id'], observed=True)
for name, group in trips_grouped:
    # A half trip with a single record has no second row: group.index[1] would raise
    # IndexError, so skip it
    if len(group.index) < 2:
        continue
    adt_df.loc[group.index[1], ['block_id', 'service_id']] = group.loc[group.index[0], ['block_id', 'service_id']].values

### Compute layover

In [None]:
# Create layover_df as a copy
# NOTE(review): adt_route10 is not defined anywhere in the visible notebook, so this cell
# fails on a fresh kernel - presumably a leftover name for (a subset of) adt_df; confirm
layover_df = adt_route10.copy()
layover_df = layover_df.sort_values(by=['block_id','service_date','half_trip_id','departure_time'])
layover_df=layover_df.reset_index(drop=True)

# Group by 'block_id' and 'service_date', skipping rows with null 'block_id'
grouped = layover_df.loc[layover_df.block_id.notna()].groupby(['block_id', 'service_date'])

# Calculate theoretical and actual layover times (in minutes) using diff() between
# consecutive records of the same group
layover_df['theoretical_layover'] = grouped['departure_time'].diff().dt.total_seconds() / 60
layover_df['actual_layover'] = grouped['actual'].diff().dt.total_seconds() / 60

# The first row of each group has no previous record: set its layovers to 0
layover_df.loc[grouped.head(1).index, ['theoretical_layover', 'actual_layover']] = 0
# Layovers are kept only on the records with time_point_order equal to 1
layover_df.loc[layover_df.time_point_order != 1, ['theoretical_layover', 'actual_layover']] = np.nan
layover_df = layover_df.drop(columns=['time_point_order', 'point_type', 'standard_type'])
# Summary per block and service date. The groupby is not narrowed to the two layover columns:
# the mapping also aggregates 'stop_id', and asking for a column outside the selection raises
# KeyError.
(layover_df.groupby(['block_id', 'service_date'])
            .agg({'theoretical_layover': ['mean', 'max', 'count'],
                  'actual_layover': ['mean', 'max', 'count'],
                  'stop_id': 'first'})
)

In [None]:
# NOTE(review): scratch cell. adt_route10 is not defined in the visible notebook, and
# get_group is referenced without being called, so startpoint_df holds the bound method
# rather than a dataframe - a group key is presumably missing; confirm the intent.
test = adt_route10.groupby(['half_trip_id'])
startpoint_df = test.get_group