In [1]:
import os
# NOTE(review): PYDEVD_WARN_SLOW_RESOLVE_TIMEOUT is read by the pydevd debugger;
# presumably this raises its "slow to resolve/repr a variable" warning threshold
# to 2 seconds so large objects do not flood the output with warnings.
# Confirm against the pydevd version in use. The value must be a string because
# os.environ only accepts strings.
os.environ['PYDEVD_WARN_SLOW_RESOLVE_TIMEOUT'] = '2'

In [2]:
# Import libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Opt in to pandas' future "no silent downcasting" behaviour, which also
# prevents the related FutureWarning from being emitted.
pd.set_option('future.no_silent_downcasting', True)
import datetime as dt
# NOTE(review): the wildcard imports below hide where helper names come from.
# Names used later in this notebook without a visible definition (e.g.
# get_gtfs_post_rating_txt_files, map_realtime_to_gtfs_schedule, gtfs_cols,
# adt_dtype_map) presumably come from these two project modules; consider
# replacing the wildcards with explicit imports once the origin of each name
# has been confirmed.
from functions import *
from dtype_dictionaries import *

### Constants

In [3]:
# Entries of the post-rating GTFS schedule folder, in alphabetical order.
# The ordering matters: the import loop pairs these entries by position with
# the parsed arrival/departure files listed below.
gtfs_post_rating_files = sorted(os.listdir('gtfsSchedule'))

# GTFS text files that are actually needed later (avoids importing the whole folder)
txt_list = [
    'calendar.txt',
    'calendar_attributes.txt',
    'calendar_dates.txt',
    'feed_info.txt',
    'routes.txt',
    'stop_times.txt',
    'stops.txt',
    'trips.txt',
]

# Folder holding the raw MBTA arrival/departure time files
ArrDepFolder = 'MBTA_ArrivalDepartureTimes'
# Folder holding the already-parsed arrival/departure files
parsed_ArrDepFolder = 'parsed_ArrivalDepartureFiles'
# Entries of the parsed folder, in alphabetical order
parsed_gtfs_rt = sorted(os.listdir(parsed_ArrDepFolder))

# Number of rows per chunk when reading CSV files with pandas (12**5 = 248,832)
chunk_size = 12 ** 5

# Accumulators filled while looping over the feeds:
#   calendar_service_map - calendar days and related services, later turned into a dataframe
#   df_ArrDep_list       - dataframes with the gtfs rt data, to be concatenated
#   gtfs_schedules_list  - gtfs schedules, to be concatenated
calendar_service_map, df_ArrDep_list, gtfs_schedules_list = [], [], []

# feed_start_date / feed_end_date of every processed feed
start_date_list, end_date_list = [], []

# True  -> read the raw arrival/departure files and export one parsed CSV per feed
# False -> import the already-parsed CSV files
import_separate_files = False

# Containers for the unmatched names and groups
unmatched_names, unmatched_groups = [], []

# Variables we wish to group by
grouping_vars = ['direction_id', 'scheduled']

### Import files with arrival and departure times

In [4]:
# Import the arrival/departure data feed by feed.
#
# For every post-rating GTFS feed (starting at position `first_feed_idx` of the
# sorted folder listing) this cell:
#   1. loads the GTFS text files and builds the calendar -> service schedule;
#   2. either (import_separate_files=True) reads the raw arrival/departure files
#      in chunks and exports one parsed CSV per feed, or (False) reads the
#      already-parsed CSV and maps it onto the GTFS schedule;
#   3. stores the per-feed dataframe in df_ArrDep_list.
# After the loop, all per-feed dataframes are concatenated into `adt_df` and all
# calendar schedules into `calendar_df`.

# Position of the first feed to process; earlier entries of the listing are skipped.
first_feed_idx = 4

# Reset the accumulators so that re-running this cell does not duplicate entries.
gtfs_schedules_list = []
start_date_list = []
end_date_list = []
calendar_service_map = []
df_ArrDep_list = []

for idx, postRatingRecap_file in enumerate(gtfs_post_rating_files[first_feed_idx:], start=first_feed_idx):
    # Chunks belonging to the current feed only
    adt_list = []
    print(f'Processing {postRatingRecap_file}...')
    gtfs_post_rating_folder = os.path.join('gtfsSchedule', postRatingRecap_file)
    calendar, calendar_attributes, calendar_dates, feed_info, routes, stop_times, stops, trips, gtfs_schedule = get_gtfs_post_rating_txt_files(gtfs_post_rating_folder, txt_list, gtfs_cols)
    # Add schedule to the list of gtfs_schedules
    gtfs_schedules_list.append(gtfs_schedule)
    # Save feed_start_date and feed_end_date
    start_date = feed_info['feed_start_date'][0]
    start_date_list.append(start_date)
    end_date = feed_info['feed_end_date'][0]
    end_date_list.append(end_date)
    # Build the day-by-day service schedule for the feed validity range
    calendar_data = parse_calendar_file(calendar)
    calendar_data = parse_calendar_dates_file(calendar_dates, calendar_data)
    calendar_schedule = generate_schedule(start_date, end_date, calendar_data)
    calendar_service_map.append(calendar_schedule)

    if import_separate_files:
        # Return the list of compatible files
        compatibleFiles = get_compatible_files(ArrDepFolder, start_date, end_date)
        for filename in compatibleFiles:
            print(f'Importing {filename}...')
            for chunk in pd.read_csv(filename, chunksize=chunk_size, dtype=adt_dtype_map, low_memory=False):
                # Carry out here any filtering, drop or cutting down operation
                chunk = reduce_df_size(chunk)
                chunk = adjust_adt_df_settings(chunk, routes, start_date, end_date)
                adt_list.append(chunk)

        # pd.concat raises on an empty list: skip feeds without any data instead.
        if not adt_list:
            print(f'No arrival/departure data found for {postRatingRecap_file}, skipping...')
            continue

        print('Concatenating...')
        feed_adt_df = pd.concat(adt_list, axis=0, ignore_index=True)
        # Save the dataframe to a csv file named '<feed_start_date>_<feed_end_date>.csv'
        export_filename = start_date.strftime('%Y%m%d') + '_' + end_date.strftime('%Y%m%d') + '.csv'
        filepath = os.path.join(parsed_ArrDepFolder, export_filename)
        print(f'Exporting {filepath}...')
        feed_adt_df.to_csv(filepath, index=False)
        # Keep the per-feed dataframe so the final concatenation covers every feed
        df_ArrDep_list.append(feed_adt_df)

    else:
        # The parsed files are paired with the feeds by position in the two
        # sorted listings, so both folders must contain matching entries.
        parsed_file = parsed_gtfs_rt[idx]
        print(f'Importing {parsed_file}...')
        filepath = os.path.join(parsed_ArrDepFolder, parsed_file)
        for chunk in pd.read_csv(filepath, chunksize=chunk_size, dtype=adt_dtype_map, low_memory=False):
            chunk['service_date'] = pd.to_datetime(chunk['service_date'], format='%Y-%m-%d')
            # Time-only strings: pandas anchors them to the default date 1900-01-01
            chunk['scheduled'] = pd.to_datetime(chunk['scheduled'], format='%H:%M:%S')
            chunk['actual'] = pd.to_datetime(chunk['actual'], format='%H:%M:%S')
            adt_list.append(chunk)

        # pd.concat raises on an empty list: skip feeds without any data instead.
        if not adt_list:
            print(f'No arrival/departure data found in {parsed_file}, skipping...')
            continue

        # Concatenate the chunks of this feed
        adt_df = pd.concat(adt_list, axis=0, ignore_index=True)
        # Map realtime data to gtfs schedule to assign block_ids and service_ids
        adt_df = map_realtime_to_gtfs_schedule(adt_df, start_date, end_date, calendar_schedule, gtfs_schedule)
        # Keep the mapped dataframe: previously it was discarded and only the raw
        # chunks of the last feed survived the loop.
        df_ArrDep_list.append(adt_df)

print('Concatenating...')
# Single dataframe with the data of every processed feed
adt_df = pd.concat(df_ArrDep_list, axis=0, ignore_index=True)
# Build the calendar_service_map dataframe:
# convert every element in calendar_service_map to a dataframe and stack them
calendar_df = pd.concat(
    [pd.DataFrame(service_map, columns=['date', 'day_of_week', 'service_ids']) for service_map in calendar_service_map],
    axis=0,
    ignore_index=True,
)

Processing 05_gtfs_2022-12-18_2023-03-11_Winter2023PostRecap...
Importing 20221218_20230311.csv...
Processing route 1...
Feed start date: 2022-12-18 00:00:00, Feed end date: 2023-03-11 00:00:00


Length: 14
Categories (144, object): ['BUS123-1-Wdy-02', 'BUS123-2-Wdy-02', 'BUS123-3-Wdy-02', 'BUS123-4-Wdy-02', ..., 'SPR22-S-Su-78', 'WinterSaturday', 'WinterSunday', 'WinterWeekday']' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.loc[merged.index, 'service_id'] = merged['service_id_y']
 list(['C01-14']) list(['C01-14']) list(['C01-14']) list(['C01-14'])
 list(['C01-14']) list(['C01-14']) list(['C01-14']) list(['C01-14'])
 list(['C01-14']) list(['C01-14'])]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.loc[merged.index, 'block_id'] = merged['block_id_y']


Processing route 8...
Feed start date: 2022-12-18 00:00:00, Feed end date: 2023-03-11 00:00:00
Processing route 9...
Feed start date: 2022-12-18 00:00:00, Feed end date: 2023-03-11 00:00:00
Processing route 10...
Feed start date: 2022-12-18 00:00:00, Feed end date: 2023-03-11 00:00:00
Processing route 100...
Feed start date: 2022-12-18 00:00:00, Feed end date: 2023-03-11 00:00:00
Processing route 101...
Feed start date: 2022-12-18 00:00:00, Feed end date: 2023-03-11 00:00:00
Processing route 104...
Feed start date: 2022-12-18 00:00:00, Feed end date: 2023-03-11 00:00:00
Processing route 105...
Feed start date: 2022-12-18 00:00:00, Feed end date: 2023-03-11 00:00:00
Processing route 106...
Feed start date: 2022-12-18 00:00:00, Feed end date: 2023-03-11 00:00:00
Processing route 108...
Feed start date: 2022-12-18 00:00:00, Feed end date: 2023-03-11 00:00:00
Processing route 109...
Feed start date: 2022-12-18 00:00:00, Feed end date: 2023-03-11 00:00:00
Processing route 11...
Feed start d

Length: 17
Categories (214, object): ['BUS223-1-Wdy-02', 'BUS223-2-Wdy-02', 'BUS223-3-Sa-02', 'BUS223-4-Wdy-02', ..., 'SpringWeekday', 'SummerSaturday', 'SummerWeekday', 'canonical']' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.loc[merged.index, 'service_id'] = merged['service_id_y']
 list(['C01-14']) list(['C01-14']) list(['C01-14']) list(['C01-14'])
 list(['C01-14']) list(['C01-14']) list(['C01-14']) list(['C01-14'])
 list(['C01-14']) list(['C01-14']) list(['C01-14']) list(['C01-14'])
 list(['C01-14'])]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.loc[merged.index, 'block_id'] = merged['block_id_y']


Processing route 8...
Feed start date: 2023-03-12 00:00:00, Feed end date: 2023-07-01 00:00:00
Processing route 9...
Feed start date: 2023-03-12 00:00:00, Feed end date: 2023-07-01 00:00:00
Route: 9
 Length of indexes for group ('0', Timestamp('1900-01-01 16:43:00')) is different: 78 vs 79
Processing route 10...
Feed start date: 2023-03-12 00:00:00, Feed end date: 2023-07-01 00:00:00
Processing route 100...
Feed start date: 2023-03-12 00:00:00, Feed end date: 2023-07-01 00:00:00
Processing route 101...
Feed start date: 2023-03-12 00:00:00, Feed end date: 2023-07-01 00:00:00
Processing route 104...
Feed start date: 2023-03-12 00:00:00, Feed end date: 2023-07-01 00:00:00
Processing route 105...
Feed start date: 2023-03-12 00:00:00, Feed end date: 2023-07-01 00:00:00
Processing route 106...
Feed start date: 2023-03-12 00:00:00, Feed end date: 2023-07-01 00:00:00
Processing route 108...
Feed start date: 2023-03-12 00:00:00, Feed end date: 2023-07-01 00:00:00
Processing route 109...
Feed st

Categories (120, object): ['BUS323-hbb33j47-Su-02', 'BUS323-hbc33j47-Su-02', 'BUS323-hbg33j47-Su-02', 'BUS323-hbl33j47-Su-02', ..., 'SprSnd', 'SprStd', 'SprWkd', 'canonical']' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.loc[merged.index, 'service_id'] = merged['service_id_y']
 list(['C01-14']) list(['C01-14']) list(['C01-14']) list(['C01-14'])
 list(['C01-14'])]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.loc[merged.index, 'block_id'] = merged['block_id_y']


Processing route 8...
Feed start date: 2023-07-02 00:00:00, Feed end date: 2023-08-26 00:00:00
Processing route 9...
Feed start date: 2023-07-02 00:00:00, Feed end date: 2023-08-26 00:00:00
Processing route 10...
Feed start date: 2023-07-02 00:00:00, Feed end date: 2023-08-26 00:00:00
Processing route 100...
Feed start date: 2023-07-02 00:00:00, Feed end date: 2023-08-26 00:00:00
Processing route 101...
Feed start date: 2023-07-02 00:00:00, Feed end date: 2023-08-26 00:00:00
Processing route 104...
Feed start date: 2023-07-02 00:00:00, Feed end date: 2023-08-26 00:00:00
Processing route 105...
Feed start date: 2023-07-02 00:00:00, Feed end date: 2023-08-26 00:00:00
Processing route 106...
Feed start date: 2023-07-02 00:00:00, Feed end date: 2023-08-26 00:00:00
Processing route 108...
Feed start date: 2023-07-02 00:00:00, Feed end date: 2023-08-26 00:00:00
Processing route 109...
Feed start date: 2023-07-02 00:00:00, Feed end date: 2023-08-26 00:00:00
Processing route 11...
Feed start d

Length: 18
Categories (253, object): ['BUS42023-hbb43hl6-Saturday-02', 'BUS42023-hbc43tp6-Saturday-02', 'BUS42023-hbg43tp6-Saturday-02', 'BUS42023-hbl43hl6-Saturday-02', ..., 'SummerWeekday', 'ThanksgivingDay', 'VeteransDay(Observed)-DfrThgD', 'canonical']' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.loc[merged.index, 'service_id'] = merged['service_id_y']
 list(['C01-13']) list(['C01-13']) list(['C01-13']) list(['C01-13'])
 list(['C01-13']) list(['C01-13']) list(['C01-13']) list(['C01-13'])
 list(['C01-13']) list(['C01-13']) list(['C01-12']) list(['C01-13'])
 list(['C01-13']) list(['C01-13'])]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.loc[merged.index, 'block_id'] = merged['block_id_y']


Processing route 8...
Feed start date: 2023-08-27 00:00:00, Feed end date: 2023-12-16 00:00:00
Processing route 9...
Feed start date: 2023-08-27 00:00:00, Feed end date: 2023-12-16 00:00:00
Processing route 10...
Feed start date: 2023-08-27 00:00:00, Feed end date: 2023-12-16 00:00:00
Processing route 100...
Feed start date: 2023-08-27 00:00:00, Feed end date: 2023-12-16 00:00:00
Processing route 101...
Feed start date: 2023-08-27 00:00:00, Feed end date: 2023-12-16 00:00:00
Processing route 104...
Feed start date: 2023-08-27 00:00:00, Feed end date: 2023-12-16 00:00:00
Processing route 105...
Feed start date: 2023-08-27 00:00:00, Feed end date: 2023-12-16 00:00:00
Processing route 106...
Feed start date: 2023-08-27 00:00:00, Feed end date: 2023-12-16 00:00:00
Processing route 108...
Feed start date: 2023-08-27 00:00:00, Feed end date: 2023-12-16 00:00:00
Processing route 109...
Feed start date: 2023-08-27 00:00:00, Feed end date: 2023-12-16 00:00:00
Processing route 11...
Feed start d

 'WinterSunday' 'WinterSunday' 'WinterSunday' 'WinterSunday'
 'WinterSunday' 'WinterSunday' 'WinterSunday' 'WinterSunday'
 'WinterSunday' 'WinterSunday' 'WinterSunday' 'WinterSunday'
 'WinterSunday']' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.loc[merged.index, 'service_id'] = merged['service_id_y']
 list(['C01-11']) list(['C01-11']) list(['C01-11']) list(['C01-11'])
 list(['C01-11']) list(['C01-11']) list(['C01-11']) list(['C01-11'])
 list(['C01-11']) list(['C01-11']) list(['C01-11']) list(['C01-11'])
 list(['C01-11']) list(['C01-11'])]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.loc[merged.index, 'block_id'] = merged['block_id_y']


Processing route 8...
Feed start date: 2023-12-17 00:00:00, Feed end date: 2024-04-06 00:00:00
Processing route 9...
Feed start date: 2023-12-17 00:00:00, Feed end date: 2024-04-06 00:00:00
Processing route 10...
Feed start date: 2023-12-17 00:00:00, Feed end date: 2024-04-06 00:00:00
Route: 10
 Length of indexes for group ('0', Timestamp('1900-01-01 17:35:00')) is different: 18 vs 21
Processing route 100...
Feed start date: 2023-12-17 00:00:00, Feed end date: 2024-04-06 00:00:00
Processing route 101...
Feed start date: 2023-12-17 00:00:00, Feed end date: 2024-04-06 00:00:00
Processing route 104...
Feed start date: 2023-12-17 00:00:00, Feed end date: 2024-04-06 00:00:00
Processing route 105...
Feed start date: 2023-12-17 00:00:00, Feed end date: 2024-04-06 00:00:00
Processing route 106...
Feed start date: 2023-12-17 00:00:00, Feed end date: 2024-04-06 00:00:00
Processing route 108...
Feed start date: 2023-12-17 00:00:00, Feed end date: 2024-04-06 00:00:00
Processing route 109...
Feed s