In [None]:
import os

# Debugger tuning: the variable name suggests it raises pydevd's "slow resolve"
# warning threshold to 2 seconds — TODO confirm against the pydevd documentation.
os.environ.update({'PYDEVD_WARN_SLOW_RESOLVE_TIMEOUT': '2'})

In [None]:
# Import libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Set the option to prevent the FutureWarning
pd.set_option('future.no_silent_downcasting', True)
import datetime as dt
# NOTE(review): the two star imports below presumably supply the helpers and
# constants that later cells call without a prefix (e.g.
# get_gtfs_post_rating_txt_files, gtfs_cols, adt_dtype_map) — verify, and
# consider explicit imports so each name can be traced to its module.
from functions import *
from dtype_dictionaries import *

### Import files with arrival and departure times

In [None]:
# --- Input locations ---------------------------------------------------------
# Entries of the 'gtfsSchedule' folder (post-rating GTFS schedules), in
# alphabetical order. The ordering matters: the loop below caches the latest
# month read and pairs feeds with parsed files by position.
gtfs_post_rating_files = sorted(os.listdir('gtfsSchedule'))

# Only these GTFS text files are needed later; the rest of each folder is skipped
txt_list = [
    'calendar.txt',
    'calendar_attributes.txt',
    'calendar_dates.txt',
    'feed_info.txt',
    'routes.txt',
    'stop_times.txt',
    'stops.txt',
    'trips.txt',
]

# Folder with the MBTA_ArrivalDepartureTimes files
ArrDepFolder = 'MBTA_ArrivalDepartureTimes'
# Folder with the parsed ArrivalDepartureTimes files
parsed_ArrDepFolder = 'parsed_ArrivalDepartureFiles'
# Entries of the parsed folder, in alphabetical order
parsed_gtfs_rt = sorted(os.listdir(parsed_ArrDepFolder))

# --- Reading options ---------------------------------------------------------
# Number of CSV rows read per chunk
chunk_size = 100_000
# True: read the separate arrival/departure files; False: import the parsed tables
import_separate_files = False
# Flag meant to skip the first element of the compatibleFiles list after the first feed
first_round = 0

# --- Accumulators filled by the loading loop ---------------------------------
# Chunks of the most recent file read, kept in memory between feeds
latest_month_df_list = []
# Chunks of arrival/departure data
adt_list = []
# Calendar days and related services, to be converted to a dataframe
calendar_service_map = []
# Dataframes with all the gtfs rt data, to be concatenated
df_ArrDep_list = []
# gtfs schedules, to be concatenated
gtfs_schedules_list = []

# Load every post-rating GTFS feed and the arrival/departure records that go with it.
#
# For each entry of gtfs_post_rating_files the loop
#   1. reads the GTFS text files and stores the feed's calendar and schedule;
#   2. if import_separate_files is set, rebuilds the parsed CSV of the feed from
#      the separate arrival/departure files and writes it to parsed_ArrDepFolder;
#   3. reads the parsed CSV of the feed in chunks into adt_list.
# After the loop all chunks are concatenated into adt_df.
#
# Fixes compared with the previous version:
#   * the month carried over from the previous feed is no longer read a second
#     time (the old slice of compatibleFiles was overwritten right afterwards);
#   * rebuilding (import_separate_files=True) no longer ends with an empty
#     adt_list, which made the final pd.concat raise; the freshly written CSV is
#     read back so both modes feed the same data to the conversions below;
#   * the export path uses parsed_ArrDepFolder instead of a repeated literal.

# Path of the file whose reduced chunks are currently cached in latest_month_df_list
latest_month_file = None

for idx, postRatingRecap_file in enumerate(gtfs_post_rating_files):
    print(f'Processing {postRatingRecap_file}...')
    gtfs_post_rating_folder = os.path.join('gtfsSchedule', postRatingRecap_file)
    calendar, calendar_attributes, calendar_dates, feed_info, routes, stop_times, stops, trips, schedule = get_gtfs_post_rating_txt_files(gtfs_post_rating_folder, txt_list, gtfs_cols)
    # Save feed_start_date and feed_end_date
    start_date = feed_info['feed_start_date'][0]
    end_date = feed_info['feed_end_date'][0]
    calendar_data = parse_calendar_file(calendar)
    calendar_dates_data = parse_calendar_dates_file(calendar_dates)
    calendar_schedule = generate_schedule(start_date, end_date, calendar_data, calendar_dates_data)
    calendar_service_map.append(calendar_schedule)

    # Add schedule to the list of gtfs_schedules
    gtfs_schedules_list.append(schedule)

    if import_separate_files:
        # Files compatible with this feed's date range
        compatibleFiles = list(get_compatible_files(ArrDepFolder, start_date, end_date))
        raw_chunks = []
        files_to_read = compatibleFiles
        # Reuse the cached chunks only when the first compatible file really is
        # the file that was cached while processing the previous feed.
        if first_round and compatibleFiles and compatibleFiles[0] == latest_month_file:
            raw_chunks.extend(latest_month_df_list)
            files_to_read = compatibleFiles[1:]
        if files_to_read:
            # The last file read becomes the new cache; if nothing is left to
            # read, the cached file is also this feed's last one and stays cached.
            latest_month_file = files_to_read[-1]
            latest_month_df_list = []
        for filename in files_to_read:
            print(f'Importing {filename}...')
            for chunk in pd.read_csv(filename, chunksize=chunk_size, dtype=adt_dtype_map, low_memory=False):
                # Carry out here any filtering, drop or cutting down operation
                chunk = reduce_df_size(chunk)
                raw_chunks.append(chunk)
                if filename == latest_month_file:
                    latest_month_df_list.append(chunk)
        first_round = 1

        if not raw_chunks:
            raise ValueError(f'No arrival/departure data found for {postRatingRecap_file}')
        print('Concatenating...')
        adt_df = pd.concat(raw_chunks, axis=0)
        # Filter out all the records that lie outside the feed_start_date and feed_end_date range
        adt_df = adjust_adt_df_settings(adt_df, routes, start_date, end_date)
        # Save the dataframe to a csv file named '<feed_start_date>_<feed_end_date>.csv'
        export_filename = start_date.strftime('%Y%m%d') + '_' + end_date.strftime('%Y%m%d') + '.csv'
        filepath = os.path.join(parsed_ArrDepFolder, export_filename)
        adt_df.to_csv(filepath, index=False)
        parsed_file = export_filename
    else:
        # Feeds and parsed files are paired by position in the two sorted listings.
        # NOTE(review): a stray entry in either folder shifts the pairing — confirm
        # both folders contain exactly one entry per feed.
        if idx >= len(parsed_gtfs_rt):
            raise FileNotFoundError(f'No parsed arrival/departure file for {postRatingRecap_file} in {parsed_ArrDepFolder}')
        parsed_file = parsed_gtfs_rt[idx]
        filepath = os.path.join(parsed_ArrDepFolder, parsed_file)

    print(f'Importing {parsed_file}...')
    for chunk in pd.read_csv(filepath, chunksize=chunk_size, dtype=adt_dtype_map, low_memory=False):
        adt_list.append(chunk)

adt_df = pd.concat(adt_list, axis=0, ignore_index=True)
# Convert service_date, scheduled and actual columns to datetime objects
adt_df['service_date'] = pd.to_datetime(adt_df['service_date'], format='%Y-%m-%d')
adt_df['scheduled'] = pd.to_datetime(adt_df['scheduled'], format='%H:%M:%S')
adt_df['actual'] = pd.to_datetime(adt_df['actual'], format='%H:%M:%S')

In [None]:
# Tabulate the calendar entries collected while loading the feeds.
# NOTE(review): calendar_service_map receives one generate_schedule result per
# feed via append; if that result is itself a list of rows, this constructor
# sees nested lists — confirm the three-column layout is what comes out.
calendar_df = pd.DataFrame(
    data=calendar_service_map,
    columns=['date', 'day_of_week', 'service_ids'],
)
# Stack the per-feed schedules into one table with a fresh 0..n-1 index
df_gtfs_schedule = pd.concat(objs=gtfs_schedules_list, axis=0, ignore_index=True)

### Combine arrival and departure times with scheduled information

In [None]:
# Attach service_id and block_id to the arrival/departure records.
#
# Records are grouped by (route_id, direction_id, actual) and the schedule by
# (route_id, direction_id, scheduled); a record group is matched to the schedule
# group that has the same key. For every calendar day present in the record
# group, the services valid on that day are intersected with the services of the
# matching schedule group and written to that day's rows of adt_df.
#
# Fixes compared with the previous version:
#   * the two full-frame .copy() calls were overwritten immediately and are gone;
#   * results are written only to the rows of the service day being processed
#     (before, every day overwrote the whole group, so the last day won);
#   * service ids are joined in sorted order, so the string is reproducible.
#
# NOTE(review): the earlier comment said only Startpoint records are used here,
# but no point_type filter is applied in this cell — confirm the intent.
adt_grouped = adt_df.groupby(['route_id', 'direction_id', 'actual'], observed=True)
schedule_grouped = df_gtfs_schedule.groupby(['route_id', 'direction_id', 'scheduled'], observed=True)

# Keys and record groups that have no schedule group with the same key
unmatched_names = []
unmatched_groups = []

for name, group in adt_grouped:
    if name not in schedule_grouped.groups:
        unmatched_names.append(name)
        unmatched_groups.append(group)
        continue

    # Schedule rows sharing the key, their services, and the blocks of each service
    schedule_group = schedule_grouped.get_group(name)
    schedule_services = set(schedule_group['service_id'])
    schedule_service_block_ids = schedule_group.groupby(['service_id'])['block_id'].apply(list)

    # Calendar rows whose date occurs among the group's service dates
    service_days = calendar_df.loc[calendar_df.date.isin(group.service_date)]
    for i, row in service_days.iterrows():
        # Rows of this group that belong to the calendar day being processed
        day_index = group.index[group.service_date == row['date']]

        # Services that are both scheduled for this key and valid on this day
        adt_service_ids = schedule_services.intersection(row['service_ids'])
        adt_service_ids_str = ', '.join(sorted(adt_service_ids))
        adt_df.loc[day_index, 'service_id'] = adt_service_ids_str

        # Blocks are looked up by the joined string, so a match is only found
        # when that string equals one service_id of the schedule group.
        block_list = schedule_service_block_ids[schedule_service_block_ids.index == adt_service_ids_str]
        block_list = block_list.iloc[0] if not block_list.empty else ''

        if block_list:
            # Store the block ids as one comma-separated string
            block_ids_str = ', '.join(block_list)
            adt_df.loc[day_index, 'block_id'] = block_ids_str

In [None]:
# List every distinct service date present in the records, one per line
for service_day in adt_df['service_date'].unique():
    print(f'Date: {service_day}')

In [None]:

# Announce the step, then replace adt_df with the result of split_multiple_block_id.
# NOTE(review): the helper comes from a star import; presumably it separates the
# comma-joined block_id strings written by the matching cell — confirm.
# This cell rebinds adt_df from itself, so running it twice applies the helper twice.
print('Split Blocks')
adt_df = split_multiple_block_id(adt_df)
# Only announces the next stage; the assignment itself happens in the following cell.
print('Assign Blocks to endpoints')


In [None]:
# For every half_trip_id, copy block_id and service_id from the group's first
# record to its second record; half trips with a single record are printed.
id_columns = ['block_id', 'service_id']
trips_grouped = adt_df.groupby(['half_trip_id'], observed=True)
for half_trip, records in trips_grouped:
    row_labels = records.index
    if len(row_labels) <= 1:
        # Nothing to copy to: show the lone record instead
        print(half_trip)
        print(records)
        continue
    adt_df.loc[row_labels[1], id_columns] = records.loc[row_labels[0], id_columns].values

In [None]:
# Same pass as the preceding cell: within each half_trip_id the second record
# receives block_id and service_id of the first; lone records are printed.
trips_grouped = adt_df.groupby(['half_trip_id'], observed=True)
for trip_key, trip_rows in trips_grouped:
    labels = trip_rows.index
    if len(labels) > 1:
        source_values = trip_rows.loc[labels[0], ['block_id', 'service_id']].values
        adt_df.loc[labels[1], ['block_id', 'service_id']] = source_values
    else:
        print(trip_key)
        print(trip_rows)

In [None]:
# Assign the same values of block_id and service_id to Endpoint records with the same half_trip_id
trips_grouped = adt_df.groupby(['half_trip_id'], observed=True)
for name, group in trips_grouped:
    # A half trip with a single record has no second row: group.index[1] would
    # raise IndexError, so such groups are skipped.
    if len(group.index) < 2:
        continue
    # Copy the first record's values onto the second record of the half trip
    adt_df.loc[group.index[1], ['block_id', 'service_id']] = group.loc[group.index[0], ['block_id', 'service_id']].values

### Compute layover

In [None]:
# Compute theoretical and actual layover (in minutes) per block and service date.
#
# NOTE(review): adt_route10 is not defined in any visible cell, so this cell
# fails on a fresh kernel unless it is created elsewhere. It must provide the
# columns used below (block_id, service_date, half_trip_id, departure_time,
# actual, time_point_order, point_type, standard_type, stop_id) — confirm.

# Work on a sorted copy with a fresh 0..n-1 index
layover_df = adt_route10.copy()
layover_df = layover_df.sort_values(by=['block_id', 'service_date', 'half_trip_id', 'departure_time'])
layover_df = layover_df.reset_index(drop=True)

# Group by 'block_id' and 'service_date', leaving out rows without a block_id
grouped = layover_df.loc[layover_df.block_id.notna()].groupby(['block_id', 'service_date'])

# Difference to the previous row of the same group, converted from seconds to minutes
layover_df['theoretical_layover'] = grouped['departure_time'].diff().dt.total_seconds() / 60
layover_df['actual_layover'] = grouped['actual'].diff().dt.total_seconds() / 60

# The first row of each group has no predecessor: set its layover to 0 minutes
layover_df.loc[grouped.head(1).index, ['theoretical_layover', 'actual_layover']] = 0
# Keep layover values only on rows with time_point_order == 1
layover_df.loc[layover_df.time_point_order != 1, ['theoretical_layover', 'actual_layover']] = np.nan
layover_df = layover_df.drop(columns=['time_point_order', 'point_type', 'standard_type'])

# Summary per block and service date, shown as the cell output.
# The aggregation runs on the whole grouped frame: selecting only the two
# layover columns first made the 'stop_id' entry raise a KeyError.
(layover_df.groupby(['block_id', 'service_date'])
           .agg({'theoretical_layover': ['mean', 'max', 'count'],
                 'actual_layover': ['mean', 'max', 'count'],
                 'stop_id': 'first'})
)

In [None]:
# Scratch cell: group the records by half_trip_id.
# NOTE(review): adt_route10 is not defined in any visible cell — confirm where it comes from.
test = adt_route10.groupby(['half_trip_id'])
# NOTE(review): this binds the get_group method itself (it is never called), so
# startpoint_df is not a DataFrame — the cell looks unfinished.
startpoint_df = test.get_group