In [None]:
import pandas as pd
import numpy as np
from heapq import heappop, heappush
from collections import defaultdict
from tqdm.notebook import tqdm
from multiprocessing import Pool, cpu_count

# Load the feed tables from CSV files in the current working directory.
# Only stops_df and stop_times_df are read by the routing code in this cell;
# the other tables are loaded but not referenced again in the visible cells.
agency = pd.read_csv('agency.txt')
calendar = pd.read_csv('calendar.txt')
calendar_dates = pd.read_csv('calendar_dates.txt')
feed_info = pd.read_csv('feed_info.txt')
# NOTE(review): routes_df is rebound to the precomputed-routes CSV in a later cell.
routes_df = pd.read_csv('routes.txt')
stops_df = pd.read_csv('cleaned_filtered_stops.txt')  # Use pre-filtered stops
stop_times_df = pd.read_csv('stop_times.txt')
# NOTE(review): process_city uses a local variable that is also named `transfers`.
transfers = pd.read_csv('transfers.txt')
trips_df = pd.read_csv('trips.txt')

def normalize_time(t):
    """Convert an ``H:MM`` or ``H:MM:SS`` clock string into a ``pd.Timedelta``.

    Hours are not wrapped at 24, so a value such as ``"25:10:00"`` becomes
    1 day 01:10:00 rather than 01:10:00.

    Args:
        t: Clock string, or a missing value (``None`` / ``NaN`` / ``NaT``).

    Returns:
        The offset from 00:00:00 as a ``pd.Timedelta``, or ``pd.NaT`` when the
        input is missing or does not have two or three colon-separated fields.

    Raises:
        ValueError: If a field is not an integer literal.
    """
    # Always use NaT as the missing-value marker so that a column produced by
    # this function never mixes float NaN / None with Timedelta objects.
    if pd.isna(t):
        return pd.NaT

    parts = t.split(':')
    if len(parts) == 2:
        parts.append('0')  # "H:MM" carries no seconds field
    elif len(parts) != 3:
        return pd.NaT

    h, m, s = map(int, parts)
    # Timedelta accepts hour counts of 24 or more directly, which preserves
    # times that run past midnight.
    return pd.Timedelta(hours=h, minutes=m, seconds=s)

# Convert the arrival_time and departure_time columns with normalize_time,
# one value at a time, replacing the original columns.
stop_times_df['arrival_time'] = stop_times_df['arrival_time'].apply(normalize_time)
stop_times_df['departure_time'] = stop_times_df['departure_time'].apply(normalize_time)

# Attach the stop columns (e.g. stop_name, stop_lat, stop_lon, which
# process_city reads) to every stop_times row. With how='left', rows whose
# stop_id is absent from stops_df are kept and their stop columns are left
# missing.
stop_times_df = stop_times_df.merge(stops_df, on='stop_id', how='left')

# Map each trip_id to its own stop_times rows, ordered by stop_sequence and
# re-indexed from 0 so that row position and index label coincide
# (process_city uses the index label as a position for .iloc).
trip_stop_times = {trip_id: df.sort_values('stop_sequence').reset_index(drop=True)
                   for trip_id, df in stop_times_df.groupby('trip_id')}

# Named parts of the day as half-open [start, end) offsets from 00:00:00.
# Insertion order matters: get_time_interval_name returns the first match.
time_intervals = {
    label: (pd.Timedelta(hours=first_hour), pd.Timedelta(hours=last_hour))
    for label, first_hour, last_hour in (
        ('early_morning', 0, 6),
        ('morning', 6, 10),
        ('midday', 10, 14),
        ('afternoon', 14, 18),
        ('late_afternoon', 18, 21),
        ('night', 21, 24),
    )
}


def get_time_interval_name(time):
    """Return the label of the first interval in ``time_intervals`` containing ``time``.

    A value that lies in no interval (24 hours or more, negative, or ``NaT``,
    for which every comparison is false) is reported as ``'night'``.
    """
    containing = (
        label
        for label, (lower, upper) in time_intervals.items()
        if lower <= time < upper
    )
    return next(containing, 'night')

def process_city(args):
    """Collect the stops reachable from one origin within a time and transfer budget.

    Origins are all stops whose name contains ``city_name`` (case-insensitive,
    literal match). From each popped queue entry, every trip serving the
    current stop is followed forward along its stop sequence. Travel time is
    accumulated as arrival at a stop minus departure from the previous timed
    stop, so dwell time at stops and waiting time before boarding are not
    included.

    Args:
        args: A 6-tuple, packed so the function can be used with ``Pool.imap*``:
            ``city_name`` (str), ``time_limit`` (pd.Timedelta),
            ``max_transfers`` (int), ``stops_df`` (needs ``stop_name`` and
            ``stop_id``), ``stop_times_df`` (needs ``stop_id`` and ``trip_id``),
            and ``trip_stop_times`` (dict of trip_id -> that trip's rows,
            ordered by stop sequence with a 0-based index, including
            ``stop_id``, ``stop_name``, ``stop_lat``, ``stop_lon``,
            ``arrival_time`` and ``departure_time``).

    Returns:
        A list of dicts, one for each time a (stop, transfer count) pair was
        reached faster than before, with keys ``origin_city``, ``stop_id``,
        ``stop_name``, ``stop_lat``, ``stop_lon``, ``travel_time``,
        ``transfer_count``, ``departure_time_interval`` and ``arrival_time``.
    """
    city_name, time_limit, max_transfers, stops_df, stop_times_df, trip_stop_times = args

    name_matches = stops_df['stop_name'].str.contains(city_name, case=False, na=False, regex=False)
    city_stop_ids = stops_df.loc[name_matches, 'stop_id'].tolist()

    # Build the stop -> trips lookup once instead of filtering the whole
    # stop_times table every time an entry is popped. Within each stop the
    # trips keep their order of first appearance, as a filter + unique() would.
    trips_by_stop = stop_times_df.groupby('stop_id', sort=False)['trip_id'].unique().to_dict()

    # Seed the queue with heappush so the list is a valid heap from the start;
    # heappop/heappush only behave as a priority queue on a heap-ordered list.
    priority_queue = []
    best_times = {}
    for stop_id in city_stop_ids:
        heappush(priority_queue, (pd.Timedelta(0), 0, stop_id, None))
        best_times[(stop_id, 0)] = pd.Timedelta(0)

    explored_routes = defaultdict(set)
    all_routes = []

    while priority_queue:
        current_time, transfer_count, current_stop, prev_trip_id = heappop(priority_queue)

        if transfer_count > max_transfers or current_time > time_limit:
            continue

        for trip_id in trips_by_stop.get(current_stop, ()):
            # Each (previous trip, next trip) pairing is expanded at most once.
            if trip_id in explored_routes[prev_trip_id]:
                continue
            explored_routes[prev_trip_id].add(trip_id)

            trip_df = trip_stop_times[trip_id]
            start_index = trip_df[trip_df['stop_id'] == current_stop].index[0]

            # Staying on the same trip is free; boarding any other trip costs a transfer.
            next_transfers = transfer_count if trip_id == prev_trip_id else transfer_count + 1

            # Reference clock reading for the next segment. Fall back to the
            # arrival time when the departure is missing; without either the
            # trip cannot be timed from this stop.
            boarding_stop = trip_df.iloc[start_index]
            last_known_time = boarding_stop['departure_time']
            if pd.isna(last_known_time):
                last_known_time = boarding_stop['arrival_time']
            if pd.isna(last_known_time):
                continue

            cumulative_travel_time = current_time

            for i in range(start_index + 1, len(trip_df)):
                next_stop = trip_df.iloc[i]
                arrival = next_stop['arrival_time']

                # A stop without an arrival time cannot be placed in time.
                # Skip it and keep measuring from the last timed stop, so a
                # missing value never turns the running total into NaT (which
                # would corrupt the queue ordering and block later updates).
                if pd.isna(arrival):
                    continue

                segment_start = last_known_time
                cumulative_travel_time += arrival - segment_start

                if cumulative_travel_time > time_limit:
                    break

                departure = next_stop['departure_time']
                last_known_time = arrival if pd.isna(departure) else departure

                next_stop_id = next_stop['stop_id']
                key = (next_stop_id, next_transfers)
                known_best = best_times.get(key)
                if known_best is not None and not cumulative_travel_time < known_best:
                    continue

                best_times[key] = cumulative_travel_time
                heappush(priority_queue, (cumulative_travel_time, next_transfers, next_stop_id, trip_id))
                all_routes.append({
                    'origin_city': city_name,
                    'stop_id': next_stop_id,
                    'stop_name': next_stop['stop_name'],
                    'stop_lat': next_stop['stop_lat'],
                    'stop_lon': next_stop['stop_lon'],
                    'travel_time': cumulative_travel_time,
                    'transfer_count': next_transfers,
                    'departure_time_interval': get_time_interval_name(segment_start),
                    'arrival_time': arrival,
                })

    return all_routes

def precompute_routes(time_limit, max_transfers):
    """Run ``process_city`` for every distinct stop name and save the combined result.

    Each distinct, non-missing value of ``stops_df['stop_name']`` is used as
    an origin. The work is spread over one worker process per CPU, and results
    are gathered in completion order, so row order in the output is not
    deterministic.

    Args:
        time_limit: Maximum accumulated travel time (``pd.Timedelta``).
        max_transfers: Maximum number of transfers per route.

    Returns:
        The DataFrame that was written to ``precomputed_routes.csv``.
    """
    # A missing stop name cannot be used as a search string, and an exception
    # raised in one worker would abort the whole run, so drop missing names.
    cities = stops_df['stop_name'].dropna().unique()

    # NOTE: every task tuple carries the full tables, which are pickled once
    # per task when sent to a worker.
    args = [(city, time_limit, max_transfers, stops_df, stop_times_df, trip_stop_times) for city in cities]

    all_routes = []
    with Pool(processes=cpu_count()) as pool:
        results = pool.imap_unordered(process_city, args)
        for city_routes in tqdm(results, total=len(args), desc="Processing all cities"):
            all_routes.extend(city_routes)

    # Use a local name that does not shadow the module-level routes_df table.
    precomputed_df = pd.DataFrame(all_routes)
    precomputed_df.to_csv('precomputed_routes.csv', index=False)
    return precomputed_df

if __name__ == '__main__':
    # Search budget: at most 8 hours of accumulated travel time and 3 transfers.
    time_limit = pd.Timedelta(hours=8)
    max_transfers = 3

    # Compute routes for every origin and write them to disk.
    precompute_routes(time_limit=time_limit, max_transfers=max_transfers)


Processing all cities:   0%|          | 0/580 [00:00<?, ?it/s]

In [None]:
# Read the precomputed routes back in.
# NOTE(review): the routing cell writes 'precomputed_routes.csv'; confirm that
# this input file is that output under a different name.
routes_df = pd.read_csv('precomputed_routes_with_missing_values.csv')

# Keep only rows in which every column has a value.
routes_df_cleaned = routes_df.dropna(axis=0, how='any')

# Write the filtered rows to a separate file, without the index column.
routes_df_cleaned.to_csv('precomputed_routes_no_missing_values.csv', index=False)
