## Second Pass at Cardinal Direction ID
* Right now, there is a rough draft but it doesn't incorporate dates and it takes forever to run.
* This notebook incorporates dates and uses Dask `delayed` so that each operator doesn't take 2+ minutes to generate when this work is deployed to the portfolio.

### Resources
* https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/scripts/average_summary_speeds.py#L184-L191

In [1]:
import _section2_utils as section2_utils
import geopandas as gpd
import numpy as np
import pandas as pd
from IPython.display import HTML, Image, Markdown, display, display_html
from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)
from shared_utils import catalog_utils, rt_dates, rt_utils

In [2]:
import datetime

from dask import compute, delayed

In [3]:
# Notebook display settings: up to 100 columns, no row limit, no column-width
# truncation, and floats formatted with two decimals.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [4]:
# Operator names kept around for spot checks; only `sf` is loaded below.
lawndale = "City of Lawndale"

In [5]:
la_metro = "Los Angeles County Metropolitan Transportation Authority"

In [6]:
sf = "City and County of San Francisco"

In [7]:
# Frame returned by `load_schedule_vp_metrics` for San Francisco; the rest of
# the notebook reads its Date, Direction and schedule_gtfs_dataset_key columns.
df = section2_utils.load_schedule_vp_metrics(sf)

In [None]:
    df.head(1)

In [None]:
df.Direction.value_counts()

In [None]:
all_dates_list = list(df.Date.unique())
gtfs_keys = list(df.schedule_gtfs_dataset_key.unique())
all_dates_list = [np.datetime_as_string(date, unit="D") for date in all_dates_list]

In [None]:
analysis_date = all_dates_list[0]

#### Editing the `stop_times_direction`

In [None]:
# Path of the stop_times_direction parquet for `analysis_date`.
TABLE = GTFS_DATA_DICT.rt_vs_schedule_tables.stop_times_direction
FILE = f"{RT_SCHED_GCS}{TABLE}_{analysis_date}.parquet"

In [None]:
stops = pd.read_parquet(FILE)

In [None]:
# Look at one random row to check the columns.
stops.sample()

#### Editing `import_scheduled_trips`

In [None]:
TABLE2 = GTFS_DATA_DICT.schedule_downloads.trips
FILE2 = f"{COMPILED_CACHED_VIEWS}{TABLE}_{analysis_date}.parquet"

In [None]:
scheduled_trips = pd.read_parquet(FILE)

In [None]:
scheduled_trips.sample()

#### Testing for ONE date only

In [None]:
def find_most_common_dir(
    scheduled_trips_df: pd.DataFrame,
    scheduled_stop_times_df: pd.DataFrame,
) -> pd.DataFrame:
    """
    Keep the most frequent `stop_primary_direction` per route, direction and date.

    The inputs are the outputs of load_scheduled_trips() and
    load_scheduled_stop_times() (in this notebook they are passed in wrapped
    in dask `delayed`). They are inner-joined, `stop_sequence` entries are
    counted per `stop_primary_direction`, and for every
    (route_id, schedule_gtfs_dataset_key, direction_id, service_date) only the
    direction with the largest count is kept.

    The merge goes through `delayed(pd.merge)`, so the object returned is
    lazy: call `.compute()` on it to obtain the pandas DataFrame.
    """
    join_keys = [
        "trip_instance_key",
        "schedule_gtfs_dataset_key",
        "shape_array_key",
        "service_date",
    ]
    route_dir_keys = [
        "route_id",
        "schedule_gtfs_dataset_key",
        "direction_id",
        "service_date",
    ]

    merged = delayed(pd.merge)(
        scheduled_trips_df,
        scheduled_stop_times_df,
        on=join_keys,
        how="inner",
    )

    # Count of non-null stop_sequence values for each candidate direction.
    stop_counts = (
        merged.groupby(
            [
                "route_id",
                "schedule_gtfs_dataset_key",
                "direction_id",
                "stop_primary_direction",
                "service_date",
            ]
        )
        .agg({"stop_sequence": "count"})
        .reset_index()
        .rename(columns={"stop_sequence": "total_stops"})
    )

    # Within each route/direction/date, put the direction with the most
    # stops first ...
    ranked = stop_counts.sort_values(
        by=route_dir_keys + ["total_stops"],
        ascending=[True] * len(route_dir_keys) + [False],
    )

    # ... so that dropping duplicates keeps only that top direction.
    top_direction = ranked.drop_duplicates(subset=route_dir_keys).reset_index(
        drop=True
    )

    return top_direction.drop(columns=["total_stops"])

In [None]:
# NOTE: the loader is called first and its result is then wrapped in
# `delayed`, so the load itself runs immediately; the wrapped result is
# reused by every later `.compute()` in this notebook.
scheduled_trips_dd = delayed(section2_utils.load_scheduled_trips(analysis_date, gtfs_keys))

In [None]:
# Same pattern for the stop times table.
scheduled_stops_dd = delayed(section2_utils.load_scheduled_stop_times(analysis_date, gtfs_keys))

In [None]:
# apr_2023 = find_most_common_dir(scheduled_trips_dd,scheduled_stops_dd)

In [None]:
def most_recent_route_info(
    df: pd.DataFrame,
    group_cols: list,
    route_col: str
) -> pd.DataFrame:
    """
    Find the most recent value of `route_col` across a grouping.
    Ex: if we group by route_id, we can find the most recent
    value for route_long_name.

    Only DataFrame methods are chained, so `df` may be a pandas DataFrame
    or a dask `delayed` object wrapping one (the result is then lazy too).

    Args:
        df: rows to search. Needs a `service_date` column, every column
            in `group_cols`, and `route_col`.
        group_cols: columns that define one group.
        route_col: column whose most recent value is wanted.

    Returns:
        One row per unique `group_cols` combination -- the row with the
        latest `service_date` -- with `route_col` renamed to
        `recent_{route_col}`. The other columns of that row are kept.
    """
    sort_cols = group_cols + ["service_date"]
    # Groups ascending, newest date first, so the first row of every group
    # is the most recent one.
    ascending = [True] * len(group_cols) + [False]

    # NOTE(review): an earlier draft also left-merged this lookup back onto
    # `df` but never returned the merge; that unused step is dropped here.
    # Callers that need one row per input row should merge the result back
    # on `group_cols` themselves.
    return (
        df.sort_values(sort_cols, ascending=ascending)
        .drop_duplicates(subset=group_cols)
        .rename(columns={route_col: f"recent_{route_col}"})
    )

In [None]:
def find_most_recent_route_id(df):
    """
    Build the lookup of the most recent standardized route id per route name.

    Steps, all expressed as DataFrame method calls so `df` may also be a
    dask `delayed` object wrapping a DataFrame:
      1. Replace missing route_id / route_short_name / route_long_name
         with empty strings.
      2. Add `combined_name` = route_short_name + "__" + route_long_name.
      3. Add `route_id2` by applying
         gtfs_schedule_wrangling.standardize_route_id to every row.
      4. Run most_recent_route_info twice: first for `combined_name`
         grouped by `route_id2`, then for `route_id2` grouped by
         `recent_combined_name`.

    Returns whatever the second most_recent_route_info call returns; the
    columns it adds are `recent_combined_name` and `recent_route_id2`.
    """
    # Fill the gaps first so the name concatenation works on plain strings.
    filled = df.assign(
        route_id=df.route_id.fillna(""),
        route_short_name=df.route_short_name.fillna(""),
        route_long_name=df.route_long_name.fillna(""),
    )

    named = filled.assign(
        combined_name=filled.route_short_name + "__" + filled.route_long_name
    )

    standardized_ids = named.apply(
        lambda row: gtfs_schedule_wrangling.standardize_route_id(
            row, "name", "route_id"
        ),
        axis=1,
    )
    standardized = named.assign(route_id2=standardized_ids)

    # Most recent name for every standardized route id ...
    recent_names = most_recent_route_info(
        standardized,
        group_cols=["schedule_gtfs_dataset_key", "name", "route_id2"],
        route_col="combined_name",
    )

    # ... then the most recent standardized route id for every such name.
    return recent_names.pipe(
        most_recent_route_info,
        group_cols=["schedule_gtfs_dataset_key", "name", "recent_combined_name"],
        route_col="route_id2",
    )

In [None]:
# Build the route-id lookup from the delayed trips table (still lazy here).
recent_route_id_test = find_most_recent_route_id(scheduled_trips_dd)

In [None]:
# Run the task graph to get the result.
recent_route_id_test_df = recent_route_id_test.compute()

In [None]:
def find_cardinal_direction(date:str, gtfs_schedule_keys: list) -> pd.DataFrame:
    """
    Attach the most common cardinal direction to the most recent route ids
    for one service date.

    Args:
        date: service date passed to the section2_utils loaders.
        gtfs_schedule_keys: schedule feed keys passed to the same loaders.

    Returns:
        A lazy dask `delayed` object (despite the annotation). Calling
        `.compute()` on it loads both tables, runs find_most_common_dir and
        find_most_recent_route_id, inner-joins them on
        (schedule_gtfs_dataset_key, route_id) and drops `route_id`.
    """
    # Wrap the loader *functions* so the reads are part of the task graph
    # and only happen at .compute(). Use the keys passed in by the caller
    # rather than the notebook-level `gtfs_keys` global.
    scheduled_trips_dd = delayed(section2_utils.load_scheduled_trips)(
        date, gtfs_schedule_keys
    )
    scheduled_stops_dd = delayed(section2_utils.load_scheduled_stop_times)(
        date, gtfs_schedule_keys
    )

    # Find the most common direction for this Route ID
    common_stops_dd = find_most_common_dir(scheduled_trips_dd, scheduled_stops_dd)

    # Find the most recent Route ID to connect back to sched_vp_df
    recent_ids_dd = find_most_recent_route_id(scheduled_trips_dd)

    # NOTE(review): columns other than the two join keys that exist in both
    # inputs will come out with pandas' _x/_y suffixes -- confirm that
    # service_date and direction_id survive this merge unsuffixed.
    merged = delayed(pd.merge)(
        common_stops_dd,
        recent_ids_dd,
        on=["schedule_gtfs_dataset_key", "route_id"],
        how="inner",
    )

    return merged.drop(columns=["route_id"])


In [None]:
apr2023_final = find_cardinal_direction(analysis_date, gtfs_keys)

In [None]:
apr2023_final.head(1)

In [None]:
apr2023_final.recent_route_id2.nunique(), apr2023_final.shape

In [None]:
apr2023_final.service_date.unique()

In [None]:
df_apr_2023 = df.loc[df["Date"] == "2023-04-12"]

In [None]:
df_apr_2023["Date"].value_counts()

In [None]:
df_apr_2023 = df_apr_2023.drop(columns = ['recent_route_id2','Direction'])

In [None]:
m1 = pd.merge(
    df_apr_2023,
    apr2023_final.compute(),
    left_on=["schedule_gtfs_dataset_key", "dir_0_1", "Route ID"],
    right_on=[
        "schedule_gtfs_dataset_key",
        "direction_id",
        "recent_route_id2",
    ],
    how="left"
    )

In [None]:
m1.head(1)

In [None]:
apr2023_final.stop_primary_direction.value_counts()

#### Testing for ALL dates

In [None]:
def all_dates_cardinal_dir(dates:list, gtfs_schedule_keys:list)->pd.DataFrame:
    """
    Stack the cardinal-direction table of every date into one DataFrame.

    Args:
        dates: service dates; each one is passed to find_cardinal_direction().
        gtfs_schedule_keys: forwarded unchanged to find_cardinal_direction().

    Returns:
        The computed per-date results concatenated row-wise in the order of
        `dates`, or an empty DataFrame when `dates` is empty.
    """
    # Collect the per-date frames first and concatenate once; calling
    # pd.concat inside the loop re-copies the accumulated rows each time.
    frames = [
        find_cardinal_direction(date, gtfs_schedule_keys).compute()
        for date in dates
    ]

    if not frames:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()

    return pd.concat(frames, axis=0)

In [None]:
# Time the full run over every date in the metrics.
start1 = datetime.datetime.now()
all_dates_df = all_dates_cardinal_dir(all_dates_list,gtfs_keys)
end1 = datetime.datetime.now()
print(f"{end1-start1}")

In [None]:
all_dates_df.head(2)

In [None]:
all_dates_df.shape

In [None]:
all_dates_df.service_date.unique()

In [None]:
# Number of distinct route ids found per date.
all_dates_df.groupby(['service_date']).agg({'recent_route_id2':'nunique'})

In [None]:
# Drop the existing versions of the columns that the merge brings back in.
df = df.drop(columns = ['recent_route_id2','Direction'])

In [None]:
# String copy of Date used as the date join key below.
# NOTE(review): presumably all_dates_df.service_date holds matching strings -- confirm.
df['temp_service_date'] = df['Date'].astype(str)

In [None]:
df.sample()

In [None]:
# NOTE(review): the single-date merge above joins the metrics on "dir_0_1",
# while this one uses "direction_id" -- confirm which column `df` carries.
m2 = pd.merge(
    df,
    all_dates_df,
    left_on=["schedule_gtfs_dataset_key", "direction_id", "Route ID", "temp_service_date"],
    right_on=[
        "schedule_gtfs_dataset_key",
        "direction_id",
        "recent_route_id2",
        "service_date"
    ],
    how="left"
    )

In [None]:
m2.head(1)

In [None]:
m2.stop_primary_direction.value_counts()

#### Redoing `find_most_recent_route_id`
* https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/segment_speed_utils/gtfs_schedule_wrangling.py

In [None]:
scheduled_trips_dd

In [None]:
scheduled_trips_dd = scheduled_trips_dd.assign(
        route_id=scheduled_trips_dd.route_id.fillna(""),
        route_short_name=scheduled_trips_dd.route_short_name.fillna(""),
        route_long_name=df.route_long_name.fillna(""),
    )

In [None]:
scheduled_trips_dd = scheduled_trips_dd.assign(combined_name=scheduled_trips_dd.route_short_name + "__" + scheduled_trips_dd.route_long_name)

In [None]:
scheduled_trips_dd = scheduled_trips_dd.assign(
        route_id2=df.apply(
            lambda x: gtfs_schedule_wrangling.standardize_route_id(
                x, "name", "route_id"
            ),
            axis=1,
        )
    )


In [None]:
route_cols = ["schedule_gtfs_dataset_key", "name", "route_id2"]

In [None]:
df2 =most_recent_route_info(
        scheduled_trips_dd, group_cols=route_cols, route_col="combined_name"
    )

In [None]:
df3 = df2.pipe(
        most_recent_route_info,
        group_cols=["schedule_gtfs_dataset_key", "name", "recent_combined_name"],
        route_col="route_id2",
    )
    

In [None]:
df3

In [None]:
df3_pandas = df3.compute()

In [None]:
df3_pandas.head(1)