## Second Pass at Cardinal Direction ID
* Right now there is a rough draft, but it doesn't incorporate dates and it takes a very long time to run.
* This notebook incorporates dates and uses `dask.delayed` so that each operator doesn't take 2+ minutes to generate when this work is deployed to the portfolio.

Resources
* https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/scripts/average_summary_speeds.py#L184-L191

In [1]:
import _section2_utils as section2_utils
import geopandas as gpd
import numpy as np
import pandas as pd
from IPython.display import HTML, Image, Markdown, display, display_html
from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)
from shared_utils import catalog_utils, rt_dates, rt_utils

In [2]:
import datetime

from dask import compute, delayed

In [3]:
# Notebook-wide pandas display settings, all set through the same API.
pd.set_option("display.max_columns", 100)  # show up to 100 columns
pd.set_option("display.float_format", "{:.2f}".format)  # floats with two decimals
pd.set_option("display.max_rows", None)  # never truncate rows
pd.set_option("display.max_colwidth", None)  # never truncate cell contents

In [4]:
lawndale = "City of Lawndale"

In [5]:
la_metro = "Los Angeles County Metropolitan Transportation Authority"

In [6]:
sf = "City and County of San Francisco"

In [7]:
df = section2_utils.load_schedule_vp_metrics(sf)

In [8]:
df.head(1)

Unnamed: 0,schedule_gtfs_dataset_key,dir_0_1,Period,Average Scheduled Service (trip minutes),Average Stop Distance (miles),# scheduled trips,Trips per Hour,is_express,is_rapid,is_rail,is_coverage,is_downtown_local,is_local,Date,Route typology,# Minutes with 1+ VP per Minute,# Minutes with 2+ VP per Minute,Aggregate Actual Service Minutes,Aggregate Scheduled Service Minutes (all trips),# VP,# VP within Scheduled Shape,# Early Arrival Trips,# On-Time Trips,# Late Trips,# Trips with VP,Average VP per Minute,% VP within Scheduled Shape,pct_rt_journey_atleast1_vp,pct_rt_journey_atleast2_vp,% Scheduled Trip w/ 1+ VP/Minute,% Scheduled Trip w/ 2+ VP/Minute,Realtime versus Scheduled Service Ratio,Average Actual Service (Trip Minutes),GTFS Availability,Speed (MPH),Transit Operator,route_long_name,route_short_name,Route,Route ID,schedule_source_record_id,Base64 Encoded Feed URL,Organization ID,Organization,District,ruler_100_pct,ruler_for_vp_per_min,frequency_in_minutes,direction_id,Direction,service_date
0,7cc0cb1871dfd558f11a2885c145d144,0.0,all_day,41.33,0.12,151,6.29,0.0,0.0,0.0,0.0,1.0,0.0,2023-04-12,downtown_local,7816,7708,12084.08,6194.0,23106,21485,4,28,118,150,1.91,93.0,65.0,64.0,100.0,100.0,1.95,80.56,schedule_and_vp,6.91,Bay Area 511 Muni Schedule,CALIFORNIA,1,1 CALIFORNIA,1,recHD22phgJs34JHP,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1TRg==,rechaapWbeffO33OX,City and County of San Francisco,04 - Oakland,100,2,9.54,0.0,Westbound,2023-04-12


In [9]:
df.Direction.value_counts()

Eastbound     1411
Westbound     1312
Southbound     968
Northbound     963
Name: Direction, dtype: int64

In [10]:
df.Direction.value_counts().sum()

4654

In [11]:
len(df)

4654

In [12]:
df.dir_0_1.count()

4654

In [13]:
# Distinct schedule feed keys found in the operator's metrics table.
gtfs_keys = list(df["schedule_gtfs_dataset_key"].unique())

# Distinct values of the Date column rendered as "YYYY-MM-DD" strings,
# kept in the order `unique()` returns them (not sorted).
all_dates_list = [
    np.datetime_as_string(service_day, unit="D") for service_day in df["Date"].unique()
]

In [14]:
analysis_date = all_dates_list[0]

In [16]:
all_dates_list

['2023-04-12',
 '2023-05-17',
 '2023-06-14',
 '2023-07-12',
 '2023-08-15',
 '2023-10-11',
 '2023-11-15',
 '2023-12-13',
 '2024-01-17',
 '2024-02-14',
 '2024-03-13',
 '2024-04-17',
 '2023-03-15']

In [15]:
analysis_date

'2023-04-12'

### Editing the `stop_times_direction`

In [None]:
TABLE = GTFS_DATA_DICT.rt_vs_schedule_tables.stop_times_direction
FILE = f"{RT_SCHED_GCS}{TABLE}_{analysis_date}.parquet"

In [None]:
stops = pd.read_parquet(FILE)

In [None]:
stops.sample()

### Editing `import_scheduled_trips`

In [None]:
TABLE2 = GTFS_DATA_DICT.schedule_downloads.trips
FILE2 = f"{COMPILED_CACHED_VIEWS}{TABLE}_{analysis_date}.parquet"

In [None]:
scheduled_trips = pd.read_parquet(FILE)

In [None]:
scheduled_trips.sample()

### Testing for ONE date only

In [None]:
def find_most_common_dir(
    scheduled_trips_df: pd.DataFrame,
    scheduled_stop_times_df: pd.DataFrame,
) -> pd.DataFrame:
    """
    Pick the most frequent `stop_primary_direction` for each route-direction.

    The trips and stop-times tables are inner-joined through
    `delayed(pd.merge)`, stops are counted per
    route_id / schedule_gtfs_dataset_key / direction_id /
    stop_primary_direction / service_date, and for each
    route_id / schedule_gtfs_dataset_key / direction_id / service_date
    only the stop_primary_direction with the highest count is kept.

    Args:
        scheduled_trips_df: scheduled trips, e.g. the output of
            `load_scheduled_trips()`.
        scheduled_stop_times_df: scheduled stop times, e.g. the output of
            `load_scheduled_stop_times()`.

    Returns:
        Because the merge is wrapped in `delayed`, the returned object is
        lazy; call `.compute()` on it to obtain the table with one row per
        route_id / schedule_gtfs_dataset_key / direction_id / service_date.
    """
    join_keys = [
        "trip_instance_key",
        "schedule_gtfs_dataset_key",
        "shape_array_key",
        "service_date",
    ]
    # One output row is kept per combination of these columns.
    route_dir_keys = [
        "route_id",
        "schedule_gtfs_dataset_key",
        "direction_id",
        "service_date",
    ]
    # Counting is done per primary direction within each route-direction.
    count_keys = [
        "route_id",
        "schedule_gtfs_dataset_key",
        "direction_id",
        "stop_primary_direction",
        "service_date",
    ]

    stops_with_trips = delayed(pd.merge)(
        scheduled_trips_df,
        scheduled_stop_times_df,
        on=join_keys,
        how="inner",
    )

    stop_counts = stops_with_trips.groupby(count_keys).agg({"stop_sequence": "count"})
    stop_counts = stop_counts.reset_index().rename(
        columns={"stop_sequence": "total_stops"}
    )

    # Within each route-direction, put the primary direction with the
    # largest number of stops first ...
    ranked = stop_counts.sort_values(
        by=route_dir_keys + ["total_stops"],
        ascending=[True] * len(route_dir_keys) + [False],
    )

    # ... so that dropping duplicates keeps only that top direction.
    top_direction = ranked.drop_duplicates(subset=route_dir_keys).reset_index(
        drop=True
    )

    return top_direction.drop(columns=["total_stops"])

In [None]:
# Wrap the single-date trips and stop-times tables so they can feed the
# delayed pipeline below.
# NOTE(review): each loader is called right here and only its *result* is
# passed to `delayed`, so the loading itself is not deferred — confirm that
# this is intended.
scheduled_trips_dd = delayed(
    section2_utils.load_scheduled_trips(analysis_date, gtfs_keys)
)
scheduled_stops_dd = delayed(
    section2_utils.load_scheduled_stop_times(analysis_date, gtfs_keys)
)
# Lazy result; it is materialised with `.compute()` in the next cell.
apr_2023_cardinal_dir = find_most_common_dir(scheduled_trips_dd, scheduled_stops_dd)

In [None]:
apr_2023_cardinal_dir_df = apr_2023_cardinal_dir.compute()

#### Something is going wrong with linking the route name to the route ID.
* Using `1 CALIFORNIA` as an example, I can see that direction 1 isn't populating correctly,
  even though `find_most_common_dir` yields a cardinal direction for both direction 0 and direction 1.

In [None]:
apr_2023_cardinal_dir_df.route_id.value_counts().describe()

In [None]:
apr_2023_cardinal_dir_df.head(2)

In [None]:
apr_2023_cardinal_dir_df.route_id.nunique()

In [None]:
df.loc[df["Route ID"] == "1"].Direction.value_counts()

In [None]:
df.loc[df["Route ID"] == "1"][
    ["Direction", "dir_0_1", "Route", "Route ID"]
].drop_duplicates()

In [None]:
def most_recent_route_info(
    df: pd.DataFrame, group_cols: list, route_col: str
) -> pd.DataFrame:
    """
    Find the most recent value across a grouping.
    Ex: if we group by route_id, we can find the most recent
    value for route_long_name.

    Needs a `service_date` column to work.

    Args:
        df: table containing `group_cols`, `route_col` and `service_date`.
        group_cols: columns that together define one group.
        route_col: column whose most recent value is wanted.

    Returns:
        One row per combination of `group_cols` (the row with the latest
        `service_date`), with `route_col` renamed to `recent_{route_col}`.
        The other columns of that row are kept as they are.
    """
    # Ascending on the grouping columns, descending on the date, so the
    # first row of every group is the most recent one.
    sort_order = [True] * len(group_cols) + [False]

    most_recent = (
        df.sort_values(group_cols + ["service_date"], ascending=sort_order)
        .drop_duplicates(subset=group_cols)
        .rename(columns={route_col: f"recent_{route_col}"})
    )

    # NOTE(review): this function used to build a delayed left-merge of these
    # values back onto `df`, but that merge was never returned or used, so it
    # has been removed. Confirm callers want the de-duplicated rows returned
    # here rather than the merged frame.
    return most_recent

In [None]:
def find_most_recent_route_id(df):
    """
    Map each route to its most recent standardized route ID.

    Steps:
    1. Replace missing route_id / route_short_name / route_long_name with "".
    2. Build `combined_name` as route_short_name + "__" + route_long_name.
    3. Build `route_id2` row by row with
       `gtfs_schedule_wrangling.standardize_route_id`, which is handed the
       "name" and "route_id" column names.
    4. Keep the most recent `combined_name` per
       schedule_gtfs_dataset_key / name / route_id2, then the most recent
       `route_id2` per schedule_gtfs_dataset_key / name / recent_combined_name.

    Args:
        df: scheduled trips; needs the route columns above plus `name`,
            `schedule_gtfs_dataset_key` and `service_date`.

    Returns:
        Only the columns schedule_gtfs_dataset_key, route_id, service_date
        and recent_route_id2.
    """
    blanks_filled = df.assign(
        route_id=df.route_id.fillna(""),
        route_short_name=df.route_short_name.fillna(""),
        route_long_name=df.route_long_name.fillna(""),
    )
    named = blanks_filled.assign(
        combined_name=blanks_filled.route_short_name
        + "__"
        + blanks_filled.route_long_name
    )

    standardized_ids = named.apply(
        lambda row: gtfs_schedule_wrangling.standardize_route_id(
            row, "name", "route_id"
        ),
        axis=1,
    )
    with_standard_id = named.assign(route_id2=standardized_ids)

    feed_cols = ["schedule_gtfs_dataset_key", "name"]

    # First pass: most recent combined name for each standardized route ID.
    recent_names = most_recent_route_info(
        with_standard_id,
        group_cols=feed_cols + ["route_id2"],
        route_col="combined_name",
    )
    # Second pass: most recent standardized route ID for each recent name.
    recent_ids = recent_names.pipe(
        most_recent_route_info,
        group_cols=feed_cols + ["recent_combined_name"],
        route_col="route_id2",
    )

    keep_cols = [
        "schedule_gtfs_dataset_key",
        "route_id",
        "service_date",
        "recent_route_id2",
    ]
    return recent_ids[keep_cols]

In [None]:
recent_route_id_test = find_most_recent_route_id(scheduled_trips_dd)

In [None]:
recent_route_id_test_df = recent_route_id_test.compute()

In [None]:
recent_route_id_test_df.shape

In [None]:
recent_route_id_test_df.head(3)

In [None]:
recent_route_id_test_df.recent_route_id2.nunique()

#### Testing merging

In [None]:
# Outer merge with `indicator=True` to see which rows exist on each side.
# NOTE(review): `direction_id` is not among the columns selected at the end of
# `find_most_recent_route_id` (schedule_gtfs_dataset_key, route_id,
# service_date, recent_route_id2), so the right-hand frame is expected to lack
# this key — confirm this merge runs.
m1 = pd.merge(
    apr_2023_cardinal_dir_df,
    recent_route_id_test_df,
    on=["schedule_gtfs_dataset_key", "route_id", "direction_id", "service_date"],
    how="outer",
    indicator=True,
)

In [None]:
m1[["_merge"]].value_counts()

In [None]:
apr_2023_cardinal_dir_df.info()

In [None]:
recent_route_id_test_df.info()

In [None]:
# Same outer merge without `service_date` as a key.
# NOTE(review): `direction_id` is still listed as a key although
# `find_most_recent_route_id` does not return that column — confirm this
# merge runs.
m2 = pd.merge(
    apr_2023_cardinal_dir_df,
    recent_route_id_test_df,
    on=["schedule_gtfs_dataset_key", "route_id", "direction_id"],
    how="outer",
    indicator=True,
)

In [None]:
m2._merge.value_counts()

In [None]:
m3 = pd.merge(
    apr_2023_cardinal_dir_df,
    recent_route_id_test_df,
    on=["schedule_gtfs_dataset_key", "route_id", "service_date"],
    how="outer",
    indicator=True,
)

In [None]:
m3._merge.value_counts()

#### Something is going wrong with this function when merging.

In [None]:
def find_cardinal_direction(date: str, gtfs_schedule_keys: list) -> pd.DataFrame:
    """
    Build the cardinal direction of every route-direction for ONE service date.

    Loads the scheduled trips and stop times for `date`, finds the most
    common `stop_primary_direction` per route-direction, and inner-joins it
    with the most recent standardized route ID on
    schedule_gtfs_dataset_key / route_id / service_date. The raw `route_id`
    column is dropped afterwards.

    Args:
        date: service date passed to the `section2_utils` loaders.
        gtfs_schedule_keys: schedule feed keys passed to the loaders.

    Returns:
        The merge is wrapped in `delayed`, so the returned object is lazy;
        call `.compute()` on it to obtain the table.
    """
    # Load the two schedule tables for this single date.
    trips_dd = delayed(section2_utils.load_scheduled_trips(date, gtfs_schedule_keys))
    stop_times_dd = delayed(
        section2_utils.load_scheduled_stop_times(date, gtfs_schedule_keys)
    )

    # Most common primary direction for each route_id / direction_id.
    most_common_dir_dd = find_most_common_dir(trips_dd, stop_times_dd)

    # Most recent standardized route ID, used to connect back to sched_vp_df.
    recent_route_ids_dd = find_most_recent_route_id(trips_dd)

    join_keys = ["schedule_gtfs_dataset_key", "route_id", "service_date"]
    joined_dd = delayed(pd.merge)(
        most_common_dir_dd,
        recent_route_ids_dd,
        on=join_keys,
        how="inner",
    )

    return joined_dd.drop(columns=["route_id"])

In [None]:
apr2023_final = find_cardinal_direction(analysis_date, gtfs_keys)

In [None]:
apr2023_final_df = apr2023_final.compute()

In [None]:
apr2023_final_df.sample()

In [None]:
apr2023_final_df.groupby(
    ["recent_route_id2", "direction_id", "stop_primary_direction"]
).agg({"service_date": "unique"})

In [None]:
# Rows for the single analysis date. Filter on `analysis_date` instead of
# repeating the literal date so this slice stays in step with the date used to
# load the schedule tables, and take an explicit copy so later edits to this
# slice are independent of `df`.
df_apr_2023 = df.loc[df["Date"] == analysis_date].copy()

In [None]:
df_apr_2023["Route ID"].nunique()

In [None]:
# Drop the existing direction columns so they can be replaced by the merge below.
# `errors="ignore"` keeps this cell re-runnable: without it a second run raises
# KeyError because the columns are already gone.
df_apr_2023 = df_apr_2023.drop(
    columns=["Direction", "direction_id", "service_date"], errors="ignore"
)

In [None]:
df_apr_2023["temp_service_date"] = df_apr_2023["Date"].astype(str)

In [None]:
df_apr_2023.head(1)

In [None]:
apr2023_final_df.head(1)

#### 1st Merge test

In [None]:
m1_apr_2023 = pd.merge(
    df_apr_2023,
    apr2023_final_df,
    left_on=["schedule_gtfs_dataset_key", "dir_0_1", "Route ID", "temp_service_date"],
    right_on=[
        "schedule_gtfs_dataset_key",
        "direction_id",
        "recent_route_id2",
        "service_date",
    ],
    how="left",
)

In [None]:
print(len(m1_apr_2023))
print(m1_apr_2023.stop_primary_direction.count())
print(m1_apr_2023.dir_0_1.count())

In [None]:
len(df_apr_2023)

##### The route IDs match between the two frames, so the merge itself must be the issue.

In [None]:
cardinal_dir_routeids = set(apr2023_final_df.recent_route_id2.unique().tolist())
main_routeids = set(m1_apr_2023["Route ID"].unique().tolist())

In [None]:
cardinal_dir_routeids - main_routeids

In [None]:
main_routeids - cardinal_dir_routeids

In [None]:
apr2023_final_df.sample()

#### 2nd Merge Test

In [None]:
m2_apr_2023 = pd.merge(
    df_apr_2023,
    apr2023_final_df,
    left_on=["schedule_gtfs_dataset_key", "dir_0_1", "Route ID"],
    right_on=[
        "schedule_gtfs_dataset_key",
        "direction_id",
        "recent_route_id2",
    ],
    how="left",
)

In [None]:
print(len(m2_apr_2023))
print(m2_apr_2023.stop_primary_direction.count())
print(m2_apr_2023.dir_0_1.count())

#### 3rd test

In [None]:
m3_apr_2023 = pd.merge(
    df_apr_2023,
    apr2023_final_df,
    left_on=["dir_0_1", "Route ID"],
    right_on=[
        "direction_id",
        "recent_route_id2",
    ],
    how="left",
)

In [None]:
print(len(m3_apr_2023))
print(m3_apr_2023.stop_primary_direction.count())
print(m3_apr_2023.dir_0_1.count())

#### Testing for ALL dates

In [None]:
def all_dates_cardinal_dir(dates: list, gtfs_schedule_keys: list) -> pd.DataFrame:
    """
    Stack the cardinal-direction table of every date into one DataFrame.

    Args:
        dates: service dates; each one is passed to `find_cardinal_direction`.
        gtfs_schedule_keys: schedule feed keys passed to
            `find_cardinal_direction` unchanged.

    Returns:
        The computed per-date tables concatenated row-wise in the order of
        `dates`. An empty DataFrame is returned when `dates` is empty.
    """
    # Collect the per-date frames and concatenate once. Growing a DataFrame
    # with pd.concat inside the loop re-copies all earlier rows each time.
    per_date_frames = [
        find_cardinal_direction(date, gtfs_schedule_keys).compute() for date in dates
    ]

    # pd.concat raises on an empty list, so keep the old "no dates" result.
    if not per_date_frames:
        return pd.DataFrame()

    return pd.concat(per_date_frames, axis=0)

In [None]:
start1 = datetime.datetime.now()
all_dates_df = all_dates_cardinal_dir(all_dates_list, gtfs_keys)
end1 = datetime.datetime.now()
print(f"{end1-start1}")

In [None]:
all_dates_df.groupby(["service_date"]).agg({"recent_route_id2": "nunique"})

#### What to do about routes that change direction? 
* Check and make sure my code is right.
* Use 14R
    * 14R is also missing data for September but it looks like all routes for San Francisco are.

In [None]:
all_dates_df.groupby(
    ["recent_route_id2", "direction_id", "stop_primary_direction"]
).agg({"service_date": "unique"})

In [None]:
all_dates_df.groupby(["recent_route_id2",]).agg(
    {"stop_primary_direction": "nunique"}
).sort_values(by=["stop_primary_direction"], ascending=False).head()

In [None]:
scheduled_trips_dd_aug = delayed(
    section2_utils.load_scheduled_trips("2023-08-15", gtfs_keys)
)
scheduled_stops_dd_aug = delayed(
    section2_utils.load_scheduled_stop_times("2023-08-15", gtfs_keys)
)
aug_2023_cardinal_dir = find_most_common_dir(
    scheduled_trips_dd_aug, scheduled_stops_dd_aug
)

In [None]:
aug_2023_cardinal_dir_df = aug_2023_cardinal_dir.compute()

In [None]:
aug_2023_cardinal_dir_df.loc[aug_2023_cardinal_dir_df.route_id == "14R"]

In [None]:
# Drop the existing direction columns so they can be replaced by the merge below.
# `errors="ignore"` keeps this cell re-runnable: `df` is overwritten in place,
# so without it a second run raises KeyError because the columns are already gone.
df = df.drop(columns=["Direction", "direction_id", "service_date"], errors="ignore")

In [None]:
df["temp_service_date"] = df["Date"].astype(str)

In [None]:
m1_all_dates = pd.merge(
    df,
    all_dates_df,
    left_on=["schedule_gtfs_dataset_key", "dir_0_1", "Route ID", "temp_service_date"],
    right_on=[
        "schedule_gtfs_dataset_key",
        "direction_id",
        "recent_route_id2",
        "service_date",
    ],
    how="left",
)

In [None]:
m1_all_dates.stop_primary_direction.value_counts()

In [None]:
print(len(m1_all_dates))
print(m1_all_dates.stop_primary_direction.count())
print(m1_all_dates.dir_0_1.count())

### Plot and see