## Checking routes with more than 2 cardinal directions
* While running an operator (but I forgot which one) in district 4, I discovered there were some routes with more than 2 cardinal directions. 
* This shouldn't be happening. Find out what's going on. 
* Work backwards from the final dataset to figure out which routes from D4 are experiencing this, then go back to the pipeline.

In [None]:
import _section1_utils
import _section2_utils as section2
import geopandas as gpd
import numpy as np
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)
from shared_utils import catalog_utils, rt_dates, rt_utils

In [None]:
# Notebook display settings: up to 100 columns, floats shown with two
# decimals, and no truncation of rows or cell contents.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

#### Check this out for all districts

In [None]:
# Parquet path of the `route_schedule_vp` digest table, built from the
# directory and table name stored in GTFS_DATA_DICT.
schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"

In [None]:
# Load only the rows whose `sched_rt_category` is "schedule_and_vp".
all_districts_df = pd.read_parquet(
    schd_vp_url,
    filters=[[("sched_rt_category", "==", "schedule_and_vp")]],
)

# Count the distinct primary directions recorded for each operator-route
# and list the routes with the most directions first.
all_districts_df_agg = (
    all_districts_df.groupby(
        ["schedule_gtfs_dataset_key", "organization_name", "route_combined_name"]
    )
    .agg(route_primary_direction=("route_primary_direction", "nunique"))
    .reset_index()
    .sort_values("route_primary_direction", ascending=False)
)

In [None]:
# Routes with more than two distinct primary directions, ordered by operator.
routes_multi_cardinal_dir = all_districts_df_agg.query(
    "route_primary_direction > 2"
).sort_values("organization_name")

In [None]:
# Number of distinct route names among the flagged routes.
routes_multi_cardinal_dir["route_combined_name"].nunique()

In [None]:
# Number of flagged operator-route rows.
len(routes_multi_cardinal_dir.index)

In [None]:
# Route names that appear most often among the flagged rows.
routes_multi_cardinal_dir["route_combined_name"].value_counts().head()

In [None]:
# Flagged rows named exactly "Route 1".
routes_multi_cardinal_dir[
    routes_multi_cardinal_dir["route_combined_name"] == "Route 1"
]

In [None]:
# Distinct flagged routes with their direction counts.
routes_multi_cardinal_dir.loc[
    :,
    [
        "schedule_gtfs_dataset_key",
        "organization_name",
        "route_combined_name",
        "route_primary_direction",
    ],
].drop_duplicates()

#### Checking whether routes are actually switching direction in real life.

In [None]:
def changing_directions(
    organization_name: str, route: str, gtfs_dataset_key: str
) -> None:
    """
    Display how one route's geometry and cardinal directions vary by date.

    Two outputs are displayed in the notebook:

    1. An interactive map of the operator's route geometries whose
       `route_combined_name` contains `route`, colored by service date.
    2. A table of the distinct (service_date, route_primary_direction,
       direction_id) combinations for the route, taken from the
       notebook-level `all_districts_df`.

    Args:
        organization_name: Organization name; translated with
            `_section1_utils.organization_name_crosswalk` into the `name`
            used to filter the operator routes map.
        route: Route name. Matched as a literal substring for the map and
            by exact equality for the direction table.
        gtfs_dataset_key: `schedule_gtfs_dataset_key` of the operator's feed,
            used to filter `all_districts_df`.
    """
    name = _section1_utils.organization_name_crosswalk(organization_name)
    op_routes_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_routes_map}.parquet"

    gdf = gpd.read_parquet(op_routes_url, filters=[[("name", "==", name)]])

    # Dates are turned into strings before mapping (presumably so the map
    # can serialize them and treat each date as a category - TODO confirm).
    gdf["service_date"] = gdf["service_date"].dt.strftime("%Y-%m-%d %H:%M:%S")

    # Match the route name literally: regex=False keeps characters such as
    # "(" or "+" in a route name from being read as a pattern, and na=False
    # keeps rows with a missing name from breaking the boolean mask.
    # NOTE: this is still a substring match, so "Route 1" also selects
    # "Route 10", "Route 11", etc.
    is_route = gdf.route_combined_name.str.contains(route, regex=False, na=False)
    gdf2 = gdf.loc[is_route]

    display(
        gdf2.explore(
            "service_date",
            cmap="Spectral",
            tiles="CartoDB positron",
            width=800,
            height=500,
            style_kwds={"weight": 3},
            tooltip=["service_date"],
        )
    )

    # Distinct direction assignments per service date for this exact route.
    route_df = (
        all_districts_df.loc[
            (all_districts_df.schedule_gtfs_dataset_key == gtfs_dataset_key)
            & (all_districts_df.route_combined_name == route)
        ]
        .sort_values(by=["service_date", "direction_id"])[
            ["service_date", "route_primary_direction", "direction_id"]
        ]
        .drop_duplicates()
    )
    display(route_df)

In [None]:
# Monterey-Salinas Transit, route 84.
changing_directions(
    organization_name="Monterey-Salinas Transit",
    route="84 King City - Paso Robles",
    gtfs_dataset_key="9809d3f8121513057bc5cb8de7b54ce2",
)

##### Elk Grove Route 11
* Route didn't change.

In [None]:
# City of Elk Grove, Route 11.
changing_directions(
    organization_name="City of Elk Grove",
    route="Route 11",
    gtfs_dataset_key="63029a23cb0e73f2a5d98a345c5e2e40",
)

##### San Francisco 14R
* No changes over time, based on the geography we have.
* Direction 0 changes from Southbound to Westbound in October 2023.

In [None]:
# San Francisco, 14R Mission Rapid.
changing_directions(
    organization_name="City and County of San Francisco",
    route="14R MISSION RAPID",
    gtfs_dataset_key="7cc0cb1871dfd558f11a2885c145d144",
)

##### Fairfield - Route 7
* No change based on the geography.
* Both direction 0 and direction 1 change in May 2023.

In [None]:
# City of Fairfield, route 7.
changing_directions(
    organization_name="City of Fairfield",
    route="7 FTC/Solano College/Cordelia Library",
    gtfs_dataset_key="0f5e1b251db53223200c5bfc365d33f2",
)

##### Ventura Route 1
* This one is confusing, there are two values for direction 0 consistently? How is this possible?
* Route does seem to genuinely change. 

In [None]:
# Ventura County Transportation Commission, Route 1.
changing_directions(
    organization_name="Ventura County Transportation Commission",
    route="Route 1",
    gtfs_dataset_key="1770249a5a2e770ca90628434d4934b1",
)

##### LA Metro 256 Metro Local Line
* Route genuinely changed.

In [None]:
# LA Metro, 256 Metro Local Line.
changing_directions(
    organization_name="Los Angeles County Metropolitan Transportation Authority",
    route="256 Metro Local Line",
    gtfs_dataset_key="3f3f36b4c41cc6b5df3eb7f5d8ea6e3c",
)

##### OmniTrans 82 RANCHO CUCAMONGA-FONTANA
* No change in route geography.
* Most of the directions are Southbound and Northbound.
* The direction changes in May 2024.

In [None]:
# OmniTrans, route 82.
changing_directions(
    organization_name="OmniTrans",
    route="82 RANCHO CUCAMONGA-FONTANA",
    gtfs_dataset_key="95cb514215c61ca578b01d885f35ec0a",
)

##### Riverside Transit Agency 13 Hunter Park Metro-Galleria
* Route changed starting in August 2023.

In [None]:
# Riverside Transit Agency, route 13.
changing_directions(
    organization_name="Riverside Transit Agency",
    route="13 Hunter Park Metro-Galleria",
    gtfs_dataset_key="d9d0325e50e50064e3cc8384b1751d67",
)

### `schedule_stats_by_routes/cardinal_direction_for_route_direction` 

In [None]:
# First four entries of the 2023 date list.
dates_2023 = rt_dates.y2023_dates[:4]

In [None]:
# Display the selected dates.
dates_2023

In [None]:
# Base file name of the `stop_times_direction` table from GTFS_DATA_DICT;
# a date suffix is appended when the files are read.
STOP_TIMES_FILE = GTFS_DATA_DICT.rt_vs_schedule_tables.stop_times_direction

In [None]:
# Read each date's stop-times file, keeping only stops whose
# `stop_primary_direction` is not "Unknown", and stack them with a single
# concat. Building the list first avoids re-copying the accumulated frame on
# every iteration and avoids seeding the result with an empty DataFrame.
stop_times_gdf = pd.concat(
    [
        pd.read_parquet(
            f"{RT_SCHED_GCS}{STOP_TIMES_FILE}_{date}.parquet",
            filters=[[("stop_primary_direction", "!=", "Unknown")]],
        )
        for date in dates_2023
    ]
)

In [None]:
# Number of stop-time rows for feed 0f5e1b25... (City of Fairfield).
len(
    stop_times_gdf.query(
        "schedule_gtfs_dataset_key == '0f5e1b251db53223200c5bfc365d33f2'"
    )
)

In [None]:
# Stop-time rows for the Fairfield feed, re-indexed from zero.
fairfield_stops = stop_times_gdf[
    stop_times_gdf["schedule_gtfs_dataset_key"] == "0f5e1b251db53223200c5bfc365d33f2"
].reset_index(drop=True)

In [None]:
# Peek at the first two rows.
fairfield_stops.iloc[:2]

In [None]:
# Columns requested when importing the scheduled trips tables.
trip_scheduled_col = [
    "route_id",
    "trip_instance_key",
    "gtfs_dataset_key",
    "shape_array_key",
    "direction_id",
    "route_long_name",
    "route_short_name",
    "route_desc",
    "name",
]

In [None]:
# Import the scheduled trips for every date and stack them with a single
# concat. Building the list first avoids re-copying the accumulated frame on
# every iteration and avoids seeding the result with an empty DataFrame.
trips_df = pd.concat(
    [
        helpers.import_scheduled_trips(
            date, columns=trip_scheduled_col, get_pandas=True
        )
        for date in dates_2023
    ]
)

In [None]:
# Number of scheduled-trip rows for the Fairfield feed.
len(
    trips_df.query(
        "schedule_gtfs_dataset_key == '0f5e1b251db53223200c5bfc365d33f2'"
    )
)

In [None]:
# Scheduled trips for the Fairfield feed.
city_fairfield_trips = trips_df[
    trips_df["schedule_gtfs_dataset_key"] == "0f5e1b251db53223200c5bfc365d33f2"
]

In [None]:
# Drop the `route_desc` column.
city_fairfield_trips = city_fairfield_trips.drop("route_desc", axis="columns")

In [None]:
# Number of Fairfield trips whose long name mentions "Cordelia".
len(
    city_fairfield_trips[
        city_fairfield_trips["route_long_name"].str.contains("Cordelia")
    ]
)

In [None]:
# Fairfield trips whose long name mentions "Cordelia".
cordelia = city_fairfield_trips[
    city_fairfield_trips["route_long_name"].str.contains("Cordelia")
]

In [None]:
# Attach trip attributes to each stop-time row (inner join on the trip keys).
merge_cols = ["trip_instance_key", "schedule_gtfs_dataset_key", "shape_array_key"]

stop_times_with_trip = fairfield_stops.merge(cordelia, on=merge_cols)

In [None]:
# Peek at the first two merged rows.
stop_times_with_trip.iloc[:2]

In [None]:
main_cols = ["route_id", "schedule_gtfs_dataset_key", "direction_id"]

# Count the stop-time rows per route-direction and stop primary direction.
agg1 = (
    stop_times_with_trip.groupby([*main_cols, "stop_primary_direction"])
    .agg(total_stops=("stop_sequence", "count"))
    .reset_index()
)

In [None]:
# Within each route-direction, put the most frequent stop direction first.
agg2 = agg1.sort_values(
    by=[*main_cols, "total_stops"],
    ascending=[True] * len(main_cols) + [False],
)

#### Connect this back to route typologies

In [None]:
# Catalog entry for the `sched_route_direction_metrics` table.
ROUTE_DIR_EXPORT = GTFS_DATA_DICT.rt_vs_schedule_tables.sched_route_direction_metrics

In [None]:
# Display the value of RT_SCHED_GCS.
RT_SCHED_GCS

In [None]:
# Display the value of ROUTE_DIR_EXPORT.
ROUTE_DIR_EXPORT

In [None]:
def open_add(file: str, date: str):
    df = pd.read_parquet(
        "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_route_dir/schedule_route_direction_metrics_2023-03-15.parquet"
    )
    df["service_date"] = date
    return df

In [None]:
# Route-direction metrics for 2023-03-15.
march_df = open_add(
    file="gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_route_dir/schedule_route_direction_metrics_2023-03-15.parquet",
    date="3-15",
)

In [None]:
# Route-direction metrics for 2023-04-12.
apr_df = open_add(
    file="gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_route_dir/schedule_route_direction_metrics_2023-04-12.parquet",
    date="4-12",
)

In [None]:
# Route-direction metrics for 2023-05-17.
may_df = open_add(
    file="gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_route_dir/schedule_route_direction_metrics_2023-05-17.parquet",
    date="5-17",
)

In [None]:
# Route-direction metrics for 2023-06-14.
june_df = open_add(
    file="gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_route_dir/schedule_route_direction_metrics_2023-06-14.parquet",
    date="6-14",
)

In [None]:
# Stack the four monthly frames, in calendar order.
all_df = pd.concat(
    [
        march_df,
        apr_df,
        may_df,
        june_df,
    ],
    axis=0,
)

In [None]:
# Keys that identify one route-direction within a feed.
route_group_merge_cols = ["schedule_gtfs_dataset_key", "route_id", "direction_id"]

In [None]:
# Left-join the route-direction metrics onto the stop-direction counts.
m1 = agg2.merge(all_df, how="left", on=route_group_merge_cols)

In [None]:
# Drop the `geometry` column.
m1 = m1.drop("geometry", axis="columns")

In [None]:
# Peek at the first merged row.
m1.iloc[:1]

In [None]:
# Stop-direction counts per service date, largest counts first within a date.
(
    m1.loc[
        :, ["service_date", "stop_primary_direction", "direction_id", "total_stops"]
    ]
    .sort_values(["service_date", "total_stops"], ascending=[True, False])
    .drop_duplicates()
)