## Switch Direction 0/1 to Cardinal
* direction_id -> cardinal direction. I want to use stop_times_with_direction and simply count, by route_id/direction_id, how many stops fall in each stop_primary_direction group. Whichever direction has the plurality gets assigned.
* you can take a look at the gtfs_analytics_catalog and find that file and just open it. the script to create it is in gtfs_funnel already, and you can see what that column means.
* https://github.com/cal-itp/data-analyses/blob/main/_shared_utils/shared_utils/gtfs_analytics_data.yml
* https://github.com/cal-itp/data-analyses/blob/main/gtfs_funnel/stop_times_with_direction.py


In [None]:
from datetime import datetime

import _section2_utils as section2_utils
import geopandas as gpd
import numpy as np
import pandas as pd
from IPython.display import HTML, Image, Markdown, display, display_html
from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)
from shared_utils import catalog_utils, rt_dates, rt_utils

In [None]:
# Notebook display settings: show up to 100 columns, format floats with two
# decimals, and do not truncate rows or cell contents when printing DataFrames.
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [None]:
org_name = "Los Angeles County Metropolitan Transportation Authority"

### Charts Testing

In [None]:
import altair as alt

# max_rows=None removes the row limit on the default data transformer so the
# full DataFrames used below can be passed to charts.
alt.data_transformers.enable("default", max_rows=None)
import _report_utils

In [None]:
# Data Dictionary
# NOTE: this re-binds GTFS_DATA_DICT, which was also imported from
# segment_speed_utils.project_vars at the top of the notebook.
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")
import yaml

# readable_dict supplies the chart titles/subtitles looked up in the cells below.
with open("readable.yml") as f:
    readable_dict = yaml.safe_load(f)

# Color Palette
# color_dict supplies the *_domain / *_range entries passed to the ruler charts.
with open("color_palettes.yml") as f:
    color_dict = yaml.safe_load(f)

In [None]:
sched_vp_df = section2_utils.load_schedule_vp_metrics(org_name)

In [None]:
sched_vp_df.columns

In [None]:
sched_vp_df["Direction"].value_counts()

In [None]:
sched_vp_df.shape

In [None]:
df = sched_vp_df.copy()

In [None]:
# Distinct route names become the options of the dropdown widget.
routes_list = df["Route"].unique().tolist()

route_dropdown = alt.binding_select(
    options=routes_list,
    name="Routes",
)
# Point selection on the "Route" field, bound to the dropdown above. Each chart
# below registers it with .add_params() and filters on it with
# .transform_filter(), so one dropdown choice drives every chart.
route_selector = alt.selection_point(
    fields=["Route"],
    bind=route_dropdown,
)

In [None]:
all_day = df.loc[df["Period"] == "all_day"].reset_index(drop=True)

In [None]:
all_day.shape

In [None]:
route_stats_df = section2_utils.route_stats(df)

In [None]:
route_stats_df.head(2)

In [None]:
timeliness_df = section2_utils.timeliness_trips(df)

In [None]:
timeliness_df.head(2)

In [None]:
sched_journey_vp = section2_utils.pct_vp_journey(
    all_day,
    "% Scheduled Trip w/ 1+ VP/Minute",
    "% Scheduled Trip w/ 2+ VP/Minute",
)

In [None]:
sched_journey_vp.head(2)

#### Avg Scheduled Min Graph

In [None]:
(
    section2_utils.grouped_bar_chart(
        df=all_day,
        color_col="Direction",
        y_col="Average Scheduled Service (trip minutes)",
        offset_col="Direction",
        title=readable_dict["avg_scheduled_min_graph"]["title"],
        subtitle=readable_dict["avg_scheduled_min_graph"]["subtitle"],
    )
    .add_params(route_selector)
    .transform_filter(route_selector)
)

### Timeliness
* Need to change `readable.yml` titles so it displays Eastbound/Westbound instead of 0

In [None]:
timeliness_df.head()

In [None]:
timeliness_df.info()

In [None]:
# Faceted timeliness chart restricted to rows labelled "Eastbound/Westbound",
# filtered by the route dropdown.
# NOTE(review): elsewhere in this notebook "Direction" is compared with 0 and
# merged against direction_id, so if timeliness_df still carries the numeric
# Direction this string filter would match nothing -- confirm which frame
# (sched_vp_df vs. the recoded sched_vp_df2) should feed timeliness_df.
(
    (
        section2_utils.base_facet_chart(
            timeliness_df.loc[timeliness_df["Direction"] == "Eastbound/Westbound"],
            "value",
            "variable",
            "Period",
            readable_dict["timeliness_trips_dir_eastwest_graph"]["title"],
            readable_dict["timeliness_trips_dir_eastwest_graph"]["subtitle"],
        )
    )
    .add_params(route_selector)
    .transform_filter(route_selector)
)

#### Frequency
* Add specific direction into tooltip (for all the graphs)

In [None]:
(
    section2_utils.frequency_chart(df)
    .add_params(route_selector)
    .transform_filter(route_selector)
)

#### Speed

In [None]:
(
        section2_utils.base_facet_line(
            df,
            "Speed (MPH)",
            readable_dict["speed_graph"]["title"],
            readable_dict["speed_graph"]["subtitle"],
        )
        .add_params(route_selector)
        .transform_filter(route_selector)
    )

#### VP Per Min

In [None]:
(
        (
            section2_utils.base_facet_with_ruler_chart(
                all_day,
                "Average VP per Minute",
                "ruler_for_vp_per_min",
                readable_dict["vp_per_min_graph"]["title"],
                readable_dict["vp_per_min_graph"]["subtitle"],
                color_dict["vp_domain"],
                color_dict["vp_range"]
            )
        )
        .add_params(route_selector)
        .transform_filter(route_selector)
    )

#### Scheduled VP Per Min

In [None]:
sched_journey_vp.head(2)

In [None]:
(
        section2_utils.base_facet_circle(
            sched_journey_vp,
            "% of Actual Trip Minutes",
            "Category",
            "ruler_100_pct",
            readable_dict["sched_vp_per_min_graph"]["title"],
            readable_dict["sched_vp_per_min_graph"]["subtitle"],
        )
        .add_params(route_selector)
        .transform_filter(route_selector)
    )

#### Spatial Accuracy

In [None]:
(
        section2_utils.base_facet_with_ruler_chart(
            all_day,
            "% VP within Scheduled Shape",
            "ruler_100_pct",
            readable_dict["spatial_accuracy_graph"]["title"],
            readable_dict["spatial_accuracy_graph"]["subtitle"],
            color_dict["spatial_accuracy_domain"],
            color_dict["spatial_accuracy_range"]
        )
        .add_params(route_selector)
        .transform_filter(route_selector)
    )

#### Text

In [None]:
(
        (section2_utils.create_text_table(route_stats_df, 'Eastbound/Westbound'))
        .add_params(route_selector)
        .transform_filter(route_selector)
    )

### Why are there so many unknowns in the Cardinal Direction?
* Because I need to clean up route names using [this script](https://github.com/cal-itp/data-analyses/blob/b1e5d4f870400251240eeba4a6515a0848e5d6f8/gtfs_funnel/clean_route_naming.py#L4).

In [None]:
# NOTE(review): `stop` is not defined anywhere in the visible notebook, so this
# cell raises NameError. Presumably a deliberate breakpoint that keeps
# "Run All" from executing the cells below -- confirm before removing.
stop

In [None]:
gtfs_keys = list(sched_vp_df.schedule_gtfs_dataset_key.unique())

In [None]:
date = section2_utils.load_most_current_date()

In [None]:
date

In [None]:
all_dates_list = list(sched_vp_df.Date.unique())

In [None]:
# Convert each date to a day-resolution string (unit="D"); the load_* helpers
# defined below are annotated as taking `date: str`.
# The comprehension variable `date` is local to the comprehension, so it does
# not overwrite the `date` assigned a few cells above.
all_dates_list = [np.datetime_as_string(date, unit="D") for date in all_dates_list]

### Use more than one `schedule_gtfs_dataset_key` & date

In [None]:
def load_scheduled_stop_times(date: str, gtfs_schedule_key: list) -> pd.DataFrame:
    """
    Load one service date of scheduled stop times for the requested feeds.

    Args:
        date: service date string, passed to the importer and also parsed
            with ``pd.to_datetime`` for the ``service_date`` column.
        gtfs_schedule_key: ``schedule_gtfs_dataset_key`` values to keep.

    Returns:
        The frame returned by ``helpers.import_scheduled_stop_times``
        (requested with ``get_pandas=True`` and ``with_direction=True``),
        limited to the columns listed below, plus a ``service_date`` column.
    """
    keep_cols = [
        "feed_key",
        "stop_id",
        "stop_sequence",
        "schedule_gtfs_dataset_key",
        "trip_instance_key",
        "shape_array_key",
        "stop_name",
        "prior_stop_sequence",
        "subseq_stop_sequence",
        "stop_pair",
        "stop_pair_name",
        "stop_primary_direction",
        "stop_meters",
    ]
    # Only read rows that belong to the requested feeds.
    feed_filter = [[("schedule_gtfs_dataset_key", "in", gtfs_schedule_key)]]

    raw_stop_times = helpers.import_scheduled_stop_times(
        date,
        filters=feed_filter,
        columns=keep_cols,
        get_pandas=True,
        with_direction=True,
    )

    # Stamp every row with its date so several dates can be stacked later.
    return raw_stop_times.assign(service_date=pd.to_datetime(date))

In [None]:
def load_scheduled_trips(date: str, gtfs_schedule_key: list) -> pd.DataFrame:
    """
    Load one service date of scheduled trips for the requested feeds.

    Args:
        date: service date string, passed to the importer and also parsed
            with ``pd.to_datetime`` for the ``service_date`` column.
        gtfs_schedule_key: ``gtfs_dataset_key`` values to keep.

    Returns:
        The result of ``helpers.import_scheduled_trips`` for the columns
        listed below, with an added ``service_date`` column.
    """
    keep_cols = [
        "route_id",
        "trip_instance_key",
        "gtfs_dataset_key",
        "shape_array_key",
        "direction_id",
        "route_long_name",
        "route_short_name",
        "route_desc",
        "name",
    ]
    # Only read rows that belong to the requested feeds.
    feed_filter = [[("gtfs_dataset_key", "in", gtfs_schedule_key)]]

    raw_trips = helpers.import_scheduled_trips(
        date,
        filters=feed_filter,
        columns=keep_cols,
    )

    # Stamp every row with its date so several dates can be stacked later.
    return raw_trips.assign(service_date=pd.to_datetime(date))

In [None]:
# test = load_scheduled_trips(all_dates_list[0],gtfs_keys)

In [None]:
def load_stack_all_dates(
    date_list: list, gtfs_schedule_keys: list
) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """
    Load and stack scheduled stop times and scheduled trips across dates.

    Args:
        date_list: service date strings to load, in order.
        gtfs_schedule_keys: feed keys passed through to the per-date loaders.

    Returns:
        ``(scheduled_stop_times_df, scheduled_trips_df)``: each is the
        row-wise concatenation of the per-date frames. Original indices are
        kept (no ``ignore_index``). Two empty DataFrames are returned when
        ``date_list`` is empty.
    """
    # pd.concat raises on an empty list, so keep the "nothing to load" case
    # returning empty frames as before.
    if not date_list:
        return pd.DataFrame(), pd.DataFrame()

    # Collect the per-date frames and concatenate once per table, instead of
    # re-copying an ever-growing frame on every iteration.
    scheduled_stop_times_df = pd.concat(
        [
            load_scheduled_stop_times(service_date, gtfs_schedule_keys)
            for service_date in date_list
        ],
        axis=0,
    )

    scheduled_trips_df = pd.concat(
        [
            load_scheduled_trips(service_date, gtfs_schedule_keys)
            for service_date in date_list
        ],
        axis=0,
    )

    # Need to return scheduled_trips_df to find the most recent route id
    return scheduled_stop_times_df, scheduled_trips_df

In [None]:
# 10:18
# m1, scheduled_trips_df = load_stack_all_dates(all_dates_list, gtfs_keys)

In [None]:
# len(m1)

### Aggregate
* Do we want to aggregate by date?

In [None]:
def find_most_common_dir(df: pd.DataFrame) -> pd.DataFrame:
    """
    Keep the plurality ``stop_primary_direction`` per route / feed / direction_id.

    Args:
        df: one row per scheduled stop, with ``route_id``,
            ``schedule_gtfs_dataset_key``, ``direction_id``,
            ``stop_primary_direction`` and ``stop_sequence`` columns.

    Returns:
        One row per (route_id, schedule_gtfs_dataset_key, direction_id) with
        the ``stop_primary_direction`` that has the most non-null
        ``stop_sequence`` values in that group.
    """
    group_keys = ["route_id", "schedule_gtfs_dataset_key", "direction_id"]

    # Number of stops for every (group, stop_primary_direction) combination.
    stop_counts = (
        df.groupby(group_keys + ["stop_primary_direction"])["stop_sequence"]
        .count()
        .reset_index(name="total_stops")
    )

    # Within each group, order directions from most to fewest stops ...
    ranked = stop_counts.sort_values(
        by=group_keys + ["total_stops"],
        ascending=[True] * len(group_keys) + [False],
    )

    # ... so the first row kept per group is the plurality direction.
    winners = ranked.drop_duplicates(subset=group_keys).reset_index(drop=True)

    return winners.drop(columns=["total_stops"])

In [None]:
def find_most_recent_route_id(df: pd.DataFrame) -> pd.DataFrame:
    """
    Map each raw ``route_id`` to its most recent standardized route id.

    Args:
        df: scheduled trips stacked across dates, with ``route_id``,
            ``route_short_name``, ``route_long_name``, ``name``,
            ``schedule_gtfs_dataset_key`` and ``service_date`` columns.

    Returns:
        A frame with ``schedule_gtfs_dataset_key``, ``recent_route_id2`` and
        ``route_id``, keeping the newest ``service_date`` row per
        (schedule_gtfs_dataset_key, name, route_id2).
    """
    group_keys = ["schedule_gtfs_dataset_key", "name", "route_id2"]

    # Replace missing identifiers with "" so the concatenation below never
    # produces NaN.
    trips = df.assign(
        route_id=df.route_id.fillna(""),
        route_short_name=df.route_short_name.fillna(""),
        route_long_name=df.route_long_name.fillna(""),
    )

    trips = trips.assign(
        combined_name=trips.route_short_name + "__" + trips.route_long_name
    )

    # Row-wise standardization of route_id via the shared helper.
    trips = trips.assign(
        route_id2=trips.apply(
            lambda row: gtfs_schedule_wrangling.standardize_route_id(
                row, "name", "route_id"
            ),
            axis=1,
        )
    )

    # First resolve the most recent combined name per route, then the most
    # recent route_id2 per resolved name.
    # presumably these calls add the recent_combined_name / recent_route_id2
    # columns used below -- verify against gtfs_schedule_wrangling.
    with_recent_name = gtfs_schedule_wrangling.most_recent_route_info(
        trips, group_cols=group_keys, route_col="combined_name"
    )
    with_recent_id = gtfs_schedule_wrangling.most_recent_route_info(
        with_recent_name,
        group_cols=["schedule_gtfs_dataset_key", "name", "recent_combined_name"],
        route_col="route_id2",
    )

    # Newest service_date first within each group, then keep that first row.
    newest_first = with_recent_id.sort_values(
        group_keys + ["service_date"],
        ascending=[True] * len(group_keys) + [False],
    )
    latest = newest_first.drop_duplicates(subset=group_keys).rename(
        columns={"combined_name": "recent_combined_name"}
    )

    return latest[["schedule_gtfs_dataset_key", "recent_route_id2", "route_id"]]

In [None]:
# recent_id_df = find_most_recent_route_id(scheduled_trips_df)

In [None]:
"""cardinal_dir_df2 = pd.merge(
    recent_id_df,
    cardinal_dir_df,
    on=["schedule_gtfs_dataset_key", "route_id"],
    how="inner",
)"""

In [None]:
""" sched_vp_df2 = pd.merge(
    sched_vp_df,
    cardinal_dir_df2.drop(columns=["route_id"]),
    left_on=["schedule_gtfs_dataset_key", "Direction", "Route ID"],
    right_on=[
        "schedule_gtfs_dataset_key",
        "direction_id",
        "recent_route_id2",
    ],
    how="left",
)"""

In [None]:
def find_cardinal_direction(date_list: list, gtfs_schedule_keys: list) -> pd.DataFrame:
    """
    Build the route/direction_id -> plurality stop direction lookup.

    Args:
        date_list: service date strings to load.
        gtfs_schedule_keys: feed keys to load.

    Returns:
        The plurality ``stop_primary_direction`` per
        (route, schedule_gtfs_dataset_key, direction_id), keyed by
        ``recent_route_id2``. The raw ``route_id`` is dropped before
        returning.
    """
    # Every available date of stop times and trips for these feeds.
    stop_times, trips = load_stack_all_dates(date_list, gtfs_schedule_keys)

    # Attach trip attributes (route_id, direction_id, ...) to each stop row.
    trips_with_stops = trips.merge(
        stop_times,
        on=["trip_instance_key", "schedule_gtfs_dataset_key", "shape_array_key"],
        how="inner",
    )

    # Plurality direction per raw route_id / direction_id.
    dominant_dir = find_most_common_dir(trips_with_stops)

    # Most recent route id, needed to connect back to sched_vp_df.
    recent_ids = find_most_recent_route_id(trips)

    lookup = dominant_dir.merge(
        recent_ids,
        on=["schedule_gtfs_dataset_key", "route_id"],
        how="inner",
    )

    return lookup.drop(columns=["route_id"])

In [None]:
cardinal_dir_df = find_cardinal_direction(all_dates_list, gtfs_keys)

In [None]:
cardinal_dir_df.shape

In [None]:
cardinal_dir_df.head(2)

#### Recode 0=East/West, 1=North/South

In [None]:
sched_vp_df.columns

In [None]:
# Left-join the computed directions onto the metrics table so every metrics
# row is kept. The existing "Cardinal Direction" column is dropped first
# because it is recomputed further down. "Direction" is matched against
# direction_id and "Route ID" against recent_route_id2.
sched_vp_df2 = pd.merge(
    sched_vp_df.drop(columns=["Cardinal Direction"]),
    cardinal_dir_df,
    left_on=["schedule_gtfs_dataset_key", "Direction", "Route ID"],
    right_on=[
        "schedule_gtfs_dataset_key",
        "direction_id",
        "recent_route_id2",
    ],
    how="left",
)

In [None]:
sched_vp_df2.columns

In [None]:
def assign_cardinal_dir(row):
    """
    Label a row by its numeric ``Direction`` value.

    Returns "Eastbound/Westbound" when ``row["Direction"]`` equals 0 and
    "Northbound/Southbound" for every other value.

    NOTE(review): the label depends only on ``Direction``; the row's
    ``stop_primary_direction`` is not consulted -- confirm this is intended.
    """
    is_direction_zero = row["Direction"] == 0
    return "Eastbound/Westbound" if is_direction_zero else "Northbound/Southbound"


# Apply assign_cardinal_dir to every row (axis=1) and store the resulting
# label in "Cardinal Direction".
sched_vp_df2["Cardinal Direction"] = sched_vp_df2.apply(assign_cardinal_dir, axis=1)

In [None]:
# After this rename:
#   "Direction_filter"   = the original numeric Direction
#   "Direction"          = the paired label produced by assign_cardinal_dir
#   "Specific Direction" = the plurality stop_primary_direction from the merge
sched_vp_df2 = sched_vp_df2.rename(
    columns={
        "Direction": "Direction_filter",
        "Cardinal Direction": "Direction",
        "stop_primary_direction": "Specific Direction",
    }
)

In [None]:
sched_vp_df2.head(1)

In [None]:
sched_vp_df2["Direction"].value_counts()

In [None]:
sched_vp_df2["Direction"].unique()

In [None]:
section2_utils.filtered_route(sched_vp_df2)

### Understanding `/gtfs_funnel/clean_route_naming.py` & grabbing the most recent `route_name` and `route_id`
* https://github.com/cal-itp/data-analyses/blob/main/gtfs_funnel/clean_route_naming.py

In [None]:
# NOTE(review): `stop` is not defined anywhere in the visible notebook, so this
# cell raises NameError. Presumably a deliberate breakpoint that keeps
# "Run All" from executing the exploratory cells below -- confirm before removing.
stop

In [None]:
import sys

sys.path.append("../gtfs_funnel")
import clean_route_naming

#### Breaking out `concatenate_routes_across_dates`
* This would go into the `helpers.import_scheduled_trips` in my `merge_scheduled_stop_times` function.

In [None]:
gtfs_funnel_scheduled_trips = pd.concat(
    [
        helpers.import_scheduled_trips(
            analysis_date,
            filters=[[("gtfs_dataset_key", "in", gtfs_keys)]],
            columns=[
                "route_id",
                "trip_instance_key",
                "gtfs_dataset_key",
                "shape_array_key",
                "direction_id",
                "route_long_name",
                "route_short_name",
                "route_desc",
                "name",
            ],
            get_pandas=True,
        ).assign(service_date=pd.to_datetime(analysis_date))
        for analysis_date in all_dates_list
    ],
    axis=0,
    ignore_index=True,
)

In [None]:
gtfs_funnel_scheduled_trips = gtfs_funnel_scheduled_trips.assign(
    route_id=gtfs_funnel_scheduled_trips.route_id.fillna(""),
    route_short_name=gtfs_funnel_scheduled_trips.route_short_name.fillna(""),
    route_long_name=gtfs_funnel_scheduled_trips.route_long_name.fillna(""),
)

In [None]:
gtfs_funnel_scheduled_trips = gtfs_funnel_scheduled_trips.assign(
    combined_name=gtfs_funnel_scheduled_trips.route_short_name
    + "__"
    + gtfs_funnel_scheduled_trips.route_long_name
)

In [None]:
gtfs_funnel_scheduled_trips = gtfs_funnel_scheduled_trips.assign(
    route_id2=gtfs_funnel_scheduled_trips.apply(
        lambda x: gtfs_schedule_wrangling.standardize_route_id(x, "name", "route_id"),
        axis=1,
    )
)

In [None]:
gtfs_funnel_scheduled_trips.head(2)

In [None]:
gtfs_funnel_scheduled_trips.shape

#### `gtfs_schedule_wrangling.most_recent_route_info()`

In [None]:
route_cols = ["schedule_gtfs_dataset_key", "name", "route_id2"]

In [None]:
df2 = gtfs_schedule_wrangling.most_recent_route_info(
    gtfs_funnel_scheduled_trips, group_cols=route_cols, route_col="combined_name"
).pipe(
    gtfs_schedule_wrangling.most_recent_route_info,
    group_cols=["schedule_gtfs_dataset_key", "name", "recent_combined_name"],
    route_col="route_id2",
)

In [None]:
gtfs_schedule_wrangling.most_recent_route_info??

In [None]:
df2.shape

In [None]:
sort_order = [True for c in route_cols]

In [None]:
# Newest service_date row per (schedule_gtfs_dataset_key, name, route_id2);
# its combined_name is relabelled as recent_combined_name.
most_recent = (
    gtfs_funnel_scheduled_trips.sort_values(
        route_cols + ["service_date"], ascending=sort_order + [False]
    )
    .drop_duplicates(subset=route_cols)
    .rename(columns={"combined_name": "recent_combined_name"})
)

# NOTE(review): `most_recent` is already unique on route_cols, so the
# drop_duplicates(subset=route_cols) below cannot remove any rows; this step
# only re-sorts and renames route_id2. If the goal is the newest route_id2 per
# recent_combined_name, the subset presumably should be the sort keys
# ["schedule_gtfs_dataset_key", "name", "recent_combined_name"] -- confirm.
most_recent2 = (
    most_recent.sort_values(
        ["schedule_gtfs_dataset_key", "name", "recent_combined_name"]
        + ["service_date"],
        ascending=sort_order + [False],
    )
    .drop_duplicates(subset=route_cols)
    .rename(columns={"route_id2": "recent_route_id2"})
)

In [None]:
most_recent.head(2)

In [None]:
most_recent.shape, most_recent.recent_combined_name.nunique()

In [None]:
most_recent2.head(2)

In [None]:
most_recent2.shape, most_recent2.recent_route_id2.nunique()

In [None]:
# NOTE(review): `stop` is not defined anywhere in the visible notebook, so this
# cell raises NameError. Presumably a deliberate breakpoint that keeps
# "Run All" from executing the merge experiments below -- confirm before removing.
stop

### Test Merge w/ `sched_vp_df` w/ `standardized_route_ids.parquet`

In [None]:
std_route_names = (
    f"{SCHED_GCS}{GTFS_DATA_DICT.schedule_tables.route_identification}.parquet"
)

In [None]:
# std_route_names_df = pd.read_parquet(std_route_names)

In [None]:
# std_route_names_df = std_route_names_df.loc[
#    std_route_names_df.schedule_gtfs_dataset_key.isin(gtfs_keys)
# ]

In [None]:
# std_route_names_df.shape, std_route_names_df.route_id2.nunique(), std_route_names_df.recent_combined_name.nunique()

In [None]:
len(cardinal_dir_df)

In [None]:
# Delete out cols
sched_vp_df = sched_vp_df.drop(columns=["Cardinal Direction"])

In [None]:
sched_vp_df["Route ID"].unique(), sched_vp_df["Route ID"].nunique()

In [None]:
most_recent2.recent_route_id2.unique(), most_recent2["recent_route_id2"].nunique()

#### Seeing what are the best columns to match over for names.

In [None]:
routes3 = set(most_recent2.recent_route_id2.unique().tolist())

In [None]:
most_recent2.columns

In [None]:
route_names1 = set(most_recent2.route_long_name.unique().tolist())

In [None]:
route_names2 = set(sched_vp_df.route_long_name.unique().tolist())

In [None]:
route_names2 - route_names1

In [None]:
route_names1 - route_names2

In [None]:
len(routes3)

In [None]:
routes4 = set(most_recent2.route_id.unique().tolist())

In [None]:
# NOTE(review): `routes1` is never assigned in the visible notebook (only
# `routes3` and `routes4` are), so this cell and the two set differences
# after it raise NameError. Presumably it was meant to be
# set(sched_vp_df["Route ID"].unique().tolist()) -- TODO confirm and define it.
len(routes1), len(routes3)

In [None]:
routes3 - routes1

In [None]:
routes1 - routes3

### Merges

In [None]:
# Match-rate check: count left-only vs. matched rows via the merge indicator.
# NOTE(review): find_cardinal_direction drops `route_id` before returning, so
# cardinal_dir_df as built above has no `route_id` column and right_on here
# would raise KeyError. `recent_route_id2` is the key both frames share --
# TODO confirm which key this check should use.
pd.merge(
    most_recent2,
    cardinal_dir_df,
    left_on="recent_route_id2",
    right_on="route_id",
    how="left",
    indicator=True,
)[["_merge"]].value_counts()

In [None]:
most_recent2.head(1)

In [None]:
most_recent2.shape, most_recent2.route_id.nunique()

In [None]:
# Outer-join match check between the two frames via the merge indicator.
# NOTE(review): cardinal_dir_df as returned by find_cardinal_direction has had
# `route_id` dropped, so merging on "route_id" would raise KeyError --
# TODO confirm whether "recent_route_id2" should be the key here.
pd.merge(most_recent2, cardinal_dir_df, on=["route_id"], how="outer", indicator=True)[
    ["_merge"]
].value_counts()

In [None]:
# cardinal_dir_df (from find_cardinal_direction) no longer carries the raw
# `route_id` -- it is dropped before that function returns -- so join on the
# standardized `recent_route_id2` that both frames share. `route_id` and
# `route_long_name` still reach the result from most_recent2, and
# direction_id / stop_primary_direction from cardinal_dir_df.
cardinal_dir_df2 = pd.merge(
    most_recent2.drop(columns=["direction_id"]),
    cardinal_dir_df,
    on=["schedule_gtfs_dataset_key", "recent_route_id2"],
    how="inner",
)

In [None]:
cardinal_dir_df2.head(2)

In [None]:
cardinal_dir_df2.shape, cardinal_dir_df2.route_id.nunique()

In [None]:
pd.merge(
    sched_vp_df,
    cardinal_dir_df2,
    left_on=["schedule_gtfs_dataset_key", "Direction", "Route ID", "route_long_name"],
    right_on=[
        "schedule_gtfs_dataset_key",
        "direction_id",
        "recent_route_id2",
        "route_long_name",
    ],
    how="left",
    indicator=True,
)[["_merge"]].value_counts()

In [None]:
pd.merge(
    sched_vp_df,
    cardinal_dir_df2,
    left_on=["schedule_gtfs_dataset_key", "Direction", "Route ID"],
    right_on=[
        "schedule_gtfs_dataset_key",
        "direction_id",
        "recent_route_id2",
    ],
    how="left",
    indicator=True,
)[["_merge"]].value_counts()

In [None]:
len(sched_vp_df)

#### Merging on `route_id` and `route_long_name`

In [None]:
m1 = pd.merge(
    sched_vp_df,
    cardinal_dir_df2,
    left_on=["schedule_gtfs_dataset_key", "Direction", "Route ID", "route_long_name"],
    right_on=[
        "schedule_gtfs_dataset_key",
        "direction_id",
        "recent_route_id2",
        "route_long_name",
    ],
    how="left",
)

In [None]:
m1.head(2)

In [None]:
m1.stop_primary_direction = m1.stop_primary_direction.fillna("Unknown")

In [None]:
m1.stop_primary_direction.value_counts()

In [None]:
# Check which routes are always unknown
# C & K Line Link (857)
# Metro A Line (Blue)
unknown_df = m1.groupby(
    ["Route ID", "route_long_name", "Date", "stop_primary_direction"]
).agg({"Period": "count"})

In [None]:
unknown_df

#### Merging on `route_id` ONLY.

In [None]:
m2 = pd.merge(
    sched_vp_df,
    cardinal_dir_df2,
    left_on=["schedule_gtfs_dataset_key", "Direction", "Route ID"],
    right_on=[
        "schedule_gtfs_dataset_key",
        "direction_id",
        "recent_route_id2",
    ],
    how="left",
    indicator=True,
)

In [None]:
m2.stop_primary_direction.value_counts()