## Finding Missing Routes
* [Issue](https://github.com/cal-itp/data-analyses/issues/1312): Capitol Corridor doesn't have any rail routes.
* [Most of Santa Maria's routes not showing up in GTFS Digest](https://github.com/cal-itp/data-analyses/issues/1313)
* Rerun all the scripts that create the underlying dataframes for the November date (`df_sched`, `df_avg_speeds`, `df_rt_sched`) and merge them using `gtfs_digest/merge_data.merge_data_sources_by_route_direction()`.

In [1]:
import _section2_utils
import geopandas as gpd
import merge_data
import numpy as np
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS

SyntaxError: invalid syntax (_section2_utils.py, line 896)

In [None]:
# Notebook display settings: show up to 100 columns, every row, untruncated
# cell contents, and floats rounded to two decimals.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [None]:
# Organizations named in the two issues about routes missing from the digest.
org_name_lists = [
    "Capitol Corridor Joint Powers Authority",
    "City of Santa Maria",
]

In [2]:
# Dates to process; passed as a list to
# merge_data.concatenate_crosswalk_organization further down.
analysis_date_list = ["2024-11-13"]

In [3]:
# The single service date being investigated.
# NOTE(review): not referenced again in the visible cells.
one_analysis_date = "2024-11-13"

In [4]:
# schedule_gtfs_dataset_key values used below to filter each dataframe.
# presumably these are the feeds of the organizations under investigation —
# TODO confirm which key belongs to which operator.
schd_keys = [
    "5a8721fe96786fcd25fba1f8a0ee6358",
    "73105f2d1cabc8170ab066d96863c5d5",
    "f5a749dd65924e025b1293c58f95f8d6",
]

### Run the scripts that create the following dataframes for November.
* `df_sched`: `gtfs_funnel/schedule_stats_by_route_direction`
* `df_rt_sched`: `rt_scheduled_v_ran/scripts/rt_v_scheduled_routes`
* `df_avg_speeds`: `rt_segment_speeds/script/average_summary_speed`

In [5]:
# df_sched: show the RT_SCHED_GCS constant imported from update_vars.
# NOTE(review): in the recorded run this raised NameError because the import
# cell failed on a SyntaxError inside _section2_utils.py.
RT_SCHED_GCS

NameError: name 'RT_SCHED_GCS' is not defined

In [None]:
# Look up the sched_route_direction_metrics entry under rt_vs_schedule_tables
# in the GTFS_DATA_DICT catalog.
ROUTE_DIR_EXPORT = GTFS_DATA_DICT.rt_vs_schedule_tables.sched_route_direction_metrics

In [None]:
# Display the catalog value so it can be compared with the path read below.
ROUTE_DIR_EXPORT

In [None]:
# Load the schedule route-direction metrics for 2024-11-13 from GCS.
# The path is hard-coded rather than assembled from the catalog values.
df_schedule = pd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_route_dir/schedule_route_direction_metrics_2024-11-13.parquet"
)

In [None]:
# List the columns available in the schedule metrics.
df_schedule.columns

In [None]:
# Keep only the schedule rows whose feed key is one of the keys in schd_keys.
filtered_df_schedule = df_schedule[
    df_schedule["schedule_gtfs_dataset_key"].isin(schd_keys)
]

In [None]:
# Distinct route IDs present in the schedule metrics for the selected feeds.
filtered_df_schedule.route_id.unique()

In [None]:
# df_avg_speeds: pick the "rt_stop_times" section of the GTFS_DATA_DICT catalog
# and read its route_dir_single_summary entry.
segment_type = "rt_stop_times"

dict_inputs = GTFS_DATA_DICT[segment_type]
ROUTE_DIR_FILE = dict_inputs["route_dir_single_summary"]

In [None]:
# Show the SEGMENT_GCS constant imported from update_vars.
SEGMENT_GCS

In [None]:
# Show the route_dir_single_summary catalog entry.
ROUTE_DIR_FILE

#### Average speeds are missing a lot of data

In [None]:
# Load the single-day route-direction speed rollup for 2024-11-13 from GCS
# (hard-coded path).
df_avg_speeds = pd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/rt_segment_speeds/rollup_singleday/speeds_route_dir_2024-11-13.parquet"
)

In [None]:
# Keep only the speed rows whose feed key is one of the keys in schd_keys.
filtered_df_avg_speeds = df_avg_speeds[
    df_avg_speeds["schedule_gtfs_dataset_key"].isin(schd_keys)
]

In [None]:
# Distinct route IDs that have average speeds for the selected feeds.
filtered_df_avg_speeds.route_id.unique()

In [None]:
# Inspect the speed rows for Marin County Transit District, leaving out the
# geometry column for display.
df_avg_speeds.loc[
    df_avg_speeds.organization_name == "Marin County Transit District"
].drop(columns=["geometry"])

In [None]:
# filtered_df_avg_speeds[[ 'route_id', 'direction_id', 'time_period','speed_mph']]

In [None]:
# df_rt_sched: show the RT_SCHED_GCS constant imported from update_vars.
RT_SCHED_GCS

In [None]:
# Show the vp_route_direction_metrics entry under rt_vs_schedule_tables.
GTFS_DATA_DICT.rt_vs_schedule_tables.vp_route_direction_metrics

In [None]:
# Load the vehicle-position route-direction metrics for 2024-11-13 from GCS
# (hard-coded path).
df_rt_sched = pd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/vp_route_dir/route_direction_metrics_2024-11-13.parquet"
)

In [None]:
# List the columns available in the RT-vs-schedule metrics.
df_rt_sched.columns

### Open up original file

In [None]:
# Build the parquet path of the existing digest table from the catalog's
# digest_tables.dir and digest_tables.route_schedule_vp entries.
schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"

In [None]:
# Display the assembled path.
schd_vp_url

In [None]:
# Read the existing digest table for comparison with the rebuilt merge.
schd_vp_df = pd.read_parquet(schd_vp_url)

In [None]:
# Restrict the existing digest table to the organizations in org_name_lists.
schd_vp_df2 = schd_vp_df.loc[schd_vp_df.organization_name.isin(org_name_lists)]

### Merge all the files based on `gtfs_digest/merge_data`

In [None]:
# Timestamp for the analysis date; assigned below as the service_date column
# of each input frame.
# NOTE(review): the nanosecond-precision literal may be deliberate so the
# column matches the dtype of the crosswalk's service_date — confirm before
# simplifying.
service_date_datetime = pd.to_datetime("2024-11-13T00:00:00.000000000")

In [None]:
# Add service_date so it can serve as a merge key in the joins below.
df_schedule["service_date"] = service_date_datetime

In [None]:
# Add service_date so it can serve as a merge key in the joins below.
df_rt_sched["service_date"] = service_date_datetime

In [None]:
# Add service_date so it can serve as a merge key in the joins below.
df_avg_speeds["service_date"] = service_date_datetime

In [None]:
# Build the crosswalk for the analysis dates via merge_data; it is joined on
# schedule_gtfs_dataset_key, name and service_date further down.
# presumably it supplies organization-level columns — TODO confirm.
df_crosswalk = merge_data.concatenate_crosswalk_organization(analysis_date_list)

In [None]:
# Key columns shared by the three inputs; used as the merge keys below
# (together with service_date for the outer merges).
route_time_cols = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
    "time_period",
]

In [None]:
# Derive the primary typology from the schedule frame via merge_data.
# presumably one row per route_time_cols combination, since it is joined on
# those columns next — TODO confirm.
primary_typology = merge_data.set_primary_typology(df_schedule)

In [None]:
# Attach the primary typology to every schedule row (left join keeps all
# schedule rows).
df_schedule2 = df_schedule.merge(
    primary_typology,
    how="left",
    on=route_time_cols,
)

In [None]:
# Outer-join schedule, RT-vs-schedule and speed metrics on the route/time keys
# plus service_date. The first join records which side each row came from in
# the sched_rt_category indicator column.
df = df_schedule2.merge(
    df_rt_sched,
    how="outer",
    on=[*route_time_cols, "service_date"],
    indicator="sched_rt_category",
).merge(
    df_avg_speeds,
    how="outer",
    on=[*route_time_cols, "service_date"],
)

In [None]:
# Post-process the merged frame in a single chain; the steps run in order.
df = (
    df.assign(
        # Translate the merge-indicator values through the lookup in
        # gtfs_schedule_wrangling.sched_rt_category_dict.
        sched_rt_category=df.sched_rt_category.map(
            gtfs_schedule_wrangling.sched_rt_category_dict
        )
    )
    .pipe(
        # presumably adds the standardized route-name columns — TODO confirm
        # against merge_data.
        merge_data.merge_in_standardized_route_names,
    )
    .merge(
        # Left join: rows without a crosswalk match are kept.
        df_crosswalk,
        on=["schedule_gtfs_dataset_key", "name", "service_date"],
        how="left",
    )
    .pipe(
        # Find the most common cardinal direction
        gtfs_schedule_wrangling.top_cardinal_direction
    )
)

In [None]:
# Rename n_trips to n_scheduled_trips, the name used in the integer cast below.
df = df.rename(columns={"n_trips": "n_scheduled_trips"})

In [None]:
# Columns that are cast to integers.
integrify = [
    "n_scheduled_trips",
    "n_vp_trips",
    "minutes_atleast1_vp",
    "minutes_atleast2_vp",
    "total_vp",
    "vp_in_shape",
    "is_early",
    "is_ontime",
    "is_late",
]

# Missing values (e.g. rows present on only one side of the outer merges) are
# filled with 0 first, because NaN cannot be cast to int.
df[integrify] = df[integrify].fillna(0).astype("int")

In [None]:
# Columns duplicated by the merges carry pandas' "_y" suffix. Match on the
# suffix only, so a column that merely contains "_y" in the middle of its name
# is not dropped by mistake.
repeated_y_cols = [col for col in df.columns if col.lower().endswith("_y")]

In [None]:
# Drop the duplicated right-hand ("_y") columns collected above.
df = df.drop(columns=repeated_y_cols)

In [None]:
# Columns duplicated by the merges carry pandas' "_x" suffix. Match on the
# suffix only, so a column that merely contains "_x" in the middle of its name
# is not dropped by mistake.
repeated_x_cols = [col for col in df.columns if col.lower().endswith("_x")]

In [None]:
# Drop the duplicated left-hand ("_x") columns collected above.
df = df.drop(columns=repeated_x_cols)

In [None]:
# List the columns that remain after the clean-up.
df.columns

In [None]:
# Row counts per route_id for the selected feeds in the merged frame.
df.loc[df.schedule_gtfs_dataset_key.isin(schd_keys)].route_id.value_counts()

In [None]:
# Summarize dtypes and non-null counts of the merged frame.
df.info()

In [None]:
# Count rows per sched_rt_category value.
df.sched_rt_category.value_counts()

In [None]:
# Merged rows belonging to the feeds listed in schd_keys.
filtered_df = df[df["schedule_gtfs_dataset_key"].isin(schd_keys)]

In [None]:
# Show the distinct combinations of organization, route, category, speed,
# frequency and direction for the selected feeds.
filtered_df[
    [
        "organization_name",
        "route_combined_name",
        "sched_rt_category",
        "speed_mph",
        "frequency",
        "direction_id",
    ]
].drop_duplicates()

### Save this temporarily 

In [None]:
# Persist the merged frame to GCS under a filename marked AH_TESTING.
df.to_parquet(
    "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/digest/schedule_vp_metrics_AH_TESTING.parquet"
)

### Check for speeds again

In [None]:
# Operator used for the speed re-check below.
organization_name = "Marin County Transit District"

In [None]:
# Column plotted on the y-axis of the charts below.
y_col = "Speed (MPH)"

In [None]:
# Load the schedule/vp metrics for the organization through _section2_utils.
# presumably returns the display-ready column names used below — TODO confirm.
marin_county = _section2_utils.load_schedule_vp_metrics(organization_name)

In [None]:
# Show availability, route, direction, period and speed, ordered by Route ID.
marin_county[
    ["GTFS Availability", "Route", "Route ID", "Direction", "Period", "Speed (MPH)"]
].sort_values(by="Route ID")

In [None]:
# Isolate a single route (29) for the chart experiments below.
marin_county_route_29 = marin_county.loc[
    marin_county.Route == "29 Downtown San Rafael - E. Corte Madera"
]

In [None]:
import altair as alt

In [None]:
# All distinct route names for the operator.
# NOTE(review): not referenced again in the visible cells.
routes_list = marin_county["Route"].unique().tolist()

In [None]:
# Render the helper's chart for route 29 with placeholder text ("Testing") in
# the last two arguments — presumably title and subtitle; TODO confirm.
_section2_utils.base_facet_line(marin_county_route_29, y_col, "Testing", "Testing")

In [None]:
# Upper bound for the y-axis; used as the top of the scale domain in the line
# chart at the end of the notebook.
max_y = _section2_utils.set_y_axis(marin_county_route_29, y_col)

In [None]:
# Display the computed axis bound.
max_y

In [None]:
# Run the helper's chart clean-up on the route 29 rows for the chosen y column.
# presumably this is what adds the dir_0_1 column used below — TODO confirm.
marin_county_route_29 = _section2_utils.clean_data_charts(marin_county_route_29, y_col)

In [None]:
# Inspect the direction, period, speed and date columns after the clean-up.
marin_county_route_29[["dir_0_1", "Direction", "Period", "Speed (MPH)", "Date"]]

In [None]:
import _report_utils

In [None]:
import yaml

# Load the chart color palettes. The encoding is given explicitly so the read
# does not depend on the platform's default locale.
with open("color_palettes.yml", encoding="utf-8") as f:
    color_dict = yaml.safe_load(f)

In [None]:
# Load the human-readable chart titles and subtitles. The encoding is given
# explicitly so the read does not depend on the platform's default locale.
with open("readable.yml", encoding="utf-8") as f:
    readable_dict = yaml.safe_load(f)

In [None]:
# Look up the configured title of the frequency graph.
readable_dict["frequency_graph"]["title"]

In [None]:
# Check that the configured title can be extended with a plain string suffix.
(readable_dict["frequency_graph"]["title"] + " Test")

In [None]:
# Bar chart of speed by date for the dir_0_1 == 1 rows of route 29, with one
# panel per Period. The data is already limited to a single direction and the
# facet variable is Period, so the facet header is labelled "Period" (it was
# previously mislabelled "Direction").
alt.Chart(
    marin_county_route_29.loc[marin_county_route_29.dir_0_1 == 1]
).mark_bar(size=10).encode(
    x="yearmonthdate(Date):O",
    y="Speed (MPH):Q",
    color=alt.Color(
        "Period:N",
        title=_report_utils.labeling("Period"),
        scale=alt.Scale(range=color_dict["tri_color"]),
    ),
).facet(
    column=alt.Column("Period:N", title=_report_utils.labeling("Period")),
)

In [None]:
# Look up the configured title of the speed graph.
readable_dict["speed_graph"]["title"]

In [None]:
# Grouped bar chart of speed for the dir_0_1 == 1 rows of route 29, colored
# and offset by Period, with title and subtitle taken from readable.yml.
_section2_utils.grouped_bar_chart(
    df=marin_county_route_29[marin_county_route_29["dir_0_1"] == 1],
    color_col="Period",
    y_col="Speed (MPH)",
    offset_col="Period",
    title=readable_dict["speed_graph"]["title"],
    subtitle=readable_dict["speed_graph"]["subtitle"],
)

In [None]:
# Line chart of speed over time for the dir_0_1 == 0 rows of route 29: one
# line per Period, with the y-axis fixed to [0, max_y].
alt.Chart(
    marin_county_route_29[marin_county_route_29["dir_0_1"] == 0]
).mark_line(size=3).encode(
    x=alt.X(
        "yearmonthdate(Date):O",
        title="Date",
        axis=alt.Axis(labelAngle=-45, format="%b %Y"),
    ),
    y=alt.Y(
        f"{y_col}:Q",
        title=_report_utils.labeling(y_col),
        scale=alt.Scale(domain=[0, max_y]),
    ),
    color=alt.Color(
        "Period:N",
        title=_report_utils.labeling("Period"),
        scale=alt.Scale(range=color_dict["tri_color"]),
    ),
).properties(width=200, height=250)