## Where is January 2025 data?

In [2]:
# import _section1_utils
# import _section2_utils
import geopandas as gpd
# import merge_data
import numpy as np
import pandas as pd
# from segment_speed_utils import gtfs_schedule_wrangling
# from shared_utils import rt_dates
# from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS

In [None]:
# Notebook display settings: up to 100 columns, no row limit, no column-width
# limit, and floats formatted with two decimal places.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [None]:
analysis_date_list = rt_dates.y2025_dates

In [None]:
analysis_date_list_all = (
    rt_dates.y2024_dates + rt_dates.y2023_dates + rt_dates.y2025_dates
)

In [None]:
schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"

In [None]:
og_df = pd.read_parquet(schd_vp_url)

In [None]:
og_df.columns

In [None]:
all_ops_jan_only = og_df.loc[og_df.service_date == "2025-01-15T00:00:00.000000000"]

In [None]:
all_ops_jan_only.service_date.unique()

In [None]:
all_ops_jan_only.time_period.value_counts()

In [None]:
all_ops_jan_only[
    [
        "organization_name",
        "route_id",
        "time_period",
        "is_early",
        "is_ontime",
        "is_late",
    ]
].sample(10)

### In the original dataframe (not loaded using the function in `_merge_data.section_2_utils()`), there are peak/offpeak values in `time_period`
* There should only be 6 rows for each route (ideally)
* Dir 1: all day, peak, offpeak
* Dir 0: all day, peak, offpeak.

In [None]:
ac_transit_og_one_route = all_ops_jan_only.loc[
    (all_ops_jan_only.organization_name == "Alameda-Contra Costa Transit District")
    & (all_ops_jan_only.route_id == "200")
]

In [None]:
ac_transit_og_one_route.shape

In [None]:
ac_transit_og_one_route.head(2)

In [None]:
ac_transit_og_one_route.time_period.value_counts()

In [None]:
ac_transit_og_one_route.sched_rt_category.unique()

In [None]:
ac_transit_og_one_route.columns

In [None]:
# Distinct route IDs in the January slice for the
# "City and County of San Francisco" organization.
all_ops_jan_only.loc[
    all_ops_jan_only.organization_name == "City and County of San Francisco",
    "route_id",
].unique()

In [None]:
sf_og_one_route = all_ops_jan_only.loc[
    (all_ops_jan_only.organization_name == "City and County of San Francisco")
    & (all_ops_jan_only.route_id == "22")
]

In [None]:
sf_og_one_route.head(1)

In [None]:
sf_og_one_route[["sched_rt_category","time_period"]]

### Something has gone wrong with the merges that I fixed a while back.

In [None]:
ac_transit_og_one_route[["sched_rt_category", "time_period"]]

In [None]:
ac_transitonly = _section2_utils.load_schedule_vp_metrics(
    "Alameda-Contra Costa Transit District"
)

In [None]:
ac_jan_only = ac_transitonly.loc[ac_transitonly.Date == "2025-01-15T00:00:00.000000000"]

In [None]:
ac_jan_only.head(2)

In [None]:
ac_jan_only.Period.value_counts()

### Timeliness only has `all_day` values.

In [None]:
# NOTE(review): `sf_only` is not assigned in any cell visible here — presumably
# the San Francisco counterpart of `ac_transitonly`; confirm it is defined
# before running this cell.
timeliness_test = _section2_utils.timeliness_trips(sf_only)

In [None]:
timeliness_test.head(2)

In [None]:
timeliness_test.Date.unique()

In [None]:
timeliness_to_keep = [
    "Date",
    "Organization",
    "Direction",
    "Period",
    "Route",
    "# Early Arrival Trips",
    "# On-Time Trips",
    "# Late Trips",
    "dir_0_1",
]

In [None]:
# Preview the timeliness columns for the first 10 rows.
# NOTE(review): `sf_jan_only` is not assigned in any cell visible here —
# presumably `sf_only` filtered to the 2025-01-15 service date; confirm.
sf_jan_only[timeliness_to_keep].head(10)

In [None]:
sf_jan_only.Period.value_counts()

### Timeliness metrics are located in `rt_segment_speeds/segment_speed_utils/metrics.py`

### None of the "route identifiers" are showing up for January 2025

In [None]:
# Show the first rows dated 2025-01-15.
# NOTE(review): `df` is not assigned in any cell visible here; confirm which
# dataframe it refers to before relying on this output.
df.loc[df.service_date == "2025-01-15T00:00:00.000000000"].head()

In [None]:
df.columns

In [None]:
# NOTE(review): `df_test` is not assigned in any cell visible here; confirm
# where it is loaded before running the cells that follow.
df_test.info()

In [None]:
df_test.route_id = df_test.route_id.fillna("No Route Name")

In [None]:
df_test.columns

In [None]:
df_test.service_date.unique()

In [None]:
df_test.loc[
    (df_test.organization_name == "Monterey-Salinas Transit")
    & (df_test.service_date == "2025-01-15T00:00:00.000000000")
][["route_id"]].nunique()

In [None]:
# Define a function to highlight the string "No Route Name" in red
def highlight_no_route_name(val):
    """Return the CSS string used to style a single cell.

    Cells equal to the placeholder "No Route Name" get white text on a red
    background; every other value gets an empty string (no styling).
    """
    if val == "No Route Name":
        return "color: white; background-color: red"
    return ""

In [None]:
january_only = df_test.loc[(df_test.service_date == "2025-01-15T00:00:00.000000000")]

In [None]:
# Count non-null `direction_id` values for each
# (schedule_gtfs_dataset_key, organization_name, route_id) combination in the
# January slice, returned as a flat dataframe.
no_route_names = january_only.groupby(
    ["schedule_gtfs_dataset_key", "organization_name", "route_id"]
)[["direction_id"]].count().reset_index()

In [None]:
# Apply the highlighting function to the `route_id` column so cells holding
# the "No Route Name" placeholder stand out.
# NOTE(review): `Styler.applymap` is deprecated in pandas >= 2.1 in favor of
# `Styler.map`; switch once the environment's pandas version is confirmed.
no_route_names.style.applymap(highlight_no_route_name, subset=["route_id"])

### Test: removing `name` from the `merge` with `df_crosswalk` in line 259 in `gtfs_digest/merge_data`

### Find out where it's not merging

In [None]:
df_sched = merge_data.concatenate_schedule_by_route_direction(analysis_date_list)

In [None]:
df_sched.service_date.unique()

In [None]:
df_sched.loc[df_sched.service_date == "2025-01-15T00:00:00.000000000"].head(2)

In [None]:
df_avg_speeds = merge_data.concatenate_speeds_by_route_direction(analysis_date_list)

In [None]:
df_avg_speeds.loc[df_avg_speeds.service_date == "2025-01-15T00:00:00.000000000"].head(2)

In [None]:
df_rt_sched = merge_data.concatenate_rt_vs_schedule_by_route_direction(
    analysis_date_list
)

In [None]:
df_rt_sched.loc[df_rt_sched.service_date == "2025-01-15T00:00:00.000000000"].head(2)

In [None]:
df_crosswalk = merge_data.concatenate_crosswalk_organization(analysis_date_list)

In [None]:
df_crosswalk.loc[df_crosswalk.service_date == "2025-01-15T00:00:00.000000000"].head(2)

In [None]:
primary_typology = merge_data.set_primary_typology(df_sched)

In [None]:
primary_typology.head(2)

In [None]:
df_schedule2 = pd.merge(
    df_sched, primary_typology, on=merge_data.route_time_cols, how="left"
)

In [None]:
df_schedule2.loc[df_schedule2.service_date == "2025-01-15T00:00:00.000000000"].head(2)

In [None]:
# Outer-merge the schedule frame (with typology) against the RT-vs-schedule
# frame, then outer-merge the result against average speeds. Both merges use
# `merge_data.route_time_cols` plus `service_date` as keys.
# `indicator="sched_rt_category"` adds a column of that name recording whether
# each row was found in the left frame only, the right frame only, or both.
df1 = pd.merge(
    df_schedule2,
    df_rt_sched,
    on=merge_data.route_time_cols + ["service_date"],
    how="outer",
    indicator="sched_rt_category",
).merge(
    df_avg_speeds,
    on=merge_data.route_time_cols + ["service_date"],
    how="outer",
)

In [None]:
# Preview the merged rows dated 2025-01-15. The date mask is built from `df1`
# itself so the boolean index lines up with the rows being selected.
df1.loc[df1.service_date == "2025-01-15T00:00:00.000000000"].head(2)

In [None]:
df1.info()

In [None]:
df1.sched_rt_category.unique()

In [None]:
# Recode the merge-indicator values in `sched_rt_category` through the lookup
# dict in `gtfs_schedule_wrangling`; all other columns are carried over as-is.
df_wo_standardized_route = df1.assign(
    sched_rt_category=lambda merged: merged["sched_rt_category"].map(
        gtfs_schedule_wrangling.sched_rt_category_dict
    )
)

In [None]:
# Run the standardized-route-name merge helper on the recoded frame.
df_w_standardized_route = merge_data.merge_in_standardized_route_names(
    df_wo_standardized_route
)

In [None]:
df_w_standardized_route.info()

#### `merge_in_standardized_route_names` isn't working
* `route_names_df` doesn't have any values for January 2025.
* Rerun `gtfs_funnel/clean_route_naming.py`

In [None]:
# Candidate column subset for the route-identification table.
# NOTE(review): `keep_cols` is not used by any cell visible here — the read
# below loads every column; confirm whether a `columns=` filter was intended.
keep_cols = [
    "schedule_gtfs_dataset_key",
    "name",
    "route_id",
    "service_date",
]

# File stem of the route-identification table, looked up in the data dictionary.
CLEAN_ROUTES = GTFS_DATA_DICT.schedule_tables.route_identification

# Load the table from the schedule bucket so its service dates can be inspected.
route_names_df = pd.read_parquet(f"{SCHED_GCS}{CLEAN_ROUTES}.parquet")

In [None]:
route_names_df.info()

In [None]:
route_names_df.service_date.unique()

In [None]:
from segment_speed_utils import gtfs_schedule_wrangling, time_series_utils

In [None]:
route_names_df = time_series_utils.clean_standardized_route_names(
    route_names_df
).drop_duplicates()

In [None]:
route_names_df.head(2)

In [None]:
route_names_df.service_date.unique()

In [None]:
# Remove a pre-existing `name` column. The membership check and the drop both
# act on `df_wo_standardized_route`, so the frame is never replaced by a
# different dataframe.
if "name" in df_wo_standardized_route.columns:
    df_wo_standardized_route = df_wo_standardized_route.drop(columns="name")