## Banning Transit is missing
[Issue #1254](https://github.com/cal-itp/data-analyses/issues/1254)

In [None]:
import geopandas as gpd
import merge_data
import numpy as np
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils
from shared_utils import catalog_utils, rt_dates, rt_utils
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS

In [None]:
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)

In [None]:
# Notebook display settings: show up to 100 columns, format floats with two
# decimals, and never truncate rows or cell contents.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [None]:
import _operators_prep as op_prep

In [None]:
# Every date to check: the 2024 dates followed by the 2023 dates from `rt_dates`.
analysis_date_list = rt_dates.y2024_dates + rt_dates.y2023_dates

In [None]:
analysis_date_list

### What's in `gtfs_funnel/crosswalk_gtfs.../`

In [None]:
from shared_utils import schedule_rt_utils

In [None]:
analysis_date = "2024-10-16"

In [None]:
# Load the scheduled trips for `analysis_date`, keeping only the dataset key
# and feed name columns.
imported_trips_df = helpers.import_scheduled_trips(
    analysis_date,
    columns=["gtfs_dataset_key", "name"],
    get_pandas=True,
)
# Relabel `schedule_gtfs_dataset_key` as `gtfs_dataset_key`
# (presumably the helper returns the key under the `schedule_` prefix and the
# crosswalk function expects the bare name — TODO confirm).
imported_trips_df = imported_trips_df.rename(
    columns={"schedule_gtfs_dataset_key": "gtfs_dataset_key"}
)

In [None]:
imported_trips_df.head(2)

In [None]:
# Build the dataset-key-to-organization crosswalk for `analysis_date` from the
# trips loaded above, with quartet_data="schedule". `caltrans_district` is one
# of the organization columns requested, which is the field under investigation.
crosswalk = schedule_rt_utils.sample_gtfs_dataset_key_to_organization_crosswalk(
    imported_trips_df,
    analysis_date,
    quartet_data="schedule",
    dim_gtfs_dataset_cols=["key", "source_record_id", "base64_url"],
    dim_organization_cols=[
        "source_record_id",
        "name",
        "itp_id",
        "caltrans_district",
        "ntd_id_2022",
    ],
)

In [None]:
crosswalk.head(2)

#### Fix at crosswalk: `filter_dim_organizations` is where `caltrans_district` is brought in

In [None]:
dim_organization_cols: list[str] = ["source_record_id", "name", "caltrans_district"]

In [None]:
# Call `filter_dim_organizations` directly, keeping only the columns listed in
# `dim_organization_cols`, so the organization rows can be inspected on their own.
orgs = schedule_rt_utils.filter_dim_organizations(
    analysis_date, keep_cols=dim_organization_cols, get_df=True
)

In [None]:
orgs.info()

In [None]:
# Organization rows whose name mentions Banning.
# `na=False` counts a missing name as "no match"; without it a NaN in `name`
# produces a non-boolean mask and `.loc` raises a ValueError.
orgs.loc[orgs.name.str.contains("Banning", na=False)]

### Edited crosswalk in `gtfs_funnel`: manually added Districts

In [None]:
SCHED_GCS

In [None]:
GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk

In [None]:
url = "gs://calitp-analytics-data/data-analyses/gtfs_schedule/crosswalk/gtfs_key_organization_2024-10-21.parquet"

In [None]:
# NOTE(review): the variable name says "oct_31", but `url` points at the
# 2024-10-21 crosswalk file — confirm which date is intended.
crosswalk_oct_31 = pd.read_parquet(url)

In [None]:
# Crosswalk rows whose feed name contains "Ban".
# `na=False` counts a missing name as "no match"; without it a NaN in `name`
# produces a non-boolean mask and `.loc` raises a ValueError.
crosswalk_oct_31.loc[crosswalk_oct_31.name.str.contains("Ban", na=False)]

In [None]:
url2 = "gs://calitp-analytics-data/data-analyses/gtfs_schedule/crosswalk/gtfs_key_organization_2024-04-15.parquet"

In [None]:
crosswalk_apr_15 = pd.read_parquet(url2)

In [None]:
# Organization names attached to feeds whose name contains "Ban".
# `na=False` counts a missing name as "no match"; without it a NaN in `name`
# produces a non-boolean mask and `.loc` raises a ValueError.
crosswalk_apr_15.loc[
    crosswalk_apr_15.name.str.contains("Ban", na=False)
].organization_name

### `merge_data`
#### Stack all the new crosswalks

In [None]:
# Crosswalks for every date in `analysis_date_list`, combined by `merge_data`.
# NOTE(review): `stack_crossawlk` is a misspelling of `stack_crosswalk`; the
# following cells reference the misspelled name, so rename them all together.
stack_crossawlk = merge_data.concatenate_crosswalk_organization(analysis_date_list)

In [None]:
stack_crossawlk.service_date.nunique()

In [None]:
# Stacked-crosswalk rows whose feed name mentions Banning.
# `na=False` counts a missing name as "no match"; without it a NaN in `name`
# produces a non-boolean mask and `.loc` raises a ValueError.
banning_only = stack_crossawlk.loc[
    stack_crossawlk.name.str.contains("Banning", na=False)
]

In [None]:
# Distinct date / organization / feed-key / feed-name combinations for Banning,
# ordered by service date (ascending).
banning_only.loc[
    :, ["service_date", "organization_name", "schedule_gtfs_dataset_key", "name"]
].drop_duplicates().sort_values(by="service_date")

#### Try to find Banning in `df_sched`

In [None]:
df_sched = merge_data.concatenate_schedule_by_route_direction(analysis_date_list)

In [None]:
df_sched.service_date.nunique()

In [None]:
df_sched.head(1)

In [None]:
# Schedule rows for the hard-coded Banning feed key (this frame is filtered by
# key rather than by name).
# `na=False` counts a missing key as "no match" instead of letting a NaN break
# the boolean mask; `regex=False` matches the key as a literal string.
banning_only_df_sched = df_sched.loc[
    df_sched.schedule_gtfs_dataset_key.str.contains(
        "ebc783bace70899492d6206c352547d6", na=False, regex=False
    )
]

In [None]:
# Distinct service dates on which the Banning feed key appears in the schedule
# data, ordered by date (ascending).
banning_only_df_sched.loc[
    :, ["service_date", "schedule_gtfs_dataset_key"]
].drop_duplicates().sort_values(by="service_date")

#### Try to find Banning in `df_rt_sched`

In [None]:
df_rt_sched = merge_data.concatenate_rt_vs_schedule_by_route_direction(
    analysis_date_list
)

In [None]:
df_rt_sched.head(1)

In [None]:
# RT-vs-schedule rows whose feed name mentions Banning.
# `na=False` counts a missing name as "no match"; without it a NaN in `name`
# produces a non-boolean mask and `.loc` raises a ValueError.
banning_only_df_rt_sched = df_rt_sched.loc[
    df_rt_sched.name.str.contains("Banning", na=False)
]

In [None]:
banning_only_df_rt_sched.head(1)

In [None]:
# Distinct date / feed-key / feed-name combinations for Banning in the
# RT-vs-schedule data, ordered by date (ascending).
banning_only_df_rt_sched.loc[
    :, ["service_date", "schedule_gtfs_dataset_key", "name"]
].drop_duplicates().sort_values(by="service_date")

## Read in file after rerunning `merge_data`

In [None]:
# Table names of the digest outputs, looked up from the GTFS data catalog.
DIGEST_RT_SCHED = GTFS_DATA_DICT.digest_tables.route_schedule_vp
# NOTE(review): DIGEST_SEGMENT_SPEEDS is not referenced again in the cells
# visible below — drop it if it is no longer needed.
DIGEST_SEGMENT_SPEEDS = GTFS_DATA_DICT.digest_tables.route_segment_speeds

In [None]:
f"{RT_SCHED_GCS}{DIGEST_RT_SCHED}.parquet"

In [None]:
f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"

In [None]:
# Load the regenerated digest table.
# NOTE(review): this read prefixes the table name with RT_SCHED_GCS, while a
# later cell builds the path from GTFS_DATA_DICT.digest_tables.dir — confirm
# both resolve to the same file.
final = pd.read_parquet(f"{RT_SCHED_GCS}{DIGEST_RT_SCHED}.parquet")

In [None]:
final.head(1)

### Banning is here

In [None]:
# Distinct organization / district / feed / category / date combinations in
# the digest, ordered by organization name.
final.loc[
    :,
    [
        "organization_name",
        "caltrans_district",
        "name",
        "sched_rt_category",
        "service_date",
    ],
].drop_duplicates().sort_values(by="organization_name")

## I also want to edit this so that an agency that consistently produces RT or schedule data still shows up in the portfolio even if it misses a month or two

In [None]:
# Location of the `route_schedule_vp` digest table.
schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"

# Read only the identifying columns, and only rows whose category is
# "schedule_and_vp" or "schedule_only" (the filter is applied at read time).
schd_vp_df = pd.read_parquet(
    path=schd_vp_url,
    columns=[
        "schedule_gtfs_dataset_key",
        "caltrans_district",
        "organization_name",
        "name",
        "sched_rt_category",
        "service_date",
    ],
    filters=[[("sched_rt_category", "in", ["schedule_and_vp", "schedule_only"])]],
)

In [None]:
# Keep one row per (organization, district): the row with the most recent
# service_date. An operator that skips a month or two is then still represented
# by its latest available date instead of dropping out.
schd_vp_df2 = (
    # Rows without a district cannot be placed in a district portfolio.
    # `subset` is passed as a list: a bare string is only accepted by
    # pandas >= 1.5, and the list form matches `drop_duplicates` below.
    schd_vp_df.dropna(subset=["caltrans_district"])
    .sort_values(
        by=[
            "caltrans_district",
            "organization_name",
            "service_date",
        ],
        # Newest date first within each organization, so the default
        # keep="first" in drop_duplicates retains the latest row.
        ascending=[True, True, False],
    )
    .drop_duplicates(
        subset=[
            "organization_name",
            "caltrans_district",
        ]
    )
    .reset_index(drop=True)
)

In [None]:
# Confirm Banning survives the de-duplication.
# `na=False` counts a missing organization name as "no match"; without it a
# NaN produces a non-boolean mask and `.loc` raises a ValueError.
schd_vp_df2.loc[schd_vp_df2.organization_name.str.contains("Banning", na=False)]

In [None]:
schd_vp_df2.organization_name.value_counts().describe()

In [None]:
schd_vp_df2.service_date.unique()

In [None]:
schd_vp_df2.service_date.value_counts()

In [None]:
schd_vp_df2.loc[schd_vp_df2.service_date == "2024-02-14"]

## Forgot why I need `operator_profiles`

In [None]:
operator_profiles = op_prep.operator_profiles()

In [None]:
operator_profiles