## Find Missing Routes: 2 operators. 
* [Issue](https://github.com/cal-itp/data-analyses/issues/1312): Capitol Corridor doesn't have any rail routes.
* [Most of Santa Maria's routes not showing up in GTFS Digest](https://github.com/cal-itp/data-analyses/issues/1313)
* `cd data-analyses/rt_segment_speeds && pip install -r requirements.txt && cd ../_shared_utils && make setup_env && cd ../gtfs_digest`

In [None]:
import geopandas as gpd
import merge_data
import numpy as np
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils
from shared_utils import catalog_utils, rt_dates, rt_utils
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS

In [None]:
# Notebook display settings: show up to 100 columns, never truncate rows or
# cell contents, and render floats with two decimals.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

### `gtfs_digest/merge_operator_data`

In [None]:
# Digest-table catalog entries. OPERATOR_ROUTE is the file stem used to build
# the parquet path for the operator routes map.
OPERATOR_FILE = GTFS_DATA_DICT.digest_tables.operator_profiles
OPERATOR_ROUTE = GTFS_DATA_DICT.digest_tables.operator_routes_map

In [None]:
# Read the operator routes map parquet as a GeoDataFrame.
operator_route_gdf = gpd.read_parquet(f"{RT_SCHED_GCS}{OPERATOR_ROUTE}.parquet")

In [None]:
operator_route_gdf.columns

In [None]:
# The two organizations whose routes are missing from the GTFS Digest.
org_name_lists = [
    "Capitol Corridor Joint Powers Authority",
    "City of Santa Maria",
]

In [None]:
# Keep only the rows that belong to the organizations under investigation.
operator_route_gdf2 = operator_route_gdf[
    operator_route_gdf["organization_name"].isin(org_name_lists)
]

In [None]:
operator_route_gdf2.columns

In [None]:
len(operator_route_gdf2)

In [None]:
operator_route_gdf2.is_rail.value_counts()

In [None]:
operator_route_gdf2.organization_name.value_counts()

In [None]:
operator_route_gdf2.schedule_gtfs_dataset_key.unique()

### Why does City of Santa Maria have multiple schedule_gtfs_dataset_keys?

In [None]:
# Number of distinct route short names per organization and schedule feed key.
operator_route_gdf2.groupby(["organization_name", "schedule_gtfs_dataset_key"])[
    ["route_short_name"]
].nunique()

In [None]:
operator_route_gdf2.drop(columns=["service_date"]).explore("organization_name")

In [None]:
# operator_route_gdf2.drop(columns = ["service_date"]).explore("shape_array_key")

### Find longest_shape_array_key [at `gtfs_funnel/operator_scheduled_stats`](https://github.com/cal-itp/data-analyses/blob/4dc340343a60b45ad94217c3efd91f807b03ebc2/gtfs_funnel/operator_scheduled_stats.py#L148)
* There aren't any routes for Santa Maria
* Routes are showing for Capitol Corridor.

In [None]:
OPERATOR_ROUTE

In [None]:
GTFS_DATA_DICT.schedule_tables.operator_routes

In [None]:
analysis_date = "2024-11-13"

In [None]:
route_cols = ["schedule_gtfs_dataset_key", "route_id"]

In [None]:
# Keep one row per (feed, route): sort so the largest route_length comes first
# within each route, then drop the remaining duplicates.
longest_shape_gdf = (
    gtfs_schedule_wrangling.longest_shape_by_route_direction(analysis_date)
    .sort_values(
        by=[*route_cols, "route_length"],
        ascending=[True] * len(route_cols) + [False],
    )
    .drop_duplicates(subset=route_cols)
    .reset_index(drop=True)
)

In [None]:
# Distinct schedule feed keys for the organizations under investigation.
schd_keys = operator_route_gdf2["schedule_gtfs_dataset_key"].unique().tolist()

In [None]:
schd_keys

In [None]:
# Restrict the longest-shape table to the feeds under investigation.
longest_shape_gdf2 = longest_shape_gdf[
    longest_shape_gdf["schedule_gtfs_dataset_key"].isin(schd_keys)
]

In [None]:
longest_shape_gdf2.columns

In [None]:
longest_shape_gdf2.info()

In [None]:
# NOTE: dropna() with default arguments removes a row if ANY column is null,
# so routes with a single missing attribute disappear here.
longest_shape_gdf2 = longest_shape_gdf2.dropna()

In [None]:
longest_shape_gdf2.shape_array_key.nunique()

In [None]:
longest_shape_gdf2.schedule_gtfs_dataset_key.value_counts()

In [None]:
longest_shape_gdf2.schedule_gtfs_dataset_key.unique()

In [None]:
longest_shape_gdf2.explore("schedule_gtfs_dataset_key")

### Step back before finding the longest shape [here](https://github.com/cal-itp/data-analyses/blob/4dc340343a60b45ad94217c3efd91f807b03ebc2/rt_segment_speeds/segment_speed_utils/gtfs_schedule_wrangling.py#L365)
#### Amanda: deleted `direction_id` b/c I discovered City of Santa Maria doesn't have values for the column `direction_id`

In [None]:
# Route grouping columns; direction_id is intentionally left out here.
route_dir_cols = ["gtfs_dataset_key", "route_id"]

# Trip columns to load: the grouping columns plus trip and shape identifiers.
keep_trip_cols = [
    *route_dir_cols,
    "trip_instance_key",
    "shape_id",
    "shape_array_key",
]

In [None]:
# Load scheduled trips for the analysis date, then rename the feed-key column
# to gtfs_dataset_key so it matches route_dir_cols.
trips = helpers.import_scheduled_trips(
    analysis_date,
    columns=keep_trip_cols,
    get_pandas=True,
)
trips = trips.rename(columns={"schedule_gtfs_dataset_key": "gtfs_dataset_key"})

In [None]:
# One ascending flag per route grouping column.
sorting_order = [True] * len(route_dir_cols)

In [None]:
# Keep only trips from the feeds under investigation and renumber the index.
trips2 = trips[trips["gtfs_dataset_key"].isin(schd_keys)].reset_index(drop=True)

In [None]:
trips2.info()

In [None]:
# Same trip columns as before, but this time including direction_id.
direction_id_kept = [
    "direction_id",
    "gtfs_dataset_key",
    "route_id",
    "trip_instance_key",
    "shape_id",
    "shape_array_key",
]

In [None]:
# Reload scheduled trips with direction_id included, then rename the feed-key
# column to gtfs_dataset_key.
trips_w_direction_id = helpers.import_scheduled_trips(
    analysis_date,
    columns=direction_id_kept,
    get_pandas=True,
)
trips_w_direction_id = trips_w_direction_id.rename(
    columns={"schedule_gtfs_dataset_key": "gtfs_dataset_key"}
)

In [None]:
# Keep only trips from the feeds under investigation and renumber the index.
trips_w_direction_id2 = trips_w_direction_id[
    trips_w_direction_id["gtfs_dataset_key"].isin(schd_keys)
].reset_index(drop=True)

In [None]:
trips_w_direction_id2.info()

In [None]:
len(trips2)

In [None]:
trips2.gtfs_dataset_key.value_counts()

In [None]:
# For each route, pick the shape used by the most trips:
#   1. count trips per (route, shape),
#   2. sort so the highest count comes first within each route,
#   3. keep the first row per route,
#   4. keep only the identifying columns and rename them.
most_common_shape = (
    trips2.groupby(
        [*route_dir_cols, "shape_id", "shape_array_key"],
        observed=True,
        group_keys=False,
    )[["trip_instance_key"]]
    .count()
    .reset_index()
    .sort_values(
        by=[*route_dir_cols, "trip_instance_key"],
        ascending=[*sorting_order, False],
    )
    .drop_duplicates(subset=route_dir_cols)
    .reset_index(drop=True)
    .loc[:, [*route_dir_cols, "shape_id", "shape_array_key"]]
    .rename(
        columns={
            "gtfs_dataset_key": "schedule_gtfs_dataset_key",
            "shape_id": "common_shape_id",
        }
    )
)

In [None]:
type(most_common_shape)

In [None]:
len(most_common_shape)

In [None]:
# Load the shape geometries for the analysis date, keyed by shape_array_key.
shape_geom = helpers.import_scheduled_shapes(
    analysis_date, columns=["shape_array_key", "geometry"]
)

In [None]:
# Attach geometry to each route's most common shape. shape_geom stays on the
# left of the merge; the join key is dropped once the merge is done.
common_shape_geom = pd.merge(
    left=shape_geom,
    right=most_common_shape,
    how="inner",
    on="shape_array_key",
)
common_shape_geom = common_shape_geom.drop(columns=["shape_array_key"])

In [None]:
type(common_shape_geom), len(common_shape_geom)

In [None]:
common_shape_geom.columns

In [None]:
from shared_utils import portfolio_utils

In [None]:
common_shape_geom.schedule_gtfs_dataset_key.value_counts()

In [None]:
# Build a route-name lookup: load the distinct route naming columns, let
# portfolio_utils.add_route_name process them, then drop the raw name columns.
route_info = helpers.import_scheduled_trips(
    analysis_date,
    columns=[
        "gtfs_dataset_key",
        "route_id",
        "route_long_name",
        "route_short_name",
        "route_desc",
    ],
).drop_duplicates()
route_info = portfolio_utils.add_route_name(route_info)
route_info = route_info.drop(
    columns=["route_long_name", "route_short_name", "route_desc"]
)

In [None]:
# Add the route name to each common shape (inner join on feed key + route_id).
common_shape_geom2 = pd.merge(
    left=common_shape_geom,
    right=route_info.rename(columns={"route_name_used": "route_name"}),
    how="inner",
    on=["schedule_gtfs_dataset_key", "route_id"],
)

In [None]:
common_shape_geom2.drop(columns=["geometry"])

In [None]:
# Map only the City of Santa Maria feed, colored by route_id.
common_shape_geom2[
    common_shape_geom2["schedule_gtfs_dataset_key"]
    == "73105f2d1cabc8170ab066d96863c5d5"
].explore("route_id")

### Don't look at most common shape, just load trips.
* `f5a749dd65924e025b1293c58f95f8d6` is Amtrak
* `73105f2d1cabc8170ab066d96863c5d5` is the City of Santa Maria

In [None]:
len(trips2)

In [None]:
trips2.head(2)

In [None]:
trips2.gtfs_dataset_key.value_counts()

In [None]:
# Join every trip (not just the most common shape) to its shape geometry.
test2 = pd.merge(
    left=shape_geom,
    right=trips2,
    how="inner",
    on="shape_array_key",
)

In [None]:
len(test2)

In [None]:
# test2.head(1)

In [None]:
route_dir_cols

### City of Santa Maria has many rows without a `direction_id` value. That is why so few routes are appearing.

In [None]:
test2.info()

In [None]:
# Trip count per feed / route / shape, with no direction_id in the grouping.
test2.groupby(["gtfs_dataset_key", "route_id", "shape_id", "shape_array_key"])[
    ["trip_instance_key"]
].count()

### Figuring out where `GTFS_DATA_DICT.digest_tables.operator_routes_map` comes from
* Nothing pops up when I search our repo.

In [None]:
SCHED_GCS

In [None]:
GTFS_DATA_DICT.schedule_tables.operator_routes

In [None]:
# Hard-coded GCS path to the operator routes parquet dated 2024-12-11.
dec_op_routes = (
    "gs://calitp-analytics-data/data-analyses/gtfs_schedule/"
    "operator_profiles/operator_routes_2024-12-11.parquet"
)

In [None]:
dec_op_df = gpd.read_parquet(dec_op_routes)

In [None]:
# Keep only the rows for the feeds under investigation.
dec_op_df2 = dec_op_df[dec_op_df["schedule_gtfs_dataset_key"].isin(schd_keys)]

In [None]:
dec_op_df2.schedule_gtfs_dataset_key.value_counts()

In [None]:
dec_op_df2.explore()