## Find Missing Routes: 2 operators. 
* [Issue](https://github.com/cal-itp/data-analyses/issues/1312): Capitol Corridor doesn't have any rail routes. 
* [Most of Santa Maria's routes not showing up in GTFS Digest](https://github.com/cal-itp/data-analyses/issues/1313)
* `cd data-analyses/rt_segment_speeds && pip install -r requirements.txt && cd ../_shared_utils && make setup_env && cd ../gtfs_digest`
* 1/7: the routes appear in the longest-shape table (`longest_shape_by_route`) but do not appear in `route_typologies`.

In [1]:
import geopandas as gpd
import merge_data
import numpy as np
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils
from shared_utils import catalog_utils, rt_dates, rt_utils
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS

In [2]:
# Notebook-wide pandas display settings: show up to 100 columns, 2-decimal
# floats, and never truncate rows or cell contents.
display_settings = {
    "display.max_columns": 100,
    "display.float_format": "{:.2f}".format,
    "display.max_rows": None,
    "display.max_colwidth": None,
}
for option_name, option_value in display_settings.items():
    pd.set_option(option_name, option_value)

In [3]:
org_name_lists = ["Capitol Corridor Joint Powers Authority", "City of Santa Maria"]

In [4]:
analysis_date_list = rt_dates.y2024_dates

In [5]:
one_analysis_date = "2024-11-13"

In [6]:
# `schedule_gtfs_dataset_key` values used to filter each table below via
# `.isin(schd_keys)`.
# NOTE(review): presumably these are the feeds for the operators listed in
# `org_name_lists` -- confirm. This list is overwritten further down with the
# keys found in `operator_route_gdf2`.
schd_keys = [
    "5a8721fe96786fcd25fba1f8a0ee6358",
    "73105f2d1cabc8170ab066d96863c5d5",
    "f5a749dd65924e025b1293c58f95f8d6",
]

In [7]:
import sys

sys.path.append("../gtfs_funnel/")
import operator_scheduled_stats
import schedule_stats_by_route_direction

### Fix `schd_vp_url`

In [8]:
schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"

#### `df_sched` is already missing a lot of the routes.

In [9]:
# Get cardinal direction for each route
df_sched = merge_data.concatenate_schedule_by_route_direction(analysis_date_list)

In [10]:
df_sched2 = df_sched.loc[df_sched.schedule_gtfs_dataset_key.isin(schd_keys)]

In [11]:
df_sched2.route_id.value_counts()

Shuttle                                 72
5                                       27
b3848f93-d26b-48a9-b6a6-5de22a4eab47     9
Name: route_id, dtype: int64

#### Go back to `gtfs_funnel/schedule_stats_by_route_direction`
* https://github.com/cal-itp/data-analyses/blob/1ba0f544a01f99966a6e210dd11666b4fe4a146e/gtfs_funnel/schedule_stats_by_route_direction.py#L190

In [None]:
GTFS_DATA_DICT.rt_vs_schedule_tables.sched_route_direction_metrics

##### `trip_metrics`: nothing is missing

In [None]:
trip_metrics = schedule_stats_by_route_direction.assemble_scheduled_trip_metrics(
    one_analysis_date, GTFS_DATA_DICT
)

In [None]:
trip_metrics.head(1)

In [None]:
trip_metrics2 = trip_metrics.loc[trip_metrics.schedule_gtfs_dataset_key.isin(schd_keys)]

In [None]:
trip_metrics2[
    ["schedule_gtfs_dataset_key", "route_id", "direction_id"]
].drop_duplicates()

##### Something is causing routes to drop off in `schedule_metrics_by_route_direction` even though I took out `direction_id`

In [None]:
route_group_merge_cols = ["schedule_gtfs_dataset_key", "route_id", "direction_id"]

In [None]:
# Route-level grouping keys, i.e. the route/direction keys minus `direction_id`.
route_group_merge_cols_no_dir_id = ["schedule_gtfs_dataset_key", "route_id"]

In [None]:
# Re-run the funnel's route-direction aggregation on only the trips for the
# feeds in `schd_keys`, grouped by feed / route / direction, so the output can
# be compared against the route list in `trip_metrics2`.
# NOTE(review): the internals of `schedule_metrics_by_route_direction` are not
# visible here -- see the linked gtfs_funnel source for what it merges.
route_dir_metrics = (
    schedule_stats_by_route_direction.schedule_metrics_by_route_direction(
        trip_metrics2, one_analysis_date, route_group_merge_cols
    )
)

In [None]:
# Distinct feed / route / direction combinations that survived the aggregation.
route_dir_metrics[route_group_merge_cols].drop_duplicates()

In [None]:
# Peak/offpeak service frequency per feed / route / direction (long format).
to_peak_offpeak = (
    schedule_stats_by_route_direction.gtfs_schedule_wrangling.aggregate_time_of_day_to_peak_offpeak
)
service_freq_df = to_peak_offpeak(
    trip_metrics2,
    route_group_merge_cols,
    long_or_wide="long",
)

In [None]:
# Same peak/offpeak aggregation, but grouped without `direction_id`.
to_peak_offpeak = (
    schedule_stats_by_route_direction.gtfs_schedule_wrangling.aggregate_time_of_day_to_peak_offpeak
)
service_freq_df2 = to_peak_offpeak(
    trip_metrics2,
    route_group_merge_cols_no_dir_id,
    long_or_wide="long",
)

In [None]:
service_freq_df.route_id.value_counts()

In [None]:
service_freq_df2.route_id.value_counts()

In [None]:
# Average stop spacing and scheduled service minutes per feed / route / direction.
# The stop-spacing input is already a per-trip median, so this is a mean of
# medians. Open question from the original analysis: does that make sense?
# The median is the single boiled-down metric available at the trip level.
metrics_df = (
    trip_metrics2.groupby(route_group_merge_cols, observed=True, group_keys=False)
    .agg(
        avg_stop_meters=("median_stop_meters", "mean"),
        avg_scheduled_service_minutes=("scheduled_service_minutes", "mean"),
    )
    .reset_index()
)

In [None]:
metrics_df.route_id.value_counts()

In [None]:
# Same averages as `metrics_df`, but grouped by feed / route only (no
# `direction_id`). Still a mean of each trip's median stop spacing; the open
# question of whether averaging per-trip medians makes sense applies here too.
trip_value_cols = ["median_stop_meters", "scheduled_service_minutes"]
route_level_means = trip_metrics2.groupby(
    route_group_merge_cols_no_dir_id, observed=True, group_keys=False
)[trip_value_cols].mean()

metrics_df2 = route_level_means.reset_index().rename(
    columns={
        "median_stop_meters": "avg_stop_meters",
        "scheduled_service_minutes": "avg_scheduled_service_minutes",
    }
)

In [None]:
metrics_df2.route_id.value_counts()

##### `gtfs_schedule_wrangling.most_common_shape_by_route_direction` is missing a lot of routes.
* Located in `rt_segment_speeds/segment_speed_utils`

In [None]:
common_shape = gtfs_schedule_wrangling.most_common_shape_by_route_direction(
    one_analysis_date
)

In [None]:
common_shape2 = common_shape.loc[common_shape.schedule_gtfs_dataset_key.isin(schd_keys)]

In [None]:
common_shape2[["route_id"]].drop_duplicates()

In [None]:
# Rebuild the inputs used by `most_common_shape_by_route_direction` step by step.
route_dir_cols = ["gtfs_dataset_key", "route_id", "direction_id"]
keep_trip_cols = [
    *route_dir_cols,
    "trip_instance_key",
    "shape_id",
    "shape_array_key",
]

trips = helpers.import_scheduled_trips(
    one_analysis_date,
    columns=keep_trip_cols,
    get_pandas=True,
).rename(columns={"schedule_gtfs_dataset_key": "gtfs_dataset_key"})

# One ascending flag per route/direction key; the trip-count flag is appended
# where the sort happens.
sorting_order = [True] * len(route_dir_cols)

In [None]:
trips.head(2)

In [None]:
trips2 = trips.loc[trips.gtfs_dataset_key.isin(schd_keys)]

In [None]:
trips2[["route_id", "direction_id"]].drop_duplicates()

In [None]:
route_dir_cols = ["gtfs_dataset_key", "route_id", "direction_id"]

In [None]:
# Most common shape per feed / route / direction: count trips per shape, sort so
# the highest count comes first within each route-direction, keep that first row.
shape_id_cols = ["shape_id", "shape_array_key"]

trips_per_shape = (
    trips2.groupby(route_dir_cols + shape_id_cols, observed=True, group_keys=False)
    .agg({"trip_instance_key": "count"})
    .reset_index()
)

shapes_ranked = trips_per_shape.sort_values(
    route_dir_cols + ["trip_instance_key"],
    ascending=sorting_order + [False],
)

most_common_shape = (
    shapes_ranked.drop_duplicates(subset=route_dir_cols)
    .reset_index(drop=True)[route_dir_cols + shape_id_cols]
    .rename(
        columns={
            "gtfs_dataset_key": "schedule_gtfs_dataset_key",
            "shape_id": "common_shape_id",
        }
    )
)

In [None]:
most_common_shape

In [None]:
route_wo_dir_cols = ["gtfs_dataset_key", "route_id"]

In [None]:
sorting_order

In [None]:
sorting_order + [False]

In [None]:
route_wo_dir_cols + ["trip_instance_key"]

In [None]:
# Most common shape per feed / route, ignoring `direction_id`.
# Count trips per (route, shape), order each route's shapes by trip count, and
# keep the first row per route.
route_shape_cols = route_wo_dir_cols + ["shape_id", "shape_array_key"]

most_common_shape = (
    trips2.groupby(route_shape_cols, observed=True, group_keys=False)
    .agg({"trip_instance_key": "count"})
    .reset_index()
    .sort_values(
        route_wo_dir_cols + ["trip_instance_key"],
        # Route keys ascending, trip count DESCENDING. Reusing `sorting_order`
        # (built for the 3 route-direction keys) here sorted the count ascending,
        # so `drop_duplicates` kept the least common shape for each route.
        ascending=[True] * len(route_wo_dir_cols) + [False],
    )
    .drop_duplicates(subset=route_wo_dir_cols)
    .reset_index(drop=True)[route_shape_cols]
    .rename(
        columns={
            "gtfs_dataset_key": "schedule_gtfs_dataset_key",
            "shape_id": "common_shape_id",
        }
    )
)

In [None]:
most_common_shape

#### `df_avg_speeds` is also missing a lot of routes.

In [None]:
SEGMENT_GCS

In [None]:
GTFS_DATA_DICT.rt_stop_times.route_dir_single_summary

In [None]:
df_avg_speeds = merge_data.concatenate_speeds_by_route_direction(analysis_date_list)

In [None]:
# Keep only the speed rows belonging to the feeds under investigation.
speeds_in_target_feeds = df_avg_speeds["schedule_gtfs_dataset_key"].isin(schd_keys)
df_avg_speeds2 = df_avg_speeds.loc[speeds_in_target_feeds]

In [None]:
df_avg_speeds2.route_id.value_counts()

In [None]:
df_rt_sched = merge_data.concatenate_rt_vs_schedule_by_route_direction(analysis_date_list)

In [None]:
df_rt_sched2 = df_rt_sched.loc[
    df_rt_sched.schedule_gtfs_dataset_key.isin(schd_keys)
]

In [None]:
df_rt_sched2.route_id.value_counts()

### Fix Map: `gtfs_digest/merge_operator_data`

In [None]:
OPERATOR_FILE = GTFS_DATA_DICT.digest_tables.operator_profiles
OPERATOR_ROUTE = GTFS_DATA_DICT.digest_tables.operator_routes_map

In [None]:
# Route geometries currently published for the digest's operator map.
operator_route_gdf = gpd.read_parquet(
    f"{RT_SCHED_GCS}{OPERATOR_ROUTE}.parquet",
)

# `operator_route_gdf2` is used by the cells below but was never defined, so a
# fresh Restart & Run All failed with NameError.
# NOTE(review): the filter is reconstructed from how the frame is used later
# (organization_name counts, and the same `org_name_lists` filter applied to
# `test_df`) -- confirm it matches the original, deleted cell.
operator_route_gdf2 = operator_route_gdf.loc[
    operator_route_gdf.organization_name.isin(org_name_lists)
]

In [None]:
operator_route_gdf.columns

In [None]:
operator_route_gdf2.columns

In [None]:
len(operator_route_gdf2)

In [None]:
operator_route_gdf2.is_rail.value_counts()

In [None]:
operator_route_gdf2.organization_name.value_counts()

In [None]:
operator_route_gdf2.schedule_gtfs_dataset_key.unique()

### Why does City of Santa Maria have multiple schedule_gtfs_dataset_keys?

In [None]:
# Count distinct `route_short_name` values for each organization / feed key pair,
# to see how the operator's routes are split across its schedule feed keys.
operator_route_gdf2.groupby(["organization_name", "schedule_gtfs_dataset_key"]).agg(
    {"route_short_name": "nunique"}
)

In [None]:
operator_route_gdf2.drop(columns=["service_date"]).explore("organization_name")

In [None]:
# operator_route_gdf2.drop(columns = ["service_date"]).explore("shape_array_key")

### Starting from here [`gtfs_funnel/operator_scheduled_stats`](https://github.com/cal-itp/data-analyses/blob/4dc340343a60b45ad94217c3efd91f807b03ebc2/gtfs_funnel/operator_scheduled_stats.py#L148)

In [None]:
# Reuse the single date chosen at the top of the notebook instead of repeating
# the literal, so both halves of the investigation always look at the same day.
analysis_date = one_analysis_date

In [None]:
# NOTE: this overwrites the hard-coded `schd_keys` list defined near the top of
# the notebook. From here on, filters use every feed key present in
# `operator_route_gdf2`.
schd_keys = list(operator_route_gdf2.schedule_gtfs_dataset_key.unique())

#### Longest shape does have all the routes...

In [None]:
longest_shape_gdf = operator_scheduled_stats.longest_shape_by_route(analysis_date)

In [None]:
# Longest-shape rows for the feeds under investigation only.
longest_shape_in_target_feeds = longest_shape_gdf["schedule_gtfs_dataset_key"].isin(
    schd_keys
)
longest_shape_gdf2 = longest_shape_gdf.loc[longest_shape_in_target_feeds]

In [None]:
longest_shape_gdf2.columns

In [None]:
longest_shape_gdf2.info()

In [None]:
longest_shape_gdf2.route_id.value_counts()

In [None]:
# longest_shape_gdf2.explore("schedule_gtfs_dataset_key")

In [None]:
longest_shape_gdf2.groupby(["schedule_gtfs_dataset_key", "route_id"]).agg(
    {"route_length_miles": "max"}
)

#### Somewhere along the way the routes are cut, possibly because of `direction_id`

In [None]:
OPERATOR_EXPORT = GTFS_DATA_DICT.schedule_tables.operator_scheduled_stats

In [None]:
SCHED_GCS

In [None]:
GTFS_DATA_DICT.schedule_tables.operator_routes

In [None]:
dec_url = "gs://calitp-analytics-data/data-analyses/gtfs_schedule/operator_profiles/operator_routes_2024-12-11.parquet"

In [None]:
dec_df = gpd.read_parquet(dec_url)

In [None]:
dec_df.organization_name.value_counts().head()

In [None]:
dec_df.loc[
    dec_df.organization_name == "Alameda-Contra Costa Transit District"
].head().drop(columns=["geometry"]).T

In [None]:
dec_df2 = dec_df.loc[dec_df.schedule_gtfs_dataset_key.isin(schd_keys)]

In [None]:
dec_df2.shape

In [None]:
type(dec_df2)

In [None]:
dec_df2.drop(columns=["geometry"]).T

In [None]:
# dec_df2.explore()

#### Find where in `gtfs_funnel` all the routes disappear

In [None]:
group_cols = ["schedule_gtfs_dataset_key"]

In [None]:
longest_shape_gdf2.info()

### Something is going on in `operator_scheduled_stats.schedule_stats_by_operator`

In [None]:
ROUTE_TYPOLOGY = GTFS_DATA_DICT.schedule_tables.route_typologies

In [None]:
route_typology = pd.read_parquet(f"{SCHED_GCS}{ROUTE_TYPOLOGY}_{analysis_date}.parquet")

In [None]:
from route_typologies import route_typologies

In [None]:
# Sum every `is_<typology>` flag column per feed / route.
typology_sum_spec = {f"is_{typology}": "sum" for typology in route_typologies}

route_typology_grouped = (
    route_typology.groupby(["schedule_gtfs_dataset_key", "route_id"])
    .agg(typology_sum_spec)
    .reset_index()
)

In [None]:
# Typology rows for the feeds under investigation only.
typology_in_target_feeds = route_typology_grouped["schedule_gtfs_dataset_key"].isin(
    schd_keys
)
route_typology_grouped2 = route_typology_grouped.loc[typology_in_target_feeds]

#### Routes are missing for Santa Maria and Capitol Corridor in `ROUTE_TYPOLOGY`

In [None]:
route_typology_grouped2.T

In [None]:
# Outer join on feed key + route_id keeps rows from BOTH tables: routes that
# have a longest shape but no typology row stay in the result with NaN in the
# typology columns, which makes the missing routes visible.
route_gdf = longest_shape_gdf2.merge(
    route_typology_grouped2, on=["schedule_gtfs_dataset_key", "route_id"], how="outer"
)

In [None]:
route_gdf.shape

In [None]:
route_gdf.drop(columns=["geometry"])

In [None]:
# route_gdf2.explore("schedule_gtfs_dataset_key")

### Change merge from `inner` to `left`

In [None]:
f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_routes_map}.parquet"

In [None]:
SCHED_GCS

In [None]:
GTFS_DATA_DICT.schedule_tables.operator_routes

In [None]:
my_test_url = "gs://calitp-analytics-data/data-analyses/gtfs_schedule/operator_profiles/operator_routes_2024-12-11_AH.parquet"

In [None]:
f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_profiles}.parquet"

In [None]:
test_gdf = gpd.read_parquet(my_test_url)

In [None]:
test_gdf2 = test_gdf.loc[test_gdf.schedule_gtfs_dataset_key.isin(schd_keys)]

In [None]:
test_gdf2.explore("route_id")

#### Test with all the dates.

In [None]:
GTFS_DATA_DICT.schedule_tables.operator_routes

In [None]:
RT_SCHED_GCS

In [None]:
f"{OPERATOR_ROUTE}_AH_test"

In [None]:
f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_routes_map}.parquet"

In [None]:
test_df = gpd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/digest/operator_routes_AH_test.parquet"
)

In [None]:
test_df.columns

In [None]:
op_routes_gdf = test_df.loc[test_df.organization_name.isin(org_name_lists)]

In [None]:
# Find the most recent geography for each route.
# Keep the most recent geometry for each route of the two operators.
#
# Built from `test_df` in one chain so the cell is idempotent: the previous
# version overwrote its own input and dropped `service_date`, so running the
# cell a second time raised KeyError on the sort.
op_routes_gdf = (
    test_df.loc[test_df.organization_name.isin(org_name_lists)]
    # Newest service_date first, so drop_duplicates keeps the latest row.
    .sort_values(by=["service_date"], ascending=False)
    .drop_duplicates(
        subset=[
            # Include the operator so identically named routes from different
            # operators are never collapsed into one row.
            "organization_name",
            "route_long_name",
            "route_short_name",
            "route_combined_name",
        ]
    )
    # service_date is no longer needed once the latest row is selected.
    .drop(columns=["service_date"])
)

In [None]:
op_routes_gdf.organization_name.value_counts()

In [None]:
op_routes_gdf.loc[op_routes_gdf.organization_name == "City of Santa Maria"].explore(
    "route_long_name"
)