## Find Missing Routes: 2 operators. 
* [Issue](https://github.com/cal-itp/data-analyses/issues/1312): Capital Corridor doesn't have any rail routes. 
* [Most of Santa Maria's routes not showing up in GTFS Digest](https://github.com/cal-itp/data-analyses/issues/1313)
* `cd data-analyses/rt_segment_speeds && pip install -r requirements.txt && cd ../_shared_utils && make setup_env && cd ../gtfs_digest`
* 1/7: the routes are appearing in `the longest shape` but not appearing in `route_typologies`

In [1]:
import geopandas as gpd
import merge_data
import numpy as np
import pandas as pd
from segment_speed_utils import (
    gtfs_schedule_wrangling,
    helpers,
    metrics,
    segment_calcs,
    time_series_utils,
)
from shared_utils import (
    catalog_utils,
    portfolio_utils,
    rt_dates,
    rt_utils,
    time_helpers,
)
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
org_name_lists = ["Capitol Corridor Joint Powers Authority", "City of Santa Maria"]

In [4]:
analysis_date_list = ["2024-11-13"]

In [5]:
one_analysis_date = "2024-11-13"

In [6]:
schd_keys = [
    "5a8721fe96786fcd25fba1f8a0ee6358",
    "73105f2d1cabc8170ab066d96863c5d5",
    "f5a749dd65924e025b1293c58f95f8d6",
]

In [7]:
import sys

sys.path.append("../gtfs_funnel/")
import operator_scheduled_stats
import schedule_stats_by_route_direction

In [8]:
def preview(df):
    df2 = df[
        ["schedule_gtfs_dataset_key", "route_id", "direction_id"]
    ].drop_duplicates()
    display(df2)

### Fix `schd_vp_url`

In [9]:
schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"

In [10]:
schd_vp_df = pd.read_parquet(schd_vp_url)

In [11]:
schd_vp_df2 = schd_vp_df.loc[schd_vp_df.organization_name.isin(org_name_lists)]

In [12]:
schd_vp_df2.route_id.unique()

array(['b3848f93-d26b-48a9-b6a6-5de22a4eab47', '5', 'Shuttle', 'CC'],
      dtype=object)

###  Check out `rt_segment_speeds/segment_speed_utils/gtfs_schedule_wrangling`
* https://github.com/cal-itp/data-analyses/blob/4dc340343a60b45ad94217c3efd91f807b03ebc2/rt_segment_speeds/segment_speed_utils/gtfs_schedule_wrangling.py 
* Tiffany: <i>Can you try specifying the dropna argument inside pandas groupby? Our pandas version has gone through upgrades, from 0.25 to now 1.5 ), and this argument was introduced in 1.1 and since it's dropna=True, that's probably what's driving the the row behavior.</i>
* It worked! Now time to rerun stuff further down the pipeline and see what happens.

In [13]:
def most_common_shape_by_route_direction(analysis_date: str) -> gpd.GeoDataFrame:
    """
    Find shape_id with most trips for that route-direction.
    Merge in shape geometry.
    """
    route_dir_cols = ["gtfs_dataset_key", "route_id", "direction_id"]

    keep_trip_cols = route_dir_cols + [
        "trip_instance_key",
        "shape_id",
        "shape_array_key",
    ]

    trips = helpers.import_scheduled_trips(
        analysis_date, columns=keep_trip_cols, get_pandas=True
    ).rename(columns={"schedule_gtfs_dataset_key": "gtfs_dataset_key"})
    sorting_order = [True for i in route_dir_cols]

    most_common_shape = (
        trips.groupby(
            route_dir_cols + ["shape_id", "shape_array_key"],
            observed=True,
            group_keys=False,
            dropna=False,
        )
        .agg({"trip_instance_key": "count"})
        .reset_index()
        .sort_values(
            route_dir_cols + ["trip_instance_key"], ascending=sorting_order + [False]
        )
        .drop_duplicates(subset=route_dir_cols)
        .reset_index(drop=True)[route_dir_cols + ["shape_id", "shape_array_key"]]
    ).rename(
        columns={
            "gtfs_dataset_key": "schedule_gtfs_dataset_key",
            "shape_id": "common_shape_id",
        }
    )

    shape_geom = helpers.import_scheduled_shapes(
        analysis_date,
        columns=["shape_array_key", "geometry"],
    )

    common_shape_geom = pd.merge(
        shape_geom, most_common_shape, on="shape_array_key", how="inner"
    ).drop(columns="shape_array_key")

    route_info = (
        helpers.import_scheduled_trips(
            analysis_date,
            columns=[
                "gtfs_dataset_key",
                "route_id",
                "route_long_name",
                "route_short_name",
                "route_desc",
            ],
        )
        .drop_duplicates()
        .pipe(portfolio_utils.add_route_name)
        .drop(columns=["route_long_name", "route_short_name", "route_desc"])
    )

    del shape_geom, most_common_shape

    common_shape_geom2 = pd.merge(
        common_shape_geom,
        route_info.rename(columns={"route_name_used": "route_name"}),
        on=["schedule_gtfs_dataset_key", "route_id"],
    )

    return common_shape_geom2

In [14]:
common_shape_test = most_common_shape_by_route_direction(one_analysis_date)

In [15]:
common_shape_test.columns

Index(['geometry', 'schedule_gtfs_dataset_key', 'route_id', 'direction_id',
       'common_shape_id', 'route_name'],
      dtype='object')

In [16]:
common_shape_test2 = common_shape_test.loc[
    common_shape_test.schedule_gtfs_dataset_key.isin(schd_keys)
]

In [17]:
common_shape_test2.route_id.unique()

array(['7', '6', 'CC', '8', 'Mall', '12X', '13X', '11', '30', 'Shuttle',
       '8a7c42f9-51e4-4848-bf88-30c210f149ad', '2', '3', '1B', '20', 'SF',
       '5', '4', '9', '1'], dtype=object)

#### I think not having anything in `direction_id` is messing everything up. 

In [18]:
common_shape_test2.drop(columns=["geometry"]).head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,common_shape_id,route_name
167,73105f2d1cabc8170ab066d96863c5d5,7,,715be44b-4dee-4c56-83f8-b1970d6133cf,"Rt 7. A. H. College, Crossroads Shopping Center via Boone St, Bradley Rd."
193,73105f2d1cabc8170ab066d96863c5d5,6,,de042d01-f50a-4b67-ba25-4628643021fa,Rt 6. Oak Knolls to Old Orcutt-East to West-Outbound


### Breakdown `gtfs_digest/merge_data.`

#### Line 294: `df_sched` is already missing a lot of the routes.

In [19]:
# Get cardinal direction for each route
df_sched_og = merge_data.concatenate_schedule_by_route_direction(analysis_date_list)

In [20]:
df_sched2_og = df_sched_og.loc[df_sched_og.schedule_gtfs_dataset_key.isin(schd_keys)]

In [21]:
df_sched2_og.route_id.value_counts()

Shuttle    6
5          3
Name: route_id, dtype: int64

##### Go back to `gtfs_funnel/schedule_stats_by_route_direction`
* https://github.com/cal-itp/data-analyses/blob/1ba0f544a01f99966a6e210dd11666b4fe4a146e/gtfs_funnel/schedule_stats_by_route_direction.py#L190
* Test 1: Updated `gtfs_schedule_wrangling` but a lot of routes are still missing. 

##### `assemble_scheduled_trip_metrics`: nothing is missing

In [22]:
trip_metrics = schedule_stats_by_route_direction.assemble_scheduled_trip_metrics(
    one_analysis_date, GTFS_DATA_DICT
)

In [23]:
trip_metrics.head(1)

Unnamed: 0,schedule_gtfs_dataset_key,trip_instance_key,median_stop_meters,time_of_day,scheduled_service_minutes,route_id,direction_id
0,0139b1253130b33adcd4b3a4490530d2,014dd8051849e5252df704ca9c381fd9,559.44,PM Peak,23.0,D4,


In [24]:
trip_metrics2 = trip_metrics.loc[trip_metrics.schedule_gtfs_dataset_key.isin(schd_keys)]

In [25]:
trip_metrics2.columns

Index(['schedule_gtfs_dataset_key', 'trip_instance_key', 'median_stop_meters',
       'time_of_day', 'scheduled_service_minutes', 'route_id', 'direction_id'],
      dtype='object')

In [26]:
trip_metrics2.shape

(335, 7)

In [27]:
trip_metrics2.time_of_day.unique()

array(['PM Peak', 'Midday', 'AM Peak', 'Early AM', 'Evening'],
      dtype=object)

##### Each row is populated.

In [28]:
trip_metrics2.loc[trip_metrics2.time_of_day == "Midday"].drop_duplicates(
    subset=["schedule_gtfs_dataset_key", "route_id", "direction_id"]
).drop(columns=["schedule_gtfs_dataset_key", "trip_instance_key"])

Unnamed: 0,median_stop_meters,time_of_day,scheduled_service_minutes,route_id,direction_id
49725,405.04,Midday,35.0,3,
49729,178.05,Midday,14.98,Mall,
49731,451.15,Midday,41.0,11,
49736,361.12,Midday,30.0,1B,
49737,357.22,Midday,30.37,1,
49738,444.75,Midday,40.0,9,
49741,440.62,Midday,41.0,4,
49742,989.61,Midday,56.0,12X,
49744,437.51,Midday,42.0,5,0.0
49754,477.41,Midday,53.0,13X,


In [29]:
preview(trip_metrics2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id
49724,73105f2d1cabc8170ab066d96863c5d5,30,
49725,73105f2d1cabc8170ab066d96863c5d5,3,
49727,73105f2d1cabc8170ab066d96863c5d5,20,
49728,73105f2d1cabc8170ab066d96863c5d5,4,
49729,73105f2d1cabc8170ab066d96863c5d5,Mall,
49730,73105f2d1cabc8170ab066d96863c5d5,5,0.0
49731,73105f2d1cabc8170ab066d96863c5d5,11,
49732,73105f2d1cabc8170ab066d96863c5d5,7,
49733,73105f2d1cabc8170ab066d96863c5d5,9,
49735,73105f2d1cabc8170ab066d96863c5d5,1,


##### `gtfs_funnel/schedule_stats_by_route_direction/schedule_metrics_by_route_direction` 
* update to `dropna=False`

In [30]:
from shared_utils.rt_utils import METERS_PER_MILE

In [31]:
def schedule_metrics_by_route_direction(
    df: pd.DataFrame,
    analysis_date: str,
    group_merge_cols: list,
) -> pd.DataFrame:
    """
    Aggregate trip-level metrics to route-direction, and
    attach shape geometry for common_shape_id.
    """
    service_freq_df = gtfs_schedule_wrangling.aggregate_time_of_day_to_peak_offpeak(
        df, group_merge_cols, long_or_wide="long"
    )

    metrics_df = (
        df.groupby(group_merge_cols, observed=True, group_keys=False, dropna=False)
        .agg(
            {
                "median_stop_meters": "mean",
                # take mean of the median stop spacing for trip
                # does this make sense?
                # median is the single boiled down metric at the trip-level
                "scheduled_service_minutes": "mean",
            }
        )
        .reset_index()
        .rename(
            columns={
                "median_stop_meters": "avg_stop_meters",
                "scheduled_service_minutes": "avg_scheduled_service_minutes",
            }
        )
    )

    metrics_df = metrics_df.assign(
        avg_stop_miles=metrics_df.avg_stop_meters.divide(METERS_PER_MILE).round(2)
    ).drop(columns=["avg_stop_meters"])

    round_me = ["avg_stop_miles", "avg_scheduled_service_minutes"]
    metrics_df[round_me] = metrics_df[round_me].round(2)

    common_shape = gtfs_schedule_wrangling.most_common_shape_by_route_direction(
        analysis_date
    ).pipe(helpers.remove_shapes_outside_ca)

    df = pd.merge(common_shape, metrics_df, on=group_merge_cols, how="inner").merge(
        service_freq_df, on=group_merge_cols, how="inner"
    )

    return df

##### The routes are still missing even after `dropna=False` because `direction_id` is missing.

In [32]:
route_group_merge_cols = ["schedule_gtfs_dataset_key", "route_id", "direction_id"]

In [33]:
route_dir_metrics_og = schedule_metrics_by_route_direction(
    trip_metrics2, one_analysis_date, route_group_merge_cols
)

In [34]:
route_dir_metrics_og.drop(
    columns=["geometry", "schedule_gtfs_dataset_key", "common_shape_id"]
)

Unnamed: 0,route_id,direction_id,route_name,avg_scheduled_service_minutes,avg_stop_miles,n_trips,time_period,peak_offpeak,frequency
0,Shuttle,1.0,Shuttle to Auburn,72.0,13.74,5,all_day,,0.21
1,Shuttle,1.0,Shuttle to Auburn,72.0,13.74,2,,offpeak,0.08
2,Shuttle,1.0,Shuttle to Auburn,72.0,13.74,3,,peak,0.12
3,Shuttle,0.0,Shuttle to Auburn,70.0,11.78,5,all_day,,0.21
4,Shuttle,0.0,Shuttle to Auburn,70.0,11.78,3,,offpeak,0.12
5,Shuttle,0.0,Shuttle to Auburn,70.0,11.78,2,,peak,0.08
6,5,0.0,"Rt 5. Transit Center to Gov't Cntr to Evergreen Shopping Center via Miller St., S.M. Way",42.0,0.27,18,all_day,,0.75
7,5,0.0,"Rt 5. Transit Center to Gov't Cntr to Evergreen Shopping Center via Miller St., S.M. Way",42.0,0.27,8,,offpeak,0.33
8,5,0.0,"Rt 5. Transit Center to Gov't Cntr to Evergreen Shopping Center via Miller St., S.M. Way",42.0,0.27,10,,peak,0.42


##### Looking at the dataframe using the original script in `schedule_stats_by_route_direction.schedule_metrics_by_route_direction`: a lot of missing values in `time_period`?

In [35]:
route_dir_metrics_script_og = (
    schedule_stats_by_route_direction.schedule_metrics_by_route_direction(
        trip_metrics2, one_analysis_date, route_group_merge_cols
    )
)

In [36]:
route_dir_metrics_script_og.drop(
    columns=["geometry", "schedule_gtfs_dataset_key", "common_shape_id"]
)

Unnamed: 0,route_id,direction_id,route_name,avg_scheduled_service_minutes,avg_stop_miles,n_trips,time_period,peak_offpeak,frequency
0,Shuttle,1.0,Shuttle to Auburn,72.0,13.74,5,all_day,,0.21
1,Shuttle,1.0,Shuttle to Auburn,72.0,13.74,2,,offpeak,0.08
2,Shuttle,1.0,Shuttle to Auburn,72.0,13.74,3,,peak,0.12
3,Shuttle,0.0,Shuttle to Auburn,70.0,11.78,5,all_day,,0.21
4,Shuttle,0.0,Shuttle to Auburn,70.0,11.78,3,,offpeak,0.12
5,Shuttle,0.0,Shuttle to Auburn,70.0,11.78,2,,peak,0.08
6,5,0.0,"Rt 5. Transit Center to Gov't Cntr to Evergreen Shopping Center via Miller St., S.M. Way",42.0,0.27,18,all_day,,0.75
7,5,0.0,"Rt 5. Transit Center to Gov't Cntr to Evergreen Shopping Center via Miller St., S.M. Way",42.0,0.27,8,,offpeak,0.33
8,5,0.0,"Rt 5. Transit Center to Gov't Cntr to Evergreen Shopping Center via Miller St., S.M. Way",42.0,0.27,10,,peak,0.42


In [37]:
route_dir_metrics_og.route_id.unique()

array(['Shuttle', '5'], dtype=object)

##### `schedule_metrics_by_route_direction` update `dropna=False`

In [38]:
from shared_utils.rt_utils import METERS_PER_MILE

In [39]:
route_group_merge_cols_no_dir_id = [
    "schedule_gtfs_dataset_key",
    "route_id",
]

In [40]:
route_dir_metrics_wo_dir_id = schedule_metrics_by_route_direction(
    trip_metrics2, one_analysis_date, route_group_merge_cols_no_dir_id
)

##### All of the routes appear after excluding `direction_id` from `groupby` and adding `dropna=False` with the proper amount of data.

In [41]:
route_dir_metrics_wo_dir_id.drop(
    columns=["geometry", "schedule_gtfs_dataset_key", "common_shape_id"]
)

Unnamed: 0,route_id,direction_id,route_name,avg_scheduled_service_minutes,avg_stop_miles,n_trips,time_period,peak_offpeak,frequency
0,7,0.0,"Rt 7. A. H. College, Crossroads Shopping Center via Boone St, Bradley Rd.",36.0,0.37,19,all_day,,0.79
1,7,0.0,"Rt 7. A. H. College, Crossroads Shopping Center via Boone St, Bradley Rd.",36.0,0.37,9,,offpeak,0.38
2,7,0.0,"Rt 7. A. H. College, Crossroads Shopping Center via Boone St, Bradley Rd.",36.0,0.37,10,,peak,0.42
3,6,0.0,Rt 6. Oak Knolls to Old Orcutt-East to West-Outbound,38.0,0.38,18,all_day,,0.75
4,6,0.0,Rt 6. Oak Knolls to Old Orcutt-East to West-Outbound,38.0,0.38,7,,offpeak,0.29
5,6,0.0,Rt 6. Oak Knolls to Old Orcutt-East to West-Outbound,38.0,0.38,11,,peak,0.46
6,8,0.0,Rt 8. Tanglewood to Crossroads Shopping Center via McCoy Ln.,43.0,0.32,16,all_day,,0.67
7,8,0.0,Rt 8. Tanglewood to Crossroads Shopping Center via McCoy Ln.,43.0,0.32,8,,offpeak,0.33
8,8,0.0,Rt 8. Tanglewood to Crossroads Shopping Center via McCoy Ln.,43.0,0.32,8,,peak,0.33
9,Mall,0.0,Mall Shuttle,14.98,0.11,28,all_day,,1.17


##### `gtfs_schedule_wrangling.aggregate_time_of_day_to_peak_offpeak` is missing a lot of routes -> break it out.

In [42]:
service_freq_df = gtfs_schedule_wrangling.aggregate_time_of_day_to_peak_offpeak(
    trip_metrics2, route_group_merge_cols, long_or_wide="long"
)

In [43]:
service_freq_df.route_id.unique()

array(['5', 'CC', 'SF', 'Shuttle'], dtype=object)

In [44]:
service_freq_df.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,n_trips,time_period,peak_offpeak,frequency
0,73105f2d1cabc8170ab066d96863c5d5,5,0.0,18,all_day,,0.75
1,f5a749dd65924e025b1293c58f95f8d6,CC,0.0,12,all_day,,0.5


##### Changed `count_trips_by_group` to have argument `dropna=False` in `groupby`

In [45]:
def count_trips_by_group(df: pd.DataFrame, group_cols: list):
    """
    Given a df with trip_instance_key and an arbitrary list of
    group_cols, return trip counts by group.
    """
    assert "trip_instance_key" in df.columns
    df = (
        df.groupby(group_cols, dropna=False)
        .agg({"trip_instance_key": "count"})
        .reset_index()
    )
    df = df.rename(columns={"trip_instance_key": "n_trips"})
    return df

In [46]:
def add_peak_offpeak_column(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add a single peak_offpeak column based on the time-of-day column.
    """
    df = df.assign(peak_offpeak=df.time_of_day.map(time_helpers.TIME_OF_DAY_DICT))

    return df

In [47]:
def aggregate_time_of_day_to_peak_offpeak(
    df: pd.DataFrame,
    group_cols: list,
    long_or_wide: str,
) -> pd.DataFrame:
    """
    Aggregate time-of-day bins into peak/offpeak periods.
    Return n_trips and frequency for grouping of columns (route-direction, etc).
    Allow wide or long df to be returned.
    """
    peak_hours = sum(
        v
        for k, v in time_helpers.HOURS_BY_TIME_OF_DAY.items()
        if k in time_helpers.PEAK_PERIODS
    )

    offpeak_hours = sum(
        v
        for k, v in time_helpers.HOURS_BY_TIME_OF_DAY.items()
        if k not in time_helpers.PEAK_PERIODS
    )

    df = add_peak_offpeak_column(df)

    all_day = count_trips_by_group(df, group_cols).assign(time_period="all_day")
    peak_offpeak = count_trips_by_group(df, group_cols + ["peak_offpeak"]).rename(
        {"peak_offpeak": "time_period"}
    )

    df2 = pd.concat([all_day, peak_offpeak], axis=0, ignore_index=True)

    # Add service frequency (trips per hour)
    # there are different number of hours in peak and offpeak periods
    df2 = df2.assign(
        frequency=df2.apply(
            lambda x: round(x.n_trips / peak_hours, 2)
            if x.time_period == "peak"
            else round(x.n_trips / offpeak_hours, 2)
            if x.time_period == "offpeak"
            else round(x.n_trips / (peak_hours + offpeak_hours), 2),
            axis=1,
        )
    )

    if long_or_wide == "long":
        return df2

    elif long_or_wide == "wide":
        # Reshape from wide to long
        # get rid of multiindex column names
        df3 = df2.pivot(
            index=group_cols, columns="time_period", values=["n_trips", "frequency"]
        )

        df3.columns = [f"{b}_{a}" for a, b in df3.columns]
        df3 = df3.reset_index()

        return df3

In [48]:
service_freq_df_test1 = aggregate_time_of_day_to_peak_offpeak(
    trip_metrics2, route_group_merge_cols, long_or_wide="long"
)

In [49]:
service_freq_df_test1.route_id.unique()

array(['1', '11', '12X', '13X', '1B', '2', '20', '3', '30', '4', '5', '6',
       '7', '8', '8a7c42f9-51e4-4848-bf88-30c210f149ad', '9', 'Mall',
       'CC', 'SF', 'Shuttle'], dtype=object)

##### `metrics_df` portion of `gtfs_funnel.schedule_stas_by_route_direction.schedule_metrics_by_route_direction`
* Did `dropna=False` to get all the routes. 
* Without `dropna=False`, all the routes disappear.

In [50]:
metrics_df = (
    trip_metrics2.groupby(
        route_group_merge_cols, observed=True, group_keys=False, dropna=False
    )
    .agg(
        {
            "median_stop_meters": "mean",
            # take mean of the median stop spacing for trip
            # does this make sense?
            # median is the single boiled down metric at the trip-level
            "scheduled_service_minutes": "mean",
        }
    )
    .reset_index()
    .rename(
        columns={
            "median_stop_meters": "avg_stop_meters",
            "scheduled_service_minutes": "avg_scheduled_service_minutes",
        }
    )
)

In [51]:
metrics_df = metrics_df.assign(
    avg_stop_miles=metrics_df.avg_stop_meters.divide(METERS_PER_MILE).round(2)
).drop(columns=["avg_stop_meters"])

In [52]:
round_me = ["avg_stop_miles", "avg_scheduled_service_minutes"]
metrics_df[round_me] = metrics_df[round_me].round(2)

##### End of the function `gtfs_funnel/schedule_stats_by_route_direction/schedule_metrics_by_route_direction` -> called `route_dir_metrics` in `if __name__ == "__main__"` part.

In [53]:
route_dir_metrics = pd.merge(
    common_shape_test2, metrics_df, on=route_group_merge_cols, how="inner"
).merge(service_freq_df_test1, on=route_group_merge_cols, how="inner")

##### In `if __name__ == "__main__"` gtfs_funnel/schedule_stats_by_route`

In [54]:
ROUTE_TYPOLOGIES = GTFS_DATA_DICT.schedule_tables.route_typologies
route_typologies = pd.read_parquet(
    f"{SCHED_GCS}{ROUTE_TYPOLOGIES}_{one_analysis_date}.parquet",
    columns=route_group_merge_cols
    + [
        "is_coverage",
        "is_downtown_local",
        "is_local",
        "is_rapid",
        "is_express",
        "is_rail",
    ],
)

In [58]:
route_typologies.loc[route_typologies.schedule_gtfs_dataset_key.isin(schd_keys)]

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,is_coverage,is_downtown_local,is_local,is_rapid,is_express,is_rail
1416,f5a749dd65924e025b1293c58f95f8d6,Shuttle,1.0,1,0,0,0,0,0
3572,73105f2d1cabc8170ab066d96863c5d5,13X,0.0,1,0,0,1,0,0
3573,73105f2d1cabc8170ab066d96863c5d5,20,0.0,1,0,0,0,0,0
3574,73105f2d1cabc8170ab066d96863c5d5,12X,0.0,1,0,0,1,1,0
3575,73105f2d1cabc8170ab066d96863c5d5,30,0.0,1,0,0,1,0,0
3576,73105f2d1cabc8170ab066d96863c5d5,2,0.0,0,1,0,1,0,0
3577,73105f2d1cabc8170ab066d96863c5d5,1,0.0,1,0,0,1,0,0
3578,73105f2d1cabc8170ab066d96863c5d5,1B,0.0,1,0,0,1,0,0
3579,73105f2d1cabc8170ab066d96863c5d5,4,0.0,1,0,0,1,0,0
3580,73105f2d1cabc8170ab066d96863c5d5,7,0.0,1,0,0,1,0,0


##### `cardinal_dir_df` also gets rid of a lot of stuff -> check it out. 

In [59]:
cardinal_dir_df = (
    schedule_stats_by_route_direction.cardinal_direction_for_route_direction(
        one_analysis_date, GTFS_DATA_DICT
    )
)

In [60]:
cardinal_dir_df2 = cardinal_dir_df.loc[
    cardinal_dir_df.schedule_gtfs_dataset_key.isin(schd_keys)
]

In [61]:
preview(cardinal_dir_df2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id
2581,73105f2d1cabc8170ab066d96863c5d5,5,0.0
3947,f5a749dd65924e025b1293c58f95f8d6,CC,0.0
3948,f5a749dd65924e025b1293c58f95f8d6,CC,1.0
4225,f5a749dd65924e025b1293c58f95f8d6,SF,0.0
4226,f5a749dd65924e025b1293c58f95f8d6,SF,1.0
4254,f5a749dd65924e025b1293c58f95f8d6,Shuttle,0.0
4255,f5a749dd65924e025b1293c58f95f8d6,Shuttle,1.0


In [62]:
STOP_TIMES_FILE = GTFS_DATA_DICT.rt_vs_schedule_tables.stop_times_direction

In [63]:
stop_times_df = pd.read_parquet(
    f"{RT_SCHED_GCS}{STOP_TIMES_FILE}_{one_analysis_date}.parquet",
    filters=[[("stop_primary_direction", "!=", "Unknown")]],
)

In [64]:
stop_times_df2 = stop_times_df.loc[
    stop_times_df.schedule_gtfs_dataset_key.isin(schd_keys)
]

In [65]:
stop_times_df2.columns

Index(['feed_key', 'stop_id', 'stop_sequence', 'schedule_gtfs_dataset_key',
       'trip_instance_key', 'shape_array_key', 'stop_name', 'geometry',
       'prior_stop_sequence', 'subseq_stop_sequence', 'stop_pair',
       'stop_pair_name', 'stop_primary_direction', 'stop_meters'],
      dtype='object')

In [66]:
trip_scheduled_col = [
    "route_id",
    "trip_instance_key",
    "gtfs_dataset_key",
    "shape_array_key",
    "direction_id",
    "route_long_name",
    "route_short_name",
    "route_desc",
    "name",
]

trips_df = helpers.import_scheduled_trips(
    one_analysis_date, columns=trip_scheduled_col, get_pandas=True
)

In [67]:
merge_cols = ["trip_instance_key", "schedule_gtfs_dataset_key", "shape_array_key"]

In [68]:
stop_times_with_trip = pd.merge(stop_times_df2, trips_df, on=merge_cols)

In [69]:
preview(stop_times_with_trip)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id
0,73105f2d1cabc8170ab066d96863c5d5,30,
26,73105f2d1cabc8170ab066d96863c5d5,3,
66,73105f2d1cabc8170ab066d96863c5d5,20,
75,f5a749dd65924e025b1293c58f95f8d6,Shuttle,0.0
79,73105f2d1cabc8170ab066d96863c5d5,4,
104,f5a749dd65924e025b1293c58f95f8d6,SF,1.0
105,73105f2d1cabc8170ab066d96863c5d5,Mall,
111,73105f2d1cabc8170ab066d96863c5d5,5,0.0
147,73105f2d1cabc8170ab066d96863c5d5,11,
168,73105f2d1cabc8170ab066d96863c5d5,7,


In [70]:
main_cols = ["route_id", "schedule_gtfs_dataset_key", "direction_id"]

##### Changing dropna=False here too

In [71]:
agg1 = (
    stop_times_with_trip.groupby(main_cols + ["stop_primary_direction"], dropna=False)
    .agg({"stop_sequence": "count"})
    .reset_index()
    .rename(columns={"stop_sequence": "total_stops"})
)

In [72]:
agg2 = agg1.sort_values(
    by=main_cols + ["total_stops"],
    ascending=[True, True, True, False],
)

##### There are values for `route_primary_direction` but because `direction_id` is missing, it goes away? 
* AH: testing to see if filling `direction_id` with something will change things.

In [73]:
cardinal_dir_df = (
    agg2.drop_duplicates(subset=main_cols)
    .reset_index(drop=True)
    .drop(columns=["total_stops"])
    .rename(columns={"stop_primary_direction": "route_primary_direction"})
)

cardinal_dir_df.direction_id = cardinal_dir_df.direction_id.fillna(0)

In [74]:
cardinal_dir_df

Unnamed: 0,route_id,schedule_gtfs_dataset_key,direction_id,route_primary_direction
0,1,73105f2d1cabc8170ab066d96863c5d5,0.0,Northbound
1,11,73105f2d1cabc8170ab066d96863c5d5,0.0,Northbound
2,12X,73105f2d1cabc8170ab066d96863c5d5,0.0,Northbound
3,13X,73105f2d1cabc8170ab066d96863c5d5,0.0,Westbound
4,1B,73105f2d1cabc8170ab066d96863c5d5,0.0,Northbound
5,2,73105f2d1cabc8170ab066d96863c5d5,0.0,Westbound
6,20,73105f2d1cabc8170ab066d96863c5d5,0.0,Eastbound
7,3,73105f2d1cabc8170ab066d96863c5d5,0.0,Eastbound
8,30,73105f2d1cabc8170ab066d96863c5d5,0.0,Southbound
9,4,73105f2d1cabc8170ab066d96863c5d5,0.0,Southbound


In [75]:
route_group_merge_cols = ["schedule_gtfs_dataset_key", "route_id", "direction_id"]

In [76]:
route_group_merge_cols_wo_dir_id = [
    "schedule_gtfs_dataset_key",
    "route_id",
]

##### TO-DO: `route_typologies` is missing info for the missing routes. 

In [77]:
# route_typologies.head(2)

In [78]:
# route_typologies.loc[route_typologies.schedule_gtfs_dataset_key.isin(schd_keys)]

In [79]:
# route_dir_metrics.head(2).drop(columns = ["geometry"])

In [83]:
# route_dir_metrics.drop(columns = ["geometry"])

In [81]:
route_typologies.head(1)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,is_coverage,is_downtown_local,is_local,is_rapid,is_express,is_rail
0,1770249a5a2e770ca90628434d4934b1,3407,0.0,1,0,0,1,0,0


In [174]:
route_typologies.loc[route_typologies.schedule_gtfs_dataset_key.isin(schd_keys)]

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,is_coverage,is_downtown_local,is_local,is_rapid,is_express,is_rail
1416,f5a749dd65924e025b1293c58f95f8d6,Shuttle,1.0,1,0,0,0,0,0
3572,73105f2d1cabc8170ab066d96863c5d5,13X,0.0,1,0,0,1,0,0
3573,73105f2d1cabc8170ab066d96863c5d5,20,0.0,1,0,0,0,0,0
3574,73105f2d1cabc8170ab066d96863c5d5,12X,0.0,1,0,0,1,1,0
3575,73105f2d1cabc8170ab066d96863c5d5,30,0.0,1,0,0,1,0,0
3576,73105f2d1cabc8170ab066d96863c5d5,2,0.0,0,1,0,1,0,0
3577,73105f2d1cabc8170ab066d96863c5d5,1,0.0,1,0,0,1,0,0
3578,73105f2d1cabc8170ab066d96863c5d5,1B,0.0,1,0,0,1,0,0
3579,73105f2d1cabc8170ab066d96863c5d5,4,0.0,1,0,0,1,0,0
3580,73105f2d1cabc8170ab066d96863c5d5,7,0.0,1,0,0,1,0,0


In [86]:
route_dir_metrics.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 69 entries, 0 to 68
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype   
---  ------                         --------------  -----   
 0   geometry                       69 non-null     geometry
 1   schedule_gtfs_dataset_key      69 non-null     object  
 2   route_id                       69 non-null     object  
 3   direction_id                   21 non-null     float64 
 4   common_shape_id                69 non-null     object  
 5   route_name                     69 non-null     object  
 6   avg_scheduled_service_minutes  69 non-null     float64 
 7   avg_stop_miles                 69 non-null     float64 
 8   n_trips                        69 non-null     int64   
 9   time_period                    23 non-null     object  
 10  peak_offpeak                   46 non-null     object  
 11  frequency                      69 non-null     float64 
dtypes: float64(4), geometry(1), in

##### Have to fill in `direction_id` with 0? 

In [87]:
route_dir_metrics.direction_id  = route_dir_metrics.direction_id.fillna(0)

In [88]:
route_dir_metrics2 = pd.merge(
    route_dir_metrics, route_typologies, on=route_group_merge_cols, how="left"
).merge(cardinal_dir_df, on=route_group_merge_cols, how="left")

In [89]:
route_dir_metrics2.route_id.unique()

array(['7', '6', 'CC', '8', 'Mall', '12X', '13X', '11', '30', 'Shuttle',
       '8a7c42f9-51e4-4848-bf88-30c210f149ad', '2', '3', '1B', '20', 'SF',
       '5', '4', '9', '1'], dtype=object)

In [90]:
route_dir_metrics2.drop(
    columns=[
        "geometry",
        "common_shape_id",
        "geometry",
        "route_name",
        "is_coverage",
        "is_downtown_local",
        "is_local",
        "is_rapid",
        "is_express",
        "is_rail",
    ]
).sort_values(by=["schedule_gtfs_dataset_key"])

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,avg_scheduled_service_minutes,avg_stop_miles,n_trips,time_period,peak_offpeak,frequency,route_primary_direction
0,73105f2d1cabc8170ab066d96863c5d5,7,0.0,36.0,0.37,19,all_day,,0.79,Southbound
36,73105f2d1cabc8170ab066d96863c5d5,8a7c42f9-51e4-4848-bf88-30c210f149ad,0.0,41.65,0.25,18,all_day,,0.75,Northbound
37,73105f2d1cabc8170ab066d96863c5d5,8a7c42f9-51e4-4848-bf88-30c210f149ad,0.0,41.65,0.25,8,,offpeak,0.33,Northbound
38,73105f2d1cabc8170ab066d96863c5d5,8a7c42f9-51e4-4848-bf88-30c210f149ad,0.0,41.65,0.25,10,,peak,0.42,Northbound
39,73105f2d1cabc8170ab066d96863c5d5,2,0.0,53.24,0.29,17,all_day,,0.71,Westbound
40,73105f2d1cabc8170ab066d96863c5d5,2,0.0,53.24,0.29,6,,offpeak,0.25,Westbound
41,73105f2d1cabc8170ab066d96863c5d5,2,0.0,53.24,0.29,11,,peak,0.46,Westbound
42,73105f2d1cabc8170ab066d96863c5d5,3,0.0,35.11,0.26,18,all_day,,0.75,Eastbound
43,73105f2d1cabc8170ab066d96863c5d5,3,0.0,35.11,0.26,8,,offpeak,0.33,Eastbound
44,73105f2d1cabc8170ab066d96863c5d5,3,0.0,35.11,0.26,10,,peak,0.42,Eastbound


##### Double check that the columns are the same.

In [91]:
RT_SCHED_GCS

'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/'

In [92]:
GTFS_DATA_DICT.rt_vs_schedule_tables.sched_route_direction_metrics

'schedule_route_dir/schedule_route_direction_metrics'

In [93]:
og_nov_url = "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_route_dir/schedule_route_direction_metrics_2024-11-13.parquet"

In [94]:
df_sched_og = gpd.read_parquet(og_nov_url)

In [95]:
df_sched_og.columns

Index(['geometry', 'schedule_gtfs_dataset_key', 'route_id', 'direction_id',
       'common_shape_id', 'route_name', 'avg_scheduled_service_minutes',
       'avg_stop_miles', 'n_trips', 'time_period', 'peak_offpeak', 'frequency',
       'is_coverage', 'is_downtown_local', 'is_local', 'is_rapid',
       'is_express', 'is_rail', 'route_primary_direction'],
      dtype='object')

In [96]:
route_dir_metrics2.columns

Index(['geometry', 'schedule_gtfs_dataset_key', 'route_id', 'direction_id',
       'common_shape_id', 'route_name', 'avg_scheduled_service_minutes',
       'avg_stop_miles', 'n_trips', 'time_period', 'peak_offpeak', 'frequency',
       'is_coverage', 'is_downtown_local', 'is_local', 'is_rapid',
       'is_express', 'is_rail', 'route_primary_direction'],
      dtype='object')

In [97]:
df_sched = route_dir_metrics2.copy()

#### HELP: `gtfs_digest/merge_data` line 300 `df_avg_speeds` is also missing a lot of routes.
* Not sure which file creates this ? 

In [98]:
SEGMENT_GCS

'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/'

In [99]:
GTFS_DATA_DICT.rt_stop_times.route_dir_single_summary

'rollup_singleday/speeds_route_dir'

In [100]:
df_avg_speeds = merge_data.concatenate_speeds_by_route_direction(analysis_date_list)

In [101]:
analysis_date_list

['2024-11-13']

In [102]:
df_avg_speeds2 = df_avg_speeds.loc[
    df_avg_speeds.schedule_gtfs_dataset_key.isin(schd_keys)
]

In [103]:
df_avg_speeds2.route_id.value_counts()

5    3
Name: route_id, dtype: int64

In [104]:
df_avg_speeds2.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,speed_mph,service_date
5006,73105f2d1cabc8170ab066d96863c5d5,5,0.0,all_day,15.73,2024-11-13
5007,73105f2d1cabc8170ab066d96863c5d5,5,0.0,offpeak,17.62,2024-11-13


##### Side note, there are actually all of the Dec values for all time_periods here for speed.

In [105]:
df_avg_speeds2.loc[df_avg_speeds2.route_id == "5"]

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,speed_mph,service_date
5006,73105f2d1cabc8170ab066d96863c5d5,5,0.0,all_day,15.73,2024-11-13
5007,73105f2d1cabc8170ab066d96863c5d5,5,0.0,offpeak,17.62,2024-11-13
5008,73105f2d1cabc8170ab066d96863c5d5,5,0.0,peak,14.35,2024-11-13


##### Can't find which file powers `df_avg_speeds2`,`df_avg_speeds`  = `df_avg_speeds2`

#### Dataframe in line 307 `df_rt_sched` in `gtfs_digest/merge_data`

In [106]:
df_rt_sched = merge_data.concatenate_rt_vs_schedule_by_route_direction(
    analysis_date_list
).astype({"direction_id": "float"})

In [107]:
df_rt_sched2 = df_rt_sched.loc[df_rt_sched.schedule_gtfs_dataset_key.isin(schd_keys)]

In [108]:
preview(df_rt_sched2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id
5369,73105f2d1cabc8170ab066d96863c5d5,5,0.0
11696,f5a749dd65924e025b1293c58f95f8d6,CC,0.0
11699,f5a749dd65924e025b1293c58f95f8d6,CC,1.0


In [109]:
df_rt_sched2.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,minutes_atleast1_vp,minutes_atleast2_vp,total_rt_service_minutes,total_scheduled_service_minutes,total_vp,vp_in_shape,is_early,is_ontime,is_late,n_vp_trips,vp_per_minute,pct_in_shape,pct_rt_journey_atleast1_vp,pct_rt_journey_atleast2_vp,pct_sched_journey_atleast1_vp,pct_sched_journey_atleast2_vp,rt_sched_journey_ratio,avg_rt_service_minutes,name,schedule_source_record_id,service_date
5369,73105f2d1cabc8170ab066d96863c5d5,5,0.0,all_day,1207,1195,1662.72,714.0,3584,2223,0,12,5,17,2.16,0.62,0.73,0.72,1.0,1.0,2.33,97.81,Santa Maria Schedule,recxPy2JOcDFGDo31,2024-11-13
5370,73105f2d1cabc8170ab066d96863c5d5,5,0.0,offpeak,757,750,1221.38,336.0,2251,1000,0,5,3,8,1.84,0.44,0.62,0.61,1.0,1.0,3.64,152.67,Santa Maria Schedule,recxPy2JOcDFGDo31,2024-11-13


##### `dt_rt_sched` is created using  [`rt_scheduled_v_ran/scripts/rt_v_scheduled_routes`](https://github.com/cal-itp/data-analyses/blob/main/rt_scheduled_v_ran/scripts/rt_v_scheduled_routes.py) 

In [110]:
[*GTFS_DATA_DICT["stop_segments"]["route_dir_cols"]]

['route_id', 'direction_id']

In [111]:
dict_inputs = GTFS_DATA_DICT.rt_vs_schedule_tables

##### `route_metrics` in `rt_scheduled_v_ran/scripts/rt_v_scheduled_routes`

In [112]:
TRIP_EXPORT = dict_inputs.vp_trip_metrics

In [113]:
ROUTE_EXPORT = dict_inputs.vp_route_direction_metrics

In [114]:
ROUTE_EXPORT

'vp_route_dir/route_direction_metrics'

In [115]:
trip_df = pd.read_parquet(f"{RT_SCHED_GCS}{TRIP_EXPORT}_{one_analysis_date}.parquet")

In [116]:
trip_df2 = trip_df.loc[trip_df.schedule_gtfs_dataset_key.isin(schd_keys)]

In [117]:
trip_df2.shape

(233, 24)

##### Everything is available in `trip_df`

In [118]:
trip_df2.loc[trip_df2.time_of_day == "AM Peak"].drop(
    columns=["schedule_gtfs_dataset_key", "trip_instance_key"]
).sort_values(by=["route_id"]).drop_duplicates(
    subset=[
        "route_id",
        "direction_id",
    ]
).T

Unnamed: 0,49886,8692,85281,66369,86745,13643,18301,35587,59184,42254,86866,56523,61248,46157,43747,44473,42566
route_id,1,11,12X,1B,2,20,3,30,4,5,6,7,8,8a7c42f9-51e4-4848-bf88-30c210f149ad,9,CC,CC
direction_id,,,,,,,,,,0.00,,,,,,1.00,0.00
scheduled_service_minutes,30.37,41.00,56.00,30.00,57.00,120.00,36.00,168.00,41.00,42.00,38.00,36.00,43.00,41.65,40.00,109.00,189.00
total_vp,148,134,174,4319,159,459,120,645,107,151,160,134,140,141,127,210,255
rt_service_minutes,49.00,44.35,57.65,1439.63,52.63,152.65,39.63,214.65,35.32,49.92,53.00,44.25,46.35,46.67,42.00,69.77,84.53
minutes_atleast1_vp,50,45,58,1440,54,154,41,215,36,50,54,45,47,47,43,71,85
minutes_atleast2_vp,50,45,58,1440,53,153,40,215,35,50,53,44,46,47,43,70,85
vp_in_shape,146,133,128,0,159,445,92,482,103,130,158,85,140,141,123,0,0
route_short_name,1,11B,12X,1B,2,20,3,30,4,5,6,7,8,11,9,CC,CC
sched_rt_category,schedule_and_vp,schedule_and_vp,schedule_and_vp,schedule_and_vp,schedule_and_vp,schedule_and_vp,schedule_and_vp,schedule_and_vp,schedule_and_vp,schedule_and_vp,schedule_and_vp,schedule_and_vp,schedule_and_vp,schedule_and_vp,schedule_and_vp,schedule_and_vp,schedule_and_vp


##### Somewhere in `rt_scheduled_v_ran/rt_v_scheduled_routes`, routes go missing.

In [119]:
import sys

sys.path.append("../rt_scheduled_v_ran/scripts")
import rt_v_scheduled_routes

In [120]:
ROUTE_DIR_COLS = [*GTFS_DATA_DICT["stop_segments"]["route_dir_cols"]]

In [121]:
crosswalk_cols = [
    "schedule_gtfs_dataset_key",
    "name",
    "schedule_source_record_id",
    "base64_url",
    "organization_source_record_id",
    "organization_name",
    "caltrans_district",
]

##### Have to break out `metrics.concatenate_peak_offpeak_allday_averages` which is in  `rt_segment_speeds/segment_speed_utils/` because all of the routes are missing.

In [122]:
route_df = (
    metrics.concatenate_peak_offpeak_allday_averages(
        trip_df2,
        group_cols=["schedule_gtfs_dataset_key"] + ROUTE_DIR_COLS,
        metric_type="rt_vs_schedule",
    )
    .pipe(metrics.derive_rt_vs_schedule_metrics)
    .pipe(rt_v_scheduled_routes.average_rt_trip_times)
)

In [123]:
preview(route_df)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id
0,73105f2d1cabc8170ab066d96863c5d5,5,0.0
2,f5a749dd65924e025b1293c58f95f8d6,CC,0.0
4,f5a749dd65924e025b1293c58f95f8d6,CC,1.0


In [124]:
["schedule_gtfs_dataset_key"] + ROUTE_DIR_COLS

['schedule_gtfs_dataset_key', 'route_id', 'direction_id']

`calculate_avg_speeds` is from `rt_segement_speeds/segement_speed_utils/segment_calc.py` -> added `dropna=False`

In [125]:
def calculate_avg_speeds(df: pd.DataFrame, group_cols: list) -> pd.DataFrame:
    """
    Calculate the median, 20th, and 80th percentile speeds
    by groups.
    """
    # pd.groupby and pd.quantile is so slow
    # create our own list of speeds and use np
    df2 = (
        df.groupby(group_cols, observed=True, group_keys=False, dropna=False)
        .agg({"speed_mph": lambda x: sorted(list(x))})
        .reset_index()
        .rename(columns={"speed_mph": "speed_mph_list"})
    )

    df2 = df2.assign(
        p50_mph=df2.apply(lambda x: np.percentile(x.speed_mph_list, q=50), axis=1),
        n_trips=df2.apply(lambda x: len(x.speed_mph_list), axis=1).astype("int16"),
        p20_mph=df2.apply(lambda x: np.percentile(x.speed_mph_list, q=20), axis=1),
        p80_mph=df2.apply(lambda x: np.percentile(x.speed_mph_list, q=80), axis=1),
    )

    stats = df2.drop(columns="speed_mph_list")

    # Clean up for map
    speed_cols = [c for c in stats.columns if "_mph" in c]
    stats[speed_cols] = stats[speed_cols].round(2)

    return stats

`calculate_weighted_average_vp_schedule_metrics` is from `rt_segment_speeds/segment_speed_utils/metrics` -> added `dropna=False`

In [126]:
def calculate_weighted_average_vp_schedule_metrics(
    df: pd.DataFrame,
    group_cols: list,
) -> pd.DataFrame:

    sum_cols = [
        "minutes_atleast1_vp",
        "minutes_atleast2_vp",
        "rt_service_minutes",
        "scheduled_service_minutes",
        "total_vp",
        "vp_in_shape",
        "is_early",
        "is_ontime",
        "is_late",
    ]

    count_cols = ["trip_instance_key"]

    df2 = (
        df.groupby(group_cols, observed=True, group_keys=False, dropna=False)
        .agg({**{e: "sum" for e in sum_cols}, **{e: "count" for e in count_cols}})
        .reset_index()
        .rename(columns={"trip_instance_key": "n_vp_trips"})
    )

    return df2

`weighted_average_speeds_across_segments` is from `rt_segment_speeds/segment_speed_utils/metrics` -> added `dropna=False`

In [127]:
def weighted_average_speeds_across_segments(
    df: pd.DataFrame, group_cols: list
) -> pd.DataFrame:
    """
    We can use our segments and the deltas within a trip
    to calculate the trip-level average speed, or
    the route-direction-level average speed.
    But, we want a weighted average, using the raw deltas
    instead of mean(speed_mph), since segments can be varying lengths.
    """
    avg_speeds = (
        df.groupby(group_cols, observed=True, group_keys=False, dropna=False)
        .agg(
            {
                "meters_elapsed": "sum",
                "sec_elapsed": "sum",
            }
        )
        .reset_index()
    ).pipe(segment_calcs.speed_from_meters_elapsed_sec_elapsed)

    return avg_speeds

`concatenate_peak_offpeak_allday_averages` is from `rt_segment_speeds/segment_speed_utils/metrics`

In [128]:
def concatenate_peak_offpeak_allday_averages(
    df: pd.DataFrame, group_cols: list, metric_type: str
) -> pd.DataFrame:
    """
    Calculate average speeds for all day and
    peak_offpeak.
    Concatenate these, so that speeds are always calculated
    for the same 3 time periods.
    """
    if metric_type == "segment_speeds":
        avg_peak = calculate_avg_speeds(df, group_cols + ["peak_offpeak"])

        avg_allday = calculate_avg_speeds(df, group_cols).assign(peak_offpeak="all_day")

    elif metric_type == "summary_speeds":
        avg_peak = weighted_average_speeds_across_segments(
            df, group_cols + ["peak_offpeak"]
        )

        avg_allday = weighted_average_speeds_across_segments(df, group_cols).assign(
            peak_offpeak="all_day"
        )

    elif metric_type == "rt_vs_schedule":
        avg_peak = calculate_weighted_average_vp_schedule_metrics(
            df, group_cols + ["peak_offpeak"]
        )

        avg_allday = calculate_weighted_average_vp_schedule_metrics(
            df, group_cols
        ).assign(peak_offpeak="all_day")

    else:
        print(
            f"Valid metric types: ['segment_speeds', 'summary_speeds', 'rt_vs_schedule']"
        )

    # Concatenate so that every segment has 3 time periods: peak, offpeak, and all_day
    avg_metrics = pd.concat([avg_peak, avg_allday], axis=0, ignore_index=True).rename(
        columns={"peak_offpeak": "time_period"}
    )

    return avg_metrics

##### Going back to `rt_v_scheduled/scripts/rt_v_scheduled_routes.py`

In [129]:
route_metrics_df = concatenate_peak_offpeak_allday_averages(
    trip_df2,
    group_cols=["schedule_gtfs_dataset_key"] + ROUTE_DIR_COLS,
    metric_type="rt_vs_schedule",
)

In [130]:
preview(route_metrics_df)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id
0,73105f2d1cabc8170ab066d96863c5d5,1,
2,73105f2d1cabc8170ab066d96863c5d5,11,
4,73105f2d1cabc8170ab066d96863c5d5,12X,
6,73105f2d1cabc8170ab066d96863c5d5,1B,
7,73105f2d1cabc8170ab066d96863c5d5,2,
9,73105f2d1cabc8170ab066d96863c5d5,20,
11,73105f2d1cabc8170ab066d96863c5d5,3,
13,73105f2d1cabc8170ab066d96863c5d5,30,
15,73105f2d1cabc8170ab066d96863c5d5,4,
17,73105f2d1cabc8170ab066d96863c5d5,5,0.0


In [131]:
route_metrics_df.shape

(50, 14)

##### A lot of stuff is here except `direction_id`

In [132]:
route_metrics_df = route_metrics_df.pipe(metrics.derive_rt_vs_schedule_metrics)

In [133]:
route_metrics_df = route_metrics_df.pipe(rt_v_scheduled_routes.average_rt_trip_times)

In [134]:
route_metrics_df = gtfs_schedule_wrangling.merge_operator_identifiers(
    route_metrics_df, [one_analysis_date], columns=crosswalk_cols
)

In [136]:
route_metrics_df.loc[route_metrics_df.time_period == "peak"].drop(
    columns=[
        "schedule_gtfs_dataset_key",
        "schedule_source_record_id",
        "base64_url",
        "organization_name",
        "organization_source_record_id",
        "caltrans_district",
    ]
)

Unnamed: 0,route_id,direction_id,time_period,minutes_atleast1_vp,minutes_atleast2_vp,total_rt_service_minutes,total_scheduled_service_minutes,total_vp,vp_in_shape,is_early,is_ontime,is_late,n_vp_trips,vp_per_minute,pct_in_shape,pct_rt_journey_atleast1_vp,pct_rt_journey_atleast2_vp,pct_sched_journey_atleast1_vp,pct_sched_journey_atleast2_vp,rt_sched_journey_ratio,avg_rt_service_minutes,name
1,1,,peak,449,443,439.04,303.67,1327,1309,0,0,10,10,3.02,0.99,1.0,1.0,1.0,1.0,1.45,43.9,Santa Maria Schedule
3,11,,peak,807,805,1837.0,246.0,2413,1258,0,2,4,6,1.31,0.52,0.44,0.44,1.0,1.0,7.47,306.17,Santa Maria Schedule
5,12X,,peak,299,294,459.04,280.0,884,713,0,3,2,5,1.93,0.81,0.65,0.64,1.0,1.0,1.64,91.81,Santa Maria Schedule
6,1B,,peak,1440,1440,1439.63,30.0,4319,0,0,0,1,1,3.0,0.0,1.0,1.0,1.0,1.0,47.99,1439.63,Santa Maria Schedule
8,2,,peak,1878,1872,1867.87,587.0,5614,1299,0,8,3,11,3.01,0.23,1.0,1.0,1.0,1.0,3.18,169.81,Santa Maria Schedule
10,20,,peak,449,447,446.24,360.0,1342,1196,0,0,3,3,3.01,0.89,1.0,1.0,1.0,1.0,1.24,148.75,Santa Maria Schedule
12,3,,peak,438,431,428.22,349.0,1295,992,0,2,8,10,3.02,0.77,1.0,1.0,1.0,1.0,1.23,42.82,Santa Maria Schedule
14,30,,peak,1809,1806,2924.88,677.0,5418,2034,0,0,4,4,1.85,0.38,0.62,0.62,1.0,1.0,4.32,731.22,Santa Maria Schedule
16,4,,peak,446,438,435.79,411.0,1318,1182,1,5,4,10,3.02,0.9,1.0,1.0,1.0,1.0,1.06,43.58,Santa Maria Schedule
18,5,0.0,peak,450,445,441.34,378.0,1333,1223,0,7,2,9,3.02,0.92,1.0,1.0,1.0,1.0,1.17,49.04,Santa Maria Schedule


##### `df_rt_sched` equals `df_rt_sched`

In [137]:
df_rt_sched_og = merge_data.concatenate_rt_vs_schedule_by_route_direction(
    analysis_date_list
)

In [138]:
df_rt_sched = route_metrics_df.copy()

In [139]:
df_rt_sched_og.columns

Index(['schedule_gtfs_dataset_key', 'route_id', 'direction_id', 'time_period',
       'minutes_atleast1_vp', 'minutes_atleast2_vp',
       'total_rt_service_minutes', 'total_scheduled_service_minutes',
       'total_vp', 'vp_in_shape', 'is_early', 'is_ontime', 'is_late',
       'n_vp_trips', 'vp_per_minute', 'pct_in_shape',
       'pct_rt_journey_atleast1_vp', 'pct_rt_journey_atleast2_vp',
       'pct_sched_journey_atleast1_vp', 'pct_sched_journey_atleast2_vp',
       'rt_sched_journey_ratio', 'avg_rt_service_minutes', 'name',
       'schedule_source_record_id', 'service_date'],
      dtype='object')

In [140]:
df_rt_sched_og.route_id.nunique()

1148

In [141]:
df_rt_sched.route_id.nunique()

16

#### `gtfs_digest/merge_data/` line 316: `df_crosswalk`

In [142]:
df_crosswalk = merge_data.concatenate_crosswalk_organization(analysis_date_list)

In [143]:
df_crosswalk.service_date.unique()

array(['2024-11-13T00:00:00.000000000'], dtype='datetime64[ns]')

#### `gtfs_digest/merge_data/merge_data_sources_by_route_direction`
* Have to make some tweaks since `df_avg_speeds2` is missing a lot of routes.

In [144]:
service_date_datetime = pd.to_datetime("2024-11-13T00:00:00.000000000")

In [145]:
type(service_date_datetime)

pandas._libs.tslibs.timestamps.Timestamp

##### Need to fix `df_sched` b/c `direction_id` and `route_primary_direction` are missing.

In [146]:
df_sched.columns

Index(['geometry', 'schedule_gtfs_dataset_key', 'route_id', 'direction_id',
       'common_shape_id', 'route_name', 'avg_scheduled_service_minutes',
       'avg_stop_miles', 'n_trips', 'time_period', 'peak_offpeak', 'frequency',
       'is_coverage', 'is_downtown_local', 'is_local', 'is_rapid',
       'is_express', 'is_rail', 'route_primary_direction'],
      dtype='object')

In [147]:
df_sched[["route_primary_direction", "direction_id", "route_id"]].drop_duplicates()

Unnamed: 0,route_primary_direction,direction_id,route_id
0,Southbound,0.0,7
3,Northbound,0.0,6
6,Southbound,1.0,CC
9,Northbound,0.0,CC
12,Eastbound,0.0,8
15,Eastbound,0.0,Mall
18,Northbound,0.0,12X
21,Westbound,0.0,13X
24,Northbound,0.0,11
27,Southbound,0.0,30


In [148]:
df_sched["service_date"] = service_date_datetime

In [149]:
df_rt_sched["service_date"] = service_date_datetime

In [150]:
df_avg_speeds2.columns

Index(['schedule_gtfs_dataset_key', 'route_id', 'direction_id', 'time_period',
       'speed_mph', 'service_date'],
      dtype='object')

In [151]:
df_avg_speeds2["service_date"] = service_date_datetime

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_avg_speeds2["service_date"] = service_date_datetime


In [152]:
# merge1 = merge_data.merge_data_sources_by_route_direction(
# route_dir_metrics2,
# df_rt_sched,
# df_avg_speeds2,
# df_crosswalk
# )

In [153]:
primary_typology = merge_data.set_primary_typology(route_dir_metrics2)

In [154]:
route_time_cols = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
    "time_period",
]

In [155]:
df_schedule2 = pd.merge(df_sched, primary_typology, on=route_time_cols, how="left")

In [156]:
df_schedule2.head().drop(
    columns=["geometry", "schedule_gtfs_dataset_key", "common_shape_id"]
)

Unnamed: 0,route_id,direction_id,route_name,avg_scheduled_service_minutes,avg_stop_miles,n_trips,time_period,peak_offpeak,frequency,is_coverage,is_downtown_local,is_local,is_rapid,is_express,is_rail,route_primary_direction,service_date,typology
0,7,0.0,"Rt 7. A. H. College, Crossroads Shopping Center via Boone St, Bradley Rd.",36.0,0.37,19,all_day,,0.79,1.0,0.0,0.0,1.0,0.0,0.0,Southbound,2024-11-13,rapid
1,7,0.0,"Rt 7. A. H. College, Crossroads Shopping Center via Boone St, Bradley Rd.",36.0,0.37,9,,offpeak,0.38,1.0,0.0,0.0,1.0,0.0,0.0,Southbound,2024-11-13,rapid
2,7,0.0,"Rt 7. A. H. College, Crossroads Shopping Center via Boone St, Bradley Rd.",36.0,0.37,10,,peak,0.42,1.0,0.0,0.0,1.0,0.0,0.0,Southbound,2024-11-13,rapid
3,6,0.0,Rt 6. Oak Knolls to Old Orcutt-East to West-Outbound,38.0,0.38,18,all_day,,0.75,1.0,0.0,0.0,1.0,0.0,0.0,Northbound,2024-11-13,rapid
4,6,0.0,Rt 6. Oak Knolls to Old Orcutt-East to West-Outbound,38.0,0.38,7,,offpeak,0.29,1.0,0.0,0.0,1.0,0.0,0.0,Northbound,2024-11-13,rapid


In [157]:
df_rt_sched.shape

(50, 29)

In [158]:
df_rt_sched.route_id.unique()

array(['1', '11', '12X', '1B', '2', '20', '3', '30', '4', '5', '6', '7',
       '8', '8a7c42f9-51e4-4848-bf88-30c210f149ad', '9', 'CC'],
      dtype=object)

In [159]:
df = pd.merge(
    df_schedule2,
    df_rt_sched,
    on=route_time_cols + ["service_date"],
    how="outer",
    indicator="sched_rt_category",
).merge(
    df_avg_speeds2,
    on=route_time_cols + ["service_date"],
    how="outer",
)

##### Check that all the routes are here.

In [160]:
df.route_id.unique()

array(['7', '6', 'CC', '8', 'Mall', '12X', '13X', '11', '30', 'Shuttle',
       '8a7c42f9-51e4-4848-bf88-30c210f149ad', '2', '3', '1B', '20', 'SF',
       '5', '4', '9', '1'], dtype=object)

In [161]:
common_shape_test2.route_id.unique()

array(['7', '6', 'CC', '8', 'Mall', '12X', '13X', '11', '30', 'Shuttle',
       '8a7c42f9-51e4-4848-bf88-30c210f149ad', '2', '3', '1B', '20', 'SF',
       '5', '4', '9', '1'], dtype=object)

In [162]:
df2 = df.assign(
    sched_rt_category=df.sched_rt_category.map(
        gtfs_schedule_wrangling.sched_rt_category_dict
    )
)

In [163]:
df3 = df2.pipe(
    merge_data.merge_in_standardized_route_names,
)

In [164]:
drop_cols = [
    "schedule_source_record_id",
    "base64_url",
    "organization_source_record_id",
    "organization_name",
    "caltrans_district",
]

In [165]:
df4 = pd.merge(
    df3.drop(columns=drop_cols),
    df_crosswalk,
    on=["schedule_gtfs_dataset_key", "name", "service_date"],
    how="left",
)

In [166]:
df4.columns

Index(['geometry', 'schedule_gtfs_dataset_key', 'direction_id',
       'common_shape_id', 'route_name', 'avg_scheduled_service_minutes',
       'avg_stop_miles', 'n_trips', 'time_period', 'peak_offpeak', 'frequency',
       'is_coverage', 'is_downtown_local', 'is_local', 'is_rapid',
       'is_express', 'is_rail', 'route_primary_direction', 'service_date',
       'typology', 'minutes_atleast1_vp', 'minutes_atleast2_vp',
       'total_rt_service_minutes', 'total_scheduled_service_minutes',
       'total_vp', 'vp_in_shape', 'is_early', 'is_ontime', 'is_late',
       'n_vp_trips', 'vp_per_minute', 'pct_in_shape',
       'pct_rt_journey_atleast1_vp', 'pct_rt_journey_atleast2_vp',
       'pct_sched_journey_atleast1_vp', 'pct_sched_journey_atleast2_vp',
       'rt_sched_journey_ratio', 'avg_rt_service_minutes', 'sched_rt_category',
       'speed_mph', 'name', 'route_long_name', 'route_short_name',
       'route_combined_name', 'route_id', 'schedule_source_record_id',
       'base64_url', 'or

In [167]:
df4.direction_id = df4.direction_id.fillna(0)

##### Amanda, testing to see if filling direction_id with 0 will do anything.

In [176]:
df4[["route_id", "direction_id", "time_period","route_primary_direction"]].drop_duplicates().sort_values(by = ["route_id"])

Unnamed: 0,route_id,direction_id,time_period,route_primary_direction
69,1,0.0,offpeak,
66,1,0.0,all_day,Northbound
67,1,0.0,,Northbound
98,1,0.0,all_day,
70,1,0.0,peak,
99,11,0.0,all_day,
72,11,0.0,peak,
71,11,0.0,offpeak,
25,11,0.0,,Northbound
24,11,0.0,all_day,Northbound


In [169]:
df5 = df4.pipe(
    # Find the most common cardinal direction
    gtfs_schedule_wrangling.top_cardinal_direction
)

In [170]:
preview_cols = [
    "organization_name",
    "route_id",
    "sched_rt_category",
    "route_name",
    "direction_id",
    "route_primary_direction",
    "avg_scheduled_service_minutes",
    "avg_stop_miles",
    "n_trips",
    "peak_offpeak",
    "frequency",
    "typology",
    "minutes_atleast1_vp",
    "minutes_atleast2_vp",
    "total_rt_service_minutes",
    "total_scheduled_service_minutes",
    "total_vp",
    "vp_in_shape",
    "is_early",
    "is_ontime",
    "is_late",
    "n_vp_trips",
    "vp_per_minute",
    "pct_in_shape",
    "pct_rt_journey_atleast1_vp",
    "pct_rt_journey_atleast2_vp",
    "pct_sched_journey_atleast1_vp",
    "pct_sched_journey_atleast2_vp",
    "rt_sched_journey_ratio",
    "avg_rt_service_minutes",
    "speed_mph",
]

#### Observations
* There are no typologies for these previously missing routes.
* `Route_primary_direction` and `direction_id` is empty for all of City of Santa Maria 
* `route_ids` are repeated...somehow messed up during merges.

In [177]:
df5.loc[df5.time_period == "all_day"][preview_cols].sort_values(
    by=["organization_name","route_id"]
)

Unnamed: 0,organization_name,route_id,sched_rt_category,route_name,direction_id,route_primary_direction,avg_scheduled_service_minutes,avg_stop_miles,n_trips,peak_offpeak,frequency,typology,minutes_atleast1_vp,minutes_atleast2_vp,total_rt_service_minutes,total_scheduled_service_minutes,total_vp,vp_in_shape,is_early,is_ontime,is_late,n_vp_trips,vp_per_minute,pct_in_shape,pct_rt_journey_atleast1_vp,pct_rt_journey_atleast2_vp,pct_sched_journey_atleast1_vp,pct_sched_journey_atleast2_vp,rt_sched_journey_ratio,avg_rt_service_minutes,speed_mph
6,Capitol Corridor Joint Powers Authority,CC,schedule_and_vp,"Daily train service between Auburn, Sacramento, Oakland and San Jose",1.0,Southbound,156.67,8.17,12.0,,0.5,unknown,1033.0,1031.0,1027.85,1437.0,3079.0,0.0,8.0,0.0,2.0,10.0,3.0,0.0,1.0,1.0,0.72,0.72,0.72,102.78,
9,Capitol Corridor Joint Powers Authority,CC,schedule_and_vp,"Daily train service between Auburn, Sacramento, Oakland and San Jose",0.0,Northbound,159.17,8.17,12.0,,0.5,unknown,1205.0,1203.0,1199.51,1672.0,3603.0,0.0,6.0,0.0,4.0,10.0,3.0,0.0,1.0,1.0,0.72,0.72,0.72,119.95,
51,Capitol Corridor Joint Powers Authority,SF,schedule_only,Shuttle to San Francisco Transbay Terminal,1.0,Westbound,32.08,6.69,12.0,,0.5,unknown,,,,,,,,,,,,,,,,,,,
54,Capitol Corridor Joint Powers Authority,SF,schedule_only,Shuttle to San Francisco Transbay Terminal,0.0,Eastbound,31.82,6.69,11.0,,0.46,unknown,,,,,,,,,,,,,,,,,,,
30,Capitol Corridor Joint Powers Authority,Shuttle,schedule_only,Shuttle to Auburn,1.0,Westbound,72.0,13.74,5.0,,0.21,coverage,,,,,,,,,,,,,,,,,,,
33,Capitol Corridor Joint Powers Authority,Shuttle,schedule_only,Shuttle to Auburn,0.0,Eastbound,70.0,11.78,5.0,,0.21,unknown,,,,,,,,,,,,,,,,,,,
66,City of Santa Maria,1,schedule_only,Rt 1. Transit Ctr to Preisker Park Via N. Broadway,0.0,Northbound,30.37,0.22,19.0,,0.79,rapid,,,,,,,,,,,,,,,,,,,
98,City of Santa Maria,1,vp_only,,0.0,Northbound,,,,,,,1235.0,1222.0,1215.66,576.97,3665.0,2478.0,1.0,0.0,18.0,19.0,3.01,0.68,1.0,1.0,1.0,1.0,2.11,63.98,
24,City of Santa Maria,11,schedule_only,R11. Transit Center to Gov't Center via S. Broadway,0.0,Northbound,41.0,0.28,22.0,,0.92,rapid,,,,,,,,,,,,,,,,,,,
99,City of Santa Maria,11,vp_only,,0.0,Northbound,,,,,,,1080.0,1074.0,2103.71,451.0,3218.0,1958.0,0.0,4.0,7.0,11.0,1.53,0.61,0.51,0.51,1.0,1.0,4.66,191.25,


In [172]:
stop

NameError: name 'stop' is not defined

### Fix `ROUTE_TYPOLOGIES` in `gtfs_funnel/route_typologies.py`

In [None]:
ROUTE_TYPOLOGIES

In [None]:
GTFS_DATA_DICT.schedule_tables.route_typologies

In [None]:
GTFS_DATA_DICT.schedule_tables.route_typologies

In [None]:
route_typologies2 = route_typologies.loc[
    route_typologies.schedule_gtfs_dataset_key.isin(schd_keys)
]

In [None]:
route_typologies2

In [None]:
route_dir_cols = [
    "schedule_gtfs_dataset_key", 
    "route_id", "direction_id", 
    "common_shape_id", "route_name", "route_meters"
]

##### Amanda: in `rt_segment_speeds/segment_speed_utils/gtfs_schedule_wrangling`, I filled `nan` rows in `direction_id`. Then I commented out parts of `gtfs_funnel/route_typologies`

In [None]:
common_shape = gtfs_schedule_wrangling.most_common_shape_by_route_direction(
        one_analysis_date
    )

In [None]:
common_shape2 = common_shape.loc[common_shape.schedule_gtfs_dataset_key.isin(schd_keys)]

In [None]:
nov_typology_ah_test_df = pd.read_parquet("gs://calitp-analytics-data/data-analyses/gtfs_schedule/nacto_typologies/route_typologies_AH_TESTING_2024-11-13.parquet")

In [None]:
nov_typology_ah_test_df.loc[nov_typology_ah_test_df.schedule_gtfs_dataset_key.isin(schd_keys)]

### Fix Map: `gtfs_digest/merge_operator_data`

In [None]:
OPERATOR_FILE = GTFS_DATA_DICT.digest_tables.operator_profiles
OPERATOR_ROUTE = GTFS_DATA_DICT.digest_tables.operator_routes_map

In [None]:
operator_route_gdf = gpd.read_parquet(
    f"{RT_SCHED_GCS}{OPERATOR_ROUTE}.parquet",
)

In [None]:
operator_route_gdf.columns

In [None]:
operator_route_gdf2.columns

In [None]:
len(operator_route_gdf2)

In [None]:
operator_route_gdf2.is_rail.value_counts()

In [None]:
operator_route_gdf2.organization_name.value_counts()

In [None]:
operator_route_gdf2.schedule_gtfs_dataset_key.unique()

#### Why does City of Santa Maria have multiple schedule_gtfs_dataset_keys?

In [None]:
operator_route_gdf2.groupby(["organization_name", "schedule_gtfs_dataset_key"]).agg(
    {"route_short_name": "nunique"}
)

In [None]:
operator_route_gdf2.drop(columns=["service_date"]).explore("organization_name")

In [None]:
# operator_route_gdf2.drop(columns = ["service_date"]).explore("shape_array_key")

#### Starting from here [`gtfs_funnel/operator_scheduled_stats`](https://github.com/cal-itp/data-analyses/blob/4dc340343a60b45ad94217c3efd91f807b03ebc2/gtfs_funnel/operator_scheduled_stats.py#L148)

In [None]:
analysis_date = "2024-11-13"

In [None]:
schd_keys = list(operator_route_gdf2.schedule_gtfs_dataset_key.unique())

#### Longest shape does have all the routes...

In [None]:
longest_shape_gdf = operator_scheduled_stats.longest_shape_by_route(analysis_date)

In [None]:
longest_shape_gdf2 = longest_shape_gdf.loc[
    longest_shape_gdf.schedule_gtfs_dataset_key.isin(schd_keys)
]

In [None]:
longest_shape_gdf2.columns

In [None]:
longest_shape_gdf2.info()

In [None]:
longest_shape_gdf2.route_id.value_counts()

In [None]:
# longest_shape_gdf2.explore("schedule_gtfs_dataset_key")

In [None]:
longest_shape_gdf2.groupby(["schedule_gtfs_dataset_key", "route_id"]).agg(
    {"route_length_miles": "max"}
)

#### Somewhere along the way the routes are cut...maybe b/c of `direction_id`

In [None]:
OPERATOR_EXPORT = GTFS_DATA_DICT.schedule_tables.operator_scheduled_stats

In [None]:
SCHED_GCS

In [None]:
GTFS_DATA_DICT.schedule_tables.operator_routes

In [None]:
dec_url = "gs://calitp-analytics-data/data-analyses/gtfs_schedule/operator_profiles/operator_routes_2024-12-11.parquet"

In [None]:
dec_df = gpd.read_parquet(dec_url)

In [None]:
dec_df.organization_name.value_counts().head()

In [None]:
dec_df.loc[
    dec_df.organization_name == "Alameda-Contra Costa Transit District"
].head().drop(columns=["geometry"]).T

In [None]:
dec_df2 = dec_df.loc[dec_df.schedule_gtfs_dataset_key.isin(schd_keys)]

In [None]:
dec_df2.shape

In [None]:
type(dec_df2)

In [None]:
dec_df2.drop(columns=["geometry"]).T

In [None]:
# dec_df2.explore()

#### Find where in `gtfs_funnel` all the routes disappear

In [None]:
group_cols = ["schedule_gtfs_dataset_key"]

In [None]:
longest_shape_gdf2.info()

#### something is going on in `operator_scheduled_stats.schedule_stats_by_operator`

In [None]:
ROUTE_TYPOLOGY = GTFS_DATA_DICT.schedule_tables.route_typologies

In [None]:
route_typology = pd.read_parquet(f"{SCHED_GCS}{ROUTE_TYPOLOGY}_{analysis_date}.parquet")

In [None]:
from route_typologies import route_typologies

In [None]:
route_typology_grouped = (
    route_typology.groupby(["schedule_gtfs_dataset_key", "route_id"])
    .agg({**{f"is_{c}": "sum" for c in route_typologies}})
    .reset_index()
)

In [None]:
route_typology_grouped2 = route_typology_grouped.loc[
    route_typology_grouped.schedule_gtfs_dataset_key.isin(schd_keys)
]

#### Routes are missing for Santa Maria and Capitol Corridor in `ROUTE_TYPOLOGY`

In [None]:
route_typology_grouped2.T

In [None]:
route_gdf = longest_shape_gdf2.merge(
    route_typology_grouped2, on=["schedule_gtfs_dataset_key", "route_id"], how="outer"
)

In [None]:
route_gdf.shape

In [None]:
route_gdf.drop(columns=["geometry"])

In [None]:
# route_gdf2.explore("schedule_gtfs_dataset_key")

#### Change merge from `inner` to `left`

In [None]:
f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_routes_map}.parquet"

In [None]:
SCHED_GCS

In [None]:
GTFS_DATA_DICT.schedule_tables.operator_routes

In [None]:
my_test_url = "gs://calitp-analytics-data/data-analyses/gtfs_schedule/operator_profiles/operator_routes_2024-12-11_AH.parquet"

In [None]:
f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_profiles}.parquet"

In [None]:
test_gdf = gpd.read_parquet(my_test_url)

In [None]:
test_gdf2 = test_gdf.loc[test_gdf.schedule_gtfs_dataset_key.isin(schd_keys)]

In [None]:
test_gdf2.explore("route_id")

#### Test with all the dates.

In [None]:
GTFS_DATA_DICT.schedule_tables.operator_routes

In [None]:
RT_SCHED_GCS

In [None]:
f"{OPERATOR_ROUTE}_AH_test"

In [None]:
f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_routes_map}.parquet"

In [None]:
test_df = gpd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/digest/operator_routes_AH_test.parquet"
)

In [None]:
test_df.columns

In [None]:
op_routes_gdf = test_df.loc[test_df.organization_name.isin(org_name_lists)]

In [None]:
# Find the most recent geography for each route.
op_routes_gdf = op_routes_gdf.sort_values(by=["service_date"], ascending=False)

# Keep only the most recent row.
op_routes_gdf = op_routes_gdf.drop_duplicates(
    subset=["route_long_name", "route_short_name", "route_combined_name"]
)

# Drop service_dates
op_routes_gdf = op_routes_gdf.drop(columns=["service_date"])

In [None]:
op_routes_gdf.organization_name.value_counts()

In [None]:
op_routes_gdf.loc[op_routes_gdf.organization_name == "City of Santa Maria"].explore(
    "route_long_name"
)