## Find Missing Routes: 2 operators. 
* [Issue](https://github.com/cal-itp/data-analyses/issues/1312): Capital Corridor doesn't have any rail routes. 
* [Most of Santa Maria's routes not showing up in GTFS Digest](https://github.com/cal-itp/data-analyses/issues/1313)
* `cd data-analyses/rt_segment_speeds && pip install -r requirements.txt && cd ../_shared_utils && make setup_env && cd ../gtfs_digest`
* 1/7: the routes are appearing in `the longest shape` but not appearing in `route_typologies`

In [1]:
import geopandas as gpd
import merge_data
import numpy as np
import pandas as pd
from segment_speed_utils import (
    gtfs_schedule_wrangling,
    helpers,
    metrics,
    segment_calcs,
    time_series_utils,
)
from shared_utils import (
    catalog_utils,
    portfolio_utils,
    rt_dates,
    rt_utils,
    time_helpers,
)
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
org_name_lists = ["Capitol Corridor Joint Powers Authority", "City of Santa Maria"]

In [4]:
analysis_date_list = ["2024-11-13"]

In [5]:
one_analysis_date = "2024-11-13"

In [6]:
schd_keys = [
    "5a8721fe96786fcd25fba1f8a0ee6358",
    "73105f2d1cabc8170ab066d96863c5d5",
    "f5a749dd65924e025b1293c58f95f8d6",
]

In [7]:
import sys

sys.path.append("../gtfs_funnel/")
import operator_scheduled_stats
import schedule_stats_by_route_direction

In [8]:
def preview(df):
    df2 = df[
        ["schedule_gtfs_dataset_key", "route_id", "direction_id"]
    ].drop_duplicates()
    display(df2)

### Fix `schd_vp_url`

In [9]:
schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"

In [10]:
schd_vp_df = pd.read_parquet(schd_vp_url)

In [11]:
schd_vp_df2 = schd_vp_df.loc[schd_vp_df.organization_name.isin(org_name_lists)]

In [12]:
schd_vp_df2.route_id.unique()

array(['b3848f93-d26b-48a9-b6a6-5de22a4eab47', '5', 'Shuttle', 'CC'],
      dtype=object)

###  Check out `rt_segment_speeds/segment_speed_utils/gtfs_schedule_wrangling`
* https://github.com/cal-itp/data-analyses/blob/4dc340343a60b45ad94217c3efd91f807b03ebc2/rt_segment_speeds/segment_speed_utils/gtfs_schedule_wrangling.py 
* Tiffany: <i>Can you try specifying the dropna argument inside pandas groupby? Our pandas version has gone through upgrades, from 0.25 to now 1.5 ), and this argument was introduced in 1.1 and since it's dropna=True, that's probably what's driving the the row behavior.</i>
* It worked! Now time to rerun stuff further down the pipeline and see what happens.

In [13]:
def most_common_shape_by_route_direction(analysis_date: str) -> gpd.GeoDataFrame:
    """
    Find shape_id with most trips for that route-direction.
    Merge in shape geometry.
    """
    route_dir_cols = ["gtfs_dataset_key", "route_id", "direction_id"]

    keep_trip_cols = route_dir_cols + [
        "trip_instance_key",
        "shape_id",
        "shape_array_key",
    ]

    trips = helpers.import_scheduled_trips(
        analysis_date, columns=keep_trip_cols, get_pandas=True
    ).rename(columns={"schedule_gtfs_dataset_key": "gtfs_dataset_key"})
    sorting_order = [True for i in route_dir_cols]

    most_common_shape = (
        trips.groupby(
            route_dir_cols + ["shape_id", "shape_array_key"],
            observed=True,
            group_keys=False,
            dropna=False,
        )
        .agg({"trip_instance_key": "count"})
        .reset_index()
        .sort_values(
            route_dir_cols + ["trip_instance_key"], ascending=sorting_order + [False]
        )
        .drop_duplicates(subset=route_dir_cols)
        .reset_index(drop=True)[route_dir_cols + ["shape_id", "shape_array_key"]]
    ).rename(
        columns={
            "gtfs_dataset_key": "schedule_gtfs_dataset_key",
            "shape_id": "common_shape_id",
        }
    )

    shape_geom = helpers.import_scheduled_shapes(
        analysis_date,
        columns=["shape_array_key", "geometry"],
    )

    common_shape_geom = pd.merge(
        shape_geom, most_common_shape, on="shape_array_key", how="inner"
    ).drop(columns="shape_array_key")

    route_info = (
        helpers.import_scheduled_trips(
            analysis_date,
            columns=[
                "gtfs_dataset_key",
                "route_id",
                "route_long_name",
                "route_short_name",
                "route_desc",
            ],
        )
        .drop_duplicates()
        .pipe(portfolio_utils.add_route_name)
        .drop(columns=["route_long_name", "route_short_name", "route_desc"])
    )

    del shape_geom, most_common_shape

    common_shape_geom2 = pd.merge(
        common_shape_geom,
        route_info.rename(columns={"route_name_used": "route_name"}),
        on=["schedule_gtfs_dataset_key", "route_id"],
    )

    return common_shape_geom2

In [14]:
common_shape_test = most_common_shape_by_route_direction(one_analysis_date)

In [15]:
common_shape_test.columns

Index(['geometry', 'schedule_gtfs_dataset_key', 'route_id', 'direction_id',
       'common_shape_id', 'route_name'],
      dtype='object')

In [16]:
common_shape_test2 = common_shape_test.loc[
    common_shape_test.schedule_gtfs_dataset_key.isin(schd_keys)
]

In [17]:
common_shape_test2.route_id.unique()

array(['7', '6', 'CC', '8', 'Mall', '12X', '13X', '11', '30', 'Shuttle',
       '8a7c42f9-51e4-4848-bf88-30c210f149ad', '2', '3', '1B', '20', 'SF',
       '5', '4', '9', '1'], dtype=object)

In [18]:
[rt_dates.DATES["dec2024"]] + [rt_dates.DATES["nov2024"]]

['2024-12-11', '2024-11-13']

In [19]:
GTFS_DATA_DICT.rt_vs_schedule_tables.sched_route_direction_metrics

'schedule_route_dir/schedule_route_direction_metrics'

### Breakdown `gtfs_digest/merge_data.`

#### Line 294: `df_sched` is already missing a lot of the routes.

In [20]:
# Get cardinal direction for each route
df_sched = merge_data.concatenate_schedule_by_route_direction(analysis_date_list)

In [21]:
df_sched2 = df_sched.loc[df_sched.schedule_gtfs_dataset_key.isin(schd_keys)]

In [22]:
df_sched2.route_id.value_counts()

Shuttle    6
5          3
Name: route_id, dtype: int64

##### Go back to `gtfs_funnel/schedule_stats_by_route_direction`
* https://github.com/cal-itp/data-analyses/blob/1ba0f544a01f99966a6e210dd11666b4fe4a146e/gtfs_funnel/schedule_stats_by_route_direction.py#L190
* Test 1: Updated `gtfs_schedule_wrangling` but a lot of routes are still missing. 

In [23]:
RT_SCHED_GCS

'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/'

In [24]:
GTFS_DATA_DICT.rt_vs_schedule_tables.sched_route_direction_metrics

'schedule_route_dir/schedule_route_direction_metrics'

In [25]:
nov_rt_sched = pd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/AH_schedule_route_dir/schedule_route_direction_metrics_2024-12-11.parquet"
)

In [26]:
nov_rt_sched2 = nov_rt_sched.loc[nov_rt_sched.schedule_gtfs_dataset_key.isin(schd_keys)]

In [27]:
nov_rt_sched2.route_id.unique()

array(['Shuttle', '5'], dtype=object)

##### `assemble_scheduled_trip_metrics`: nothing is missing

In [28]:
trip_metrics = schedule_stats_by_route_direction.assemble_scheduled_trip_metrics(
    one_analysis_date, GTFS_DATA_DICT
)

In [29]:
trip_metrics.head(1)

Unnamed: 0,schedule_gtfs_dataset_key,trip_instance_key,median_stop_meters,time_of_day,scheduled_service_minutes,route_id,direction_id
0,0139b1253130b33adcd4b3a4490530d2,014dd8051849e5252df704ca9c381fd9,559.44,PM Peak,23.0,D4,


In [30]:
trip_metrics2 = trip_metrics.loc[trip_metrics.schedule_gtfs_dataset_key.isin(schd_keys)]

In [31]:
preview(trip_metrics2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id
49724,73105f2d1cabc8170ab066d96863c5d5,30,
49725,73105f2d1cabc8170ab066d96863c5d5,3,
49727,73105f2d1cabc8170ab066d96863c5d5,20,
49728,73105f2d1cabc8170ab066d96863c5d5,4,
49729,73105f2d1cabc8170ab066d96863c5d5,Mall,
49730,73105f2d1cabc8170ab066d96863c5d5,5,0.0
49731,73105f2d1cabc8170ab066d96863c5d5,11,
49732,73105f2d1cabc8170ab066d96863c5d5,7,
49733,73105f2d1cabc8170ab066d96863c5d5,9,
49735,73105f2d1cabc8170ab066d96863c5d5,1,


##### `schedule_metrics_by_route_direction` 
* Something is causing routes to drop off in  even though I took out `direction_id`
* Break it out even more.

In [32]:
route_group_merge_cols = ["schedule_gtfs_dataset_key", "route_id", "direction_id"]

In [33]:
route_group_merge_cols_no_dir_id = [
    "schedule_gtfs_dataset_key",
    "route_id",
]

In [34]:
route_dir_metrics = (
    schedule_stats_by_route_direction.schedule_metrics_by_route_direction(
        trip_metrics2, one_analysis_date, route_group_merge_cols
    )
)

In [35]:
route_dir_metrics[
    ["schedule_gtfs_dataset_key", "route_id", "direction_id"]
].drop_duplicates()

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id
0,f5a749dd65924e025b1293c58f95f8d6,Shuttle,1.0
3,f5a749dd65924e025b1293c58f95f8d6,Shuttle,0.0
6,73105f2d1cabc8170ab066d96863c5d5,5,0.0


In [36]:
route_dir_metrics_wo_dir_id = (
    schedule_stats_by_route_direction.schedule_metrics_by_route_direction(
        trip_metrics2, one_analysis_date, route_group_merge_cols_no_dir_id
    )
)

In [37]:
route_dir_metrics_wo_dir_id[
    ["schedule_gtfs_dataset_key", "route_id", "direction_id"]
].drop_duplicates()

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id
0,73105f2d1cabc8170ab066d96863c5d5,7,
3,73105f2d1cabc8170ab066d96863c5d5,6,
6,73105f2d1cabc8170ab066d96863c5d5,8,
9,73105f2d1cabc8170ab066d96863c5d5,Mall,
12,73105f2d1cabc8170ab066d96863c5d5,12X,
15,73105f2d1cabc8170ab066d96863c5d5,13X,
18,73105f2d1cabc8170ab066d96863c5d5,11,
21,73105f2d1cabc8170ab066d96863c5d5,30,
24,f5a749dd65924e025b1293c58f95f8d6,Shuttle,1.0
27,f5a749dd65924e025b1293c58f95f8d6,Shuttle,0.0


In [38]:
route_group_merge_cols = ["schedule_gtfs_dataset_key", "route_id", "direction_id"]

##### `gtfs_schedule_wrangling.aggregate_time_of_day_to_peak_offpeak` is missing a lot of routes -> break it out.

In [39]:
service_freq_df = gtfs_schedule_wrangling.aggregate_time_of_day_to_peak_offpeak(
    trip_metrics2, route_group_merge_cols, long_or_wide="long"
)

In [40]:
service_freq_df.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,n_trips,time_period,peak_offpeak,frequency
0,73105f2d1cabc8170ab066d96863c5d5,5,0.0,18,all_day,,0.75
1,f5a749dd65924e025b1293c58f95f8d6,CC,0.0,12,all_day,,0.5


In [41]:
service_freq_df.route_id.unique()

array(['5', 'CC', 'SF', 'Shuttle'], dtype=object)

##### Changed `count_trips_by_group` to have argument `dropna=False` in `groupby`

In [42]:
def count_trips_by_group(df: pd.DataFrame, group_cols: list):
    """
    Given a df with trip_instance_key and an arbitrary list of
    group_cols, return trip counts by group.
    """
    assert "trip_instance_key" in df.columns
    df = (
        df.groupby(group_cols, dropna=False)
        .agg({"trip_instance_key": "count"})
        .reset_index()
    )
    df = df.rename(columns={"trip_instance_key": "n_trips"})
    return df

In [43]:
def add_peak_offpeak_column(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add a single peak_offpeak column based on the time-of-day column.
    """
    df = df.assign(peak_offpeak=df.time_of_day.map(time_helpers.TIME_OF_DAY_DICT))

    return df

In [44]:
def aggregate_time_of_day_to_peak_offpeak(
    df: pd.DataFrame,
    group_cols: list,
    long_or_wide: str,
) -> pd.DataFrame:
    """
    Aggregate time-of-day bins into peak/offpeak periods.
    Return n_trips and frequency for grouping of columns (route-direction, etc).
    Allow wide or long df to be returned.
    """
    peak_hours = sum(
        v
        for k, v in time_helpers.HOURS_BY_TIME_OF_DAY.items()
        if k in time_helpers.PEAK_PERIODS
    )

    offpeak_hours = sum(
        v
        for k, v in time_helpers.HOURS_BY_TIME_OF_DAY.items()
        if k not in time_helpers.PEAK_PERIODS
    )

    df = add_peak_offpeak_column(df)

    all_day = count_trips_by_group(df, group_cols).assign(time_period="all_day")
    peak_offpeak = count_trips_by_group(df, group_cols + ["peak_offpeak"]).rename(
        {"peak_offpeak": "time_period"}
    )

    df2 = pd.concat([all_day, peak_offpeak], axis=0, ignore_index=True)

    # Add service frequency (trips per hour)
    # there are different number of hours in peak and offpeak periods
    df2 = df2.assign(
        frequency=df2.apply(
            lambda x: round(x.n_trips / peak_hours, 2)
            if x.time_period == "peak"
            else round(x.n_trips / offpeak_hours, 2)
            if x.time_period == "offpeak"
            else round(x.n_trips / (peak_hours + offpeak_hours), 2),
            axis=1,
        )
    )

    if long_or_wide == "long":
        return df2

    elif long_or_wide == "wide":
        # Reshape from wide to long
        # get rid of multiindex column names
        df3 = df2.pivot(
            index=group_cols, columns="time_period", values=["n_trips", "frequency"]
        )

        df3.columns = [f"{b}_{a}" for a, b in df3.columns]
        df3 = df3.reset_index()

        return df3

In [45]:
service_freq_df_test1 = aggregate_time_of_day_to_peak_offpeak(
    trip_metrics2, route_group_merge_cols, long_or_wide="long"
)

##### `metrics_df` portion of `gtfs_funnel.schedule_stas_by_route_direction.schedule_metrics_by_route_direction`
* Did `dropna=False` to get all the routes. 
* Without `dropna=False`, all the routes disappear.

In [46]:
metrics_df = (
    trip_metrics2.groupby(
        route_group_merge_cols, observed=True, group_keys=False, dropna=False
    )
    .agg(
        {
            "median_stop_meters": "mean",
            # take mean of the median stop spacing for trip
            # does this make sense?
            # median is the single boiled down metric at the trip-level
            "scheduled_service_minutes": "mean",
        }
    )
    .reset_index()
    .rename(
        columns={
            "median_stop_meters": "avg_stop_meters",
            "scheduled_service_minutes": "avg_scheduled_service_minutes",
        }
    )
)

In [47]:
from shared_utils.rt_utils import METERS_PER_MILE

In [48]:
metrics_df = metrics_df.assign(
    avg_stop_miles=metrics_df.avg_stop_meters.divide(METERS_PER_MILE).round(2)
).drop(columns=["avg_stop_meters"])

In [49]:
round_me = ["avg_stop_miles", "avg_scheduled_service_minutes"]
metrics_df[round_me] = metrics_df[round_me].round(2)

In [50]:
route_dir_metrics = pd.merge(
    common_shape_test2, metrics_df, on=route_group_merge_cols, how="inner"
).merge(service_freq_df_test1, on=route_group_merge_cols, how="inner")

##### Still in `gtfs_funnel/schedule_stats_by_route`

In [51]:
ROUTE_TYPOLOGIES = GTFS_DATA_DICT.schedule_tables.route_typologies

In [52]:
route_typologies = pd.read_parquet(
    f"{SCHED_GCS}{ROUTE_TYPOLOGIES}_{one_analysis_date}.parquet",
    columns=route_group_merge_cols
    + [
        "is_coverage",
        "is_downtown_local",
        "is_local",
        "is_rapid",
        "is_express",
        "is_rail",
    ],
)

In [53]:
cardinal_dir_df = (
    schedule_stats_by_route_direction.cardinal_direction_for_route_direction(
        one_analysis_date, GTFS_DATA_DICT
    )
)

##### `cardinal_dir_df` also gets rid of a lot of stuff.

In [54]:
cardinal_dir_df2 = cardinal_dir_df.loc[
    cardinal_dir_df.schedule_gtfs_dataset_key.isin(schd_keys)
]

In [55]:
preview(cardinal_dir_df2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id
2581,73105f2d1cabc8170ab066d96863c5d5,5,0.0
3947,f5a749dd65924e025b1293c58f95f8d6,CC,0.0
3948,f5a749dd65924e025b1293c58f95f8d6,CC,1.0
4225,f5a749dd65924e025b1293c58f95f8d6,SF,0.0
4226,f5a749dd65924e025b1293c58f95f8d6,SF,1.0
4254,f5a749dd65924e025b1293c58f95f8d6,Shuttle,0.0
4255,f5a749dd65924e025b1293c58f95f8d6,Shuttle,1.0


In [56]:
STOP_TIMES_FILE = GTFS_DATA_DICT.rt_vs_schedule_tables.stop_times_direction

In [57]:
STOP_TIMES_FILE

'stop_times_direction'

In [58]:
stop_times_df = pd.read_parquet(
    f"{RT_SCHED_GCS}{STOP_TIMES_FILE}_{one_analysis_date}.parquet",
    filters=[[("stop_primary_direction", "!=", "Unknown")]],
)

In [59]:
stop_times_df.head(1)

Unnamed: 0,feed_key,stop_id,stop_sequence,schedule_gtfs_dataset_key,trip_instance_key,shape_array_key,stop_name,geometry,prior_stop_sequence,subseq_stop_sequence,stop_pair,stop_pair_name,stop_primary_direction,stop_meters
0,3b357b65167b3749dff376c9d3624f09,c24c75f6-8397-4d8c-a4e5-72444083a9c8,2,c4726e0acfbcbd26e1dc38b8bd046c03,000000c923cd55be781113b823ce9879,a8ebcf8a2e79025380807e9d58b26f52,Toy Story Bus Stop,b'\x01\x01\x00\x00\x003\xe2\x8a}\r\x8a\x07A\x98H\x01\xc87e\x1c\xc1',1,,c24c75f6-8397-4d8c-a4e5-72444083a9c8__,Toy Story Bus Stop__,Northbound,1127.76


In [60]:
stop_times_df2 = stop_times_df.loc[
    stop_times_df.schedule_gtfs_dataset_key.isin(schd_keys)
]

In [61]:
stop_times_df2.head(2)

Unnamed: 0,feed_key,stop_id,stop_sequence,schedule_gtfs_dataset_key,trip_instance_key,shape_array_key,stop_name,geometry,prior_stop_sequence,subseq_stop_sequence,stop_pair,stop_pair_name,stop_primary_direction,stop_meters
6324,c86a471a1a4c36bb8cf7da9d6a20f202,f09af637-87de-4bdb-bf49-660539686c97,2,73105f2d1cabc8170ab066d96863c5d5,005bb393ed8b22ca4d8e7cc8d7895231,c6e9cda0db8bf76bc535f590ca1fccb5,Broadway at Stowell,b'\x01\x01\x00\x00\x00\x8b\x1a\xbc\x0f\x88q\xe3\xc0(q\xf2\xcfo\xe3\x14\xc1',1,3,f09af637-87de-4bdb-bf49-660539686c97__47def414-f158-496a-91cb-5f3fb0aa406c,Broadway at Stowell__Betteravia at Miller (Panda Express),Southbound,1275.56
6325,c86a471a1a4c36bb8cf7da9d6a20f202,47def414-f158-496a-91cb-5f3fb0aa406c,3,73105f2d1cabc8170ab066d96863c5d5,005bb393ed8b22ca4d8e7cc8d7895231,c6e9cda0db8bf76bc535f590ca1fccb5,Betteravia at Miller (Panda Express),"b'\x01\x01\x00\x00\x00.\xacHd""Q\xe3\xc0x\x83P\x05\\\xfc\x14\xc1'",2,4,47def414-f158-496a-91cb-5f3fb0aa406c__a94160c1-bd99-4898-921f-941aa748ce6f,Betteravia at Miller (Panda Express)__McCoy at Broadway (Outbound),Southbound,1615.97


In [62]:
trip_scheduled_col = [
    "route_id",
    "trip_instance_key",
    "gtfs_dataset_key",
    "shape_array_key",
    "direction_id",
    "route_long_name",
    "route_short_name",
    "route_desc",
    "name",
]

trips_df = helpers.import_scheduled_trips(
    one_analysis_date, columns=trip_scheduled_col, get_pandas=True
)

In [63]:
merge_cols = ["trip_instance_key", "schedule_gtfs_dataset_key", "shape_array_key"]

In [64]:
stop_times_with_trip = pd.merge(stop_times_df2, trips_df, on=merge_cols)

In [65]:
preview(stop_times_with_trip)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id
0,73105f2d1cabc8170ab066d96863c5d5,30,
26,73105f2d1cabc8170ab066d96863c5d5,3,
66,73105f2d1cabc8170ab066d96863c5d5,20,
75,f5a749dd65924e025b1293c58f95f8d6,Shuttle,0.0
79,73105f2d1cabc8170ab066d96863c5d5,4,
104,f5a749dd65924e025b1293c58f95f8d6,SF,1.0
105,73105f2d1cabc8170ab066d96863c5d5,Mall,
111,73105f2d1cabc8170ab066d96863c5d5,5,0.0
147,73105f2d1cabc8170ab066d96863c5d5,11,
168,73105f2d1cabc8170ab066d96863c5d5,7,


In [66]:
main_cols = ["route_id", "schedule_gtfs_dataset_key", "direction_id"]

##### Changing dropna=False here too

In [67]:
agg1 = (
    stop_times_with_trip.groupby(main_cols + ["stop_primary_direction"], dropna=False)
    .agg({"stop_sequence": "count"})
    .reset_index()
    .rename(columns={"stop_sequence": "total_stops"})
)

In [68]:
preview(agg1)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id
0,73105f2d1cabc8170ab066d96863c5d5,1,
4,73105f2d1cabc8170ab066d96863c5d5,11,
8,73105f2d1cabc8170ab066d96863c5d5,12X,
12,73105f2d1cabc8170ab066d96863c5d5,13X,
16,73105f2d1cabc8170ab066d96863c5d5,1B,
20,73105f2d1cabc8170ab066d96863c5d5,2,
24,73105f2d1cabc8170ab066d96863c5d5,20,
28,73105f2d1cabc8170ab066d96863c5d5,3,
32,73105f2d1cabc8170ab066d96863c5d5,30,
36,73105f2d1cabc8170ab066d96863c5d5,4,


##### Making sure `dropna=False` really is the reason

In [69]:
agg_wo_dropna = (
    stop_times_with_trip.groupby(
        main_cols + ["stop_primary_direction"],
    )
    .agg({"stop_sequence": "count"})
    .reset_index()
    .rename(columns={"stop_sequence": "total_stops"})
)

In [70]:
preview(agg_wo_dropna)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id
0,73105f2d1cabc8170ab066d96863c5d5,5,0.0
4,f5a749dd65924e025b1293c58f95f8d6,CC,0.0
7,f5a749dd65924e025b1293c58f95f8d6,CC,1.0
10,f5a749dd65924e025b1293c58f95f8d6,SF,0.0
11,f5a749dd65924e025b1293c58f95f8d6,SF,1.0
12,f5a749dd65924e025b1293c58f95f8d6,Shuttle,0.0
14,f5a749dd65924e025b1293c58f95f8d6,Shuttle,1.0


In [71]:
agg2 = agg1.sort_values(
    by=main_cols + ["total_stops"],
    ascending=[True, True, True, False],
)

In [72]:
cardinal_dir_df = (
    agg2.drop_duplicates(subset=main_cols)
    .reset_index(drop=True)
    .drop(columns=["total_stops"])
    .rename(columns={"stop_primary_direction": "route_primary_direction"})
)

In [73]:
preview(cardinal_dir_df)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id
0,73105f2d1cabc8170ab066d96863c5d5,1,
1,73105f2d1cabc8170ab066d96863c5d5,11,
2,73105f2d1cabc8170ab066d96863c5d5,12X,
3,73105f2d1cabc8170ab066d96863c5d5,13X,
4,73105f2d1cabc8170ab066d96863c5d5,1B,
5,73105f2d1cabc8170ab066d96863c5d5,2,
6,73105f2d1cabc8170ab066d96863c5d5,20,
7,73105f2d1cabc8170ab066d96863c5d5,3,
8,73105f2d1cabc8170ab066d96863c5d5,30,
9,73105f2d1cabc8170ab066d96863c5d5,4,


In [74]:
service_freq_df_test1

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,n_trips,time_period,peak_offpeak,frequency
0,73105f2d1cabc8170ab066d96863c5d5,1,,19,all_day,,0.79
1,73105f2d1cabc8170ab066d96863c5d5,11,,22,all_day,,0.92
2,73105f2d1cabc8170ab066d96863c5d5,12X,,11,all_day,,0.46
3,73105f2d1cabc8170ab066d96863c5d5,13X,,11,all_day,,0.46
4,73105f2d1cabc8170ab066d96863c5d5,1B,,12,all_day,,0.5
5,73105f2d1cabc8170ab066d96863c5d5,2,,17,all_day,,0.71
6,73105f2d1cabc8170ab066d96863c5d5,20,,6,all_day,,0.25
7,73105f2d1cabc8170ab066d96863c5d5,3,,18,all_day,,0.75
8,73105f2d1cabc8170ab066d96863c5d5,30,,9,all_day,,0.38
9,73105f2d1cabc8170ab066d96863c5d5,4,,18,all_day,,0.75


In [75]:
route_dir_metrics2 = pd.merge(
    route_dir_metrics, route_typologies, on=route_group_merge_cols, how="left"
).merge(cardinal_dir_df, on=route_group_merge_cols, how="left")

In [76]:
preview(route_dir_metrics2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id
0,73105f2d1cabc8170ab066d96863c5d5,7,
3,73105f2d1cabc8170ab066d96863c5d5,6,
6,f5a749dd65924e025b1293c58f95f8d6,CC,1.0
9,f5a749dd65924e025b1293c58f95f8d6,CC,0.0
12,73105f2d1cabc8170ab066d96863c5d5,8,
15,73105f2d1cabc8170ab066d96863c5d5,Mall,
18,73105f2d1cabc8170ab066d96863c5d5,12X,
21,73105f2d1cabc8170ab066d96863c5d5,13X,
24,73105f2d1cabc8170ab066d96863c5d5,11,
27,73105f2d1cabc8170ab066d96863c5d5,30,


In [77]:
route_dir_metrics2.columns

Index(['geometry', 'schedule_gtfs_dataset_key', 'route_id', 'direction_id',
       'common_shape_id', 'route_name', 'avg_scheduled_service_minutes',
       'avg_stop_miles', 'n_trips', 'time_period', 'peak_offpeak', 'frequency',
       'is_coverage', 'is_downtown_local', 'is_local', 'is_rapid',
       'is_express', 'is_rail', 'route_primary_direction'],
      dtype='object')

##### `df_sched` is `route_dir_metrics2`

#### `gtfs_digest/merge_data` line 300 `df_avg_speeds` is also missing a lot of routes.
* HELP: not sure which file creates this ? 

In [78]:
SEGMENT_GCS

'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/'

In [79]:
GTFS_DATA_DICT.rt_stop_times.route_dir_single_summary

'rollup_singleday/speeds_route_dir'

In [80]:
df_avg_speeds = merge_data.concatenate_speeds_by_route_direction(analysis_date_list)

In [81]:
df_avg_speeds2 = df_avg_speeds.loc[
    df_avg_speeds.schedule_gtfs_dataset_key.isin(schd_keys)
]

In [82]:
df_avg_speeds2.route_id.value_counts()

5    3
Name: route_id, dtype: int64

In [83]:
df_avg_speeds2.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,speed_mph,service_date
5006,73105f2d1cabc8170ab066d96863c5d5,5,0.0,all_day,15.73,2024-11-13
5007,73105f2d1cabc8170ab066d96863c5d5,5,0.0,offpeak,17.62,2024-11-13


##### Side note, there are actually all of the Dec values for all time_periods here for speed.

In [84]:
df_avg_speeds2.loc[df_avg_speeds2.route_id == "5"]

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,speed_mph,service_date
5006,73105f2d1cabc8170ab066d96863c5d5,5,0.0,all_day,15.73,2024-11-13
5007,73105f2d1cabc8170ab066d96863c5d5,5,0.0,offpeak,17.62,2024-11-13
5008,73105f2d1cabc8170ab066d96863c5d5,5,0.0,peak,14.35,2024-11-13


##### Can't find which file powers `df_avg_speeds2`,`df_avg_speeds`  = `df_avg_speeds2`

#### Dataframe in line 307 `df_rt_sched` in `gtfs_digest/merge_data`

In [85]:
df_rt_sched = merge_data.concatenate_rt_vs_schedule_by_route_direction(
    analysis_date_list
).astype({"direction_id": "float"})

In [86]:
df_rt_sched2 = df_rt_sched.loc[df_rt_sched.schedule_gtfs_dataset_key.isin(schd_keys)]

In [87]:
preview(df_rt_sched2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id
5369,73105f2d1cabc8170ab066d96863c5d5,5,0.0
11696,f5a749dd65924e025b1293c58f95f8d6,CC,0.0
11699,f5a749dd65924e025b1293c58f95f8d6,CC,1.0


In [88]:
df_rt_sched2.head()

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,minutes_atleast1_vp,minutes_atleast2_vp,total_rt_service_minutes,total_scheduled_service_minutes,total_vp,vp_in_shape,is_early,is_ontime,is_late,n_vp_trips,vp_per_minute,pct_in_shape,pct_rt_journey_atleast1_vp,pct_rt_journey_atleast2_vp,pct_sched_journey_atleast1_vp,pct_sched_journey_atleast2_vp,rt_sched_journey_ratio,avg_rt_service_minutes,name,schedule_source_record_id,service_date
5369,73105f2d1cabc8170ab066d96863c5d5,5,0.0,all_day,1207,1195,1662.72,714.0,3584,2223,0,12,5,17,2.16,0.62,0.73,0.72,1.0,1.0,2.33,97.81,Santa Maria Schedule,recxPy2JOcDFGDo31,2024-11-13
5370,73105f2d1cabc8170ab066d96863c5d5,5,0.0,offpeak,757,750,1221.38,336.0,2251,1000,0,5,3,8,1.84,0.44,0.62,0.61,1.0,1.0,3.64,152.67,Santa Maria Schedule,recxPy2JOcDFGDo31,2024-11-13
5371,73105f2d1cabc8170ab066d96863c5d5,5,0.0,peak,450,445,441.34,378.0,1333,1223,0,7,2,9,3.02,0.92,1.0,1.0,1.0,1.0,1.17,49.04,Santa Maria Schedule,recxPy2JOcDFGDo31,2024-11-13
11696,f5a749dd65924e025b1293c58f95f8d6,CC,0.0,all_day,1205,1203,1199.51,1672.0,3603,0,6,0,4,10,3.0,0.0,1.0,1.0,0.72,0.72,0.72,119.95,Bay Area 511 Capitol Corridor Schedule,recMJKHEBWxlMt1I5,2024-11-13
11697,f5a749dd65924e025b1293c58f95f8d6,CC,0.0,offpeak,570,569,568.94,619.0,1704,0,1,0,3,4,3.0,0.0,1.0,1.0,0.92,0.92,0.92,142.24,Bay Area 511 Capitol Corridor Schedule,recMJKHEBWxlMt1I5,2024-11-13


##### Go into file `rt_segment_speeds/segment_speed_utils/metrics.py`
* I think this file is used [here](https://github.com/cal-itp/data-analyses/blob/main/rt_scheduled_v_ran/scripts/rt_v_scheduled_routes.py) -> `rt_scheduled_v_ran/scripts/rt_v_scheduled_routes`

In [89]:
[*GTFS_DATA_DICT["stop_segments"]["route_dir_cols"]]

['route_id', 'direction_id']

In [90]:
dict_inputs = GTFS_DATA_DICT.rt_vs_schedule_tables

In [91]:
TRIP_EXPORT = dict_inputs.vp_trip_metrics

In [92]:
TRIP_EXPORT

'vp_trip/trip_metrics'

In [93]:
GTFS_DATA_DICT.rt_vs_schedule_tables.vp_route_direction_metrics

'vp_route_dir/route_direction_metrics'

In [94]:
dict_inputs.vp_route_direction_metrics

'vp_route_dir/route_direction_metrics'

In [95]:
ROUTE_EXPORT = dict_inputs.vp_route_direction_metrics

In [96]:
trip_df = pd.read_parquet(f"{RT_SCHED_GCS}{TRIP_EXPORT}_{one_analysis_date}.parquet")

In [97]:
trip_df.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,trip_instance_key,route_id,direction_id,scheduled_service_minutes,total_vp,rt_service_minutes,minutes_atleast1_vp,minutes_atleast2_vp,vp_in_shape,route_short_name,sched_rt_category,time_of_day,peak_offpeak,vp_per_minute,pct_in_shape,pct_rt_journey_atleast1_vp,pct_rt_journey_atleast2_vp,pct_sched_journey_atleast1_vp,pct_sched_journey_atleast2_vp,rt_sched_journey_difference,is_early,is_ontime,is_late
0,3c62ad6ee589d56eca915ce291a5df0a,0001aafdcc1443da02a93a3596e67f36,41,0.0,50.0,165,54.52,56,55,156,41,schedule_and_vp,Early AM,offpeak,3.03,0.94,1.0,1.0,1.0,1.0,4.52,0,1,0
1,1770249a5a2e770ca90628434d4934b1,0001e1cccb0466096f3faed959b2f27b,4133,0.0,107.0,323,107.98,109,107,323,77,schedule_and_vp,Midday,offpeak,2.99,1.0,1.0,0.99,1.0,1.0,0.98,0,1,0


In [98]:
trip_df2 = trip_df.loc[trip_df.schedule_gtfs_dataset_key.isin(schd_keys)]

In [99]:
preview(trip_df2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id
127,73105f2d1cabc8170ab066d96863c5d5,30,
407,73105f2d1cabc8170ab066d96863c5d5,3,
1797,73105f2d1cabc8170ab066d96863c5d5,20,
2622,73105f2d1cabc8170ab066d96863c5d5,4,
3009,73105f2d1cabc8170ab066d96863c5d5,5,0.0
3096,73105f2d1cabc8170ab066d96863c5d5,11,
3338,73105f2d1cabc8170ab066d96863c5d5,7,
3512,73105f2d1cabc8170ab066d96863c5d5,9,
4197,73105f2d1cabc8170ab066d96863c5d5,1,
6708,73105f2d1cabc8170ab066d96863c5d5,12X,


In [100]:
import sys

sys.path.append("../rt_scheduled_v_ran/scripts")
import rt_v_scheduled_routes

In [101]:
ROUTE_DIR_COLS = [*GTFS_DATA_DICT["stop_segments"]["route_dir_cols"]]

##### Somewhere in `rt_scheduled_v_ran/rt_v_scheduled_routes`, routes go missing.

In [102]:
crosswalk_cols = [
    "schedule_gtfs_dataset_key",
    "name",
    "schedule_source_record_id",
    "base64_url",
    "organization_source_record_id",
    "organization_name",
    "caltrans_district",
]

route_df = (
    metrics.concatenate_peak_offpeak_allday_averages(
        trip_df2,
        group_cols=["schedule_gtfs_dataset_key"] + ROUTE_DIR_COLS,
        metric_type="rt_vs_schedule",
    )
    .pipe(metrics.derive_rt_vs_schedule_metrics)
    .pipe(rt_v_scheduled_routes.average_rt_trip_times)
)

In [103]:
preview(route_df)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id
0,73105f2d1cabc8170ab066d96863c5d5,5,0.0
2,f5a749dd65924e025b1293c58f95f8d6,CC,0.0
4,f5a749dd65924e025b1293c58f95f8d6,CC,1.0


##### `metrics.concatenate_peak_offpeak_allday_averages` references to `segment_calcs.calculate_avg_speeds` which lives in `rt_segment_speeds/segment_speed_utils`: add `dropna=False`

In [104]:
["schedule_gtfs_dataset_key"] + ROUTE_DIR_COLS

['schedule_gtfs_dataset_key', 'route_id', 'direction_id']

`calculate_avg_speeds` is from `rt_segement_speeds/segement_speed_utils/segment_calc.spy`

In [105]:
def calculate_avg_speeds(df: pd.DataFrame, group_cols: list) -> pd.DataFrame:
    """
    Calculate the median, 20th, and 80th percentile speeds
    by groups.
    """
    # pd.groupby and pd.quantile is so slow
    # create our own list of speeds and use np
    df2 = (
        df.groupby(group_cols, observed=True, group_keys=False, dropna=False)
        .agg({"speed_mph": lambda x: sorted(list(x))})
        .reset_index()
        .rename(columns={"speed_mph": "speed_mph_list"})
    )

    df2 = df2.assign(
        p50_mph=df2.apply(lambda x: np.percentile(x.speed_mph_list, q=50), axis=1),
        n_trips=df2.apply(lambda x: len(x.speed_mph_list), axis=1).astype("int16"),
        p20_mph=df2.apply(lambda x: np.percentile(x.speed_mph_list, q=20), axis=1),
        p80_mph=df2.apply(lambda x: np.percentile(x.speed_mph_list, q=80), axis=1),
    )

    stats = df2.drop(columns="speed_mph_list")

    # Clean up for map
    speed_cols = [c for c in stats.columns if "_mph" in c]
    stats[speed_cols] = stats[speed_cols].round(2)

    return stats

`concatenate_peak_offpeak_allday_averages` and `calculate_weighted_average_vp_schedule_metrics` is from `rt_segment_speeds/segment_speed_utils/metrics`

In [106]:
def calculate_weighted_average_vp_schedule_metrics(
    df: pd.DataFrame,
    group_cols: list,
) -> pd.DataFrame:

    sum_cols = [
        "minutes_atleast1_vp",
        "minutes_atleast2_vp",
        "rt_service_minutes",
        "scheduled_service_minutes",
        "total_vp",
        "vp_in_shape",
        "is_early",
        "is_ontime",
        "is_late",
    ]

    count_cols = ["trip_instance_key"]

    df2 = (
        df.groupby(group_cols, observed=True, group_keys=False, dropna=False)
        .agg({**{e: "sum" for e in sum_cols}, **{e: "count" for e in count_cols}})
        .reset_index()
        .rename(columns={"trip_instance_key": "n_vp_trips"})
    )

    return df2

In [107]:
def concatenate_peak_offpeak_allday_averages(
    df: pd.DataFrame, group_cols: list, metric_type: str
) -> pd.DataFrame:
    """
    Calculate average speeds for all day and
    peak_offpeak.
    Concatenate these, so that speeds are always calculated
    for the same 3 time periods.
    """
    if metric_type == "segment_speeds":
        avg_peak = calculate_avg_speeds(df, group_cols + ["peak_offpeak"])

        avg_allday = calculate_avg_speeds(df, group_cols).assign(peak_offpeak="all_day")

    elif metric_type == "summary_speeds":
        avg_peak = metrics.weighted_average_speeds_across_segments(
            df, group_cols + ["peak_offpeak"]
        )

        avg_allday = metrics.weighted_average_speeds_across_segments(
            df, group_cols
        ).assign(peak_offpeak="all_day")

    elif metric_type == "rt_vs_schedule":
        avg_peak = calculate_weighted_average_vp_schedule_metrics(
            df, group_cols + ["peak_offpeak"]
        )

        avg_allday = calculate_weighted_average_vp_schedule_metrics(
            df, group_cols
        ).assign(peak_offpeak="all_day")

    else:
        print(
            f"Valid metric types: ['segment_speeds', 'summary_speeds', 'rt_vs_schedule']"
        )

    # Concatenate so that every segment has 3 time periods: peak, offpeak, and all_day
    avg_metrics = pd.concat([avg_peak, avg_allday], axis=0, ignore_index=True).rename(
        columns={"peak_offpeak": "time_period"}
    )

    return avg_metrics

##### Going back to `rt_v_scheduled/scripts/rt_v_scheduled_routes.py`
* Adding `dropna=False` to `calculate_weighted_average_vp_schedule_metrics` seems to do the trick.

In [108]:
route_df_test = concatenate_peak_offpeak_allday_averages(
    trip_df2,
    group_cols=["schedule_gtfs_dataset_key"] + ROUTE_DIR_COLS,
    metric_type="rt_vs_schedule",
)

In [109]:
preview(route_df_test)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id
0,73105f2d1cabc8170ab066d96863c5d5,1,
2,73105f2d1cabc8170ab066d96863c5d5,11,
4,73105f2d1cabc8170ab066d96863c5d5,12X,
6,73105f2d1cabc8170ab066d96863c5d5,1B,
7,73105f2d1cabc8170ab066d96863c5d5,2,
9,73105f2d1cabc8170ab066d96863c5d5,20,
11,73105f2d1cabc8170ab066d96863c5d5,3,
13,73105f2d1cabc8170ab066d96863c5d5,30,
15,73105f2d1cabc8170ab066d96863c5d5,4,
17,73105f2d1cabc8170ab066d96863c5d5,5,0.0


In [110]:
route_df_test2 = route_df_test.pipe(metrics.derive_rt_vs_schedule_metrics)

In [111]:
preview(route_df_test2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id
0,73105f2d1cabc8170ab066d96863c5d5,1,
2,73105f2d1cabc8170ab066d96863c5d5,11,
4,73105f2d1cabc8170ab066d96863c5d5,12X,
6,73105f2d1cabc8170ab066d96863c5d5,1B,
7,73105f2d1cabc8170ab066d96863c5d5,2,
9,73105f2d1cabc8170ab066d96863c5d5,20,
11,73105f2d1cabc8170ab066d96863c5d5,3,
13,73105f2d1cabc8170ab066d96863c5d5,30,
15,73105f2d1cabc8170ab066d96863c5d5,4,
17,73105f2d1cabc8170ab066d96863c5d5,5,0.0


In [112]:
route_df_test3 = route_df_test.pipe(rt_v_scheduled_routes.average_rt_trip_times)

In [113]:
preview(route_df_test3)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id
0,73105f2d1cabc8170ab066d96863c5d5,1,
2,73105f2d1cabc8170ab066d96863c5d5,11,
4,73105f2d1cabc8170ab066d96863c5d5,12X,
6,73105f2d1cabc8170ab066d96863c5d5,1B,
7,73105f2d1cabc8170ab066d96863c5d5,2,
9,73105f2d1cabc8170ab066d96863c5d5,20,
11,73105f2d1cabc8170ab066d96863c5d5,3,
13,73105f2d1cabc8170ab066d96863c5d5,30,
15,73105f2d1cabc8170ab066d96863c5d5,4,
17,73105f2d1cabc8170ab066d96863c5d5,5,0.0


In [114]:
crosswalk_cols = [
    "schedule_gtfs_dataset_key",
    "name",
    "schedule_source_record_id",
    "base64_url",
    "organization_source_record_id",
    "organization_name",
    "caltrans_district",
]

In [115]:
route_df_test4 = gtfs_schedule_wrangling.merge_operator_identifiers(
    route_df_test3, [one_analysis_date], columns=crosswalk_cols
)

In [116]:
preview(route_df_test4)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id
0,73105f2d1cabc8170ab066d96863c5d5,1,
2,73105f2d1cabc8170ab066d96863c5d5,11,
4,73105f2d1cabc8170ab066d96863c5d5,12X,
6,73105f2d1cabc8170ab066d96863c5d5,1B,
7,73105f2d1cabc8170ab066d96863c5d5,2,
9,73105f2d1cabc8170ab066d96863c5d5,20,
11,73105f2d1cabc8170ab066d96863c5d5,3,
13,73105f2d1cabc8170ab066d96863c5d5,30,
15,73105f2d1cabc8170ab066d96863c5d5,4,
17,73105f2d1cabc8170ab066d96863c5d5,5,0.0


In [117]:
df_rt_sched = route_df_test4.copy()

##### `df_rt_sched` equals `df_rt_sched`

In [118]:
df_rt_sched_og = merge_data.concatenate_rt_vs_schedule_by_route_direction(
    analysis_date_list
)

In [119]:
df_rt_sched.columns

Index(['schedule_gtfs_dataset_key', 'route_id', 'direction_id', 'time_period',
       'minutes_atleast1_vp', 'minutes_atleast2_vp',
       'total_rt_service_minutes', 'total_scheduled_service_minutes',
       'total_vp', 'vp_in_shape', 'is_early', 'is_ontime', 'is_late',
       'n_vp_trips', 'rt_sched_journey_ratio', 'avg_rt_service_minutes',
       'name', 'schedule_source_record_id', 'base64_url',
       'organization_source_record_id', 'organization_name',
       'caltrans_district'],
      dtype='object')

In [120]:
df_rt_sched_og.columns

Index(['schedule_gtfs_dataset_key', 'route_id', 'direction_id', 'time_period',
       'minutes_atleast1_vp', 'minutes_atleast2_vp',
       'total_rt_service_minutes', 'total_scheduled_service_minutes',
       'total_vp', 'vp_in_shape', 'is_early', 'is_ontime', 'is_late',
       'n_vp_trips', 'vp_per_minute', 'pct_in_shape',
       'pct_rt_journey_atleast1_vp', 'pct_rt_journey_atleast2_vp',
       'pct_sched_journey_atleast1_vp', 'pct_sched_journey_atleast2_vp',
       'rt_sched_journey_ratio', 'avg_rt_service_minutes', 'name',
       'schedule_source_record_id', 'service_date'],
      dtype='object')

#### `gtfs_digest/merge_data/` line 316: `df_crosswalk`

In [135]:
df_crosswalk = merge_data.concatenate_crosswalk_organization(analysis_date_list)

In [136]:
df_crosswalk.service_date.unique()

array(['2024-11-13T00:00:00.000000000'], dtype='datetime64[ns]')

#### `gtfs_digest/merge_data/merge_data_sources_by_route_direction`

In [139]:
service_date_datetime = "2024-11-13T00:00:00.000000000"

In [140]:
type(service_date_datetime)

str

In [143]:
route_dir_metrics2["service_date"] = pd.to_datetime("2024-11-13T00:00:00.000000000")

In [144]:
df_rt_sched["service_date"] = pd.to_datetime("2024-11-13T00:00:00.000000000")

In [145]:
df_avg_speeds2["service_date"] = pd.to_datetime("2024-11-13T00:00:00.000000000")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_avg_speeds2["service_date"] = pd.to_datetime('2024-11-13T00:00:00.000000000')


In [147]:
# merge1 = merge_data.merge_data_sources_by_route_direction(
# route_dir_metrics2,
# df_rt_sched,
# df_avg_speeds2,
# df_crosswalk
# )

In [149]:
primary_typology = merge_data.set_primary_typology(route_dir_metrics2)

In [151]:
route_time_cols = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
    "time_period",
]

In [152]:
df_schedule2 = pd.merge(
    route_dir_metrics2, primary_typology, on=route_time_cols, how="left"
)

In [164]:
df_schedule2.columns

Index(['geometry', 'schedule_gtfs_dataset_key', 'route_id', 'direction_id',
       'common_shape_id', 'route_name', 'avg_scheduled_service_minutes',
       'avg_stop_miles', 'n_trips', 'time_period', 'peak_offpeak', 'frequency',
       'is_coverage', 'is_downtown_local', 'is_local', 'is_rapid',
       'is_express', 'is_rail', 'route_primary_direction', 'service_date',
       'typology'],
      dtype='object')

In [154]:
preview(df_schedule2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id
0,73105f2d1cabc8170ab066d96863c5d5,7,
3,73105f2d1cabc8170ab066d96863c5d5,6,
6,f5a749dd65924e025b1293c58f95f8d6,CC,1.0
9,f5a749dd65924e025b1293c58f95f8d6,CC,0.0
12,73105f2d1cabc8170ab066d96863c5d5,8,
15,73105f2d1cabc8170ab066d96863c5d5,Mall,
18,73105f2d1cabc8170ab066d96863c5d5,12X,
21,73105f2d1cabc8170ab066d96863c5d5,13X,
24,73105f2d1cabc8170ab066d96863c5d5,11,
27,73105f2d1cabc8170ab066d96863c5d5,30,


In [165]:
df_rt_sched.columns

Index(['schedule_gtfs_dataset_key', 'route_id', 'direction_id', 'time_period',
       'minutes_atleast1_vp', 'minutes_atleast2_vp',
       'total_rt_service_minutes', 'total_scheduled_service_minutes',
       'total_vp', 'vp_in_shape', 'is_early', 'is_ontime', 'is_late',
       'n_vp_trips', 'rt_sched_journey_ratio', 'avg_rt_service_minutes',
       'name', 'schedule_source_record_id', 'base64_url',
       'organization_source_record_id', 'organization_name',
       'caltrans_district', 'service_date'],
      dtype='object')

In [155]:
df = pd.merge(
    df_schedule2,
    df_rt_sched,
    on=route_time_cols + ["service_date"],
    how="outer",
    indicator="sched_rt_category",
).merge(
    df_avg_speeds2,
    on=route_time_cols + ["service_date"],
    how="outer",
)

In [156]:
preview(df)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id
0,73105f2d1cabc8170ab066d96863c5d5,7,
3,73105f2d1cabc8170ab066d96863c5d5,6,
6,f5a749dd65924e025b1293c58f95f8d6,CC,1.0
9,f5a749dd65924e025b1293c58f95f8d6,CC,0.0
12,73105f2d1cabc8170ab066d96863c5d5,8,
15,73105f2d1cabc8170ab066d96863c5d5,Mall,
18,73105f2d1cabc8170ab066d96863c5d5,12X,
21,73105f2d1cabc8170ab066d96863c5d5,13X,
24,73105f2d1cabc8170ab066d96863c5d5,11,
27,73105f2d1cabc8170ab066d96863c5d5,30,


In [160]:
df2 = (
    df.assign(
        sched_rt_category=df.sched_rt_category.map(
            gtfs_schedule_wrangling.sched_rt_category_dict
        )
    )
    .pipe(
        merge_data.merge_in_standardized_route_names,
    )
    .merge(
        df_crosswalk,
        on=["schedule_gtfs_dataset_key", "name", "service_date"],
        how="left",
    )
)

In [161]:
preview(df2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id
0,73105f2d1cabc8170ab066d96863c5d5,7,
3,73105f2d1cabc8170ab066d96863c5d5,6,
6,f5a749dd65924e025b1293c58f95f8d6,CC,1.0
9,f5a749dd65924e025b1293c58f95f8d6,CC,0.0
12,73105f2d1cabc8170ab066d96863c5d5,8,
15,73105f2d1cabc8170ab066d96863c5d5,Mall,
18,73105f2d1cabc8170ab066d96863c5d5,12X,
21,73105f2d1cabc8170ab066d96863c5d5,13X,
24,73105f2d1cabc8170ab066d96863c5d5,11,
27,73105f2d1cabc8170ab066d96863c5d5,30,


##### Find out why there are certain duplicated columns later.

In [166]:
df2 = df2.drop(columns=["organization_name_x"]).rename(
    columns={"organization_name_y": "organization_name"}
)

In [167]:
df3 = df2.pipe(
    # Find the most common cardinal direction
    gtfs_schedule_wrangling.top_cardinal_direction
)

In [168]:
preview(df3)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id
0,73105f2d1cabc8170ab066d96863c5d5,7,
3,73105f2d1cabc8170ab066d96863c5d5,6,
6,f5a749dd65924e025b1293c58f95f8d6,CC,1.0
9,f5a749dd65924e025b1293c58f95f8d6,CC,0.0
12,73105f2d1cabc8170ab066d96863c5d5,8,
15,73105f2d1cabc8170ab066d96863c5d5,Mall,
18,73105f2d1cabc8170ab066d96863c5d5,12X,
21,73105f2d1cabc8170ab066d96863c5d5,13X,
24,73105f2d1cabc8170ab066d96863c5d5,11,
27,73105f2d1cabc8170ab066d96863c5d5,30,


In [173]:
preview(schd_vp_df2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id
120423,5a8721fe96786fcd25fba1f8a0ee6358,b3848f93-d26b-48a9-b6a6-5de22a4eab47,0.0
135212,73105f2d1cabc8170ab066d96863c5d5,5,0.0
304769,f5a749dd65924e025b1293c58f95f8d6,Shuttle,0.0
304835,f5a749dd65924e025b1293c58f95f8d6,Shuttle,1.0
367736,f5a749dd65924e025b1293c58f95f8d6,CC,0.0
367778,f5a749dd65924e025b1293c58f95f8d6,CC,1.0


##### Where did geometry and common_shape_id pop up??
##### Also missing a bunch of columns: `'n_scheduled_trips', 'vp_per_minute', 'pct_in_shape', 'pct_rt_journey_atleast1_vp', 'pct_rt_journey_atleast2_vp', 'pct_sched_journey_atleast1_vp', 'pct_sched_journey_atleast2_vp'`

In [184]:
preview_cols = [
    
    "route_combined_name",
    "route_id",
    "direction_id",
    
    "route_primary_direction",
    "avg_scheduled_service_minutes",
    "avg_stop_miles",
    "n_trips",
    "time_period",
    "peak_offpeak",
    "frequency",
    "typology",
    "minutes_atleast1_vp",
    "minutes_atleast2_vp",
    "total_rt_service_minutes",
    "total_scheduled_service_minutes",
    "total_vp",
    "vp_in_shape",
    "is_early",
    "is_ontime",
    "is_late",
    "n_vp_trips",
    "rt_sched_journey_ratio",
    "avg_rt_service_minutes",
    "sched_rt_category",
    "speed_mph",
]

In [185]:
df3[preview_cols].T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,...,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101
route_combined_name,"Rt 7. A. H. College, Crossroads Shopping Center via Boone St, Bradley Rd.","Rt 7. A. H. College, Crossroads Shopping Center via Boone St, Bradley Rd.","Rt 7. A. H. College, Crossroads Shopping Center via Boone St, Bradley Rd.",Rt 6. Oak Knolls to Old Orcutt-East to West-Outbound,Rt 6. Oak Knolls to Old Orcutt-East to West-Outbound,Rt 6. Oak Knolls to Old Orcutt-East to West-Outbound,CC Capitol Corridor,CC Capitol Corridor,CC Capitol Corridor,CC Capitol Corridor,CC Capitol Corridor,CC Capitol Corridor,Rt 8. Tanglewood to Crossroads Shopping Center via McCoy Ln.,Rt 8. Tanglewood to Crossroads Shopping Center via McCoy Ln.,Rt 8. Tanglewood to Crossroads Shopping Center via McCoy Ln.,Mall Shuttle,Mall Shuttle,Mall Shuttle,12X Broadway/Orcutt Express,12X Broadway/Orcutt Express,12X Broadway/Orcutt Express,13X Transit Center/PVHS/N. Broadway,13X Transit Center/PVHS/N. Broadway,13X Transit Center/PVHS/N. Broadway,R11. Transit Center to Gov't Center via S. Broadway,R11. Transit Center to Gov't Center via S. Broadway,R11. Transit Center to Gov't Center via S. Broadway,Route 30 - Santa Maria Transit Center/Vandenberg/Lompoc,Route 30 - Santa Maria Transit Center/Vandenberg/Lompoc,Route 30 - Santa Maria Transit Center/Vandenberg/Lompoc,Shuttle Shuttle_Auburn,Shuttle Shuttle_Auburn,Shuttle Shuttle_Auburn,Shuttle Shuttle_Auburn,Shuttle Shuttle_Auburn,Shuttle Shuttle_Auburn,Rt 11. Transit Center to Gov't Center via S. Broadway,Rt 11. Transit Center to Gov't Center via S. Broadway,Rt 11. Transit Center to Gov't Center via S. Broadway,"Rt 2. Transit Center to PVH School via Western., Donovan Rd","Rt 2. Transit Center to PVH School via Western., Donovan Rd","Rt 2. Transit Center to PVH School via Western., Donovan Rd","Rt 3. Transit Center to Marian Hospital to PVH School, via E. Main, Suey Ln.","Rt 3. Transit Center to Marian Hospital to PVH School, via E. Main, Suey Ln.","Rt 3. Transit Center to Marian Hospital to PVH School, via E. Main, Suey Ln.",Rt 1. Transit Ctr to Preisker Park Via N. Broadway,Rt 1. Transit Ctr to Preisker Park Via N. Broadway,Rt 1. Transit Ctr to Preisker Park Via N. Broadway,Route 20 - Santa Maria Transit Center/Los Alamos/Buellton/Solvang-OB,Route 20 - Santa Maria Transit Center/Los Alamos/Buellton/Solvang-OB,...,SF Shuttle_SF,SF Shuttle_SF,SF Shuttle_SF,SF Shuttle_SF,SF Shuttle_SF,"Rt 5. Transit Center to Gov't Cntr to Evergreen Shopping Center via Miller St., S.M. Way","Rt 5. Transit Center to Gov't Cntr to Evergreen Shopping Center via Miller St., S.M. Way","Rt 5. Transit Center to Gov't Cntr to Evergreen Shopping Center via Miller St., S.M. Way","Rt 4. Transit Center to SMH school to VTC via Cook St., Thornburg., Betteravia Rd.","Rt 4. Transit Center to SMH school to VTC via Cook St., Thornburg., Betteravia Rd.","Rt 4. Transit Center to SMH school to VTC via Cook St., Thornburg., Betteravia Rd.",Rt 9. Transit Center to PVH via Alvin Ave.,Rt 9. Transit Center to PVH via Alvin Ave.,Rt 9. Transit Center to PVH via Alvin Ave.,Rt 1. Transit Ctr to Preisker Park Via N. Broadway,Rt 1. Transit Ctr to Preisker Park Via N. Broadway,Rt 1. Transit Ctr to Preisker Park Via N. Broadway,Rt 1. Transit Ctr to Preisker Park Via N. Broadway,Rt 1. Transit Ctr to Preisker Park Via N. Broadway,R11. Transit Center to Gov't Center via S. Broadway,R11. Transit Center to Gov't Center via S. Broadway,12X Broadway/Orcutt Express,12X Broadway/Orcutt Express,Rt 1. Transit Ctr to Preisker Park Via N. Broadway,"Rt 2. Transit Center to PVH School via Western., Donovan Rd","Rt 2. Transit Center to PVH School via Western., Donovan Rd",Route 20 - Santa Maria Transit Center/Los Alamos/Buellton/Solvang-OB,Route 20 - Santa Maria Transit Center/Los Alamos/Buellton/Solvang-OB,"Rt 3. Transit Center to Marian Hospital to PVH School, via E. Main, Suey Ln.","Rt 3. Transit Center to Marian Hospital to PVH School, via E. Main, Suey Ln.",Route 30 - Santa Maria Transit Center/Vandenberg/Lompoc,Route 30 - Santa Maria Transit Center/Vandenberg/Lompoc,"Rt 4. Transit Center to SMH school to VTC via Cook St., Thornburg., Betteravia Rd.","Rt 4. Transit Center to SMH school to VTC via Cook St., Thornburg., Betteravia Rd.","Rt 5. Transit Center to Gov't Cntr to Evergreen Shopping Center via Miller St., S.M. Way","Rt 5. Transit Center to Gov't Cntr to Evergreen Shopping Center via Miller St., S.M. Way",Rt 6. Oak Knolls to Old Orcutt-East to West-Outbound,Rt 6. Oak Knolls to Old Orcutt-East to West-Outbound,"Rt 7. A. H. College, Crossroads Shopping Center via Boone St, Bradley Rd.","Rt 7. A. H. College, Crossroads Shopping Center via Boone St, Bradley Rd.",Rt 8. Tanglewood to Crossroads Shopping Center via McCoy Ln.,Rt 8. Tanglewood to Crossroads Shopping Center via McCoy Ln.,Rt 11. Transit Center to Gov't Center via S. Broadway,Rt 11. Transit Center to Gov't Center via S. Broadway,Rt 9. Transit Center to PVH via Alvin Ave.,Rt 9. Transit Center to PVH via Alvin Ave.,CC Capitol Corridor,CC Capitol Corridor,CC Capitol Corridor,CC Capitol Corridor
route_id,7,7,7,6,6,6,CC,CC,CC,CC,CC,CC,8,8,8,Mall,Mall,Mall,12X,12X,12X,13X,13X,13X,11,11,11,30,30,30,Shuttle,Shuttle,Shuttle,Shuttle,Shuttle,Shuttle,8a7c42f9-51e4-4848-bf88-30c210f149ad,8a7c42f9-51e4-4848-bf88-30c210f149ad,8a7c42f9-51e4-4848-bf88-30c210f149ad,2,2,2,3,3,3,1B,1B,1B,20,20,...,SF,SF,SF,SF,SF,5,5,5,4,4,4,9,9,9,1,1,1,1,1,11,11,12X,12X,1B,2,2,20,20,3,3,30,30,4,4,5,5,6,6,7,7,8,8,8a7c42f9-51e4-4848-bf88-30c210f149ad,8a7c42f9-51e4-4848-bf88-30c210f149ad,9,9,CC,CC,CC,CC
direction_id,,,,,,,1.00,1.00,1.00,0.00,0.00,0.00,,,,,,,,,,,,,,,,,,,1.00,1.00,1.00,0.00,0.00,0.00,,,,,,,,,,,,,,,...,1.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,,,,,,,,,,,,,,,,,,,,,,,,,,,0.00,0.00,,,,,,,,,,,0.00,0.00,1.00,1.00
route_primary_direction,,,,,,,Southbound,Southbound,Southbound,Northbound,Northbound,Northbound,,,,,,,,,,,,,,,,,,,Westbound,Westbound,Westbound,Eastbound,Eastbound,Eastbound,,,,,,,,,,,,,,,...,Westbound,Westbound,Eastbound,Eastbound,Eastbound,Northbound,Northbound,Northbound,,,,,,,,,,,,,,,,,,,,,,,,,,,Northbound,Northbound,,,,,,,,,,,Northbound,Northbound,Southbound,Southbound
avg_scheduled_service_minutes,36.00,36.00,36.00,38.00,38.00,38.00,156.67,156.67,156.67,159.17,159.17,159.17,43.00,43.00,43.00,14.98,14.98,14.98,56.00,56.00,56.00,50.82,50.82,50.82,41.00,41.00,41.00,166.33,166.33,166.33,72.00,72.00,72.00,70.00,70.00,70.00,41.65,41.65,41.65,53.24,53.24,53.24,35.11,35.11,35.11,30.00,30.00,30.00,119.17,119.17,...,32.08,32.08,31.82,31.82,31.82,42.00,42.00,42.00,41.06,41.06,41.06,40.00,40.00,40.00,30.37,30.37,30.37,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
avg_stop_miles,0.37,0.37,0.37,0.38,0.38,0.38,8.17,8.17,8.17,8.17,8.17,8.17,0.32,0.32,0.32,0.11,0.11,0.11,0.61,0.61,0.61,0.29,0.29,0.29,0.28,0.28,0.28,1.22,1.22,1.22,13.74,13.74,13.74,11.78,11.78,11.78,0.25,0.25,0.25,0.29,0.29,0.29,0.26,0.26,0.26,0.22,0.22,0.22,5.72,5.72,...,6.69,6.69,6.69,6.69,6.69,0.27,0.27,0.27,0.27,0.27,0.27,0.28,0.28,0.28,0.22,0.22,0.22,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
n_trips,19.00,9.00,10.00,18.00,7.00,11.00,12.00,7.00,5.00,12.00,5.00,7.00,16.00,8.00,8.00,28.00,14.00,14.00,11.00,6.00,5.00,11.00,6.00,5.00,22.00,10.00,12.00,9.00,5.00,4.00,5.00,2.00,3.00,5.00,3.00,2.00,18.00,8.00,10.00,17.00,6.00,11.00,18.00,8.00,10.00,12.00,5.00,7.00,6.00,3.00,...,5.00,7.00,11.00,4.00,7.00,18.00,8.00,10.00,18.00,8.00,10.00,18.00,8.00,10.00,19.00,9.00,10.00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
time_period,all_day,,,all_day,,,all_day,,,all_day,,,all_day,,,all_day,,,all_day,,,all_day,,,all_day,,,all_day,,,all_day,,,all_day,,,all_day,,,all_day,,,all_day,,,all_day,,,all_day,,...,,,all_day,,,all_day,,,all_day,,,all_day,,,all_day,,,offpeak,peak,offpeak,peak,offpeak,peak,peak,offpeak,peak,offpeak,peak,offpeak,peak,offpeak,peak,offpeak,peak,offpeak,peak,offpeak,peak,offpeak,peak,offpeak,peak,offpeak,peak,offpeak,peak,offpeak,peak,offpeak,peak
peak_offpeak,,offpeak,peak,,offpeak,peak,,offpeak,peak,,offpeak,peak,,offpeak,peak,,offpeak,peak,,offpeak,peak,,offpeak,peak,,offpeak,peak,,offpeak,peak,,offpeak,peak,,offpeak,peak,,offpeak,peak,,offpeak,peak,,offpeak,peak,,offpeak,peak,,offpeak,...,offpeak,peak,,offpeak,peak,,offpeak,peak,,offpeak,peak,,offpeak,peak,,offpeak,peak,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
frequency,0.79,0.38,0.42,0.75,0.29,0.46,0.50,0.29,0.21,0.50,0.21,0.29,0.67,0.33,0.33,1.17,0.58,0.58,0.46,0.25,0.21,0.46,0.25,0.21,0.92,0.42,0.50,0.38,0.21,0.17,0.21,0.08,0.12,0.21,0.12,0.08,0.75,0.33,0.42,0.71,0.25,0.46,0.75,0.33,0.42,0.50,0.21,0.29,0.25,0.12,...,0.21,0.29,0.46,0.17,0.29,0.75,0.33,0.42,0.75,0.33,0.42,0.75,0.33,0.42,0.79,0.38,0.42,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


### Fix Map: `gtfs_digest/merge_operator_data`

In [None]:
OPERATOR_FILE = GTFS_DATA_DICT.digest_tables.operator_profiles
OPERATOR_ROUTE = GTFS_DATA_DICT.digest_tables.operator_routes_map

In [None]:
operator_route_gdf = gpd.read_parquet(
    f"{RT_SCHED_GCS}{OPERATOR_ROUTE}.parquet",
)

In [None]:
operator_route_gdf.columns

In [None]:
operator_route_gdf2.columns

In [None]:
len(operator_route_gdf2)

In [None]:
operator_route_gdf2.is_rail.value_counts()

In [None]:
operator_route_gdf2.organization_name.value_counts()

In [None]:
operator_route_gdf2.schedule_gtfs_dataset_key.unique()

#### Why does City of Santa Maria have multiple schedule_gtfs_dataset_keys?

In [None]:
operator_route_gdf2.groupby(["organization_name", "schedule_gtfs_dataset_key"]).agg(
    {"route_short_name": "nunique"}
)

In [None]:
operator_route_gdf2.drop(columns=["service_date"]).explore("organization_name")

In [None]:
# operator_route_gdf2.drop(columns = ["service_date"]).explore("shape_array_key")

#### Starting from here [`gtfs_funnel/operator_scheduled_stats`](https://github.com/cal-itp/data-analyses/blob/4dc340343a60b45ad94217c3efd91f807b03ebc2/gtfs_funnel/operator_scheduled_stats.py#L148)

In [None]:
analysis_date = "2024-11-13"

In [None]:
schd_keys = list(operator_route_gdf2.schedule_gtfs_dataset_key.unique())

#### Longest shape does have all the routes...

In [None]:
longest_shape_gdf = operator_scheduled_stats.longest_shape_by_route(analysis_date)

In [None]:
longest_shape_gdf2 = longest_shape_gdf.loc[
    longest_shape_gdf.schedule_gtfs_dataset_key.isin(schd_keys)
]

In [None]:
longest_shape_gdf2.columns

In [None]:
longest_shape_gdf2.info()

In [None]:
longest_shape_gdf2.route_id.value_counts()

In [None]:
# longest_shape_gdf2.explore("schedule_gtfs_dataset_key")

In [None]:
longest_shape_gdf2.groupby(["schedule_gtfs_dataset_key", "route_id"]).agg(
    {"route_length_miles": "max"}
)

#### Somewhere along the way the routes are cut...maybe b/c of `direction_id`

In [None]:
OPERATOR_EXPORT = GTFS_DATA_DICT.schedule_tables.operator_scheduled_stats

In [None]:
SCHED_GCS

In [None]:
GTFS_DATA_DICT.schedule_tables.operator_routes

In [None]:
dec_url = "gs://calitp-analytics-data/data-analyses/gtfs_schedule/operator_profiles/operator_routes_2024-12-11.parquet"

In [None]:
dec_df = gpd.read_parquet(dec_url)

In [None]:
dec_df.organization_name.value_counts().head()

In [None]:
dec_df.loc[
    dec_df.organization_name == "Alameda-Contra Costa Transit District"
].head().drop(columns=["geometry"]).T

In [None]:
dec_df2 = dec_df.loc[dec_df.schedule_gtfs_dataset_key.isin(schd_keys)]

In [None]:
dec_df2.shape

In [None]:
type(dec_df2)

In [None]:
dec_df2.drop(columns=["geometry"]).T

In [None]:
# dec_df2.explore()

#### Find where in `gtfs_funnel` all the routes disappear

In [None]:
group_cols = ["schedule_gtfs_dataset_key"]

In [None]:
longest_shape_gdf2.info()

#### something is going on in `operator_scheduled_stats.schedule_stats_by_operator`

In [None]:
ROUTE_TYPOLOGY = GTFS_DATA_DICT.schedule_tables.route_typologies

In [None]:
route_typology = pd.read_parquet(f"{SCHED_GCS}{ROUTE_TYPOLOGY}_{analysis_date}.parquet")

In [None]:
from route_typologies import route_typologies

In [None]:
route_typology_grouped = (
    route_typology.groupby(["schedule_gtfs_dataset_key", "route_id"])
    .agg({**{f"is_{c}": "sum" for c in route_typologies}})
    .reset_index()
)

In [None]:
route_typology_grouped2 = route_typology_grouped.loc[
    route_typology_grouped.schedule_gtfs_dataset_key.isin(schd_keys)
]

#### Routes are missing for Santa Maria and Capitol Corridor in `ROUTE_TYPOLOGY`

In [None]:
route_typology_grouped2.T

In [None]:
route_gdf = longest_shape_gdf2.merge(
    route_typology_grouped2, on=["schedule_gtfs_dataset_key", "route_id"], how="outer"
)

In [None]:
route_gdf.shape

In [None]:
route_gdf.drop(columns=["geometry"])

In [None]:
# route_gdf2.explore("schedule_gtfs_dataset_key")

#### Change merge from `inner` to `left`

In [None]:
f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_routes_map}.parquet"

In [None]:
SCHED_GCS

In [None]:
GTFS_DATA_DICT.schedule_tables.operator_routes

In [None]:
my_test_url = "gs://calitp-analytics-data/data-analyses/gtfs_schedule/operator_profiles/operator_routes_2024-12-11_AH.parquet"

In [None]:
f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_profiles}.parquet"

In [None]:
test_gdf = gpd.read_parquet(my_test_url)

In [None]:
test_gdf2 = test_gdf.loc[test_gdf.schedule_gtfs_dataset_key.isin(schd_keys)]

In [None]:
test_gdf2.explore("route_id")

#### Test with all the dates.

In [None]:
GTFS_DATA_DICT.schedule_tables.operator_routes

In [None]:
RT_SCHED_GCS

In [None]:
f"{OPERATOR_ROUTE}_AH_test"

In [None]:
f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_routes_map}.parquet"

In [None]:
test_df = gpd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/digest/operator_routes_AH_test.parquet"
)

In [None]:
test_df.columns

In [None]:
op_routes_gdf = test_df.loc[test_df.organization_name.isin(org_name_lists)]

In [None]:
# Find the most recent geography for each route.
op_routes_gdf = op_routes_gdf.sort_values(by=["service_date"], ascending=False)

# Keep only the most recent row.
op_routes_gdf = op_routes_gdf.drop_duplicates(
    subset=["route_long_name", "route_short_name", "route_combined_name"]
)

# Drop service_dates
op_routes_gdf = op_routes_gdf.drop(columns=["service_date"])

In [None]:
op_routes_gdf.organization_name.value_counts()

In [None]:
op_routes_gdf.loc[op_routes_gdf.organization_name == "City of Santa Maria"].explore(
    "route_long_name"
)