## Find Missing Routes: 2 operators. 
* [Issue](https://github.com/cal-itp/data-analyses/issues/1312): Capital Corridor doesn't have any rail routes. 
* [Most of Santa Maria's routes not showing up in GTFS Digest](https://github.com/cal-itp/data-analyses/issues/1313)
* `cd data-analyses/rt_segment_speeds && pip install -r requirements.txt && cd ../_shared_utils && make setup_env && cd ../gtfs_digest`
* 1/7: the routes are appearing in `the longest shape` but not appearing in `route_typologies`

In [1]:
import geopandas as gpd
import merge_data
import numpy as np
import pandas as pd
from segment_speed_utils import (
    gtfs_schedule_wrangling,
    helpers,
    metrics,
    segment_calcs,
    time_series_utils,
)
from shared_utils import (
    catalog_utils,
    portfolio_utils,
    rt_dates,
    rt_utils,
    time_helpers,
)
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
org_name_lists = ["Capitol Corridor Joint Powers Authority", "City of Santa Maria"]

In [4]:
analysis_date_list = ["2024-11-13"]

In [5]:
one_analysis_date = "2024-11-13"

In [6]:
schd_keys = [
    "5a8721fe96786fcd25fba1f8a0ee6358",
    "73105f2d1cabc8170ab066d96863c5d5",
    "f5a749dd65924e025b1293c58f95f8d6",
]

In [7]:
import sys

sys.path.append("../gtfs_funnel/")
import operator_scheduled_stats
import schedule_stats_by_route_direction

In [8]:
def preview(df):
    df2 = df[
        ["schedule_gtfs_dataset_key", "route_id", "direction_id"]
    ].drop_duplicates()
    display(df2)

### Fix `schd_vp_url`

In [9]:
schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"

In [10]:
schd_vp_df = pd.read_parquet(schd_vp_url)

In [11]:
schd_vp_df2 = schd_vp_df.loc[schd_vp_df.organization_name.isin(org_name_lists)]

In [12]:
schd_vp_df2.route_id.unique()

array(['b3848f93-d26b-48a9-b6a6-5de22a4eab47', '5', 'Shuttle', 'CC'],
      dtype=object)

In [13]:
schd_vp_df2.route_id.value_counts()

Shuttle                                 132
CC                                       84
5                                        67
b3848f93-d26b-48a9-b6a6-5de22a4eab47      6
Name: route_id, dtype: int64

In [14]:
schd_vp_df2.time_period.unique()

array(['all_day', 'offpeak', 'peak', None], dtype=object)

In [15]:
schd_vp_df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 289 entries, 120423 to 367819
Data columns (total 47 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   schedule_gtfs_dataset_key        289 non-null    object        
 1   direction_id                     289 non-null    float64       
 2   time_period                      277 non-null    object        
 3   avg_scheduled_service_minutes    198 non-null    float64       
 4   avg_stop_miles                   198 non-null    float64       
 5   n_scheduled_trips                289 non-null    int64         
 6   frequency                        198 non-null    float64       
 7   is_express                       132 non-null    float64       
 8   is_rapid                         132 non-null    float64       
 9   is_rail                          132 non-null    float64       
 10  is_coverage                      132 non-null    float

###  Check out `rt_segment_speeds/segment_speed_utils/gtfs_schedule_wrangling`
* https://github.com/cal-itp/data-analyses/blob/4dc340343a60b45ad94217c3efd91f807b03ebc2/rt_segment_speeds/segment_speed_utils/gtfs_schedule_wrangling.py 
* Tiffany: <i>Can you try specifying the dropna argument inside pandas groupby? Our pandas version has gone through upgrades, from 0.25 to now 1.5 ), and this argument was introduced in 1.1 and since it's dropna=True, that's probably what's driving the the row behavior.</i>
* It worked! Now time to rerun stuff further down the pipeline and see what happens.

In [16]:
def most_common_shape_by_route_direction(analysis_date: str) -> gpd.GeoDataFrame:
    """
    Find shape_id with most trips for that route-direction.
    Merge in shape geometry.
    """
    route_dir_cols = ["gtfs_dataset_key", "route_id", "direction_id"]

    keep_trip_cols = route_dir_cols + [
        "trip_instance_key",
        "shape_id",
        "shape_array_key",
    ]

    trips = helpers.import_scheduled_trips(
        analysis_date, columns=keep_trip_cols, get_pandas=True
    ).rename(columns={"schedule_gtfs_dataset_key": "gtfs_dataset_key"})
    sorting_order = [True for i in route_dir_cols]

    most_common_shape = (
        trips.groupby(
            route_dir_cols + ["shape_id", "shape_array_key"],
            observed=True,
            group_keys=False,
            dropna=False,
        )
        .agg({"trip_instance_key": "count"})
        .reset_index()
        .sort_values(
            route_dir_cols + ["trip_instance_key"], ascending=sorting_order + [False]
        )
        .drop_duplicates(subset=route_dir_cols)
        .reset_index(drop=True)[route_dir_cols + ["shape_id", "shape_array_key"]]
    ).rename(
        columns={
            "gtfs_dataset_key": "schedule_gtfs_dataset_key",
            "shape_id": "common_shape_id",
        }
    )

    shape_geom = helpers.import_scheduled_shapes(
        analysis_date,
        columns=["shape_array_key", "geometry"],
    )

    common_shape_geom = pd.merge(
        shape_geom, most_common_shape, on="shape_array_key", how="inner"
    ).drop(columns="shape_array_key")

    route_info = (
        helpers.import_scheduled_trips(
            analysis_date,
            columns=[
                "gtfs_dataset_key",
                "route_id",
                "route_long_name",
                "route_short_name",
                "route_desc",
            ],
        )
        .drop_duplicates()
        .pipe(portfolio_utils.add_route_name)
        .drop(columns=["route_long_name", "route_short_name", "route_desc"])
    )

    del shape_geom, most_common_shape

    common_shape_geom2 = pd.merge(
        common_shape_geom,
        route_info.rename(columns={"route_name_used": "route_name"}),
        on=["schedule_gtfs_dataset_key", "route_id"],
    )

    return common_shape_geom2

In [17]:
common_shape_test = most_common_shape_by_route_direction(one_analysis_date)

In [18]:
common_shape_test.columns

Index(['geometry', 'schedule_gtfs_dataset_key', 'route_id', 'direction_id',
       'common_shape_id', 'route_name'],
      dtype='object')

In [19]:
common_shape_test2 = common_shape_test.loc[
    common_shape_test.schedule_gtfs_dataset_key.isin(schd_keys)
]

In [20]:
common_shape_test2.route_id.unique()

array(['7', '6', 'CC', '8', 'Mall', '12X', '13X', '11', '30', 'Shuttle',
       '8a7c42f9-51e4-4848-bf88-30c210f149ad', '2', '3', '1B', '20', 'SF',
       '5', '4', '9', '1'], dtype=object)

#### I think not having anything in `direction_id` is messing everything up. 

In [21]:
common_shape_test2.drop(columns=["geometry"]).head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,common_shape_id,route_name
167,73105f2d1cabc8170ab066d96863c5d5,7,,715be44b-4dee-4c56-83f8-b1970d6133cf,"Rt 7. A. H. College, Crossroads Shopping Center via Boone St, Bradley Rd."
193,73105f2d1cabc8170ab066d96863c5d5,6,,de042d01-f50a-4b67-ba25-4628643021fa,Rt 6. Oak Knolls to Old Orcutt-East to West-Outbound


### Breakdown `gtfs_digest/merge_data.`

#### Line 294: `df_sched` is already missing a lot of the routes.

In [22]:
# Get cardinal direction for each route
df_sched_og = merge_data.concatenate_schedule_by_route_direction(analysis_date_list)

In [23]:
df_sched2_og = df_sched_og.loc[df_sched_og.schedule_gtfs_dataset_key.isin(schd_keys)]

In [24]:
df_sched2_og.route_id.value_counts()

Shuttle    6
5          3
Name: route_id, dtype: int64

##### Go back to `gtfs_funnel/schedule_stats_by_route_direction`
* https://github.com/cal-itp/data-analyses/blob/1ba0f544a01f99966a6e210dd11666b4fe4a146e/gtfs_funnel/schedule_stats_by_route_direction.py#L190
* Test 1: Updated `gtfs_schedule_wrangling` but a lot of routes are still missing. 

##### `assemble_scheduled_trip_metrics`: nothing is missing

In [25]:
trip_metrics = schedule_stats_by_route_direction.assemble_scheduled_trip_metrics(
    one_analysis_date, GTFS_DATA_DICT
)

In [26]:
trip_metrics.head(1)

Unnamed: 0,schedule_gtfs_dataset_key,trip_instance_key,median_stop_meters,time_of_day,scheduled_service_minutes,route_id,direction_id
0,0139b1253130b33adcd4b3a4490530d2,014dd8051849e5252df704ca9c381fd9,559.44,PM Peak,23.0,D4,


In [27]:
trip_metrics2 = trip_metrics.loc[trip_metrics.schedule_gtfs_dataset_key.isin(schd_keys)]

In [28]:
trip_metrics2.columns

Index(['schedule_gtfs_dataset_key', 'trip_instance_key', 'median_stop_meters',
       'time_of_day', 'scheduled_service_minutes', 'route_id', 'direction_id'],
      dtype='object')

In [29]:
trip_metrics2.shape

(335, 7)

In [30]:
trip_metrics2.time_of_day.unique()

array(['PM Peak', 'Midday', 'AM Peak', 'Early AM', 'Evening'],
      dtype=object)

##### Each row is populated.

In [31]:
trip_metrics2.loc[trip_metrics2.time_of_day == "Midday"].drop_duplicates(
    subset=["schedule_gtfs_dataset_key", "route_id", "direction_id"]
).drop(columns=["schedule_gtfs_dataset_key", "trip_instance_key"])

Unnamed: 0,median_stop_meters,time_of_day,scheduled_service_minutes,route_id,direction_id
49725,405.04,Midday,35.0,3,
49729,178.05,Midday,14.98,Mall,
49731,451.15,Midday,41.0,11,
49736,361.12,Midday,30.0,1B,
49737,357.22,Midday,30.37,1,
49738,444.75,Midday,40.0,9,
49741,440.62,Midday,41.0,4,
49742,989.61,Midday,56.0,12X,
49744,437.51,Midday,42.0,5,0.0
49754,477.41,Midday,53.0,13X,


In [32]:
preview(trip_metrics2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id
49724,73105f2d1cabc8170ab066d96863c5d5,30,
49725,73105f2d1cabc8170ab066d96863c5d5,3,
49727,73105f2d1cabc8170ab066d96863c5d5,20,
49728,73105f2d1cabc8170ab066d96863c5d5,4,
49729,73105f2d1cabc8170ab066d96863c5d5,Mall,
49730,73105f2d1cabc8170ab066d96863c5d5,5,0.0
49731,73105f2d1cabc8170ab066d96863c5d5,11,
49732,73105f2d1cabc8170ab066d96863c5d5,7,
49733,73105f2d1cabc8170ab066d96863c5d5,9,
49735,73105f2d1cabc8170ab066d96863c5d5,1,


##### `gtfs_funnel/schedule_stats_by_route_direction/schedule_metrics_by_route_direction` 
* update to `dropna=False`

In [33]:
from shared_utils.rt_utils import METERS_PER_MILE

In [34]:
def schedule_metrics_by_route_direction(
    df: pd.DataFrame,
    analysis_date: str,
    group_merge_cols: list,
) -> pd.DataFrame:
    """
    Aggregate trip-level metrics to route-direction, and
    attach shape geometry for common_shape_id.
    """
    service_freq_df = gtfs_schedule_wrangling.aggregate_time_of_day_to_peak_offpeak(
        df, group_merge_cols, long_or_wide="long"
    )

    metrics_df = (
        df.groupby(group_merge_cols, observed=True, group_keys=False, dropna=False)
        .agg(
            {
                "median_stop_meters": "mean",
                # take mean of the median stop spacing for trip
                # does this make sense?
                # median is the single boiled down metric at the trip-level
                "scheduled_service_minutes": "mean",
            }
        )
        .reset_index()
        .rename(
            columns={
                "median_stop_meters": "avg_stop_meters",
                "scheduled_service_minutes": "avg_scheduled_service_minutes",
            }
        )
    )

    metrics_df = metrics_df.assign(
        avg_stop_miles=metrics_df.avg_stop_meters.divide(METERS_PER_MILE).round(2)
    ).drop(columns=["avg_stop_meters"])

    round_me = ["avg_stop_miles", "avg_scheduled_service_minutes"]
    metrics_df[round_me] = metrics_df[round_me].round(2)

    common_shape = gtfs_schedule_wrangling.most_common_shape_by_route_direction(
        analysis_date
    ).pipe(helpers.remove_shapes_outside_ca)

    df = pd.merge(common_shape, metrics_df, on=group_merge_cols, how="inner").merge(
        service_freq_df, on=group_merge_cols, how="inner"
    )

    return df

##### The routes are still missing even after `dropna=False` because `direction_id` is missing.

In [35]:
route_group_merge_cols = ["schedule_gtfs_dataset_key", "route_id", "direction_id"]

In [36]:
route_dir_metrics_og = schedule_metrics_by_route_direction(
    trip_metrics2, one_analysis_date, route_group_merge_cols
)

In [37]:
route_dir_metrics_og.drop(
    columns=["geometry", "schedule_gtfs_dataset_key", "common_shape_id"]
)

Unnamed: 0,route_id,direction_id,route_name,avg_scheduled_service_minutes,avg_stop_miles,n_trips,time_period,peak_offpeak,frequency
0,Shuttle,1.0,Shuttle to Auburn,72.0,13.74,5,all_day,,0.21
1,Shuttle,1.0,Shuttle to Auburn,72.0,13.74,2,,offpeak,0.08
2,Shuttle,1.0,Shuttle to Auburn,72.0,13.74,3,,peak,0.12
3,Shuttle,0.0,Shuttle to Auburn,70.0,11.78,5,all_day,,0.21
4,Shuttle,0.0,Shuttle to Auburn,70.0,11.78,3,,offpeak,0.12
5,Shuttle,0.0,Shuttle to Auburn,70.0,11.78,2,,peak,0.08
6,5,0.0,"Rt 5. Transit Center to Gov't Cntr to Evergreen Shopping Center via Miller St., S.M. Way",42.0,0.27,18,all_day,,0.75
7,5,0.0,"Rt 5. Transit Center to Gov't Cntr to Evergreen Shopping Center via Miller St., S.M. Way",42.0,0.27,8,,offpeak,0.33
8,5,0.0,"Rt 5. Transit Center to Gov't Cntr to Evergreen Shopping Center via Miller St., S.M. Way",42.0,0.27,10,,peak,0.42


##### Looking at the dataframe using the original script in `schedule_stats_by_route_direction.schedule_metrics_by_route_direction`: a lot of missing values in `time_period`?

In [38]:
route_dir_metrics_script_og = (
    schedule_stats_by_route_direction.schedule_metrics_by_route_direction(
        trip_metrics2, one_analysis_date, route_group_merge_cols
    )
)

In [39]:
route_dir_metrics_script_og.drop(
    columns=["geometry", "schedule_gtfs_dataset_key", "common_shape_id"]
)

Unnamed: 0,route_id,direction_id,route_name,avg_scheduled_service_minutes,avg_stop_miles,n_trips,time_period,peak_offpeak,frequency
0,Shuttle,1.0,Shuttle to Auburn,72.0,13.74,5,all_day,,0.21
1,Shuttle,1.0,Shuttle to Auburn,72.0,13.74,2,,offpeak,0.08
2,Shuttle,1.0,Shuttle to Auburn,72.0,13.74,3,,peak,0.12
3,Shuttle,0.0,Shuttle to Auburn,70.0,11.78,5,all_day,,0.21
4,Shuttle,0.0,Shuttle to Auburn,70.0,11.78,3,,offpeak,0.12
5,Shuttle,0.0,Shuttle to Auburn,70.0,11.78,2,,peak,0.08
6,5,0.0,"Rt 5. Transit Center to Gov't Cntr to Evergreen Shopping Center via Miller St., S.M. Way",42.0,0.27,18,all_day,,0.75
7,5,0.0,"Rt 5. Transit Center to Gov't Cntr to Evergreen Shopping Center via Miller St., S.M. Way",42.0,0.27,8,,offpeak,0.33
8,5,0.0,"Rt 5. Transit Center to Gov't Cntr to Evergreen Shopping Center via Miller St., S.M. Way",42.0,0.27,10,,peak,0.42


In [40]:
route_dir_metrics_og.route_id.unique()

array(['Shuttle', '5'], dtype=object)

##### `schedule_metrics_by_route_direction` update `dropna=False`

In [41]:
from shared_utils.rt_utils import METERS_PER_MILE

In [42]:
route_group_merge_cols_no_dir_id = [
    "schedule_gtfs_dataset_key",
    "route_id",
]

In [43]:
route_dir_metrics_wo_dir_id = schedule_metrics_by_route_direction(
    trip_metrics2, one_analysis_date, route_group_merge_cols_no_dir_id
)

##### All of the routes appear after excluding `direction_id` from `groupby` and adding `dropna=False` with the proper amount of data.

In [44]:
route_dir_metrics_wo_dir_id.drop(
    columns=["geometry", "schedule_gtfs_dataset_key", "common_shape_id"]
)

Unnamed: 0,route_id,direction_id,route_name,avg_scheduled_service_minutes,avg_stop_miles,n_trips,time_period,peak_offpeak,frequency
0,7,0.0,"Rt 7. A. H. College, Crossroads Shopping Center via Boone St, Bradley Rd.",36.0,0.37,19,all_day,,0.79
1,7,0.0,"Rt 7. A. H. College, Crossroads Shopping Center via Boone St, Bradley Rd.",36.0,0.37,9,,offpeak,0.38
2,7,0.0,"Rt 7. A. H. College, Crossroads Shopping Center via Boone St, Bradley Rd.",36.0,0.37,10,,peak,0.42
3,6,0.0,Rt 6. Oak Knolls to Old Orcutt-East to West-Outbound,38.0,0.38,18,all_day,,0.75
4,6,0.0,Rt 6. Oak Knolls to Old Orcutt-East to West-Outbound,38.0,0.38,7,,offpeak,0.29
5,6,0.0,Rt 6. Oak Knolls to Old Orcutt-East to West-Outbound,38.0,0.38,11,,peak,0.46
6,8,0.0,Rt 8. Tanglewood to Crossroads Shopping Center via McCoy Ln.,43.0,0.32,16,all_day,,0.67
7,8,0.0,Rt 8. Tanglewood to Crossroads Shopping Center via McCoy Ln.,43.0,0.32,8,,offpeak,0.33
8,8,0.0,Rt 8. Tanglewood to Crossroads Shopping Center via McCoy Ln.,43.0,0.32,8,,peak,0.33
9,Mall,0.0,Mall Shuttle,14.98,0.11,28,all_day,,1.17


##### `gtfs_schedule_wrangling.aggregate_time_of_day_to_peak_offpeak` is missing a lot of routes -> break it out.

In [45]:
service_freq_df = gtfs_schedule_wrangling.aggregate_time_of_day_to_peak_offpeak(
    trip_metrics2, route_group_merge_cols, long_or_wide="long"
)

In [46]:
service_freq_df.route_id.unique()

array(['5', 'CC', 'SF', 'Shuttle'], dtype=object)

In [47]:
service_freq_df.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,n_trips,time_period,peak_offpeak,frequency
0,73105f2d1cabc8170ab066d96863c5d5,5,0.0,18,all_day,,0.75
1,f5a749dd65924e025b1293c58f95f8d6,CC,0.0,12,all_day,,0.5


##### Changed `count_trips_by_group` to have argument `dropna=False` in `groupby`

In [48]:
def count_trips_by_group(df: pd.DataFrame, group_cols: list):
    """
    Given a df with trip_instance_key and an arbitrary list of
    group_cols, return trip counts by group.
    """
    assert "trip_instance_key" in df.columns
    df = (
        df.groupby(group_cols, dropna=False)
        .agg({"trip_instance_key": "count"})
        .reset_index()
    )
    df = df.rename(columns={"trip_instance_key": "n_trips"})
    return df

In [49]:
def add_peak_offpeak_column(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add a single peak_offpeak column based on the time-of-day column.
    """
    df = df.assign(peak_offpeak=df.time_of_day.map(time_helpers.TIME_OF_DAY_DICT))

    return df

In [50]:
def aggregate_time_of_day_to_peak_offpeak(
    df: pd.DataFrame,
    group_cols: list,
    long_or_wide: str,
) -> pd.DataFrame:
    """
    Aggregate time-of-day bins into peak/offpeak periods.
    Return n_trips and frequency for grouping of columns (route-direction, etc).
    Allow wide or long df to be returned.
    """
    peak_hours = sum(
        v
        for k, v in time_helpers.HOURS_BY_TIME_OF_DAY.items()
        if k in time_helpers.PEAK_PERIODS
    )

    offpeak_hours = sum(
        v
        for k, v in time_helpers.HOURS_BY_TIME_OF_DAY.items()
        if k not in time_helpers.PEAK_PERIODS
    )

    df = add_peak_offpeak_column(df)

    all_day = count_trips_by_group(df, group_cols).assign(time_period="all_day")
    peak_offpeak = count_trips_by_group(df, group_cols + ["peak_offpeak"]).rename(
        {"peak_offpeak": "time_period"}
    )

    df2 = pd.concat([all_day, peak_offpeak], axis=0, ignore_index=True)

    # Add service frequency (trips per hour)
    # there are different number of hours in peak and offpeak periods
    df2 = df2.assign(
        frequency=df2.apply(
            lambda x: round(x.n_trips / peak_hours, 2)
            if x.time_period == "peak"
            else round(x.n_trips / offpeak_hours, 2)
            if x.time_period == "offpeak"
            else round(x.n_trips / (peak_hours + offpeak_hours), 2),
            axis=1,
        )
    )

    if long_or_wide == "long":
        return df2

    elif long_or_wide == "wide":
        # Reshape from wide to long
        # get rid of multiindex column names
        df3 = df2.pivot(
            index=group_cols, columns="time_period", values=["n_trips", "frequency"]
        )

        df3.columns = [f"{b}_{a}" for a, b in df3.columns]
        df3 = df3.reset_index()

        return df3

In [51]:
service_freq_df_test1 = aggregate_time_of_day_to_peak_offpeak(
    trip_metrics2, route_group_merge_cols, long_or_wide="long"
)

In [52]:
service_freq_df_test1.route_id.unique()

array(['1', '11', '12X', '13X', '1B', '2', '20', '3', '30', '4', '5', '6',
       '7', '8', '8a7c42f9-51e4-4848-bf88-30c210f149ad', '9', 'Mall',
       'CC', 'SF', 'Shuttle'], dtype=object)

##### `metrics_df` portion of `gtfs_funnel.schedule_stas_by_route_direction.schedule_metrics_by_route_direction`
* Did `dropna=False` to get all the routes. 
* Without `dropna=False`, all the routes disappear.

In [53]:
metrics_df = (
    trip_metrics2.groupby(
        route_group_merge_cols, observed=True, group_keys=False, dropna=False
    )
    .agg(
        {
            "median_stop_meters": "mean",
            # take mean of the median stop spacing for trip
            # does this make sense?
            # median is the single boiled down metric at the trip-level
            "scheduled_service_minutes": "mean",
        }
    )
    .reset_index()
    .rename(
        columns={
            "median_stop_meters": "avg_stop_meters",
            "scheduled_service_minutes": "avg_scheduled_service_minutes",
        }
    )
)

In [54]:
metrics_df = metrics_df.assign(
    avg_stop_miles=metrics_df.avg_stop_meters.divide(METERS_PER_MILE).round(2)
).drop(columns=["avg_stop_meters"])

In [55]:
round_me = ["avg_stop_miles", "avg_scheduled_service_minutes"]
metrics_df[round_me] = metrics_df[round_me].round(2)

##### End of the function `gtfs_funnel/schedule_stats_by_route_direction/schedule_metrics_by_route_direction` -> called `route_dir_metrics` in `if __name__ == "__main__"` part.

In [56]:
route_dir_metrics = pd.merge(
    common_shape_test2, metrics_df, on=route_group_merge_cols, how="inner"
).merge(service_freq_df_test1, on=route_group_merge_cols, how="inner")

##### In `if __name__ == "__main__"` gtfs_funnel/schedule_stats_by_route`

In [57]:
ROUTE_TYPOLOGIES = GTFS_DATA_DICT.schedule_tables.route_typologies
route_typologies = pd.read_parquet(
    f"{SCHED_GCS}{ROUTE_TYPOLOGIES}_{one_analysis_date}.parquet",
    columns=route_group_merge_cols
    + [
        "is_coverage",
        "is_downtown_local",
        "is_local",
        "is_rapid",
        "is_express",
        "is_rail",
    ],
)

In [58]:
route_typologies.loc[route_typologies.schedule_gtfs_dataset_key.isin(schd_keys)]

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,is_coverage,is_downtown_local,is_local,is_rapid,is_express,is_rail
1416,f5a749dd65924e025b1293c58f95f8d6,Shuttle,1.0,1,0,0,0,0,0
3572,73105f2d1cabc8170ab066d96863c5d5,13X,0.0,1,0,0,1,0,0
3573,73105f2d1cabc8170ab066d96863c5d5,20,0.0,1,0,0,0,0,0
3574,73105f2d1cabc8170ab066d96863c5d5,12X,0.0,1,0,0,1,1,0
3575,73105f2d1cabc8170ab066d96863c5d5,30,0.0,1,0,0,1,0,0
3576,73105f2d1cabc8170ab066d96863c5d5,2,0.0,0,1,0,1,0,0
3577,73105f2d1cabc8170ab066d96863c5d5,1,0.0,1,0,0,1,0,0
3578,73105f2d1cabc8170ab066d96863c5d5,1B,0.0,1,0,0,1,0,0
3579,73105f2d1cabc8170ab066d96863c5d5,4,0.0,1,0,0,1,0,0
3580,73105f2d1cabc8170ab066d96863c5d5,7,0.0,1,0,0,1,0,0


##### `cardinal_dir_df` also gets rid of a lot of stuff -> check it out. 

In [59]:
cardinal_dir_df = (
    schedule_stats_by_route_direction.cardinal_direction_for_route_direction(
        one_analysis_date, GTFS_DATA_DICT
    )
)

In [60]:
cardinal_dir_df2 = cardinal_dir_df.loc[
    cardinal_dir_df.schedule_gtfs_dataset_key.isin(schd_keys)
]

In [61]:
preview(cardinal_dir_df2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id
2581,73105f2d1cabc8170ab066d96863c5d5,5,0.0
3947,f5a749dd65924e025b1293c58f95f8d6,CC,0.0
3948,f5a749dd65924e025b1293c58f95f8d6,CC,1.0
4225,f5a749dd65924e025b1293c58f95f8d6,SF,0.0
4226,f5a749dd65924e025b1293c58f95f8d6,SF,1.0
4254,f5a749dd65924e025b1293c58f95f8d6,Shuttle,0.0
4255,f5a749dd65924e025b1293c58f95f8d6,Shuttle,1.0


In [62]:
STOP_TIMES_FILE = GTFS_DATA_DICT.rt_vs_schedule_tables.stop_times_direction

In [63]:
stop_times_df = pd.read_parquet(
    f"{RT_SCHED_GCS}{STOP_TIMES_FILE}_{one_analysis_date}.parquet",
    filters=[[("stop_primary_direction", "!=", "Unknown")]],
)

In [64]:
stop_times_df2 = stop_times_df.loc[
    stop_times_df.schedule_gtfs_dataset_key.isin(schd_keys)
]

In [65]:
stop_times_df2.columns

Index(['feed_key', 'stop_id', 'stop_sequence', 'schedule_gtfs_dataset_key',
       'trip_instance_key', 'shape_array_key', 'stop_name', 'geometry',
       'prior_stop_sequence', 'subseq_stop_sequence', 'stop_pair',
       'stop_pair_name', 'stop_primary_direction', 'stop_meters'],
      dtype='object')

In [66]:
trip_scheduled_col = [
    "route_id",
    "trip_instance_key",
    "gtfs_dataset_key",
    "shape_array_key",
    "direction_id",
    "route_long_name",
    "route_short_name",
    "route_desc",
    "name",
]

trips_df = helpers.import_scheduled_trips(
    one_analysis_date, columns=trip_scheduled_col, get_pandas=True
)

In [67]:
merge_cols = ["trip_instance_key", "schedule_gtfs_dataset_key", "shape_array_key"]

In [68]:
stop_times_with_trip = pd.merge(stop_times_df2, trips_df, on=merge_cols)

In [69]:
preview(stop_times_with_trip)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id
0,73105f2d1cabc8170ab066d96863c5d5,30,
26,73105f2d1cabc8170ab066d96863c5d5,3,
66,73105f2d1cabc8170ab066d96863c5d5,20,
75,f5a749dd65924e025b1293c58f95f8d6,Shuttle,0.0
79,73105f2d1cabc8170ab066d96863c5d5,4,
104,f5a749dd65924e025b1293c58f95f8d6,SF,1.0
105,73105f2d1cabc8170ab066d96863c5d5,Mall,
111,73105f2d1cabc8170ab066d96863c5d5,5,0.0
147,73105f2d1cabc8170ab066d96863c5d5,11,
168,73105f2d1cabc8170ab066d96863c5d5,7,


In [70]:
main_cols = ["route_id", "schedule_gtfs_dataset_key", "direction_id"]

##### Changing dropna=False here too

In [71]:
agg1 = (
    stop_times_with_trip.groupby(main_cols + ["stop_primary_direction"], dropna=False)
    .agg({"stop_sequence": "count"})
    .reset_index()
    .rename(columns={"stop_sequence": "total_stops"})
)

In [72]:
agg2 = agg1.sort_values(
    by=main_cols + ["total_stops"],
    ascending=[True, True, True, False],
)

##### There are values for `route_primary_direction` but because `direction_id` is missing, it goes away? 
* AH: testing to see if filling `direction_id` with something will change things.

In [73]:
cardinal_dir_df = (
    agg2.drop_duplicates(subset=main_cols)
    .reset_index(drop=True)
    .drop(columns=["total_stops"])
    .rename(columns={"stop_primary_direction": "route_primary_direction"})
)

cardinal_dir_df.direction_id = cardinal_dir_df.direction_id.fillna(0)

In [74]:
cardinal_dir_df

Unnamed: 0,route_id,schedule_gtfs_dataset_key,direction_id,route_primary_direction
0,1,73105f2d1cabc8170ab066d96863c5d5,0.0,Northbound
1,11,73105f2d1cabc8170ab066d96863c5d5,0.0,Northbound
2,12X,73105f2d1cabc8170ab066d96863c5d5,0.0,Northbound
3,13X,73105f2d1cabc8170ab066d96863c5d5,0.0,Westbound
4,1B,73105f2d1cabc8170ab066d96863c5d5,0.0,Northbound
5,2,73105f2d1cabc8170ab066d96863c5d5,0.0,Westbound
6,20,73105f2d1cabc8170ab066d96863c5d5,0.0,Eastbound
7,3,73105f2d1cabc8170ab066d96863c5d5,0.0,Eastbound
8,30,73105f2d1cabc8170ab066d96863c5d5,0.0,Southbound
9,4,73105f2d1cabc8170ab066d96863c5d5,0.0,Southbound


In [75]:
route_group_merge_cols = ["schedule_gtfs_dataset_key", "route_id", "direction_id"]

In [76]:
route_group_merge_cols_wo_dir_id = [
    "schedule_gtfs_dataset_key",
    "route_id",
]

##### TO-DO: `route_typologies` is missing info for the missing routes. 

In [77]:
# route_typologies.head(2)

In [78]:
# route_typologies.loc[route_typologies.schedule_gtfs_dataset_key.isin(schd_keys)]

In [79]:
# route_dir_metrics.head(2).drop(columns = ["geometry"])

In [80]:
# route_dir_metrics.drop(columns = ["geometry"])

In [81]:
route_typologies.head(1)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,is_coverage,is_downtown_local,is_local,is_rapid,is_express,is_rail
0,1770249a5a2e770ca90628434d4934b1,3407,0.0,1,0,0,1,0,0


In [82]:
route_typologies.loc[route_typologies.schedule_gtfs_dataset_key.isin(schd_keys)]

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,is_coverage,is_downtown_local,is_local,is_rapid,is_express,is_rail
1416,f5a749dd65924e025b1293c58f95f8d6,Shuttle,1.0,1,0,0,0,0,0
3572,73105f2d1cabc8170ab066d96863c5d5,13X,0.0,1,0,0,1,0,0
3573,73105f2d1cabc8170ab066d96863c5d5,20,0.0,1,0,0,0,0,0
3574,73105f2d1cabc8170ab066d96863c5d5,12X,0.0,1,0,0,1,1,0
3575,73105f2d1cabc8170ab066d96863c5d5,30,0.0,1,0,0,1,0,0
3576,73105f2d1cabc8170ab066d96863c5d5,2,0.0,0,1,0,1,0,0
3577,73105f2d1cabc8170ab066d96863c5d5,1,0.0,1,0,0,1,0,0
3578,73105f2d1cabc8170ab066d96863c5d5,1B,0.0,1,0,0,1,0,0
3579,73105f2d1cabc8170ab066d96863c5d5,4,0.0,1,0,0,1,0,0
3580,73105f2d1cabc8170ab066d96863c5d5,7,0.0,1,0,0,1,0,0


In [83]:
route_dir_metrics.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 69 entries, 0 to 68
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype   
---  ------                         --------------  -----   
 0   geometry                       69 non-null     geometry
 1   schedule_gtfs_dataset_key      69 non-null     object  
 2   route_id                       69 non-null     object  
 3   direction_id                   21 non-null     float64 
 4   common_shape_id                69 non-null     object  
 5   route_name                     69 non-null     object  
 6   avg_scheduled_service_minutes  69 non-null     float64 
 7   avg_stop_miles                 69 non-null     float64 
 8   n_trips                        69 non-null     int64   
 9   time_period                    23 non-null     object  
 10  peak_offpeak                   46 non-null     object  
 11  frequency                      69 non-null     float64 
dtypes: float64(4), geometry(1), in

##### Have to fill in `direction_id` with 0? 

In [84]:
route_dir_metrics.direction_id = route_dir_metrics.direction_id.fillna(0)

In [85]:
route_dir_metrics2 = pd.merge(
    route_dir_metrics, route_typologies, on=route_group_merge_cols, how="left"
).merge(cardinal_dir_df, on=route_group_merge_cols, how="left")

In [86]:
route_dir_metrics2.route_id.unique()

array(['7', '6', 'CC', '8', 'Mall', '12X', '13X', '11', '30', 'Shuttle',
       '8a7c42f9-51e4-4848-bf88-30c210f149ad', '2', '3', '1B', '20', 'SF',
       '5', '4', '9', '1'], dtype=object)

##### Amanda: how does `peak_offpeak` fit in with `time_period`?

In [87]:
route_dir_metrics2.drop(
    columns=[
        "geometry",
        "common_shape_id",
        "geometry",
        "route_name",
        "is_coverage",
        "is_downtown_local",
        "is_local",
        "is_rapid",
        "is_express",
        "is_rail",
    ]
).sort_values(by=["schedule_gtfs_dataset_key"])

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,avg_scheduled_service_minutes,avg_stop_miles,n_trips,time_period,peak_offpeak,frequency,route_primary_direction
0,73105f2d1cabc8170ab066d96863c5d5,7,0.0,36.0,0.37,19,all_day,,0.79,Southbound
36,73105f2d1cabc8170ab066d96863c5d5,8a7c42f9-51e4-4848-bf88-30c210f149ad,0.0,41.65,0.25,18,all_day,,0.75,Northbound
37,73105f2d1cabc8170ab066d96863c5d5,8a7c42f9-51e4-4848-bf88-30c210f149ad,0.0,41.65,0.25,8,,offpeak,0.33,Northbound
38,73105f2d1cabc8170ab066d96863c5d5,8a7c42f9-51e4-4848-bf88-30c210f149ad,0.0,41.65,0.25,10,,peak,0.42,Northbound
39,73105f2d1cabc8170ab066d96863c5d5,2,0.0,53.24,0.29,17,all_day,,0.71,Westbound
40,73105f2d1cabc8170ab066d96863c5d5,2,0.0,53.24,0.29,6,,offpeak,0.25,Westbound
41,73105f2d1cabc8170ab066d96863c5d5,2,0.0,53.24,0.29,11,,peak,0.46,Westbound
42,73105f2d1cabc8170ab066d96863c5d5,3,0.0,35.11,0.26,18,all_day,,0.75,Eastbound
43,73105f2d1cabc8170ab066d96863c5d5,3,0.0,35.11,0.26,8,,offpeak,0.33,Eastbound
44,73105f2d1cabc8170ab066d96863c5d5,3,0.0,35.11,0.26,10,,peak,0.42,Eastbound


##### Double check that the columns are the same.

In [88]:
RT_SCHED_GCS

'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/'

In [89]:
GTFS_DATA_DICT.rt_vs_schedule_tables.sched_route_direction_metrics

'schedule_route_dir/schedule_route_direction_metrics'

In [90]:
og_nov_url = "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_route_dir/schedule_route_direction_metrics_2024-11-13.parquet"

In [91]:
df_sched_og = gpd.read_parquet(og_nov_url)

In [92]:
df_sched_og.columns

Index(['geometry', 'schedule_gtfs_dataset_key', 'route_id', 'direction_id',
       'common_shape_id', 'route_name', 'avg_scheduled_service_minutes',
       'avg_stop_miles', 'n_trips', 'time_period', 'peak_offpeak', 'frequency',
       'is_coverage', 'is_downtown_local', 'is_local', 'is_rapid',
       'is_express', 'is_rail', 'route_primary_direction'],
      dtype='object')

In [93]:
df_sched_og = df_sched_og.loc[df_sched_og.schedule_gtfs_dataset_key.isin(schd_keys)]

In [94]:
df_sched_og[['route_id', 'direction_id','time_period','peak_offpeak']].sort_values(by= ['route_id', 'direction_id',])

Unnamed: 0,route_id,direction_id,time_period,peak_offpeak
2248,5,0.0,all_day,
2249,5,0.0,,offpeak
2250,5,0.0,,peak
1849,Shuttle,0.0,all_day,
1850,Shuttle,0.0,,offpeak
1851,Shuttle,0.0,,peak
1846,Shuttle,1.0,all_day,
1847,Shuttle,1.0,,offpeak
1848,Shuttle,1.0,,peak


In [95]:
route_dir_metrics2[['route_id', 'direction_id','time_period','peak_offpeak']].sort_values(by= ['route_id', 'direction_id',])

Unnamed: 0,route_id,direction_id,time_period,peak_offpeak
66,1,0.0,all_day,
67,1,0.0,,offpeak
68,1,0.0,,peak
24,11,0.0,all_day,
25,11,0.0,,offpeak
26,11,0.0,,peak
18,12X,0.0,all_day,
19,12X,0.0,,offpeak
20,12X,0.0,,peak
21,13X,0.0,all_day,


In [96]:
route_dir_metrics2.columns

Index(['geometry', 'schedule_gtfs_dataset_key', 'route_id', 'direction_id',
       'common_shape_id', 'route_name', 'avg_scheduled_service_minutes',
       'avg_stop_miles', 'n_trips', 'time_period', 'peak_offpeak', 'frequency',
       'is_coverage', 'is_downtown_local', 'is_local', 'is_rapid',
       'is_express', 'is_rail', 'route_primary_direction'],
      dtype='object')

In [97]:
df_sched = route_dir_metrics2.copy()

In [98]:
df_sched.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 69 entries, 0 to 68
Data columns (total 19 columns):
 #   Column                         Non-Null Count  Dtype   
---  ------                         --------------  -----   
 0   geometry                       69 non-null     geometry
 1   schedule_gtfs_dataset_key      69 non-null     object  
 2   route_id                       69 non-null     object  
 3   direction_id                   69 non-null     float64 
 4   common_shape_id                69 non-null     object  
 5   route_name                     69 non-null     object  
 6   avg_scheduled_service_minutes  69 non-null     float64 
 7   avg_stop_miles                 69 non-null     float64 
 8   n_trips                        69 non-null     int64   
 9   time_period                    23 non-null     object  
 10  peak_offpeak                   46 non-null     object  
 11  frequency                      69 non-null     float64 
 12  is_coverage                   

#### `gtfs_digest/merge_data` line 300 `df_avg_speeds` is also missing a lot of routes.
* [File `rt_segment_speeds/scripts/average_summary_speeds.py`](https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/scripts/average_summary_speeds.py)

In [99]:
import sys

sys.path.append("../rt_segment_speeds/scripts/")
import average_segment_speeds
import average_summary_speeds
from segment_speed_utils import (
    gtfs_schedule_wrangling,
    helpers,
    metrics,
    segment_calcs,
    time_series_utils,
)

In [100]:
df_avg_speeds = merge_data.concatenate_speeds_by_route_direction(analysis_date_list)

In [101]:
df_avg_speeds2 = df_avg_speeds.loc[
    df_avg_speeds.schedule_gtfs_dataset_key.isin(schd_keys)
]

In [102]:
df_avg_speeds2.route_id.value_counts()

5    3
Name: route_id, dtype: int64

In [103]:
df_avg_speeds2.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,speed_mph,service_date
5006,73105f2d1cabc8170ab066d96863c5d5,5,0.0,all_day,15.73,2024-11-13
5007,73105f2d1cabc8170ab066d96863c5d5,5,0.0,offpeak,17.62,2024-11-13


##### See what is in `average_segment_speeds.concatenate_trip_segment_speeds`

In [104]:
segment_type = "stop_segments"

In [105]:
df = average_segment_speeds.concatenate_trip_segment_speeds(
    analysis_date_list, segment_type
)

concatenated files


In [106]:
df = df.pipe(gtfs_schedule_wrangling.add_weekday_weekend_column)

##### All the routes are here!

In [107]:
df2 = df.loc[df.schedule_gtfs_dataset_key.isin(schd_keys)]

In [108]:
df2.route_id.unique()

array(['30', '3', '20', '4', '5', '11', '7', '9', '1', '12X', '6', '2',
       '8', '8a7c42f9-51e4-4848-bf88-30c210f149ad', 'CC'], dtype=object)

In [109]:
df2.shape

(3543, 17)

In [110]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3543 entries, 159381 to 2656608
Data columns (total 17 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   schedule_gtfs_dataset_key  3543 non-null   object        
 1   shape_array_key            3543 non-null   object        
 2   shape_id                   3543 non-null   object        
 3   stop_sequence              3543 non-null   int64         
 4   route_id                   3543 non-null   object        
 5   direction_id               531 non-null    float64       
 6   stop_pair                  3543 non-null   object        
 7   stop_pair_name             3543 non-null   object        
 8   trip_instance_key          3543 non-null   object        
 9   speed_mph                  3543 non-null   float64       
 10  meters_elapsed             3543 non-null   float64       
 11  sec_elapsed                3543 non-null   float64       
 12

In [111]:
df2.head(1).T

Unnamed: 0,159381
schedule_gtfs_dataset_key,73105f2d1cabc8170ab066d96863c5d5
shape_array_key,c6e9cda0db8bf76bc535f590ca1fccb5
shape_id,8746730d-27f9-4fb2-9f52-987afe356929
stop_sequence,2
route_id,30
direction_id,
stop_pair,f09af637-87de-4bdb-bf49-660539686c97__47def414-f158-496a-91cb-5f3fb0aa406c
stop_pair_name,Broadway at Stowell__Betteravia at Miller (Panda Express)
trip_instance_key,005bb393ed8b22ca4d8e7cc8d7895231
speed_mph,13.21


##### <b>Amanda: filled in `nans` with 0.</b>

In [112]:
df2.direction_id = df2.direction_id.fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2.direction_id = df2.direction_id.fillna(0)


##### Run `segment_averages`

In [113]:
segment_type

'stop_segments'

In [114]:
dict_inputs = GTFS_DATA_DICT[segment_type]

In [115]:
OPERATOR_COLS = [
    "schedule_gtfs_dataset_key",
]

In [116]:
ROUTE_DIR_COLS = [*dict_inputs["route_dir_cols"]]
STOP_PAIR_COLS = [*dict_inputs["stop_pair_cols"]]

In [117]:
STOP_PAIR_COLS

['stop_pair', 'stop_pair_name']

In [118]:
ROUTE_DIR_COLS

['route_id', 'direction_id']

In [119]:
group_cols = OPERATOR_COLS + ROUTE_DIR_COLS + STOP_PAIR_COLS

In [120]:
group_cols

['schedule_gtfs_dataset_key',
 'route_id',
 'direction_id',
 'stop_pair',
 'stop_pair_name']

In [121]:
ROUTE_SEG_FILE = dict_inputs["route_dir_single_segment"]

In [122]:
ROUTE_SEG_FILE = f"{ROUTE_SEG_FILE}_AH_TESTING"

In [123]:
""" segment_averages_df = average_segment_speeds.segment_averages(
            [one_analysis_date], 
            segment_type, 
            group_cols = OPERATOR_COLS + ROUTE_DIR_COLS + STOP_PAIR_COLS,
            export_file = ROUTE_SEG_FILE,
            weighted_averages = True
        )"""

' segment_averages_df = average_segment_speeds.segment_averages(\n            [one_analysis_date], \n            segment_type, \n            group_cols = OPERATOR_COLS + ROUTE_DIR_COLS + STOP_PAIR_COLS,\n            export_file = ROUTE_SEG_FILE,\n            weighted_averages = True\n        )'

##### <b>Added `dropna=False` to `rt_segment_speeds/segment_speed_utils/segment_calcs.calculate_avg_speeds`</b>

In [124]:
def calculate_avg_speeds(df: pd.DataFrame, group_cols: list) -> pd.DataFrame:
    """
    Calculate the median, 20th, and 80th percentile speeds
    by groups.
    """
    # pd.groupby and pd.quantile is so slow
    # create our own list of speeds and use np
    df2 = (
        df.groupby(group_cols, observed=True, group_keys=False, dropna=False)
        .agg({"speed_mph": lambda x: sorted(list(x))})
        .reset_index()
        .rename(columns={"speed_mph": "speed_mph_list"})
    )

    df2 = df2.assign(
        p50_mph=df2.apply(lambda x: np.percentile(x.speed_mph_list, q=50), axis=1),
        n_trips=df2.apply(lambda x: len(x.speed_mph_list), axis=1).astype("int16"),
        p20_mph=df2.apply(lambda x: np.percentile(x.speed_mph_list, q=20), axis=1),
        p80_mph=df2.apply(lambda x: np.percentile(x.speed_mph_list, q=80), axis=1),
    )

    stats = df2.drop(columns="speed_mph_list")

    # Clean up for map
    speed_cols = [c for c in stats.columns if "_mph" in c]
    stats[speed_cols] = stats[speed_cols].round(2)

    return stats

In [125]:
avg_speeds = calculate_avg_speeds(
    df2,
    group_cols + ["time_of_day"],
)

In [126]:
avg_speeds.head()

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,stop_pair,stop_pair_name,time_of_day,p50_mph,n_trips,p20_mph,p80_mph
0,73105f2d1cabc8170ab066d96863c5d5,1,0.0,1c236429-e252-40c0-8287-4fe38145f5ae__c65b7144-737c-42ae-87e8-fc676f19c644,Broadway at Hermosa__Broadway at Fesler,AM Peak,32.35,4,10.41,51.44
1,73105f2d1cabc8170ab066d96863c5d5,1,0.0,1c236429-e252-40c0-8287-4fe38145f5ae__c65b7144-737c-42ae-87e8-fc676f19c644,Broadway at Hermosa__Broadway at Fesler,Early AM,20.2,1,20.2,20.2
2,73105f2d1cabc8170ab066d96863c5d5,1,0.0,1c236429-e252-40c0-8287-4fe38145f5ae__c65b7144-737c-42ae-87e8-fc676f19c644,Broadway at Hermosa__Broadway at Fesler,Evening,13.37,1,13.37,13.37
3,73105f2d1cabc8170ab066d96863c5d5,1,0.0,1c236429-e252-40c0-8287-4fe38145f5ae__c65b7144-737c-42ae-87e8-fc676f19c644,Broadway at Hermosa__Broadway at Fesler,Midday,15.96,6,1.0,20.2
4,73105f2d1cabc8170ab066d96863c5d5,1,0.0,1c236429-e252-40c0-8287-4fe38145f5ae__c65b7144-737c-42ae-87e8-fc676f19c644,Broadway at Hermosa__Broadway at Fesler,PM Peak,14.66,5,0.93,16.33


In [127]:
average_segment_speeds.CROSSWALK_COLS

['schedule_gtfs_dataset_key',
 'name',
 'caltrans_district',
 'organization_source_record_id',
 'organization_name',
 'base64_url']

In [128]:
avg_speeds2 = avg_speeds.pipe(
    gtfs_schedule_wrangling.merge_operator_identifiers,
    analysis_date_list,
    columns=average_segment_speeds.CROSSWALK_COLS,
)

In [129]:
avg_speeds2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1120 entries, 0 to 1119
Data columns (total 15 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   schedule_gtfs_dataset_key      1120 non-null   object 
 1   route_id                       1120 non-null   object 
 2   direction_id                   1120 non-null   float64
 3   stop_pair                      1120 non-null   object 
 4   stop_pair_name                 1120 non-null   object 
 5   time_of_day                    1120 non-null   object 
 6   p50_mph                        1120 non-null   float64
 7   n_trips                        1120 non-null   int16  
 8   p20_mph                        1120 non-null   float64
 9   p80_mph                        1120 non-null   float64
 10  name                           1120 non-null   object 
 11  caltrans_district              1120 non-null   object 
 12  organization_source_record_id  1120 non-null   o

In [130]:
preview(avg_speeds)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id
0,73105f2d1cabc8170ab066d96863c5d5,1,0.0
91,73105f2d1cabc8170ab066d96863c5d5,11,0.0
148,73105f2d1cabc8170ab066d96863c5d5,12X,0.0
204,73105f2d1cabc8170ab066d96863c5d5,2,0.0
315,73105f2d1cabc8170ab066d96863c5d5,20,0.0
334,73105f2d1cabc8170ab066d96863c5d5,3,0.0
409,73105f2d1cabc8170ab066d96863c5d5,30,0.0
492,73105f2d1cabc8170ab066d96863c5d5,4,0.0
584,73105f2d1cabc8170ab066d96863c5d5,5,0.0
717,73105f2d1cabc8170ab066d96863c5d5,6,0.0


In [131]:
preview(avg_speeds2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id
0,73105f2d1cabc8170ab066d96863c5d5,1,0.0
91,73105f2d1cabc8170ab066d96863c5d5,11,0.0
148,73105f2d1cabc8170ab066d96863c5d5,12X,0.0
204,73105f2d1cabc8170ab066d96863c5d5,2,0.0
315,73105f2d1cabc8170ab066d96863c5d5,20,0.0
334,73105f2d1cabc8170ab066d96863c5d5,3,0.0
409,73105f2d1cabc8170ab066d96863c5d5,30,0.0
492,73105f2d1cabc8170ab066d96863c5d5,4,0.0
584,73105f2d1cabc8170ab066d96863c5d5,5,0.0
717,73105f2d1cabc8170ab066d96863c5d5,6,0.0


##### Move onto `merge_in_segment_geometry` part of `rt_segment_speeds/scripts/average_segement_speeds`
* Original function=only 3 routes showing...Check it out.

In [132]:
avg_speeds_with_geom = average_segment_speeds.merge_in_segment_geometry(
    avg_speeds2, one_analysis_date, segment_type
)

In [133]:
preview(avg_speeds_with_geom)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id
0,73105f2d1cabc8170ab066d96863c5d5,5,0.0
133,f5a749dd65924e025b1293c58f95f8d6,CC,1.0
136,f5a749dd65924e025b1293c58f95f8d6,CC,0.0


In [134]:
from calitp_data_analysis.geography_utils import WGS84

##### Down another rabbit hole: this `SEGMENT_FILE` doesn't contain values for direction_id 
* Need to find out whre it's originally made.
* Can fill in `direction_id` with 0 for now.

In [135]:
SEGMENT_FILE = GTFS_DATA_DICT[segment_type].segments_file

segment_geom = gpd.read_parquet(
    f"{SEGMENT_GCS}{SEGMENT_FILE}_{one_analysis_date}.parquet",
).to_crs(WGS84)

In [136]:
segment_geom2 = segment_geom.loc[segment_geom.schedule_gtfs_dataset_key.isin(schd_keys)]

In [137]:
segment_geom2.direction_id = segment_geom2.direction_id.fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [138]:
preview(segment_geom2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id
181780,73105f2d1cabc8170ab066d96863c5d5,30,0.0
570760,73105f2d1cabc8170ab066d96863c5d5,3,0.0
1613296,73105f2d1cabc8170ab066d96863c5d5,20,0.0
1969198,73105f2d1cabc8170ab066d96863c5d5,4,0.0
2083066,73105f2d1cabc8170ab066d96863c5d5,5,0.0
2112284,73105f2d1cabc8170ab066d96863c5d5,11,0.0
2165911,73105f2d1cabc8170ab066d96863c5d5,7,0.0
2215180,73105f2d1cabc8170ab066d96863c5d5,9,0.0
2386098,73105f2d1cabc8170ab066d96863c5d5,1,0.0
2720537,73105f2d1cabc8170ab066d96863c5d5,12X,0.0


##### Continue on with the rest of `merge_in_segment_geometry` in `rt_segment_speeds/scripts/average_segment_speeds`

In [139]:
dict_inputs["route_dir_single_segment"]

'rollup_singleday/speeds_route_dir_segments'

In [140]:
geom_file_cols = segment_geom2.columns.tolist()

In [141]:
col_order = [c for c in avg_speeds2.columns]

In [142]:
merge_cols = list(set(col_order).intersection(geom_file_cols))

In [143]:
gdf = (
    pd.merge(
        segment_geom2[merge_cols + ["geometry"]].drop_duplicates(),
        avg_speeds2,
        on=merge_cols,
    )
    .reset_index(drop=True)
    .reindex(columns=col_order + ["geometry"])
)

In [144]:
gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1266 entries, 0 to 1265
Data columns (total 16 columns):
 #   Column                         Non-Null Count  Dtype   
---  ------                         --------------  -----   
 0   schedule_gtfs_dataset_key      1266 non-null   object  
 1   route_id                       1266 non-null   object  
 2   direction_id                   1266 non-null   float64 
 3   stop_pair                      1266 non-null   object  
 4   stop_pair_name                 1266 non-null   object  
 5   time_of_day                    1266 non-null   object  
 6   p50_mph                        1266 non-null   float64 
 7   n_trips                        1266 non-null   int16   
 8   p20_mph                        1266 non-null   float64 
 9   p80_mph                        1266 non-null   float64 
 10  name                           1266 non-null   object  
 11  caltrans_district              1266 non-null   object  
 12  organization_source_record

In [145]:
gdf.drop(
    columns=[
        "geometry",
        "organization_source_record_id",
        "organization_name",
        "base64_url",
    ]
).sample()

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,stop_pair,stop_pair_name,time_of_day,p50_mph,n_trips,p20_mph,p80_mph,name,caltrans_district
896,73105f2d1cabc8170ab066d96863c5d5,6,0.0,208b45c8-def8-4f85-93c0-e5d6f80d2794__6a0336b1-ae16-4831-9b03-7b2117fdc6a3,Rice Ranch at Orcutt Rd__Orcutt Rd at Valley View,Midday,23.32,6,21.06,24.89,Santa Maria Schedule,05 - San Luis Obispo


##### `rt_segment_speeds/scripts/average_segment_speeds` gives me the speeds by stop for a route. However, in `gtfss_digest/merge_data`, we want the speeds for the entire route from `average_segment_speeds` is summarized in `rt_segment_speeds/scripts/average_summary_speeds`

In [146]:
dict_inputs["route_dir_single_segment"]

'rollup_singleday/speeds_route_dir_segments'

##### **This file below is used in `gtfs_digest/merge_data`**
* gs://calitp-analytics-data/data-analyses/rt_segment_speeds/ and rollup_singleday/speeds_route_dir_AH_TEST_2024-11-13

In [147]:
GTFS_DATA_DICT.rt_stop_times.route_dir_single_summary

'rollup_singleday/speeds_route_dir'

In [148]:
dict_inputs = GTFS_DATA_DICT[segment_type]

In [149]:
avg_summary_speeds_url = "gs://calitp-analytics-data/data-analyses/rt_segment_speeds/rollup_singleday/speeds_route_dir_AH_TEST_2024-11-13.parquet"

In [150]:
avg_summary_speeds_df = gpd.read_parquet(avg_summary_speeds_url)

In [151]:
avg_summary_speeds_df.head(1)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,meters_elapsed,sec_elapsed,speed_mph,name,caltrans_district,organization_source_record_id,organization_name,base64_url,route_name,geometry
0,55a01ef72af21906934ae8ffb4786e86,200X,0.0,peak,87203.48,8822.0,22.11,Bay Area 511 Tri Delta Schedule,04 - Oakland,recEEJVeGrHGoTwgj,Eastern Contra Costa Transit Authority,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD0zRA==,Martinez / Pittsburg,"LINESTRING (-122.13931 38.01857, -122.13873 38.01847, -122.13850 38.01837, -122.13833 38.01809, -122.13843 38.01803, -122.13854 38.01795, -122.13874 38.01781, -122.13878 38.01778, -122.13898 38.01766, -122.13921 38.01752, -122.13977 38.01718, -122.13993 38.01707, -122.13980 38.01695, -122.13975 38.01691, -122.13940 38.01655, -122.13892 38.01605, -122.13884 38.01597, -122.13830 38.01540, -122.13778 38.01486, -122.13762 38.01471, -122.13724 38.01431, -122.13702 38.01407, -122.13672 38.01376, -122.13625 38.01329, -122.13619 38.01322, -122.13592 38.01294, -122.13567 38.01267, -122.13514 38.01211, -122.13509 38.01206, -122.13460 38.01155, -122.13436 38.01130, -122.13407 38.01099, -122.13354 38.01045, -122.13302 38.00990, -122.13250 38.00936, -122.13196 38.00879, -122.13165 38.00846, -122.13144 38.00824, -122.13124 38.00799, -122.13104 38.00773, -122.13098 38.00765, -122.13092 38.00755, -122.13087 38.00746, -122.13083 38.00738, -122.13080 38.00719, -122.13080 38.00717, -122.13080 38.00712, -122.13083 38.00687, -122.13090 38.00648, -122.13095 38.00623, -122.13097 38.00608, -122.13101 38.00585, -122.13103 38.00568, -122.13106 38.00545, -122.13111 38.00510, -122.13112 38.00499, -122.13114 38.00480, -122.13118 38.00447, -122.13123 38.00420, -122.13126 38.00410, -122.13132 38.00386, -122.13133 38.00375, -122.13139 38.00345, -122.13140 38.00342, -122.13143 38.00318, -122.13148 38.00294, -122.13150 38.00278, -122.13152 38.00267, -122.13156 38.00239, -122.13159 38.00206, -122.13162 38.00175, -122.13169 38.00136, -122.13176 38.00101, -122.13181 38.00068, -122.13182 38.00061, -122.13183 38.00056, -122.13184 38.00051, -122.13187 38.00033, -122.13191 38.00008, -122.13192 38.00004, -122.13198 37.99966, -122.13202 37.99938, -122.13207 37.99906, -122.13208 37.99901, -122.13209 37.99893, -122.13211 37.99874, -122.13212 37.99870, -122.13217 37.99840, -122.13223 37.99808, -122.13232 37.99758, -122.13237 37.99729, -122.13245 37.99681, -122.13246 37.99664, -122.13245 37.99644, -122.13245 37.99639, -122.13244 37.99632, -122.13240 37.99611, -122.13236 37.99597, -122.13232 37.99586, -122.13227 37.99569, -122.13214 37.99544, -122.13199 37.99512, -122.13187 37.99487, -122.13158 37.99421, -122.13130 37.99359, -122.13121 37.99340, -122.13109 37.99314, -122.13089 37.99271, -122.13060 37.99209, -122.13049 37.99181, -122.13037 37.99152, -122.13030 37.99135, -122.13017 37.99103, -122.13003 37.99073, -122.12996 37.99058, -122.12990 37.99047, -122.12964 37.98996, -122.12937 37.99010, -122.12926 37.99015, -122.12884 37.99037, -122.12871 37.99044, -122.12870 37.99044, -122.12850 37.99055, -122.12819 37.99069, -122.12799 37.99079, -122.12778 37.99088, -122.12756 37.99096, -122.12737 37.99103, -122.12690 37.99119, -122.12646 37.99138, -122.12611 37.99157, -122.12573 37.99178, -122.12530 37.99202, -122.12489 37.99224, -122.12461 37.99240, -122.12377 37.99288, -122.12285 37.99328, -122.12225 37.99356, -122.12183 37.99377, -122.12143 37.99394, -122.12103 37.99411, -122.12080 37.99419, -122.12062 37.99426, -122.12034 37.99434, -122.12015 37.99436, -122.11996 37.99434, -122.11982 37.99428, -122.11962 37.99420, -122.11944 37.99385, -122.11926 37.99393, -122.11917 37.99399, -122.11889 37.99425, -122.11872 37.99449, -122.11835 37.99511, -122.11815 37.99533, -122.11799 37.99545, -122.11781 37.99553, -122.11753 37.99562, -122.11734 37.99566, -122.11705 37.99568, -122.11676 37.99566, -122.11562 37.99538, -122.11520 37.99524, -122.11461 37.99505, -122.11466 37.99493, -122.11465 37.99487, -122.11454 37.99470, -122.11452 37.99465, -122.11452 37.99462, -122.11460 37.99440, -122.11464 37.99436, -122.11472 37.99433, -122.11483 37.99429, -122.11487 37.99426, -122.11499 37.99404, -122.11501 37.99399, -122.11501 37.99393, -122.11497 37.99384, -122.11492 37.99381, -122.11489 37.99378, -122.11473 37.99374, -122.11464 37.99382, -122.11454 37.99388, -122.11454 37.99390, -122.11444 37.99415, -122.11449 37.99420, -122.11454 37.99429, -122.11457 37.99435, -122.11455 37.99443, -122.11452 37.99462, -122.11452 37.99465, -122.11454 37.99470, -122.11465 37.99487, -122.11466 37.99493, -122.11461 37.99505, -122.11425 37.99494, -122.11391 37.99483, -122.11353 37.99461, -122.11334 37.99448, -122.11307 37.99426, -122.11283 37.99449, -122.11263 37.99468, -122.11243 37.99480, -122.11223 37.99483, -122.11201 37.99482, -122.11183 37.99477, -122.11170 37.99469, -122.11127 37.99439, -122.11118 37.99436, -122.11100 37.99431, -122.11091 37.99431, -122.11073 37.99433, -122.11003 37.99457, -122.10983 37.99462, -122.10964 37.99463, -122.10944 37.99460, -122.10848 37.99432, -122.10840 37.99428, -122.10827 37.99417, -122.10821 37.99410, -122.10814 37.99394, -122.10812 37.99385, -122.10763 37.99383, -122.10752 37.99382, -122.10746 37.99382, -122.10718 37.99379, -122.10680 37.99372, -122.10636 37.99361, -122.10592 37.99347, -122.10390 37.99261, -122.10355 37.99243, -122.10346 37.99239, -122.10251 37.99199, -122.10228 37.99183, -122.10207 37.99164, -122.10178 37.99114, -122.10167 37.99100, -122.10155 37.99088, -122.10143 37.99081, -122.10101 37.99056, -122.10062 37.99042, -122.10018 37.99035, -122.09972 37.99034, -122.09892 37.99043, -122.09844 37.99044, -122.09816 37.99041, -122.09683 37.99022, -122.09619 37.99016, -122.09561 37.99011, -122.09477 37.99010, -122.08799 37.99057, -122.08791 37.99057, -122.08780 37.99057, -122.08761 37.99054, -122.08706 37.99039, -122.08692 37.99074, -122.08682 37.99100, -122.08671 37.99127, -122.08648 37.99175, -122.08570 37.99184, -122.08556 37.99186, -122.08540 37.99189, -122.08532 37.99191, -122.08521 37.99194, -122.08503 37.99200, -122.08463 37.99216, -122.08442 37.99224, -122.08426 37.99229, -122.08411 37.99233, -122.08392 37.99237, -122.08369 37.99241, -122.08239 37.99263, -122.08176 37.99274, -122.08147 37.99279, -122.08120 37.99285, -122.08098 37.99291, -122.08077 37.99298, -122.08052 37.99308, -122.08030 37.99319, -122.08010 37.99331, -122.07990 37.99344, -122.07977 37.99354, -122.07966 37.99363, -122.07954 37.99374, -122.07942 37.99386, -122.07935 37.99394, -122.07928 37.99402, -122.07917 37.99416, -122.07908 37.99430, -122.07900 37.99445, -122.07894 37.99458, -122.07892 37.99463, -122.07886 37.99479, -122.07860 37.99552, -122.07842 37.99604, -122.07824 37.99654, -122.07810 37.99694, -122.07803 37.99709, -122.07795 37.99723, -122.07785 37.99736, -122.07775 37.99747, -122.07765 37.99756, -122.07758 37.99762, -122.07744 37.99773, -122.07713 37.99737, -122.07667 37.99684, -122.07586 37.99590, -122.07516 37.99512, -122.07504 37.99498, -122.07487 37.99478, -122.07484 37.99467, -122.07475 37.99455, -122.07468 37.99446, -122.07444 37.99414, -122.07428 37.99385, -122.07423 37.99375, -122.07418 37.99363, -122.07416 37.99357, -122.07412 37.99343, -122.07410 37.99338, -122.07406 37.99325, -122.07401 37.99305, -122.07397 37.99289, -122.07390 37.99259, -122.07382 37.99228, -122.07377 37.99207, -122.07412 37.99201, -122.07458 37.99193, -122.07481 37.99189, -122.07497 37.99193, -122.07505 37.99196, -122.07512 37.99201, -122.07516 37.99207, -122.07518 37.99216, -122.07518 37.99218, -122.07518 37.99221, -122.07518 37.99226, -122.07516 37.99237, -122.07510 37.99249, -122.07499 37.99260, -122.07485 37.99278, -122.07401 37.99305, -122.07387 37.99310, -122.07330 37.99330, -122.07206 37.99373, -122.07180 37.99382, -122.07115 37.99404, -122.07049 37.99427, -122.07007 37.99441, -122.06988 37.99447, -122.06972 37.99453, -122.06952 37.99460, -122.06941 37.99463, -122.06911 37.99474, -122.06851 37.99495, -122.06768 37.99523, -122.06646 37.99564, -122.06576 37.99588, -122.06458 37.99628, -122.06424 37.99640, -122.06375 37.99657, -122.06346 37.99668, -122.06317 37.99681, -122.06289 37.99693, -122.06255 37.99710, -122.06227 37.99725, -122.06198 37.99741, -122.06115 37.99787, -122.06085 37.99804, -122.06043 37.99828, -122.06003 37.99850, -122.05982 37.99861, -122.05956 37.99875, -122.05928 37.99889, -122.05900 37.99902, -122.05856 37.99922, -122.05827 37.99934, -122.05805 37.99943, -122.05783 37.99952, -122.05761 37.99961, -122.05720 37.99976, -122.05685 37.99988, -122.05641 38.00002, -122.05611 38.00011, -122.05580 38.00019, -122.05547 38.00028, -122.05512 38.00037, -122.05483 38.00045, -122.05457 38.00050, -122.05439 38.00054, -122.05309 38.00087, -122.05250 38.00102, -122.05127 38.00132, -122.04939 38.00179, -122.04758 38.00224, -122.04736 38.00230, -122.04714 38.00236, -122.04684 38.00243, -122.04639 38.00254, -122.04529 38.00281, -122.04446 38.00302, -122.04445 38.00302, -122.04281 38.00343, -122.04186 38.00363, -122.04098 38.00385, -122.04063 38.00394, -122.04028 38.00403, -122.03994 38.00413, -122.03959 38.00424, -122.03957 38.00425, -122.03915 38.00438, -122.03889 38.00447, -122.03862 38.00459, -122.03829 38.00473, -122.03780 38.00497, -122.03746 38.00514, -122.03644 38.00566, -122.03614 38.00581, -122.03586 38.00595, -122.03526 38.00625, -122.03444 38.00667, -122.03408 38.00685, -122.03377 38.00700, -122.03345 38.00715, -122.03317 38.00727, -122.03280 38.00742, -122.03247 38.00755, -122.03233 38.00760, -122.03214 38.00767, -122.03180 38.00779, -122.03146 38.00790, -122.03110 38.00800, -122.03073 38.00810, -122.03045 38.00818, -122.03014 38.00826, -122.02986 38.00832, -122.02941 38.00843, -122.02902 38.00852, -122.02865 38.00859, -122.02828 38.00865, -122.02757 38.00876, -122.02747 38.00877, -122.02711 38.00882, -122.02629 38.00894, -122.02506 38.00914, -122.02446 38.00924, -122.02399 38.00931, -122.02282 38.00949, -122.02263 38.00952, -122.02197 38.00962, -122.02162 38.00967, -122.02119 38.00973, -122.02029 38.00988, -122.01975 38.00997, -122.01930 38.01003, -122.01847 38.01016, -122.01770 38.01028, -122.01757 38.01030, -122.01706 38.01038, -122.01602 38.01054, -122.01494 38.01071, -122.01358 38.01090, -122.01155 38.01121, -122.01132 38.01124, -122.00908 38.01158, -122.00874 38.01163, -122.00583 38.01208, -122.00558 38.01212, -122.00257 38.01258, -122.00152 38.01274, -122.00069 38.01286, -122.00010 38.01295, -121.99973 38.01301, -121.99922 38.01310, -121.99869 38.01321, -121.99825 38.01331, -121.99809 38.01335, -121.99744 38.01352, -121.99728 38.01357, -121.99694 38.01367, -121.99629 38.01389, -121.99582 38.01406, -121.99522 38.01430, -121.99462 38.01456, -121.99407 38.01482, -121.99375 38.01498, -121.99349 38.01511, -121.99298 38.01540, -121.99239 38.01575, -121.99193 38.01605, -121.99152 38.01633, -121.99106 38.01663, -121.99084 38.01678, -121.99037 38.01710, -121.99012 38.01726, -121.98987 38.01741, -121.98961 38.01757, -121.98937 38.01770, -121.98905 38.01788, -121.98865 38.01809, -121.98839 38.01822, -121.98803 38.01839, -121.98766 38.01856, -121.98761 38.01858, -121.98706 38.01881, -121.98646 38.01908, -121.98643 38.01909, -121.98492 38.01974, -121.98293 38.02061, -121.98290 38.02062, -121.98225 38.02091, -121.98033 38.02174, -121.97969 38.02202, -121.97928 38.02220, -121.97899 38.02232, -121.97851 38.02253, -121.97820 38.02265, -121.97790 38.02276, -121.97757 38.02287, -121.97719 38.02299, -121.97684 38.02309, -121.97652 38.02318, -121.97620 38.02325, -121.97578 38.02335, -121.97557 38.02339, -121.97518 38.02346, -121.97467 38.02354, -121.97430 38.02359, -121.97385 38.02364, -121.97342 38.02367, -121.97300 38.02369, -121.97258 38.02371, -121.97216 38.02371, -121.97193 38.02370, -121.97176 38.02370, -121.97152 38.02369, -121.97040 38.02351, -121.97014 38.02346, -121.97001 38.02343, -121.96955 38.02333, -121.96910 38.02323, -121.96894 38.02318, -121.96878 38.02312, -121.96854 38.02300, -121.96821 38.02273, -121.96807 38.02261, -121.96793 38.02247, -121.96769 38.02223, -121.96759 38.02214, -121.96745 38.02201, -121.96731 38.02191, -121.96716 38.02184, -121.96701 38.02178, -121.96687 38.02175, -121.96674 38.02173, -121.96665 38.02172, -121.96632 38.02173, -121.96616 38.02173, -121.96604 38.02173, -121.96604 38.02135, -121.96605 38.02120, -121.96605 38.02107, -121.96606 38.02085, -121.96609 38.02014, -121.96614 38.01950, -121.96614 38.01937, -121.96610 38.01937, -121.96595 38.01937, -121.96486 38.01934, -121.96461 38.01933, -121.96438 38.01931, -121.96418 38.01929, -121.96396 38.01926, -121.96306 38.01910, -121.96217 38.01894, -121.96103 38.01873, -121.95996 38.01854, -121.95823 38.01824, -121.95734 38.01809, -121.95688 38.01801, -121.95663 38.01797, -121.95638 38.01793, -121.95605 38.01787, -121.95567 38.01780, -121.95540 38.01775, -121.95524 38.01772, -121.95497 38.01767, -121.95474 38.01763, -121.95445 38.01757, -121.95417 38.01750, -121.95390 38.01742, -121.95349 38.01728, -121.95326 38.01719, -121.95296 38.01706, -121.95271 38.01694, -121.95251 38.01683, -121.95229 38.01671, -121.95217 38.01664, -121.95205 38.01657, -121.95191 38.01652, -121.95177 38.01647, -121.95161 38.01641, -121.95142 38.01634, -121.95122 38.01627, -121.95105 38.01622, -121.95085 38.01617, -121.95059 38.01612, -121.95041 38.01609, -121.94874 38.01581, -121.94841 38.01576, -121.94774 38.01566, -121.94732 38.01559, -121.94693 38.01553, -121.94649 38.01547, -121.94615 38.01542, -121.94585 38.01538, -121.94585 38.01592, -121.94585 38.01600, -121.94583 38.01633, -121.94583 38.01690, -121.94513 38.01689, -121.94494 38.01690, -121.94486 38.01695, -121.94482 38.01702, -121.94482 38.01716, -121.94483 38.01756, -121.94484 38.01798, -121.94484 38.01820, -121.94494 38.01813, -121.94507 38.01807, -121.94519 38.01796, -121.94520 38.01786, -121.94519 38.01732, -121.94519 38.01731, -121.94514 38.01725)"


##### Only one route is showing!

In [152]:
avg_summary_speeds_df2 = avg_summary_speeds_df.loc[
    avg_summary_speeds_df.schedule_gtfs_dataset_key.isin(schd_keys)
]

In [153]:
preview(avg_summary_speeds_df2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id
1002,73105f2d1cabc8170ab066d96863c5d5,5,0.0


##### Need to breakout `average_summary_speeds`

In [154]:
common_shape_geom = gtfs_schedule_wrangling.most_common_shape_by_route_direction(
    one_analysis_date
).to_crs(WGS84)

In [155]:
common_shape_geom.columns

Index(['geometry', 'schedule_gtfs_dataset_key', 'route_id', 'direction_id',
       'common_shape_id', 'route_name'],
      dtype='object')

In [156]:
common_shape_geom2 = common_shape_geom.loc[
    common_shape_geom.schedule_gtfs_dataset_key.isin(schd_keys)
]

In [157]:
common_shape_geom2.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 23 entries, 167 to 1098
Data columns (total 6 columns):
 #   Column                     Non-Null Count  Dtype   
---  ------                     --------------  -----   
 0   geometry                   23 non-null     geometry
 1   schedule_gtfs_dataset_key  23 non-null     object  
 2   route_id                   23 non-null     object  
 3   direction_id               23 non-null     float64 
 4   common_shape_id            23 non-null     object  
 5   route_name                 23 non-null     object  
dtypes: float64(1), geometry(1), object(4)
memory usage: 1.3+ KB


##### This `concatenate_trip_segment_speeds` is from `rt_segment_speeds/scripts/average_segment_speeds`

In [158]:
df = average_summary_speeds.concatenate_trip_segment_speeds(
    analysis_date_list, segment_type
)

concatenated files


In [159]:
type(df)

pandas.core.frame.DataFrame

In [160]:
df2 = df.loc[df.schedule_gtfs_dataset_key.isin(schd_keys)]

##### Filled in `direction_id` with 0

In [161]:
df2.direction_id = df2.direction_id.fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2.direction_id = df2.direction_id.fillna(0)


In [162]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3543 entries, 159381 to 2656608
Data columns (total 17 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   schedule_gtfs_dataset_key  3543 non-null   object        
 1   shape_array_key            3543 non-null   object        
 2   shape_id                   3543 non-null   object        
 3   stop_sequence              3543 non-null   int64         
 4   route_id                   3543 non-null   object        
 5   direction_id               3543 non-null   float64       
 6   stop_pair                  3543 non-null   object        
 7   stop_pair_name             3543 non-null   object        
 8   trip_instance_key          3543 non-null   object        
 9   speed_mph                  3543 non-null   float64       
 10  meters_elapsed             3543 non-null   float64       
 11  sec_elapsed                3543 non-null   float64       
 12

In [163]:
df2.head(1)

Unnamed: 0,schedule_gtfs_dataset_key,shape_array_key,shape_id,stop_sequence,route_id,direction_id,stop_pair,stop_pair_name,trip_instance_key,speed_mph,meters_elapsed,sec_elapsed,time_of_day,arrival_time,service_date,peak_offpeak,weekday_weekend
159381,73105f2d1cabc8170ab066d96863c5d5,c6e9cda0db8bf76bc535f590ca1fccb5,8746730d-27f9-4fb2-9f52-987afe356929,2,30,0.0,f09af637-87de-4bdb-bf49-660539686c97__47def414-f158-496a-91cb-5f3fb0aa406c,Broadway at Stowell__Betteravia at Miller (Panda Express),005bb393ed8b22ca4d8e7cc8d7895231,13.21,1930.84,327.0,PM Peak,2024-11-13 15:23:45,2024-11-13,peak,weekday


In [164]:
trip_group_cols = OPERATOR_COLS + ROUTE_DIR_COLS

In [165]:
trip_avg = (
    metrics.weighted_average_speeds_across_segments(
        df2,
        trip_group_cols + ["peak_offpeak"],
    )
    .pipe(
        gtfs_schedule_wrangling.merge_operator_identifiers,
        analysis_date_list,
        columns=CROSSWALK_COLS,
    )
    .reset_index(drop=True)
)

NameError: name 'CROSSWALK_COLS' is not defined

In [None]:
trip_avg.head(1)

In [None]:
trip_avg.shape

In [None]:
dict_inputs = GTFS_DATA_DICT[segment_type]

##### Skipping this part because I can't find `MIN_TRIP_SECONDS` and `MAX_TRIP_SECONDS` in `dict_input`

In [None]:
""" trip_avg_filtered = trip_avg[
        (trip_avg.meters_elapsed >= average_summary_speeds.METERS_CUTOFF) & 
        (trip_avg.sec_elapsed >= average_summary_speeds.MIN_TRIP_SECONDS) & 
        (trip_avg.sec_elapsed <= average_summary_speeds.MAX_TRIP_SECONDS)
    ]
    """

In [None]:
group_cols = OPERATOR_COLS + ROUTE_DIR_COLS

In [None]:
avg_speeds = (
    metrics.concatenate_peak_offpeak_allday_averages(
        trip_avg, group_cols, metric_type="summary_speeds"
    )
    .pipe(
        gtfs_schedule_wrangling.merge_operator_identifiers,
        analysis_date_list,
        columns=CROSSWALK_COLS,
    )
    .reset_index(drop=True)
)

In [None]:
preview(avg_speeds)

In [None]:
avg_speeds.info()

In [None]:
avg_speeds_with_geom = average_summary_speeds.merge_in_common_shape_geometry(
    avg_speeds, one_analysis_date
)

In [None]:
preview(avg_speeds_with_geom)

In [None]:
avg_speeds_with_geom.info()

##### Double check that my work matches what's in `gtfs_digest/merge_data`

In [None]:
SEGMENT_GCS

In [None]:
GTFS_DATA_DICT.rt_stop_times.route_dir_single_summary

In [None]:
df_avg_speeds_og = pd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/rt_segment_speeds/rollup_singleday/speeds_route_dir_2024-11-13.parquet"
)

In [None]:
df_avg_speeds_og = df_avg_speeds_og.loc[
    df_avg_speeds_og.schedule_gtfs_dataset_key.isin(schd_keys)
]

In [None]:
preview(df_avg_speeds_og)

In [None]:
df_avg_speeds_og.columns

In [None]:
avg_speeds_with_geom.columns

In [None]:
df_avg_speeds = avg_speeds_with_geom.copy()

#### Dataframe in line 307 `df_rt_sched` in `gtfs_digest/merge_data`

In [None]:
df_rt_sched = merge_data.concatenate_rt_vs_schedule_by_route_direction(
    analysis_date_list
).astype({"direction_id": "float"})

In [None]:
df_rt_sched2 = df_rt_sched.loc[df_rt_sched.schedule_gtfs_dataset_key.isin(schd_keys)]

In [None]:
preview(df_rt_sched2)

In [None]:
df_rt_sched2.head(2)

##### `dt_rt_sched` is created using  [`rt_scheduled_v_ran/scripts/rt_v_scheduled_routes`](https://github.com/cal-itp/data-analyses/blob/main/rt_scheduled_v_ran/scripts/rt_v_scheduled_routes.py) 

In [None]:
[*GTFS_DATA_DICT["stop_segments"]["route_dir_cols"]]

In [None]:
dict_inputs = GTFS_DATA_DICT.rt_vs_schedule_tables

##### `route_metrics` in `rt_scheduled_v_ran/scripts/rt_v_scheduled_routes`

In [None]:
TRIP_EXPORT = dict_inputs.vp_trip_metrics

In [None]:
TRIP_EXPORT

In [None]:
ROUTE_EXPORT = dict_inputs.vp_route_direction_metrics

In [None]:
trip_df = pd.read_parquet(f"{RT_SCHED_GCS}{TRIP_EXPORT}_{one_analysis_date}.parquet")

In [None]:
trip_df2 = trip_df.loc[trip_df.schedule_gtfs_dataset_key.isin(schd_keys)]

In [None]:
trip_df2.shape

##### Everything is available in `trip_df`. Fill in Direction_id with 0.

In [None]:
trip_df2.info()

In [None]:
trip_df2.direction_id = trip_df2.direction_id.fillna(0)

In [None]:
trip_df2.loc[trip_df2.time_of_day == "AM Peak"].drop(
    columns=["schedule_gtfs_dataset_key", "trip_instance_key"]
).sort_values(by=["route_id"]).drop_duplicates(
    subset=[
        "route_id",
        "direction_id",
    ]
).T

##### Somewhere in `rt_scheduled_v_ran/rt_v_scheduled_routes`, routes go missing.

In [None]:
import sys

sys.path.append("../rt_scheduled_v_ran/scripts")
import rt_v_scheduled_routes

In [None]:
ROUTE_DIR_COLS = [*GTFS_DATA_DICT["stop_segments"]["route_dir_cols"]]

In [None]:
crosswalk_cols = [
    "schedule_gtfs_dataset_key",
    "name",
    "schedule_source_record_id",
    "base64_url",
    "organization_source_record_id",
    "organization_name",
    "caltrans_district",
]

##### Have to break out `metrics.concatenate_peak_offpeak_allday_averages` which is in  `rt_segment_speeds/segment_speed_utils/` because all of the routes are missing.

In [None]:
route_df = (
    metrics.concatenate_peak_offpeak_allday_averages(
        trip_df2,
        group_cols=["schedule_gtfs_dataset_key"] + ROUTE_DIR_COLS,
        metric_type="rt_vs_schedule",
    )
    .pipe(metrics.derive_rt_vs_schedule_metrics)
    .pipe(rt_v_scheduled_routes.average_rt_trip_times)
)

In [None]:
preview(route_df)

In [None]:
route_df.info()

In [None]:
["schedule_gtfs_dataset_key"] + ROUTE_DIR_COLS

`calculate_avg_speeds` is from `rt_segement_speeds/segement_speed_utils/segment_calc.py` -> added `dropna=False`

In [None]:
def calculate_avg_speeds(df: pd.DataFrame, group_cols: list) -> pd.DataFrame:
    """
    Calculate the median, 20th, and 80th percentile speeds
    by groups.
    """
    # pd.groupby and pd.quantile is so slow
    # create our own list of speeds and use np
    df2 = (
        df.groupby(group_cols, observed=True, group_keys=False, dropna=False)
        .agg({"speed_mph": lambda x: sorted(list(x))})
        .reset_index()
        .rename(columns={"speed_mph": "speed_mph_list"})
    )

    df2 = df2.assign(
        p50_mph=df2.apply(lambda x: np.percentile(x.speed_mph_list, q=50), axis=1),
        n_trips=df2.apply(lambda x: len(x.speed_mph_list), axis=1).astype("int16"),
        p20_mph=df2.apply(lambda x: np.percentile(x.speed_mph_list, q=20), axis=1),
        p80_mph=df2.apply(lambda x: np.percentile(x.speed_mph_list, q=80), axis=1),
    )

    stats = df2.drop(columns="speed_mph_list")

    # Clean up for map
    speed_cols = [c for c in stats.columns if "_mph" in c]
    stats[speed_cols] = stats[speed_cols].round(2)

    return stats

`calculate_weighted_average_vp_schedule_metrics` is from `rt_segment_speeds/segment_speed_utils/metrics` -> added `dropna=False`

In [None]:
def calculate_weighted_average_vp_schedule_metrics(
    df: pd.DataFrame,
    group_cols: list,
) -> pd.DataFrame:

    sum_cols = [
        "minutes_atleast1_vp",
        "minutes_atleast2_vp",
        "rt_service_minutes",
        "scheduled_service_minutes",
        "total_vp",
        "vp_in_shape",
        "is_early",
        "is_ontime",
        "is_late",
    ]

    count_cols = ["trip_instance_key"]

    df2 = (
        df.groupby(group_cols, observed=True, group_keys=False, dropna=False)
        .agg({**{e: "sum" for e in sum_cols}, **{e: "count" for e in count_cols}})
        .reset_index()
        .rename(columns={"trip_instance_key": "n_vp_trips"})
    )

    return df2

`weighted_average_speeds_across_segments` is from `rt_segment_speeds/segment_speed_utils/metrics` -> added `dropna=False`

In [None]:
def weighted_average_speeds_across_segments(
    df: pd.DataFrame, group_cols: list
) -> pd.DataFrame:
    """
    We can use our segments and the deltas within a trip
    to calculate the trip-level average speed, or
    the route-direction-level average speed.
    But, we want a weighted average, using the raw deltas
    instead of mean(speed_mph), since segments can be varying lengths.
    """
    avg_speeds = (
        df.groupby(group_cols, observed=True, group_keys=False, dropna=False)
        .agg(
            {
                "meters_elapsed": "sum",
                "sec_elapsed": "sum",
            }
        )
        .reset_index()
    ).pipe(segment_calcs.speed_from_meters_elapsed_sec_elapsed)

    return avg_speeds

`concatenate_peak_offpeak_allday_averages` is from `rt_segment_speeds/segment_speed_utils/metrics`

In [None]:
def concatenate_peak_offpeak_allday_averages(
    df: pd.DataFrame, group_cols: list, metric_type: str
) -> pd.DataFrame:
    """
    Calculate average speeds for all day and
    peak_offpeak.
    Concatenate these, so that speeds are always calculated
    for the same 3 time periods.
    """
    if metric_type == "segment_speeds":
        avg_peak = calculate_avg_speeds(df, group_cols + ["peak_offpeak"])

        avg_allday = calculate_avg_speeds(df, group_cols).assign(peak_offpeak="all_day")

    elif metric_type == "summary_speeds":
        avg_peak = weighted_average_speeds_across_segments(
            df, group_cols + ["peak_offpeak"]
        )

        avg_allday = weighted_average_speeds_across_segments(df, group_cols).assign(
            peak_offpeak="all_day"
        )

    elif metric_type == "rt_vs_schedule":
        avg_peak = calculate_weighted_average_vp_schedule_metrics(
            df, group_cols + ["peak_offpeak"]
        )

        avg_allday = calculate_weighted_average_vp_schedule_metrics(
            df, group_cols
        ).assign(peak_offpeak="all_day")

    else:
        print(
            f"Valid metric types: ['segment_speeds', 'summary_speeds', 'rt_vs_schedule']"
        )

    # Concatenate so that every segment has 3 time periods: peak, offpeak, and all_day
    avg_metrics = pd.concat([avg_peak, avg_allday], axis=0, ignore_index=True).rename(
        columns={"peak_offpeak": "time_period"}
    )

    return avg_metrics

##### Going back to `rt_v_scheduled/scripts/rt_v_scheduled_routes.py`

In [None]:
route_metrics_df = concatenate_peak_offpeak_allday_averages(
    trip_df2,
    group_cols=["schedule_gtfs_dataset_key"] + ROUTE_DIR_COLS,
    metric_type="rt_vs_schedule",
)

In [None]:
preview(route_metrics_df)

In [None]:
route_metrics_df.shape

In [None]:
route_metrics_df.route_id.value_counts()

In [None]:
route_metrics_df.loc[route_metrics_df.route_id == "CC"]

##### A lot of stuff is here except `direction_id`

In [None]:
route_metrics_df = route_metrics_df.pipe(metrics.derive_rt_vs_schedule_metrics)

In [None]:
route_metrics_df = route_metrics_df.pipe(rt_v_scheduled_routes.average_rt_trip_times)

In [None]:
route_metrics_df = gtfs_schedule_wrangling.merge_operator_identifiers(
    route_metrics_df, [one_analysis_date], columns=crosswalk_cols
)

In [None]:
route_metrics_df.groupby(["route_id", "time_period"]).agg({"direction_id": "nunique"})

In [None]:
route_metrics_df.loc[route_metrics_df.time_period == "peak"].drop(
    columns=[
        "schedule_gtfs_dataset_key",
        "schedule_source_record_id",
        "base64_url",
        "organization_name",
        "organization_source_record_id",
        "caltrans_district",
    ]
)

##### `df_rt_sched` equals `df_rt_sched`

In [None]:
df_rt_sched_og = merge_data.concatenate_rt_vs_schedule_by_route_direction(
    analysis_date_list
)

In [None]:
df_rt_sched = route_metrics_df.copy()

In [None]:
df_rt_sched_og.columns

##### How do all these columns pop up?? 

In [None]:
df_rt_sched.columns

In [None]:
df_rt_sched = df_rt_sched.drop(
    columns=[
        "base64_url",
        "organization_source_record_id",
        "organization_name",
        "caltrans_district",
    ]
)

In [None]:
df_rt_sched_og.route_id.nunique()

In [None]:
df_rt_sched.route_id.nunique()

#### `gtfs_digest/merge_data/` line 316: `df_crosswalk`

In [None]:
df_crosswalk = merge_data.concatenate_crosswalk_organization(analysis_date_list)

#### `gtfs_digest/merge_data/merge_data_sources_by_route_direction`
* Have to make some tweaks since `df_avg_speeds2` is missing a lot of routes.

In [None]:
service_date_datetime = pd.to_datetime("2024-11-13T00:00:00.000000000")

##### Why are time_periods and peak_off_peak different between `df_sched` and `df_rt_sched`
* Something is wrong with `df_sched` because a lot of `time_period` values are missing~
##### Amanda, test: fill in `nans` in `time_period` with `peak_offpeak`
* This might solve why all the routes are missing in Nov/Dec too?

In [None]:
df_sched.time_period = df_sched.time_period.fillna(df_sched.peak_offpeak)

In [None]:
df_rt_sched[["route_id", "time_period", "direction_id"]].drop_duplicates().sort_values(by=["route_id","direction_id"])

In [None]:
 df_sched[["route_id", "time_period", "direction_id"]].drop_duplicates().sort_values(by=["route_id","direction_id"])

In [None]:
df_sched["service_date"] = service_date_datetime

In [None]:
df_rt_sched["service_date"] = service_date_datetime

In [None]:
df_avg_speeds.columns

In [None]:
df_avg_speeds["service_date"] = service_date_datetime

In [None]:
# merge1 = merge_data.merge_data_sources_by_route_direction(
# route_dir_metrics2,
# df_rt_sched,
# df_avg_speeds2,
# df_crosswalk
# )

In [None]:
primary_typology = merge_data.set_primary_typology(route_dir_metrics2)

In [None]:
primary_typology.head(1)

In [None]:
route_time_cols = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
    "time_period",
]

In [None]:
df_schedule2 = pd.merge(df_sched, primary_typology, on=route_time_cols, how="left")

In [None]:
df_schedule2.info()

In [None]:
df_schedule2.head(3).drop(
    columns=["geometry", "schedule_gtfs_dataset_key", "common_shape_id"]
)

In [None]:
route_time_cols

In [None]:
df = pd.merge(
    df_schedule2,
    df_rt_sched,
    on=route_time_cols + ["service_date"],
    how="outer",
    indicator="sched_rt_category",
).merge(
        df_avg_speeds,
        on = route_time_cols + ["service_date"],
        how = "outer",
    )

##### Check that all the routes are here.

In [None]:
df.route_id.unique()

In [None]:
df2 = df.assign(
    sched_rt_category=df.sched_rt_category.map(
        gtfs_schedule_wrangling.sched_rt_category_dict
    )
)

In [None]:
df3 = df2.pipe(
    merge_data.merge_in_standardized_route_names,
)

###### Extra columns are popping up?? Detailed below. 

In [None]:
drop_cols = [
    "schedule_source_record_id",
    "base64_url",
    "organization_source_record_id",
    "organization_name",
    "caltrans_district",
]

In [None]:
df3.columns

In [None]:
df4 = pd.merge(
    df3.drop(columns=drop_cols),
    df_crosswalk,
    on=["schedule_gtfs_dataset_key", "name", "service_date"],
    how="left",
)

In [None]:
df4.columns

In [None]:
df4.direction_id = df4.direction_id.fillna(0)

##### Amanda, testing to see if filling direction_id with 0 will do anything.

In [None]:
df4[
    ["route_id", "direction_id", "time_period", "route_primary_direction"]
].drop_duplicates().sort_values(by=["route_id"])

In [None]:
df5 = df4.pipe(
    # Find the most common cardinal direction
    gtfs_schedule_wrangling.top_cardinal_direction
)

In [None]:
preview_cols = [
    "organization_name",
    "route_id",
    "sched_rt_category",
    "route_name",
    "direction_id",
    "route_primary_direction",
    "avg_scheduled_service_minutes",
    "avg_stop_miles",
    "n_trips",
    "peak_offpeak",
    "frequency",
    "typology",
    "minutes_atleast1_vp",
    "minutes_atleast2_vp",
    "total_rt_service_minutes",
    "total_scheduled_service_minutes",
    "total_vp",
    "vp_in_shape",
    "is_early",
    "is_ontime",
    "is_late",
    "n_vp_trips",
    "vp_per_minute",
    "pct_in_shape",
    "pct_rt_journey_atleast1_vp",
    "pct_rt_journey_atleast2_vp",
    "pct_sched_journey_atleast1_vp",
    "pct_sched_journey_atleast2_vp",
    "rt_sched_journey_ratio",
    "avg_rt_service_minutes",
    "speed_mph",
]

#### Observations
* There are no typologies for these previously missing routes.
* `Route_primary_direction` and `direction_id` is empty for all of City of Santa Maria 
* `route_ids` are repeated...somehow messed up during merges.
* I have an extra column for `peak_offpeak ` and `time_period`

In [None]:
df5.info()

In [None]:
df[["time_period", "route_id"]].drop_duplicates().sort_values(by=["route_id"])

In [None]:
df5.loc[df5.time_period == "all_day"][preview_cols].sort_values(
    by=["organization_name", "route_id"]
)

In [None]:
stop

### Fix `ROUTE_TYPOLOGIES` in `gtfs_funnel/route_typologies.py`

In [None]:
ROUTE_TYPOLOGIES

In [None]:
GTFS_DATA_DICT.schedule_tables.route_typologies

In [None]:
GTFS_DATA_DICT.schedule_tables.route_typologies

In [None]:
route_typologies2 = route_typologies.loc[
    route_typologies.schedule_gtfs_dataset_key.isin(schd_keys)
]

In [None]:
route_typologies2

In [None]:
route_dir_cols = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
    "common_shape_id",
    "route_name",
    "route_meters",
]

##### Amanda: in `rt_segment_speeds/segment_speed_utils/gtfs_schedule_wrangling`, I filled `nan` rows in `direction_id`. Then I commented out parts of `gtfs_funnel/route_typologies`

In [None]:
common_shape = gtfs_schedule_wrangling.most_common_shape_by_route_direction(
    one_analysis_date
)

In [None]:
common_shape2 = common_shape.loc[common_shape.schedule_gtfs_dataset_key.isin(schd_keys)]

In [None]:
nov_typology_ah_test_df = pd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/gtfs_schedule/nacto_typologies/route_typologies_AH_TESTING_2024-11-13.parquet"
)

In [None]:
nov_typology_ah_test_df.loc[
    nov_typology_ah_test_df.schedule_gtfs_dataset_key.isin(schd_keys)
]

### Fix Map: `gtfs_digest/merge_operator_data`

In [None]:
OPERATOR_FILE = GTFS_DATA_DICT.digest_tables.operator_profiles
OPERATOR_ROUTE = GTFS_DATA_DICT.digest_tables.operator_routes_map

In [None]:
operator_route_gdf = gpd.read_parquet(
    f"{RT_SCHED_GCS}{OPERATOR_ROUTE}.parquet",
)

In [None]:
operator_route_gdf.columns

In [None]:
operator_route_gdf2.columns

In [None]:
len(operator_route_gdf2)

In [None]:
operator_route_gdf2.is_rail.value_counts()

In [None]:
operator_route_gdf2.organization_name.value_counts()

In [None]:
operator_route_gdf2.schedule_gtfs_dataset_key.unique()

#### Why does City of Santa Maria have multiple schedule_gtfs_dataset_keys?

In [None]:
operator_route_gdf2.groupby(["organization_name", "schedule_gtfs_dataset_key"]).agg(
    {"route_short_name": "nunique"}
)

In [None]:
operator_route_gdf2.drop(columns=["service_date"]).explore("organization_name")

In [None]:
# operator_route_gdf2.drop(columns = ["service_date"]).explore("shape_array_key")

#### Starting from here [`gtfs_funnel/operator_scheduled_stats`](https://github.com/cal-itp/data-analyses/blob/4dc340343a60b45ad94217c3efd91f807b03ebc2/gtfs_funnel/operator_scheduled_stats.py#L148)

In [None]:
analysis_date = "2024-11-13"

In [None]:
schd_keys = list(operator_route_gdf2.schedule_gtfs_dataset_key.unique())

#### Longest shape does have all the routes...

In [None]:
longest_shape_gdf = operator_scheduled_stats.longest_shape_by_route(analysis_date)

In [None]:
longest_shape_gdf2 = longest_shape_gdf.loc[
    longest_shape_gdf.schedule_gtfs_dataset_key.isin(schd_keys)
]

In [None]:
longest_shape_gdf2.columns

In [None]:
longest_shape_gdf2.info()

In [None]:
longest_shape_gdf2.route_id.value_counts()

In [None]:
# longest_shape_gdf2.explore("schedule_gtfs_dataset_key")

In [None]:
longest_shape_gdf2.groupby(["schedule_gtfs_dataset_key", "route_id"]).agg(
    {"route_length_miles": "max"}
)

#### Somewhere along the way the routes are cut...maybe b/c of `direction_id`

In [None]:
OPERATOR_EXPORT = GTFS_DATA_DICT.schedule_tables.operator_scheduled_stats

In [None]:
SCHED_GCS

In [None]:
GTFS_DATA_DICT.schedule_tables.operator_routes

In [None]:
dec_url = "gs://calitp-analytics-data/data-analyses/gtfs_schedule/operator_profiles/operator_routes_2024-12-11.parquet"

In [None]:
dec_df = gpd.read_parquet(dec_url)

In [None]:
dec_df.organization_name.value_counts().head()

In [None]:
dec_df.loc[
    dec_df.organization_name == "Alameda-Contra Costa Transit District"
].head().drop(columns=["geometry"]).T

In [None]:
dec_df2 = dec_df.loc[dec_df.schedule_gtfs_dataset_key.isin(schd_keys)]

In [None]:
dec_df2.shape

In [None]:
type(dec_df2)

In [None]:
dec_df2.drop(columns=["geometry"]).T

In [None]:
# dec_df2.explore()

#### Find where in `gtfs_funnel` all the routes disappear

In [None]:
group_cols = ["schedule_gtfs_dataset_key"]

In [None]:
longest_shape_gdf2.info()

#### something is going on in `operator_scheduled_stats.schedule_stats_by_operator`

In [None]:
ROUTE_TYPOLOGY = GTFS_DATA_DICT.schedule_tables.route_typologies

In [None]:
route_typology = pd.read_parquet(f"{SCHED_GCS}{ROUTE_TYPOLOGY}_{analysis_date}.parquet")

In [None]:
from route_typologies import route_typologies

In [None]:
route_typology_grouped = (
    route_typology.groupby(["schedule_gtfs_dataset_key", "route_id"])
    .agg({**{f"is_{c}": "sum" for c in route_typologies}})
    .reset_index()
)

In [None]:
route_typology_grouped2 = route_typology_grouped.loc[
    route_typology_grouped.schedule_gtfs_dataset_key.isin(schd_keys)
]

#### Routes are missing for Santa Maria and Capitol Corridor in `ROUTE_TYPOLOGY`

In [None]:
route_typology_grouped2.T

In [None]:
route_gdf = longest_shape_gdf2.merge(
    route_typology_grouped2, on=["schedule_gtfs_dataset_key", "route_id"], how="outer"
)

In [None]:
route_gdf.shape

In [None]:
route_gdf.drop(columns=["geometry"])

In [None]:
# route_gdf2.explore("schedule_gtfs_dataset_key")

#### Change merge from `inner` to `left`

In [None]:
f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_routes_map}.parquet"

In [None]:
SCHED_GCS

In [None]:
GTFS_DATA_DICT.schedule_tables.operator_routes

In [None]:
my_test_url = "gs://calitp-analytics-data/data-analyses/gtfs_schedule/operator_profiles/operator_routes_2024-12-11_AH.parquet"

In [None]:
f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_profiles}.parquet"

In [None]:
test_gdf = gpd.read_parquet(my_test_url)

In [None]:
test_gdf2 = test_gdf.loc[test_gdf.schedule_gtfs_dataset_key.isin(schd_keys)]

In [None]:
test_gdf2.explore("route_id")

#### Test with all the dates.

In [None]:
GTFS_DATA_DICT.schedule_tables.operator_routes

In [None]:
RT_SCHED_GCS

In [None]:
f"{OPERATOR_ROUTE}_AH_test"

In [None]:
f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_routes_map}.parquet"

In [None]:
test_df = gpd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/digest/operator_routes_AH_test.parquet"
)

In [None]:
test_df.columns

In [None]:
op_routes_gdf = test_df.loc[test_df.organization_name.isin(org_name_lists)]

In [None]:
# Find the most recent geography for each route.
op_routes_gdf = op_routes_gdf.sort_values(by=["service_date"], ascending=False)

# Keep only the most recent row.
op_routes_gdf = op_routes_gdf.drop_duplicates(
    subset=["route_long_name", "route_short_name", "route_combined_name"]
)

# Drop service_dates
op_routes_gdf = op_routes_gdf.drop(columns=["service_date"])

In [None]:
op_routes_gdf.organization_name.value_counts()

In [None]:
op_routes_gdf.loc[op_routes_gdf.organization_name == "City of Santa Maria"].explore(
    "route_long_name"
)