## Find Missing Routes: 2 operators. 
* [Issue](https://github.com/cal-itp/data-analyses/issues/1312): Capital Corridor doesn't have any rail routes. 
* [Most of Santa Maria's routes not showing up in GTFS Digest](https://github.com/cal-itp/data-analyses/issues/1313)
* `cd data-analyses/rt_segment_speeds && pip install -r requirements.txt && cd ../_shared_utils && make setup_env && cd ../gtfs_digest`

To-Do
* Move all the code here to the proper file.
* Rerun all the scripts that create the underlying dataframes for November date (`df_sched`,`df_avg_speeds`,`df_rt_sched`) and merge them using `gtfs_digest/merge_data.merge_data_sources_by_route_direction()`

In [1]:
import geopandas as gpd
import merge_data
import numpy as np
import pandas as pd
from segment_speed_utils import (
    gtfs_schedule_wrangling,
    helpers,
    metrics,
    segment_calcs,
    time_series_utils,
)
from shared_utils import (
    catalog_utils,
    portfolio_utils,
    rt_dates,
    rt_utils,
    time_helpers,
)
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS

In [2]:
from shared_utils.rt_utils import METERS_PER_MILE

In [3]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [4]:
org_name_lists = ["Capitol Corridor Joint Powers Authority", "City of Santa Maria"]

In [5]:
analysis_date_list = ["2024-11-13"]

In [6]:
one_analysis_date = "2024-11-13"

In [7]:
schd_keys = [
    "5a8721fe96786fcd25fba1f8a0ee6358",
    "73105f2d1cabc8170ab066d96863c5d5",
    "f5a749dd65924e025b1293c58f95f8d6",
]

In [8]:
import sys

sys.path.append("../gtfs_funnel/")
import operator_scheduled_stats
import schedule_stats_by_route_direction

In [9]:
def preview(df):
    df2 = df[
        ["schedule_gtfs_dataset_key", "route_id", "direction_id"]
    ].drop_duplicates()
    display(df2)

### Fix `schd_vp_url`

In [10]:
schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"

In [11]:
schd_vp_df = pd.read_parquet(schd_vp_url)

In [12]:
schd_vp_df2 = schd_vp_df.loc[schd_vp_df.organization_name.isin(org_name_lists)]

In [13]:
schd_vp_df2.route_id.unique()

array(['b3848f93-d26b-48a9-b6a6-5de22a4eab47', '5', 'Shuttle', 'CC'],
      dtype=object)

In [14]:
schd_vp_df2.route_id.value_counts()

Shuttle                                 132
CC                                       84
5                                        67
b3848f93-d26b-48a9-b6a6-5de22a4eab47      6
Name: route_id, dtype: int64

In [15]:
schd_vp_df2.time_period.unique()

array(['all_day', 'offpeak', 'peak', None], dtype=object)

###  <font color="red">DONE</font> Check out `rt_segment_speeds/segment_speed_utils/gtfs_schedule_wrangling`
* https://github.com/cal-itp/data-analyses/blob/4dc340343a60b45ad94217c3efd91f807b03ebc2/rt_segment_speeds/segment_speed_utils/gtfs_schedule_wrangling.py 
* Tiffany: <i>Can you try specifying the dropna argument inside pandas groupby? Our pandas version has gone through upgrades, from 0.25 to now 1.5 ), and this argument was introduced in 1.1 and since it's dropna=True, that's probably what's driving the the row behavior.</i>
* It worked! Now time to rerun stuff further down the pipeline and see what happens.

In [16]:
common_shape_test = gtfs_schedule_wrangling.most_common_shape_by_route_direction(
    one_analysis_date
)

In [17]:
common_shape_test2 = common_shape_test.loc[
    common_shape_test.schedule_gtfs_dataset_key.isin(schd_keys)
]

In [18]:
common_shape_test2.route_id.unique()

array(['7', '6', 'CC', '8', 'Mall', '12X', '13X', '11', '30', 'Shuttle',
       '8a7c42f9-51e4-4848-bf88-30c210f149ad', '2', '3', '1B', '20', 'SF',
       '5', '4', '9', '1'], dtype=object)

### Breakdown `gtfs_digest/merge_data.`

#### Line 294:<font color="red">DONE making all the changes to the original files.</font> `df_sched` is already missing a lot of the routes.

In [19]:
# Get cardinal direction for each route
df_sched_og = merge_data.concatenate_schedule_by_route_direction(analysis_date_list)

In [20]:
df_sched2_og = df_sched_og.loc[df_sched_og.schedule_gtfs_dataset_key.isin(schd_keys)]

In [21]:
df_sched2_og.route_id.value_counts()

Shuttle    6
5          3
Name: route_id, dtype: int64

##### Go back to `gtfs_funnel/schedule_stats_by_route_direction`
* https://github.com/cal-itp/data-analyses/blob/1ba0f544a01f99966a6e210dd11666b4fe4a146e/gtfs_funnel/schedule_stats_by_route_direction.py#L190
* **Filled in `direction_id` with 0**

##### `assemble_scheduled_trip_metrics`: nothing is missing but `direction_id` is missing a lot of values.

In [22]:
trip_metrics = schedule_stats_by_route_direction.assemble_scheduled_trip_metrics(
    one_analysis_date, GTFS_DATA_DICT
)

In [23]:
trip_metrics2 = trip_metrics.loc[trip_metrics.schedule_gtfs_dataset_key.isin(schd_keys)]

In [24]:
trip_metrics2.columns

Index(['schedule_gtfs_dataset_key', 'trip_instance_key', 'median_stop_meters',
       'time_of_day', 'scheduled_service_minutes', 'route_id', 'direction_id'],
      dtype='object')

In [25]:
trip_metrics2.direction_id = trip_metrics2.direction_id.fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trip_metrics2.direction_id = trip_metrics2.direction_id.fillna(0)


In [26]:
trip_metrics2.shape

(335, 7)

In [27]:
trip_metrics2.time_of_day.unique()

array(['PM Peak', 'Midday', 'AM Peak', 'Early AM', 'Evening'],
      dtype=object)

##### Each row is populated.

In [28]:
trip_metrics2.loc[trip_metrics2.time_of_day == "Midday"].drop_duplicates(
    subset=["schedule_gtfs_dataset_key", "route_id", "direction_id"]
).drop(columns=["schedule_gtfs_dataset_key", "trip_instance_key"])

Unnamed: 0,median_stop_meters,time_of_day,scheduled_service_minutes,route_id,direction_id
49725,405.04,Midday,35.0,3,0.0
49729,178.05,Midday,14.98,Mall,0.0
49731,451.15,Midday,41.0,11,0.0
49736,361.12,Midday,30.0,1B,0.0
49737,357.22,Midday,30.37,1,0.0
49738,444.75,Midday,40.0,9,0.0
49741,440.62,Midday,41.0,4,0.0
49742,989.61,Midday,56.0,12X,0.0
49744,437.51,Midday,42.0,5,0.0
49754,477.41,Midday,53.0,13X,0.0


In [29]:
preview(trip_metrics2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id
49724,73105f2d1cabc8170ab066d96863c5d5,30,0.0
49725,73105f2d1cabc8170ab066d96863c5d5,3,0.0
49727,73105f2d1cabc8170ab066d96863c5d5,20,0.0
49728,73105f2d1cabc8170ab066d96863c5d5,4,0.0
49729,73105f2d1cabc8170ab066d96863c5d5,Mall,0.0
49730,73105f2d1cabc8170ab066d96863c5d5,5,0.0
49731,73105f2d1cabc8170ab066d96863c5d5,11,0.0
49732,73105f2d1cabc8170ab066d96863c5d5,7,0.0
49733,73105f2d1cabc8170ab066d96863c5d5,9,0.0
49735,73105f2d1cabc8170ab066d96863c5d5,1,0.0


In [30]:
trip_metrics2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 335 entries, 49724 to 113340
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   schedule_gtfs_dataset_key  335 non-null    object 
 1   trip_instance_key          335 non-null    object 
 2   median_stop_meters         335 non-null    float64
 3   time_of_day                335 non-null    object 
 4   scheduled_service_minutes  335 non-null    float64
 5   route_id                   335 non-null    object 
 6   direction_id               335 non-null    float64
dtypes: float64(3), object(4)
memory usage: 20.9+ KB


##### <font color="red">DONE</font>`gtfs_funnel/schedule_stats_by_route_direction/schedule_metrics_by_route_direction` 
* **updated to `dropna=False` and also filled in `time_period` with `peak_offpeak`**

In [31]:
def schedule_metrics_by_route_direction(
    df: pd.DataFrame,
    analysis_date: str,
    group_merge_cols: list,
) -> pd.DataFrame:
    """
    Aggregate trip-level metrics to route-direction, and
    attach shape geometry for common_shape_id.
    """
    service_freq_df = gtfs_schedule_wrangling.aggregate_time_of_day_to_peak_offpeak(
        df, group_merge_cols, long_or_wide="long"
    )

    metrics_df = (
        df.groupby(group_merge_cols, observed=True, group_keys=False, dropna=False)
        .agg(
            {
                "median_stop_meters": "mean",
                # take mean of the median stop spacing for trip
                # does this make sense?
                # median is the single boiled down metric at the trip-level
                "scheduled_service_minutes": "mean",
            }
        )
        .reset_index()
        .rename(
            columns={
                "median_stop_meters": "avg_stop_meters",
                "scheduled_service_minutes": "avg_scheduled_service_minutes",
            }
        )
    )

    metrics_df = metrics_df.assign(
        avg_stop_miles=metrics_df.avg_stop_meters.divide(METERS_PER_MILE).round(2)
    ).drop(columns=["avg_stop_meters"])

    round_me = ["avg_stop_miles", "avg_scheduled_service_minutes"]
    metrics_df[round_me] = metrics_df[round_me].round(2)

    common_shape = gtfs_schedule_wrangling.most_common_shape_by_route_direction(
        analysis_date
    ).pipe(helpers.remove_shapes_outside_ca)

    df = pd.merge(common_shape, metrics_df, on=group_merge_cols, how="inner").merge(
        service_freq_df, on=group_merge_cols, how="inner"
    )

    df.time_period = df.time_period.fillna(df.peak_offpeak)
    return df

In [32]:
route_group_merge_cols = ["schedule_gtfs_dataset_key", "route_id", "direction_id"]

In [33]:
route_dir_metrics = schedule_metrics_by_route_direction(
    trip_metrics2, one_analysis_date, route_group_merge_cols
)

In [34]:
preview(route_dir_metrics)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id
0,73105f2d1cabc8170ab066d96863c5d5,7,0.0
3,73105f2d1cabc8170ab066d96863c5d5,6,0.0
6,73105f2d1cabc8170ab066d96863c5d5,8,0.0
9,73105f2d1cabc8170ab066d96863c5d5,Mall,0.0
12,73105f2d1cabc8170ab066d96863c5d5,12X,0.0
15,73105f2d1cabc8170ab066d96863c5d5,13X,0.0
18,73105f2d1cabc8170ab066d96863c5d5,11,0.0
21,73105f2d1cabc8170ab066d96863c5d5,30,0.0
24,f5a749dd65924e025b1293c58f95f8d6,Shuttle,1.0
27,f5a749dd65924e025b1293c58f95f8d6,Shuttle,0.0


In [35]:
route_dir_metrics.drop(
    columns=[
        "geometry",
        "schedule_gtfs_dataset_key",
        "common_shape_id",
    ]
)

Unnamed: 0,route_id,direction_id,route_name,avg_scheduled_service_minutes,avg_stop_miles,n_trips,time_period,peak_offpeak,frequency
0,7,0.0,"Rt 7. A. H. College, Crossroads Shopping Center via Boone St, Bradley Rd.",36.0,0.37,19,all_day,,0.79
1,7,0.0,"Rt 7. A. H. College, Crossroads Shopping Center via Boone St, Bradley Rd.",36.0,0.37,9,offpeak,offpeak,0.38
2,7,0.0,"Rt 7. A. H. College, Crossroads Shopping Center via Boone St, Bradley Rd.",36.0,0.37,10,peak,peak,0.42
3,6,0.0,Rt 6. Oak Knolls to Old Orcutt-East to West-Outbound,38.0,0.38,18,all_day,,0.75
4,6,0.0,Rt 6. Oak Knolls to Old Orcutt-East to West-Outbound,38.0,0.38,7,offpeak,offpeak,0.29
5,6,0.0,Rt 6. Oak Knolls to Old Orcutt-East to West-Outbound,38.0,0.38,11,peak,peak,0.46
6,8,0.0,Rt 8. Tanglewood to Crossroads Shopping Center via McCoy Ln.,43.0,0.32,16,all_day,,0.67
7,8,0.0,Rt 8. Tanglewood to Crossroads Shopping Center via McCoy Ln.,43.0,0.32,8,offpeak,offpeak,0.33
8,8,0.0,Rt 8. Tanglewood to Crossroads Shopping Center via McCoy Ln.,43.0,0.32,8,peak,peak,0.33
9,Mall,0.0,Mall Shuttle,14.98,0.11,28,all_day,,1.17


#####  Still in `rt_segment_speeds/segment_speed_utils/gtfs_schedule_wrangling` 
* **Updated `dropna=False` in `groupby`**
* **Filled in `time_period` with `peak_offpeak`**

##### In `if __name__ == "__main__"` in `gtfs_funnel/schedule_stats_by_route`

In [36]:
ROUTE_TYPOLOGIES = GTFS_DATA_DICT.schedule_tables.route_typologies
route_typologies = pd.read_parquet(
    f"{SCHED_GCS}{ROUTE_TYPOLOGIES}_{one_analysis_date}.parquet",
    columns=route_group_merge_cols
    + [
        "is_coverage",
        "is_downtown_local",
        "is_local",
        "is_rapid",
        "is_express",
        "is_rail",
    ],
)

In [37]:
route_typologies.loc[route_typologies.schedule_gtfs_dataset_key.isin(schd_keys)]

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,is_coverage,is_downtown_local,is_local,is_rapid,is_express,is_rail
1416,f5a749dd65924e025b1293c58f95f8d6,Shuttle,1.0,1,0,0,0,0,0
3572,73105f2d1cabc8170ab066d96863c5d5,13X,0.0,1,0,0,1,0,0
3573,73105f2d1cabc8170ab066d96863c5d5,20,0.0,1,0,0,0,0,0
3574,73105f2d1cabc8170ab066d96863c5d5,12X,0.0,1,0,0,1,1,0
3575,73105f2d1cabc8170ab066d96863c5d5,30,0.0,1,0,0,1,0,0
3576,73105f2d1cabc8170ab066d96863c5d5,2,0.0,0,1,0,1,0,0
3577,73105f2d1cabc8170ab066d96863c5d5,1,0.0,1,0,0,1,0,0
3578,73105f2d1cabc8170ab066d96863c5d5,1B,0.0,1,0,0,1,0,0
3579,73105f2d1cabc8170ab066d96863c5d5,4,0.0,1,0,0,1,0,0
3580,73105f2d1cabc8170ab066d96863c5d5,7,0.0,1,0,0,1,0,0


##### `cardinal_direction_for_route_direction` also gets rid of a lot of stuff -> Fix this

In [38]:
STOP_TIMES_FILE = GTFS_DATA_DICT.rt_vs_schedule_tables.stop_times_direction

In [39]:
stop_times_df = pd.read_parquet(
    f"{RT_SCHED_GCS}{STOP_TIMES_FILE}_{one_analysis_date}.parquet",
    filters=[[("stop_primary_direction", "!=", "Unknown")]],
)

In [40]:
stop_times_df2 = stop_times_df.loc[
    stop_times_df.schedule_gtfs_dataset_key.isin(schd_keys)
]

In [41]:
trip_scheduled_col = [
    "route_id",
    "trip_instance_key",
    "gtfs_dataset_key",
    "shape_array_key",
    "direction_id",
    "route_long_name",
    "route_short_name",
    "route_desc",
    "name",
]

trips_df = helpers.import_scheduled_trips(
    one_analysis_date, columns=trip_scheduled_col, get_pandas=True
)

In [42]:
merge_cols = ["trip_instance_key", "schedule_gtfs_dataset_key", "shape_array_key"]

In [43]:
stop_times_with_trip = pd.merge(stop_times_df2, trips_df, on=merge_cols)

##### Fill in `direction_id`

In [44]:
stop_times_with_trip.direction_id = stop_times_with_trip.direction_id.fillna(0)

In [45]:
main_cols = ["route_id", "schedule_gtfs_dataset_key", "direction_id"]

##### <font color="red">Done</font> Changing dropna=False here too

In [46]:
agg1 = (
    stop_times_with_trip.groupby(main_cols + ["stop_primary_direction"], dropna=False)
    .agg({"stop_sequence": "count"})
    .reset_index()
    .rename(columns={"stop_sequence": "total_stops"})
)

In [47]:
agg2 = agg1.sort_values(
    by=main_cols + ["total_stops"],
    ascending=[True, True, True, False],
)

##### There are values for `route_primary_direction` but because `direction_id` is missing, it goes away? 
* AH: testing to see if filling `direction_id` with something will change things.

In [48]:
cardinal_dir_df = (
    agg2.drop_duplicates(subset=main_cols)
    .reset_index(drop=True)
    .drop(columns=["total_stops"])
    .rename(columns={"stop_primary_direction": "route_primary_direction"})
)

In [49]:
cardinal_dir_df

Unnamed: 0,route_id,schedule_gtfs_dataset_key,direction_id,route_primary_direction
0,1,73105f2d1cabc8170ab066d96863c5d5,0.0,Northbound
1,11,73105f2d1cabc8170ab066d96863c5d5,0.0,Northbound
2,12X,73105f2d1cabc8170ab066d96863c5d5,0.0,Northbound
3,13X,73105f2d1cabc8170ab066d96863c5d5,0.0,Westbound
4,1B,73105f2d1cabc8170ab066d96863c5d5,0.0,Northbound
5,2,73105f2d1cabc8170ab066d96863c5d5,0.0,Westbound
6,20,73105f2d1cabc8170ab066d96863c5d5,0.0,Eastbound
7,3,73105f2d1cabc8170ab066d96863c5d5,0.0,Eastbound
8,30,73105f2d1cabc8170ab066d96863c5d5,0.0,Southbound
9,4,73105f2d1cabc8170ab066d96863c5d5,0.0,Southbound


##### Continuing back to `if __name__ == "__main__"` portion of `gtfs_funnel/schedule_stats_by_route`

In [50]:
route_typologies.head(1)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,is_coverage,is_downtown_local,is_local,is_rapid,is_express,is_rail
0,1770249a5a2e770ca90628434d4934b1,3407,0.0,1,0,0,1,0,0


In [51]:
route_group_merge_cols

['schedule_gtfs_dataset_key', 'route_id', 'direction_id']

In [52]:
preview(route_dir_metrics)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id
0,73105f2d1cabc8170ab066d96863c5d5,7,0.0
3,73105f2d1cabc8170ab066d96863c5d5,6,0.0
6,73105f2d1cabc8170ab066d96863c5d5,8,0.0
9,73105f2d1cabc8170ab066d96863c5d5,Mall,0.0
12,73105f2d1cabc8170ab066d96863c5d5,12X,0.0
15,73105f2d1cabc8170ab066d96863c5d5,13X,0.0
18,73105f2d1cabc8170ab066d96863c5d5,11,0.0
21,73105f2d1cabc8170ab066d96863c5d5,30,0.0
24,f5a749dd65924e025b1293c58f95f8d6,Shuttle,1.0
27,f5a749dd65924e025b1293c58f95f8d6,Shuttle,0.0


In [53]:
route_dir_metrics2 = pd.merge(
    route_dir_metrics, route_typologies, on=route_group_merge_cols, how="left"
).merge(cardinal_dir_df, on=route_group_merge_cols, how="left")

In [54]:
route_dir_metrics2.route_id.unique()

array(['7', '6', '8', 'Mall', '12X', '13X', '11', '30', 'Shuttle',
       '8a7c42f9-51e4-4848-bf88-30c210f149ad', '2', '3', '1B', '20', '5',
       '4', '9', '1'], dtype=object)

In [55]:
""" route_dir_metrics2.drop(
    columns=[
        "geometry",
        "common_shape_id",
        "geometry",
        "route_name",
        "is_coverage",
        "is_downtown_local",
        "is_local",
        "is_rapid",
        "is_express",
        "is_rail",
        "schedule_gtfs_dataset_key"
    ]
).sort_values(by=["route_id","direction_id"])"""

' route_dir_metrics2.drop(\n    columns=[\n        "geometry",\n        "common_shape_id",\n        "geometry",\n        "route_name",\n        "is_coverage",\n        "is_downtown_local",\n        "is_local",\n        "is_rapid",\n        "is_express",\n        "is_rail",\n        "schedule_gtfs_dataset_key"\n    ]\n).sort_values(by=["route_id","direction_id"])'

##### Double check that the columns are the same.

In [56]:
og_nov_url = "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_route_dir/schedule_route_direction_metrics_2024-11-13.parquet"

In [57]:
df_sched_og = gpd.read_parquet(og_nov_url)

In [58]:
df_sched_og = df_sched_og.loc[df_sched_og.schedule_gtfs_dataset_key.isin(schd_keys)]

In [59]:
df_sched_og[["route_id", "direction_id", "time_period", "peak_offpeak"]].sort_values(
    by=[
        "route_id",
        "direction_id",
    ]
)

Unnamed: 0,route_id,direction_id,time_period,peak_offpeak
2248,5,0.0,all_day,
2249,5,0.0,,offpeak
2250,5,0.0,,peak
1849,Shuttle,0.0,all_day,
1850,Shuttle,0.0,,offpeak
1851,Shuttle,0.0,,peak
1846,Shuttle,1.0,all_day,
1847,Shuttle,1.0,,offpeak
1848,Shuttle,1.0,,peak


In [60]:
route_dir_metrics2[
    ["route_id", "direction_id", "time_period", "peak_offpeak"]
].sort_values(
    by=[
        "route_id",
        "direction_id",
    ]
)

Unnamed: 0,route_id,direction_id,time_period,peak_offpeak
54,1,0.0,all_day,
55,1,0.0,offpeak,offpeak
56,1,0.0,peak,peak
18,11,0.0,all_day,
19,11,0.0,offpeak,offpeak
20,11,0.0,peak,peak
12,12X,0.0,all_day,
13,12X,0.0,offpeak,offpeak
14,12X,0.0,peak,peak
15,13X,0.0,all_day,


In [61]:
route_dir_metrics2.columns == df_sched_og.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

In [62]:
df_sched = route_dir_metrics2.copy()

#### <font color="red">DONE</font> `gtfs_digest/merge_data` line 300 `df_avg_speeds` is also missing a lot of routes.
* [File `rt_segment_speeds/scripts/average_summary_speeds.py`](https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/scripts/average_summary_speeds.py)

In [63]:
import sys

sys.path.append("../rt_segment_speeds/scripts/")
import average_segment_speeds
import average_summary_speeds
from segment_speed_utils import (
    gtfs_schedule_wrangling,
    helpers,
    metrics,
    segment_calcs,
    time_series_utils,
)

In [64]:
df_avg_speeds = merge_data.concatenate_speeds_by_route_direction(analysis_date_list)

In [65]:
df_avg_speeds2 = df_avg_speeds.loc[
    df_avg_speeds.schedule_gtfs_dataset_key.isin(schd_keys)
]

In [66]:
df_avg_speeds2.route_id.value_counts()

5    3
Name: route_id, dtype: int64

##### See what is in `rt_segment_speeds/scripts/average_segment_speeds.concatenate_trip_segment_speeds`

In [67]:
segment_type = "stop_segments"

In [68]:
df = average_segment_speeds.concatenate_trip_segment_speeds(
    analysis_date_list, segment_type
)

concatenated files


##### <font color="red">Done.</font> <b>Amanda: filled in `nans` with 0.</b>

In [69]:
df.direction_id = df.direction_id.fillna(0)

In [70]:
df2 = df.loc[df.schedule_gtfs_dataset_key.isin(schd_keys)]

In [71]:
df2.route_id.unique()

array(['30', '3', '20', '4', '5', '11', '7', '9', '1', '12X', '6', '2',
       '8', '8a7c42f9-51e4-4848-bf88-30c210f149ad', 'CC'], dtype=object)

In [72]:
df2.shape

(3543, 17)

In [73]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3543 entries, 159381 to 2656608
Data columns (total 17 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   schedule_gtfs_dataset_key  3543 non-null   object        
 1   shape_array_key            3543 non-null   object        
 2   shape_id                   3543 non-null   object        
 3   stop_sequence              3543 non-null   int64         
 4   route_id                   3543 non-null   object        
 5   direction_id               3543 non-null   float64       
 6   stop_pair                  3543 non-null   object        
 7   stop_pair_name             3543 non-null   object        
 8   trip_instance_key          3543 non-null   object        
 9   speed_mph                  3543 non-null   float64       
 10  meters_elapsed             3543 non-null   float64       
 11  sec_elapsed                3543 non-null   float64       
 12

In [74]:
df2.time_of_day.unique()

array(['PM Peak', 'Early AM', 'Midday', 'AM Peak', 'Evening'],
      dtype=object)

In [75]:
df2.peak_offpeak.unique()

array(['peak', 'offpeak'], dtype=object)

In [76]:
df2.groupby(["route_id", "direction_id"]).agg({"stop_pair": "nunique"})

Unnamed: 0_level_0,Unnamed: 1_level_0,stop_pair
route_id,direction_id,Unnamed: 2_level_1
1,0.0,23
11,0.0,19
12X,0.0,14
2,0.0,31
20,0.0,7
3,0.0,20
30,0.0,27
4,0.0,25
5,0.0,34
6,0.0,29


In [77]:
df2.head(4).T

Unnamed: 0,159381,159382,159383,159384
schedule_gtfs_dataset_key,73105f2d1cabc8170ab066d96863c5d5,73105f2d1cabc8170ab066d96863c5d5,73105f2d1cabc8170ab066d96863c5d5,73105f2d1cabc8170ab066d96863c5d5
shape_array_key,c6e9cda0db8bf76bc535f590ca1fccb5,c6e9cda0db8bf76bc535f590ca1fccb5,c6e9cda0db8bf76bc535f590ca1fccb5,c6e9cda0db8bf76bc535f590ca1fccb5
shape_id,8746730d-27f9-4fb2-9f52-987afe356929,8746730d-27f9-4fb2-9f52-987afe356929,8746730d-27f9-4fb2-9f52-987afe356929,8746730d-27f9-4fb2-9f52-987afe356929
stop_sequence,2,2,3,3
route_id,30,30,30,30
direction_id,0.00,0.00,0.00,0.00
stop_pair,f09af637-87de-4bdb-bf49-660539686c97__47def414-f158-496a-91cb-5f3fb0aa406c,f09af637-87de-4bdb-bf49-660539686c97__47def414-f158-496a-91cb-5f3fb0aa406c,47def414-f158-496a-91cb-5f3fb0aa406c__a94160c1-bd99-4898-921f-941aa748ce6f,47def414-f158-496a-91cb-5f3fb0aa406c__a94160c1-bd99-4898-921f-941aa748ce6f
stop_pair_name,Broadway at Stowell__Betteravia at Miller (Panda Express),Broadway at Stowell__Betteravia at Miller (Panda Express),Betteravia at Miller (Panda Express)__McCoy at Broadway (Outbound),Betteravia at Miller (Panda Express)__McCoy at Broadway (Outbound)
trip_instance_key,005bb393ed8b22ca4d8e7cc8d7895231,217b90defbc6c69f05e19d16e96d1e3f,005bb393ed8b22ca4d8e7cc8d7895231,217b90defbc6c69f05e19d16e96d1e3f
speed_mph,13.21,13.89,18.88,17.04


##### Now moving onto the function `rt_segment_speeds/scripts/average_segment_speeds/segment_averages()`

In [78]:
dict_inputs = GTFS_DATA_DICT[segment_type]

In [79]:
OPERATOR_COLS = [
    "schedule_gtfs_dataset_key",
]

In [80]:
ROUTE_DIR_COLS = [*dict_inputs["route_dir_cols"]]
STOP_PAIR_COLS = [*dict_inputs["stop_pair_cols"]]

In [81]:
group_cols = OPERATOR_COLS + ROUTE_DIR_COLS + STOP_PAIR_COLS

In [82]:
group_cols

['schedule_gtfs_dataset_key',
 'route_id',
 'direction_id',
 'stop_pair',
 'stop_pair_name']

##### <font color="red">Done.</font> <b>Added `dropna=False` to `rt_segment_speeds/segment_speed_utils/segment_calcs.calculate_avg_speeds`</b>

In [83]:
def calculate_avg_speeds(df: pd.DataFrame, group_cols: list) -> pd.DataFrame:
    """
    Calculate the median, 20th, and 80th percentile speeds
    by groups.
    """
    # pd.groupby and pd.quantile is so slow
    # create our own list of speeds and use np
    df2 = (
        df.groupby(group_cols, observed=True, group_keys=False, dropna=False)
        .agg({"speed_mph": lambda x: sorted(list(x))})
        .reset_index()
        .rename(columns={"speed_mph": "speed_mph_list"})
    )

    df2 = df2.assign(
        p50_mph=df2.apply(lambda x: np.percentile(x.speed_mph_list, q=50), axis=1),
        n_trips=df2.apply(lambda x: len(x.speed_mph_list), axis=1).astype("int16"),
        p20_mph=df2.apply(lambda x: np.percentile(x.speed_mph_list, q=20), axis=1),
        p80_mph=df2.apply(lambda x: np.percentile(x.speed_mph_list, q=80), axis=1),
    )

    stats = df2.drop(columns="speed_mph_list")

    # Clean up for map
    speed_cols = [c for c in stats.columns if "_mph" in c]
    stats[speed_cols] = stats[speed_cols].round(2)

    return stats

In [84]:
avg_speeds = calculate_avg_speeds(
    df2,
    group_cols + ["time_of_day"],
)

In [85]:
avg_speeds.head()

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,stop_pair,stop_pair_name,time_of_day,p50_mph,n_trips,p20_mph,p80_mph
0,73105f2d1cabc8170ab066d96863c5d5,1,0.0,1c236429-e252-40c0-8287-4fe38145f5ae__c65b7144-737c-42ae-87e8-fc676f19c644,Broadway at Hermosa__Broadway at Fesler,AM Peak,32.35,4,10.41,51.44
1,73105f2d1cabc8170ab066d96863c5d5,1,0.0,1c236429-e252-40c0-8287-4fe38145f5ae__c65b7144-737c-42ae-87e8-fc676f19c644,Broadway at Hermosa__Broadway at Fesler,Early AM,20.2,1,20.2,20.2
2,73105f2d1cabc8170ab066d96863c5d5,1,0.0,1c236429-e252-40c0-8287-4fe38145f5ae__c65b7144-737c-42ae-87e8-fc676f19c644,Broadway at Hermosa__Broadway at Fesler,Evening,13.37,1,13.37,13.37
3,73105f2d1cabc8170ab066d96863c5d5,1,0.0,1c236429-e252-40c0-8287-4fe38145f5ae__c65b7144-737c-42ae-87e8-fc676f19c644,Broadway at Hermosa__Broadway at Fesler,Midday,15.96,6,1.0,20.2
4,73105f2d1cabc8170ab066d96863c5d5,1,0.0,1c236429-e252-40c0-8287-4fe38145f5ae__c65b7144-737c-42ae-87e8-fc676f19c644,Broadway at Hermosa__Broadway at Fesler,PM Peak,14.66,5,0.93,16.33


##### Go back to `rt_sgment_speeds/scripts/average_segment_speeds.segment_averages()`

In [86]:
avg_speeds2 = avg_speeds.pipe(
    gtfs_schedule_wrangling.merge_operator_identifiers,
    analysis_date_list,
    columns=average_segment_speeds.CROSSWALK_COLS,
)

In [87]:
avg_speeds2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1120 entries, 0 to 1119
Data columns (total 15 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   schedule_gtfs_dataset_key      1120 non-null   object 
 1   route_id                       1120 non-null   object 
 2   direction_id                   1120 non-null   float64
 3   stop_pair                      1120 non-null   object 
 4   stop_pair_name                 1120 non-null   object 
 5   time_of_day                    1120 non-null   object 
 6   p50_mph                        1120 non-null   float64
 7   n_trips                        1120 non-null   int16  
 8   p20_mph                        1120 non-null   float64
 9   p80_mph                        1120 non-null   float64
 10  name                           1120 non-null   object 
 11  caltrans_district              1120 non-null   object 
 12  organization_source_record_id  1120 non-null   o

In [88]:
preview(avg_speeds2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id
0,73105f2d1cabc8170ab066d96863c5d5,1,0.0
91,73105f2d1cabc8170ab066d96863c5d5,11,0.0
148,73105f2d1cabc8170ab066d96863c5d5,12X,0.0
204,73105f2d1cabc8170ab066d96863c5d5,2,0.0
315,73105f2d1cabc8170ab066d96863c5d5,20,0.0
334,73105f2d1cabc8170ab066d96863c5d5,3,0.0
409,73105f2d1cabc8170ab066d96863c5d5,30,0.0
492,73105f2d1cabc8170ab066d96863c5d5,4,0.0
584,73105f2d1cabc8170ab066d96863c5d5,5,0.0
717,73105f2d1cabc8170ab066d96863c5d5,6,0.0


##### Move onto`rt_segment_speeds/scripts/average_segement_speeds/merge_in_segment_geometry()`
* Original function=only 3 routes showing...Check it out.

In [89]:
avg_speeds_with_geom = average_segment_speeds.merge_in_segment_geometry(
    avg_speeds2, one_analysis_date, segment_type
)

In [90]:
preview(avg_speeds_with_geom)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id
0,73105f2d1cabc8170ab066d96863c5d5,30,0.0
110,73105f2d1cabc8170ab066d96863c5d5,3,0.0
202,73105f2d1cabc8170ab066d96863c5d5,20,0.0
221,73105f2d1cabc8170ab066d96863c5d5,4,0.0
339,73105f2d1cabc8170ab066d96863c5d5,5,0.0
472,73105f2d1cabc8170ab066d96863c5d5,11,0.0
529,73105f2d1cabc8170ab066d96863c5d5,7,0.0
587,73105f2d1cabc8170ab066d96863c5d5,9,0.0
692,73105f2d1cabc8170ab066d96863c5d5,1,0.0
790,73105f2d1cabc8170ab066d96863c5d5,12X,0.0


In [91]:
from calitp_data_analysis.geography_utils import WGS84

##### Down another rabbit hole: this `SEGMENT_FILE` doesn't contain values for direction_id 
* Need to find out whre it's originally made.
* <font color="red">Done</font> **Fill in `direction_id` with 0.**

In [92]:
SEGMENT_FILE = GTFS_DATA_DICT[segment_type].segments_file

segment_geom = gpd.read_parquet(
    f"{SEGMENT_GCS}{SEGMENT_FILE}_{one_analysis_date}.parquet",
).to_crs(WGS84)

In [93]:
segment_geom.direction_id = segment_geom.direction_id.fillna(0)

In [94]:
segment_geom2 = segment_geom.loc[segment_geom.schedule_gtfs_dataset_key.isin(schd_keys)]

In [95]:
preview(segment_geom2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id
181780,73105f2d1cabc8170ab066d96863c5d5,30,0.0
570760,73105f2d1cabc8170ab066d96863c5d5,3,0.0
1613296,73105f2d1cabc8170ab066d96863c5d5,20,0.0
1969198,73105f2d1cabc8170ab066d96863c5d5,4,0.0
2083066,73105f2d1cabc8170ab066d96863c5d5,5,0.0
2112284,73105f2d1cabc8170ab066d96863c5d5,11,0.0
2165911,73105f2d1cabc8170ab066d96863c5d5,7,0.0
2215180,73105f2d1cabc8170ab066d96863c5d5,9,0.0
2386098,73105f2d1cabc8170ab066d96863c5d5,1,0.0
2720537,73105f2d1cabc8170ab066d96863c5d5,12X,0.0


In [96]:
segment_geom2.drop(columns=["geometry"]).head(2)

Unnamed: 0,trip_instance_key,shape_array_key,stop_id1,stop_sequence,stop_id2,segment_id,stop_pair,schedule_gtfs_dataset_key,route_id,direction_id,st_trip_instance_key,segment_uuid
181780,005bb393ed8b22ca4d8e7cc8d7895231,c6e9cda0db8bf76bc535f590ca1fccb5,120f2635-ec31-435e-a089-225b26965f12,1,f09af637-87de-4bdb-bf49-660539686c97,120f2635-ec31-435e-a089-225b26965f12-f09af637-87de-4bdb-bf49-660539686c97-1,120f2635-ec31-435e-a089-225b26965f12__f09af637-87de-4bdb-bf49-660539686c97,73105f2d1cabc8170ab066d96863c5d5,30,0.0,005bb393ed8b22ca4d8e7cc8d7895231,73105f2d1cabc8170ab066d96863c5d5__30__nan__120f2635-ec31-435e-a089-225b26965f12-f09af637-87de-4bdb-bf49-660539686c97-1
181781,005bb393ed8b22ca4d8e7cc8d7895231,c6e9cda0db8bf76bc535f590ca1fccb5,f09af637-87de-4bdb-bf49-660539686c97,2,47def414-f158-496a-91cb-5f3fb0aa406c,f09af637-87de-4bdb-bf49-660539686c97-47def414-f158-496a-91cb-5f3fb0aa406c-1,f09af637-87de-4bdb-bf49-660539686c97__47def414-f158-496a-91cb-5f3fb0aa406c,73105f2d1cabc8170ab066d96863c5d5,30,0.0,005bb393ed8b22ca4d8e7cc8d7895231,73105f2d1cabc8170ab066d96863c5d5__30__nan__f09af637-87de-4bdb-bf49-660539686c97-47def414-f158-496a-91cb-5f3fb0aa406c-1


##### Continue on with the rest of `merge_in_segment_geometry` in `rt_segment_speeds/scripts/average_segment_speeds`

In [97]:
dict_inputs["route_dir_single_segment"]

'rollup_singleday/speeds_route_dir_segments'

In [98]:
geom_file_cols = segment_geom2.columns.tolist()

In [99]:
col_order = [c for c in avg_speeds2.columns]

In [100]:
merge_cols = list(set(col_order).intersection(geom_file_cols))

In [101]:
gdf = (
    pd.merge(
        segment_geom2[merge_cols + ["geometry"]].drop_duplicates(),
        avg_speeds2,
        on=merge_cols,
    )
    .reset_index(drop=True)
    .reindex(columns=col_order + ["geometry"])
)

In [102]:
gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1266 entries, 0 to 1265
Data columns (total 16 columns):
 #   Column                         Non-Null Count  Dtype   
---  ------                         --------------  -----   
 0   schedule_gtfs_dataset_key      1266 non-null   object  
 1   route_id                       1266 non-null   object  
 2   direction_id                   1266 non-null   float64 
 3   stop_pair                      1266 non-null   object  
 4   stop_pair_name                 1266 non-null   object  
 5   time_of_day                    1266 non-null   object  
 6   p50_mph                        1266 non-null   float64 
 7   n_trips                        1266 non-null   int16   
 8   p20_mph                        1266 non-null   float64 
 9   p80_mph                        1266 non-null   float64 
 10  name                           1266 non-null   object  
 11  caltrans_district              1266 non-null   object  
 12  organization_source_record

In [103]:
gdf.drop(
    columns=[
        "geometry",
        "organization_source_record_id",
        "organization_name",
        "base64_url",
    ]
).sample()

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,stop_pair,stop_pair_name,time_of_day,p50_mph,n_trips,p20_mph,p80_mph,name,caltrans_district
1162,73105f2d1cabc8170ab066d96863c5d5,8,0.0,712b4000-441b-4b64-8a8e-36ec38bbbce1__ae050555-4c98-44e7-ad1a-d536b91d2012,Carmen ln at Trinity (Wesgate)(Outbound)__Carmen Ln at Carmelia Ln.,AM Peak,24.39,3,20.99,31.3,Santa Maria Schedule,05 - San Luis Obispo


##### `rt_segment_speeds/scripts/average_segment_speeds` gives me the speeds by stop for a route. However, in `gtfss_digest/merge_data`, we want the speeds for the entire route from `average_segment_speeds` is summarized in `rt_segment_speeds/scripts/average_summary_speeds`

In [104]:
dict_inputs["route_dir_single_segment"]

'rollup_singleday/speeds_route_dir_segments'

##### **This file below is used in `gtfs_digest/merge_data`. Need to breakout `average_summary_speeds`**
* gs://calitp-analytics-data/data-analyses/rt_segment_speeds/ and rollup_singleday/speeds_route_dir_AH_TEST_2024-11-13

In [105]:
GTFS_DATA_DICT.rt_stop_times.route_dir_single_summary

'rollup_singleday/speeds_route_dir'

In [106]:
dict_inputs = GTFS_DATA_DICT[segment_type]

In [107]:
avg_summary_speeds_url = "gs://calitp-analytics-data/data-analyses/rt_segment_speeds/rollup_singleday/speeds_route_dir_AH_TEST_2024-11-13.parquet"

In [108]:
avg_summary_speeds_df = gpd.read_parquet(avg_summary_speeds_url)

##### Only one route is showing!

In [109]:
avg_summary_speeds_df2 = avg_summary_speeds_df.loc[
    avg_summary_speeds_df.schedule_gtfs_dataset_key.isin(schd_keys)
]

In [110]:
preview(avg_summary_speeds_df2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id
1002,73105f2d1cabc8170ab066d96863c5d5,5,0.0


In [111]:
common_shape_geom = gtfs_schedule_wrangling.most_common_shape_by_route_direction(
    one_analysis_date
).to_crs(WGS84)

In [112]:
common_shape_geom.columns

Index(['geometry', 'schedule_gtfs_dataset_key', 'route_id', 'direction_id',
       'common_shape_id', 'route_name'],
      dtype='object')

In [113]:
common_shape_geom2 = common_shape_geom.loc[
    common_shape_geom.schedule_gtfs_dataset_key.isin(schd_keys)
]

In [114]:
common_shape_geom2.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 23 entries, 167 to 1098
Data columns (total 6 columns):
 #   Column                     Non-Null Count  Dtype   
---  ------                     --------------  -----   
 0   geometry                   23 non-null     geometry
 1   schedule_gtfs_dataset_key  23 non-null     object  
 2   route_id                   23 non-null     object  
 3   direction_id               23 non-null     float64 
 4   common_shape_id            23 non-null     object  
 5   route_name                 23 non-null     object  
dtypes: float64(1), geometry(1), object(4)
memory usage: 1.3+ KB


##### <font color="red">DONE.</font> This `concatenate_trip_segment_speeds` is from `rt_segment_speeds/scripts/average_segment_speeds`

In [115]:
df = average_summary_speeds.concatenate_trip_segment_speeds(
    analysis_date_list, segment_type
)

concatenated files


In [116]:
df2 = df.loc[df.schedule_gtfs_dataset_key.isin(schd_keys)]

##### <font color="red">DONE</font> **Filled in `direction_id` with 0. Should actually go back to `average_summary_speeds.concatenate_trip_segment_speeds` and fill it in there**

In [117]:
df2.direction_id = df2.direction_id.fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2.direction_id = df2.direction_id.fillna(0)


In [118]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3543 entries, 159381 to 2656608
Data columns (total 17 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   schedule_gtfs_dataset_key  3543 non-null   object        
 1   shape_array_key            3543 non-null   object        
 2   shape_id                   3543 non-null   object        
 3   stop_sequence              3543 non-null   int64         
 4   route_id                   3543 non-null   object        
 5   direction_id               3543 non-null   float64       
 6   stop_pair                  3543 non-null   object        
 7   stop_pair_name             3543 non-null   object        
 8   trip_instance_key          3543 non-null   object        
 9   speed_mph                  3543 non-null   float64       
 10  meters_elapsed             3543 non-null   float64       
 11  sec_elapsed                3543 non-null   float64       
 12

In [119]:
preview(df2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id
159381,73105f2d1cabc8170ab066d96863c5d5,30,0.0
472131,73105f2d1cabc8170ab066d96863c5d5,3,0.0
1320980,73105f2d1cabc8170ab066d96863c5d5,20,0.0
1627284,73105f2d1cabc8170ab066d96863c5d5,4,0.0
1727996,73105f2d1cabc8170ab066d96863c5d5,5,0.0
1754122,73105f2d1cabc8170ab066d96863c5d5,11,0.0
1801423,73105f2d1cabc8170ab066d96863c5d5,7,0.0
1838091,73105f2d1cabc8170ab066d96863c5d5,9,0.0
1986825,73105f2d1cabc8170ab066d96863c5d5,1,0.0
2277584,73105f2d1cabc8170ab066d96863c5d5,12X,0.0


##### Continuing on with `average_summary_speeds`

In [120]:
trip_group_cols = OPERATOR_COLS + ROUTE_DIR_COLS

In [121]:
trip_avg = (
    metrics.weighted_average_speeds_across_segments(
        df2,
        trip_group_cols + ["peak_offpeak"],
    )
    .pipe(
        gtfs_schedule_wrangling.merge_operator_identifiers,
        analysis_date_list,
        columns=average_segment_speeds.CROSSWALK_COLS,
    )
    .reset_index(drop=True)
)

In [122]:
trip_avg.head(1)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,peak_offpeak,meters_elapsed,sec_elapsed,speed_mph,name,caltrans_district,organization_source_record_id,organization_name,base64_url
0,73105f2d1cabc8170ab066d96863c5d5,1,0.0,offpeak,355890.88,60001.0,13.27,Santa Maria Schedule,05 - San Luis Obispo,rec9zGMJgNnes75K1,City of Santa Maria,aHR0cHM6Ly9zbXJ0LnRyaXBzaG90LmNvbS92MS9ndGZzLnppcD9yZWdpb25JZD1DQTU1OEREQy1EN0YyLTRCNDgtOUNBQy1ERUVBMTEzNEY4MjA=


In [123]:
trip_avg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   schedule_gtfs_dataset_key      30 non-null     object 
 1   route_id                       30 non-null     object 
 2   direction_id                   30 non-null     float64
 3   peak_offpeak                   30 non-null     object 
 4   meters_elapsed                 30 non-null     float64
 5   sec_elapsed                    30 non-null     float64
 6   speed_mph                      30 non-null     float64
 7   name                           30 non-null     object 
 8   caltrans_district              30 non-null     object 
 9   organization_source_record_id  30 non-null     object 
 10  organization_name              30 non-null     object 
 11  base64_url                     30 non-null     object 
dtypes: float64(4), object(8)
memory usage: 2.9+ KB


In [124]:
preview(trip_avg)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id
0,73105f2d1cabc8170ab066d96863c5d5,1,0.0
2,73105f2d1cabc8170ab066d96863c5d5,11,0.0
4,73105f2d1cabc8170ab066d96863c5d5,12X,0.0
6,73105f2d1cabc8170ab066d96863c5d5,2,0.0
8,73105f2d1cabc8170ab066d96863c5d5,20,0.0
10,73105f2d1cabc8170ab066d96863c5d5,3,0.0
12,73105f2d1cabc8170ab066d96863c5d5,30,0.0
14,73105f2d1cabc8170ab066d96863c5d5,4,0.0
16,73105f2d1cabc8170ab066d96863c5d5,5,0.0
18,73105f2d1cabc8170ab066d96863c5d5,6,0.0


##### Skipping this part because I can't find `MIN_TRIP_SECONDS` and `MAX_TRIP_SECONDS` in `dict_input`

In [None]:
""" trip_avg_filtered = trip_avg[
        (trip_avg.meters_elapsed >= average_summary_speeds.METERS_CUTOFF) & 
        (trip_avg.sec_elapsed >= average_summary_speeds.MIN_TRIP_SECONDS) & 
        (trip_avg.sec_elapsed <= average_summary_speeds.MAX_TRIP_SECONDS)
    ]
    """

In [125]:
group_cols = OPERATOR_COLS + ROUTE_DIR_COLS

In [126]:
avg_speeds = (
    metrics.concatenate_peak_offpeak_allday_averages(
        trip_avg, group_cols, metric_type="summary_speeds"
    )
    .pipe(
        gtfs_schedule_wrangling.merge_operator_identifiers,
        analysis_date_list,
        columns=average_segment_speeds.CROSSWALK_COLS,
    )
    .reset_index(drop=True)
)

In [127]:
preview(avg_speeds)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id
0,73105f2d1cabc8170ab066d96863c5d5,1,0.0
2,73105f2d1cabc8170ab066d96863c5d5,11,0.0
4,73105f2d1cabc8170ab066d96863c5d5,12X,0.0
6,73105f2d1cabc8170ab066d96863c5d5,2,0.0
8,73105f2d1cabc8170ab066d96863c5d5,20,0.0
10,73105f2d1cabc8170ab066d96863c5d5,3,0.0
12,73105f2d1cabc8170ab066d96863c5d5,30,0.0
14,73105f2d1cabc8170ab066d96863c5d5,4,0.0
16,73105f2d1cabc8170ab066d96863c5d5,5,0.0
18,73105f2d1cabc8170ab066d96863c5d5,6,0.0


In [128]:
avg_speeds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46 entries, 0 to 45
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   schedule_gtfs_dataset_key      46 non-null     object 
 1   route_id                       46 non-null     object 
 2   direction_id                   46 non-null     float64
 3   time_period                    46 non-null     object 
 4   meters_elapsed                 46 non-null     float64
 5   sec_elapsed                    46 non-null     float64
 6   speed_mph                      46 non-null     float64
 7   name                           46 non-null     object 
 8   caltrans_district              46 non-null     object 
 9   organization_source_record_id  46 non-null     object 
 10  organization_name              46 non-null     object 
 11  base64_url                     46 non-null     object 
dtypes: float64(4), object(8)
memory usage: 4.4+ KB


In [129]:
avg_speeds_with_geom = average_summary_speeds.merge_in_common_shape_geometry(
    avg_speeds, one_analysis_date
)

In [130]:
preview(avg_speeds_with_geom)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id
0,73105f2d1cabc8170ab066d96863c5d5,7,0.0
3,73105f2d1cabc8170ab066d96863c5d5,6,0.0
6,f5a749dd65924e025b1293c58f95f8d6,CC,1.0
8,f5a749dd65924e025b1293c58f95f8d6,CC,0.0
10,73105f2d1cabc8170ab066d96863c5d5,8,0.0
13,73105f2d1cabc8170ab066d96863c5d5,12X,0.0
16,73105f2d1cabc8170ab066d96863c5d5,11,0.0
19,73105f2d1cabc8170ab066d96863c5d5,30,0.0
22,73105f2d1cabc8170ab066d96863c5d5,8a7c42f9-51e4-4848-bf88-30c210f149ad,0.0
25,73105f2d1cabc8170ab066d96863c5d5,2,0.0


In [131]:
avg_speeds_with_geom.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 46 entries, 0 to 45
Data columns (total 14 columns):
 #   Column                         Non-Null Count  Dtype   
---  ------                         --------------  -----   
 0   schedule_gtfs_dataset_key      46 non-null     object  
 1   route_id                       46 non-null     object  
 2   direction_id                   46 non-null     float64 
 3   time_period                    46 non-null     object  
 4   meters_elapsed                 46 non-null     float64 
 5   sec_elapsed                    46 non-null     float64 
 6   speed_mph                      46 non-null     float64 
 7   name                           46 non-null     object  
 8   caltrans_district              46 non-null     object  
 9   organization_source_record_id  46 non-null     object  
 10  organization_name              46 non-null     object  
 11  base64_url                     46 non-null     object  
 12  route_name                    

##### Double check that my work matches what's in `gtfs_digest/merge_data`

In [132]:
df_avg_speeds_og = pd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/rt_segment_speeds/rollup_singleday/speeds_route_dir_2024-11-13.parquet"
)

In [133]:
df_avg_speeds_og = df_avg_speeds_og.loc[
    df_avg_speeds_og.schedule_gtfs_dataset_key.isin(schd_keys)
]

In [134]:
preview(df_avg_speeds_og)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id
1000,73105f2d1cabc8170ab066d96863c5d5,5,0.0


In [135]:
df_avg_speeds_og.columns

Index(['schedule_gtfs_dataset_key', 'route_id', 'direction_id', 'time_period',
       'meters_elapsed', 'sec_elapsed', 'speed_mph', 'name',
       'caltrans_district', 'organization_source_record_id',
       'organization_name', 'base64_url', 'route_name', 'geometry'],
      dtype='object')

In [136]:
avg_speeds_with_geom.columns

Index(['schedule_gtfs_dataset_key', 'route_id', 'direction_id', 'time_period',
       'meters_elapsed', 'sec_elapsed', 'speed_mph', 'name',
       'caltrans_district', 'organization_source_record_id',
       'organization_name', 'base64_url', 'route_name', 'geometry'],
      dtype='object')

In [None]:
df_avg_speeds = avg_speeds_with_geom.copy()

#### Dataframe in line 307 `df_rt_sched` in `gtfs_digest/merge_data`

In [None]:
df_rt_sched = merge_data.concatenate_rt_vs_schedule_by_route_direction(
    analysis_date_list
).astype({"direction_id": "float"})

In [None]:
df_rt_sched2 = df_rt_sched.loc[df_rt_sched.schedule_gtfs_dataset_key.isin(schd_keys)]

In [None]:
preview(df_rt_sched2)

In [None]:
df_rt_sched2.head(2)

##### `dt_rt_sched` is created using  [`rt_scheduled_v_ran/scripts/rt_v_scheduled_routes`](https://github.com/cal-itp/data-analyses/blob/main/rt_scheduled_v_ran/scripts/rt_v_scheduled_routes.py) 

In [None]:
[*GTFS_DATA_DICT["stop_segments"]["route_dir_cols"]]

In [None]:
dict_inputs = GTFS_DATA_DICT.rt_vs_schedule_tables

##### `route_metrics` in `rt_scheduled_v_ran/scripts/rt_v_scheduled_routes`

In [None]:
TRIP_EXPORT = dict_inputs.vp_trip_metrics

In [None]:
ROUTE_EXPORT = dict_inputs.vp_route_direction_metrics

##### <font color="red">DONE</font> **Everything is available in `trip_df`. Fill in Direction_id with 0.**
* Where is `trip_df` created again?

In [None]:
trip_df = pd.read_parquet(f"{RT_SCHED_GCS}{TRIP_EXPORT}_{one_analysis_date}.parquet")

In [None]:
trip_df2 = trip_df.loc[trip_df.schedule_gtfs_dataset_key.isin(schd_keys)]

In [None]:
trip_df2.direction_id = trip_df2.direction_id.fillna(0)

In [None]:
trip_df2.shape

In [None]:
preview(trip_df2)

In [None]:
trip_df2.info()

In [None]:
trip_df2.loc[trip_df2.time_of_day == "AM Peak"].drop(
    columns=["schedule_gtfs_dataset_key", "trip_instance_key"]
).sort_values(by=["route_id"]).drop_duplicates(
    subset=[
        "route_id",
        "direction_id",
    ]
).T

In [None]:
import sys

sys.path.append("../rt_scheduled_v_ran/scripts")
import rt_v_scheduled_routes

In [None]:
ROUTE_DIR_COLS = [*GTFS_DATA_DICT["stop_segments"]["route_dir_cols"]]

In [None]:
crosswalk_cols = [
    "schedule_gtfs_dataset_key",
    "name",
    "schedule_source_record_id",
    "base64_url",
    "organization_source_record_id",
    "organization_name",
    "caltrans_district",
]

##### Have to break out `metrics.concatenate_peak_offpeak_allday_averages` which is in  `rt_segment_speeds/segment_speed_utils/` because all of the routes are missing.

In [None]:
route_df = (
    metrics.concatenate_peak_offpeak_allday_averages(
        trip_df2,
        group_cols=["schedule_gtfs_dataset_key"] + ROUTE_DIR_COLS,
        metric_type="rt_vs_schedule",
    )
    .pipe(metrics.derive_rt_vs_schedule_metrics)
    .pipe(rt_v_scheduled_routes.average_rt_trip_times)
)

In [None]:
preview(route_df)

`calculate_avg_speeds` is from `rt_segement_speeds/segement_speed_utils/segment_calc.py` -> added `dropna=False`

<font color="red">DONE</font> `calculate_weighted_average_vp_schedule_metrics` is from `rt_segment_speeds/segment_speed_utils/metrics` -> added `dropna=False`

In [None]:
def calculate_weighted_average_vp_schedule_metrics(
    df: pd.DataFrame,
    group_cols: list,
) -> pd.DataFrame:

    sum_cols = [
        "minutes_atleast1_vp",
        "minutes_atleast2_vp",
        "rt_service_minutes",
        "scheduled_service_minutes",
        "total_vp",
        "vp_in_shape",
        "is_early",
        "is_ontime",
        "is_late",
    ]

    count_cols = ["trip_instance_key"]

    df2 = (
        df.groupby(group_cols, observed=True, group_keys=False, dropna=False)
        .agg({**{e: "sum" for e in sum_cols}, **{e: "count" for e in count_cols}})
        .reset_index()
        .rename(columns={"trip_instance_key": "n_vp_trips"})
    )

    return df2

<font color="red">DONE</font>`weighted_average_speeds_across_segments` is from `rt_segment_speeds/segment_speed_utils/metrics` -> added `dropna=False`

In [None]:
def weighted_average_speeds_across_segments(
    df: pd.DataFrame, group_cols: list
) -> pd.DataFrame:
    """
    We can use our segments and the deltas within a trip
    to calculate the trip-level average speed, or
    the route-direction-level average speed.
    But, we want a weighted average, using the raw deltas
    instead of mean(speed_mph), since segments can be varying lengths.
    """
    avg_speeds = (
        df.groupby(group_cols, observed=True, group_keys=False, dropna=False)
        .agg(
            {
                "meters_elapsed": "sum",
                "sec_elapsed": "sum",
            }
        )
        .reset_index()
    ).pipe(segment_calcs.speed_from_meters_elapsed_sec_elapsed)

    return avg_speeds

`concatenate_peak_offpeak_allday_averages` is from `rt_segment_speeds/segment_speed_utils/metrics`

In [None]:
def concatenate_peak_offpeak_allday_averages(
    df: pd.DataFrame, group_cols: list, metric_type: str
) -> pd.DataFrame:
    """
    Calculate average speeds for all day and
    peak_offpeak.
    Concatenate these, so that speeds are always calculated
    for the same 3 time periods.
    """
    if metric_type == "segment_speeds":
        avg_peak = calculate_avg_speeds(df, group_cols + ["peak_offpeak"])

        avg_allday = calculate_avg_speeds(df, group_cols).assign(peak_offpeak="all_day")

    elif metric_type == "summary_speeds":
        avg_peak = weighted_average_speeds_across_segments(
            df, group_cols + ["peak_offpeak"]
        )

        avg_allday = weighted_average_speeds_across_segments(df, group_cols).assign(
            peak_offpeak="all_day"
        )

    elif metric_type == "rt_vs_schedule":
        avg_peak = calculate_weighted_average_vp_schedule_metrics(
            df, group_cols + ["peak_offpeak"]
        )

        avg_allday = calculate_weighted_average_vp_schedule_metrics(
            df, group_cols
        ).assign(peak_offpeak="all_day")

    else:
        print(
            f"Valid metric types: ['segment_speeds', 'summary_speeds', 'rt_vs_schedule']"
        )

    # Concatenate so that every segment has 3 time periods: peak, offpeak, and all_day
    avg_metrics = pd.concat([avg_peak, avg_allday], axis=0, ignore_index=True).rename(
        columns={"peak_offpeak": "time_period"}
    )

    return avg_metrics

##### Going back to `rt_v_scheduled/scripts/rt_v_scheduled_routes.py`

In [None]:
route_metrics_df = concatenate_peak_offpeak_allday_averages(
    trip_df2,
    group_cols=["schedule_gtfs_dataset_key"] + ROUTE_DIR_COLS,
    metric_type="rt_vs_schedule",
)

In [None]:
preview(route_metrics_df)

In [None]:
route_metrics_df.shape

In [None]:
route_metrics_df.route_id.value_counts()

In [None]:
route_metrics_df.loc[route_metrics_df.route_id == "CC"]

In [None]:
route_metrics_df = route_metrics_df.pipe(metrics.derive_rt_vs_schedule_metrics)

In [None]:
route_metrics_df.columns

In [None]:
route_metrics_df2 = route_metrics_df.pipe(rt_v_scheduled_routes.average_rt_trip_times)

In [None]:
route_metrics_df2.columns

In [None]:
route_metrics_df3 = gtfs_schedule_wrangling.merge_operator_identifiers(
    route_metrics_df2, [one_analysis_date], columns=crosswalk_cols
)

In [None]:
route_metrics_df3.columns

In [None]:
route_metrics_df.loc[route_metrics_df.time_period == "peak"].drop(
    columns=[
        "schedule_gtfs_dataset_key",
        "schedule_source_record_id",
        "base64_url",
        "organization_name",
        "organization_source_record_id",
        "caltrans_district",
    ]
)

##### `df_rt_sched` equals `df_rt_sched`

In [None]:
df_rt_sched_og = merge_data.concatenate_rt_vs_schedule_by_route_direction(
    analysis_date_list
)

In [None]:
df_rt_sched = route_metrics_df3.copy()

In [None]:
df_rt_sched_og.columns

##### All these columns pop up around the step of `gtfs_schedule_wrangling.merge_operator_identifiers` because the extra columns match what is in `crosswalk_cols`?? 

In [None]:
df_rt_sched.columns

In [None]:
df_rt_sched = df_rt_sched.drop(
    columns=[
        "base64_url",
        "organization_source_record_id",
        "organization_name",
        "caltrans_district",
    ]
)

In [None]:
df_rt_sched_og.loc[
    df_rt_sched_og.schedule_gtfs_dataset_key.isin(schd_keys)
].route_id.nunique()

In [None]:
df_rt_sched.route_id.nunique()

In [None]:
df_rt_sched.shape

#### `gtfs_digest/merge_data/` line 316: `df_crosswalk`

In [None]:
df_crosswalk = merge_data.concatenate_crosswalk_organization(analysis_date_list)

#### `gtfs_digest/merge_data/merge_data_sources_by_route_direction`
* Have to make some tweaks since `df_avg_speeds2` is missing a lot of routes.

In [None]:
service_date_datetime = pd.to_datetime("2024-11-13T00:00:00.000000000")

##### Why are time_periods and peak_off_peak different between `df_sched` and `df_rt_sched`
* Something is wrong with `df_sched` because a lot of `time_period` values are missing~
##### Amanda, test: fill in `nans` in `time_period` with `peak_offpeak`
* This might solve why all the routes are missing in Nov/Dec too?

In [None]:
df_rt_sched[["route_id", "time_period", "direction_id"]].drop_duplicates().sort_values(
    by=["route_id", "direction_id"]
)

In [None]:
df_sched[["route_id", "time_period", "direction_id"]].drop_duplicates().sort_values(
    by=["route_id", "direction_id"]
)

In [None]:
df_sched["service_date"] = service_date_datetime

In [None]:
df_rt_sched["service_date"] = service_date_datetime

In [None]:
df_avg_speeds["service_date"] = service_date_datetime

In [None]:
# merge1 = merge_data.merge_data_sources_by_route_direction(
# route_dir_metrics2,
# df_rt_sched,
# df_avg_speeds2,
# df_crosswalk
# )

In [None]:
primary_typology = merge_data.set_primary_typology(route_dir_metrics2)

In [None]:
primary_typology.head(1)

In [None]:
route_time_cols = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
    "time_period",
]

In [None]:
df_schedule2 = pd.merge(df_sched, primary_typology, on=route_time_cols, how="left")

In [None]:
df_schedule2.info()

In [None]:
route_time_cols

In [None]:
df_schedule2.columns

In [None]:
df_rt_sched.columns

In [None]:
df_avg_speeds.columns

In [None]:
df = pd.merge(
    df_schedule2,
    df_rt_sched,
    on=route_time_cols + ["service_date"],
    how="outer",
    indicator="sched_rt_category",
).merge(
    df_avg_speeds,
    on=route_time_cols + ["service_date"],
    how="outer",
)

##### Check that all the routes are here.

In [None]:
df.route_id.unique()

In [None]:
df.route_id.value_counts()

In [None]:
df.columns

In [None]:
df2 = df.assign(
    sched_rt_category=df.sched_rt_category.map(
        gtfs_schedule_wrangling.sched_rt_category_dict
    )
)

In [None]:
df2.columns

In [None]:
df3 = df2.pipe(
    merge_data.merge_in_standardized_route_names,
)

In [None]:
df3.columns

###### Extra columns are popping up?? Detailed below. 

In [None]:
drop_cols = [
    "schedule_source_record_id",
    "base64_url",
    "organization_source_record_id",
    "organization_name",
    "caltrans_district",
]

In [None]:
df4 = pd.merge(
    df3.drop(columns=drop_cols),
    df_crosswalk,
    on=["schedule_gtfs_dataset_key", "name", "service_date"],
    how="left",
)

In [None]:
df4.columns

In [None]:
df4.route_id.value_counts()

##### Lots of repeated columns...why!!

In [None]:
df4.info()

In [None]:
df5 = df4.pipe(
    # Find the most common cardinal direction
    gtfs_schedule_wrangling.top_cardinal_direction
)

#### Observations
* There are no typologies for these previously missing routes.
* `Route_primary_direction` and `direction_id` is empty for all of City of Santa Maria 
* `route_ids` are repeated...somehow messed up during merges.
* I have an extra column for `peak_offpeak ` and `time_period`

In [None]:
df5.info()

In [None]:
df[["time_period", "route_id"]].drop_duplicates().sort_values(by=["route_id"])

In [None]:
df5.columns

In [None]:
preview_cols = [
    "organization_name",
    "route_id",
    "sched_rt_category",
    "direction_id",
    "route_primary_direction",
    "avg_scheduled_service_minutes",
    "avg_stop_miles",
    "n_trips",
    "time_period",
    "frequency",
    "typology",
    "minutes_atleast1_vp",
    "minutes_atleast2_vp",
    "total_rt_service_minutes",
    "total_scheduled_service_minutes",
    "total_vp",
    "vp_in_shape",
    "is_early",
    "is_ontime",
    "is_late",
    "n_vp_trips",
    "vp_per_minute",
    "pct_in_shape",
    "pct_rt_journey_atleast1_vp",
    "pct_rt_journey_atleast2_vp",
    "pct_sched_journey_atleast1_vp",
    "pct_sched_journey_atleast2_vp",
    "rt_sched_journey_ratio",
    "avg_rt_service_minutes",
    "speed_mph",
]

In [None]:
df5.groupby(
    [
        "route_id",
        "sched_rt_category",
    ]
).agg({"organization_name": "count"})

In [None]:
df5.loc[df5.route_id == "1B"][preview_cols].sort_values(
    by=["organization_name", "route_id"]
)

In [None]:
df5.loc[df5.time_period == "peak"][preview_cols].sort_values(
    by=["organization_name", "route_id"]
)

In [None]:
stop

### Fix `ROUTE_TYPOLOGIES` in `gtfs_funnel/route_typologies.py`

In [None]:
ROUTE_TYPOLOGIES

In [None]:
GTFS_DATA_DICT.schedule_tables.route_typologies

In [None]:
GTFS_DATA_DICT.schedule_tables.route_typologies

In [None]:
route_typologies2 = route_typologies.loc[
    route_typologies.schedule_gtfs_dataset_key.isin(schd_keys)
]

In [None]:
route_typologies2

In [None]:
route_dir_cols = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
    "common_shape_id",
    "route_name",
    "route_meters",
]

##### Amanda: in `rt_segment_speeds/segment_speed_utils/gtfs_schedule_wrangling`, I filled `nan` rows in `direction_id`. Then I commented out parts of `gtfs_funnel/route_typologies`

In [None]:
common_shape = gtfs_schedule_wrangling.most_common_shape_by_route_direction(
    one_analysis_date
)

In [None]:
common_shape2 = common_shape.loc[common_shape.schedule_gtfs_dataset_key.isin(schd_keys)]

In [None]:
nov_typology_ah_test_df = pd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/gtfs_schedule/nacto_typologies/route_typologies_AH_TESTING_2024-11-13.parquet"
)

In [None]:
nov_typology_ah_test_df.loc[
    nov_typology_ah_test_df.schedule_gtfs_dataset_key.isin(schd_keys)
]

### Fix Map: `gtfs_digest/merge_operator_data`

In [None]:
OPERATOR_FILE = GTFS_DATA_DICT.digest_tables.operator_profiles
OPERATOR_ROUTE = GTFS_DATA_DICT.digest_tables.operator_routes_map

In [None]:
operator_route_gdf = gpd.read_parquet(
    f"{RT_SCHED_GCS}{OPERATOR_ROUTE}.parquet",
)

In [None]:
operator_route_gdf.columns

In [None]:
operator_route_gdf2.columns

In [None]:
len(operator_route_gdf2)

In [None]:
operator_route_gdf2.is_rail.value_counts()

In [None]:
operator_route_gdf2.organization_name.value_counts()

In [None]:
operator_route_gdf2.schedule_gtfs_dataset_key.unique()

#### Why does City of Santa Maria have multiple schedule_gtfs_dataset_keys?

In [None]:
operator_route_gdf2.groupby(["organization_name", "schedule_gtfs_dataset_key"]).agg(
    {"route_short_name": "nunique"}
)

In [None]:
operator_route_gdf2.drop(columns=["service_date"]).explore("organization_name")

In [None]:
# operator_route_gdf2.drop(columns = ["service_date"]).explore("shape_array_key")

#### Starting from here [`gtfs_funnel/operator_scheduled_stats`](https://github.com/cal-itp/data-analyses/blob/4dc340343a60b45ad94217c3efd91f807b03ebc2/gtfs_funnel/operator_scheduled_stats.py#L148)

In [None]:
analysis_date = "2024-11-13"

In [None]:
schd_keys = list(operator_route_gdf2.schedule_gtfs_dataset_key.unique())

#### Longest shape does have all the routes...

In [None]:
longest_shape_gdf = operator_scheduled_stats.longest_shape_by_route(analysis_date)

In [None]:
longest_shape_gdf2 = longest_shape_gdf.loc[
    longest_shape_gdf.schedule_gtfs_dataset_key.isin(schd_keys)
]

In [None]:
longest_shape_gdf2.columns

In [None]:
longest_shape_gdf2.info()

In [None]:
longest_shape_gdf2.route_id.value_counts()

In [None]:
# longest_shape_gdf2.explore("schedule_gtfs_dataset_key")

In [None]:
longest_shape_gdf2.groupby(["schedule_gtfs_dataset_key", "route_id"]).agg(
    {"route_length_miles": "max"}
)

#### Somewhere along the way the routes are cut...maybe b/c of `direction_id`

In [None]:
OPERATOR_EXPORT = GTFS_DATA_DICT.schedule_tables.operator_scheduled_stats

In [None]:
SCHED_GCS

In [None]:
GTFS_DATA_DICT.schedule_tables.operator_routes

In [None]:
dec_url = "gs://calitp-analytics-data/data-analyses/gtfs_schedule/operator_profiles/operator_routes_2024-12-11.parquet"

In [None]:
dec_df = gpd.read_parquet(dec_url)

In [None]:
dec_df.organization_name.value_counts().head()

In [None]:
dec_df.loc[
    dec_df.organization_name == "Alameda-Contra Costa Transit District"
].head().drop(columns=["geometry"]).T

In [None]:
dec_df2 = dec_df.loc[dec_df.schedule_gtfs_dataset_key.isin(schd_keys)]

In [None]:
dec_df2.shape

In [None]:
type(dec_df2)

In [None]:
dec_df2.drop(columns=["geometry"]).T

In [None]:
# dec_df2.explore()

#### Find where in `gtfs_funnel` all the routes disappear

In [None]:
group_cols = ["schedule_gtfs_dataset_key"]

In [None]:
longest_shape_gdf2.info()

#### something is going on in `operator_scheduled_stats.schedule_stats_by_operator`

In [None]:
ROUTE_TYPOLOGY = GTFS_DATA_DICT.schedule_tables.route_typologies

In [None]:
route_typology = pd.read_parquet(f"{SCHED_GCS}{ROUTE_TYPOLOGY}_{analysis_date}.parquet")

In [None]:
from route_typologies import route_typologies

In [None]:
route_typology_grouped = (
    route_typology.groupby(["schedule_gtfs_dataset_key", "route_id"])
    .agg({**{f"is_{c}": "sum" for c in route_typologies}})
    .reset_index()
)

In [None]:
route_typology_grouped2 = route_typology_grouped.loc[
    route_typology_grouped.schedule_gtfs_dataset_key.isin(schd_keys)
]

#### Routes are missing for Santa Maria and Capitol Corridor in `ROUTE_TYPOLOGY`

In [None]:
route_typology_grouped2.T

In [None]:
route_gdf = longest_shape_gdf2.merge(
    route_typology_grouped2, on=["schedule_gtfs_dataset_key", "route_id"], how="outer"
)

In [None]:
route_gdf.shape

In [None]:
route_gdf.drop(columns=["geometry"])

In [None]:
# route_gdf2.explore("schedule_gtfs_dataset_key")

#### Change merge from `inner` to `left`

In [None]:
f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_routes_map}.parquet"

In [None]:
SCHED_GCS

In [None]:
GTFS_DATA_DICT.schedule_tables.operator_routes

In [None]:
my_test_url = "gs://calitp-analytics-data/data-analyses/gtfs_schedule/operator_profiles/operator_routes_2024-12-11_AH.parquet"

In [None]:
f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_profiles}.parquet"

In [None]:
test_gdf = gpd.read_parquet(my_test_url)

In [None]:
test_gdf2 = test_gdf.loc[test_gdf.schedule_gtfs_dataset_key.isin(schd_keys)]

In [None]:
test_gdf2.explore("route_id")

#### Test with all the dates.

In [None]:
GTFS_DATA_DICT.schedule_tables.operator_routes

In [None]:
RT_SCHED_GCS

In [None]:
f"{OPERATOR_ROUTE}_AH_test"

In [None]:
f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_routes_map}.parquet"

In [None]:
test_df = gpd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/digest/operator_routes_AH_test.parquet"
)

In [None]:
test_df.columns

In [None]:
op_routes_gdf = test_df.loc[test_df.organization_name.isin(org_name_lists)]

In [None]:
# Find the most recent geography for each route.
op_routes_gdf = op_routes_gdf.sort_values(by=["service_date"], ascending=False)

# Keep only the most recent row.
op_routes_gdf = op_routes_gdf.drop_duplicates(
    subset=["route_long_name", "route_short_name", "route_combined_name"]
)

# Drop service_dates
op_routes_gdf = op_routes_gdf.drop(columns=["service_date"])

In [None]:
op_routes_gdf.organization_name.value_counts()

In [None]:
op_routes_gdf.loc[op_routes_gdf.organization_name == "City of Santa Maria"].explore(
    "route_long_name"
)