# Research Request - GTFS Digest: Add Rail and Ferry Operators. #1386
# Focus on Rail here. 

* Why is Amtrak "schedule_only"? I thought it had a real time component?
    * 3/3: No — reports.calitp.org indicates it has no real-time data.
* Understand why Southern California Regional Rail Authority & San Bernardino County Transportation Authority are still "vp_only" after all the other operators have been fixed.
* Visualize rail routes differently for operators with both? 

In [1]:
import _section1_utils as section1
import _section2_utils as section2
import geopandas as gpd
import merge_data
import merge_operator_data
import numpy as np
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers
from segment_speed_utils.project_vars import COMPILED_CACHED_VIEWS, PROJECT_CRS
from shared_utils import catalog_utils, portfolio_utils, rt_dates
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS

In [2]:
# Notebook display settings: show up to 100 columns, format floats with two
# decimals, and never truncate rows or cell contents.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
# NOTE(review): `analysis_date_list` is not referenced by any later cell shown
# in this notebook; only the single `analysis_date` is used — confirm whether
# this list is still needed.
analysis_date_list = [rt_dates.DATES["feb2025"]]

In [4]:
# Single service date (the "feb2025" entry of rt_dates.DATES) that is passed to
# the schedule helper calls in the cells below.
analysis_date = rt_dates.DATES["feb2025"]

In [22]:
schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"

In [23]:
# Load the digest `route_schedule_vp` table from the path built above.
schd_vp_df = pd.read_parquet(schd_vp_url)

In [24]:
# Keep only the rows for the notebook's analysis date (Feb 2025).
# The comparison uses `analysis_date` instead of a hard-coded date literal so
# this filter cannot drift from the date used by every other cell.
schd_vp_df2 = schd_vp_df.loc[schd_vp_df.service_date == analysis_date]

In [25]:
# Reduce to one row per organization, keeping its identifying columns and
# schedule/RT category, ordered alphabetically by organization.
# NOTE(review): de-duplicating on `organization_name` alone keeps the first row
# encountered for each organization, so the `sched_rt_category` reported is
# whichever one that row carries — confirm this is intended.
org_category_cols = [
    "schedule_gtfs_dataset_key",
    "organization_name",
    "service_date",
    "sched_rt_category",
    "caltrans_district",
]
schd_vp_df3 = (
    schd_vp_df2.loc[:, org_category_cols]
    .drop_duplicates(subset="organization_name")
    .sort_values("organization_name")
)

In [26]:
schd_vp_df3.sched_rt_category.value_counts()

schedule_and_vp    104
schedule_only       86
vp_only              2
Name: sched_rt_category, dtype: int64

## Trying to understand why Southern California Regional Rail Authority & San Bernardino County Transportation Authority are still "vp_only" after all the other operators have been fixed.

In [13]:
# Names of the organizations whose category on the analysis date is "vp_only".
is_vp_only = schd_vp_df3.sched_rt_category == "vp_only"
vp_only_ops = list(schd_vp_df3.loc[is_vp_only, "organization_name"].unique())

In [14]:
# All rows for the vp_only organizations. This reads from the unfiltered
# `schd_vp_df` (every service date), not the single-date subset.
vp_only_ops_df = schd_vp_df.loc[schd_vp_df.organization_name.isin(vp_only_ops)]

In [15]:
# Distinct schedule feed keys that appear for those organizations on any date;
# later cells use this list to filter the other tables.
vp_only_ops_sched_keys = list(vp_only_ops_df.schedule_gtfs_dataset_key.unique())

In [34]:
# Latest service date for each (district, feed key, organization, category).
# NOTE(review): the all-NaT rows in the recorded output look like unobserved
# combinations of the grouping keys — presumably these columns are categorical,
# in which case passing `observed=True` would drop them; confirm.
vp_only_ops_df.groupby(
    [
        "caltrans_district",
        "schedule_gtfs_dataset_key",
        "organization_name",
        "sched_rt_category",
    ]
).agg(service_date=("service_date", "max"))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,service_date
caltrans_district,schedule_gtfs_dataset_key,organization_name,sched_rt_category,Unnamed: 4_level_1
07 - Los Angeles,759ad28de7d4bb8b2bf9bb7d83655100,San Bernardino County Transportation Authority,schedule_only,NaT
07 - Los Angeles,759ad28de7d4bb8b2bf9bb7d83655100,San Bernardino County Transportation Authority,vp_only,NaT
07 - Los Angeles,759ad28de7d4bb8b2bf9bb7d83655100,San Bernardino County Transportation Authority,schedule_and_vp,NaT
07 - Los Angeles,759ad28de7d4bb8b2bf9bb7d83655100,Southern California Regional Rail Authority,schedule_only,2023-12-13
07 - Los Angeles,759ad28de7d4bb8b2bf9bb7d83655100,Southern California Regional Rail Authority,vp_only,NaT
07 - Los Angeles,759ad28de7d4bb8b2bf9bb7d83655100,Southern California Regional Rail Authority,schedule_and_vp,2023-12-13
07 - Los Angeles,c4092405159366c705b62df938293a4e,San Bernardino County Transportation Authority,schedule_only,NaT
07 - Los Angeles,c4092405159366c705b62df938293a4e,San Bernardino County Transportation Authority,vp_only,NaT
07 - Los Angeles,c4092405159366c705b62df938293a4e,San Bernardino County Transportation Authority,schedule_and_vp,NaT
07 - Los Angeles,c4092405159366c705b62df938293a4e,Southern California Regional Rail Authority,schedule_only,2024-06-12


### Southern California Regional Rail Authority & San Bernardino County Transportation Authority are the same operator?

In [90]:
# Number of distinct routes for each (district, feed key, organization).
vp_only_ops_df.groupby(
    ["caltrans_district", "schedule_gtfs_dataset_key", "organization_name"]
).agg(route_id=("route_id", "nunique"))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,route_id
caltrans_district,schedule_gtfs_dataset_key,organization_name,Unnamed: 3_level_1
07 - Los Angeles,759ad28de7d4bb8b2bf9bb7d83655100,Southern California Regional Rail Authority,7
07 - Los Angeles,c4092405159366c705b62df938293a4e,Southern California Regional Rail Authority,7
07 - Los Angeles / Ventura,c4092405159366c705b62df938293a4e,Southern California Regional Rail Authority,7
08 - San Bernardino,c4092405159366c705b62df938293a4e,San Bernardino County Transportation Authority,7


In [105]:
# Distinct (district, feed key, organization, route) combinations, ordered by
# route so the same route under different organizations sits side by side.
(
    vp_only_ops_df[
        [
            "caltrans_district",
            "schedule_gtfs_dataset_key",
            "organization_name",
            "route_id",
        ]
    ]
    .drop_duplicates()
    .sort_values(by=["route_id"])
)

Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,organization_name,route_id
157635,07 - Los Angeles,759ad28de7d4bb8b2bf9bb7d83655100,Southern California Regional Rail Authority,91 Line
389109,07 - Los Angeles / Ventura,c4092405159366c705b62df938293a4e,Southern California Regional Rail Authority,91 Line
293913,08 - San Bernardino,c4092405159366c705b62df938293a4e,San Bernardino County Transportation Authority,91 Line
293914,07 - Los Angeles,c4092405159366c705b62df938293a4e,Southern California Regional Rail Authority,91 Line
157670,07 - Los Angeles,759ad28de7d4bb8b2bf9bb7d83655100,Southern California Regional Rail Authority,Antelope Valley Line
389155,07 - Los Angeles / Ventura,c4092405159366c705b62df938293a4e,Southern California Regional Rail Authority,Antelope Valley Line
293973,08 - San Bernardino,c4092405159366c705b62df938293a4e,San Bernardino County Transportation Authority,Antelope Valley Line
293974,07 - Los Angeles,c4092405159366c705b62df938293a4e,Southern California Regional Rail Authority,Antelope Valley Line
157712,07 - Los Angeles,759ad28de7d4bb8b2bf9bb7d83655100,Southern California Regional Rail Authority,Inland Emp.-Orange Co. Line
389251,07 - Los Angeles / Ventura,c4092405159366c705b62df938293a4e,Southern California Regional Rail Authority,Inland Emp.-Orange Co. Line


## Go back to  `schedule_stats_by_route_direction`

In [17]:
import sys

# Put the sibling gtfs_funnel directory on the import path so the module below
# can be imported — presumably that is where it lives; confirm.
# NOTE(review): imports are conventionally gathered in the notebook's first
# cell so a fresh "Restart & Run All" shows every dependency up front.
sys.path.append("../gtfs_funnel/")
import schedule_stats_by_route_direction

In [18]:
# Keys identifying one route-direction within a schedule feed; used as the
# groupby / merge columns in the cells below.
route_group_merge_cols = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
]

### Only SBR is back in `trip_metrics` (line 203) after my small tweak, which I only ran for 2025 dates.

In [19]:
# Rebuild the trip-level scheduled metrics for `analysis_date` by calling the
# imported `schedule_stats_by_route_direction` function directly, so the
# intermediate result can be inspected.
trip_metrics = schedule_stats_by_route_direction.assemble_scheduled_trip_metrics(
    analysis_date, GTFS_DATA_DICT
)

In [20]:
# Restrict the trip metrics to the schedule feed keys of the vp_only operators.
in_vp_only_feeds = trip_metrics.schedule_gtfs_dataset_key.isin(vp_only_ops_sched_keys)
trip_metrics = trip_metrics.loc[in_vp_only_feeds]

In [21]:
len(trip_metrics)

202

#### Only the key for `San Bernardino County Transportation Authority` is kept. Where's Metrolink's key?

In [35]:
trip_metrics.schedule_gtfs_dataset_key.unique()

array(['c4092405159366c705b62df938293a4e'], dtype=object)

### Operators are now missing in the function `schedule_metrics_by_route_direction`

In [28]:
# Run the route-direction aggregation on the filtered trips.
# NOTE: the recorded length of this result is 0; the cells that follow appear
# to repeat the function's steps one at a time to find where the rows are lost.
route_dir_metrics = (
    schedule_stats_by_route_direction.schedule_metrics_by_route_direction(
        trip_metrics, analysis_date, route_group_merge_cols
    )
)

In [29]:
len(route_dir_metrics)

0

In [30]:
service_freq_df = gtfs_schedule_wrangling.aggregate_time_of_day_to_peak_offpeak(
    trip_metrics, route_group_merge_cols, long_or_wide="long"
)

In [31]:
len(service_freq_df)

41

In [32]:
service_freq_df.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,n_trips,time_period,peak_offpeak,frequency
0,c4092405159366c705b62df938293a4e,91 Line,0.0,7,all_day,,0.29
1,c4092405159366c705b62df938293a4e,91 Line,1.0,7,all_day,,0.29


In [91]:
service_freq_df.schedule_gtfs_dataset_key.unique()

array(['c4092405159366c705b62df938293a4e'], dtype=object)

In [33]:
service_freq_df.route_id.nunique()

7

In [92]:
# Average the trip-level metrics up to one row per route-direction.
# `avg_stop_meters` is the mean of each trip's *median* stop spacing (the median
# is the single boiled-down metric at the trip level) — open question carried
# over from the original cell: does taking the mean of medians make sense?
metrics_df = (
    trip_metrics.groupby(
        route_group_merge_cols, observed=True, group_keys=False, dropna=False
    )
    .agg(
        avg_stop_meters=("median_stop_meters", "mean"),
        avg_scheduled_service_minutes=("scheduled_service_minutes", "mean"),
    )
    .reset_index()
)

In [93]:
len(metrics_df)

14

In [95]:
from shared_utils.rt_utils import METERS_PER_MILE

In [96]:
# Convert the average stop spacing from meters to miles (rounded to 2 decimals)
# and replace the meters column with it.
# NOTE(review): this cell overwrites `metrics_df` and drops `avg_stop_meters`,
# so running it a second time fails — re-run the aggregation cell first.
stop_miles = (metrics_df["avg_stop_meters"] / METERS_PER_MILE).round(2)
metrics_df = metrics_df.drop(columns=["avg_stop_meters"]).assign(
    avg_stop_miles=stop_miles
)

In [97]:
# Round the two reported metrics to 2 decimals. `avg_stop_miles` was already
# rounded when it was created, so in practice only the service minutes change.
round_me = ["avg_stop_miles", "avg_scheduled_service_minutes"]
metrics_df[round_me] = metrics_df[round_me].round(2)

In [98]:
common_shape = gtfs_schedule_wrangling.most_common_shape_by_route_direction(
        analysis_date
    )

In [103]:
common_shape = common_shape.loc[
    common_shape.schedule_gtfs_dataset_key.isin(vp_only_ops_sched_keys)
]

In [104]:
common_shape.head(1)

Unnamed: 0,geometry,schedule_gtfs_dataset_key,route_id,direction_id,common_shape_id,route_name


In [100]:
# Join the common shapes to the route-direction metrics, then to the service
# frequencies, on the route-direction keys.
# NOTE(review): both joins are inner, so any route-direction missing from
# `common_shape` is dropped. The recorded `common_shape.head(1)` output shows no
# rows for these feed keys, which would explain the empty result — confirm.
df = pd.merge(common_shape, metrics_df, on=route_group_merge_cols, how="inner").merge(
        service_freq_df, on=route_group_merge_cols, how="inner"
    )

In [101]:
len(df)

0

### Only the key for `San Bernardino County Transportation Authority` is kept. Where's Metrolink's key? Check `STOP_TIMES_FILE`.

In [36]:
STOP_TIMES_FILE = GTFS_DATA_DICT.rt_vs_schedule_tables.stop_times_direction

In [37]:
STOP_TIMES_FILE

'stop_times_direction'

In [38]:
GTFS_DATA_DICT.rt_vs_schedule_tables.stop_times_direction

'stop_times_direction'

In [39]:
df = gpd.read_parquet(f"{RT_SCHED_GCS}{STOP_TIMES_FILE}_{analysis_date}.parquet")

In [40]:
analysis_date

'2025-02-12'

In [44]:
df2 = df.loc[df.schedule_gtfs_dataset_key.isin(vp_only_ops_sched_keys)]

In [46]:
len(df2), df2.schedule_gtfs_dataset_key.nunique()

(2053, 1)

In [48]:
 df2.schedule_gtfs_dataset_key.unique()

array(['c4092405159366c705b62df938293a4e'], dtype=object)

### Go back to `gtfs_funnel/stop_times_with_direction.py`

#### Per Tiffany's advice, check `trips`. Metrolink is missing!

In [49]:
# Columns requested from the scheduled trips table.
# NOTE(review): "gtfs_dataset_key" is requested here, but the recorded output
# shows the column as "schedule_gtfs_dataset_key" — presumably the import
# helper renames it; confirm.
trip_scheduled_col = [
    # route / trip identifiers
    "route_id", "trip_instance_key", "gtfs_dataset_key", "shape_array_key",
    "direction_id",
    # descriptive route fields
    "route_long_name", "route_short_name", "route_desc",
    # feed identifiers
    "name", "feed_key",
]

In [50]:
trips_df = helpers.import_scheduled_trips(
    analysis_date, columns=trip_scheduled_col, get_pandas=True
)

In [51]:
trips_df = trips_df.loc[trips_df.schedule_gtfs_dataset_key.isin(vp_only_ops_sched_keys)]

In [52]:
len(trips_df)

202

In [53]:
trips_df.head(1)

Unnamed: 0,route_id,trip_instance_key,schedule_gtfs_dataset_key,shape_array_key,direction_id,route_long_name,route_short_name,route_desc,name,feed_key
114833,San Bernardino Line,e0e22a031655b8c4eb6eb418166b567c,c4092405159366c705b62df938293a4e,,0.0,Metrolink San Bernardino Line,,,Metrolink Schedule,8fc9cfe86b4e9e8c7cf508b2486605f1


In [54]:
feed_keys_list = list(trips_df.feed_key.unique())

In [55]:
feed_keys_list

['8fc9cfe86b4e9e8c7cf508b2486605f1']

In [56]:
# Load the scheduled trips again with a different column subset than the one
# used for `trips_df` above; this selection includes `trip_id`, which the
# stop_times merge further down joins on.
trips = helpers.import_scheduled_trips(
    analysis_date,
    columns=[
        "gtfs_dataset_key",
        "feed_key",
        "trip_id",
        "trip_instance_key",
        "shape_array_key",
    ],
    get_pandas=True,
)

In [57]:
trips = trips.loc[trips.feed_key.isin(feed_keys_list)]

In [58]:
len(trips)

202

In [60]:
trips.feed_key.nunique()

1

In [59]:
# NOTE(review): `stop` is not defined in any visible cell, so this cell raises
# NameError (see the recorded output) — presumably a deliberate way to halt
# "Run All" at this point; confirm, and remove before sharing the notebook.
stop

NameError: name 'stop' is not defined

#### Stops - Metrolink is also gone.

In [61]:
stops = helpers.import_scheduled_stops(
    analysis_date,
    columns=["feed_key", "stop_id", "stop_name", "geometry"],
    crs=PROJECT_CRS,
    get_pandas=True,
)

In [62]:
stops = stops.loc[stops.feed_key.isin(feed_keys_list)]

In [63]:
len(stops)

66

In [64]:
stops.feed_key.nunique()

1

In [88]:
stops.feed_key.unique()

array(['8fc9cfe86b4e9e8c7cf508b2486605f1'], dtype=object)

#### Stop_times - everything is here. 

In [65]:
stop_times = helpers.import_scheduled_stop_times(
    analysis_date,
    columns=["feed_key", "trip_id", "stop_id", "stop_sequence"],
    get_pandas=True,
)

In [66]:
stop_times = stop_times.loc[stop_times.feed_key.isin(feed_keys_list)]

In [67]:
len(stop_times)

2053

In [68]:
import stop_times_with_direction

In [69]:
# Apply the `keep_first_trip` step from stop_times_with_direction to the
# filtered stop_times for the analysis date.
stop_times2 = stop_times_with_direction.keep_first_trip(stop_times, analysis_date)

In [70]:
len(stop_times2)

2053

In [71]:
stop_times2.feed_key.unique()

array(['8fc9cfe86b4e9e8c7cf508b2486605f1'], dtype=object)

#### Line 47 merge

In [73]:
# Attach trip attributes to each stop_time row (inner join on feed + trip).
st_with_trip = stop_times.merge(
    trips,
    on=["feed_key", "trip_id"],
    how="inner",
)

In [74]:
len(st_with_trip)

2053

In [75]:
# Attach stop attributes (inner join on feed + stop), then drop `trip_id`.
st_with_stop = st_with_trip.merge(
    stops,
    on=["feed_key", "stop_id"],
    how="inner",
)
st_with_stop = st_with_stop.drop(columns=["trip_id"])

In [76]:
len(st_with_stop)

2053

In [77]:
st_with_stop = gpd.GeoDataFrame(st_with_stop, geometry="geometry", crs=PROJECT_CRS)

In [84]:
st_with_stop.feed_key.nunique()

1

In [87]:
st_with_stop.feed_key.unique()

array(['8fc9cfe86b4e9e8c7cf508b2486605f1'], dtype=object)

### `stop_times_with_direction/prep_scheduled_stop_times`: everything is in order here.

In [78]:
scheduled_stop_times = stop_times_with_direction.prep_scheduled_stop_times(
    analysis_date
)

In [79]:
scheduled_stop_times = scheduled_stop_times.loc[
    scheduled_stop_times.feed_key.isin(feed_keys_list)
]

In [80]:
len(scheduled_stop_times)

2053

In [86]:
scheduled_stop_times.feed_key.unique()

array(['8fc9cfe86b4e9e8c7cf508b2486605f1'], dtype=object)

### Continue with Line 251 in `stop_times_with_direction`

In [81]:
# Grouping keys passed to find_prior_subseq_stop_info: one list identifying a
# trip, and one identifying a stop within a trip.
trip_cols = ["trip_instance_key"]
trip_stop_cols = [*trip_cols, "stop_sequence"]

In [82]:
# Run the prior/subsequent stop step on the filtered scheduled stop_times, then
# order rows by trip and stop sequence with a fresh 0..n-1 index.
prior_subseq_info = stop_times_with_direction.find_prior_subseq_stop_info(
    scheduled_stop_times,
    analysis_date,
    trip_cols=trip_cols,
    trip_stop_cols=trip_stop_cols,
)
df = prior_subseq_info.sort_values(trip_stop_cols).reset_index(drop=True)

In [83]:
len(df)

2053

In [85]:
df.head(1)

Unnamed: 0,feed_key,stop_id,stop_sequence,schedule_gtfs_dataset_key,trip_instance_key,shape_array_key,stop_name,geometry,stop_meters,prior_stop_sequence,subseq_stop_sequence,stop_primary_direction,stop_pair,stop_pair_name
0,8fc9cfe86b4e9e8c7cf508b2486605f1,185,1,c4092405159366c705b62df938293a4e,01dc5389671b18ec6c00c52306792d6b,,San Bernardino - Downtown Metrolink Station,POINT (249458.295 -431571.641),,,2.0,Unknown,185__124,San Bernardino - Downtown Metrolink Station__San Bernardino Depot Metrolink Station


In [89]:
df.schedule_gtfs_dataset_key.unique()

array(['c4092405159366c705b62df938293a4e'], dtype=object)