## Adding Cardinal Direction into Pipeline
* This notebook prototypes changes to [`schedule_stats_by_route_direction.py`](https://github.com/cal-itp/data-analyses/blob/ah_gtfs_portfolio/gtfs_funnel/schedule_stats_by_route_direction.py#L23).

In [1]:
import _section2_utils as section2
import geopandas as gpd
import numpy as np
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)
from shared_utils import catalog_utils, rt_dates, rt_utils

In [2]:
# Notebook display settings: show up to 100 columns, never truncate rows or
# cell text, and render floats with two decimals.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [3]:
# GCS folder that holds the RT-vs-schedule tables read in this notebook.
RT_SCHED_GCS

'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/'

In [4]:
# Catalog entry used as the file-name stem for the stop_times table read below.
GTFS_DATA_DICT.rt_vs_schedule_tables.stop_times_direction

'stop_times_direction'

In [5]:
# Service date (YYYY-MM-DD) used in every input file name in this notebook.
analysis_date = "2024-04-17"

### `assemble_scheduled_trip_metrics`
* `df` in this function is the same data that `section2_utils.load_scheduled_stop_times` loads.
* Why is `df` read in as a GeoDataFrame (`gpd.read_parquet`)? Could it be read as a regular pandas DataFrame instead?

In [6]:
# File-name stem of the scheduled stop_times table (includes stop_primary_direction).
STOP_TIMES_FILE = GTFS_DATA_DICT.rt_vs_schedule_tables.stop_times_direction

In [7]:
# Load scheduled stop_times for the analysis date. The file is read with
# geopandas, so the result is a GeoDataFrame.
stop_times_gdf = gpd.read_parquet(
    f"{RT_SCHED_GCS}{STOP_TIMES_FILE}_{analysis_date}.parquet"
)

In [8]:
# Confirm the container type returned by gpd.read_parquet.
type(stop_times_gdf)

geopandas.geodataframe.GeoDataFrame

In [9]:
# Trip-level columns to request from the scheduled trips table.
# NOTE: `gtfs_dataset_key` comes back named `schedule_gtfs_dataset_key`
# (see the `trips_to_route_df.columns` output further down).
scheduled_col = [
    "route_id",
    "trip_instance_key",
    "gtfs_dataset_key",
    "shape_array_key",
    "direction_id",
    "route_long_name",
    "route_short_name",
    "route_desc",
    "name",
]

In [10]:
# Minimal trip columns needed to attach route_id / direction_id to each trip.
trips_to_route_cols_subset = ["trip_instance_key", "route_id", "direction_id"]

In [11]:
# Scheduled trips for the analysis date, limited to `scheduled_col` and
# returned as a pandas DataFrame.
# TODO: extend `scheduled_col` if more trip attributes are needed.
trips_to_route_df = helpers.import_scheduled_trips(
    analysis_date, columns=scheduled_col, get_pandas=True
)

In [12]:
# Per-trip time-of-day bucket and scheduled service minutes.
time_of_day = gtfs_schedule_wrangling.get_trip_time_buckets(analysis_date)[
    ["trip_instance_key", "time_of_day", "scheduled_service_minutes"]
]

In [13]:
# One group per (feed, trip) so stop spacing can be summarized per trip.
trip_cols = ["schedule_gtfs_dataset_key", "trip_instance_key"]

grouped_df = stop_times_gdf.groupby(trip_cols, observed=True, group_keys=False)

In [14]:
# Number of (feed, trip) groups.
len(grouped_df)

103451

In [15]:
# Median stop spacing per trip, then attach the time-of-day bucket (left join)
# and route_id / direction_id (inner join) by trip_instance_key.
df2 = (
    grouped_df.agg({"stop_meters": "median"})
    .reset_index()
    .rename(columns={"stop_meters": "median_stop_meters"})
    .merge(time_of_day, on="trip_instance_key", how="left")
    .merge(
        trips_to_route_df[trips_to_route_cols_subset],
        on="trip_instance_key",
        how="inner",
    )
)

In [16]:
# Preview the per-trip metrics.
df2.head()

Unnamed: 0,schedule_gtfs_dataset_key,trip_instance_key,median_stop_meters,time_of_day,scheduled_service_minutes,route_id,direction_id
0,0139b1253130b33adcd4b3a4490530d2,00b4bee9c4a71a1ffffd3e642c9a07f8,416.83,Midday,40.0,P2,
1,0139b1253130b33adcd4b3a4490530d2,0102978f1e2ac86cff442d0c622e8dcb,387.2,PM Peak,30.0,P4,
2,0139b1253130b33adcd4b3a4490530d2,01fbd0f23a0ee04f8e5903f8558e704e,451.55,Midday,40.0,T6,
3,0139b1253130b33adcd4b3a4490530d2,021961b9b493a7217a7362e9496a27b0,585.05,PM Peak,22.0,D3,
4,0139b1253130b33adcd4b3a4490530d2,02c07cf7403e838aa39811f8d095faea,585.05,AM Peak,22.0,D3,


In [17]:
# Number of distinct route_id values among the trips.
df2.route_id.nunique()

1778

In [18]:
# Check dtypes and null counts; direction_id is the column to watch for nulls.
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 103451 entries, 0 to 103450
Data columns (total 7 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   schedule_gtfs_dataset_key  103451 non-null  object 
 1   trip_instance_key          103451 non-null  object 
 2   median_stop_meters         103447 non-null  float64
 3   time_of_day                103451 non-null  object 
 4   scheduled_service_minutes  103451 non-null  float64
 5   route_id                   103451 non-null  object 
 6   direction_id               93963 non-null   float64
dtypes: float64(3), object(4)
memory usage: 6.3+ MB


In [19]:
# A missing direction_id (NaN) is treated as direction 0 so the column can be
# used as a join key against the cardinal-direction table built below.
df2["direction_id"] = df2["direction_id"].fillna(0)

#### Add in Cardinal Direction Stuff somewhere here?

In [20]:
# stop_times columns carried into the cardinal-direction work; the geometry
# column is not in this list.
stop_times_col = [
    "feed_key",
    "stop_id",
    "stop_sequence",
    "schedule_gtfs_dataset_key",
    "trip_instance_key",
    "shape_array_key",
    "stop_name",
    "prior_stop_sequence",
    "subseq_stop_sequence",
    "stop_pair",
    "stop_pair_name",
    "stop_primary_direction",
    "stop_meters",
]

In [21]:
# Keep only the columns named in stop_times_col.
stop_times_gdf2 = stop_times_gdf[stop_times_col]

In [22]:
# Distinct route_id count in the trips table, for comparison after merging.
trips_to_route_df.route_id.nunique()

1778

In [23]:
# Inspect the column names actually returned for the trips table.
trips_to_route_df.columns

Index(['route_id', 'trip_instance_key', 'schedule_gtfs_dataset_key',
       'shape_array_key', 'direction_id', 'route_long_name',
       'route_short_name', 'route_desc', 'name'],
      dtype='object')

In [24]:
# Keys shared by the stop_times and trips tables.
merge_cols = [
    "trip_instance_key",
    "schedule_gtfs_dataset_key",
    "shape_array_key",
]

In [25]:
# Attach trip / route attributes to every stop_time row.
cardinal_dir1 = stop_times_gdf2.merge(
    trips_to_route_df,
    on=merge_cols,
    how="inner",
)

In [26]:
# Treat a missing direction_id as direction 0 before grouping on it.
cardinal_dir1["direction_id"] = cardinal_dir1["direction_id"].fillna(0)

In [27]:
# Row and column counts after attaching trip / route info to every stop_time.
cardinal_dir1.shape

(3558553, 19)

In [28]:
# Distinct routes after the merge.
cardinal_dir1.route_id.nunique()

1778

In [29]:
# Distinct trips after the merge.
cardinal_dir1.trip_instance_key.nunique()

103451

In [30]:
# Column dtypes of the merged stop_times + trips table.
cardinal_dir1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3558553 entries, 0 to 3558552
Data columns (total 19 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   feed_key                   object 
 1   stop_id                    object 
 2   stop_sequence              int64  
 3   schedule_gtfs_dataset_key  object 
 4   trip_instance_key          object 
 5   shape_array_key            object 
 6   stop_name                  object 
 7   prior_stop_sequence        Int64  
 8   subseq_stop_sequence       Int64  
 9   stop_pair                  object 
 10  stop_pair_name             object 
 11  stop_primary_direction     object 
 12  stop_meters                float64
 13  route_id                   object 
 14  direction_id               float64
 15  route_long_name            object 
 16  route_short_name           object 
 17  route_desc                 object 
 18  name                       object 
dtypes: Int64(2), float64(2), int64(1), object(

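### Why do some of the route IDs drop off?
* Some rows have a `direction_id` of `NaN`, and `groupby` drops rows with NaN keys by default, so `direction_id` is filled with 0 before grouping.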

In [31]:
# Count stop_time rows for every cardinal direction within each route-direction.
cardinal_dir2 = cardinal_dir1.groupby(
    [
        "route_id",
        "schedule_gtfs_dataset_key",
        "direction_id",
        "stop_primary_direction",
    ],
    as_index=False,
).agg(total_stops=("stop_sequence", "count"))

In [32]:
# Distinct routes remaining after the groupby.
cardinal_dir2.route_id.nunique()

1778

In [33]:
# Route ids before and after the groupby, as sets for easy differencing.
cardinal_dir1_routes = set(cardinal_dir1["route_id"].unique())
cardinal_dir2_routes = set(cardinal_dir2["route_id"].unique())

In [34]:
# How many routes were lost between cardinal_dir1 and cardinal_dir2.
len(cardinal_dir1_routes - cardinal_dir2_routes)

0

In [35]:
# Which routes were lost (an empty set means none).
(cardinal_dir1_routes - cardinal_dir2_routes)

set()

In [36]:
# Spot check: distinct (cardinal direction, direction_id) pairs for one route.
cardinal_dir1.loc[
    cardinal_dir1.route_id == "0177a66b-9f33-407d-a72e-776429fb73d4",
    ["stop_primary_direction", "direction_id"],
].drop_duplicates()

Unnamed: 0,stop_primary_direction,direction_id
1518439,Unknown,0.0
1518440,Eastbound,0.0
1518441,Southbound,0.0
1518446,Westbound,0.0
1518448,Northbound,0.0


In [37]:
# Spot check: distinct (cardinal direction, direction_id) pairs for a second route.
cardinal_dir1.loc[
    cardinal_dir1.route_id == "9f38a05f-6eea-47f4-bf42-992a789e7b49",
    ["stop_primary_direction", "direction_id"],
].drop_duplicates()

Unnamed: 0,stop_primary_direction,direction_id
1027572,Unknown,0.0
1027573,Westbound,0.0


In [38]:
# Spot check: distinct (route, cardinal direction, direction_id) rows for a third route.
cardinal_dir1.loc[
    cardinal_dir1.route_id == "3ff1b747-a791-4eb3-90b2-25cb355b6c67",
    ["route_id", "stop_primary_direction", "direction_id"],
].drop_duplicates()

Unnamed: 0,route_id,stop_primary_direction,direction_id
179634,3ff1b747-a791-4eb3-90b2-25cb355b6c67,Unknown,0.0
179635,3ff1b747-a791-4eb3-90b2-25cb355b6c67,Southbound,0.0
179638,3ff1b747-a791-4eb3-90b2-25cb355b6c67,Eastbound,0.0


In [39]:
# Stop counts by cardinal direction for route "001" of one feed.
cardinal_dir2[
    cardinal_dir2["route_id"].eq("001")
    & cardinal_dir2["schedule_gtfs_dataset_key"].eq("cb3074eb8b423dfc5acfeeb0de95eb82")
]

Unnamed: 0,route_id,schedule_gtfs_dataset_key,direction_id,stop_primary_direction,total_stops
0,1,cb3074eb8b423dfc5acfeeb0de95eb82,0.0,Northbound,124
1,1,cb3074eb8b423dfc5acfeeb0de95eb82,0.0,Southbound,434
2,1,cb3074eb8b423dfc5acfeeb0de95eb82,0.0,Unknown,62
3,1,cb3074eb8b423dfc5acfeeb0de95eb82,0.0,Westbound,1426
4,1,cb3074eb8b423dfc5acfeeb0de95eb82,1.0,Eastbound,1550
5,1,cb3074eb8b423dfc5acfeeb0de95eb82,1.0,Northbound,434
6,1,cb3074eb8b423dfc5acfeeb0de95eb82,1.0,Southbound,62
7,1,cb3074eb8b423dfc5acfeeb0de95eb82,1.0,Unknown,62


In [40]:
# Within each route-direction, order the cardinal directions from most to
# fewest stops so the most common one comes first.
cardinal_dir3 = cardinal_dir2.sort_values(
    by=["route_id", "schedule_gtfs_dataset_key", "direction_id", "total_stops"],
    ascending=[True, True, True, False],
)

In [41]:
# Same route after sorting: the largest stop count now leads each direction_id.
cardinal_dir3[
    cardinal_dir3["route_id"].eq("001")
    & cardinal_dir3["schedule_gtfs_dataset_key"].eq("cb3074eb8b423dfc5acfeeb0de95eb82")
]

Unnamed: 0,route_id,schedule_gtfs_dataset_key,direction_id,stop_primary_direction,total_stops
3,1,cb3074eb8b423dfc5acfeeb0de95eb82,0.0,Westbound,1426
1,1,cb3074eb8b423dfc5acfeeb0de95eb82,0.0,Southbound,434
0,1,cb3074eb8b423dfc5acfeeb0de95eb82,0.0,Northbound,124
2,1,cb3074eb8b423dfc5acfeeb0de95eb82,0.0,Unknown,62
4,1,cb3074eb8b423dfc5acfeeb0de95eb82,1.0,Eastbound,1550
5,1,cb3074eb8b423dfc5acfeeb0de95eb82,1.0,Northbound,434
6,1,cb3074eb8b423dfc5acfeeb0de95eb82,1.0,Southbound,62
7,1,cb3074eb8b423dfc5acfeeb0de95eb82,1.0,Unknown,62


In [42]:
# Keep the first (most common) stop_primary_direction per route-direction
# and renumber the index from 0.
cardinal_dir4 = cardinal_dir3.drop_duplicates(
    subset=["route_id", "schedule_gtfs_dataset_key", "direction_id"],
    ignore_index=True,
)

In [43]:
# Same route after de-duplication: one cardinal direction per direction_id.
cardinal_dir4[
    cardinal_dir4["route_id"].eq("001")
    & cardinal_dir4["schedule_gtfs_dataset_key"].eq("cb3074eb8b423dfc5acfeeb0de95eb82")
]

Unnamed: 0,route_id,schedule_gtfs_dataset_key,direction_id,stop_primary_direction,total_stops
0,1,cb3074eb8b423dfc5acfeeb0de95eb82,0.0,Westbound,1426
1,1,cb3074eb8b423dfc5acfeeb0de95eb82,1.0,Eastbound,1550


In [44]:
# The stop count was only needed for ranking. errors="ignore" keeps this cell
# re-runnable: it overwrites its own input, so a second run would otherwise
# raise KeyError because the column is already gone.
cardinal_dir4 = cardinal_dir4.drop(columns=["total_stops"], errors="ignore")

In [45]:
# Distinct routes that received a most-common cardinal direction.
cardinal_dir4.route_id.nunique()

1778

### Merge (some route IDs are missing because they don't have 0/1 populated in `direction_id`)

In [46]:
# Left side of the upcoming merge: per-trip metrics.
df2.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,trip_instance_key,median_stop_meters,time_of_day,scheduled_service_minutes,route_id,direction_id
0,0139b1253130b33adcd4b3a4490530d2,00b4bee9c4a71a1ffffd3e642c9a07f8,416.83,Midday,40.0,P2,0.0
1,0139b1253130b33adcd4b3a4490530d2,0102978f1e2ac86cff442d0c622e8dcb,387.2,PM Peak,30.0,P4,0.0


In [47]:
# Right side of the upcoming merge: one cardinal direction per route-direction.
cardinal_dir4.head(2)

Unnamed: 0,route_id,schedule_gtfs_dataset_key,direction_id,stop_primary_direction
0,1,cb3074eb8b423dfc5acfeeb0de95eb82,0.0,Westbound
1,1,cb3074eb8b423dfc5acfeeb0de95eb82,1.0,Eastbound


In [48]:
# Compare distinct route counts on both sides of the merge.
cardinal_dir4.route_id.nunique(), df2.route_id.nunique()

(1778, 1778)

In [49]:
# Row counts: trips (df2) vs. route-directions (cardinal_dir4).
len(df2), len(cardinal_dir4)

(103451, 4408)

In [50]:
# Outer join with an indicator column to see whether any trip or any
# route-direction fails to find a match on the three keys.
pd.merge(
    df2,
    cardinal_dir4,
    on=["schedule_gtfs_dataset_key", "route_id", "direction_id"],
    how="outer",
    indicator=True,
)[["_merge"]].value_counts()

_merge    
both          103451
left_only          0
right_only         0
dtype: int64

In [51]:
# Same check with an inner join. An inner join keeps matched rows only, so
# `_merge` can only ever be `both` here; the useful signal is the row count.
pd.merge(
    df2,
    cardinal_dir4,
    on=["schedule_gtfs_dataset_key", "route_id", "direction_id"],
    how="inner",
    indicator=True,
)[["_merge"]].value_counts()

_merge    
both          103451
left_only          0
right_only         0
dtype: int64

In [52]:
# Attach the most common cardinal direction to every trip via its route-direction.
m1 = df2.merge(
    cardinal_dir4,
    how="inner",
    on=["schedule_gtfs_dataset_key", "route_id", "direction_id"],
)

In [53]:
# Joining without direction_id matches each trip to every direction of its
# route, so the row count grows beyond len(df2); direction_id must stay in
# the join keys.
pd.merge(
    df2,
    cardinal_dir4,
    on=["schedule_gtfs_dataset_key", "route_id"],
    how="outer",
    indicator=True,
)[["_merge"]].value_counts()

_merge    
both          185236
left_only          0
right_only         0
dtype: int64

#### How does harmonizing the route names fit in here?
* [This script](https://github.com/cal-itp/data-analyses/blob/b1e5d4f870400251240eeba4a6515a0848e5d6f8/gtfs_funnel/clean_route_naming.py#L4)

In [54]:
# Preview the merged per-trip table with stop_primary_direction attached.
m1.head(3)

Unnamed: 0,schedule_gtfs_dataset_key,trip_instance_key,median_stop_meters,time_of_day,scheduled_service_minutes,route_id,direction_id,stop_primary_direction
0,0139b1253130b33adcd4b3a4490530d2,00b4bee9c4a71a1ffffd3e642c9a07f8,416.83,Midday,40.0,P2,0.0,Westbound
1,0139b1253130b33adcd4b3a4490530d2,1813b6df69d2559a6b55935bc9c251d6,416.83,PM Peak,40.0,P2,0.0,Westbound
2,0139b1253130b33adcd4b3a4490530d2,18950c64fc31d8452eea43d51a79014e,416.83,Midday,40.0,P2,0.0,Westbound


In [55]:
# Check row count and nulls after the merge.
m1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 103451 entries, 0 to 103450
Data columns (total 8 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   schedule_gtfs_dataset_key  103451 non-null  object 
 1   trip_instance_key          103451 non-null  object 
 2   median_stop_meters         103447 non-null  float64
 3   time_of_day                103451 non-null  object 
 4   scheduled_service_minutes  103451 non-null  float64
 5   route_id                   103451 non-null  object 
 6   direction_id               103451 non-null  float64
 7   stop_primary_direction     103451 non-null  object 
dtypes: float64(3), object(5)
memory usage: 7.1+ MB


### Final

In [56]:
def find_most_common_dir(
    stop_times_gdf: pd.DataFrame,
    trips_to_route_df: pd.DataFrame,
) -> pd.DataFrame:
    """
    Find the most common cardinal direction for each route-direction.

    Stop_time rows are counted per (route_id, schedule_gtfs_dataset_key,
    direction_id, stop_primary_direction), and the stop_primary_direction
    with the highest count is kept for each route-direction. Ties go to the
    alphabetically first stop_primary_direction.

    Parameters
    ----------
    stop_times_gdf
        Scheduled stop_times with at least `trip_instance_key`,
        `schedule_gtfs_dataset_key`, `shape_array_key`, `stop_sequence`
        and `stop_primary_direction`. Geometry is never used, so a plain
        DataFrame works as well as a GeoDataFrame.
    trips_to_route_df
        Scheduled trips with at least `trip_instance_key`,
        `schedule_gtfs_dataset_key`, `shape_array_key`, `route_id`
        and `direction_id`.

    Returns
    -------
    pd.DataFrame
        One row per (route_id, schedule_gtfs_dataset_key, direction_id)
        with columns `route_id`, `schedule_gtfs_dataset_key`,
        `direction_id`, `stop_primary_direction`.
    """
    merge_cols = ["trip_instance_key", "schedule_gtfs_dataset_key", "shape_array_key"]
    route_dir_cols = ["route_id", "schedule_gtfs_dataset_key", "direction_id"]

    # Only carry the columns the count needs; stop_times has millions of rows,
    # so dragging names / descriptions through the merge just costs memory.
    stop_times = stop_times_gdf[merge_cols + ["stop_sequence", "stop_primary_direction"]]
    trips = trips_to_route_df[merge_cols + ["route_id", "direction_id"]]

    df1 = pd.merge(stop_times, trips, on=merge_cols, how="inner")

    # groupby drops NaN keys by default, so a missing direction_id is
    # treated as direction 0 to keep those routes in the result.
    df1 = df1.assign(direction_id=df1["direction_id"].fillna(0))

    agg1 = (
        df1.groupby(route_dir_cols + ["stop_primary_direction"])
        .agg({"stop_sequence": "count"})
        .reset_index()
        .rename(columns={"stop_sequence": "total_stops"})
    )

    # Largest stop count first within each route-direction. The trailing
    # stop_primary_direction key makes the tie-break explicit instead of
    # relying on the sort being stable.
    agg2 = agg1.sort_values(
        by=route_dir_cols + ["total_stops", "stop_primary_direction"],
        ascending=[True, True, True, False, True],
    )

    # Keep only the top stop_primary_direction per route-direction.
    agg3 = (
        agg2.drop_duplicates(subset=route_dir_cols)
        .drop(columns=["total_stops"])
        .reset_index(drop=True)
    )

    return agg3

In [57]:
def assemble_scheduled_trip_metrics(
    analysis_date: str,
    dict_inputs: dict
) -> pd.DataFrame:
    """
    Get GTFS schedule trip metrics including time-of-day buckets,
    scheduled service minutes, median stop spacing, and the most common
    cardinal direction of the trip's route-direction.

    Parameters
    ----------
    analysis_date
        Service date used in the input file names, e.g. "2024-04-17".
    dict_inputs
        Catalog object; `rt_vs_schedule_tables.stop_times_direction` is read
        from it by attribute access (the notebook passes GTFS_DATA_DICT).
        NOTE(review): attribute access means a plain dict would not work
        here despite the annotation -- confirm the intended type.

    Returns
    -------
    pd.DataFrame
        Columns: schedule_gtfs_dataset_key, trip_instance_key,
        median_stop_meters, time_of_day, scheduled_service_minutes,
        route_id, direction_id, stop_primary_direction.
    """
    STOP_TIMES_FILE = dict_inputs.rt_vs_schedule_tables.stop_times_direction

    # Geometry is never used below, so read only the tabular columns with
    # pandas instead of loading a GeoDataFrame.
    stop_times_cols = [
        "feed_key",
        "stop_id",
        "stop_sequence",
        "schedule_gtfs_dataset_key",
        "trip_instance_key",
        "shape_array_key",
        "stop_name",
        "prior_stop_sequence",
        "subseq_stop_sequence",
        "stop_pair",
        "stop_pair_name",
        "stop_primary_direction",
        "stop_meters",
    ]

    df = pd.read_parquet(
        f"{RT_SCHED_GCS}{STOP_TIMES_FILE}_{analysis_date}.parquet",
        columns=stop_times_cols,
    )

    scheduled_col = [
        "route_id",
        "trip_instance_key",
        "gtfs_dataset_key",
        "shape_array_key",
        "direction_id",
        "route_long_name",
        "route_short_name",
        "route_desc",
        "name",
    ]

    trips_to_route = helpers.import_scheduled_trips(
        analysis_date,
        columns=scheduled_col,
        get_pandas=True,
    )

    time_of_day = gtfs_schedule_wrangling.get_trip_time_buckets(analysis_date)[
        ["trip_instance_key", "time_of_day", "scheduled_service_minutes"]
    ]

    trip_cols = ["schedule_gtfs_dataset_key", "trip_instance_key"]
    trips_to_route_cols_subset = ["trip_instance_key", "route_id", "direction_id"]

    grouped_df = df.groupby(trip_cols, observed=True, group_keys=False)

    # Median stop spacing per trip, then attach time-of-day (left join)
    # and route_id / direction_id (inner join).
    median_stop_meters_df = pd.merge(
        grouped_df.agg({"stop_meters": "median"})
        .reset_index()
        .rename(columns={"stop_meters": "median_stop_meters"}),
        time_of_day,
        on="trip_instance_key",
        how="left",
    ).merge(
        trips_to_route[trips_to_route_cols_subset],
        on="trip_instance_key",
        how="inner",
    )

    # A missing direction_id is treated as direction 0 so it lines up with
    # the keys produced by find_most_common_dir.
    median_stop_meters_df["direction_id"] = median_stop_meters_df[
        "direction_id"
    ].fillna(0)

    # Most common cardinal direction per route-direction.
    cardinal_direction_df = find_most_common_dir(df, trips_to_route)

    # Each trip must match at most one route-direction row; validate guards
    # against silently duplicating trips if the right side is ever not
    # unique on the join keys.
    m1 = pd.merge(
        median_stop_meters_df,
        cardinal_direction_df,
        on=["schedule_gtfs_dataset_key", "route_id", "direction_id"],
        how="inner",
        validate="many_to_one",
    )

    return m1

In [58]:
# Run the assembled function end to end for the analysis date.
test = assemble_scheduled_trip_metrics(analysis_date, GTFS_DATA_DICT)

In [59]:
# Preview the function output.
test.head()

Unnamed: 0,schedule_gtfs_dataset_key,trip_instance_key,median_stop_meters,time_of_day,scheduled_service_minutes,route_id,direction_id,stop_primary_direction
0,0139b1253130b33adcd4b3a4490530d2,00b4bee9c4a71a1ffffd3e642c9a07f8,416.83,Midday,40.0,P2,0.0,Westbound
1,0139b1253130b33adcd4b3a4490530d2,1813b6df69d2559a6b55935bc9c251d6,416.83,PM Peak,40.0,P2,0.0,Westbound
2,0139b1253130b33adcd4b3a4490530d2,18950c64fc31d8452eea43d51a79014e,416.83,Midday,40.0,P2,0.0,Westbound
3,0139b1253130b33adcd4b3a4490530d2,1f8b0f73bd19df77ed2a53326cb42b09,425.64,Midday,36.0,P2,0.0,Westbound
4,0139b1253130b33adcd4b3a4490530d2,2c20ea3bfc355c168eccba16ad62a018,425.64,PM Peak,36.0,P2,0.0,Westbound


In [60]:
# Row / column counts of the function output, to compare against m1 above.
test.shape

(103451, 8)

In [61]:
# Keys that identify one route-direction within a feed.
route_cols = ["schedule_gtfs_dataset_key", "route_id", "direction_id"]

In [62]:
import sys
sys.path.append("../gtfs_funnel")
import schedule_stats_by_route_direction

In [63]:
# Feed the per-trip table into the existing route-direction aggregation.
route_dir_metrics = schedule_stats_by_route_direction.schedule_metrics_by_route_direction(
    test,
    analysis_date,
    route_cols,
)

In [64]:
# Catalog entry used as the file-name stem for the route typologies table.
ROUTE_TYPOLOGIES = GTFS_DATA_DICT.schedule_tables.route_typologies

In [65]:
# Route typology flags for each route-direction on the analysis date.
route_typologies = pd.read_parquet(
    f"{SCHED_GCS}{ROUTE_TYPOLOGIES}_{analysis_date}.parquet",
    columns=[
        *route_cols,
        "is_coverage",
        "is_downtown_local",
        "is_local",
        "is_rapid",
        "is_express",
        "is_rail",
    ],
)

In [66]:
# Left join so every route-direction metric row is kept even without a typology.
route_dir_metrics2 = pd.merge(
    route_dir_metrics,
    route_typologies,
    how="left",
    on=route_cols,
)

In [67]:
# Preview without the geometry column so the table stays readable.
route_dir_metrics2.head().drop(columns=["geometry"])

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,common_shape_id,route_name,avg_scheduled_service_minutes,avg_stop_miles,n_trips,time_period,frequency,is_coverage,is_downtown_local,is_local,is_rapid,is_express,is_rail
0,36b8fbf12e4adc76b21651462b200860,569,1.0,p_859,Sacramento,94.0,2.61,2,all_day,0.08,1.0,0.0,0.0,0.0,0.0,0.0
1,36b8fbf12e4adc76b21651462b200860,569,1.0,p_859,Sacramento,94.0,2.61,1,offpeak,0.06,1.0,0.0,0.0,0.0,0.0,0.0
2,36b8fbf12e4adc76b21651462b200860,569,1.0,p_859,Sacramento,94.0,2.61,1,peak,0.12,1.0,0.0,0.0,0.0,0.0,0.0
3,36b8fbf12e4adc76b21651462b200860,569,0.0,p_867,Sacramento,87.5,3.46,2,all_day,0.08,1.0,0.0,0.0,0.0,0.0,0.0
4,36b8fbf12e4adc76b21651462b200860,569,0.0,p_867,Sacramento,87.5,3.46,2,peak,0.25,1.0,0.0,0.0,0.0,0.0,0.0
