## Adding Cardinal Direction into Pipeline
* Editing [this file](https://github.com/cal-itp/data-analyses/blob/ah_gtfs_portfolio/gtfs_funnel/schedule_stats_by_route_direction.py#L23)

In [1]:
import _section2_utils as section2
import geopandas as gpd
import numpy as np
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)
from shared_utils import catalog_utils, rt_dates, rt_utils

In [2]:
# Notebook display settings: show up to 100 columns, never truncate rows or
# cell contents, and print floats with two decimals.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [3]:
RT_SCHED_GCS

'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/'

In [4]:
GTFS_DATA_DICT.rt_vs_schedule_tables.stop_times_direction

'stop_times_direction'

In [5]:
analysis_date = "2024-04-17"

In [6]:
GTFS_DATA_DICT.rt_vs_schedule_tables.sched_trip_metrics

'schedule_trip/schedule_trip_metrics'

### Seeing my changes in `gtfs_funnel/schedule_stats_by_route_direction.py` 6/7

#### `schedule_route_dir`

In [7]:
sched_route = GTFS_DATA_DICT.rt_vs_schedule_tables.sched_route_direction_metrics

In [8]:
# gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_route_dir/schedule_route_direction_metrics_2024-04-15_AH_TESTING.parquet

In [9]:
route_url = f"{RT_SCHED_GCS}{sched_route}_AH_TESTING_"

In [10]:
#https://storage.cloud.google.com/calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_route_dir/schedule_route_direction_metrics_AH_TESTING_2024-04-15.parquet

In [11]:
apr_15_routes = pd.read_parquet(f"{route_url}2024-04-15.parquet")

In [12]:
apr_16_routes = pd.read_parquet(f"{route_url}2024-04-16.parquet")

In [13]:
apr_16_routes.head(1).drop(columns = ['geometry'])

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,common_shape_id,route_name,stop_primary_direction,avg_scheduled_service_minutes,avg_stop_miles,n_trips,time_period,frequency,is_coverage,is_downtown_local,is_local,is_rapid,is_express,is_rail
0,3c62ad6ee589d56eca915ce291a5df0a,2b5c285d-29b2-4d19-8f12-f85348cd832e,0.0,ea5549cf-85a1-4f9e-a4e4-39e53493d2d1,WEST SACRAMENTO SHUTTLE - 240,Westbound,50.0,0.21,12,all_day,0.5,1.0,0.0,0.0,1.0,0.0,0.0


#### `schedule_trip`

In [14]:
# gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_trip/schedule_trip_metrics_2024-04-15_AH_TESTING.parquet

In [15]:
trip_gcs = GTFS_DATA_DICT.rt_vs_schedule_tables.sched_trip_metrics

In [16]:
trip_url = f"{RT_SCHED_GCS}{trip_gcs}_"

In [17]:
trip_url

'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_trip/schedule_trip_metrics_'

In [18]:
apr_15_trips = pd.read_parquet(f"{trip_url}2024-04-15_AH_TESTING.parquet")

In [19]:
apr_15_trips.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,trip_instance_key,median_stop_meters,time_of_day,scheduled_service_minutes,route_id,direction_id,stop_primary_direction
0,0139b1253130b33adcd4b3a4490530d2,004c9c82f6ef126000f4067c79f48ef2,740.13,PM Peak,55.0,c6726149-9979-4ebb-85f6-0be90402266c,0.0,Southbound
1,0139b1253130b33adcd4b3a4490530d2,1caf03cf383f956c179788b51e850db0,893.83,PM Peak,57.0,c6726149-9979-4ebb-85f6-0be90402266c,0.0,Southbound


In [20]:
apr_16_trips = pd.read_parquet(f"{trip_url}2024-04-16_AH_TESTING.parquet")

In [21]:
apr_16_trips.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,trip_instance_key,median_stop_meters,time_of_day,scheduled_service_minutes,route_id,direction_id,stop_primary_direction
0,0139b1253130b33adcd4b3a4490530d2,000a5e91dd67d8dbbbe91b97159bfc6c,1447.13,Midday,60.0,3ff1b747-a791-4eb3-90b2-25cb355b6c67,0.0,Southbound
1,0139b1253130b33adcd4b3a4490530d2,0e578363758f6012c878553b563ba908,1447.13,Midday,60.0,3ff1b747-a791-4eb3-90b2-25cb355b6c67,0.0,Southbound


#### From 6/7 I realize that the `stop_cardinal_direction` column is missing. How do I add it in?
* Breaking apart `schedule_metrics_by_route_direction`


In [22]:
# Columns that identify a route-direction.
route_merge_cols = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
]

# Grouping keys: the route-direction identifiers plus the cardinal direction,
# so stop_primary_direction is carried through the aggregations below.
route_group_cols = [*route_merge_cols, "stop_primary_direction"]

In [23]:
service_freq_df = gtfs_schedule_wrangling.aggregate_time_of_day_to_peak_offpeak(
        apr_16_trips, route_group_cols, long_or_wide = "long")

In [24]:
service_freq_df.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,stop_primary_direction,n_trips,time_period,frequency
0,0139b1253130b33adcd4b3a4490530d2,0177a66b-9f33-407d-a72e-776429fb73d4,0.0,Eastbound,2,all_day,0.08
1,0139b1253130b33adcd4b3a4490530d2,0ad6c6aa-1939-45a0-a3a8-02ebe8e19092,0.0,Northbound,13,all_day,0.54


In [25]:
# Route-direction averages of the trip-level metrics.
# Each trip is already boiled down to its median stop spacing, so the
# route-direction value is the mean of those per-trip medians.
# NOTE(review): open question from the author - does a mean of medians make sense here?
metrics_df = (
    apr_16_trips
    .groupby(route_group_cols, observed=True, group_keys=False)
    .agg(
        avg_stop_meters=("median_stop_meters", "mean"),
        avg_scheduled_service_minutes=("scheduled_service_minutes", "mean"),
    )
    .reset_index()
)

In [26]:
metrics_df.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,stop_primary_direction,avg_stop_meters,avg_scheduled_service_minutes
0,0139b1253130b33adcd4b3a4490530d2,0177a66b-9f33-407d-a72e-776429fb73d4,0.0,Eastbound,789.86,63.5
1,0139b1253130b33adcd4b3a4490530d2,0ad6c6aa-1939-45a0-a3a8-02ebe8e19092,0.0,Northbound,760.42,50.54


In [27]:
from shared_utils.rt_utils import METERS_PER_MILE

In [28]:
metrics_df = metrics_df.assign(
        avg_stop_miles = metrics_df.avg_stop_meters.divide(METERS_PER_MILE).round(2)
    ).drop(columns = ["avg_stop_meters"])

In [29]:
round_me = ["avg_stop_miles", "avg_scheduled_service_minutes"]
metrics_df[round_me] = metrics_df[round_me].round(2)

common_shape = gtfs_schedule_wrangling.most_common_shape_by_route_direction(
        analysis_date
    ).pipe(helpers.remove_shapes_outside_ca)

### `gtfs_digest/merge_data.py`

In [30]:
analysis_date_list = ["2024-04-15","2024-04-16"]

In [31]:

# Columns that identify one route-direction-time_period row, including the
# cardinal direction (stop_primary_direction).
route_time_cols = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
    "time_period",
    "stop_primary_direction",
]

In [32]:
sort_cols = route_time_cols + ["service_date"]


In [33]:
def concatenate_schedule_by_route_direction(
    date_list: list
) -> pd.DataFrame:
    """
    Stack the route-direction-time_period schedule metrics for several dates.

    Reads the `_AH_TESTING` variant of the
    `sched_route_direction_metrics` table from RT_SCHED_GCS for every date
    in `date_list`, keeping the `route_time_cols` identifiers plus the
    schedule metrics and typology flags.

    The result is sorted by `sort_cols`, `n_trips` is renamed to
    `n_scheduled_trips` so the data source is clear, and the index is reset.
    Relies on the module-level `route_time_cols` and `sort_cols`.
    """
    base_file = GTFS_DATA_DICT.rt_vs_schedule_tables.sched_route_direction_metrics
    testing_file = f"{base_file}_AH_TESTING"

    metric_cols = [
        "avg_scheduled_service_minutes",
        "avg_stop_miles",
        "n_trips", "frequency",
        "is_express", "is_rapid", "is_rail",
        "is_coverage", "is_downtown_local", "is_local",
    ]

    combined = time_series_utils.concatenate_datasets_across_dates(
        RT_SCHED_GCS,
        testing_file,
        date_list,
        data_type="df",
        columns=route_time_cols + metric_cols,
    )

    combined = combined.sort_values(sort_cols)
    # rename so we understand data source
    combined = combined.rename(columns={"n_trips": "n_scheduled_trips"})

    return combined.reset_index(drop=True)

In [34]:
concat_test = concatenate_schedule_by_route_direction(analysis_date_list)

In [35]:
concat_test.head()

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,stop_primary_direction,avg_scheduled_service_minutes,avg_stop_miles,n_scheduled_trips,frequency,is_express,is_rapid,is_rail,is_coverage,is_downtown_local,is_local,service_date
0,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,all_day,Northbound,51.77,0.27,22,0.92,0.0,0.0,0.0,0.0,1.0,0.0,2024-04-15
1,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,all_day,Northbound,51.77,0.27,22,0.92,0.0,0.0,0.0,0.0,1.0,0.0,2024-04-16
2,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,offpeak,Northbound,51.77,0.27,10,0.62,0.0,0.0,0.0,0.0,1.0,0.0,2024-04-15
3,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,offpeak,Northbound,51.77,0.27,10,0.62,0.0,0.0,0.0,0.0,1.0,0.0,2024-04-16
4,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,peak,Northbound,51.77,0.27,12,1.5,0.0,0.0,0.0,0.0,1.0,0.0,2024-04-15


In [36]:
import sys

sys.path.append("../gtfs_digest")
import merge_data

#### `set_primary_typology` should also include `stop_primary_direction`?

In [37]:
def set_primary_typology(df: pd.DataFrame) -> pd.DataFrame:
    """
    Assign one primary typology per `route_time_cols` combination.

    Typology flag columns are every column whose name contains "is_",
    except the RT timeliness flags (is_ontime, is_early, is_late).
    Rows are sorted by the identifiers and flags and de-duplicated on
    `route_time_cols`, then each flag is multiplied by its rank and the
    highest score picks the typology
    (coverage < local < downtown_local < express < rapid < rail).
    A row whose max score maps to no rank is labelled "unknown".

    Assumes the is_* flags are 0/1 indicators - TODO confirm.
    Relies on the module-level `route_time_cols`.

    Returns a frame with `route_time_cols` + ["typology"].
    """
    rt_flags = ("is_ontime", "is_early", "is_late")
    typology_flags = [
        col for col in df.columns
        if "is_" in col and col not in rt_flags
    ]
    ordered_cols = route_time_cols + typology_flags

    deduped = (
        df[ordered_cols]
        .sort_values(ordered_cols)
        .drop_duplicates(subset=route_time_cols)
    )

    ranks = {
        "coverage": 1,
        "local": 2,
        "downtown_local": 3,
        "express": 4,
        "rapid": 5,
        "rail": 6,
    }

    # Score each typology by its rank; the largest score wins.
    for name, rank in ranks.items():
        deduped[f"{name}_score"] = deduped[f"is_{name}"] * rank

    score_cols = [col for col in deduped.columns if "_score" in col]
    deduped["max_score"] = deduped[score_cols].max(axis=1)

    rank_to_name = {rank: name for name, rank in ranks.items()}
    deduped["typology"] = deduped.max_score.map(rank_to_name).fillna("unknown")

    return deduped[route_time_cols + ["typology"]]

In [38]:
primary_typology = set_primary_typology(concat_test)

In [39]:
primary_typology.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,stop_primary_direction,typology
0,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,all_day,Northbound,downtown_local
2,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,offpeak,Northbound,downtown_local


In [40]:
# Attach the primary typology label to every schedule row;
# the left join keeps all rows of concat_test.
df_sched2 = concat_test.merge(
    primary_typology,
    how="left",
    on=route_time_cols,
)

In [41]:
df_avg_speeds = merge_data.concatenate_speeds_by_route_direction(analysis_date_list)

In [42]:
df_avg_speeds.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,speed_mph,service_date
0,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,all_day,17.94,2024-04-15
1,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,all_day,16.21,2024-04-16


In [43]:
df_rt_sched = (
        merge_data.concatenate_rt_vs_schedule_by_route_direction(
            analysis_date_list)
        .astype({"direction_id": "float"})
    )

In [44]:
df_rt_sched.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,minutes_atleast1_vp,minutes_atleast2_vp,total_rt_service_minutes,total_scheduled_service_minutes,total_vp,vp_in_shape,is_early,is_ontime,is_late,n_vp_trips,vp_per_minute,pct_in_shape,pct_rt_journey_atleast1_vp,pct_rt_journey_atleast2_vp,pct_sched_journey_atleast1_vp,pct_sched_journey_atleast2_vp,rt_sched_journey_ratio,avg_rt_service_minutes,name,service_date
0,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,all_day,1600,1575,1743.37,1139.0,4728,3813,0,0,22,22,2.71,0.81,0.92,0.9,1.0,1.0,1.53,79.24,Bay Area 511 Marin Schedule,2024-04-15
1,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,all_day,1760,1741,1809.46,1139.0,5219,3875,0,0,22,22,2.88,0.74,0.97,0.96,1.0,1.0,1.59,82.25,Bay Area 511 Marin Schedule,2024-04-16


#### How do I incorporate `cardinal direction` into the other two datasets, `df_rt_sched` and `df_avg_speeds`?

In [45]:
df_crosswalk = merge_data.concatenate_crosswalk_organization(analysis_date_list)

In [46]:
df_crosswalk.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,name,schedule_source_record_id,base64_url,organization_source_record_id,organization_name,caltrans_district,service_date
0,1770249a5a2e770ca90628434d4934b1,VCTC GMV Schedule,recrAG7e0oOiR6FiP,aHR0cHM6Ly9nb3ZjYnVzLmNvbS9ndGZz,rec7EN71rsZxDFxZd,Ventura County Transportation Commission,07 - Los Angeles,2024-04-15
1,bff13f8993ff18e43577db1f5596e014,Merced GMV Schedule,rec0qwiwmJzZWh8w2,aHR0cHM6Ly90aGVidXNsaXZlLmNvbS9ndGZz,recVSX7dwjxAb557T,Transit Joint Powers Authority for Merced County,10 - Stockton,2024-04-15


In [47]:
df_crosswalk.service_date.unique()

array(['2024-04-15T00:00:00.000000000', '2024-04-16T00:00:00.000000000'],
      dtype='datetime64[ns]')

In [48]:
route_time_cols2 = ["schedule_gtfs_dataset_key", 
                   "route_id", "direction_id", "time_period"]

In [49]:
# Combine the schedule, RT-vs-schedule, and average-speed tables.
# The join keys are route_time_cols2 + service_date, which leave out
# stop_primary_direction: in the previews shown in this notebook only the
# schedule table carries that column.
df = pd.merge(
        df_sched2,
        df_rt_sched,
        on = route_time_cols2 + ["service_date"],
        how = "outer",
        # records whether each row came from the schedule side, the RT side, or both
        indicator = "sched_rt_category"
    ).merge(
        df_avg_speeds,
        on = route_time_cols2 + ["service_date"],
        # outer join again so rows present only in the speeds table are kept
        how = "outer",
    )

In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22065 entries, 0 to 22064
Data columns (total 38 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   schedule_gtfs_dataset_key        22065 non-null  object        
 1   route_id                         22065 non-null  object        
 2   direction_id                     22065 non-null  float64       
 3   time_period                      22065 non-null  object        
 4   stop_primary_direction           21026 non-null  object        
 5   avg_scheduled_service_minutes    21026 non-null  float64       
 6   avg_stop_miles                   21026 non-null  float64       
 7   n_scheduled_trips                21026 non-null  float64       
 8   frequency                        21026 non-null  float64       
 9   is_express                       19949 non-null  float64       
 10  is_rapid                         19949 non-null  float64  

#### `merge_in_standardized_route_names` isn't assigning any names.
`df = df.assign(
        sched_rt_category = df.sched_rt_category.map(
            gtfs_schedule_wrangling.sched_rt_category_dict)
    ).pipe(
        merge_data.merge_in_standardized_route_names,
    ).merge(
        df_crosswalk,
        on = ["schedule_gtfs_dataset_key", "name", "service_date"],
        how = "left"
    )`

##### `gtfs_schedule_wrangling.sched_rt_category_dict`
Isn't working, there's no left only col??

In [52]:
gtfs_schedule_wrangling.sched_rt_category_dict??

[0;31mType:[0m        dict
[0;31mString form:[0m {'left_only': 'schedule_only', 'both': 'schedule_and_vp', 'right_only': 'vp_only'}
[0;31mLength:[0m      3
[0;31mDocstring:[0m  
dict() -> new empty dictionary
dict(mapping) -> new dictionary initialized from a mapping object's
    (key, value) pairs
dict(iterable) -> new dictionary initialized as if via:
    d = {}
    for k, v in iterable:
        d[k] = v
dict(**kwargs) -> new dictionary initialized with the name=value pairs
    in the keyword argument list.  For example:  dict(one=1, two=2)

In [None]:
def merge_in_standardized_route_names(
    df: pd.DataFrame, 
) -> pd.DataFrame:
    """
    Attach standardized route names to `df`.

    Reads the `route_identification` table from SCHED_GCS, cleans it with
    `time_series_utils.clean_standardized_route_names`, and left-merges it
    onto `df` by schedule_gtfs_dataset_key, route_id and service_date.
    Any existing `name` column in `df` is dropped before the merge.
    The merged frame is passed through
    `time_series_utils.parse_route_combined_name`, after which the
    route_desc, combined_name and route_id2 columns are removed.
    """
    route_id_file = GTFS_DATA_DICT.schedule_tables.route_identification

    standardized_names = pd.read_parquet(
        f"{SCHED_GCS}{route_id_file}.parquet"
    )
    standardized_names = time_series_utils.clean_standardized_route_names(
        standardized_names
    ).drop_duplicates()

    # Drop the incoming `name` column before merging in the route names table.
    if "name" in df.columns:
        df = df.drop(columns = "name")

    # Use `route_id` to merge to standardized_route_names
    merged = pd.merge(
        df,
        standardized_names,
        how = "left",
        on = ["schedule_gtfs_dataset_key", "route_id", "service_date"],
    ).drop_duplicates()

    # Clean up: parse the combined name, then drop the helper columns.
    parsed = time_series_utils.parse_route_combined_name(merged)
    helper_cols = ["route_desc", "combined_name", "route_id2"]

    return (
        parsed
        .drop(columns = helper_cols)
        .drop_duplicates()
        .reset_index(drop=True)
    )

In [None]:
df = df.assign(
        sched_rt_category = df.sched_rt_category.map(
            gtfs_schedule_wrangling.sched_rt_category_dict)
    )

In [None]:
df.info()

In [None]:
# Count-like columns that should be stored as integers.
integrify = [
        "n_scheduled_trips", "n_vp_trips",
        "minutes_atleast1_vp", "minutes_atleast2_vp",
        "total_vp", "vp_in_shape",
        "is_early", "is_ontime", "is_late"
    ]
    
# Fill missing values (e.g. rows unmatched in the outer merges) with 0 first,
# because NaN cannot be cast to an integer dtype.
df[integrify] = df[integrify].fillna(0).astype("int")

In [None]:
df.head(3)

In [None]:
df.shape

In [None]:
df.info()

### `assemble_scheduled_trip_metrics`
* `df` from this function is the same thing that is loaded from `section2_utils.load_scheduled_stop_times`.
* Why is `df` read in as a GeoDataFrame (via `gpd.read_parquet`)? Could I just read it in as a regular pandas DataFrame?

In [None]:
STOP_TIMES_FILE = GTFS_DATA_DICT.rt_vs_schedule_tables.stop_times_direction

In [None]:
stop_times_gdf = gpd.read_parquet(
    f"{RT_SCHED_GCS}{STOP_TIMES_FILE}_{analysis_date}.parquet"
)

In [None]:
type(stop_times_gdf)

In [None]:
scheduled_col = [
    "route_id",
    "trip_instance_key",
    "gtfs_dataset_key",
    "shape_array_key",
    "direction_id",
    "route_long_name",
    "route_short_name",
    "route_desc",
    "name",
]

In [None]:
trips_to_route_cols_subset = ["trip_instance_key", "route_id", "direction_id"]

In [None]:
# Add more columns to this.
trips_to_route_df = helpers.import_scheduled_trips(
    analysis_date, columns=scheduled_col, get_pandas=True
)

In [None]:
time_of_day = gtfs_schedule_wrangling.get_trip_time_buckets(analysis_date)[
    ["trip_instance_key", "time_of_day", "scheduled_service_minutes"]
]

In [None]:
trip_cols = ["schedule_gtfs_dataset_key", "trip_instance_key"]

grouped_df = stop_times_gdf.groupby(trip_cols, observed=True, group_keys=False)

In [None]:
len(grouped_df)

In [None]:
df2 = pd.merge(
    grouped_df.agg({"stop_meters": "median"})
    .reset_index()
    .rename(columns={"stop_meters": "median_stop_meters"}),
    time_of_day,
    on="trip_instance_key",
    how="left",
).merge(
    trips_to_route_df[trips_to_route_cols_subset], on="trip_instance_key", how="inner"
)

In [None]:
df2.head()

In [None]:
df2.route_id.nunique()

In [None]:
df2.info()

In [None]:
df2.direction_id = df2.direction_id.fillna(0)

#### Add in Cardinal Direction Stuff somewhere here?

In [None]:
stop_times_col = [
    "feed_key",
    "stop_id",
    "stop_sequence",
    "schedule_gtfs_dataset_key",
    "trip_instance_key",
    "shape_array_key",
    "stop_name",
    "prior_stop_sequence",
    "subseq_stop_sequence",
    "stop_pair",
    "stop_pair_name",
    "stop_primary_direction",
    "stop_meters",
]

In [None]:
stop_times_gdf2 = stop_times_gdf[stop_times_col]

In [None]:
trips_to_route_df.route_id.nunique()

In [None]:
trips_to_route_df.columns

In [None]:
merge_cols = [
    "trip_instance_key",
    "schedule_gtfs_dataset_key",
    "shape_array_key",
]

In [None]:
cardinal_dir1 = pd.merge(stop_times_gdf2, trips_to_route_df, on=merge_cols, how="inner")

In [None]:
cardinal_dir1.direction_id = cardinal_dir1.direction_id.fillna(0)

In [None]:
cardinal_dir1.shape

In [None]:
cardinal_dir1.route_id.nunique()

In [None]:
cardinal_dir1.trip_instance_key.nunique()

In [None]:
cardinal_dir1.info()

### Why do some of the route IDs drop off?
* Some have `direction_id` that are `nan`

In [None]:
cardinal_dir2 = (
    cardinal_dir1.groupby(
        [
            "route_id",
            "schedule_gtfs_dataset_key",
            "direction_id",
            "stop_primary_direction",
        ]
    )
    .agg({"stop_sequence": "count"})
    .reset_index()
    .rename(columns={"stop_sequence": "total_stops"})
)

In [None]:
cardinal_dir2.route_id.nunique()

In [None]:
cardinal_dir1_routes = set(cardinal_dir1.route_id.unique().tolist())
cardinal_dir2_routes = set(cardinal_dir2.route_id.unique().tolist())

In [None]:
len(cardinal_dir1_routes - cardinal_dir2_routes)

In [None]:
(cardinal_dir1_routes - cardinal_dir2_routes)

In [None]:
cardinal_dir1.loc[(cardinal_dir1.route_id == "0177a66b-9f33-407d-a72e-776429fb73d4")][
    ["stop_primary_direction", "direction_id"]
].drop_duplicates()

In [None]:
cardinal_dir1.loc[(cardinal_dir1.route_id == "9f38a05f-6eea-47f4-bf42-992a789e7b49")][
    ["stop_primary_direction", "direction_id"]
].drop_duplicates()

In [None]:
cardinal_dir1.loc[(cardinal_dir1.route_id == "3ff1b747-a791-4eb3-90b2-25cb355b6c67")][
    ["route_id", "stop_primary_direction", "direction_id"]
].drop_duplicates()

In [None]:
cardinal_dir2.loc[
    (cardinal_dir2.route_id == "001")
    & (cardinal_dir2.schedule_gtfs_dataset_key == "cb3074eb8b423dfc5acfeeb0de95eb82")
]

In [None]:
cardinal_dir3 = cardinal_dir2.sort_values(
    by=["route_id", "schedule_gtfs_dataset_key", "direction_id", "total_stops"],
    ascending=[True, True, True, False],
)

In [None]:
cardinal_dir3.loc[
    (cardinal_dir3.route_id == "001")
    & (cardinal_dir3.schedule_gtfs_dataset_key == "cb3074eb8b423dfc5acfeeb0de95eb82")
]

In [None]:
# Drop duplicates so only the top stop_primary_direction is kept.
cardinal_dir4 = cardinal_dir3.drop_duplicates(
    subset=[
        "route_id",
        "schedule_gtfs_dataset_key",
        "direction_id",
    ]
).reset_index(drop=True)

In [None]:
cardinal_dir4.loc[
    (cardinal_dir4.route_id == "001")
    & (cardinal_dir4.schedule_gtfs_dataset_key == "cb3074eb8b423dfc5acfeeb0de95eb82")
]

In [None]:
cardinal_dir4 = cardinal_dir4.drop(columns=["total_stops"])

In [None]:
cardinal_dir4.route_id.nunique()

### Merge (some route IDs are missing because they don't have 0/1 populated in `direction_id`)

In [None]:
df2.head(2)

In [None]:
cardinal_dir4.head(2)

In [None]:
cardinal_dir4.route_id.nunique(), df2.route_id.nunique()

In [None]:
len(df2), len(cardinal_dir4)

In [None]:
pd.merge(
    df2,
    cardinal_dir4,
    on=["schedule_gtfs_dataset_key", "route_id", "direction_id"],
    how="outer",
    indicator=True,
)[["_merge"]].value_counts()

In [None]:
pd.merge(
    df2,
    cardinal_dir4,
    on=["schedule_gtfs_dataset_key", "route_id", "direction_id"],
    how="inner",
    indicator=True,
)[["_merge"]].value_counts()

In [None]:
m1 = pd.merge(
    df2,
    cardinal_dir4,
    on=["schedule_gtfs_dataset_key", "route_id", "direction_id"],
    how="inner",
)

In [None]:
pd.merge(
    df2,
    cardinal_dir4,
    on=["schedule_gtfs_dataset_key", "route_id"],
    how="outer",
    indicator=True,
)[["_merge"]].value_counts()

#### How does harmonizing the route names fit into here? 
* [This script](https://github.com/cal-itp/data-analyses/blob/b1e5d4f870400251240eeba4a6515a0848e5d6f8/gtfs_funnel/clean_route_naming.py#L4)

In [None]:
m1.head(3)

In [None]:
m1.info()

### Final

In [None]:
def find_most_common_dir(
    stop_times_gdf: gpd.GeoDataFrame,
    trips_to_route_df: pd.DataFrame,
) -> pd.DataFrame:
    """
    Pick the most frequent stop_primary_direction for each route-direction.

    Parameters
    ----------
    stop_times_gdf:
        Scheduled stop times with one row per stop, including
        `stop_primary_direction`. Must contain every column selected below.
    trips_to_route_df:
        Scheduled trips carrying `route_id` and `direction_id`, joined on
        trip_instance_key, schedule_gtfs_dataset_key and shape_array_key.

    Returns
    -------
    One row per (route_id, schedule_gtfs_dataset_key, direction_id) with the
    `stop_primary_direction` that has the largest stop count.
    A missing `direction_id` is treated as 0.
    """
    wanted_cols = [
        "feed_key",
        "stop_id",
        "stop_sequence",
        "schedule_gtfs_dataset_key",
        "trip_instance_key",
        "shape_array_key",
        "stop_name",
        "prior_stop_sequence",
        "subseq_stop_sequence",
        "stop_pair",
        "stop_pair_name",
        "stop_primary_direction",
        "stop_meters",
    ]
    route_dir_keys = ["route_id", "schedule_gtfs_dataset_key", "direction_id"]

    # Bring route_id / direction_id onto every stop row.
    stops_with_routes = pd.merge(
        stop_times_gdf[wanted_cols],
        trips_to_route_df,
        how="inner",
        on=["trip_instance_key", "schedule_gtfs_dataset_key", "shape_array_key"],
    )
    # Fill missing direction_id with 0 before grouping on it.
    stops_with_routes["direction_id"] = stops_with_routes["direction_id"].fillna(0)

    # Count stops per route-direction and cardinal direction.
    stop_counts = (
        stops_with_routes
        .groupby(route_dir_keys + ["stop_primary_direction"])["stop_sequence"]
        .count()
        .reset_index(name="total_stops")
    )

    # Put the cardinal direction with the most stops first within each
    # route-direction, then keep only that first row.
    most_common = (
        stop_counts
        .sort_values(
            by=route_dir_keys + ["total_stops"],
            ascending=[True, True, True, False],
        )
        .drop_duplicates(subset=route_dir_keys)
        .reset_index(drop=True)
    )

    return most_common.drop(columns=["total_stops"])

In [None]:
def assemble_scheduled_trip_metrics(
    analysis_date: str, 
    dict_inputs: dict
) -> pd.DataFrame:
    """
    Build trip-level GTFS schedule metrics for one date.

    Each trip gets its median stop spacing (`median_stop_meters`), its
    time-of-day bucket and scheduled service minutes, its route_id and
    direction_id, and the most common cardinal direction
    (`stop_primary_direction`) of its route-direction.

    `dict_inputs` is read by attribute
    (`dict_inputs.rt_vs_schedule_tables.stop_times_direction`), so it must
    support attribute access, as GTFS_DATA_DICT does.

    A missing `direction_id` is filled with 0. Both the trip-to-route merge
    and the cardinal-direction merge are inner joins, so trips without a
    match in either table are dropped.
    """
    stop_times_file = dict_inputs.rt_vs_schedule_tables.stop_times_direction

    # Load files
    stop_times = gpd.read_parquet(
        f"{RT_SCHED_GCS}{stop_times_file}_{analysis_date}.parquet"
    )

    trips_to_route = helpers.import_scheduled_trips(
        analysis_date,
        columns = [
            "route_id",
            "trip_instance_key",
            "gtfs_dataset_key",
            "shape_array_key",
            "direction_id",
            "route_long_name",
            "route_short_name",
            "route_desc",
            "name",
        ],
        get_pandas = True
    )

    trip_time_buckets = gtfs_schedule_wrangling.get_trip_time_buckets(analysis_date)
    time_of_day = trip_time_buckets[
        ["trip_instance_key", "time_of_day", "scheduled_service_minutes"]
    ]

    # Median stop spacing for each trip
    median_spacing = (
        stop_times
        .groupby(
            ["schedule_gtfs_dataset_key", "trip_instance_key"],
            observed=True, group_keys=False
        )
        .agg({"stop_meters": "median"})
        .reset_index()
        .rename(columns = {"stop_meters": "median_stop_meters"})
    )

    # Attach time-of-day, then route_id and direction_id
    # (only the subset of trip columns needed here).
    trip_metrics = pd.merge(
        median_spacing,
        time_of_day,
        on = "trip_instance_key",
        how = "left"
    )
    trip_metrics = trip_metrics.merge(
        trips_to_route[["trip_instance_key", "route_id", "direction_id"]],
        on = "trip_instance_key",
        how = "inner"
    )
    trip_metrics["direction_id"] = trip_metrics["direction_id"].fillna(0)

    # Most common cardinal direction per route-direction
    route_cardinal_direction = find_most_common_dir(stop_times, trips_to_route)

    # Merge everything together
    return pd.merge(
        trip_metrics,
        route_cardinal_direction,
        on = ["schedule_gtfs_dataset_key", "route_id", "direction_id"],
        how = "inner"
    )

In [None]:
test = assemble_scheduled_trip_metrics(analysis_date, GTFS_DATA_DICT)

In [None]:
test.head()

In [None]:
test.shape

In [None]:
route_cols = [
            "schedule_gtfs_dataset_key", 
            "route_id", 
            "direction_id"
        ]

In [None]:
import sys
sys.path.append("../gtfs_funnel")
import schedule_stats_by_route_direction

In [None]:
route_dir_metrics = schedule_stats_by_route_direction.schedule_metrics_by_route_direction(
            test, analysis_date, route_cols)

In [None]:
ROUTE_TYPOLOGIES = GTFS_DATA_DICT.schedule_tables.route_typologies

In [None]:
route_typologies = pd.read_parquet(
            f"{SCHED_GCS}{ROUTE_TYPOLOGIES}_{analysis_date}.parquet",
            columns = route_cols + [
                "is_coverage", "is_downtown_local", 
                "is_local", "is_rapid", "is_express", "is_rail"]
        )

In [None]:
route_dir_metrics2 = pd.merge(
            route_dir_metrics,
            route_typologies,
            on = route_cols,
            how = "left"
        )

In [None]:
route_dir_metrics2.head().drop(columns = ['geometry'
])