## Adding Cardinal Direction into Pipeline
* Editing [this file](https://github.com/cal-itp/data-analyses/blob/ah_gtfs_portfolio/gtfs_funnel/schedule_stats_by_route_direction.py#L23)

In [1]:
import _section2_utils as section2
import geopandas as gpd
import numpy as np
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)
from shared_utils import catalog_utils, rt_dates, rt_utils

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
RT_SCHED_GCS

'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/'

In [4]:
GTFS_DATA_DICT.rt_vs_schedule_tables.stop_times_direction

'stop_times_direction'

In [5]:
analysis_date = "2024-04-16"

In [6]:
GTFS_DATA_DICT.rt_vs_schedule_tables.sched_trip_metrics

'schedule_trip/schedule_trip_metrics'

## Seeing my changes in `gtfs_funnel/schedule_stats_by_route_direction.py` 6/7

#### `schedule_route_dir`

In [7]:
sched_route = GTFS_DATA_DICT.rt_vs_schedule_tables.sched_route_direction_metrics

In [8]:
# gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_route_dir/schedule_route_direction_metrics_2024-04-15_AH_TESTING.parquet

In [9]:
route_url = f"{RT_SCHED_GCS}{sched_route}_AH_TESTING_"

In [10]:
# https://storage.cloud.google.com/calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_route_dir/schedule_route_direction_metrics_AH_TESTING_2024-04-15.parquet

In [11]:
apr_15_routes = pd.read_parquet(f"{route_url}2024-04-15.parquet")

In [12]:
apr_16_routes = pd.read_parquet(f"{route_url}2024-04-16.parquet")

#### Why is the 17th missing?

In [13]:
apr_18_routes = pd.read_parquet(f"{route_url}2024-04-18.parquet")

In [14]:
apr_18_routes.shape

(10140, 18)

In [15]:
apr_16_routes.head(1).drop(columns=["geometry"])

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,common_shape_id,route_name,route_primary_direction,avg_scheduled_service_minutes,avg_stop_miles,n_trips,time_period,frequency,is_coverage,is_downtown_local,is_local,is_rapid,is_express,is_rail
0,3c62ad6ee589d56eca915ce291a5df0a,2b5c285d-29b2-4d19-8f12-f85348cd832e,0.0,ea5549cf-85a1-4f9e-a4e4-39e53493d2d1,WEST SACRAMENTO SHUTTLE - 240,Westbound,50.0,0.21,12,all_day,0.5,1.0,0.0,0.0,1.0,0.0,0.0


#### `schedule_trip`

In [16]:
# gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_trip/schedule_trip_metrics_2024-04-15_AH_TESTING.parquet

In [17]:
trip_gcs = GTFS_DATA_DICT.rt_vs_schedule_tables.sched_trip_metrics

In [18]:
trip_url = f"{RT_SCHED_GCS}{trip_gcs}_"

In [19]:
trip_url

'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_trip/schedule_trip_metrics_'

In [20]:
apr_15_trips = pd.read_parquet(f"{trip_url}2024-04-15_AH_TESTING.parquet")

In [21]:
apr_15_trips.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,trip_instance_key,median_stop_meters,time_of_day,scheduled_service_minutes,route_id,direction_id,route_primary_direction
0,0139b1253130b33adcd4b3a4490530d2,004c9c82f6ef126000f4067c79f48ef2,740.13,PM Peak,55.0,c6726149-9979-4ebb-85f6-0be90402266c,0.0,Southbound
1,0139b1253130b33adcd4b3a4490530d2,1caf03cf383f956c179788b51e850db0,893.83,PM Peak,57.0,c6726149-9979-4ebb-85f6-0be90402266c,0.0,Southbound


In [22]:
apr_16_trips = pd.read_parquet(f"{trip_url}2024-04-16_AH_TESTING.parquet")

In [23]:
apr_16_trips.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,trip_instance_key,median_stop_meters,time_of_day,scheduled_service_minutes,route_id,direction_id,route_primary_direction
0,0139b1253130b33adcd4b3a4490530d2,000a5e91dd67d8dbbbe91b97159bfc6c,1447.13,Midday,60.0,3ff1b747-a791-4eb3-90b2-25cb355b6c67,0.0,Southbound
1,0139b1253130b33adcd4b3a4490530d2,0e578363758f6012c878553b563ba908,1447.13,Midday,60.0,3ff1b747-a791-4eb3-90b2-25cb355b6c67,0.0,Southbound


## `gtfs_digest/merge_data.py`

In [24]:
analysis_date_list = ["2024-04-15", "2024-04-16", "2024-04-18"]

### THIS NEEDS TO CHANGE IN THE PYTHON FILE.

In [25]:
# THIS NEEDS TO CHANGE IN THE PYTHON FILE.
route_time_cols = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
    "time_period",
    "route_primary_direction",
]

In [26]:
sort_cols = route_time_cols + ["service_date"]

In [27]:
def concatenate_schedule_by_route_direction(date_list: list) -> pd.DataFrame:
    """
    Concatenate schedule data that's been
    aggregated to route-direction-time_period.
    """
    FILE = GTFS_DATA_DICT.rt_vs_schedule_tables.sched_route_direction_metrics
    FILE = f"{FILE}_AH_TESTING"
    df = (
        time_series_utils.concatenate_datasets_across_dates(
            RT_SCHED_GCS,
            FILE,
            date_list,
            data_type="df",
            columns=route_time_cols
            + [
                "avg_scheduled_service_minutes",
                "avg_stop_miles",
                "n_trips",
                "frequency",
                "is_express",
                "is_rapid",
                "is_rail",
                "is_coverage",
                "is_downtown_local",
                "is_local",
            ],
        )
        .sort_values(sort_cols)
        .rename(
            columns={
                # rename so we understand data source
                "n_trips": "n_scheduled_trips",
            }
        )
        .reset_index(drop=True)
    )

    return df

In [28]:
concat_test = concatenate_schedule_by_route_direction(analysis_date_list)

In [29]:
concat_test.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,route_primary_direction,avg_scheduled_service_minutes,avg_stop_miles,n_scheduled_trips,frequency,is_express,is_rapid,is_rail,is_coverage,is_downtown_local,is_local,service_date
0,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,all_day,Northbound,51.77,0.27,22,0.92,0.0,0.0,0.0,0.0,1.0,0.0,2024-04-15
1,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,all_day,Northbound,51.77,0.27,22,0.92,0.0,0.0,0.0,0.0,1.0,0.0,2024-04-16


### Following `if __name__ == "__main__"` part of `gtfs_digest/merge_data`

In [30]:
import sys

sys.path.append("../gtfs_digest")
import merge_data

#### `set_primary_typology` should also include `stop_primary_direction`
* There's no change to this function, it's just that I added `stop_primary_dir` to the `route_time_cols` above

In [31]:
def set_primary_typology(df: pd.DataFrame) -> pd.DataFrame:
    """
    Choose a primary typology, and we'll be more generous if
    multiple typologies are found.
    """
    subset_cols = [
        c
        for c in df.columns
        if "is_" in c and c not in ["is_ontime", "is_early", "is_late"]
    ]
    keep_cols = route_time_cols + subset_cols

    df2 = (
        df[keep_cols]
        .sort_values(route_time_cols + subset_cols)
        .drop_duplicates(subset=route_time_cols)
    )

    ranks = {
        "coverage": 1,
        "local": 2,
        "downtown_local": 3,
        "express": 4,
        "rapid": 5,
        "rail": 6,
    }

    # Find the max "score" / typology type, and use that
    for c in ranks.keys():
        df2[f"{c}_score"] = df2[f"is_{c}"] * ranks[c]

    df2["max_score"] = df2[[c for c in df2.columns if "_score" in c]].max(axis=1)
    df2["typology"] = df2.max_score.map({v: k for k, v in ranks.items()})
    df2 = df2.assign(typology=df2.typology.fillna("unknown"))

    df3 = df2[route_time_cols + ["typology"]]

    return df3

In [32]:
primary_typology = set_primary_typology(concat_test)

In [33]:
primary_typology.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,route_primary_direction,typology
0,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,all_day,Northbound,downtown_local
3,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,offpeak,Northbound,downtown_local


In [34]:
df_sched2 = pd.merge(concat_test, primary_typology, on=route_time_cols, how="left")

In [35]:
df_sched2.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,route_primary_direction,avg_scheduled_service_minutes,avg_stop_miles,n_scheduled_trips,frequency,is_express,is_rapid,is_rail,is_coverage,is_downtown_local,is_local,service_date,typology
0,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,all_day,Northbound,51.77,0.27,22,0.92,0.0,0.0,0.0,0.0,1.0,0.0,2024-04-15,downtown_local
1,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,all_day,Northbound,51.77,0.27,22,0.92,0.0,0.0,0.0,0.0,1.0,0.0,2024-04-16,downtown_local


In [36]:
df_sched2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31166 entries, 0 to 31165
Data columns (total 17 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   schedule_gtfs_dataset_key      31166 non-null  object        
 1   route_id                       31166 non-null  object        
 2   direction_id                   31166 non-null  float64       
 3   time_period                    31166 non-null  object        
 4   route_primary_direction        31166 non-null  object        
 5   avg_scheduled_service_minutes  31166 non-null  float64       
 6   avg_stop_miles                 31166 non-null  float64       
 7   n_scheduled_trips              31166 non-null  int64         
 8   frequency                      31166 non-null  float64       
 9   is_express                     29574 non-null  float64       
 10  is_rapid                       29574 non-null  float64       
 11  is_rail        

#### `concatenate_speeds_by_route_direction`

In [37]:
df_avg_speeds = merge_data.concatenate_speeds_by_route_direction(analysis_date_list)

In [38]:
df_avg_speeds.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,speed_mph,service_date
0,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,all_day,17.94,2024-04-15
1,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,all_day,16.21,2024-04-16


#### `concatenate_rt_vs_schedule_by_route_direction`

In [39]:
df_rt_sched = merge_data.concatenate_rt_vs_schedule_by_route_direction(
    analysis_date_list
).astype({"direction_id": "float"})

In [40]:
df_rt_sched.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,minutes_atleast1_vp,minutes_atleast2_vp,total_rt_service_minutes,total_scheduled_service_minutes,total_vp,vp_in_shape,is_early,is_ontime,is_late,n_vp_trips,vp_per_minute,pct_in_shape,pct_rt_journey_atleast1_vp,pct_rt_journey_atleast2_vp,pct_sched_journey_atleast1_vp,pct_sched_journey_atleast2_vp,rt_sched_journey_ratio,avg_rt_service_minutes,name,service_date
0,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,all_day,1600,1575,1743.37,1139.0,4728,3813,0,0,22,22,2.71,0.81,0.92,0.9,1.0,1.0,1.53,79.24,Bay Area 511 Marin Schedule,2024-04-15
1,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,all_day,1760,1741,1809.46,1139.0,5219,3875,0,0,22,22,2.88,0.74,0.97,0.96,1.0,1.0,1.59,82.25,Bay Area 511 Marin Schedule,2024-04-16


#### `concatenate_crosswalk_organization`
* Adds the organization name.

In [41]:
df_crosswalk = merge_data.concatenate_crosswalk_organization(analysis_date_list)

In [42]:
df_crosswalk.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,name,schedule_source_record_id,base64_url,organization_source_record_id,organization_name,caltrans_district,service_date
0,1770249a5a2e770ca90628434d4934b1,VCTC GMV Schedule,recrAG7e0oOiR6FiP,aHR0cHM6Ly9nb3ZjYnVzLmNvbS9ndGZz,rec7EN71rsZxDFxZd,Ventura County Transportation Commission,07 - Los Angeles,2024-04-15
1,bff13f8993ff18e43577db1f5596e014,Merced GMV Schedule,rec0qwiwmJzZWh8w2,aHR0cHM6Ly90aGVidXNsaXZlLmNvbS9ndGZz,recVSX7dwjxAb557T,Transit Joint Powers Authority for Merced County,10 - Stockton,2024-04-15


#### Merges
* How do I incoporate `cardinal direction` to the other two datasets `df_rt_sched` and `df_avg_speeds`?

In [43]:
df_rt_sched.head(1)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,minutes_atleast1_vp,minutes_atleast2_vp,total_rt_service_minutes,total_scheduled_service_minutes,total_vp,vp_in_shape,is_early,is_ontime,is_late,n_vp_trips,vp_per_minute,pct_in_shape,pct_rt_journey_atleast1_vp,pct_rt_journey_atleast2_vp,pct_sched_journey_atleast1_vp,pct_sched_journey_atleast2_vp,rt_sched_journey_ratio,avg_rt_service_minutes,name,service_date
0,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,all_day,1600,1575,1743.37,1139.0,4728,3813,0,0,22,22,2.71,0.81,0.92,0.9,1.0,1.0,1.53,79.24,Bay Area 511 Marin Schedule,2024-04-15


In [44]:
df_sched2.head(1)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,route_primary_direction,avg_scheduled_service_minutes,avg_stop_miles,n_scheduled_trips,frequency,is_express,is_rapid,is_rail,is_coverage,is_downtown_local,is_local,service_date,typology
0,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,all_day,Northbound,51.77,0.27,22,0.92,0.0,0.0,0.0,0.0,1.0,0.0,2024-04-15,downtown_local


In [45]:
route_time_cols2 = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
    "time_period",
]

In [46]:
df = pd.merge(
    df_sched2,
    df_rt_sched,
    on=route_time_cols2 + ["service_date"],
    how="outer",
    indicator="sched_rt_category",
)

In [47]:
df.sched_rt_category.value_counts(dropna=False)

both          24450
left_only      6716
right_only     1565
Name: sched_rt_category, dtype: int64

In [48]:
df = pd.merge(df, df_avg_speeds, on=route_time_cols2 + ["service_date"], how="outer")

In [49]:
# df2.test_indicator.value_counts()

In [50]:
df = df.assign(
    sched_rt_category=df.sched_rt_category.map(
        gtfs_schedule_wrangling.sched_rt_category_dict
    )
)

In [51]:
df.head(1)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,route_primary_direction,avg_scheduled_service_minutes,avg_stop_miles,n_scheduled_trips,frequency,is_express,is_rapid,is_rail,is_coverage,is_downtown_local,is_local,service_date,typology,minutes_atleast1_vp,minutes_atleast2_vp,total_rt_service_minutes,total_scheduled_service_minutes,total_vp,vp_in_shape,is_early,is_ontime,is_late,n_vp_trips,vp_per_minute,pct_in_shape,pct_rt_journey_atleast1_vp,pct_rt_journey_atleast2_vp,pct_sched_journey_atleast1_vp,pct_sched_journey_atleast2_vp,rt_sched_journey_ratio,avg_rt_service_minutes,name,sched_rt_category,speed_mph
0,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,all_day,Northbound,51.77,0.27,22.0,0.92,0.0,0.0,0.0,0.0,1.0,0.0,2024-04-15,downtown_local,1600.0,1575.0,1743.37,1139.0,4728.0,3813.0,0.0,0.0,22.0,22.0,2.71,0.81,0.92,0.9,1.0,1.0,1.53,79.24,Bay Area 511 Marin Schedule,schedule_and_vp,17.94


In [52]:
df.sched_rt_category.value_counts(dropna=False)

schedule_and_vp    24450
schedule_only       6716
vp_only             1565
Name: sched_rt_category, dtype: int64

In [53]:
df.shape

(32731, 38)

#### Breaking out `merge_in_standardized_route_names`
* Doesn't work because service_dates don't match. 
* Guess it's due to testing on two days that dont match.

In [54]:
df = merge_data.merge_in_standardized_route_names(df)

In [55]:
df.service_date.unique()

array(['2024-04-15T00:00:00.000000000', '2024-04-16T00:00:00.000000000',
       '2024-04-18T00:00:00.000000000'], dtype='datetime64[ns]')

In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32731 entries, 0 to 32730
Data columns (total 41 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   schedule_gtfs_dataset_key        32731 non-null  object        
 1   direction_id                     32731 non-null  float64       
 2   time_period                      32731 non-null  object        
 3   route_primary_direction          31166 non-null  object        
 4   avg_scheduled_service_minutes    31166 non-null  float64       
 5   avg_stop_miles                   31166 non-null  float64       
 6   n_scheduled_trips                31166 non-null  float64       
 7   frequency                        31166 non-null  float64       
 8   is_express                       29574 non-null  float64       
 9   is_rapid                         29574 non-null  float64       
 10  is_rail                          29574 non-null  float64  

#### Merge `df_crosswalk`

In [57]:
df_crosswalk.service_date.unique()

array(['2024-04-15T00:00:00.000000000', '2024-04-16T00:00:00.000000000',
       '2024-04-18T00:00:00.000000000'], dtype='datetime64[ns]')

In [58]:
pd.merge(
    df,
    df_crosswalk,
    on=["schedule_gtfs_dataset_key", "name", "service_date"],
    how="outer",
    indicator=True,
)[["_merge"]].value_counts()

_merge    
both          32731
right_only       43
left_only         0
dtype: int64

In [59]:
df = pd.merge(
    df,
    df_crosswalk,
    on=["schedule_gtfs_dataset_key", "name", "service_date"],
    how="left",
)

In [60]:
integrify = [
    "n_scheduled_trips",
    "n_vp_trips",
    "minutes_atleast1_vp",
    "minutes_atleast2_vp",
    "total_vp",
    "vp_in_shape",
    "is_early",
    "is_ontime",
    "is_late",
]

df[integrify] = df[integrify].fillna(0).astype("int")

In [61]:
df.shape

(32731, 46)

In [62]:
df.head(1)

Unnamed: 0,schedule_gtfs_dataset_key,direction_id,time_period,route_primary_direction,avg_scheduled_service_minutes,avg_stop_miles,n_scheduled_trips,frequency,is_express,is_rapid,is_rail,is_coverage,is_downtown_local,is_local,service_date,typology,minutes_atleast1_vp,minutes_atleast2_vp,total_rt_service_minutes,total_scheduled_service_minutes,total_vp,vp_in_shape,is_early,is_ontime,is_late,n_vp_trips,vp_per_minute,pct_in_shape,pct_rt_journey_atleast1_vp,pct_rt_journey_atleast2_vp,pct_sched_journey_atleast1_vp,pct_sched_journey_atleast2_vp,rt_sched_journey_ratio,avg_rt_service_minutes,sched_rt_category,speed_mph,name,route_long_name,route_short_name,route_combined_name,route_id,schedule_source_record_id,base64_url,organization_source_record_id,organization_name,caltrans_district
0,015d67d5b75b5cf2b710bbadadfb75f5,0.0,all_day,Northbound,51.77,0.27,22,0.92,0.0,0.0,0.0,0.0,1.0,0.0,2024-04-15,downtown_local,1600,1575,1743.37,1139.0,4728,3813,0,0,22,22,2.71,0.81,0.92,0.9,1.0,1.0,1.53,79.24,schedule_and_vp,17.94,Bay Area 511 Marin Schedule,Downtown San Rafael - Sausalito,17,17 Downtown San Rafael - Sausalito,17,reckCEnFkdLVgfxck,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1NQQ==,recNOb7pqBRlQVG5e,Marin County Transit District,04 - Oakland


In [63]:
len(df)

32731

In [64]:
non_nan_primary_stop = 9003+7700+7528+6926

In [65]:
len(df) - non_nan_primary_stop

1574

In [66]:
df.sched_rt_category.value_counts(dropna=False)

schedule_and_vp    24450
schedule_only       6716
vp_only             1565
Name: sched_rt_category, dtype: int64

In [67]:
df.sched_rt_category.value_counts().sum()

32731

In [69]:
# include nans in value_counts
df.route_primary_direction.value_counts(dropna = False)

Eastbound     9003
Westbound     7703
Northbound    7534
Southbound    6926
NaN           1565
Name: route_primary_direction, dtype: int64

In [70]:
df.loc[df.route_primary_direction.isna()].sched_rt_category.unique()

['vp_only']
Categories (3, object): ['schedule_only', 'vp_only', 'schedule_and_vp']

In [71]:
df.loc[df.route_primary_direction.isna()][["name", "sched_rt_category"]].drop_duplicates()

Unnamed: 0,name,sched_rt_category
31166,Bay Area 511 Marin Schedule,vp_only
31238,Bay Area 511 Dumbarton Express Schedule,vp_only
31274,Bay Area 511 Emery Go-Round Schedule,vp_only
31283,VCTC GMV Schedule,vp_only
31403,Bay Area 511 Vine Transit Schedule,vp_only
31421,Bay Area 511 SolTrans Schedule,vp_only
31511,SBMTD Schedule,vp_only
31556,LA Metro Bus Schedule,vp_only
31574,Bay Area 511 SamTrans Schedule,vp_only
31628,Bay Area 511 WestCAT Schedule,vp_only


## Adding cardinal direction into the actual functions

### `assemble_scheduled_trip_metrics`
* `df` from this function is the same thing that is loaded from `section2_utils.load_scheduled_stop_times`.
* How come `df` is read as a `gpd`? Could I just read it in as a regular pandas dataframe?

In [None]:
STOP_TIMES_FILE = GTFS_DATA_DICT.rt_vs_schedule_tables.stop_times_direction

In [None]:
stop_times_gdf = gpd.read_parquet(
    f"{RT_SCHED_GCS}{STOP_TIMES_FILE}_{analysis_date}.parquet"
)

In [None]:
type(stop_times_gdf)

In [None]:
scheduled_col = [
    "route_id",
    "trip_instance_key",
    "gtfs_dataset_key",
    "shape_array_key",
    "direction_id",
    "route_long_name",
    "route_short_name",
    "route_desc",
    "name",
]

In [None]:
trips_to_route_cols_subset = ["trip_instance_key", "route_id", "direction_id"]

In [None]:
# Add more columns to this.
trips_to_route_df = helpers.import_scheduled_trips(
    analysis_date, columns=scheduled_col, get_pandas=True
)

In [None]:
time_of_day = gtfs_schedule_wrangling.get_trip_time_buckets(analysis_date)[
    ["trip_instance_key", "time_of_day", "scheduled_service_minutes"]
]

In [None]:
trip_cols = ["schedule_gtfs_dataset_key", "trip_instance_key"]

grouped_df = stop_times_gdf.groupby(trip_cols, observed=True, group_keys=False)

In [None]:
len(grouped_df)

In [None]:
df2 = pd.merge(
    grouped_df.agg({"stop_meters": "median"})
    .reset_index()
    .rename(columns={"stop_meters": "median_stop_meters"}),
    time_of_day,
    on="trip_instance_key",
    how="left",
).merge(
    trips_to_route_df[trips_to_route_cols_subset], on="trip_instance_key", how="inner"
)

In [None]:
df2.head()

In [None]:
df2.route_id.nunique()

In [None]:
df2.info()

In [None]:
df2.direction_id = df2.direction_id.fillna(0)

#### Add in Cardinal Direction Stuff somewhere here?

In [None]:
stop_times_col = [
    "feed_key",
    "stop_id",
    "stop_sequence",
    "schedule_gtfs_dataset_key",
    "trip_instance_key",
    "shape_array_key",
    "stop_name",
    "prior_stop_sequence",
    "subseq_stop_sequence",
    "stop_pair",
    "stop_pair_name",
    "stop_primary_direction",
    "stop_meters",
]

In [None]:
stop_times_gdf2 = stop_times_gdf[stop_times_col]

In [None]:
trips_to_route_df.route_id.nunique()

In [None]:
trips_to_route_df.columns

In [None]:
merge_cols = [
    "trip_instance_key",
    "schedule_gtfs_dataset_key",
    "shape_array_key",
]

In [None]:
cardinal_dir1 = pd.merge(stop_times_gdf2, trips_to_route_df, on=merge_cols, how="inner")

In [None]:
cardinal_dir1.direction_id = cardinal_dir1.direction_id.fillna(0)

In [None]:
cardinal_dir1.shape

In [None]:
cardinal_dir1.route_id.nunique()

In [None]:
cardinal_dir1.trip_instance_key.nunique()

In [None]:
cardinal_dir1.info()

### Why do some of the route IDS drop off?
* Some have `direction_id` that are `nan`

In [None]:
cardinal_dir2 = (
    cardinal_dir1.groupby(
        [
            "route_id",
            "schedule_gtfs_dataset_key",
            "direction_id",
            "stop_primary_direction",
        ]
    )
    .agg({"stop_sequence": "count"})
    .reset_index()
    .rename(columns={"stop_sequence": "total_stops"})
)

In [None]:
cardinal_dir2.route_id.nunique()

In [None]:
cardinal_dir1_routes = set(cardinal_dir1.route_id.unique().tolist())
cardinal_dir2_routes = set(cardinal_dir2.route_id.unique().tolist())

In [None]:
len(cardinal_dir1_routes - cardinal_dir2_routes)

In [None]:
(cardinal_dir1_routes - cardinal_dir2_routes)

In [None]:
cardinal_dir1.loc[(cardinal_dir1.route_id == "0177a66b-9f33-407d-a72e-776429fb73d4")][
    ["stop_primary_direction", "direction_id"]
].drop_duplicates()

In [None]:
cardinal_dir1.loc[(cardinal_dir1.route_id == "9f38a05f-6eea-47f4-bf42-992a789e7b49")][
    ["stop_primary_direction", "direction_id"]
].drop_duplicates()

In [None]:
cardinal_dir1.loc[(cardinal_dir1.route_id == "3ff1b747-a791-4eb3-90b2-25cb355b6c67")][
    ["route_id", "stop_primary_direction", "direction_id"]
].drop_duplicates()

In [None]:
cardinal_dir2.loc[
    (cardinal_dir2.route_id == "001")
    & (cardinal_dir2.schedule_gtfs_dataset_key == "cb3074eb8b423dfc5acfeeb0de95eb82")
]

In [None]:
cardinal_dir3 = cardinal_dir2.sort_values(
    by=["route_id", "schedule_gtfs_dataset_key", "direction_id", "total_stops"],
    ascending=[True, True, True, False],
)

In [None]:
cardinal_dir3.loc[
    (cardinal_dir3.route_id == "001")
    & (cardinal_dir3.schedule_gtfs_dataset_key == "cb3074eb8b423dfc5acfeeb0de95eb82")
]

In [None]:
# Drop duplicates so only the top stop_primary_direction is kept.
cardinal_dir4 = cardinal_dir3.drop_duplicates(
    subset=[
        "route_id",
        "schedule_gtfs_dataset_key",
        "direction_id",
    ]
).reset_index(drop=True)

In [None]:
cardinal_dir4.loc[
    (cardinal_dir4.route_id == "001")
    & (cardinal_dir4.schedule_gtfs_dataset_key == "cb3074eb8b423dfc5acfeeb0de95eb82")
]

In [None]:
cardinal_dir4 = cardinal_dir4.drop(columns=["total_stops"])

In [None]:
cardinal_dir4.route_id.nunique()

### Merge (some route IDs are missing because they don't have 0/1 populated in `direction_id`)

In [None]:
df2.head(2)

In [None]:
cardinal_dir4.head(2)

In [None]:
cardinal_dir4.route_id.nunique(), df2.route_id.nunique()

In [None]:
len(df2), len(cardinal_dir4)

In [None]:
pd.merge(
    df2,
    cardinal_dir4,
    on=["schedule_gtfs_dataset_key", "route_id", "direction_id"],
    how="outer",
    indicator=True,
)[["_merge"]].value_counts()

In [None]:
pd.merge(
    df2,
    cardinal_dir4,
    on=["schedule_gtfs_dataset_key", "route_id", "direction_id"],
    how="inner",
    indicator=True,
)[["_merge"]].value_counts()

In [None]:
m1 = pd.merge(
    df2,
    cardinal_dir4,
    on=["schedule_gtfs_dataset_key", "route_id", "direction_id"],
    how="inner",
)

In [None]:
pd.merge(
    df2,
    cardinal_dir4,
    on=["schedule_gtfs_dataset_key", "route_id"],
    how="outer",
    indicator=True,
)[["_merge"]].value_counts()

#### How does harmonizing the route names fit into here? 
* [This script](https://github.com/cal-itp/data-analyses/blob/b1e5d4f870400251240eeba4a6515a0848e5d6f8/gtfs_funnel/clean_route_naming.py#L4)

In [None]:
m1.head(3)

In [None]:
m1.info()

### Final

In [None]:
def find_most_common_dir(
    stop_times_gdf: gpd.GeoDataFrame,
    trips_to_route_df: pd.DataFrame,
) -> pd.DataFrame:
    """
    Load load_scheduled_trips() and load_scheduled_stop_times()
    """
    stop_times_col = [
        "feed_key",
        "stop_id",
        "stop_sequence",
        "schedule_gtfs_dataset_key",
        "trip_instance_key",
        "shape_array_key",
        "stop_name",
        "prior_stop_sequence",
        "subseq_stop_sequence",
        "stop_pair",
        "stop_pair_name",
        "stop_primary_direction",
        "stop_meters",
    ]

    stop_times_gdf2 = stop_times_gdf[stop_times_col]

    # Merge dfs
    merge_cols = ["trip_instance_key", "schedule_gtfs_dataset_key", "shape_array_key"]

    df1 = pd.merge(stop_times_gdf2, trips_to_route_df, on=merge_cols, how="inner")
    df1.direction_id = df1.direction_id.fillna(0)

    agg1 = (
        df1.groupby(
            [
                "route_id",
                "schedule_gtfs_dataset_key",
                "direction_id",
                "stop_primary_direction",
            ]
        )
        .agg({"stop_sequence": "count"})
        .reset_index()
        .rename(columns={"stop_sequence": "total_stops"})
    )

    # Sort and drop duplicates so that the
    # largest # of stops by stop_primary_direction is at the top
    agg2 = agg1.sort_values(
        by=["route_id", "schedule_gtfs_dataset_key", "direction_id", "total_stops"],
        ascending=[True, True, True, False],
    )

    # Drop duplicates so only the top stop_primary_direction is kept.
    agg3 = agg2.drop_duplicates(
        subset=[
            "route_id",
            "schedule_gtfs_dataset_key",
            "direction_id",
        ]
    ).reset_index(drop=True)

    agg3 = agg3.drop(columns=["total_stops"])

    return agg3

In [None]:
def assemble_scheduled_trip_metrics(
    analysis_date: str, dict_inputs: dict
) -> pd.DataFrame:
    """
    Get GTFS schedule trip metrics including time-of-day buckets,
    scheduled service minutes, and median stop spacing.
    """
    STOP_TIMES_FILE = dict_inputs.rt_vs_schedule_tables.stop_times_direction

    # Load files
    df = gpd.read_parquet(f"{RT_SCHED_GCS}{STOP_TIMES_FILE}_{analysis_date}.parquet")

    scheduled_col = [
        "route_id",
        "trip_instance_key",
        "gtfs_dataset_key",
        "shape_array_key",
        "direction_id",
        "route_long_name",
        "route_short_name",
        "route_desc",
        "name",
    ]

    trips_to_route = helpers.import_scheduled_trips(
        analysis_date, columns=scheduled_col, get_pandas=True
    )

    time_of_day = gtfs_schedule_wrangling.get_trip_time_buckets(analysis_date)[
        ["trip_instance_key", "time_of_day", "scheduled_service_minutes"]
    ]

    trip_cols = ["schedule_gtfs_dataset_key", "trip_instance_key"]

    grouped_df = df.groupby(trip_cols, observed=True, group_keys=False)

    trips_to_route_cols_subset = ["trip_instance_key", "route_id", "direction_id"]
    # Get median / mean stop meters for the trip
    # Attach time-of-day and route_id and direction_id
    # Merge using a subset
    median_stop_meters_df = pd.merge(
        grouped_df.agg({"stop_meters": "median"})
        .reset_index()
        .rename(columns={"stop_meters": "median_stop_meters"}),
        time_of_day,
        on="trip_instance_key",
        how="left",
    ).merge(
        trips_to_route[trips_to_route_cols_subset], on="trip_instance_key", how="inner"
    )

    median_stop_meters_df.direction_id = median_stop_meters_df.direction_id.fillna(0)

    # Get cardinal direction
    cardinal_direction_df = find_most_common_dir(df, trips_to_route)

    # Merge everything together
    m1 = pd.merge(
        median_stop_meters_df,
        cardinal_direction_df,
        on=["schedule_gtfs_dataset_key", "route_id", "direction_id"],
        how="inner",
    )

    return m1

In [None]:
test = assemble_scheduled_trip_metrics(analysis_date, GTFS_DATA_DICT)

In [None]:
test.head()

In [None]:
test.shape

In [None]:
route_cols = ["schedule_gtfs_dataset_key", "route_id", "direction_id"]

In [None]:
import sys

sys.path.append("../gtfs_funnel")
import schedule_stats_by_route_direction

In [None]:
route_dir_metrics = (
    schedule_stats_by_route_direction.schedule_metrics_by_route_direction(
        test, analysis_date, route_cols
    )
)

In [None]:
ROUTE_TYPOLOGIES = GTFS_DATA_DICT.schedule_tables.route_typologies

In [None]:
route_typologies = pd.read_parquet(
    f"{SCHED_GCS}{ROUTE_TYPOLOGIES}_{analysis_date}.parquet",
    columns=route_cols
    + [
        "is_coverage",
        "is_downtown_local",
        "is_local",
        "is_rapid",
        "is_express",
        "is_rail",
    ],
)

In [None]:
route_dir_metrics2 = pd.merge(
    route_dir_metrics, route_typologies, on=route_cols, how="left"
)

In [None]:
route_dir_metrics2.head().drop(columns=["geometry"])