## Cardinal Direction: Switches
* In my notebook `23_cardinal_dir_checks`, I discovered that some routes genuinely change over time, and their cardinal direction changes along with them.
* However, I also discovered that some routes simply have different direction combinations on each date.
* Because of this unpredictability, I have opted to fill in the cardinal direction across all dates with the most common one for each route-direction_id.
* In the portfolio, we plot only the most recent geography available, so this shouldn't cause confusion.

In [1]:
import _section1_utils
import _section2_utils as section2
import geopandas as gpd
import numpy as np
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)
from shared_utils import catalog_utils, rt_dates, rt_utils

In [2]:
# Notebook display settings: show up to 100 columns, round floats to two
# decimals, and never truncate rows or cell contents.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [3]:
import sys

sys.path.append("../gtfs_funnel")
import schedule_stats_by_route_direction

In [6]:
# Concatenate the yearly and weekly date collections from `rt_dates`.
# NOTE(review): the weekly collections (oct2023_week, apr2023_week,
# apr2024_week) may repeat dates already present in the yearly ones --
# confirm whether duplicates need to be dropped before these are used.
all_dates = (
    rt_dates.y2024_dates
    + rt_dates.y2023_dates
    + rt_dates.oct2023_week
    + rt_dates.apr2023_week
    + rt_dates.apr2024_week
)

In [8]:
# Build the cardinal-direction table for every date in `all_dates`.
# This loop was previously disabled inside a string literal, which left
# `cardinal_df` undefined for the following cell on a fresh kernel.
# The per-date frames are collected in a list and concatenated once, rather
# than re-concatenating a growing DataFrame on every iteration.
cardinal_dfs = [
    schedule_stats_by_route_direction.cardinal_direction_for_route_direction(
        date, GTFS_DATA_DICT
    )
    for date in all_dates
]
cardinal_df = pd.concat(cardinal_dfs)

In [9]:
cardinal_df.head(1)

Unnamed: 0,route_id,schedule_gtfs_dataset_key,direction_id,route_primary_direction
0,1,cb3074eb8b423dfc5acfeeb0de95eb82,0.0,Westbound


In [11]:
ROUTE_DIR_EXPORT = GTFS_DATA_DICT.rt_vs_schedule_tables.sched_route_direction_metrics

In [13]:
RT_SCHED_GCS

'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/'

In [12]:
ROUTE_DIR_EXPORT

'schedule_route_dir/schedule_route_direction_metrics'

### This has to be fixed at the end because `schedule_stats_by_route_direction` produces a dataframe for each date rather than stacking all the dates on top of each other for a time series.

In [14]:
# Path to the `route_schedule_vp` digest table, assembled from the
# `digest_tables` entries of the GTFS_DATA_DICT catalog.
schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"

In [16]:
# Load only the rows whose `sched_rt_category` is "schedule_and_vp".
schedule_and_vp_filter = [[("sched_rt_category", "==", "schedule_and_vp")]]
all_districts_df = pd.read_parquet(schd_vp_url, filters=schedule_and_vp_filter)

In [31]:
# Number of distinct service dates present in the loaded table.
total_service_dates = int(all_districts_df.service_date.nunique())
# Backward-compatible alias for the original, misspelled variable name.
total_serivce_dates = total_service_dates

In [28]:
# For every route-direction_id, count the distinct service dates on which
# each cardinal direction appears, then order the rows so the direction with
# the highest count comes first within its route-direction_id.
route_dir_keys = ["organization_name", "route_combined_name", "direction_id"]

direction_date_counts = all_districts_df.groupby(
    route_dir_keys + ["route_primary_direction"]
)["service_date"].nunique()

agg1 = direction_date_counts.reset_index().sort_values(
    by=route_dir_keys + ["service_date"],
    ascending=[True, True, True, False],
)

### Routes should have a value of 16 in their `service_date` column

In [29]:
agg1.head()

Unnamed: 0,organization_name,route_combined_name,direction_id,route_primary_direction,service_date
0,Alameda-Contra Costa Transit District,10 E. 14th St. - Mission,0.0,Eastbound,16
1,Alameda-Contra Costa Transit District,10 E. 14th St. - Mission,1.0,Westbound,16
2,Alameda-Contra Costa Transit District,12 MLK Jr. - Temescal - Grand,0.0,Southbound,16
3,Alameda-Contra Costa Transit District,12 MLK Jr. - Temescal - Grand,1.0,Northbound,16
4,Alameda-Contra Costa Transit District,14 14th St - San Antonio - High St,0.0,Westbound,16


In [38]:
# agg1 lists the highest service_date count first within each
# route-direction_id, so keeping the first row per route-direction_id keeps
# its most common cardinal direction.
agg2 = agg1.drop_duplicates(
    subset=["organization_name", "route_combined_name", "direction_id"],
    keep="first",
    ignore_index=True,
)

### Check that this kept only the most popular `route-cardinal direction` pair for each `route-direction_id`
* If Route A switches cardinal direction for direction_id 0, then keep the most popular value and apply that across the entire time period.

In [39]:
# Spot check: Monterey-Salinas Transit, route 84.
agg2[
    agg2["organization_name"].eq("Monterey-Salinas Transit")
    & agg2["route_combined_name"].eq("84 King City - Paso Robles")
]

Unnamed: 0,organization_name,route_combined_name,direction_id,route_primary_direction,service_date
2268,Monterey-Salinas Transit,84 King City - Paso Robles,0.0,Northbound,15
2269,Monterey-Salinas Transit,84 King City - Paso Robles,1.0,Southbound,16


In [40]:
# Spot check: City and County of San Francisco, 14R Mission Rapid.
agg2[
    agg2["organization_name"].eq("City and County of San Francisco")
    & agg2["route_combined_name"].eq("14R MISSION RAPID")
]

Unnamed: 0,organization_name,route_combined_name,direction_id,route_primary_direction,service_date
366,City and County of San Francisco,14R MISSION RAPID,0.0,Westbound,8
367,City and County of San Francisco,14R MISSION RAPID,1.0,Northbound,14


In [42]:
# Spot check: City of Fairfield, route 7.
agg2[
    agg2["organization_name"].eq("City of Fairfield")
    & agg2["route_combined_name"].eq("7 FTC/Solano College/Cordelia Library")
]

Unnamed: 0,organization_name,route_combined_name,direction_id,route_primary_direction,service_date
676,City of Fairfield,7 FTC/Solano College/Cordelia Library,0.0,Westbound,14
677,City of Fairfield,7 FTC/Solano College/Cordelia Library,1.0,Northbound,14


In [43]:
# Spot check: Ventura County Transportation Commission, Route 1.
agg2[
    agg2["organization_name"].eq("Ventura County Transportation Commission")
    & agg2["route_combined_name"].eq("Route 1")
]

Unnamed: 0,organization_name,route_combined_name,direction_id,route_primary_direction,service_date
4056,Ventura County Transportation Commission,Route 1,0.0,Southbound,8
4057,Ventura County Transportation Commission,Route 1,1.0,Northbound,8


## Turn it into a function

In [44]:
all_districts_df.head(1)

Unnamed: 0,schedule_gtfs_dataset_key,direction_id,time_period,route_primary_direction,avg_scheduled_service_minutes,avg_stop_miles,n_scheduled_trips,frequency,is_express,is_rapid,is_rail,is_coverage,is_downtown_local,is_local,service_date,typology,minutes_atleast1_vp,minutes_atleast2_vp,total_rt_service_minutes,total_scheduled_service_minutes,total_vp,vp_in_shape,is_early,is_ontime,is_late,n_vp_trips,vp_per_minute,pct_in_shape,pct_rt_journey_atleast1_vp,pct_rt_journey_atleast2_vp,pct_sched_journey_atleast1_vp,pct_sched_journey_atleast2_vp,rt_sched_journey_ratio,avg_rt_service_minutes,sched_rt_category,speed_mph,name,route_long_name,route_short_name,route_combined_name,route_id,schedule_source_record_id,base64_url,organization_source_record_id,organization_name,caltrans_district
108,015d67d5b75b5cf2b710bbadadfb75f5,0.0,all_day,Northbound,57.14,0.28,22,0.92,0.0,1.0,0.0,0.0,1.0,0.0,2023-04-12,downtown_local,1614,1579,2517.85,1201.0,4596,3438,2,8,11,21,1.83,0.75,0.64,0.63,1.0,1.0,2.1,119.9,schedule_and_vp,18.03,Bay Area 511 Marin Schedule,Downtown San Rafael - Sausalito,17,17 Downtown San Rafael - Sausalito,17,reckCEnFkdLVgfxck,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1NQQ==,recNOb7pqBRlQVG5e,Marin County Transit District,04 - Oakland


In [45]:
def top_cardinal_direction(df: pd.DataFrame) -> pd.DataFrame:
    """
    Assign each route-direction_id its most common cardinal direction.

    Some routes don't change their geographies over time, but their
    cardinal direction changes. This function finds the direction that
    appears on the largest number of distinct service dates for each
    route-direction_id and applies that single value to every row of that
    route-direction_id.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns `organization_name`,
        `route_combined_name`, `direction_id`, `route_primary_direction`
        and `service_date`. It is not modified.

    Returns
    -------
    pd.DataFrame
        A new dataframe with one row per input row, in the input row order,
        with a fresh 0..n-1 index. `route_primary_direction` holds the most
        common direction for the row's route-direction_id and is placed as
        the last column. Route-direction_ids whose direction is always null
        keep a null direction.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    pandas.errors.MergeError
        If the lookup of top directions is not unique per route-direction_id
        (guards against the merge duplicating rows).
    """
    route_dir_cols = ["organization_name", "route_combined_name", "direction_id"]

    # Count the # of distinct service dates on which each
    # route_primary_direction appears. Null directions are not candidates,
    # but null values in the key columns are kept as their own group
    # (dropna=False); otherwise those rows would find no match in the merge
    # below and silently lose their direction.
    date_counts = (
        df.dropna(subset=["route_primary_direction"])
        .groupby(route_dir_cols + ["route_primary_direction"], dropna=False)
        .agg({"service_date": "nunique"})
        .reset_index()
        .sort_values(
            by=route_dir_cols + ["service_date"],
            ascending=[True, True, True, False],
        )
    )

    # Keep only the most common route_primary_direction for each
    # route-direction_id: the first row per group after the sort above.
    # NOTE(review): ties keep whichever direction sorts first -- confirm
    # this is acceptable. Drop service_date since it's no longer needed.
    top_direction = date_counts.drop_duplicates(subset=route_dir_cols).drop(
        columns=["service_date"]
    )

    # Left merge back onto the original rows. many_to_one validation
    # guarantees the row count of the input is preserved.
    return pd.merge(
        df.drop(columns=["route_primary_direction"]),
        top_direction,
        on=route_dir_cols,
        how="left",
        validate="many_to_one",
    )

In [46]:
test_merge = top_cardinal_direction(all_districts_df)

In [48]:
len(test_merge)

119916

In [49]:
len(all_districts_df)

119916

In [51]:
all_districts_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 119916 entries, 108 to 157547
Data columns (total 46 columns):
 #   Column                           Non-Null Count   Dtype         
---  ------                           --------------   -----         
 0   schedule_gtfs_dataset_key        119916 non-null  object        
 1   direction_id                     119916 non-null  float64       
 2   time_period                      119916 non-null  object        
 3   route_primary_direction          119916 non-null  object        
 4   avg_scheduled_service_minutes    119916 non-null  float64       
 5   avg_stop_miles                   119916 non-null  float64       
 6   n_scheduled_trips                119916 non-null  int64         
 7   frequency                        119916 non-null  float64       
 8   is_express                       115721 non-null  float64       
 9   is_rapid                         115721 non-null  float64       
 10  is_rail                          115721 no

In [50]:
test_merge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 119916 entries, 0 to 119915
Data columns (total 46 columns):
 #   Column                           Non-Null Count   Dtype         
---  ------                           --------------   -----         
 0   schedule_gtfs_dataset_key        119916 non-null  object        
 1   direction_id                     119916 non-null  float64       
 2   time_period                      119916 non-null  object        
 3   avg_scheduled_service_minutes    119916 non-null  float64       
 4   avg_stop_miles                   119916 non-null  float64       
 5   n_scheduled_trips                119916 non-null  int64         
 6   frequency                        119916 non-null  float64       
 7   is_express                       115721 non-null  float64       
 8   is_rapid                         115721 non-null  float64       
 9   is_rail                          115721 non-null  float64       
 10  is_coverage                      115721 non-

In [47]:
test_merge.head()

Unnamed: 0,schedule_gtfs_dataset_key,direction_id,time_period,avg_scheduled_service_minutes,avg_stop_miles,n_scheduled_trips,frequency,is_express,is_rapid,is_rail,is_coverage,is_downtown_local,is_local,service_date,typology,minutes_atleast1_vp,minutes_atleast2_vp,total_rt_service_minutes,total_scheduled_service_minutes,total_vp,vp_in_shape,is_early,is_ontime,is_late,n_vp_trips,vp_per_minute,pct_in_shape,pct_rt_journey_atleast1_vp,pct_rt_journey_atleast2_vp,pct_sched_journey_atleast1_vp,pct_sched_journey_atleast2_vp,rt_sched_journey_ratio,avg_rt_service_minutes,sched_rt_category,speed_mph,name,route_long_name,route_short_name,route_combined_name,route_id,schedule_source_record_id,base64_url,organization_source_record_id,organization_name,caltrans_district,route_primary_direction
0,015d67d5b75b5cf2b710bbadadfb75f5,0.0,all_day,57.14,0.28,22,0.92,0.0,1.0,0.0,0.0,1.0,0.0,2023-04-12,downtown_local,1614,1579,2517.85,1201.0,4596,3438,2,8,11,21,1.83,0.75,0.64,0.63,1.0,1.0,2.1,119.9,schedule_and_vp,18.03,Bay Area 511 Marin Schedule,Downtown San Rafael - Sausalito,17,17 Downtown San Rafael - Sausalito,17,reckCEnFkdLVgfxck,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1NQQ==,recNOb7pqBRlQVG5e,Marin County Transit District,04 - Oakland,Northbound
1,015d67d5b75b5cf2b710bbadadfb75f5,0.0,all_day,57.14,0.28,22,0.92,0.0,1.0,0.0,0.0,1.0,0.0,2023-05-17,downtown_local,1672,1652,2326.07,1257.0,4953,4287,0,4,18,22,2.13,0.87,0.72,0.71,1.0,1.0,1.85,105.73,schedule_and_vp,16.59,Bay Area 511 Marin Schedule,Downtown San Rafael - Sausalito,17,17 Downtown San Rafael - Sausalito,17,reckCEnFkdLVgfxck,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1NQQ==,recNOb7pqBRlQVG5e,Marin County Transit District,04 - Oakland,Northbound
2,015d67d5b75b5cf2b710bbadadfb75f5,0.0,all_day,57.14,0.28,22,0.92,0.0,1.0,0.0,0.0,1.0,0.0,2023-06-14,downtown_local,1630,1618,1611.92,1257.0,4844,4387,2,4,16,22,3.01,0.91,1.0,1.0,1.0,1.0,1.28,73.27,schedule_and_vp,16.39,Bay Area 511 Marin Schedule,Downtown San Rafael - Sausalito,17,17 Downtown San Rafael - Sausalito,17,reckCEnFkdLVgfxck,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1NQQ==,recNOb7pqBRlQVG5e,Marin County Transit District,04 - Oakland,Northbound
3,015d67d5b75b5cf2b710bbadadfb75f5,0.0,all_day,57.14,0.28,22,0.92,0.0,1.0,0.0,0.0,1.0,0.0,2023-07-12,downtown_local,1093,1080,1096.02,816.0,3242,2882,0,3,11,14,2.96,0.89,1.0,0.98,1.0,1.0,1.34,78.29,schedule_and_vp,16.98,Bay Area 511 Marin Schedule,Downtown San Rafael - Sausalito,17,17 Downtown San Rafael - Sausalito,17,reckCEnFkdLVgfxck,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1NQQ==,recNOb7pqBRlQVG5e,Marin County Transit District,04 - Oakland,Northbound
4,015d67d5b75b5cf2b710bbadadfb75f5,0.0,all_day,57.14,0.27,22,0.92,0.0,1.0,0.0,0.0,1.0,0.0,2023-08-15,downtown_local,1513,1498,1627.77,1257.0,4486,3764,1,6,15,22,2.76,0.84,0.93,0.92,1.0,1.0,1.29,73.99,schedule_and_vp,17.04,Bay Area 511 Marin Schedule,Downtown San Rafael - Sausalito,17,17 Downtown San Rafael - Sausalito,17,reckCEnFkdLVgfxck,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1NQQ==,recNOb7pqBRlQVG5e,Marin County Transit District,04 - Oakland,Northbound


#### Look at the routes again

In [57]:
# Spot check after the fix: City of Fairfield, route 7.
test_merge.loc[
    test_merge["organization_name"].eq("City of Fairfield")
    & test_merge["route_combined_name"].eq("7 FTC/Solano College/Cordelia Library"),
    ["time_period", "direction_id", "route_primary_direction"],
].drop_duplicates()

Unnamed: 0,time_period,direction_id,route_primary_direction
5213,all_day,0.0,Westbound
5229,offpeak,0.0,Westbound
5245,peak,0.0,Westbound
5261,all_day,1.0,Northbound
5277,offpeak,1.0,Northbound
5293,peak,1.0,Northbound


In [58]:
# Spot check after the fix: Ventura County Transportation Commission, Route 1.
test_merge.loc[
    test_merge["organization_name"].eq("Ventura County Transportation Commission")
    & test_merge["route_combined_name"].eq("Route 1"),
    ["time_period", "direction_id", "route_primary_direction"],
].drop_duplicates()

Unnamed: 0,time_period,direction_id,route_primary_direction
6189,all_day,0.0,Southbound
6203,offpeak,0.0,Southbound
6217,peak,0.0,Southbound
6231,all_day,1.0,Northbound
6245,offpeak,1.0,Northbound
6259,peak,1.0,Northbound


In [59]:
# Spot check after the fix: Monterey-Salinas Transit, route 84.
test_merge.loc[
    test_merge["organization_name"].eq("Monterey-Salinas Transit")
    & test_merge["route_combined_name"].eq("84 King City - Paso Robles"),
    ["time_period", "direction_id", "route_primary_direction"],
].drop_duplicates()

Unnamed: 0,time_period,direction_id,route_primary_direction
58111,all_day,0.0,Northbound
58112,offpeak,0.0,Northbound
58113,peak,0.0,Northbound
58114,all_day,1.0,Southbound
58115,peak,1.0,Southbound
