## Improving Cardinal Direction work
* Responding to comments on [PR 1145](https://github.com/cal-itp/data-analyses/pull/1145)

* 6/25
    * I experimented with not filling in the NaNs in `direction_id`.
    * When I did fill in the NaNs, the outer merge found 10,489 rows in both dataframes and 745 rows in `right_only`. The result from `schedule_metrics_by_route_direction` is 10,489 rows.
    * When I didn't fill in the NaNs, I got a curious result: the dataframe produced by `schedule_metrics_by_route_direction` is 10,486 rows. These two row counts (10,489 vs. 10,486) don't match? The merge indicator counts for this run were:
      ```
      _merge
      both          10486
      right_only      615
      left_only         0
      ```

In [1]:
import _section2_utils as section2
import geopandas as gpd
import numpy as np
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)
from shared_utils import catalog_utils, rt_dates, rt_utils

In [2]:
# Notebook display settings: show up to 100 columns, format floats with
# 2 decimals, and do not truncate rows or column contents.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [3]:
# Service date used in every parquet file name read in this notebook.
analysis_date: str = "2024-04-16"

### `gtfs_funnel/schedule_stats_by_route_direction`

In [4]:
def cardinal_direction_for_route_direction(analysis_date:str, dict_inputs:dict):
    """
    Get the predominant cardinal direction (e.g. Northbound, Westbound)
    for each route-direction.

    For every combination of route_id, schedule_gtfs_dataset_key and
    direction_id, count the non-null stop_sequence values per
    stop_primary_direction and keep the direction with the highest count.
    Ties are broken alphabetically by stop_primary_direction.

    Args:
        analysis_date: date string used in the stop_times parquet file
            name and passed to helpers.import_scheduled_trips.
        dict_inputs: config object exposing
            rt_vs_schedule_tables.stop_times_direction through attribute
            access (the notebook passes GTFS_DATA_DICT).

    Returns:
        DataFrame with one row per (route_id, schedule_gtfs_dataset_key,
        direction_id) plus the winning stop_primary_direction.

    Side effects:
        Prints the number of null direction_id values found before they
        are filled with 0.
    """
    STOP_TIMES_FILE = dict_inputs.rt_vs_schedule_tables.stop_times_direction

    # Rows whose direction is "Unknown" are excluded at read time.
    stop_times_gdf = pd.read_parquet(
        f"{RT_SCHED_GCS}{STOP_TIMES_FILE}_{analysis_date}.parquet",
        filters=[[("stop_primary_direction", "!=", "Unknown")]],
    )

    trip_scheduled_col = [
        "route_id",
        "trip_instance_key",
        "gtfs_dataset_key",
        "shape_array_key",
        "direction_id",
        "route_long_name",
        "route_short_name",
        "route_desc",
        "name",
    ]

    trips_df = helpers.import_scheduled_trips(
        analysis_date,
        columns=trip_scheduled_col,
        get_pandas=True,
    )

    # NOTE(review): trips are requested with "gtfs_dataset_key" but merged on
    # "schedule_gtfs_dataset_key" -- presumably import_scheduled_trips renames
    # the column; confirm against the helper.
    merge_cols = [
        "trip_instance_key",
        "schedule_gtfs_dataset_key",
        "shape_array_key",
    ]

    stop_times_with_trip = pd.merge(stop_times_gdf, trips_df, on=merge_cols)

    # Fill in missing direction id with 0, per our usual practice.
    # groupby drops rows with NaN keys by default, so unfilled rows would
    # be left out of the aggregation below.
    print(f"# of nulls for direction_id: {stop_times_with_trip['direction_id'].isna().sum()}")
    stop_times_with_trip["direction_id"] = stop_times_with_trip["direction_id"].fillna(0)

    main_cols = [
        "route_id",
        "schedule_gtfs_dataset_key",
        "direction_id",
    ]
    direction_col = "stop_primary_direction"

    # Number of stop_times rows per route-direction and cardinal direction.
    agg1 = (
        stop_times_with_trip.groupby(main_cols + [direction_col])
        .agg({"stop_sequence": "count"})
        .reset_index()
        .rename(columns={"stop_sequence": "total_stops"})
    )

    # Within each route-direction put the direction with the most stops
    # first. The sort order is derived from main_cols so the two cannot
    # drift apart, and direction_col is an explicit tie-breaker so equal
    # counts resolve the same way regardless of input row order.
    agg2 = agg1.sort_values(
        by=main_cols + ["total_stops", direction_col],
        ascending=[True] * len(main_cols) + [False, True],
    )

    # Keep only the top stop_primary_direction per route-direction.
    agg3 = agg2.drop_duplicates(subset=main_cols).reset_index(drop=True)

    return agg3.drop(columns=["total_stops"])

In [5]:
# Build the route-direction -> cardinal direction lookup for the analysis date.
test = cardinal_direction_for_route_direction(
    analysis_date=analysis_date,
    dict_inputs=GTFS_DATA_DICT,
)

# of nulls for direction_id: 90598


In [6]:
# Preview the first five route-direction rows.
test.head(n=5)

Unnamed: 0,route_id,schedule_gtfs_dataset_key,direction_id,stop_primary_direction
0,001,cb3074eb8b423dfc5acfeeb0de95eb82,0.0,Westbound
1,001,cb3074eb8b423dfc5acfeeb0de95eb82,1.0,Eastbound
2,001-132,9809d3f8121513057bc5cb8de7b54ce2,0.0,Eastbound
3,001-132,9809d3f8121513057bc5cb8de7b54ce2,1.0,Northbound
4,002-132,9809d3f8121513057bc5cb8de7b54ce2,0.0,Eastbound


In [7]:
# Check row count, dtypes and null counts of the lookup table.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4475 entries, 0 to 4474
Data columns (total 4 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   route_id                   4475 non-null   object 
 1   schedule_gtfs_dataset_key  4475 non-null   object 
 2   direction_id               4475 non-null   float64
 3   stop_primary_direction     4475 non-null   object 
dtypes: float64(1), object(3)
memory usage: 140.0+ KB


In [8]:
import sys

# Make the sibling gtfs_funnel directory importable. The guard keeps the
# cell idempotent: re-running it does not append the same path again.
if "../gtfs_funnel" not in sys.path:
    sys.path.append("../gtfs_funnel")
import schedule_stats_by_route_direction

In [9]:
# Trip-level scheduled metrics for the analysis date.
# NOTE(review): the sampled output below shows no `stop_primary_direction`
# column, so this rename currently looks like a no-op -- confirm.
trip_metrics = (
    schedule_stats_by_route_direction
    .assemble_scheduled_trip_metrics(analysis_date, GTFS_DATA_DICT)
    .rename(columns={"stop_primary_direction": "route_primary_direction"})
)

In [10]:
# Look at one random trip-level row.
trip_metrics.sample(n=1)

Unnamed: 0,schedule_gtfs_dataset_key,trip_instance_key,median_stop_meters,time_of_day,scheduled_service_minutes,route_id,direction_id
4687,1ebafaca8716652559b2017b6eedc4ef,3b05e38bba586be01828035921b806e1,367.87,PM Peak,29.0,2,1.0


In [11]:

# Columns that identify one route-direction within a schedule feed;
# used as the grouping and merge keys in the cells below.
route_merge_group_cols = ["schedule_gtfs_dataset_key", "route_id", "direction_id"]

In [12]:
# Aggregate the trip-level metrics up to route-direction.
route = schedule_stats_by_route_direction.schedule_metrics_by_route_direction(
    trip_metrics,
    analysis_date,
    route_merge_group_cols,
)
        

In [13]:
# Show one random row, leaving out the geometry column for readability.
route.drop(columns="geometry").sample(n=1)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,common_shape_id,route_name,avg_scheduled_service_minutes,avg_stop_miles,n_trips,time_period,frequency
8857,4c6b107352b318297bb39173c796f357,3822,1.0,20470,HUGHES/MCKINLEY,51.73,0.23,33,all_day,1.38


In [14]:
# Row count of the route-direction metrics, used as the baseline for the merges below.
route.shape[0]

10489

In [15]:
# Outer-join the route metrics with the cardinal-direction lookup and tally
# whether each key combination appears in both frames or only one of them.
(
    pd.merge(
        route,
        test,
        how="outer",
        on=route_merge_group_cols,
        indicator=True,
    )[["_merge"]]
    .value_counts(dropna=False)
)

_merge    
both          10489
right_only      745
left_only         0
dtype: int64

In [17]:
# Attach the cardinal direction to every route-direction row.
# `test` holds one row per key combination (duplicates on these keys are
# dropped when it is built), so `validate` makes pandas raise if that ever
# stops being true instead of silently adding rows to the left frame.
route2 = pd.merge(
    route,
    test,
    on=route_merge_group_cols,
    how="left",
    validate="many_to_one",
)

In [18]:
# The left merge should leave the row count equal to that of `route`.
route2.shape[0]

10489

In [19]:
# Spot-check three random rows with the new direction column (geometry omitted).
route2.drop(columns="geometry").sample(n=3)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,common_shape_id,route_name,avg_scheduled_service_minutes,avg_stop_miles,n_trips,time_period,frequency,stop_primary_direction
899,7891c0d5e91c8dccf88536129dbac084,B,0.0,96,,46.92,0.22,7,peak,0.88,Southbound
4694,ecb6e412d4745e9ebbfb9df814e336f2,353,1.0,3530028,Escondido to Nordahl Market Pl via Citracado,21.4,0.2,15,all_day,0.62,Westbound
9521,7dbe3e19a4966e0c0531fa826e0446d8,525,0.0,33383,DTC-Main-Myrtle-Oro,16.76,0.22,11,offpeak,0.69,Westbound


In [21]:
# File-name stem of the route typologies table, taken from the project data
# catalog; it is combined with SCHED_GCS and the analysis date when read below.
ROUTE_TYPOLOGIES = GTFS_DATA_DICT.schedule_tables.route_typologies

In [24]:
# Read only the merge keys and the typology flag columns.
route_typologies = pd.read_parquet(
    f"{SCHED_GCS}{ROUTE_TYPOLOGIES}_{analysis_date}.parquet",
    columns=[
        *route_merge_group_cols,
        "is_coverage",
        "is_downtown_local",
        "is_local",
        "is_rapid",
        "is_express",
        "is_rail",
    ],
)
    

In [27]:
# Route-direction metrics with the typology flags attached.
route_dir_metrics2 = pd.merge(
    route,
    route_typologies,
    how="left",
    on=route_merge_group_cols,
)

In [28]:
    
# Same typology merge as above, followed by a second left merge that adds
# the cardinal direction from `test`.
route_dir_metrics3 = (
    pd.merge(
        route,
        route_typologies,
        how="left",
        on=route_merge_group_cols,
    )
    .merge(
        test,
        how="left",
        on=route_merge_group_cols,
    )
)

In [30]:
# Row counts after each merge; all three should be equal if no merge duplicated rows.
tuple(len(frame) for frame in (route_dir_metrics3, route_dir_metrics2, route))

(10489, 10489, 10489)

In [29]:
# Spot-check three random rows of the fully merged table (geometry omitted).
route_dir_metrics3.drop(columns="geometry").sample(n=3)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,common_shape_id,route_name,avg_scheduled_service_minutes,avg_stop_miles,n_trips,time_period,frequency,is_coverage,is_downtown_local,is_local,is_rapid,is_express,is_rail,stop_primary_direction
1513,8de1f1a3b9ae172c6b8255b1c82c340f,32390,0.0,p_1303422,"The Lincoln Circulator Overflow only operates Monday – Friday when school is in session only, to supplement the regular bus route during peak service to serving stops nearby schools.",60.0,0.3,1,all_day,0.04,1.0,0.0,0.0,1.0,0.0,0.0,Southbound
7526,95cb514215c61ca578b01d885f35ec0a,11050,1.0,36699,YUCAIPA,24.07,0.25,8,peak,1.0,1.0,0.0,0.0,1.0,0.0,0.0,Southbound
363,e524db270831632bdcf71df1d7e74d25,554,0.0,p_1131,South County/ Commuter,83.0,2.78,1,offpeak,0.06,1.0,0.0,0.0,0.0,0.0,0.0,Westbound
