## Improving Cardinal Direction work
* Responding to comments on [PR 1145](https://github.com/cal-itp/data-analyses/pull/1145)

* 6/19
    * I experimented with not filling in the nans in `direction_id`.
    * When I did fill in the nans, I got "10,489" rows found in both dataframes and "745" found in "right_only".
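    * When I didn't fill them in, I got a curious result: the merge counts below don't match the counts from the filled-in run above. Possibly related: `groupby` drops rows whose key is NaN by default.
      ```
      _merge
      both          10486
      right_only      615
      left_only         0
      ```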

In [1]:
import _section2_utils as section2
import geopandas as gpd
import numpy as np
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)
from shared_utils import catalog_utils, rt_dates, rt_utils

In [2]:
# Notebook display settings: up to 100 columns, floats formatted to two
# decimals, and no cap on the number of rows or on column width.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
# Date string used throughout this notebook: it is interpolated into the
# parquet file name and passed to the schedule helper functions below.
analysis_date = "2024-04-16"

### `gtfs_funnel/schedule_stats_by_route_direction`

In [4]:
def cardinal_direction_for_route_direction(
    analysis_date: str,
    dict_inputs: dict,
    fill_missing_direction_id: bool = False,
) -> pd.DataFrame:
    """
    Get the most common cardinal direction (e.g. Northbound, Westbound)
    for each route-direction.

    Stop times whose `stop_primary_direction` is "Unknown" are filtered out
    when the parquet is read. The remaining stop times are merged with
    scheduled trips and counted per (route_id, schedule_gtfs_dataset_key,
    direction_id, stop_primary_direction). For each (route_id,
    schedule_gtfs_dataset_key, direction_id), the stop_primary_direction
    with the highest count is kept.

    Parameters
    ----------
    analysis_date : str
        Date string interpolated into the stop times parquet file name and
        passed to `helpers.import_scheduled_trips`.
    dict_inputs : dict
        Configuration object. It is read with attribute access
        (`dict_inputs.rt_vs_schedule_tables.stop_times_direction`), so it must
        support that (presumably `GTFS_DATA_DICT` -- confirm).
    fill_missing_direction_id : bool, default False
        If True, null `direction_id` values are replaced with 0 before
        grouping. If False (the default, matching the existing behavior of
        this notebook), nulls are left in place and pandas `groupby` drops
        those rows, so routes without any `direction_id` do not appear in
        the result.

    Returns
    -------
    pd.DataFrame
        One row per (route_id, schedule_gtfs_dataset_key, direction_id) with
        the winning `stop_primary_direction`.
    """
    STOP_TIMES_FILE = dict_inputs.rt_vs_schedule_tables.stop_times_direction

    stop_times_gdf = pd.read_parquet(
        f"{RT_SCHED_GCS}{STOP_TIMES_FILE}_{analysis_date}.parquet",
        filters=[[("stop_primary_direction", "!=", "Unknown")]],
    )

    trip_scheduled_col = [
        "route_id",
        "trip_instance_key",
        "gtfs_dataset_key",
        "shape_array_key",
        "direction_id",
        "route_long_name",
        "route_short_name",
        "route_desc",
        "name",
    ]

    trips_df = helpers.import_scheduled_trips(
        analysis_date,
        columns=trip_scheduled_col,
        get_pandas=True,
    )

    # NOTE(review): trips are requested with `gtfs_dataset_key` but merged on
    # `schedule_gtfs_dataset_key` -- presumably `import_scheduled_trips`
    # renames that column; confirm against the helper.
    merge_cols = [
        "trip_instance_key",
        "schedule_gtfs_dataset_key",
        "shape_array_key",
    ]

    stop_times_with_trip = pd.merge(stop_times_gdf, trips_df, on=merge_cols)

    # Report how many rows lack a direction_id so the effect of filling
    # (or not filling) is visible to the caller.
    print(f"# of nulls for direction_id: {stop_times_with_trip['direction_id'].isna().sum()}")

    if fill_missing_direction_id:
        # Treat a missing direction_id as direction 0 so those routes
        # survive the groupby below instead of being dropped.
        stop_times_with_trip["direction_id"] = stop_times_with_trip[
            "direction_id"
        ].fillna(0)

    route_dir_cols = [
        "route_id",
        "schedule_gtfs_dataset_key",
        "direction_id",
    ]

    # Count stop times per route-direction and cardinal direction.
    # groupby uses its default dropna=True, so null keys are excluded.
    agg1 = (
        stop_times_with_trip.groupby(route_dir_cols + ["stop_primary_direction"])
        .agg({"stop_sequence": "count"})
        .reset_index()
        .rename(columns={"stop_sequence": "total_stops"})
    )

    # Sort so that, within each route-direction, the cardinal direction
    # with the largest number of stops comes first.
    agg2 = agg1.sort_values(
        by=route_dir_cols + ["total_stops"],
        ascending=[True, True, True, False],
    )

    # Keep only the first (most frequent) cardinal direction per
    # route-direction.
    agg3 = agg2.drop_duplicates(subset=route_dir_cols).reset_index(drop=True)

    agg3 = agg3.drop(columns=["total_stops"])
    return agg3

In [5]:
# Most common stop direction per route-direction for the analysis date.
test = cardinal_direction_for_route_direction(analysis_date,GTFS_DATA_DICT)

# of nulls for direction_id: 90598


In [6]:
# Preview the first rows of the result.
test.head()

Unnamed: 0,route_id,schedule_gtfs_dataset_key,direction_id,stop_primary_direction
0,001,cb3074eb8b423dfc5acfeeb0de95eb82,0.0,Westbound
1,001,cb3074eb8b423dfc5acfeeb0de95eb82,1.0,Eastbound
2,001-132,9809d3f8121513057bc5cb8de7b54ce2,0.0,Eastbound
3,001-132,9809d3f8121513057bc5cb8de7b54ce2,1.0,Northbound
4,002-132,9809d3f8121513057bc5cb8de7b54ce2,0.0,Eastbound


In [7]:
# Inspect column dtypes and non-null counts of the result.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4345 entries, 0 to 4344
Data columns (total 4 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   route_id                   4345 non-null   object 
 1   schedule_gtfs_dataset_key  4345 non-null   object 
 2   direction_id               4345 non-null   float64
 3   stop_primary_direction     4345 non-null   object 
dtypes: float64(1), object(3)
memory usage: 135.9+ KB


In [8]:
import sys

sys.path.append("../gtfs_funnel")
import schedule_stats_by_route_direction

In [9]:
# Assemble trip-level schedule metrics, then rename the direction column.
# NOTE(review): the recorded output lists no `stop_primary_direction` column,
# so this rename may currently be a no-op -- confirm.
trip_metrics = (
    schedule_stats_by_route_direction
    .assemble_scheduled_trip_metrics(analysis_date, GTFS_DATA_DICT)
    .rename(columns={"stop_primary_direction": "route_primary_direction"})
)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 104941 entries, 0 to 104940
Data columns (total 7 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   schedule_gtfs_dataset_key  104941 non-null  object 
 1   trip_instance_key          104941 non-null  object 
 2   median_stop_meters         104937 non-null  float64
 3   time_of_day                104941 non-null  object 
 4   scheduled_service_minutes  104941 non-null  float64
 5   route_id                   104941 non-null  object 
 6   direction_id               95044 non-null   float64
dtypes: float64(3), object(4)
memory usage: 6.4+ MB


None

In [10]:
# Show one randomly selected row; no random_state is set, so the row shown
# differs between runs.
trip_metrics.sample()

Unnamed: 0,schedule_gtfs_dataset_key,trip_instance_key,median_stop_meters,time_of_day,scheduled_service_minutes,route_id,direction_id
37780,7cc0cb1871dfd558f11a2885c145d144,2fd67215db5942ad0944e3457757e9b7,429.92,AM Peak,41.0,M,0.0


In [11]:
# Grouping columns passed to `schedule_metrics_by_route_direction`.
route_group_cols = ["schedule_gtfs_dataset_key", "route_id", "direction_id"]

# Merge keys; also used to merge `route` with `test` further down.
route_merge_cols = ["schedule_gtfs_dataset_key", "route_id", "direction_id"]

In [12]:
# Roll the trip-level metrics up to route-direction level.
route = schedule_stats_by_route_direction.schedule_metrics_by_route_direction(
    trip_metrics,
    analysis_date,
    route_group_cols,
    route_merge_cols,
)
        

In [13]:
# Show one random row without the geometry column; no random_state is set,
# so the row shown differs between runs.
route.drop(columns = ['geometry']).sample()

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,common_shape_id,route_name,avg_scheduled_service_minutes,avg_stop_miles,n_trips,time_period,frequency
4150,ecd018ad66f497fb8f188ed5a71b284b,453,0.0,821,Orange Transportation Center - St Joseph Hospital via Chapman/Main/La Veta,20.0,0.25,7,all_day,0.29


In [17]:
# Outer-merge with an indicator to count how many key combinations appear in
# both frames, only in `route` (left_only), or only in `test` (right_only).
(
    pd.merge(route, test, on=route_merge_cols, how="outer", indicator=True)
    .loc[:, ["_merge"]]
    .value_counts(dropna=False)
)

_merge    
both          10486
right_only      615
left_only         0
dtype: int64

In [15]:
# Left merge: keep every row of `route` and attach the direction from `test`
# wherever the merge keys match.
route2 = pd.merge(route, test, on=route_merge_cols, how="left")

In [19]:
# Number of rows in `route`.
len(route)

10486

In [16]:
# Show three random rows without the geometry column; no random_state is set,
# so the rows shown differ between runs.
route2.drop(columns = ['geometry']).sample(3)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,common_shape_id,route_name,avg_scheduled_service_minutes,avg_stop_miles,n_trips,time_period,frequency,stop_primary_direction
3969,8eecb796518dafd3c1b971a99f8b8252,3205,0.0,7612,Apple Valley - Lucerne Valley,47.88,0.79,5,peak,0.62,Eastbound
4034,ecd018ad66f497fb8f188ed5a71b284b,400,1.0,4001,John Wayne Airport - Tustin Metrolink via Jamboree,26.0,0.25,10,peak,1.25,Southbound
6229,55a01ef72af21906934ae8ffb4786e86,200X,0.0,83,Martinez / Pittsburg,44.5,0.88,6,all_day,0.25,Eastbound
