## Improving Cardinal Direction work
* Responding to comments on [PR 1145](https://github.com/cal-itp/data-analyses/pull/1145)

* 6/25
    * I experimented with not filling in the nans in `direction_id`.
    * When I did fill in the nans, I got 10,489 rows found in `both` dataframes and 745 rows found in `right_only`. The result from `schedule_metrics_by_route_direction` is 10,489 rows.
    * When I didn't fill in the nans, I got a curious result. The dataframe produced by `schedule_metrics_by_route_direction` is 10,486 rows. These two don't match? The merge indicator counts were:

        ```
        _merge
        both          10486
        right_only      615
        left_only         0
        ```

In [1]:
import _section2_utils as section2
import geopandas as gpd
import numpy as np
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)
from shared_utils import catalog_utils, rt_dates, rt_utils

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
# Notebook-wide pandas display settings: show up to 100 columns, format
# floats with two decimals, and lift the limits on displayed rows and
# column width.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.float_format", "{:.2f}".format)

In [3]:
analysis_date = "2024-04-16"

### Checking out results of `gtfs_funnel/schedule_stats_by_route_direction` 6/26

#### Trips

In [25]:
 TRIP_EXPORT = GTFS_DATA_DICT.rt_vs_schedule_tables.sched_trip_metrics

In [26]:
TRIP_EXPORT

'schedule_trip/schedule_trip_metrics'

In [27]:
# Build the path from the shared config values (bucket prefix + table name)
# instead of hard-coding the full GCS URL; this evaluates to the same string.
# NOTE(review): the file date (2024-04-20) differs from `analysis_date` — confirm this is intentional.
apr_2024_trips = f"{RT_SCHED_GCS}{TRIP_EXPORT}_2024-04-20.parquet"

In [29]:
apr_24_df = pd.read_parquet(apr_2024_trips)

In [30]:
apr_24_df.head(1)

Unnamed: 0,schedule_gtfs_dataset_key,trip_instance_key,median_stop_meters,time_of_day,scheduled_service_minutes,route_id,direction_id
0,0139b1253130b33adcd4b3a4490530d2,000c4e00868708e3a798caf22f4b7eeb,952.53,Midday,55.0,91673676-969b-4c0a-a233-158c98c522dc,0.0


#### Route

In [5]:
ROUTE_DIR_EXPORT = GTFS_DATA_DICT.rt_vs_schedule_tables.sched_route_direction_metrics

In [7]:
RT_SCHED_GCS

'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/'

In [6]:
ROUTE_DIR_EXPORT

'schedule_route_dir/schedule_route_direction_metrics'

In [17]:
GTFS_DATA_DICT.rt_vs_schedule_tables.sched_route_direction_metrics

'schedule_route_dir/schedule_route_direction_metrics'

In [8]:
# Build the path from the shared config values (bucket prefix + table name)
# instead of hard-coding the full GCS URL; this evaluates to the same string.
jan_2024 = f"{RT_SCHED_GCS}{ROUTE_DIR_EXPORT}_2024-01-17.parquet"

In [9]:
jan_24_df = gpd.read_parquet(jan_2024)

In [11]:
jan_24_df.head(2).drop(columns = ["geometry"])

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,common_shape_id,route_name,avg_scheduled_service_minutes,avg_stop_miles,n_trips,time_period,frequency,is_coverage,is_downtown_local,is_local,is_rapid,is_express,is_rail,stop_primary_direction
0,749380f1a9f225d9123762d83ea2f50d,CCA/Adobe,0.0,p_1432466,,14.56,0.11,16,all_day,0.67,0.0,1.0,0.0,0.0,0.0,0.0,Eastbound
1,749380f1a9f225d9123762d83ea2f50d,CCA/Adobe,0.0,p_1432466,,14.56,0.11,16,peak,2.0,0.0,1.0,0.0,0.0,0.0,0.0,Eastbound


In [12]:
jan_24_df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 10009 entries, 0 to 10008
Data columns (total 18 columns):
 #   Column                         Non-Null Count  Dtype   
---  ------                         --------------  -----   
 0   geometry                       10009 non-null  geometry
 1   schedule_gtfs_dataset_key      10009 non-null  object  
 2   route_id                       10009 non-null  object  
 3   direction_id                   10009 non-null  float64 
 4   common_shape_id                10009 non-null  object  
 5   route_name                     10009 non-null  object  
 6   avg_scheduled_service_minutes  10009 non-null  float64 
 7   avg_stop_miles                 10009 non-null  float64 
 8   n_trips                        10009 non-null  int64   
 9   time_period                    10009 non-null  object  
 10  frequency                      10009 non-null  float64 
 11  is_coverage                    9511 non-null   float64 
 12  is_downtown_local            

In [13]:
# Build the path from the shared config values (bucket prefix + table name)
# instead of hard-coding the full GCS URL; this evaluates to the same string.
mar_2023 = f"{RT_SCHED_GCS}{ROUTE_DIR_EXPORT}_2023-03-15.parquet"

In [14]:
mar_23_df = gpd.read_parquet(mar_2023)

In [15]:
mar_23_df.head(2).drop(columns = ["geometry"])

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,common_shape_id,route_name,avg_scheduled_service_minutes,avg_stop_miles,n_trips,time_period,frequency,is_coverage,is_downtown_local,is_local,is_rapid,is_express,is_rail,stop_primary_direction
0,a065f1788f6694f048a3908e0adb1b57,ACE Violet,0.0,107498,West Milpitas,32.0,0.21,4,all_day,0.17,1.0,1.0,0.0,0.0,0.0,0.0,Southbound
1,a065f1788f6694f048a3908e0adb1b57,ACE Violet,0.0,107498,West Milpitas,32.0,0.21,1,offpeak,0.06,1.0,1.0,0.0,0.0,0.0,0.0,Southbound


In [16]:
mar_23_df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 9881 entries, 0 to 9880
Data columns (total 18 columns):
 #   Column                         Non-Null Count  Dtype   
---  ------                         --------------  -----   
 0   geometry                       9881 non-null   geometry
 1   schedule_gtfs_dataset_key      9881 non-null   object  
 2   route_id                       9881 non-null   object  
 3   direction_id                   9881 non-null   float64 
 4   common_shape_id                9881 non-null   object  
 5   route_name                     9881 non-null   object  
 6   avg_scheduled_service_minutes  9881 non-null   float64 
 7   avg_stop_miles                 9881 non-null   float64 
 8   n_trips                        9881 non-null   int64   
 9   time_period                    9881 non-null   object  
 10  frequency                      9881 non-null   float64 
 11  is_coverage                    9482 non-null   float64 
 12  is_downtown_local              

In [18]:
from shared_utils import rt_dates

In [19]:
rt_dates.y2024_dates[0:3]

['2024-01-17', '2024-02-14', '2024-03-13']

In [21]:
GTFS_DATA_DICT.digest_tables.route_schedule_vp 

'digest/schedule_vp_metrics'

In [22]:
GTFS_DATA_DICT.digest_tables.dir

'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/'

In [31]:
# Build the path from the shared config values (bucket prefix + table name)
# instead of hard-coding the full GCS URL; this evaluates to the same string.
apr_18_24 = f"{RT_SCHED_GCS}{ROUTE_DIR_EXPORT}_2024-04-18.parquet"

In [32]:
apr_18_24_df = gpd.read_parquet(apr_18_24)

In [33]:
apr_18_24_df.head(2).drop(columns = ["geometry"])

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,common_shape_id,route_name,avg_scheduled_service_minutes,avg_stop_miles,n_trips,time_period,frequency,is_coverage,is_downtown_local,is_local,is_rapid,is_express,is_rail,route_primary_direction
0,07d3b79f14cec8099119e1eb649f065b,12133,0.0,p_497365,South Shore Service & Lake Express Daily,53.25,0.32,4,all_day,0.17,,,,,,,Northbound
1,07d3b79f14cec8099119e1eb649f065b,12133,0.0,p_497365,South Shore Service & Lake Express Daily,53.25,0.32,1,offpeak,0.06,,,,,,,Northbound


### `gtfs_digest/_section2_utils.py/load_schedule_vp_metrics()`

In [42]:
RT_SCHED_GCS

'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/'

In [41]:
GTFS_DATA_DICT.rt_vs_schedule_tables.sched_route_direction_metrics

'schedule_route_dir/schedule_route_direction_metrics'

In [36]:
GTFS_DATA_DICT.digest_tables.dir

'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/'

In [35]:
GTFS_DATA_DICT.digest_tables.route_schedule_vp

'digest/schedule_vp_metrics'

In [43]:
# Build the path from the digest table config (directory + table name)
# instead of hard-coding the full GCS URL; "_AH_testing" is the suffix of
# the testing copy of the file. This evaluates to the same string.
sched_vp_url = (
    f"{GTFS_DATA_DICT.digest_tables.dir}"
    f"{GTFS_DATA_DICT.digest_tables.route_schedule_vp}_AH_testing.parquet"
)

In [44]:
sched_vp_df = pd.read_parquet(sched_vp_url)

In [45]:
sched_vp_df.head(1)

Unnamed: 0,schedule_gtfs_dataset_key,direction_id,time_period,route_primary_direction,avg_scheduled_service_minutes,avg_stop_miles,n_scheduled_trips,frequency,is_express,is_rapid,is_rail,is_coverage,is_downtown_local,is_local,service_date,typology,minutes_atleast1_vp,minutes_atleast2_vp,total_rt_service_minutes,total_scheduled_service_minutes,total_vp,vp_in_shape,is_early,is_ontime,is_late,n_vp_trips,vp_per_minute,pct_in_shape,pct_rt_journey_atleast1_vp,pct_rt_journey_atleast2_vp,pct_sched_journey_atleast1_vp,pct_sched_journey_atleast2_vp,rt_sched_journey_ratio,avg_rt_service_minutes,sched_rt_category,speed_mph,name,route_long_name,route_short_name,route_combined_name,route_id,schedule_source_record_id,base64_url,organization_source_record_id,organization_name,caltrans_district
0,015d67d5b75b5cf2b710bbadadfb75f5,0.0,all_day,Northbound,51.77,0.27,22,0.92,0.0,0.0,0.0,0.0,1.0,0.0,2024-04-15,downtown_local,1600,1575,1743.37,1139.0,4728,3813,0,0,22,22,2.71,0.81,0.92,0.9,1.0,1.0,1.53,79.24,schedule_and_vp,17.94,Bay Area 511 Marin Schedule,Downtown San Rafael - Sausalito,17,17 Downtown San Rafael - Sausalito,17,reckCEnFkdLVgfxck,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1NQQ==,recNOb7pqBRlQVG5e,Marin County Transit District,04 - Oakland


In [46]:
sched_vp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66107 entries, 0 to 66106
Data columns (total 46 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   schedule_gtfs_dataset_key        66107 non-null  object        
 1   direction_id                     66107 non-null  float64       
 2   time_period                      66107 non-null  object        
 3   route_primary_direction          63132 non-null  object        
 4   avg_scheduled_service_minutes    63135 non-null  float64       
 5   avg_stop_miles                   63132 non-null  float64       
 6   n_scheduled_trips                66107 non-null  int64         
 7   frequency                        63135 non-null  float64       
 8   is_express                       59840 non-null  float64       
 9   is_rapid                         59840 non-null  float64       
 10  is_rail                          59840 non-null  float64  

### `gtfs_funnel/schedule_stats_by_route_direction`

In [None]:
def cardinal_direction_for_route_direction(analysis_date:str, dict_inputs:dict):
    """
    Pick one `stop_primary_direction` for each route-direction.

    Stop times (excluding rows whose `stop_primary_direction` is
    "Unknown") are joined to scheduled trips, stops are counted per
    route-direction and `stop_primary_direction`, and the value with
    the highest count is kept for each route-direction.

    Parameters
    ----------
    analysis_date : str
        Date string used in the stop times file name and passed to
        `helpers.import_scheduled_trips`.
    dict_inputs
        Config object read via attribute access
        (`dict_inputs.rt_vs_schedule_tables.stop_times_direction`).

    Returns
    -------
    pd.DataFrame
        One row per (`route_id`, `schedule_gtfs_dataset_key`,
        `direction_id`) with its `stop_primary_direction`.
    """
    # Keys that identify a route-direction.
    route_dir_cols = [
        "route_id",
        "schedule_gtfs_dataset_key",
        "direction_id",
    ]
    # Keys shared by stop times and trips.
    trip_join_cols = [
        "trip_instance_key",
        "schedule_gtfs_dataset_key",
        "shape_array_key",
    ]

    stop_times_file = dict_inputs.rt_vs_schedule_tables.stop_times_direction
    stop_times_path = f"{RT_SCHED_GCS}{stop_times_file}_{analysis_date}.parquet"
    stop_times = pd.read_parquet(
        stop_times_path,
        filters=[[("stop_primary_direction", "!=", "Unknown")]],
    )

    # NOTE(review): "gtfs_dataset_key" is requested here but the join below
    # uses "schedule_gtfs_dataset_key" — presumably the helper renames it; verify.
    trips = helpers.import_scheduled_trips(
        analysis_date,
        columns=[
            "route_id",
            "trip_instance_key",
            "gtfs_dataset_key",
            "shape_array_key",
            "direction_id",
            "route_long_name",
            "route_short_name",
            "route_desc",
            "name",
        ],
        get_pandas=True,
    )

    stop_times_trips = pd.merge(stop_times, trips, on=trip_join_cols)

    # Fill in missing direction id with 0, per our usual practice.
    n_missing = stop_times_trips["direction_id"].isna().sum()
    print(f"# of nulls for direction_id: {n_missing}")
    stop_times_trips["direction_id"] = stop_times_trips["direction_id"].fillna(0)

    # Number of stops per route-direction and stop_primary_direction.
    stop_counts = (
        stop_times_trips
        .groupby(route_dir_cols + ["stop_primary_direction"])
        .agg(total_stops=("stop_sequence", "count"))
        .reset_index()
    )

    # Within each route-direction, put the largest stop count first, then
    # keep only that first row so one stop_primary_direction remains.
    top_direction = (
        stop_counts
        .sort_values(
            by=route_dir_cols + ["total_stops"],
            ascending=[True] * len(route_dir_cols) + [False],
        )
        .drop_duplicates(subset=route_dir_cols)
        .reset_index(drop=True)
        .drop(columns=["total_stops"])
    )
    return top_direction

In [None]:
test = cardinal_direction_for_route_direction(analysis_date,GTFS_DATA_DICT)

In [None]:
test.head()

In [None]:
test.info()

In [None]:
import sys

sys.path.append("../gtfs_funnel")
import schedule_stats_by_route_direction

In [None]:
# Assemble trip-level schedule metrics and expose the direction column
# under the route-level name in a single, re-runnable expression.
trip_metrics = (
    schedule_stats_by_route_direction
    .assemble_scheduled_trip_metrics(analysis_date, GTFS_DATA_DICT)
    .rename(columns={"stop_primary_direction": "route_primary_direction"})
)

In [None]:
trip_metrics.sample()

In [None]:

# Columns that identify one route-direction for a given schedule feed.
route_merge_group_cols = ["schedule_gtfs_dataset_key", "route_id", "direction_id"]

In [None]:
route = schedule_stats_by_route_direction.schedule_metrics_by_route_direction(trip_metrics, analysis_date,route_merge_group_cols)
        

In [None]:
route.drop(columns = ['geometry']).sample()

In [None]:
len(route)

In [None]:
# Outer-join the route metrics with the cardinal direction table and count
# rows by merge indicator (both / left_only / right_only).
(
    pd.merge(
        route,
        test,
        on=route_merge_group_cols,
        how="outer",
        indicator=True,
    )[["_merge"]]
    .value_counts(dropna=False)
)

In [None]:
# Left-join the cardinal direction table onto the route metrics.
route2 = pd.merge(route, test, on=route_merge_group_cols, how="left")

In [None]:
len(route2)

In [None]:
route2.drop(columns = ['geometry']).sample(3)

In [None]:
ROUTE_TYPOLOGIES = GTFS_DATA_DICT.schedule_tables.route_typologies

In [None]:
# Load only the join keys plus the typology flag columns.
route_typologies = pd.read_parquet(
    f"{SCHED_GCS}{ROUTE_TYPOLOGIES}_{analysis_date}.parquet",
    columns=[
        *route_merge_group_cols,
        "is_coverage",
        "is_downtown_local",
        "is_local",
        "is_rapid",
        "is_express",
        "is_rail",
    ],
)
    

In [None]:
# Left-join the typology flags onto the route metrics.
route_dir_metrics2 = pd.merge(
    route, route_typologies, on=route_merge_group_cols, how="left"
)

In [None]:
    
# Left-join the typology flags, then the cardinal direction table, onto the
# route metrics using the same keys for both joins.
route_dir_metrics3 = (
    pd.merge(route, route_typologies, on=route_merge_group_cols, how="left")
    .merge(test, on=route_merge_group_cols, how="left")
)

In [None]:
len(route_dir_metrics3), len(route_dir_metrics2), len(route)

In [None]:
route_dir_metrics3.drop(columns = ['geometry']).sample(3)