## Improving Cardinal Direction work
* Responding to comments on [PR 1145](https://github.com/cal-itp/data-analyses/pull/1145)

* 6/25
    * I experimented with not filling in the nans in `direction_id`.
    * When I did fill in the nans, I got "10,489" rows found in both dataframes and "745" found in "right_only". The result from `schedule_metrics_by_route_direction` is 10,489 rows.
    * When I didn't fill in the nans, I got a curious result. The dataframe produced by `schedule_metrics_by_route_direction` is 10,486 rows. These two don't match?
    _merge    
    both          10486
    right_only      615
    left_only         0

In [1]:
import _section2_utils as section2
import geopandas as gpd
import numpy as np
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)
from shared_utils import catalog_utils, rt_dates, rt_utils

In [2]:
# Notebook display settings, all set through the same pandas options API:
# show up to 100 columns, 2-decimal floats, and do not truncate rows or cell text.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
analysis_date = "2024-04-16"

### 7/1: Resolving where to fill in `direction_id`.
#### `gtfs_digest/merge_data/concatenate_rt_vs_schedule_by_route_direction`

In [4]:
RT_SCHED_GCS

'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/'

In [5]:
FILE = GTFS_DATA_DICT.rt_vs_schedule_tables.vp_route_direction_metrics

In [6]:
FILE

'vp_route_dir/route_direction_metrics'

In [7]:
# Build the path from the catalog values (`RT_SCHED_GCS` and `FILE`, both
# displayed in the cells above) instead of repeating the bucket and table
# name as a literal, so the URL follows any change to the data catalog.
# The date suffix selects the 2024-05-22 export.
route_dir_url = f"{RT_SCHED_GCS}{FILE}_2024-05-22.parquet"

In [8]:
may_24_route_dir_df = pd.read_parquet(route_dir_url)

In [9]:
may_24_route_dir_df.sample()

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,minutes_atleast1_vp,minutes_atleast2_vp,total_rt_service_minutes,total_scheduled_service_minutes,total_vp,vp_in_shape,is_early,is_ontime,is_late,n_vp_trips,vp_per_minute,pct_in_shape,pct_rt_journey_atleast1_vp,pct_rt_journey_atleast2_vp,pct_sched_journey_atleast1_vp,pct_sched_journey_atleast2_vp,rt_sched_journey_ratio,avg_rt_service_minutes,name,base64_url,organization_source_record_id,organization_name,caltrans_district
4365,baeeb157e85a901e47b828ef9fe75091,227,0.0,offpeak,1717,1503,3117.82,1320.0,3766,3558,0,8,37,45,1.21,0.94,0.55,0.48,1.0,1.0,2.36,69.28,San Diego Schedule,aHR0cHM6Ly93d3cuc2RtdHMuY29tL2dvb2dsZV90cmFuc2l0X2ZpbGVzL2dvb2dsZV90cmFuc2l0LnppcA==,rech5YtfjpQvVIBAF,Flagship Cruises and Events Inc.,11 - San Diego


In [10]:
may_24_route_dir_df.direction_id.value_counts(dropna = False)

0.00    4252
1.00    3871
Name: direction_id, dtype: int64

In [11]:
may_24_route_dir_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8123 entries, 0 to 8122
Data columns (total 27 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   schedule_gtfs_dataset_key        8123 non-null   object 
 1   route_id                         8123 non-null   object 
 2   direction_id                     8123 non-null   float64
 3   time_period                      8123 non-null   object 
 4   minutes_atleast1_vp              8123 non-null   int64  
 5   minutes_atleast2_vp              8123 non-null   int64  
 6   total_rt_service_minutes         8123 non-null   float64
 7   total_scheduled_service_minutes  8123 non-null   float64
 8   total_vp                         8123 non-null   int64  
 9   vp_in_shape                      8123 non-null   int64  
 10  is_early                         8123 non-null   int64  
 11  is_ontime                        8123 non-null   int64  
 12  is_late             

#### Checking out results of `gtfs_funnel/schedule_stats_by_route_direction`

In [12]:
ROUTE_DIR_EXPORT = GTFS_DATA_DICT.rt_vs_schedule_tables.sched_route_direction_metrics

In [13]:
RT_SCHED_GCS

'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/'

In [14]:
ROUTE_DIR_EXPORT

'schedule_route_dir/schedule_route_direction_metrics'

In [15]:
# Build the path from the catalog values (`RT_SCHED_GCS` and
# `ROUTE_DIR_EXPORT`, both displayed in the cells above) instead of repeating
# the bucket and table name as a literal. The date suffix selects the
# 2024-05-22 export.
may_22_24_url = f"{RT_SCHED_GCS}{ROUTE_DIR_EXPORT}_2024-05-22.parquet"

In [16]:
may_22_24 = gpd.read_parquet(may_22_24_url)

In [17]:
may_22_24.head(2).drop(columns = ["geometry"])

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,common_shape_id,route_name,avg_scheduled_service_minutes,avg_stop_miles,n_trips,time_period,frequency,is_coverage,is_downtown_local,is_local,is_rapid,is_express,is_rail,route_primary_direction
0,e359e3617344263ad00858db2149a288,6,1.0,p_178727,,24.0,0.21,25,all_day,1.04,1.0,0.0,0.0,1.0,0.0,0.0,Northbound
1,e359e3617344263ad00858db2149a288,6,1.0,p_178727,,24.0,0.21,11,offpeak,0.69,1.0,0.0,0.0,1.0,0.0,0.0,Northbound


In [18]:
may_22_24.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 9650 entries, 0 to 9649
Data columns (total 18 columns):
 #   Column                         Non-Null Count  Dtype   
---  ------                         --------------  -----   
 0   geometry                       9650 non-null   geometry
 1   schedule_gtfs_dataset_key      9650 non-null   object  
 2   route_id                       9650 non-null   object  
 3   direction_id                   9650 non-null   float64 
 4   common_shape_id                9650 non-null   object  
 5   route_name                     9650 non-null   object  
 6   avg_scheduled_service_minutes  9650 non-null   float64 
 7   avg_stop_miles                 9650 non-null   float64 
 8   n_trips                        9650 non-null   int64   
 9   time_period                    9650 non-null   object  
 10  frequency                      9650 non-null   float64 
 11  is_coverage                    9199 non-null   float64 
 12  is_downtown_local         

In [19]:
may_22_24.direction_id.value_counts(dropna = False)

0.00    5319
1.00    4331
Name: direction_id, dtype: int64

### Try to merge these two together

In [20]:
# Join keys shared by the RT and schedule tables at the
# operator / route / direction / time-of-day grain.
route_time_cols = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
    "time_period",
]

In [21]:
# Outer join with an indicator column, then tally how many rows are found in
# both tables versus only the RT table (left) or only the schedule table (right).
pd.merge(
    left=may_24_route_dir_df,
    right=may_22_24,
    how="outer",
    on=route_time_cols,
    indicator=True,
)[["_merge"]].value_counts()

_merge    
both          7603
right_only    2047
left_only      520
dtype: int64

### Try out all the dates

In [22]:
import sys

sys.path.append("../gtfs_funnel")
import schedule_stats_by_route_direction

In [23]:
import merge_data

In [24]:
# All 2024 dates followed by all 2023 dates from `rt_dates`.
analysis_date_list = rt_dates.y2024_dates + rt_dates.y2023_dates

In [25]:
df_sched = merge_data.concatenate_schedule_by_route_direction(analysis_date_list)

In [26]:
# Left-join the output of `merge_data.set_primary_typology` back onto the
# concatenated schedule metrics so every schedule row is kept.
primary_typology = merge_data.set_primary_typology(df_sched)

df_sched2 = pd.merge(
    left=df_sched,
    right=primary_typology,
    how="left",
    on=route_time_cols,
)

In [27]:
df_sched2.direction_id.value_counts(dropna= False)

0.00    86125
1.00    71562
Name: direction_id, dtype: int64

In [28]:
# Cast `direction_id` to float right after concatenating
# (presumably so it matches the schedule table's dtype for the merge below -- TODO confirm).
df_rt_sched = merge_data.concatenate_rt_vs_schedule_by_route_direction(
    analysis_date_list
).astype({"direction_id": "float"})

In [29]:
df_rt_sched.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,minutes_atleast1_vp,minutes_atleast2_vp,total_rt_service_minutes,total_scheduled_service_minutes,total_vp,vp_in_shape,is_early,is_ontime,is_late,n_vp_trips,vp_per_minute,pct_in_shape,pct_rt_journey_atleast1_vp,pct_rt_journey_atleast2_vp,pct_sched_journey_atleast1_vp,pct_sched_journey_atleast2_vp,rt_sched_journey_ratio,avg_rt_service_minutes,name,service_date
0,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,all_day,1614,1579,2517.85,1201.0,4596,3438,2,8,11,21,1.83,0.75,0.64,0.63,1.0,1.0,2.1,119.9,Bay Area 511 Marin Schedule,2023-04-12
1,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,all_day,1672,1652,2326.07,1257.0,4953,4287,0,4,18,22,2.13,0.87,0.72,0.71,1.0,1.0,1.85,105.73,Bay Area 511 Marin Schedule,2023-05-17


In [30]:
df_rt_sched.direction_id.value_counts(dropna= False)

0.00    66542
1.00    61318
Name: direction_id, dtype: int64

In [31]:
# Outer join of schedule metrics (left) and RT-vs-schedule metrics (right) per
# service date; `sched_rt_category` records which side(s) each row came from.
df = pd.merge(
    left=df_sched2,
    right=df_rt_sched,
    how="outer",
    on=[*route_time_cols, "service_date"],
    indicator="sched_rt_category",
)

In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 165631 entries, 0 to 165630
Data columns (total 37 columns):
 #   Column                           Non-Null Count   Dtype         
---  ------                           --------------   -----         
 0   schedule_gtfs_dataset_key        165631 non-null  object        
 1   route_id                         165631 non-null  object        
 2   direction_id                     165631 non-null  float64       
 3   time_period                      165631 non-null  object        
 4   route_primary_direction          157687 non-null  object        
 5   avg_scheduled_service_minutes    157687 non-null  float64       
 6   avg_stop_miles                   157687 non-null  float64       
 7   n_scheduled_trips                157687 non-null  float64       
 8   frequency                        157687 non-null  float64       
 9   is_express                       150243 non-null  float64       
 10  is_rapid                         150243 non-

In [33]:
df.sched_rt_category.value_counts()

both          119916
left_only      37771
right_only      7944
Name: sched_rt_category, dtype: int64

### `gtfs_digest/_section2_utils.py/load_schedule_vp_metrics()`

In [None]:
RT_SCHED_GCS

In [None]:
GTFS_DATA_DICT.rt_vs_schedule_tables.sched_route_direction_metrics

In [None]:
GTFS_DATA_DICT.digest_tables.dir

In [None]:
GTFS_DATA_DICT.digest_tables.route_schedule_vp

In [None]:
sched_vp_url = "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/digest/schedule_vp_metrics_AH_testing.parquet"

In [None]:
sched_vp_df = pd.read_parquet(sched_vp_url)

In [None]:
sched_vp_df.head(1)

In [None]:
sched_vp_df.info()

### `gtfs_funnel/schedule_stats_by_route_direction`

In [None]:
def cardinal_direction_for_route_direction(analysis_date:str, dict_inputs:dict):
    """
    Pick a single cardinal direction for every combination of
    route_id / schedule_gtfs_dataset_key / direction_id.

    Stop-level directions are read from the `stop_times_direction` table
    for `analysis_date` (rows whose `stop_primary_direction` is "Unknown"
    are filtered out when the parquet is read) and joined to the scheduled
    trips. Within each group, the `stop_primary_direction` with the largest
    count of `stop_sequence` values is the one kept.

    Args:
        analysis_date: service date used in the parquet file name and
            passed to `helpers.import_scheduled_trips`.
        dict_inputs: catalog object that exposes
            `rt_vs_schedule_tables.stop_times_direction` through attribute
            access (the notebook passes `GTFS_DATA_DICT`).

    Returns:
        DataFrame with one row per group and the columns `route_id`,
        `schedule_gtfs_dataset_key`, `direction_id` and
        `stop_primary_direction`.
    """
    group_keys = ["route_id", "schedule_gtfs_dataset_key", "direction_id"]

    # Stop-level directions; "Unknown" rows are excluded by the parquet filter.
    stop_times_file = dict_inputs.rt_vs_schedule_tables.stop_times_direction
    stop_directions = pd.read_parquet(
        f"{RT_SCHED_GCS}{stop_times_file}_{analysis_date}.parquet",
        filters=[[("stop_primary_direction", "!=", "Unknown")]],
    )

    scheduled_trips = helpers.import_scheduled_trips(
        analysis_date,
        columns=[
            "route_id",
            "trip_instance_key",
            "gtfs_dataset_key",
            "shape_array_key",
            "direction_id",
            "route_long_name",
            "route_short_name",
            "route_desc",
            "name",
        ],
        get_pandas=True,
    )

    # Inner join (pandas default) of stop times to their trips.
    joined = pd.merge(
        stop_directions,
        scheduled_trips,
        on=["trip_instance_key", "schedule_gtfs_dataset_key", "shape_array_key"],
    )

    # Report how many direction_id values are missing, then fill them
    # with 0, per our usual practice.
    null_count = joined["direction_id"].isna().sum()
    print(f"# of nulls for direction_id: {null_count}")
    joined["direction_id"] = joined["direction_id"].fillna(0)

    # Count rows per group + direction, sort so the direction with the
    # highest count comes first within each group, and keep only that row.
    top_direction = (
        joined.groupby(group_keys + ["stop_primary_direction"])
        .agg(total_stops=("stop_sequence", "count"))
        .reset_index()
        .sort_values(
            by=group_keys + ["total_stops"],
            ascending=[True] * len(group_keys) + [False],
        )
        .drop_duplicates(subset=group_keys)
        .reset_index(drop=True)
        .drop(columns=["total_stops"])
    )
    return top_direction

In [None]:
test = cardinal_direction_for_route_direction(analysis_date,GTFS_DATA_DICT)

In [None]:
test.head()

In [None]:
test.info()

In [None]:
# Assemble trip-level schedule metrics and rename the stop-level direction
# column to the route-level name in a single expression.
trip_metrics = schedule_stats_by_route_direction.assemble_scheduled_trip_metrics(
    analysis_date, GTFS_DATA_DICT
).rename(columns={"stop_primary_direction": "route_primary_direction"})

In [None]:
trip_metrics.sample()

In [None]:

# Operator / route / direction keys used for grouping and merging below.
route_merge_group_cols = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
]

In [None]:
route = schedule_stats_by_route_direction.schedule_metrics_by_route_direction(trip_metrics, analysis_date,route_merge_group_cols)
        

In [None]:
route.drop(columns = ['geometry']).sample()

In [None]:
len(route)

In [None]:
# Outer join with an indicator column to count how many route-direction rows
# are in both `route` and `test`, or in only one of them.
pd.merge(
    left=route,
    right=test,
    how="outer",
    on=route_merge_group_cols,
    indicator=True,
)[["_merge"]].value_counts(dropna=False)

In [None]:
# Left join keeps every row of `route` and adds the columns of `test`.
route2 = pd.merge(
    left=route,
    right=test,
    how="left",
    on=route_merge_group_cols,
)

In [None]:
len(route2)

In [None]:
route2.drop(columns = ['geometry']).sample(3)

In [None]:
ROUTE_TYPOLOGIES = GTFS_DATA_DICT.schedule_tables.route_typologies

In [None]:
# Read only the merge keys and the typology flag columns for this date.
route_typologies = pd.read_parquet(
    f"{SCHED_GCS}{ROUTE_TYPOLOGIES}_{analysis_date}.parquet",
    columns=[
        *route_merge_group_cols,
        "is_coverage",
        "is_downtown_local",
        "is_local",
        "is_rapid",
        "is_express",
        "is_rail",
    ],
)
    

In [None]:
# Left join keeps every row of `route` and adds the typology flag columns.
route_dir_metrics2 = pd.merge(
    left=route,
    right=route_typologies,
    how="left",
    on=route_merge_group_cols,
)

In [None]:
    
# `route_dir_metrics2` (previous cell) already holds the left join of `route`
# and `route_typologies`, so reuse it instead of repeating that merge, then
# left-join the cardinal directions in `test` on the same keys.
route_dir_metrics3 = route_dir_metrics2.merge(
    test,
    on=route_merge_group_cols,
    how="left",
)

In [None]:
len(route_dir_metrics3), len(route_dir_metrics2), len(route)

In [None]:
route_dir_metrics3.drop(columns = ['geometry']).sample(3)