## Improving Cardinal Direction work
* Responding to comments on [PR 1145](https://github.com/cal-itp/data-analyses/pull/1145)

* 6/25
    * I experimented with not filling in the nans in `direction_id`.
    * When I did fill in the nans, I got "10,489" rows found in both dataframes and "745" found in "right_only". The result from `schedule_metrics_by_route_direction` is 10,489 rows. 
    * When I didn't fill in the nans, I got a curious result. The dataframe produced by `schedule_metrics_by_route_direction` is 10,486 rows. These two don't match?
    ```
    _merge
    both          10486
    right_only      615
    left_only         0
    ```

In [1]:
import _section2_utils as section2
import geopandas as gpd
import numpy as np
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)
from shared_utils import catalog_utils, rt_dates, rt_utils

In [2]:
# Notebook-wide pandas display settings, all set through the same API:
# show up to 100 columns, two decimals for floats, and no truncation of
# rows or column contents.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
analysis_date = "2024-04-16"

### 7/2 checking out `_section2_utils.load_schedule_vp_metrics` after removing all `direction_id.fillna(0)` from `gtfs_funnel/schedule_metrics_by_route_direction`
* Looks ok, all values in `direction_id` are filled.

In [51]:
schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"

In [53]:
all_ops = pd.read_parquet(schd_vp_url)

In [55]:
all_ops.direction_id.value_counts(dropna=False)

0.00    90196
1.00    75435
Name: direction_id, dtype: int64

In [54]:
all_ops.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 165631 entries, 0 to 165630
Data columns (total 46 columns):
 #   Column                           Non-Null Count   Dtype         
---  ------                           --------------   -----         
 0   schedule_gtfs_dataset_key        165631 non-null  object        
 1   direction_id                     165631 non-null  float64       
 2   time_period                      165631 non-null  object        
 3   route_primary_direction          157687 non-null  object        
 4   avg_scheduled_service_minutes    157687 non-null  float64       
 5   avg_stop_miles                   157687 non-null  float64       
 6   n_scheduled_trips                165631 non-null  int64         
 7   frequency                        157687 non-null  float64       
 8   is_express                       150243 non-null  float64       
 9   is_rapid                         150243 non-null  float64       
 10  is_rail                          150243 non-

### 7/1: Resolving where to fill in `direction_id`. 
#### `gtfs_digest/merge_data/concatenate_rt_vs_schedule_by_route_direction`

In [4]:
RT_SCHED_GCS

'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/'

In [5]:
FILE = GTFS_DATA_DICT.rt_vs_schedule_tables.vp_route_direction_metrics

In [6]:
FILE

'vp_route_dir/route_direction_metrics'

In [7]:
# Build the path from the catalog values displayed in the cells above
# (RT_SCHED_GCS + FILE) instead of repeating the bucket path by hand, so the
# URL follows the catalog if the file is ever moved. Only the date is fixed.
route_dir_url = f"{RT_SCHED_GCS}{FILE}_2024-05-22.parquet"

In [8]:
may_24_route_dir_df = pd.read_parquet(route_dir_url)

In [9]:
may_24_route_dir_df.sample()

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,minutes_atleast1_vp,minutes_atleast2_vp,total_rt_service_minutes,total_scheduled_service_minutes,total_vp,vp_in_shape,is_early,is_ontime,is_late,n_vp_trips,vp_per_minute,pct_in_shape,pct_rt_journey_atleast1_vp,pct_rt_journey_atleast2_vp,pct_sched_journey_atleast1_vp,pct_sched_journey_atleast2_vp,rt_sched_journey_ratio,avg_rt_service_minutes,name,base64_url,organization_source_record_id,organization_name,caltrans_district
269,0666caf3ec1ecc96b74f4477ee4bc939,210-13172,0.0,offpeak,5318,5247,6696.94,4523.0,15753,14509,2,4,44,50,2.35,0.92,0.79,0.78,1.0,1.0,1.48,133.94,LA Metro Bus Schedule,aHR0cHM6Ly9naXRsYWIuY29tL0xBQ01UQS9ndGZzX2J1cy9yYXcvbWFzdGVyL2d0ZnNfYnVzLnppcA==,recPnGkwdpnr8jmHB,Los Angeles County Metropolitan Transportation Authority,07 - Los Angeles


In [10]:
may_24_route_dir_df.direction_id.value_counts(dropna = False)

0.00    4252
1.00    3871
Name: direction_id, dtype: int64

In [11]:
may_24_route_dir_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8123 entries, 0 to 8122
Data columns (total 27 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   schedule_gtfs_dataset_key        8123 non-null   object 
 1   route_id                         8123 non-null   object 
 2   direction_id                     8123 non-null   float64
 3   time_period                      8123 non-null   object 
 4   minutes_atleast1_vp              8123 non-null   int64  
 5   minutes_atleast2_vp              8123 non-null   int64  
 6   total_rt_service_minutes         8123 non-null   float64
 7   total_scheduled_service_minutes  8123 non-null   float64
 8   total_vp                         8123 non-null   int64  
 9   vp_in_shape                      8123 non-null   int64  
 10  is_early                         8123 non-null   int64  
 11  is_ontime                        8123 non-null   int64  
 12  is_late             

#### Checking out results of `gtfs_funnel/schedule_stats_by_route_direction` 

##### Trips

In [23]:
RT_SCHED_GCS

'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/'

In [21]:
TRIP_EXPORT = GTFS_DATA_DICT.rt_vs_schedule_tables.sched_trip_metrics

In [22]:
TRIP_EXPORT

'schedule_trip/schedule_trip_metrics'

In [24]:
# Build the path from RT_SCHED_GCS + TRIP_EXPORT (both displayed above)
# instead of a hand-typed bucket path; only the date is fixed.
trips_may_22 = f"{RT_SCHED_GCS}{TRIP_EXPORT}_2024-05-22.parquet"

In [25]:
trips_may_22_df = pd.read_parquet(trips_may_22)

In [26]:
trips_may_22_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 96391 entries, 0 to 96390
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   schedule_gtfs_dataset_key  96391 non-null  object 
 1   trip_instance_key          96391 non-null  object 
 2   median_stop_meters         96387 non-null  float64
 3   time_of_day                96391 non-null  object 
 4   scheduled_service_minutes  96391 non-null  float64
 5   route_id                   96391 non-null  object 
 6   direction_id               90300 non-null  float64
dtypes: float64(3), object(4)
memory usage: 5.9+ MB


In [27]:
trips_may_22_df.direction_id.value_counts(dropna=False)

0.00    48466
1.00    41834
NaN      6091
Name: direction_id, dtype: int64

In [49]:
# Stack the scheduled trip metrics for every analysis date.
# One concat over a list avoids re-copying the accumulated frame on every
# iteration and does not leave `df` / `date` behind in the notebook namespace.
# NOTE: relies on `analysis_date_list` and the `schedule_stats_by_route_direction`
# import, which are defined in cells further down this notebook.
all_trips_df = pd.concat(
    [
        schedule_stats_by_route_direction.assemble_scheduled_trip_metrics(
            one_date, GTFS_DATA_DICT
        )
        for one_date in analysis_date_list
    ]
)

In [50]:
all_trips_df.direction_id.value_counts(dropna=False)

0.00    774400
1.00    682329
NaN     142287
Name: direction_id, dtype: int64

##### Routes

In [12]:
ROUTE_DIR_EXPORT = GTFS_DATA_DICT.rt_vs_schedule_tables.sched_route_direction_metrics

In [13]:
RT_SCHED_GCS

'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/'

In [14]:
ROUTE_DIR_EXPORT

'schedule_route_dir/schedule_route_direction_metrics'

In [15]:
# Build the path from RT_SCHED_GCS + ROUTE_DIR_EXPORT (both displayed above)
# instead of a hand-typed bucket path; only the date is fixed.
may_22_24_url = f"{RT_SCHED_GCS}{ROUTE_DIR_EXPORT}_2024-05-22.parquet"

In [16]:
may_22_24 = gpd.read_parquet(may_22_24_url)

In [None]:
may_22_24.head(2).drop(columns = ["geometry"])

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,common_shape_id,route_name,avg_scheduled_service_minutes,avg_stop_miles,n_trips,time_period,frequency,is_coverage,is_downtown_local,is_local,is_rapid,is_express,is_rail,route_primary_direction
0,e359e3617344263ad00858db2149a288,6,1.0,p_178727,,24.0,0.21,25,all_day,1.04,1.0,0.0,0.0,1.0,0.0,0.0,Northbound
1,e359e3617344263ad00858db2149a288,6,1.0,p_178727,,24.0,0.21,11,offpeak,0.69,1.0,0.0,0.0,1.0,0.0,0.0,Northbound


may_22_24.info()

In [19]:
may_22_24.direction_id.value_counts(dropna = False)

0.00    5319
1.00    4331
Name: direction_id, dtype: int64

#### `gtfs_funnel/schedule_stats_by_route_direction/cardinal_direction_for_route_direction()`

In [31]:
import sys

sys.path.append("../gtfs_funnel")
import schedule_stats_by_route_direction

In [36]:
# 2024 dates come first, followed by the 2023 dates.
analysis_date_list = rt_dates.y2024_dates + rt_dates.y2023_dates

In [37]:
analysis_date_list

['2024-01-17',
 '2024-02-14',
 '2024-03-13',
 '2024-04-17',
 '2024-05-22',
 '2024-06-12',
 '2023-03-15',
 '2023-04-12',
 '2023-05-17',
 '2023-06-14',
 '2023-07-12',
 '2023-08-15',
 '2023-09-13',
 '2023-10-11',
 '2023-11-15',
 '2023-12-13']

In [39]:
cardinal_dir = schedule_stats_by_route_direction.cardinal_direction_for_route_direction('2024-05-22',GTFS_DATA_DICT)

In [40]:
cardinal_dir.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4029 entries, 0 to 4028
Data columns (total 4 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   route_id                   4029 non-null   object 
 1   schedule_gtfs_dataset_key  4029 non-null   object 
 2   direction_id               4029 non-null   float64
 3   route_primary_direction    4029 non-null   object 
dtypes: float64(1), object(3)
memory usage: 126.0+ KB


In [47]:
# Stack the route-direction cardinal directions for every analysis date.
# One concat over a list avoids re-copying the accumulated frame on every
# iteration and does not leave `df` / `date` behind in the notebook namespace.
cardinal_df = pd.concat(
    [
        schedule_stats_by_route_direction.cardinal_direction_for_route_direction(
            one_date, GTFS_DATA_DICT
        )
        for one_date in analysis_date_list
    ]
)

In [48]:
cardinal_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 65710 entries, 0 to 4302
Data columns (total 4 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   route_id                   65710 non-null  object 
 1   schedule_gtfs_dataset_key  65710 non-null  object 
 2   direction_id               65710 non-null  float64
 3   route_primary_direction    65710 non-null  object 
dtypes: float64(1), object(3)
memory usage: 2.5+ MB


### Try to merge these two together

In [None]:
route_time_cols = ["schedule_gtfs_dataset_key", 
                   "route_id", 
                   "direction_id", 
                   "time_period"]

In [None]:
# Outer-join the RT-vs-schedule metrics with the schedule metrics and count
# how many key combinations appear in both frames vs. only one of them.
(
    may_24_route_dir_df
    .merge(
        may_22_24,
        on=route_time_cols,
        how="outer",
        indicator=True,
    )
    [["_merge"]]
    .value_counts()
)

### Try out all the dates

In [None]:
import merge_data

In [None]:
df_sched = merge_data.concatenate_schedule_by_route_direction(analysis_date_list)

In [None]:
# Left-join the output of `merge_data.set_primary_typology` back onto the
# concatenated schedule rows, so every schedule row is kept.
primary_typology = merge_data.set_primary_typology(df_sched)

df_sched2 = pd.merge(
    df_sched,
    primary_typology,
    on=route_time_cols,
    how="left",
)

In [None]:
df_sched2.direction_id.value_counts(dropna= False)

In [None]:
# Concatenate RT-vs-schedule metrics across all dates and cast direction_id
# to float (presumably so its dtype lines up with the schedule frame it is
# merged with below -- confirm).
df_rt_sched = merge_data.concatenate_rt_vs_schedule_by_route_direction(
    analysis_date_list
).astype({"direction_id": "float"})

In [None]:
df_rt_sched.head(2)

In [None]:
df_rt_sched.direction_id.value_counts(dropna= False)

In [None]:
# Outer-join schedule and RT metrics per route-direction, time period and
# service date; `sched_rt_category` records which side(s) each row came from.
sched_rt_keys = route_time_cols + ["service_date"]
df = pd.merge(
    df_sched2,
    df_rt_sched,
    on=sched_rt_keys,
    how="outer",
    indicator="sched_rt_category",
)

In [None]:
df.info()

In [None]:
df.sched_rt_category.value_counts()

### `gtfs_digest/_section2_utils.py/load_schedule_vp_metrics()`

In [None]:
RT_SCHED_GCS

In [None]:
GTFS_DATA_DICT.rt_vs_schedule_tables.sched_route_direction_metrics

In [None]:
GTFS_DATA_DICT.digest_tables.dir

In [None]:
GTFS_DATA_DICT.digest_tables.route_schedule_vp

In [None]:
sched_vp_url = "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/digest/schedule_vp_metrics_AH_testing.parquet"

In [None]:
sched_vp_df = pd.read_parquet(sched_vp_url)

In [None]:
sched_vp_df.head(1)

In [None]:
sched_vp_df.info()

### `gtfs_funnel/schedule_stats_by_route_direction`

In [None]:
def cardinal_direction_for_route_direction(analysis_date:str, dict_inputs:dict):
    """
    Pick the most common stop-level direction for each route-direction.

    Reads the stop-times-with-direction table for `analysis_date`
    (rows whose `stop_primary_direction` is "Unknown" are filtered out at
    read time), joins it to the scheduled trips, fills missing
    `direction_id` with 0, and counts rows for every combination of
    route_id / schedule_gtfs_dataset_key / direction_id /
    stop_primary_direction. For each route-direction, only the
    `stop_primary_direction` with the highest count is kept.

    Parameters
    ----------
    analysis_date : str
        Date suffix used in the parquet file name and passed to
        `helpers.import_scheduled_trips`.
    dict_inputs
        Config object; `rt_vs_schedule_tables.stop_times_direction` is read
        from it via attribute access.

    Returns
    -------
    pd.DataFrame
        One row per (route_id, schedule_gtfs_dataset_key, direction_id)
        with its `stop_primary_direction`.
    """
    route_dir_cols = [
        "route_id",
        "schedule_gtfs_dataset_key",
        "direction_id",
    ]

    # Stop times, skipping stops whose direction could not be determined.
    STOP_TIMES_FILE = dict_inputs.rt_vs_schedule_tables.stop_times_direction
    known_direction_only = [[("stop_primary_direction", "!=", "Unknown")]]
    stop_times = pd.read_parquet(
        f"{RT_SCHED_GCS}{STOP_TIMES_FILE}_{analysis_date}.parquet",
        filters=known_direction_only,
    )

    scheduled_trips = helpers.import_scheduled_trips(
        analysis_date,
        columns=[
            "route_id",
            "trip_instance_key",
            "gtfs_dataset_key",
            "shape_array_key",
            "direction_id",
            "route_long_name",
            "route_short_name",
            "route_desc",
            "name",
        ],
        get_pandas=True,
    )

    # Inner join (pandas default): only stop times with a matching trip remain.
    stop_times_with_trip = stop_times.merge(
        scheduled_trips,
        on=["trip_instance_key", "schedule_gtfs_dataset_key", "shape_array_key"],
    )

    # Missing direction_id becomes 0, per our usual practice; report how many.
    print(f"# of nulls for direction_id: {stop_times_with_trip['direction_id'].isna().sum()}")
    stop_times_with_trip["direction_id"] = stop_times_with_trip["direction_id"].fillna(0)

    # Number of stop rows per route-direction and stop-level direction.
    stops_per_direction = (
        stop_times_with_trip
        .groupby(route_dir_cols + ["stop_primary_direction"])
        .agg(total_stops=("stop_sequence", "count"))
        .reset_index()
    )

    # Within each route-direction, put the largest count first ...
    ranked = stops_per_direction.sort_values(
        by=route_dir_cols + ["total_stops"],
        ascending=[True] * len(route_dir_cols) + [False],
    )

    # ... so keeping the first row per route-direction keeps the top direction.
    return (
        ranked
        .drop_duplicates(subset=route_dir_cols)
        .reset_index(drop=True)
        .drop(columns=["total_stops"])
    )

In [None]:
test = cardinal_direction_for_route_direction(analysis_date,GTFS_DATA_DICT)

In [None]:
test.head()

In [None]:
test.info()

In [None]:
# Trip-level schedule metrics, with the direction column renamed in the same
# chain so the cell never leaves a half-renamed `trip_metrics` behind.
trip_metrics = (
    schedule_stats_by_route_direction
    .assemble_scheduled_trip_metrics(analysis_date, GTFS_DATA_DICT)
    .rename(columns={"stop_primary_direction": "route_primary_direction"})
)

In [None]:
trip_metrics.sample()

In [None]:

route_merge_group_cols = [
            "schedule_gtfs_dataset_key", 
            "route_id", 
            "direction_id",
        ]

In [None]:
route = schedule_stats_by_route_direction.schedule_metrics_by_route_direction(trip_metrics, analysis_date,route_merge_group_cols)
        

In [None]:
route.drop(columns = ['geometry']).sample()

In [None]:
len(route)

In [None]:
# Outer-join the route metrics with the cardinal directions and count rows
# found in both frames vs. only in one of them.
(
    pd.merge(
        route,
        test,
        on=route_merge_group_cols,
        how="outer",
        indicator=True,
    )
    [["_merge"]]
    .value_counts(dropna=False)
)

In [None]:
# Attach the cardinal direction to every route row.
# `test` has one row per route-direction key (duplicates on those columns are
# dropped when it is built), so `validate="m:1"` makes pandas raise if that
# ever stops being true instead of silently duplicating `route` rows.
route2 = pd.merge(
    route,
    test,
    on=route_merge_group_cols,
    how="left",
    validate="m:1",
)

In [None]:
len(route2)

In [None]:
route2.drop(columns = ['geometry']).sample(3)

In [None]:
ROUTE_TYPOLOGIES = GTFS_DATA_DICT.schedule_tables.route_typologies

In [None]:
# Read only the merge keys plus the typology flag columns.
route_typologies = pd.read_parquet(
    f"{SCHED_GCS}{ROUTE_TYPOLOGIES}_{analysis_date}.parquet",
    columns=[
        *route_merge_group_cols,
        "is_coverage",
        "is_downtown_local",
        "is_local",
        "is_rapid",
        "is_express",
        "is_rail",
    ],
)
    

In [None]:
# Route metrics with the typology flags attached; the left join keeps every
# `route` row.
route_dir_metrics2 = pd.merge(
    route,
    route_typologies,
    on=route_merge_group_cols,
    how="left",
)

In [None]:
    
# Add the cardinal direction on top of the typology merge from the previous
# cell. Reusing `route_dir_metrics2` avoids recomputing the identical
# route + route_typologies merge a second time.
route_dir_metrics3 = route_dir_metrics2.merge(
    test,
    on=route_merge_group_cols,
    how="left",
)

In [None]:
len(route_dir_metrics3), len(route_dir_metrics2), len(route)

In [None]:
route_dir_metrics3.drop(columns = ['geometry']).sample(3)