## Checking routes with 2+ cardinal directions
* While running an operator (but I forgot which one) in district 4, I discovered there were some routes with more than 2 cardinal directions. 
* This shouldn't be happening. Find out what's going on. 
* Work backwards from final dataset to figure out which routes from D4 is experiencing this, then go back to the pipeline.

In [1]:
import _section2_utils as section2
import geopandas as gpd
import numpy as np
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)
from shared_utils import catalog_utils, rt_dates, rt_utils

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"

d4_df = pd.read_parquet(
    schd_vp_url,
    filters=[
        [
            ("caltrans_district", "==", "04 - Oakland"),
            ("sched_rt_category", "==", "schedule_and_vp"),
        ]
    ],
)

In [4]:
d4_df.shape

(34870, 46)

In [5]:
d4_df.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,direction_id,time_period,route_primary_direction,avg_scheduled_service_minutes,avg_stop_miles,n_scheduled_trips,frequency,is_express,is_rapid,is_rail,is_coverage,is_downtown_local,is_local,service_date,typology,minutes_atleast1_vp,minutes_atleast2_vp,total_rt_service_minutes,total_scheduled_service_minutes,total_vp,vp_in_shape,is_early,is_ontime,is_late,n_vp_trips,vp_per_minute,pct_in_shape,pct_rt_journey_atleast1_vp,pct_rt_journey_atleast2_vp,pct_sched_journey_atleast1_vp,pct_sched_journey_atleast2_vp,rt_sched_journey_ratio,avg_rt_service_minutes,sched_rt_category,speed_mph,name,route_long_name,route_short_name,route_combined_name,route_id,schedule_source_record_id,base64_url,organization_source_record_id,organization_name,caltrans_district
108,015d67d5b75b5cf2b710bbadadfb75f5,0.0,all_day,Northbound,57.14,0.28,22,0.92,0.0,1.0,0.0,0.0,1.0,0.0,2023-04-12,downtown_local,1614,1579,2517.85,1201.0,4596,3438,2,8,11,21,1.83,0.75,0.64,0.63,1.0,1.0,2.1,119.9,schedule_and_vp,18.03,Bay Area 511 Marin Schedule,Downtown San Rafael - Sausalito,17,17 Downtown San Rafael - Sausalito,17,reckCEnFkdLVgfxck,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1NQQ==,recNOb7pqBRlQVG5e,Marin County Transit District,04 - Oakland
109,015d67d5b75b5cf2b710bbadadfb75f5,0.0,all_day,Northbound,57.14,0.28,22,0.92,0.0,1.0,0.0,0.0,1.0,0.0,2023-05-17,downtown_local,1672,1652,2326.07,1257.0,4953,4287,0,4,18,22,2.13,0.87,0.72,0.71,1.0,1.0,1.85,105.73,schedule_and_vp,16.59,Bay Area 511 Marin Schedule,Downtown San Rafael - Sausalito,17,17 Downtown San Rafael - Sausalito,17,reckCEnFkdLVgfxck,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1NQQ==,recNOb7pqBRlQVG5e,Marin County Transit District,04 - Oakland


#### Interesting, the direction changes across time for certain routes. How to handle this?

In [6]:
agg1 = (
    d4_df.groupby(
        [
            "service_date",
            "schedule_gtfs_dataset_key",
            "organization_name",
            "route_combined_name",
        ]
    )
    .agg({"route_primary_direction": "nunique"})
    .reset_index()
    .sort_values(by=["route_primary_direction"], ascending=False)
)

In [7]:
agg1.head()

Unnamed: 0,service_date,schedule_gtfs_dataset_key,organization_name,route_combined_name,route_primary_direction
0,2023-03-15,0881af3822466784992a49f1cc57d38f,Sonoma-Marin Area Rail Transit District,SMART Main Line,2
4445,2023-12-13,fb467982dcc77a7f9199bebe709bb700,Santa Clara Valley Transportation Authority,44 Milpitas BART - McCarthy Rnch via Alder,2
4443,2023-12-13,fb467982dcc77a7f9199bebe709bb700,Santa Clara Valley Transportation Authority,40 Foothill Coll - Mtn View TC via N. Bayshore,2
4442,2023-12-13,fb467982dcc77a7f9199bebe709bb700,Santa Clara Valley Transportation Authority,39 Eastridge - The Villages,2
4441,2023-12-13,fb467982dcc77a7f9199bebe709bb700,Santa Clara Valley Transportation Authority,37 West Valley Coll - Capitol Stn,2


In [8]:
agg2 = (
    d4_df.groupby(
        ["schedule_gtfs_dataset_key", "organization_name", "route_combined_name"]
    )
    .agg({"route_primary_direction": "nunique"})
    .reset_index()
    .sort_values(by=["route_primary_direction"], ascending=False)
)

In [9]:
agg2.loc[agg2.route_primary_direction > 2].sort_values(by=["organization_name"])

Unnamed: 0,schedule_gtfs_dataset_key,organization_name,route_combined_name,route_primary_direction
661,c499f905e33929a641f083dad55c521e,Alameda-Contra Costa Transit District,676 De Anza - Crespi - Rollingwood,4
650,c499f905e33929a641f083dad55c521e,Alameda-Contra Costa Transit District,657 Oak High - Oak Tech - MacArthur,3
665,c499f905e33929a641f083dad55c521e,Alameda-Contra Costa Transit District,682 Bishop O'Dowd High - Montclair,3
333,587e730fac4db21d54037e0f12b0dd5d,Central Contra Costa Transit Authority,15 Concord BART/Pleasant Hill BART,3
414,7cc0cb1871dfd558f11a2885c145d144,City and County of San Francisco,14R MISSION RAPID,3
73,0f5e1b251db53223200c5bfc365d33f2,City of Fairfield,7 FTC/Solano College/Cordelia Library,4
72,0f5e1b251db53223200c5bfc365d33f2,City of Fairfield,6 Solano Town Center/East Tabor & Sunset,3
74,0f5e1b251db53223200c5bfc365d33f2,City of Fairfield,8 Cordelia Library/Oakbrook Dr,3
68,0f5e1b251db53223200c5bfc365d33f2,City of Fairfield,1 FTC/Fairfield Walmart,3
70,0f5e1b251db53223200c5bfc365d33f2,City of Fairfield,3 FTC/Solano Town Center/Fairfield Walmart,3


#### Check this out for all districts

In [90]:
all_districts_df = pd.read_parquet(
    schd_vp_url,
    filters=[
        [
            ("sched_rt_category", "==", "schedule_and_vp"),
        ]
    ],
)

all_districts_df_agg = (
    all_districts_df.groupby(
        ["schedule_gtfs_dataset_key", "organization_name", "route_combined_name"]
    )
    .agg({"route_primary_direction": "nunique"})
    .reset_index()
    .sort_values(by=["route_primary_direction"], ascending=False)
)

In [91]:
routes_multi_cardinal_dir = all_districts_df_agg.loc[all_districts_df_agg.route_primary_direction > 2].sort_values(by=["organization_name"])

In [92]:
routes_multi_cardinal_dir.route_combined_name.nunique()

51

In [93]:
len(routes_multi_cardinal_dir)

56

In [95]:
routes_multi_cardinal_dir.route_combined_name.value_counts().head()

Route 1                                 4
313 Oceanside TC - Town Center North    2
38 Jesse Bethel                         2
82 RANCHO CUCAMONGA-FONTANA             1
13 Hunter Park Metro-Galleria           1
Name: route_combined_name, dtype: int64

In [96]:
routes_multi_cardinal_dir.loc[routes_multi_cardinal_dir.route_combined_name == "Route 1"]

Unnamed: 0,schedule_gtfs_dataset_key,organization_name,route_combined_name,route_primary_direction
1919,7e015887964432c82ce7e735c2753f86,City of Ojai,Route 1,3
309,1770249a5a2e770ca90628434d4934b1,City of Ojai,Route 1,3
349,1770249a5a2e770ca90628434d4934b1,Gold Coast Transit District,Route 1,3
392,1770249a5a2e770ca90628434d4934b1,Ventura County Transportation Commission,Route 1,3


#### Check this out by one route only: this route run by City of Fairfield switches direction in May. 

In [10]:
d4_df.loc[
    (d4_df.schedule_gtfs_dataset_key == "0f5e1b251db53223200c5bfc365d33f2")
    & (d4_df.route_combined_name == "7 FTC/Solano College/Cordelia Library")
].sort_values(by=["service_date", "direction_id"])[
    ["service_date", "route_primary_direction", "direction_id"]
].drop_duplicates()

Unnamed: 0,service_date,route_primary_direction,direction_id
7130,2023-03-15,Southbound,0.0
7178,2023-03-15,Eastbound,1.0
7131,2023-04-12,Southbound,0.0
7179,2023-04-12,Eastbound,1.0
7132,2023-05-17,Westbound,0.0
7180,2023-05-17,Northbound,1.0
7133,2023-06-14,Westbound,0.0
7181,2023-06-14,Northbound,1.0
7134,2023-07-12,Westbound,0.0
7182,2023-07-12,Northbound,1.0


### `schedule_stats_by_routes/cardinal_direction_for_route_direction` 

In [12]:
dates_2023 = rt_dates.y2023_dates[0:4]

In [59]:
dates_2023

['2023-03-15', '2023-04-12', '2023-05-17', '2023-06-14']

In [14]:
STOP_TIMES_FILE = GTFS_DATA_DICT.rt_vs_schedule_tables.stop_times_direction

In [15]:
stop_times_gdf = pd.DataFrame()
for date in dates_2023:
    df = pd.read_parquet(
        f"{RT_SCHED_GCS}{STOP_TIMES_FILE}_{date}.parquet",
        filters=[[("stop_primary_direction", "!=", "Unknown")]],
    )

    stop_times_gdf = pd.concat([stop_times_gdf, df])

In [18]:
len(
    stop_times_gdf.loc[
        stop_times_gdf.schedule_gtfs_dataset_key == "0f5e1b251db53223200c5bfc365d33f2"
    ]
)

9952

In [25]:
fairfield_stops = stop_times_gdf.loc[
    stop_times_gdf.schedule_gtfs_dataset_key == "0f5e1b251db53223200c5bfc365d33f2"
].reset_index(drop=True)

In [32]:
fairfield_stops.head(2)

Unnamed: 0,feed_key,stop_id,stop_sequence,schedule_gtfs_dataset_key,trip_instance_key,shape_array_key,stop_name,geometry,prior_stop_sequence,subseq_stop_sequence,stop_pair,stop_pair_name,stop_primary_direction,stop_meters
0,41193dc57893b54982c8e3945dea98fc,75256,17,0f5e1b251db53223200c5bfc365d33f2,01dce2d98184330a8d22215f0972efe8,fba0c2d4d16b19b8173271177d1dcb62,SMART and Final,b'\x01\x01\x00\x00\x00?\xdeFj\xe8\xb3\x05\xc1\x00\xddM\xe7p\x13\xde@',1,,75256__,SMART and Final__,Westbound,6265.73
1,41193dc57893b54982c8e3945dea98fc,75510,6,0f5e1b251db53223200c5bfc365d33f2,03ce8d047c8219806ec8a90ce485e966,3ed65de60779424b96669228f16e3f14,Doverwood Apartments,b'\x01\x01\x00\x00\x00\xd8\xff\x06\xdb<\x98\x05\xc1\x00\x1cp\x85\x84\xa4\xde@',1,12.0,75510__75256,Doverwood Apartments__SMART and Final,Eastbound,1124.52


In [21]:
trip_scheduled_col = [
    "route_id",
    "trip_instance_key",
    "gtfs_dataset_key",
    "shape_array_key",
    "direction_id",
    "route_long_name",
    "route_short_name",
    "route_desc",
    "name",
]

In [22]:
trips_df = pd.DataFrame()
for date in dates_2023:
    df = helpers.import_scheduled_trips(
        date, columns=trip_scheduled_col, get_pandas=True
    )

    trips_df = pd.concat([trips_df, df])

In [23]:
len(
    trips_df.loc[
        trips_df.schedule_gtfs_dataset_key == "0f5e1b251db53223200c5bfc365d33f2"
    ]
)

1032

In [24]:
city_fairfield_trips = trips_df.loc[
    trips_df.schedule_gtfs_dataset_key == "0f5e1b251db53223200c5bfc365d33f2"
]

In [34]:
city_fairfield_trips = city_fairfield_trips.drop(columns=["route_desc"])

In [35]:
len(
    city_fairfield_trips.loc[
        city_fairfield_trips.route_long_name.str.contains("Cordelia")
    ]
)

248

In [36]:
cordelia = city_fairfield_trips.loc[
    city_fairfield_trips.route_long_name.str.contains("Cordelia")
]

In [38]:
merge_cols = ["trip_instance_key", "schedule_gtfs_dataset_key", "shape_array_key"]

stop_times_with_trip = pd.merge(fairfield_stops, cordelia, on=merge_cols)

In [42]:
stop_times_with_trip.head(2)

Unnamed: 0,feed_key,stop_id,stop_sequence,schedule_gtfs_dataset_key,trip_instance_key,shape_array_key,stop_name,geometry,prior_stop_sequence,subseq_stop_sequence,stop_pair,stop_pair_name,stop_primary_direction,stop_meters,route_id,direction_id,route_long_name,route_short_name,name
0,41193dc57893b54982c8e3945dea98fc,75313,4,0f5e1b251db53223200c5bfc365d33f2,05a3530c6346fb27f51c26f9ae0c776b,e9227e4f28eaf07b2576127333f0a31f,Pittman Rd and Central Way,"b""\x01\x01\x00\x00\x00*\xecp\xe6\xcb\xb3\x06\xc1\x00\xd6\xcf\x8b\x85'\xd8@""",1,15,75313__75290,Pittman Rd and Central Way__Lopes Rd and Canyon Hills Dr,Eastbound,1541.93,8,1.0,Cordelia Library/Oakbrook Dr,8,Bay Area 511 Fairfield and Suisun Transit Schedule
1,41193dc57893b54982c8e3945dea98fc,75290,15,0f5e1b251db53223200c5bfc365d33f2,05a3530c6346fb27f51c26f9ae0c776b,e9227e4f28eaf07b2576127333f0a31f,Lopes Rd and Canyon Hills Dr,b'\x01\x01\x00\x00\x00\xad\xdcxDJ\xbe\x06\xc1\x80P4r~\x14\xd3@',4,19,75290__75294,Lopes Rd and Canyon Hills Dr__Green Valley Middle School,Southbound,5206.95,8,1.0,Cordelia Library/Oakbrook Dr,8,Bay Area 511 Fairfield and Suisun Transit Schedule


In [39]:
main_cols = ["route_id", "schedule_gtfs_dataset_key", "direction_id"]

agg1 = (
    stop_times_with_trip.groupby(main_cols + ["stop_primary_direction"])
    .agg({"stop_sequence": "count"})
    .reset_index()
    .rename(columns={"stop_sequence": "total_stops"})
)

In [40]:
agg2 = agg1.sort_values(
    by=main_cols + ["total_stops"],
    ascending=[True, True, True, False],
)

#### Connect this back to route typologies

In [43]:
ROUTE_DIR_EXPORT = GTFS_DATA_DICT.rt_vs_schedule_tables.sched_route_direction_metrics

In [45]:
RT_SCHED_GCS

'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/'

In [55]:
ROUTE_DIR_EXPORT

'schedule_route_dir/schedule_route_direction_metrics'

In [71]:
def open_add(file: str, date: str):
    df = pd.read_parquet(
        "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_route_dir/schedule_route_direction_metrics_2023-03-15.parquet"
    )
    df["service_date"] = date
    return df

In [72]:
march_df = open_add(
    "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_route_dir/schedule_route_direction_metrics_2023-03-15.parquet",
    "3-15",
)

In [73]:
apr_df = open_add(
    "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_route_dir/schedule_route_direction_metrics_2023-04-12.parquet",
    "4-12",
)

In [74]:
may_df = open_add(
    "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_route_dir/schedule_route_direction_metrics_2023-05-17.parquet",
    "5-17",
)

In [75]:
june_df = open_add(
    "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_route_dir/schedule_route_direction_metrics_2023-06-14.parquet",
    "6-14",
)

In [76]:
all_df = pd.concat([march_df, apr_df, may_df, june_df])

In [77]:
route_group_merge_cols = ["schedule_gtfs_dataset_key", "route_id", "direction_id"]

In [78]:
m1 = pd.merge(agg2, all_df, on=route_group_merge_cols, how="left")

In [79]:
m1 = m1.drop(columns=["geometry"])

In [80]:
m1.head(1)

Unnamed: 0,route_id,schedule_gtfs_dataset_key,direction_id,stop_primary_direction,total_stops,common_shape_id,route_name,avg_scheduled_service_minutes,avg_stop_miles,n_trips,time_period,frequency,is_coverage,is_downtown_local,is_local,is_rapid,is_express,is_rail,route_primary_direction,service_date
0,7,0f5e1b251db53223200c5bfc365d33f2,0.0,Westbound,228,p_2689,FTC/Solano College/Cordelia Library,25.0,1.24,19,all_day,0.79,1.0,0.0,0.0,1.0,0.0,0.0,Southbound,3-15


In [84]:
m1[
    ["service_date", "stop_primary_direction", "direction_id", "total_stops"]
].sort_values(
    by=["service_date", "total_stops"], ascending=[True, False]
).drop_duplicates()

Unnamed: 0,service_date,stop_primary_direction,direction_id,total_stops
72,3-15,Northbound,0.0,278
36,3-15,Northbound,1.0,252
0,3-15,Westbound,0.0,228
48,3-15,Eastbound,1.0,216
108,3-15,Southbound,1.0,192
12,3-15,Southbound,0.0,190
120,3-15,Northbound,1.0,166
132,3-15,Eastbound,1.0,120
144,3-15,Westbound,1.0,120
24,3-15,Northbound,0.0,114
