## Second Pass at Cardinal Direction ID
* Right now, there is a rough draft but it doesn't incorporate dates and it takes forever to run.
* This notebook will incorporate dates and deployed to make sure each operator doesn't take 2+ minutes to generate when deploying this work to the portfolio

Resources
* https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/scripts/average_summary_speeds.py#L184-L191

In [1]:
import _section2_utils as section2_utils
import geopandas as gpd
import numpy as np
import pandas as pd
from IPython.display import HTML, Image, Markdown, display, display_html
from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)
from shared_utils import catalog_utils, rt_dates, rt_utils

In [2]:
import datetime

from dask import compute, delayed

In [3]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [4]:
lawndale = "City of Lawndale"

In [5]:
la_metro = "Los Angeles County Metropolitan Transportation Authority"

In [6]:
sf = "City and County of San Francisco"

In [7]:
df = section2_utils.load_schedule_vp_metrics(sf)

In [8]:
df.head(1)

Unnamed: 0,schedule_gtfs_dataset_key,dir_0_1,Period,Average Scheduled Service (trip minutes),Average Stop Distance (miles),# scheduled trips,Trips per Hour,is_express,is_rapid,is_rail,is_coverage,is_downtown_local,is_local,Date,Route typology,# Minutes with 1+ VP per Minute,# Minutes with 2+ VP per Minute,Aggregate Actual Service Minutes,Aggregate Scheduled Service Minutes (all trips),# VP,# VP within Scheduled Shape,# Early Arrival Trips,# On-Time Trips,# Late Trips,# Trips with VP,Average VP per Minute,% VP within Scheduled Shape,pct_rt_journey_atleast1_vp,pct_rt_journey_atleast2_vp,% Scheduled Trip w/ 1+ VP/Minute,% Scheduled Trip w/ 2+ VP/Minute,Realtime versus Scheduled Service Ratio,Average Actual Service (Trip Minutes),GTFS Availability,Speed (MPH),Transit Operator,route_long_name,route_short_name,Route,Route ID,schedule_source_record_id,Base64 Encoded Feed URL,Organization ID,Organization,District,ruler_100_pct,ruler_for_vp_per_min,frequency_in_minutes,direction_id,Direction,service_date
0,7cc0cb1871dfd558f11a2885c145d144,0.0,all_day,41.33,0.12,151,6.29,0.0,0.0,0.0,0.0,1.0,0.0,2023-04-12,downtown_local,7816,7708,12084.08,6194.0,23106,21485,4,28,118,150,1.91,93.0,65.0,64.0,100.0,100.0,1.95,80.56,schedule_and_vp,6.91,Bay Area 511 Muni Schedule,CALIFORNIA,1,1 CALIFORNIA,1,recHD22phgJs34JHP,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1TRg==,rechaapWbeffO33OX,City and County of San Francisco,04 - Oakland,100,2,9.54,0.0,Westbound,2023-04-12


In [9]:
df.Direction.value_counts()

Southbound    883
Westbound     724
Eastbound     636
Northbound     98
Name: Direction, dtype: int64

In [10]:
df.Direction.value_counts().sum()

2341

In [11]:
len(df)

4654

In [12]:
df.dir_0_1.count()

4654

In [13]:
all_dates_list = list(df.Date.unique())
gtfs_keys = list(df.schedule_gtfs_dataset_key.unique())
all_dates_list = [np.datetime_as_string(date, unit="D") for date in all_dates_list]

In [14]:
analysis_date = all_dates_list[0]

### Editing the `stop_times_direction`

In [15]:
TABLE = GTFS_DATA_DICT.rt_vs_schedule_tables.stop_times_direction
FILE = f"{RT_SCHED_GCS}{TABLE}_{analysis_date}.parquet"

In [16]:
stops = pd.read_parquet(FILE)

In [17]:
stops.sample()

Unnamed: 0,feed_key,stop_id,stop_sequence,schedule_gtfs_dataset_key,trip_instance_key,shape_array_key,stop_name,geometry,prior_stop_sequence,subseq_stop_sequence,stop_pair,stop_pair_name,stop_primary_direction,stop_meters
764564,c7fac53170126479badfad5499f6048b,2475,4620,f74424acf8c41e4c1e9fd42838c4875c,38291c78f3bf2d6292ff0cff96da9a2f,ecf5e3c700f2ae99b45179bdb653bcd3,Spring St and 1st St W,b'\x01\x01\x00\x00\x00\x12!\xeerG\xcb\x03A@\\\xd0D-\xc6\x1a\xc1',4546,4716,2475__501,Spring St and 1st St W__1st St and Broadway W,Southbound,145.63


### Editing `import_scheduled_trips`

In [18]:
TABLE2 = GTFS_DATA_DICT.schedule_downloads.trips
FILE2 = f"{COMPILED_CACHED_VIEWS}{TABLE}_{analysis_date}.parquet"

In [19]:
scheduled_trips = pd.read_parquet(FILE)

In [20]:
scheduled_trips.sample()

Unnamed: 0,feed_key,stop_id,stop_sequence,schedule_gtfs_dataset_key,trip_instance_key,shape_array_key,stop_name,geometry,prior_stop_sequence,subseq_stop_sequence,stop_pair,stop_pair_name,stop_primary_direction,stop_meters
2181943,a7ba6f075198e9bf9152fab6c7faf0f6,91012,60,1adf7a7bde86b42ed014f6de74c7132e,a107d5c3e981eb760e51b30c06a29ba9,dab0f4a260ab2b5b15d1d4e5d4d758d7,Iris Avenue Transit Center,"b'\x01\x01\x00\x00\x00\xf6\x1cU\xdc\x1b\xd7\x10A\x08\x87\x0c\x07\xaeR""\xc1'",59,,91012__,Iris Avenue Transit Center__,Southbound,472.16


### Testing for ONE date only

In [21]:
def find_most_common_dir(
    scheduled_trips_df: pd.DataFrame,
    scheduled_stop_times_df: pd.DataFrame,
) -> pd.DataFrame:
    """
    Load load_scheduled_trips() and load_scheduled_stop_times()
    """
    # Merge dfs
    merge_cols = [
        "trip_instance_key",
        "schedule_gtfs_dataset_key",
        "shape_array_key",
        "service_date",
    ]

    df = delayed(pd.merge)(
        scheduled_trips_df,
        scheduled_stop_times_df,
        on=merge_cols,
        how="inner",
    )

    agg1 = (
        df.groupby(
            [
                "route_id",
                "schedule_gtfs_dataset_key",
                "direction_id",
                "stop_primary_direction",
                "service_date",
            ]
        )
        .agg({"stop_sequence": "count"})
        .reset_index()
        .rename(columns={"stop_sequence": "total_stops"})
    )

    # Sort and drop duplicates so that the
    # largest # of stops by stop_primary_direction is at the top
    agg2 = agg1.sort_values(
        by=[
            "route_id",
            "schedule_gtfs_dataset_key",
            "direction_id",
            "service_date",
            "total_stops",
        ],
        ascending=[True, True, True, True, False],
    )

    # Drop duplicates so only the top stop_primary_direction is kept.
    agg3 = agg2.drop_duplicates(
        subset=["route_id", "schedule_gtfs_dataset_key", "direction_id", "service_date"]
    ).reset_index(drop=True)

    agg3 = agg3.drop(columns=["total_stops"])
    return agg3

In [22]:
scheduled_trips_dd = delayed(
    section2_utils.load_scheduled_trips(analysis_date, gtfs_keys)
)
scheduled_stops_dd = delayed(
    section2_utils.load_scheduled_stop_times(analysis_date, gtfs_keys)
)
apr_2023_cardinal_dir = find_most_common_dir(scheduled_trips_dd, scheduled_stops_dd)

In [118]:
apr_2023_cardinal_dir_df = apr_2023_cardinal_dir.compute()

#### Something is going wrong with linking the route name to the route ID.
* Using `1 CALIFORNIA` as an example, I can see that direction 1 isn't populating correctly.
* Even though `find_most_common_dir` yields the correct result of a cardinal direction for 0 and 1.

In [119]:
apr_2023_cardinal_dir_df.route_id.value_counts().describe()

count   67.00
mean     2.00
std      0.00
min      2.00
25%      2.00
50%      2.00
75%      2.00
max      2.00
Name: route_id, dtype: float64

In [120]:
apr_2023_cardinal_dir_df.head(2)

Unnamed: 0,route_id,schedule_gtfs_dataset_key,direction_id,stop_primary_direction,service_date
0,1,7cc0cb1871dfd558f11a2885c145d144,0.0,Westbound,2023-04-12
1,1,7cc0cb1871dfd558f11a2885c145d144,1.0,Eastbound,2023-04-12


In [121]:
apr_2023_cardinal_dir_df.route_id.nunique()

67

In [31]:
df.loc[df["Route ID"] == "1"].Direction.value_counts()

Westbound    39
Name: Direction, dtype: int64

In [36]:
df.loc[df["Route ID"] == "1"][
    ["Direction", "dir_0_1", "Route", "Route ID"]
].drop_duplicates()

Unnamed: 0,Direction,dir_0_1,Route,Route ID
0,Westbound,0.0,1 CALIFORNIA,1
36,,1.0,1 CALIFORNIA,1


In [37]:
def most_recent_route_info(
    df: pd.DataFrame, group_cols: list, route_col: str
) -> pd.DataFrame:
    """
    Find the most recent value across a grouping.
    Ex: if we group by route_id, we can find the most recent
    value for route_long_name.

    Needs a date column to work.
    """
    sort_order = [True for c in group_cols]

    most_recent = (
        df.sort_values(group_cols + ["service_date"], ascending=sort_order + [False])
        .drop_duplicates(subset=group_cols)
        .rename(columns={route_col: f"recent_{route_col}"})
    )

    df2 = delayed(pd.merge)(
        df, most_recent[group_cols + [f"recent_{route_col}"]], on=group_cols, how="left"
    )
    return most_recent

In [137]:
def find_most_recent_route_id(df):
    df = df.assign(
        route_id=df.route_id.fillna(""),
        route_short_name=df.route_short_name.fillna(""),
        route_long_name=df.route_long_name.fillna(""),
    )
    df = df.assign(combined_name=df.route_short_name + "__" + df.route_long_name)

    df = df.assign(
        route_id2=df.apply(
            lambda x: gtfs_schedule_wrangling.standardize_route_id(
                x, "name", "route_id"
            ),
            axis=1,
        )
    )

    route_cols = ["schedule_gtfs_dataset_key", "name", "route_id2"]

    df2 = most_recent_route_info(
        df, group_cols=route_cols, route_col="combined_name"
    ).pipe(
        most_recent_route_info,
        group_cols=["schedule_gtfs_dataset_key", "name", "recent_combined_name"],
        route_col="route_id2",
    )

    df2 = df2[
        ["schedule_gtfs_dataset_key", "route_id", "service_date", "recent_route_id2"]
    ]
    return df2

In [41]:
recent_route_id_test = find_most_recent_route_id(scheduled_trips_dd)

In [43]:
recent_route_id_test_df = recent_route_id_test.compute()

In [44]:
recent_route_id_test_df.shape

(67, 12)

In [46]:
recent_route_id_test_df.head(3)

Unnamed: 0,route_id,trip_instance_key,schedule_gtfs_dataset_key,shape_array_key,direction_id,route_long_name,route_short_name,route_desc,name,service_date,recent_combined_name,recent_route_id2
1438,12,7e9802e026a9642708b8657815c15a94,7cc0cb1871dfd558f11a2885c145d144,158a0ca4dcaa137f097c61d7867224f4,0.0,FOLSOM-PACIFIC,12,6am-10pm daily,Bay Area 511 Muni Schedule,2023-04-12,12__FOLSOM-PACIFIC,12
4519,14R,3681b49787d250eb6a97235fe61e025e,7cc0cb1871dfd558f11a2885c145d144,9f889f76b1450a8825d236170bb73805,0.0,MISSION RAPID,14R,5am-10pm daily,Bay Area 511 Muni Schedule,2023-04-12,14R__MISSION RAPID,14R
1527,14,5762f1a71e8673e8bfb6e36f4ccc7c0a,7cc0cb1871dfd558f11a2885c145d144,b3c934bafe09d14d8709786f8d82a6fb,0.0,MISSION,14,24 hour service daily,Bay Area 511 Muni Schedule,2023-04-12,14__MISSION,14


In [80]:
recent_route_id_test_df.recent_route_id2.nunique()

67

#### Testing merging

In [122]:
m1 = pd.merge(
    apr_2023_cardinal_dir_df,
    recent_route_id_test_df,
    on=["schedule_gtfs_dataset_key", "route_id", "direction_id", "service_date"],
    how="outer",
    indicator=True,
)

In [123]:
m1[["_merge"]].value_counts()

_merge    
left_only     67
both          67
right_only     0
dtype: int64

In [125]:
apr_2023_cardinal_dir_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134 entries, 0 to 133
Data columns (total 5 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   route_id                   134 non-null    object 
 1   schedule_gtfs_dataset_key  134 non-null    object 
 2   direction_id               134 non-null    float64
 3   stop_primary_direction     134 non-null    object 
 4   service_date               134 non-null    object 
dtypes: float64(1), object(4)
memory usage: 5.4+ KB


In [126]:
recent_route_id_test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 67 entries, 1438 to 877
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   route_id                   67 non-null     object 
 1   trip_instance_key          67 non-null     object 
 2   schedule_gtfs_dataset_key  67 non-null     object 
 3   shape_array_key            67 non-null     object 
 4   direction_id               67 non-null     float64
 5   route_long_name            67 non-null     object 
 6   route_short_name           67 non-null     object 
 7   route_desc                 67 non-null     object 
 8   name                       67 non-null     object 
 9   service_date               67 non-null     object 
 10  recent_combined_name       67 non-null     object 
 11  recent_route_id2           67 non-null     object 
dtypes: float64(1), object(11)
memory usage: 6.8+ KB


In [127]:
m2 = pd.merge(
    apr_2023_cardinal_dir_df,
    recent_route_id_test_df,
    on=["schedule_gtfs_dataset_key", "route_id", "direction_id"],
    how="outer",
    indicator=True,
)

In [128]:
m2._merge.value_counts()

left_only     67
both          67
right_only     0
Name: _merge, dtype: int64

In [129]:
m3 = pd.merge(
    apr_2023_cardinal_dir_df,
    recent_route_id_test_df,
    on=["schedule_gtfs_dataset_key", "route_id", "service_date"],
    how="outer",
    indicator=True,
)

In [130]:
m3._merge.value_counts()

both          134
left_only       0
right_only      0
Name: _merge, dtype: int64

#### Something going wrong with this function when merging.

In [138]:
def find_cardinal_direction(date: str, gtfs_schedule_keys: list) -> pd.DataFrame:
    # Grab all available dates for these dataframes
    # Load the 2 dataframes
    scheduled_trips_dd = delayed(
        section2_utils.load_scheduled_trips(date, gtfs_schedule_keys)
    )
    scheduled_stops_dd = delayed(
        section2_utils.load_scheduled_stop_times(date, gtfs_schedule_keys)
    )

    # Find the most common direction for this Route ID
    common_stops_dd = find_most_common_dir(scheduled_trips_dd, scheduled_stops_dd)

    # Find the most recent Route ID to connect back to sched_vp_df
    recent_ids_dd = find_most_recent_route_id(scheduled_trips_dd)

    # Merge this
    m1 = delayed(pd.merge)(
        common_stops_dd,
        recent_ids_dd,
        on=["schedule_gtfs_dataset_key", "route_id", "service_date"],
        how="inner",
    )

    m1 = m1.drop(columns=["route_id"])

    return m1

In [139]:
apr2023_final = find_cardinal_direction(analysis_date, gtfs_keys)

In [140]:
apr2023_final_df = apr2023_final.compute()

In [141]:
apr2023_final_df.sample()

Unnamed: 0,schedule_gtfs_dataset_key,direction_id,stop_primary_direction,service_date,recent_route_id2
105,7cc0cb1871dfd558f11a2885c145d144,1.0,Eastbound,2023-04-12,F


In [142]:
apr2023_final_df.groupby(
    ["recent_route_id2", "direction_id", "stop_primary_direction"]
).agg({"service_date": "unique"})

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,service_date
recent_route_id2,direction_id,stop_primary_direction,Unnamed: 3_level_1
1,0.0,Westbound,[2023-04-12]
1,1.0,Eastbound,[2023-04-12]
12,0.0,Eastbound,[2023-04-12]
12,1.0,Westbound,[2023-04-12]
14,0.0,Southbound,[2023-04-12]
14,1.0,Northbound,[2023-04-12]
14R,0.0,Southbound,[2023-04-12]
14R,1.0,Northbound,[2023-04-12]
15,0.0,Southbound,[2023-04-12]
15,1.0,Eastbound,[2023-04-12]


In [87]:
df_apr_2023 = df.loc[df["Date"] == "2023-04-12"]

In [81]:
df_apr_2023["Route ID"].nunique()

62

In [88]:
df_apr_2023 = df_apr_2023.drop(columns=["Direction", "direction_id", "service_date"])

In [91]:
df_apr_2023["temp_service_date"] = df_apr_2023["Date"].astype(str)

In [75]:
df_apr_2023.head(1)

Unnamed: 0,schedule_gtfs_dataset_key,dir_0_1,Period,Average Scheduled Service (trip minutes),Average Stop Distance (miles),# scheduled trips,Trips per Hour,is_express,is_rapid,is_rail,is_coverage,is_downtown_local,is_local,Date,Route typology,# Minutes with 1+ VP per Minute,# Minutes with 2+ VP per Minute,Aggregate Actual Service Minutes,Aggregate Scheduled Service Minutes (all trips),# VP,# VP within Scheduled Shape,# Early Arrival Trips,# On-Time Trips,# Late Trips,# Trips with VP,Average VP per Minute,% VP within Scheduled Shape,pct_rt_journey_atleast1_vp,pct_rt_journey_atleast2_vp,% Scheduled Trip w/ 1+ VP/Minute,% Scheduled Trip w/ 2+ VP/Minute,Realtime versus Scheduled Service Ratio,Average Actual Service (Trip Minutes),GTFS Availability,Speed (MPH),Transit Operator,route_long_name,route_short_name,Route,Route ID,schedule_source_record_id,Base64 Encoded Feed URL,Organization ID,Organization,District,ruler_100_pct,ruler_for_vp_per_min,frequency_in_minutes,service_date
0,7cc0cb1871dfd558f11a2885c145d144,0.0,all_day,41.33,0.12,151,6.29,0.0,0.0,0.0,0.0,1.0,0.0,2023-04-12,downtown_local,7816,7708,12084.08,6194.0,23106,21485,4,28,118,150,1.91,93.0,65.0,64.0,100.0,100.0,1.95,80.56,schedule_and_vp,6.91,Bay Area 511 Muni Schedule,CALIFORNIA,1,1 CALIFORNIA,1,recHD22phgJs34JHP,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1TRg==,rechaapWbeffO33OX,City and County of San Francisco,04 - Oakland,100,2,9.54,2023-04-12


In [92]:
apr2023_final_df.head(1)

Unnamed: 0,schedule_gtfs_dataset_key,direction_id,stop_primary_direction,service_date,trip_instance_key,shape_array_key,route_long_name,route_short_name,route_desc,name,recent_combined_name,recent_route_id2
0,7cc0cb1871dfd558f11a2885c145d144,0.0,Westbound,2023-04-12,d12a03a35fe628a861d1707d079f4c8d,3228d8cd692236fe8d4be1b2cb8e2a8b,CALIFORNIA,1,5am-12 midnight daily,Bay Area 511 Muni Schedule,1__CALIFORNIA,1


#### 1st Merge test

In [143]:
m1_apr_2023 = pd.merge(
    df_apr_2023,
    apr2023_final_df,
    left_on=["schedule_gtfs_dataset_key", "dir_0_1", "Route ID", "temp_service_date"],
    right_on=[
        "schedule_gtfs_dataset_key",
        "direction_id",
        "recent_route_id2",
        "service_date",
    ],
    how="left",
)

In [144]:
print(len(m1_apr_2023))
print(m1_apr_2023.stop_primary_direction.count())
print(m1_apr_2023.dir_0_1.count())

356
356
356


In [147]:
len(df_apr_2023)

356

##### IDs are matching so it's the match that's the issue.

In [98]:
cardinal_dir_routeids = set(apr2023_final_df.recent_route_id2.unique().tolist())
main_routeids = set(m1_apr_2023["Route ID"].unique().tolist())

In [83]:
cardinal_dir_routeids - main_routeids

{'15', '25', '91', 'T', 'TBUS'}

In [84]:
main_routeids - cardinal_dir_routeids

set()

In [115]:
apr2023_final_df.sample()

Unnamed: 0,schedule_gtfs_dataset_key,direction_id,stop_primary_direction,service_date,trip_instance_key,shape_array_key,route_long_name,route_short_name,route_desc,name,recent_combined_name,recent_route_id2
65,7cc0cb1871dfd558f11a2885c145d144,0.0,Northbound,2023-04-12,9c892aec6128e91eb7c166097f08650b,c183787eba0ba8c2f6734b17016d2e91,THIRD,T,Weekends 8 am-11:30 pm,Bay Area 511 Muni Schedule,T__THIRD,T


#### 2nd Merge Test

In [99]:
m2_apr_2023 = pd.merge(
    df_apr_2023,
    apr2023_final_df,
    left_on=["schedule_gtfs_dataset_key", "dir_0_1", "Route ID"],
    right_on=[
        "schedule_gtfs_dataset_key",
        "direction_id",
        "recent_route_id2",
    ],
    how="left",
)

In [108]:
print(len(m2_apr_2023))
print(m2_apr_2023.stop_primary_direction.count())
print(m2_apr_2023.dir_0_1.count())

356
179
356


#### 3rd test

In [109]:
m3_apr_2023 = pd.merge(
    df_apr_2023,
    apr2023_final_df,
    left_on=["dir_0_1", "Route ID"],
    right_on=[
        "direction_id",
        "recent_route_id2",
    ],
    how="left",
)

In [110]:
print(len(m3_apr_2023))
print(m3_apr_2023.stop_primary_direction.count())
print(m3_apr_2023.dir_0_1.count())

356
179
356


#### Testing for ALL dates

In [145]:
def all_dates_cardinal_dir(dates: list, gtfs_schedule_keys: list) -> pd.DataFrame:
    full_df = pd.DataFrame()
    for date in dates:
        df = find_cardinal_direction(date, gtfs_schedule_keys)
        df = df.compute()
        full_df = pd.concat([full_df, df], axis=0)
    return full_df

In [146]:
start1 = datetime.datetime.now()
all_dates_df = all_dates_cardinal_dir(all_dates_list, gtfs_keys)
end1 = datetime.datetime.now()
print(f"{end1-start1}")

0:01:31.344504


In [148]:
all_dates_df.groupby(["service_date"]).agg({"recent_route_id2": "nunique"})

Unnamed: 0_level_0,recent_route_id2
service_date,Unnamed: 1_level_1
2023-03-15,67
2023-04-12,67
2023-05-17,67
2023-06-14,67
2023-07-12,67
2023-08-15,67
2023-10-11,68
2023-11-15,70
2023-12-13,68
2024-01-17,68


#### What to do about routes that change direction? 
* Check and make sure my code is right.
* Use 14R
    * 14R is also missing data for September but it looks like all routes for San Francisco are.

In [168]:
all_dates_df.groupby(
    ["recent_route_id2", "direction_id", "stop_primary_direction"]
).agg({"service_date": "unique"})

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,service_date
recent_route_id2,direction_id,stop_primary_direction,Unnamed: 3_level_1
1,0.0,Westbound,"[2023-04-12, 2023-05-17, 2023-06-14, 2023-07-12, 2023-08-15, 2023-10-11, 2023-11-15, 2023-12-13, 2024-01-17, 2024-02-14, 2024-03-13, 2024-04-17, 2023-03-15]"
1,1.0,Eastbound,"[2023-04-12, 2023-05-17, 2023-06-14, 2023-07-12, 2023-08-15, 2023-10-11, 2023-11-15, 2023-12-13, 2024-01-17, 2024-02-14, 2024-03-13, 2024-04-17, 2023-03-15]"
12,0.0,Eastbound,"[2023-04-12, 2023-05-17, 2023-06-14, 2023-07-12, 2023-08-15, 2023-10-11, 2023-11-15, 2023-12-13, 2024-01-17, 2024-02-14, 2024-03-13, 2024-04-17, 2023-03-15]"
12,1.0,Westbound,"[2023-04-12, 2023-05-17, 2023-06-14, 2023-07-12, 2023-08-15, 2023-10-11, 2023-11-15, 2023-12-13, 2024-01-17, 2024-02-14, 2024-03-13, 2024-04-17, 2023-03-15]"
14,0.0,Southbound,"[2023-04-12, 2023-05-17, 2023-06-14, 2023-07-12, 2023-08-15, 2023-10-11, 2023-11-15, 2023-12-13, 2024-01-17, 2024-02-14, 2024-03-13, 2024-04-17, 2023-03-15]"
14,1.0,Northbound,"[2023-04-12, 2023-05-17, 2023-06-14, 2023-07-12, 2023-08-15, 2023-10-11, 2023-11-15, 2023-12-13, 2024-01-17, 2024-02-14, 2024-03-13, 2024-04-17, 2023-03-15]"
14R,0.0,Southbound,"[2023-04-12, 2023-05-17, 2023-06-14, 2023-07-12, 2023-08-15, 2023-03-15]"
14R,0.0,Westbound,"[2023-10-11, 2023-11-15, 2023-12-13, 2024-01-17, 2024-02-14, 2024-03-13, 2024-04-17]"
14R,1.0,Northbound,"[2023-04-12, 2023-05-17, 2023-06-14, 2023-07-12, 2023-08-15, 2023-10-11, 2023-11-15, 2023-12-13, 2024-01-17, 2024-02-14, 2024-03-13, 2024-04-17, 2023-03-15]"
15,0.0,Southbound,"[2023-04-12, 2023-05-17, 2023-06-14, 2023-07-12, 2023-08-15, 2023-10-11, 2023-11-15, 2023-12-13, 2024-01-17, 2024-02-14, 2024-03-13, 2024-04-17, 2023-03-15]"


In [167]:
all_dates_df.groupby(["recent_route_id2",]).agg(
    {"stop_primary_direction": "nunique"}
).sort_values(by=["stop_primary_direction"], ascending=False).head()

Unnamed: 0_level_0,stop_primary_direction
recent_route_id2,Unnamed: 1_level_1
14R,3
15,3
1,2
8AX,2
CA,2


In [158]:
scheduled_trips_dd_aug = delayed(
    section2_utils.load_scheduled_trips("2023-08-15", gtfs_keys)
)
scheduled_stops_dd_aug = delayed(
    section2_utils.load_scheduled_stop_times("2023-08-15", gtfs_keys)
)
aug_2023_cardinal_dir = find_most_common_dir(
    scheduled_trips_dd_aug, scheduled_stops_dd_aug
)

In [159]:
aug_2023_cardinal_dir_df = aug_2023_cardinal_dir.compute()

In [161]:
aug_2023_cardinal_dir_df.loc[aug_2023_cardinal_dir_df.route_id == "14R"]

Unnamed: 0,route_id,schedule_gtfs_dataset_key,direction_id,stop_primary_direction,service_date
6,14R,7cc0cb1871dfd558f11a2885c145d144,0.0,Southbound,2023-08-15
7,14R,7cc0cb1871dfd558f11a2885c145d144,1.0,Northbound,2023-08-15


In [152]:
df = df.drop(columns=["Direction", "direction_id", "service_date"])

In [153]:
df["temp_service_date"] = df["Date"].astype(str)

In [154]:
m1_all_dates = pd.merge(
    df,
    all_dates_df,
    left_on=["schedule_gtfs_dataset_key", "dir_0_1", "Route ID", "temp_service_date"],
    right_on=[
        "schedule_gtfs_dataset_key",
        "direction_id",
        "recent_route_id2",
        "service_date",
    ],
    how="left",
)

In [172]:
m1_all_dates.stop_primary_direction.value_counts()

Eastbound     1411
Westbound     1312
Southbound     968
Northbound     963
Name: stop_primary_direction, dtype: int64

4654

In [169]:
print(len(m1_all_dates))
print(m1_all_dates.stop_primary_direction.count())
print(m1_all_dates.dir_0_1.count())

4654
4654
4654


### Plot and see