## Checking routes with more than 2 cardinal directions
* While running an operator (but I forgot which one) in district 4, I discovered there were some routes with more than 2 cardinal directions. 
* This shouldn't be happening. Find out what's going on. 
* Work backwards from the final dataset to figure out which routes from D4 are experiencing this, then go back to the pipeline.

In [1]:
import _section2_utils as section2
import geopandas as gpd
import numpy as np
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)
from shared_utils import catalog_utils, rt_dates, rt_utils

In [2]:
# Notebook-wide pandas display settings: up to 100 columns, no cap on rows or
# column width, and floats rendered with two decimals.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [3]:
# Location of the route-level schedule / vehicle-position digest table.
schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"

# A flat list of tuples is read as one AND group: keep District 4 rows that
# are categorized as having both schedule and vehicle-position data.
d4_df = pd.read_parquet(
    schd_vp_url,
    filters=[
        ("caltrans_district", "==", "04 - Oakland"),
        ("sched_rt_category", "==", "schedule_and_vp"),
    ],
)

In [4]:
d4_df.shape

(34870, 46)

In [5]:
d4_df.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,direction_id,time_period,route_primary_direction,avg_scheduled_service_minutes,avg_stop_miles,n_scheduled_trips,frequency,is_express,is_rapid,is_rail,is_coverage,is_downtown_local,is_local,service_date,typology,minutes_atleast1_vp,minutes_atleast2_vp,total_rt_service_minutes,total_scheduled_service_minutes,total_vp,vp_in_shape,is_early,is_ontime,is_late,n_vp_trips,vp_per_minute,pct_in_shape,pct_rt_journey_atleast1_vp,pct_rt_journey_atleast2_vp,pct_sched_journey_atleast1_vp,pct_sched_journey_atleast2_vp,rt_sched_journey_ratio,avg_rt_service_minutes,sched_rt_category,speed_mph,name,route_long_name,route_short_name,route_combined_name,route_id,schedule_source_record_id,base64_url,organization_source_record_id,organization_name,caltrans_district
108,015d67d5b75b5cf2b710bbadadfb75f5,0.0,all_day,Northbound,57.14,0.28,22,0.92,0.0,1.0,0.0,0.0,1.0,0.0,2023-04-12,downtown_local,1614,1579,2517.85,1201.0,4596,3438,2,8,11,21,1.83,0.75,0.64,0.63,1.0,1.0,2.1,119.9,schedule_and_vp,18.03,Bay Area 511 Marin Schedule,Downtown San Rafael - Sausalito,17,17 Downtown San Rafael - Sausalito,17,reckCEnFkdLVgfxck,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1NQQ==,recNOb7pqBRlQVG5e,Marin County Transit District,04 - Oakland
109,015d67d5b75b5cf2b710bbadadfb75f5,0.0,all_day,Northbound,57.14,0.28,22,0.92,0.0,1.0,0.0,0.0,1.0,0.0,2023-05-17,downtown_local,1672,1652,2326.07,1257.0,4953,4287,0,4,18,22,2.13,0.87,0.72,0.71,1.0,1.0,1.85,105.73,schedule_and_vp,16.59,Bay Area 511 Marin Schedule,Downtown San Rafael - Sausalito,17,17 Downtown San Rafael - Sausalito,17,reckCEnFkdLVgfxck,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1NQQ==,recNOb7pqBRlQVG5e,Marin County Transit District,04 - Oakland


#### Interesting, the direction changes across time for certain routes. How to handle this?

In [6]:
# Number of distinct cardinal directions per operator-route on each service
# date, largest counts first.
agg1 = (
    d4_df.groupby(
        [
            "service_date",
            "schedule_gtfs_dataset_key",
            "organization_name",
            "route_combined_name",
        ]
    )["route_primary_direction"]
    .nunique()
    .reset_index()
    .sort_values(by="route_primary_direction", ascending=False)
)

In [7]:
agg1.head()

Unnamed: 0,service_date,schedule_gtfs_dataset_key,organization_name,route_combined_name,route_primary_direction
0,2023-03-15,0881af3822466784992a49f1cc57d38f,Sonoma-Marin Area Rail Transit District,SMART Main Line,2
4445,2023-12-13,fb467982dcc77a7f9199bebe709bb700,Santa Clara Valley Transportation Authority,44 Milpitas BART - McCarthy Rnch via Alder,2
4443,2023-12-13,fb467982dcc77a7f9199bebe709bb700,Santa Clara Valley Transportation Authority,40 Foothill Coll - Mtn View TC via N. Bayshore,2
4442,2023-12-13,fb467982dcc77a7f9199bebe709bb700,Santa Clara Valley Transportation Authority,39 Eastridge - The Villages,2
4441,2023-12-13,fb467982dcc77a7f9199bebe709bb700,Santa Clara Valley Transportation Authority,37 West Valley Coll - Capitol Stn,2


In [8]:
# Same count as above, but pooled over every service date so that a route
# whose direction label changes over time shows up with a higher count.
route_keys = ["schedule_gtfs_dataset_key", "organization_name", "route_combined_name"]
agg2 = (
    d4_df.groupby(route_keys)["route_primary_direction"]
    .nunique()
    .reset_index()
    .sort_values(by="route_primary_direction", ascending=False)
)

In [9]:
agg2.loc[agg2.route_primary_direction > 2].sort_values(by=["organization_name"])

Unnamed: 0,schedule_gtfs_dataset_key,organization_name,route_combined_name,route_primary_direction
661,c499f905e33929a641f083dad55c521e,Alameda-Contra Costa Transit District,676 De Anza - Crespi - Rollingwood,4
650,c499f905e33929a641f083dad55c521e,Alameda-Contra Costa Transit District,657 Oak High - Oak Tech - MacArthur,3
665,c499f905e33929a641f083dad55c521e,Alameda-Contra Costa Transit District,682 Bishop O'Dowd High - Montclair,3
333,587e730fac4db21d54037e0f12b0dd5d,Central Contra Costa Transit Authority,15 Concord BART/Pleasant Hill BART,3
414,7cc0cb1871dfd558f11a2885c145d144,City and County of San Francisco,14R MISSION RAPID,3
73,0f5e1b251db53223200c5bfc365d33f2,City of Fairfield,7 FTC/Solano College/Cordelia Library,4
72,0f5e1b251db53223200c5bfc365d33f2,City of Fairfield,6 Solano Town Center/East Tabor & Sunset,3
74,0f5e1b251db53223200c5bfc365d33f2,City of Fairfield,8 Cordelia Library/Oakbrook Dr,3
68,0f5e1b251db53223200c5bfc365d33f2,City of Fairfield,1 FTC/Fairfield Walmart,3
70,0f5e1b251db53223200c5bfc365d33f2,City of Fairfield,3 FTC/Solano Town Center/Fairfield Walmart,3


#### Check this out for all districts

In [10]:
# Repeat the check statewide: no district filter, only the
# schedule-and-vehicle-position category.
all_districts_df = pd.read_parquet(
    schd_vp_url,
    filters=[("sched_rt_category", "==", "schedule_and_vp")],
)

# Distinct cardinal directions per operator-route across all service dates.
all_districts_df_agg = (
    all_districts_df.groupby(
        ["schedule_gtfs_dataset_key", "organization_name", "route_combined_name"]
    )["route_primary_direction"]
    .nunique()
    .reset_index()
    .sort_values(by="route_primary_direction", ascending=False)
)

In [11]:
# Routes reporting more than two distinct cardinal directions, ordered by operator.
routes_multi_cardinal_dir = all_districts_df_agg.query(
    "route_primary_direction > 2"
).sort_values(by="organization_name")

In [12]:
routes_multi_cardinal_dir.route_combined_name.nunique()

51

In [13]:
len(routes_multi_cardinal_dir)

56

In [14]:
routes_multi_cardinal_dir.route_combined_name.value_counts().head()

Route 1                                 4
313 Oceanside TC - Town Center North    2
38 Jesse Bethel                         2
82 RANCHO CUCAMONGA-FONTANA             1
13 Hunter Park Metro-Galleria           1
Name: route_combined_name, dtype: int64

In [15]:
routes_multi_cardinal_dir.loc[
    routes_multi_cardinal_dir.route_combined_name == "Route 1"
]

Unnamed: 0,schedule_gtfs_dataset_key,organization_name,route_combined_name,route_primary_direction
1919,7e015887964432c82ce7e735c2753f86,City of Ojai,Route 1,3
309,1770249a5a2e770ca90628434d4934b1,City of Ojai,Route 1,3
349,1770249a5a2e770ca90628434d4934b1,Gold Coast Transit District,Route 1,3
392,1770249a5a2e770ca90628434d4934b1,Ventura County Transportation Commission,Route 1,3


In [16]:
routes_multi_cardinal_dir[
    [
        "schedule_gtfs_dataset_key",
        "organization_name",
        "route_combined_name",
        "route_primary_direction",
    ]
].drop_duplicates()

Unnamed: 0,schedule_gtfs_dataset_key,organization_name,route_combined_name,route_primary_direction
2748,c499f905e33929a641f083dad55c521e,Alameda-Contra Costa Transit District,682 Bishop O'Dowd High - Montclair,3
2744,c499f905e33929a641f083dad55c521e,Alameda-Contra Costa Transit District,676 De Anza - Crespi - Rollingwood,4
2733,c499f905e33929a641f083dad55c521e,Alameda-Contra Costa Transit District,657 Oak High - Oak Tech - MacArthur,3
1691,6693efa56a541b6276da9b424f78a170,Blue Lake Rancheria,Willow Creek-Arcata,3
1601,587e730fac4db21d54037e0f12b0dd5d,Central Contra Costa Transit Authority,15 Concord BART/Pleasant Hill BART,3
1803,7cc0cb1871dfd558f11a2885c145d144,City and County of San Francisco,14R MISSION RAPID,3
3608,f74424acf8c41e4c1e9fd42838c4875c,City of Duarte,274 West Covina- Industry - Whittier,4
3607,f74424acf8c41e4c1e9fd42838c4875c,City of Duarte,272 Duarte-Baldwin Park-West Covina,3
1661,63029a23cb0e73f2a5d98a345c5e2e40,City of Elk Grove,Route 11,3
270,0f5e1b251db53223200c5bfc365d33f2,City of Fairfield,3 FTC/Solano Town Center/Fairfield Walmart,3


#### Checking out if routes are actually switching direction in real life. 
##### SFMTA
* SFMTA: 14R changed routes, but back in 2022. https://www.sfmta.com/travel-updates/route-modifications-permanent-stop-relocation-starting-january-22-2022
* Can't find any record of this route from before April 2024. 
* Turn SFMTA stuff into a function

In [17]:
import _section1_utils

In [18]:
sf_name = _section1_utils.organization_name_crosswalk("City and County of San Francisco")

In [19]:
sf_name

'Bay Area 511 Muni Schedule'

In [20]:
op_routes_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_routes_map}.parquet"

In [21]:
sf_routes_gdf = gpd.read_parquet(
    op_routes_url,
    filters=[[("name", "==", sf_name)]])

In [22]:
sf_routes_gdf = sf_routes_gdf.sort_values(by = ["service_date"], ascending = False)


In [23]:
sf_routes_gdf.service_date.unique()

array(['2024-04-17T00:00:00.000000000', '2024-03-13T00:00:00.000000000',
       '2024-02-14T00:00:00.000000000', '2024-01-17T00:00:00.000000000',
       '2023-12-13T00:00:00.000000000', '2023-11-15T00:00:00.000000000',
       '2023-10-11T00:00:00.000000000', '2023-08-15T00:00:00.000000000',
       '2023-07-12T00:00:00.000000000', '2023-06-14T00:00:00.000000000',
       '2023-05-17T00:00:00.000000000', '2023-04-12T00:00:00.000000000',
       '2023-03-15T00:00:00.000000000'], dtype='datetime64[ns]')

In [24]:
sf_routes_gdf.route_combined_name.unique()

array(['1 CALIFORNIA', '54 FELTON', '5 FULTON', '49 VAN NESS-MISSION',
       '48 QUINTARA-24TH STREET', '45 UNION-STOCKTON', "44 O'SHAUGHNESSY",
       '43 MASONIC', '39 COIT', '38R GEARY RAPID', '38 GEARY',
       '37 CORBETT', '36 TERESITA', '35 EUREKA', '33 ASHBURY-18TH ST',
       '31 BALBOA', '29 SUNSET', '28R 19TH AVENUE RAPID',
       '28 19TH AVENUE', '27 BRYANT', '24 DIVISADERO', '23 MONTEREY',
       '22 FILLMORE', '21 HAYES', '2 SUTTER', '1X CALIFORNIA EXPRESS',
       '19 POLK', '18 46TH AVENUE', '14R MISSION RAPID', '14 MISSION',
       '12 FOLSOM-PACIFIC', '52 EXCELSIOR', '30 STOCKTON', '55 DOGPATCH',
       '9R SAN BRUNO RAPID', '56 RUTLAND', 'S SHUTTLE',
       'PM POWELL-MASON CABLE CAR', 'PH POWELL-HYDE CABLE CAR',
       'NOWL OWL JUDAH', 'NBUS JUDAH BUS', 'N JUDAH', 'M OCEAN VIEW',
       'LBUS TARAVAL BUS', 'KBUS INGLESIDE BUS', 'K INGLESIDE',
       'J CHURCH', 'F MARKET & WHARVES', 'CA CALIFORNIA STREET CABLE CAR',
       'LOWL OWL TARAVAL', '66 QUINTARA', '9 SA

In [25]:
sf_routes_gdf.route_short_name.unique()

array(['1', '54', '5', '49', '48', '45', '44', '43', '39', '38R', '38',
       '37', '36', '35', '33', '31', '29', '28R', '28', '27', '24', '23',
       '22', '21', '2', '1X', '19', '18', '14R', '14', '12', '52', '30',
       '55', '9R', '56', 'S', 'PM', 'PH', 'NOWL', 'NBUS', 'N', 'M',
       'LBUS', 'KBUS', 'K', 'J', 'F', 'CA', 'LOWL', '66', '9', '8BX',
       '8AX', '8', '714', '7', '67', '6', '5R', '58', '90', '57', '15',
       '25', '91', 'KLM', 'MBUS', 'T', 'TBUS', 'P'], dtype=object)

In [33]:
sf_route = sf_routes_gdf.loc[sf_routes_gdf.route_combined_name.str.contains("14R MISSION RAPID")]

In [39]:
# Rebind to a new frame rather than assigning into `sf_route`, which is a
# slice of `sf_routes_gdf`; in-place column assignment on that slice raises
# pandas' SettingWithCopyWarning.
# NOTE(review): the string conversion is presumably so the date can be used as
# a category in `.explore()` -- confirm.
sf_route = sf_route.assign(
    service_date=sf_route["service_date"].dt.strftime("%Y-%m-%d %H:%M:%S")
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [40]:
sf_route.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 13 entries, 704 to 767
Data columns (total 22 columns):
 #   Column                         Non-Null Count  Dtype   
---  ------                         --------------  -----   
 0   shape_array_key                13 non-null     object  
 1   geometry                       13 non-null     geometry
 2   feed_key                       13 non-null     object  
 3   schedule_gtfs_dataset_key      13 non-null     object  
 4   direction_id                   13 non-null     float64 
 5   route_key                      13 non-null     object  
 6   route_length                   13 non-null     float64 
 7   route_length_miles             13 non-null     float64 
 8   is_downtown_local              13 non-null     int64   
 9   is_local                       13 non-null     int64   
 10  is_coverage                    13 non-null     int64   
 11  is_rapid                       13 non-null     int64   
 12  is_express                 

In [43]:
# sf_route.explore('service_date')

In [31]:
sf_routes_gdf.drop(columns = ['geometry']).head()

Unnamed: 0,shape_array_key,feed_key,schedule_gtfs_dataset_key,direction_id,route_key,route_length,route_length_miles,is_downtown_local,is_local,is_coverage,is_rapid,is_express,is_rail,organization_source_record_id,organization_name,service_date,name,route_long_name,route_short_name,route_combined_name,route_id
701,bd19d6550464149e5c2cf6afe489a40f,cad7d9dfbba521f1911ebdf004ad2007,7cc0cb1871dfd558f11a2885c145d144,0.0,3f69008b131f28ad617a12f7b8d2638e,9255.12,5.75,2,0,0,0,0,0,rechaapWbeffO33OX,City and County of San Francisco,2024-04-17,Bay Area 511 Muni Schedule,CALIFORNIA,1,1 CALIFORNIA,1
733,b56d412439d528534d11ec28b24f17b8,cad7d9dfbba521f1911ebdf004ad2007,7cc0cb1871dfd558f11a2885c145d144,1.0,1bdb59d1a8487d7887f7decd39c48cad,19493.69,12.11,2,0,0,0,0,0,rechaapWbeffO33OX,City and County of San Francisco,2024-04-17,Bay Area 511 Muni Schedule,FELTON,54,54 FELTON,54
731,e01c0eafeea0ea52186881515eeb6563,cad7d9dfbba521f1911ebdf004ad2007,7cc0cb1871dfd558f11a2885c145d144,1.0,b1038aa6ec1ddd7ec1c340267c4ca915,11151.04,6.93,2,0,0,0,0,0,rechaapWbeffO33OX,City and County of San Francisco,2024-04-17,Bay Area 511 Muni Schedule,FULTON,5,5 FULTON,5
730,97f98d02361ffd767b6b3be8f55a2c72,cad7d9dfbba521f1911ebdf004ad2007,7cc0cb1871dfd558f11a2885c145d144,0.0,1ad227822520d69a676a2efaddd03faa,11270.61,7.0,2,0,0,0,0,0,rechaapWbeffO33OX,City and County of San Francisco,2024-04-17,Bay Area 511 Muni Schedule,VAN NESS-MISSION,49,49 VAN NESS-MISSION,49
729,64fee9183d5690a25434d78927d9d825,cad7d9dfbba521f1911ebdf004ad2007,7cc0cb1871dfd558f11a2885c145d144,1.0,433c2ebded59a3e69207f6650d759056,15130.58,9.4,2,0,0,0,0,0,rechaapWbeffO33OX,City and County of San Francisco,2024-04-17,Bay Area 511 Muni Schedule,QUINTARA-24TH STREET,48,48 QUINTARA-24TH STREET,48


In [None]:
# Direction labels reported for the 14R Mission Rapid on each service date.
is_muni_14r = (
    d4_df.schedule_gtfs_dataset_key == "7cc0cb1871dfd558f11a2885c145d144"
) & (d4_df.route_combined_name == "14R MISSION RAPID")

(
    d4_df.loc[is_muni_14r]
    .sort_values(by=["service_date", "direction_id"])
    .loc[:, ["service_date", "route_primary_direction", "direction_id"]]
    .drop_duplicates()
)

##### Check this out by one route only: this route run by City of Fairfield switches direction in May. 

In [None]:
fairfield_name = _section1_utils.organization_name_crosswalk("City of Fairfield")

In [None]:
# Direction labels reported for Fairfield's route 7 on each service date.
is_fairfield_7 = (
    d4_df.schedule_gtfs_dataset_key == "0f5e1b251db53223200c5bfc365d33f2"
) & (d4_df.route_combined_name == "7 FTC/Solano College/Cordelia Library")

(
    d4_df.loc[is_fairfield_7]
    .sort_values(by=["service_date", "direction_id"])
    .loc[:, ["service_date", "route_primary_direction", "direction_id"]]
    .drop_duplicates()
)

### `schedule_stats_by_routes/cardinal_direction_for_route_direction` 

In [None]:
dates_2023 = rt_dates.y2023_dates[0:4]

In [None]:
dates_2023

In [None]:
STOP_TIMES_FILE = GTFS_DATA_DICT.rt_vs_schedule_tables.stop_times_direction

In [None]:
# Read one stop-times file per date (dropping rows whose direction is
# "Unknown") and concatenate once. Growing a frame with pd.concat inside the
# loop re-copies every previously loaded row on each pass.
stop_times_gdf = pd.concat(
    [
        pd.read_parquet(
            f"{RT_SCHED_GCS}{STOP_TIMES_FILE}_{date}.parquet",
            filters=[[("stop_primary_direction", "!=", "Unknown")]],
        )
        for date in dates_2023
    ]
)

In [None]:
len(
    stop_times_gdf.loc[
        stop_times_gdf.schedule_gtfs_dataset_key == "0f5e1b251db53223200c5bfc365d33f2"
    ]
)

In [None]:
fairfield_stops = stop_times_gdf.loc[
    stop_times_gdf.schedule_gtfs_dataset_key == "0f5e1b251db53223200c5bfc365d33f2"
].reset_index(drop=True)

In [None]:
fairfield_stops.head(2)

In [None]:
trip_scheduled_col = [
    "route_id",
    "trip_instance_key",
    "gtfs_dataset_key",
    "shape_array_key",
    "direction_id",
    "route_long_name",
    "route_short_name",
    "route_desc",
    "name",
]

In [None]:
# Load scheduled trips for every date, then concatenate once instead of
# re-concatenating the accumulated frame on each loop iteration.
trips_df = pd.concat(
    [
        helpers.import_scheduled_trips(
            date, columns=trip_scheduled_col, get_pandas=True
        )
        for date in dates_2023
    ]
)

In [None]:
len(
    trips_df.loc[
        trips_df.schedule_gtfs_dataset_key == "0f5e1b251db53223200c5bfc365d33f2"
    ]
)

In [None]:
city_fairfield_trips = trips_df.loc[
    trips_df.schedule_gtfs_dataset_key == "0f5e1b251db53223200c5bfc365d33f2"
]

In [None]:
city_fairfield_trips = city_fairfield_trips.drop(columns=["route_desc"])

In [None]:
len(
    city_fairfield_trips.loc[
        city_fairfield_trips.route_long_name.str.contains("Cordelia")
    ]
)

In [None]:
cordelia = city_fairfield_trips.loc[
    city_fairfield_trips.route_long_name.str.contains("Cordelia")
]

In [None]:
# Keys shared by the stop-times rows and the trips table.
merge_cols = ["trip_instance_key", "schedule_gtfs_dataset_key", "shape_array_key"]

# Inner join: keep only Fairfield stop-time rows that belong to a "Cordelia" trip.
stop_times_with_trip = fairfield_stops.merge(cordelia, on=merge_cols)

In [None]:
stop_times_with_trip.head(2)

In [None]:
main_cols = ["route_id", "schedule_gtfs_dataset_key", "direction_id"]

# Count stop-time rows for each route-direction / stop cardinal direction pair.
agg1 = (
    stop_times_with_trip.groupby([*main_cols, "stop_primary_direction"])
    .agg(total_stops=("stop_sequence", "count"))
    .reset_index()
)

In [None]:
# Within each route-direction, put the cardinal direction with the most stops first.
agg2 = agg1.sort_values(
    by=[*main_cols, "total_stops"],
    ascending=[True] * len(main_cols) + [False],
)

#### Connect this back to route typologies

In [None]:
ROUTE_DIR_EXPORT = GTFS_DATA_DICT.rt_vs_schedule_tables.sched_route_direction_metrics

In [None]:
RT_SCHED_GCS

In [None]:
ROUTE_DIR_EXPORT

In [None]:
def open_add(file: str, date: str) -> pd.DataFrame:
    """
    Read one parquet file and tag every row with a service-date label.

    Args:
        file: path or URL of the parquet file to read.
        date: value written to a new `service_date` column.

    Returns:
        The contents of `file` with `service_date` set to `date` on every row.
    """
    # Read the path the caller passed in so that each month loads its own
    # file rather than a single fixed one.
    df = pd.read_parquet(file)
    df["service_date"] = date
    return df

In [None]:
march_df = open_add(
    "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_route_dir/schedule_route_direction_metrics_2023-03-15.parquet",
    "3-15",
)

In [None]:
apr_df = open_add(
    "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_route_dir/schedule_route_direction_metrics_2023-04-12.parquet",
    "4-12",
)

In [None]:
may_df = open_add(
    "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_route_dir/schedule_route_direction_metrics_2023-05-17.parquet",
    "5-17",
)

In [None]:
june_df = open_add(
    "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_route_dir/schedule_route_direction_metrics_2023-06-14.parquet",
    "6-14",
)

In [None]:
all_df = pd.concat([march_df, apr_df, may_df, june_df])

In [None]:
route_group_merge_cols = ["schedule_gtfs_dataset_key", "route_id", "direction_id"]

In [None]:
m1 = pd.merge(agg2, all_df, on=route_group_merge_cols, how="left")

In [None]:
m1 = m1.drop(columns=["geometry"])

In [None]:
m1.head(1)

In [None]:
m1[
    ["service_date", "stop_primary_direction", "direction_id", "total_stops"]
].sort_values(
    by=["service_date", "total_stops"], ascending=[True, False]
).drop_duplicates()