# Quick exploration: test merging our GTFS schedule tables with LA Metro's route dataset

In [1]:
import geopandas as gpd
import pandas as pd

from shared_utils import rt_dates
from segment_speed_utils.project_vars import COMPILED_CACHED_VIEWS

# Date string for the analysis, looked up under the "oct2022" key of the shared date dictionary;
# it is interpolated into the cached trips filename below.
analysis_date = rt_dates.DATES["oct2022"]
# GCS folder prefix for this project's files (by_route.parquet is read from here)
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/la_metro_demo/"



## From our GTFS

In [2]:
# Load the cached trips table for analysis_date, restricted to LA Metro's
# bus and rail schedule feeds and to the columns that identify trips,
# shapes, and routes.
metro_feed_names = ["LA Metro Bus Schedule", "LA Metro Rail Schedule"]
trip_cols = [
    "feed_key", "name", "trip_id", "shape_id",
    "shape_array_key", "route_id",
    "route_short_name", "route_long_name",
]

trips = pd.read_parquet(
    f"{COMPILED_CACHED_VIEWS}trips_{analysis_date}_v2.parquet",
    # Outer list = OR, inner list = AND: keep rows matching either feed name
    filters=[[("name", "==", feed_name)] for feed_name in metro_feed_names],
    columns=trip_cols,
)

## From LA Metro

In [3]:
# Read LA Metro's route-level table and add `route_str`, a string copy of
# `route`, which serves as the merge key against GTFS route names.
by_route = (
    pd.read_parquet(f"{GCS_FILE_PATH}by_route.parquet")
    .assign(route_str=lambda df: df.route.astype(str))
)

## Merge these 2 datasets at the route level and see how the merge performs

In [4]:
# Collapse trips to one row per unique combination of feed name and route identifiers
route_cols = ["name", "route_id", "route_short_name", "route_long_name"]

route = (
    trips[route_cols]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [5]:
# Outer-join GTFS routes to LA Metro's table so unmatched rows from both
# sides are kept; the `_merge` indicator column records where each row came from.
# validate="m:1" raises if `route_str` is not unique in by_route.
m1 = route.merge(
    by_route,
    how="outer",
    left_on="route_short_name",
    right_on="route_str",
    validate="m:1",
    indicator=True,
)

In [6]:
# Count rows by merge outcome (both / left_only / right_only)
m1["_merge"].value_counts()

both          103
right_only     23
left_only      17
Name: _merge, dtype: int64

In [7]:
# Distinct GTFS route_short_name values that found no match in LA Metro's table
m1.loc[m1["_merge"] == "left_only", "route_short_name"].unique()

array([None, '10/48', '35/38', '14/37', '224/690', '242/243', '211/215'],
      dtype=object)

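* Will need to fill in `route_short_name` where it is missing (the unmatched rows shown below, rail lines and line shuttles, only have a `route_long_name`).
* Will also need to split combined routes where a "/" appears (e.g., `10/48` into `10` and `48`), since LA Metro's dataset lists these routes individually.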

In [8]:
# Unmatched GTFS rows whose route_short_name is missing
is_left_only = m1["_merge"] == "left_only"
missing_short_name = m1["route_short_name"].isna()
m1[is_left_only & missing_short_name]

Unnamed: 0,name,route_id,route_short_name,route_long_name,route,keep_label,num_bus_cash_oct_2022,num_taps_oct_2022,percent_bus_cash_oct_2022,route_str,_merge
0,LA Metro Rail Schedule,802,,Metro B Line (Red),,,,,,,left_only
1,LA Metro Rail Schedule,803,,Metro C Line (Green),,,,,,,left_only
2,LA Metro Rail Schedule,804,,Metro L Line (Gold),,,,,,,left_only
3,LA Metro Rail Schedule,806,,Metro E Line (Expo),,,,,,,left_only
4,LA Metro Rail Schedule,807,,Metro K Line (Crenshaw),,,,,,,left_only
5,LA Metro Rail Schedule,801,,Metro A Line (Blue),,,,,,,left_only
6,LA Metro Rail Schedule,805,,Metro D Line (Purple),,,,,,,left_only
7,LA Metro Bus Schedule,854-13157,,Metro L Line (Gold) Shuttle 854,,,,,,,left_only
8,LA Metro Bus Schedule,857-13164,,Metro K Line (Crenshaw) Shuttle 857,,,,,,,left_only
9,LA Metro Bus Schedule,910-13157,,Metro J Line (Silver) 910/950,,,,,,,left_only


In [9]:
# Distinct LA Metro route values that found no match among GTFS route_short_name values
m1.loc[m1["_merge"] == "right_only", "route_str"].unique()

array(['259', '215', '211', '163', '52', '770', '48', '242', '235', '38',
       '243', '224', '35', '762', '10', '330', '14', '37', '134', '728',
       '489', '910', '201'], dtype=object)