# `route_id` changes

In [1]:
import pandas as pd

from segment_speed_utils import helpers
from segment_speed_utils.project_vars import SEGMENT_GCS
from shared_utils import rt_dates

# One analysis date per month, April through November 2023.
date_list = [
    rt_dates.DATES[f"{month}2023"]
    for month in ("apr", "may", "jun", "jul", "aug", "sep", "oct", "nov")
]

In [2]:
# Stack the scheduled trips imported for every date in date_list,
# tagging each date's rows with a datetime `service_date` column.
# NOTE(review): later cells use `schedule_gtfs_dataset_key`, so the helper
# presumably renames `gtfs_dataset_key` on import -- confirm.
df = pd.concat(
    [
        helpers.import_scheduled_trips(
            one_date,
            columns = [
                "gtfs_dataset_key", "route_id",
                "route_long_name", "route_short_name", "route_desc",
            ],
            get_pandas = True,
        ).assign(service_date = pd.to_datetime(one_date))
        for one_date in date_list
    ],
    axis = 0,
    ignore_index = True,
)

In [3]:
# Replace missing name parts with the string "None" so concatenation never
# yields NaN, then build two underscore-joined labels:
#   combined_all  = route_id + short name + long name + description
#   combined_name = short name + long name + description
df = (
    df.assign(
        route_short_name = lambda x: x.route_short_name.fillna("None"),
        route_long_name = lambda x: x.route_long_name.fillna("None"),
        route_desc = lambda x: x.route_desc.fillna("None"),
    )
    .assign(
        combined_all = lambda x: (
            x.route_id + "_" + x.route_short_name + "_"
            + x.route_long_name + "_" + x.route_desc
        ),
        combined_name = lambda x: (
            x.route_short_name + "_" + x.route_long_name + "_" + x.route_desc
        ),
    )
)

In [4]:
# Number of service dates being compared (one entry per month in date_list)
n_days = len(date_list)
n_days

8

In [5]:
# Grouping keys shared by the aggregations in this notebook
operator_day_cols = ["schedule_gtfs_dataset_key", "service_date"]
route_day_cols = [*operator_day_cols, "route_id"]
route_cols = ["schedule_gtfs_dataset_key", "route_id"]
route_name_cols = ["schedule_gtfs_dataset_key", "combined_name"]

# Count of distinct route_ids per operator per service date
daily_routes = (
    df.groupby(operator_day_cols)
    .agg(route_id = ("route_id", "nunique"))
    .reset_index()
)

## Problem 1: Count nunique route_ids if the combined name is the same
* combination of `route_short_name`, `route_long_name`, and `route_desc` is the same, but `route_id` is different
* find a couple of examples where, across the 8 service dates, the same combined name maps to at least 3 different `route_id` values

In [6]:
# For each operator + combined name, count the distinct route_ids seen
combinations_id = (
    df.groupby(route_name_cols)
    .agg(route_id = ("route_id", "nunique"))
    .reset_index()
)

In [7]:
# Distribution: how many names map to 1, 2, 3, ... distinct route_ids
combinations_id["route_id"].value_counts()

1     3769
2      195
3       67
4       19
43       1
Name: route_id, dtype: int64

In [8]:
# Preview names that map to 3 or more distinct route_ids
(combinations_id
 .query('route_id > 2')
).head(3)

Unnamed: 0,schedule_gtfs_dataset_key,combined_name,route_id
1099,4c6b107352b318297bb39173c796f357,01_FAX Q_None,4
1100,4c6b107352b318297bb39173c796f357,03_HERNDON_None,4
1101,4c6b107352b318297bb39173c796f357,09_SHAW_None,4


In [9]:
# Same preview, excluding the operator shown above to surface a second example
(combinations_id
 .loc[lambda x: x.schedule_gtfs_dataset_key != "4c6b107352b318297bb39173c796f357"]
 .query('route_id > 2')
).head(3)

Unnamed: 0,schedule_gtfs_dataset_key,combined_name,route_id
1927,95cb514215c61ca578b01d885f35ec0a,10_FONTANA-BASELINE-SAN BDNO_None,3
1928,95cb514215c61ca578b01d885f35ec0a,14_FONTANA-FOOTHILL-SAN BDNO_None,3
1929,95cb514215c61ca578b01d885f35ec0a,15_FONTANA-SAN BDNO-REDLANDS_None,3


In [10]:
def subset_by_operator_name(
    df: pd.DataFrame,
    gtfs_key: str,
    one_name: str,
) -> pd.DataFrame:
    """
    Return the rows for one operator and one combined route name.

    Keeps rows whose `schedule_gtfs_dataset_key` equals `gtfs_key` and whose
    `combined_name` equals `one_name`. The individual name columns
    (`route_long_name`, `route_short_name`, `route_desc`) are dropped from
    the result; the input frame is not modified.
    """
    is_operator = df.schedule_gtfs_dataset_key == gtfs_key
    is_name = df.combined_name == one_name
    name_part_cols = ["route_long_name", "route_short_name", "route_desc"]

    return df[is_operator & is_name].drop(columns = name_part_cols)

In [11]:
subset_by_operator_name(
    df,
    gtfs_key = "4c6b107352b318297bb39173c796f357",
    one_name = "01_FAX Q_None",
)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,service_date,combined_all,combined_name
674,4c6b107352b318297bb39173c796f357,3643,2023-04-12,3643_01_FAX Q_None,01_FAX Q_None
5590,4c6b107352b318297bb39173c796f357,3687,2023-06-14,3687_01_FAX Q_None,01_FAX Q_None
8056,4c6b107352b318297bb39173c796f357,3687,2023-07-12,3687_01_FAX Q_None,01_FAX Q_None
10491,4c6b107352b318297bb39173c796f357,3727,2023-08-15,3727_01_FAX Q_None,01_FAX Q_None
13131,4c6b107352b318297bb39173c796f357,3727,2023-09-13,3727_01_FAX Q_None,01_FAX Q_None
15741,4c6b107352b318297bb39173c796f357,3727,2023-10-11,3727_01_FAX Q_None,01_FAX Q_None
18395,4c6b107352b318297bb39173c796f357,3745,2023-11-15,3745_01_FAX Q_None,01_FAX Q_None


In [12]:
# A route_id seen on earlier service dates stops appearing on later ones:
# list every row for this operator's route_id 3687 to see which dates it covers.
df.loc[
    (df.schedule_gtfs_dataset_key == "4c6b107352b318297bb39173c796f357")
    & (df.route_id == "3687")
]

Unnamed: 0,schedule_gtfs_dataset_key,route_id,route_long_name,route_short_name,route_desc,service_date,combined_all,combined_name
5590,4c6b107352b318297bb39173c796f357,3687,FAX Q,1,,2023-06-14,3687_01_FAX Q_None,01_FAX Q_None
8056,4c6b107352b318297bb39173c796f357,3687,FAX Q,1,,2023-07-12,3687_01_FAX Q_None,01_FAX Q_None


In [13]:
subset_by_operator_name(
    df,
    gtfs_key = "95cb514215c61ca578b01d885f35ec0a",
    one_name = "14_FONTANA-FOOTHILL-SAN BDNO_None",
)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,service_date,combined_all,combined_name
613,95cb514215c61ca578b01d885f35ec0a,10662,2023-04-12,10662_14_FONTANA-FOOTHILL-SAN BDNO_None,14_FONTANA-FOOTHILL-SAN BDNO_None
3022,95cb514215c61ca578b01d885f35ec0a,10764,2023-05-17,10764_14_FONTANA-FOOTHILL-SAN BDNO_None,14_FONTANA-FOOTHILL-SAN BDNO_None
5530,95cb514215c61ca578b01d885f35ec0a,10764,2023-06-14,10764_14_FONTANA-FOOTHILL-SAN BDNO_None,14_FONTANA-FOOTHILL-SAN BDNO_None
7997,95cb514215c61ca578b01d885f35ec0a,10764,2023-07-12,10764_14_FONTANA-FOOTHILL-SAN BDNO_None,14_FONTANA-FOOTHILL-SAN BDNO_None
10431,95cb514215c61ca578b01d885f35ec0a,10935,2023-08-15,10935_14_FONTANA-FOOTHILL-SAN BDNO_None,14_FONTANA-FOOTHILL-SAN BDNO_None
13071,95cb514215c61ca578b01d885f35ec0a,10935,2023-09-13,10935_14_FONTANA-FOOTHILL-SAN BDNO_None,14_FONTANA-FOOTHILL-SAN BDNO_None
15680,95cb514215c61ca578b01d885f35ec0a,10935,2023-10-11,10935_14_FONTANA-FOOTHILL-SAN BDNO_None,14_FONTANA-FOOTHILL-SAN BDNO_None
18334,95cb514215c61ca578b01d885f35ec0a,10935,2023-11-15,10935_14_FONTANA-FOOTHILL-SAN BDNO_None,14_FONTANA-FOOTHILL-SAN BDNO_None


## Problem 2: Assume route_id is stable, find nunique combined_name 
* `route_id` for operator is same, but `combined_name` is different
* this is probably easier to fix: we can designate the latest combined name as the one that is displayed
* fixing this will not address problem 1

In [14]:
# For each operator + route_id, count the distinct combined names seen
combinations = (
    df.groupby(route_cols)
    .agg(combined_name = ("combined_name", "nunique"))
    .reset_index()
)

# Distribution: how many route_ids carry 1, 2, 3, ... distinct names
combinations["combined_name"].value_counts()

1    3748
2     350
3       9
4       1
Name: combined_name, dtype: int64

In [15]:
# Preview route_ids that appear under 3 or more distinct combined names
combinations.query('combined_name > 2').head(10)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,combined_name
1493,6b09003d3a547f37aba623fe8211b355,115,3
1494,6b09003d3a547f37aba623fe8211b355,116,3
2336,a37760dde6b9fdcb76b82e57afab7274,2512,3
2340,a37760dde6b9fdcb76b82e57afab7274,2590,3
2368,a37760dde6b9fdcb76b82e57afab7274,N2601,3
2696,b9473e19aebf7ee2ec18623eb35762a1,42947,3
4004,fb467982dcc77a7f9199bebe709bb700,37,3
4010,fb467982dcc77a7f9199bebe709bb700,51,4
4012,fb467982dcc77a7f9199bebe709bb700,52,3
4014,fb467982dcc77a7f9199bebe709bb700,55,3


In [16]:
def subset_by_operator_route(
    df: pd.DataFrame,
    gtfs_key: str,
    one_route: str,
) -> pd.DataFrame:
    """
    Return the rows for one operator and one route_id.

    Keeps rows whose `schedule_gtfs_dataset_key` equals `gtfs_key` and whose
    `route_id` equals `one_route`. The individual name columns
    (`route_short_name`, `route_long_name`, `route_desc`) are dropped from
    the result; the input frame is not modified.
    """
    is_operator = df.schedule_gtfs_dataset_key == gtfs_key
    is_route = df.route_id == one_route
    name_part_cols = ["route_short_name", "route_long_name", "route_desc"]

    return df[is_operator & is_route].drop(columns = name_part_cols)

In [17]:
# Keep one row per distinct name so each naming variant shows once
subset_by_operator_route(
    df.drop_duplicates(subset = [*route_cols, "combined_name"]),
    gtfs_key = "6b09003d3a547f37aba623fe8211b355",
    one_route = "115",
)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,service_date,combined_all,combined_name
1950,6b09003d3a547f37aba623fe8211b355,115,2023-04-12,115_1_Route 1-Blue Wal-Mart / El Dorado_Route ...,1_Route 1-Blue Wal-Mart / El Dorado_Route 1 - ...
6705,6b09003d3a547f37aba623fe8211b355,115,2023-06-14,115_1_Route 1-Blue Wal-Mart / El Dorado_None,1_Route 1-Blue Wal-Mart / El Dorado_None
11785,6b09003d3a547f37aba623fe8211b355,115,2023-08-15,115_1_Route 1-Blue Wal-Mart / DNHS College_Rou...,1_Route 1-Blue Wal-Mart / DNHS College_Route 1...


In [18]:
# Keep one row per distinct name so each naming variant shows once
subset_by_operator_route(
    df.drop_duplicates(subset = [*route_cols, "combined_name"]),
    gtfs_key = "fb467982dcc77a7f9199bebe709bb700",
    one_route = "55",
)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,service_date,combined_all,combined_name
1059,fb467982dcc77a7f9199bebe709bb700,55,2023-04-12,55_55_Old Ironsides Station - De Anza College_...,55_Old Ironsides Station - De Anza College_None
5899,fb467982dcc77a7f9199bebe709bb700,55,2023-06-14,55_55_Old Ironsides Stn - De Anza Coll_None,55_Old Ironsides Stn - De Anza Coll_None
10877,fb467982dcc77a7f9199bebe709bb700,55,2023-08-15,55_55_Old Ironsides Stn - De Anza Coll_Note: T...,55_Old Ironsides Stn - De Anza Coll_Note: The ...
