# `route_id` changes

One possible heuristic: if the `combined_name` is the same, select the latest `route_id`.
* For Metro, let's parse out the suffix
* For others, leave it

When `route_id` is stable but there are different `combined_name` values, use the name that is most prevalent. Resolve this case first, before addressing the changing-`route_id` case described above.

In [1]:
import pandas as pd

from segment_speed_utils import helpers
from segment_speed_utils.project_vars import SEGMENT_GCS
from shared_utils import rt_dates

# Three-letter month abbreviations (March through December) used to build
# the lookup keys for `rt_dates.DATES` below.
months = ["mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"]

# One analysis date per month, looked up with keys such as "mar2023".
date_list = [rt_dates.DATES[f"{month}2023"] for month in months]

In [2]:
# Stack the route-level columns of the scheduled trips table for every
# analysis date into a single frame, tagging each day's rows with `service_date`.
# NOTE(review): later cells reference `schedule_gtfs_dataset_key`, while the
# column requested here is `gtfs_dataset_key`; presumably the helper renames
# it -- confirm against `helpers.import_scheduled_trips`.
df = pd.concat(
    [
        helpers.import_scheduled_trips(
            one_date,
            columns=[
                "gtfs_dataset_key", "name", "route_id",
                "route_long_name", "route_short_name", "route_desc",
            ],
            get_pandas=True,
        ).assign(service_date=pd.to_datetime(one_date))
        for one_date in date_list
    ],
    axis=0,
    ignore_index=True,
)

In [3]:
# Replace missing route identifiers / names with empty strings so that the
# string concatenation below never produces NaN. `route_desc` is deliberately
# left unfilled. Then build two composite labels:
#   combined_all  = route_id + "_" + route_short_name + "_" + route_long_name
#   combined_name = route_short_name + "_" + route_long_name
df = (
    df.assign(**{
        col: df[col].fillna("")
        for col in ("route_id", "route_short_name", "route_long_name")
    })
    .assign(
        combined_all=lambda x: (
            x.route_id + "_" + x.route_short_name + "_" + x.route_long_name
        ),
        combined_name=lambda x: x.route_short_name + "_" + x.route_long_name,
    )
)

In [4]:
# Number of service dates loaded into `df` (one per entry in `date_list`).
n_days = len(date_list)
n_days

10

In [5]:
# Column groupings reused by the groupbys in the rest of the notebook.
operator_day_cols = ["schedule_gtfs_dataset_key", "service_date"]
route_day_cols = [*operator_day_cols, "route_id"]
route_cols = ["schedule_gtfs_dataset_key", "route_id"]
route_name_cols = ["schedule_gtfs_dataset_key", "combined_name"]

# Number of distinct route_ids per operator per service date.
daily_routes = (
    df.groupby(operator_day_cols)["route_id"]
    .nunique()
    .reset_index()
)

## Problem 1: Count nunique route_ids if the combined name is the same
* combination of `route_short_name` and `route_long_name` (i.e. `combined_name`; note that `route_desc` is not part of it) is the same, but `route_id` is different
* find a couple of examples where, across the 10 service dates loaded above, there are at least 3 different `route_id` values

In [6]:
# For each operator + combined_name, count how many distinct route_ids appear
# across all service dates.
combinations_id = (
    df.groupby(route_name_cols)["route_id"]
    .nunique()
    .reset_index()
)

In [7]:
# Distribution of the number of distinct route_ids per operator-combined_name pair.
combinations_id.route_id.value_counts()

1     4836
3      159
2       93
4       37
43       1
39       1
Name: route_id, dtype: int64

In [8]:
# Same distribution, expressed as a share of all operator-combined_name pairs.
combinations_id.route_id.value_counts(normalize=True)

1     0.943242
3     0.031012
2     0.018139
4     0.007217
43    0.000195
39    0.000195
Name: route_id, dtype: float64

In [9]:
# Inspect the operator-combined_name pair that maps to 39 distinct route_ids.
combinations_id.query('route_id == 39')

Unnamed: 0,schedule_gtfs_dataset_key,combined_name,route_id
4800,f449c9c8d400dd385d7bc216748d29da,_Amtrak Thruway Connecting Service,39


In [10]:
# All rows in `df` for the operator key found in the previous cell.
df[df.schedule_gtfs_dataset_key=="f449c9c8d400dd385d7bc216748d29da"]

Unnamed: 0,schedule_gtfs_dataset_key,name,route_id,route_long_name,route_short_name,route_desc,service_date,combined_all,combined_name
498,f449c9c8d400dd385d7bc216748d29da,Amtrak Schedule,29498,Amtrak Thruway Connecting Service,,,2023-03-15,29498__Amtrak Thruway Connecting Service,_Amtrak Thruway Connecting Service
499,f449c9c8d400dd385d7bc216748d29da,Amtrak Schedule,37329,Amtrak Thruway Connecting Service,,,2023-03-15,37329__Amtrak Thruway Connecting Service,_Amtrak Thruway Connecting Service
500,f449c9c8d400dd385d7bc216748d29da,Amtrak Schedule,15159,Amtrak Thruway Connecting Service,,,2023-03-15,15159__Amtrak Thruway Connecting Service,_Amtrak Thruway Connecting Service
501,f449c9c8d400dd385d7bc216748d29da,Amtrak Schedule,42949,Amtrak Thruway Connecting Service,,,2023-03-15,42949__Amtrak Thruway Connecting Service,_Amtrak Thruway Connecting Service
502,f449c9c8d400dd385d7bc216748d29da,Amtrak Schedule,23565,Amtrak Thruway Connecting Service,,,2023-03-15,23565__Amtrak Thruway Connecting Service,_Amtrak Thruway Connecting Service
...,...,...,...,...,...,...,...,...,...
2170,f449c9c8d400dd385d7bc216748d29da,Amtrak Schedule,91,Silver Meteor,,,2023-03-15,91__Silver Meteor,_Silver Meteor
2171,f449c9c8d400dd385d7bc216748d29da,Amtrak Schedule,70,Silver Star,,,2023-03-15,70__Silver Star,_Silver Star
2172,f449c9c8d400dd385d7bc216748d29da,Amtrak Schedule,81,Lake Shore Limited,,,2023-03-15,81__Lake Shore Limited,_Lake Shore Limited
2173,f449c9c8d400dd385d7bc216748d29da,Amtrak Schedule,58,Cardinal,,,2023-03-15,58__Cardinal,_Cardinal


In [11]:
# First few operator-combined_name pairs that have 3 or more distinct route_ids.
combinations_id.query('route_id > 2').head(3)

Unnamed: 0,schedule_gtfs_dataset_key,combined_name,route_id
953,3f3f36b4c41cc6b5df3eb7f5d8ea6e3c,10/48_Metro Local Line,3
954,3f3f36b4c41cc6b5df3eb7f5d8ea6e3c,102_Metro Local Line,3
955,3f3f36b4c41cc6b5df3eb7f5d8ea6e3c,105_Metro Local Line,3


In [12]:
# Same filter as above, but skipping one operator key (the feed whose `name`
# is "Fresno Schedule" in `df`).
combinations_id[combinations_id.schedule_gtfs_dataset_key != 
                "4c6b107352b318297bb39173c796f357"].query('route_id > 2').head(3)

Unnamed: 0,schedule_gtfs_dataset_key,combined_name,route_id
953,3f3f36b4c41cc6b5df3eb7f5d8ea6e3c,10/48_Metro Local Line,3
954,3f3f36b4c41cc6b5df3eb7f5d8ea6e3c,102_Metro Local Line,3
955,3f3f36b4c41cc6b5df3eb7f5d8ea6e3c,105_Metro Local Line,3


In [13]:
def subset_by_operator_name(
    df: pd.DataFrame, 
    gtfs_key: str, 
    one_name: str,
):
    """
    Keep only the rows belonging to one operator feed
    (`schedule_gtfs_dataset_key == gtfs_key`) and one `combined_name`,
    then drop the individual route name / description columns so the
    remaining columns are easier to read.
    """
    is_operator = df.schedule_gtfs_dataset_key == gtfs_key
    is_name = df.combined_name == one_name
    
    name_part_cols = ["route_long_name", "route_short_name", "route_desc"]
    
    return df.loc[is_operator & is_name].drop(columns = name_part_cols)

In [14]:
# Example: route_id values seen across service dates for one combined_name
# of this operator.
subset_by_operator_name(df, "4c6b107352b318297bb39173c796f357", "01_FAX Q") 

Unnamed: 0,schedule_gtfs_dataset_key,name,route_id,service_date,combined_all,combined_name
684,4c6b107352b318297bb39173c796f357,Fresno Schedule,3643,2023-03-15,3643_01_FAX Q,01_FAX Q
3031,4c6b107352b318297bb39173c796f357,Fresno Schedule,3643,2023-04-12,3643_01_FAX Q,01_FAX Q
7947,4c6b107352b318297bb39173c796f357,Fresno Schedule,3687,2023-06-14,3687_01_FAX Q,01_FAX Q
10413,4c6b107352b318297bb39173c796f357,Fresno Schedule,3687,2023-07-12,3687_01_FAX Q,01_FAX Q
12848,4c6b107352b318297bb39173c796f357,Fresno Schedule,3727,2023-08-15,3727_01_FAX Q,01_FAX Q
15488,4c6b107352b318297bb39173c796f357,Fresno Schedule,3727,2023-09-13,3727_01_FAX Q,01_FAX Q
18098,4c6b107352b318297bb39173c796f357,Fresno Schedule,3727,2023-10-11,3727_01_FAX Q,01_FAX Q
20752,4c6b107352b318297bb39173c796f357,Fresno Schedule,3745,2023-11-15,3745_01_FAX Q,01_FAX Q
23372,4c6b107352b318297bb39173c796f357,Fresno Schedule,3745,2023-12-13,3745_01_FAX Q,01_FAX Q


In [15]:
# The route_id earlier will disappear later
df[(df.schedule_gtfs_dataset_key=="4c6b107352b318297bb39173c796f357") & 
   (df.route_id=="3687")
  ]

Unnamed: 0,schedule_gtfs_dataset_key,name,route_id,route_long_name,route_short_name,route_desc,service_date,combined_all,combined_name
7947,4c6b107352b318297bb39173c796f357,Fresno Schedule,3687,FAX Q,1,,2023-06-14,3687_01_FAX Q,01_FAX Q
10413,4c6b107352b318297bb39173c796f357,Fresno Schedule,3687,FAX Q,1,,2023-07-12,3687_01_FAX Q,01_FAX Q


In [16]:
# Second example: another operator-combined_name pair, to see how its
# route_id varies by service date.
subset_by_operator_name(df, "95cb514215c61ca578b01d885f35ec0a", "14_FONTANA-FOOTHILL-SAN BDNO") 

Unnamed: 0,schedule_gtfs_dataset_key,name,route_id,service_date,combined_all,combined_name
2970,95cb514215c61ca578b01d885f35ec0a,OmniTrans Schedule,10662,2023-04-12,10662_14_FONTANA-FOOTHILL-SAN BDNO,14_FONTANA-FOOTHILL-SAN BDNO
5379,95cb514215c61ca578b01d885f35ec0a,OmniTrans Schedule,10764,2023-05-17,10764_14_FONTANA-FOOTHILL-SAN BDNO,14_FONTANA-FOOTHILL-SAN BDNO
7887,95cb514215c61ca578b01d885f35ec0a,OmniTrans Schedule,10764,2023-06-14,10764_14_FONTANA-FOOTHILL-SAN BDNO,14_FONTANA-FOOTHILL-SAN BDNO
10354,95cb514215c61ca578b01d885f35ec0a,OmniTrans Schedule,10764,2023-07-12,10764_14_FONTANA-FOOTHILL-SAN BDNO,14_FONTANA-FOOTHILL-SAN BDNO
12788,95cb514215c61ca578b01d885f35ec0a,OmniTrans Schedule,10935,2023-08-15,10935_14_FONTANA-FOOTHILL-SAN BDNO,14_FONTANA-FOOTHILL-SAN BDNO
15428,95cb514215c61ca578b01d885f35ec0a,OmniTrans Schedule,10935,2023-09-13,10935_14_FONTANA-FOOTHILL-SAN BDNO,14_FONTANA-FOOTHILL-SAN BDNO
18037,95cb514215c61ca578b01d885f35ec0a,OmniTrans Schedule,10935,2023-10-11,10935_14_FONTANA-FOOTHILL-SAN BDNO,14_FONTANA-FOOTHILL-SAN BDNO
20691,95cb514215c61ca578b01d885f35ec0a,OmniTrans Schedule,10935,2023-11-15,10935_14_FONTANA-FOOTHILL-SAN BDNO,14_FONTANA-FOOTHILL-SAN BDNO
23311,95cb514215c61ca578b01d885f35ec0a,OmniTrans Schedule,10935,2023-12-13,10935_14_FONTANA-FOOTHILL-SAN BDNO,14_FONTANA-FOOTHILL-SAN BDNO


In [17]:
# Third example: one of the operator-combined_name pairs listed above as
# having 3 distinct route_ids.
subset_by_operator_name(df, "3f3f36b4c41cc6b5df3eb7f5d8ea6e3c", "102_Metro Local Line")

Unnamed: 0,schedule_gtfs_dataset_key,name,route_id,service_date,combined_all,combined_name
1506,3f3f36b4c41cc6b5df3eb7f5d8ea6e3c,LA Metro Bus Schedule,102-13167,2023-03-15,102-13167_102_Metro Local Line,102_Metro Local Line
3901,3f3f36b4c41cc6b5df3eb7f5d8ea6e3c,LA Metro Bus Schedule,102-13167,2023-04-12,102-13167_102_Metro Local Line,102_Metro Local Line
6317,3f3f36b4c41cc6b5df3eb7f5d8ea6e3c,LA Metro Bus Schedule,102-13167,2023-05-17,102-13167_102_Metro Local Line,102_Metro Local Line
8670,3f3f36b4c41cc6b5df3eb7f5d8ea6e3c,LA Metro Bus Schedule,102-13167,2023-06-14,102-13167_102_Metro Local Line,102_Metro Local Line
11196,3f3f36b4c41cc6b5df3eb7f5d8ea6e3c,LA Metro Bus Schedule,102-13168,2023-07-12,102-13168_102_Metro Local Line,102_Metro Local Line
13727,3f3f36b4c41cc6b5df3eb7f5d8ea6e3c,LA Metro Bus Schedule,102-13168,2023-08-15,102-13168_102_Metro Local Line,102_Metro Local Line
16310,3f3f36b4c41cc6b5df3eb7f5d8ea6e3c,LA Metro Bus Schedule,102-13168,2023-09-13,102-13168_102_Metro Local Line,102_Metro Local Line
18993,3f3f36b4c41cc6b5df3eb7f5d8ea6e3c,LA Metro Bus Schedule,102-13168,2023-10-11,102-13168_102_Metro Local Line,102_Metro Local Line
21681,3f3f36b4c41cc6b5df3eb7f5d8ea6e3c,LA Metro Bus Schedule,102-13168,2023-11-15,102-13168_102_Metro Local Line,102_Metro Local Line
24286,3f3f36b4c41cc6b5df3eb7f5d8ea6e3c,LA Metro Bus Schedule,102-13172,2023-12-13,102-13172_102_Metro Local Line,102_Metro Local Line


## Problem 2: Assume route_id is stable, find nunique combined_name 
* `route_id` for operator is same, but `combined_name` is different
* this is probably easier to fix, we can designate the latest combined name to be what is displayed
* fixing this will not address problem 1

In [18]:
# How many unique combinations are there
combinations = (df.groupby(route_cols)
                .agg({"combined_name": "nunique"})
                .reset_index()
               )

combinations.combined_name.value_counts()

1    5012
2     351
3       5
Name: combined_name, dtype: int64

In [19]:
# Operator-route_id pairs that have more than 2 distinct combined_name values.
(combinations
 .query('combined_name > 2')
).head(10)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,combined_name
3167,a37760dde6b9fdcb76b82e57afab7274,2512,3
3171,a37760dde6b9fdcb76b82e57afab7274,2590,3
3200,a37760dde6b9fdcb76b82e57afab7274,N2601,3
3567,b9473e19aebf7ee2ec18623eb35762a1,42947,3
5232,fb467982dcc77a7f9199bebe709bb700,52,3


In [20]:
def subset_by_operator_route(
    df: pd.DataFrame, 
    gtfs_key: str, 
    one_route: str,
):
    """
    Keep only the rows belonging to one operator feed
    (`schedule_gtfs_dataset_key == gtfs_key`) and one `route_id`,
    then drop the individual route name / description columns so the
    remaining columns are easier to read.
    """
    is_operator = df.schedule_gtfs_dataset_key == gtfs_key
    is_route = df.route_id == one_route
    
    name_part_cols = ["route_short_name", "route_long_name", "route_desc"]
    
    return df.loc[is_operator & is_route].drop(columns = name_part_cols)

In [21]:
# One row per distinct combined_name observed for this operator-route_id
# (duplicates across service dates are dropped before subsetting).
subset_by_operator_route(
    df.drop_duplicates(subset = route_cols + ["combined_name"]), 
    "6b09003d3a547f37aba623fe8211b355", 
    "115"
)

Unnamed: 0,schedule_gtfs_dataset_key,name,route_id,service_date,combined_all,combined_name
1921,6b09003d3a547f37aba623fe8211b355,Redwood Coast Schedule,115,2023-03-15,115_1_Route 1-Blue Wal-Mart / El Dorado,1_Route 1-Blue Wal-Mart / El Dorado
14142,6b09003d3a547f37aba623fe8211b355,Redwood Coast Schedule,115,2023-08-15,115_1_Route 1-Blue Wal-Mart / DNHS College,1_Route 1-Blue Wal-Mart / DNHS College


In [22]:
# List the combined_name of every row in `df` for this operator-route_id
# (no de-duplication, so repeated names show how often each one occurs).
subset_by_operator_route(
    df,
    #df.drop_duplicates(subset = route_cols + ["combined_name"]), 
    "fb467982dcc77a7f9199bebe709bb700", 
    "55"
).combined_name.tolist()

['55_Old Ironsides Station - De Anza College',
 '55_Old Ironsides Station - De Anza College',
 '55_Old Ironsides Stn - De Anza Coll',
 '55_Old Ironsides Stn - De Anza Coll',
 '55_Old Ironsides Stn - De Anza Coll',
 '55_Old Ironsides Stn - De Anza Coll',
 '55_Old Ironsides Stn - De Anza Coll',
 '55_Old Ironsides Stn - De Anza Coll',
 '55_Old Ironsides Stn - De Anza Coll']

## Solutions

Add functions to find most common and most recent **route_id or combined_name**.

In [23]:
def most_prevalent_route_info(
    df: pd.DataFrame, 
    group_cols: list, 
    route_col: str
) -> pd.DataFrame:
    """
    Attach the most frequently occurring value of `route_col` within each
    `group_cols` group as a new column named `common_{route_col}`.
    
    Frequency is the number of rows in `df` carrying that value within the
    group. If several values are tied for most frequent, the largest value
    is used. Every row of `df` is kept (left merge); rows whose group has no
    non-missing `route_col` value get a missing `common_{route_col}`.
    
    Previously this took the per-group `.max()`, which returns the largest
    value rather than the most prevalent one.
    """
    group_cols = list(group_cols)
    count_col = "_n_rows"
    
    # Number of rows for each distinct value of route_col within each group
    value_counts = (df.groupby(group_cols + [route_col])
                    .size()
                    .reset_index(name = count_col)
                   )
    
    # Within each group, put the highest count first; break ties on the
    # largest value so the result is deterministic.
    sort_cols = group_cols + [count_col, route_col]
    sort_order = [True] * len(group_cols) + [False, False]
    
    most_prevalent = (value_counts
                      .sort_values(sort_cols, ascending = sort_order)
                      .drop_duplicates(subset = group_cols)
                      [group_cols + [route_col]]
                      .rename(columns = {route_col: f"common_{route_col}"})
                     )
    
    df_with_common = pd.merge(
        df,
        most_prevalent,
        on = group_cols,
        how = "left"
    )
    
    return df_with_common

def most_recent_route_info(
    df: pd.DataFrame,
    group_cols: list,
    route_col: str
) -> pd.DataFrame:
    """
    Attach the value of `route_col` found on the latest `service_date`
    within each `group_cols` group as a new column named
    `recent_{route_col}`. Every row of `df` is kept (left merge).
    """
    recent_col = f"recent_{route_col}"
    
    # Groups ascending, newest service_date first within each group,
    # so the first row per group is the most recent one.
    ascending_flags = [True] * len(group_cols) + [False]
    newest_first = df.sort_values(
        group_cols + ["service_date"], 
        ascending = ascending_flags
    )
    
    latest_per_group = (newest_first
                        .drop_duplicates(subset = group_cols)
                        .rename(columns = {route_col: recent_col})
                        [group_cols + [recent_col]]
                       )
    
    return df.merge(latest_per_group, on = group_cols, how = "left")

In [24]:
# Preview: for each operator-route_id, attach the combined_name from its
# latest service_date as `recent_combined_name`.
most_recent_route_info(
    df,
    group_cols = ["name"] + route_cols,
    route_col = "combined_name"
).head(3)

Unnamed: 0,schedule_gtfs_dataset_key,name,route_id,route_long_name,route_short_name,route_desc,service_date,combined_all,combined_name,recent_combined_name
0,7e015887964432c82ce7e735c2753f86,VCTC GMV Schedule,3402,Route 11,Route 11,PACIFIC VIEW MALL via TELEPHONE RD,2023-03-15,3402_Route 11_Route 11,Route 11_Route 11,Route 11_Route 11
1,7e015887964432c82ce7e735c2753f86,VCTC GMV Schedule,3407,Route 18,Route 18,OHS TRIPPER - C ST - DORIS ST,2023-03-15,3407_Route 18_Route 18,Route 18_Route 18,Route 18_Route 18
2,7e015887964432c82ce7e735c2753f86,VCTC GMV Schedule,3408,Route 21,Route 21,PACIFIC VIEW MALL via VICTORIA AVE,2023-03-15,3408_Route 21_Route 21,Route 21_Route 21,Route 21_Route 21


In [25]:
# Preview: for each operator-combined_name, attach `common_route_id` and show
# only the rows whose own route_id differs from it.
most_prevalent_route_info(
    df,
    group_cols = ["name"] + route_name_cols,
    route_col = "route_id"
).query('route_id != common_route_id').head()

Unnamed: 0,schedule_gtfs_dataset_key,name,route_id,route_long_name,route_short_name,route_desc,service_date,combined_all,combined_name,common_route_id
92,08862f6780e2bb5afce698a73f6974cc,LA DOT Schedule,1458,CE438B,438B,Commuter Express 438B - Imperial-Aviation,2023-03-15,1458_438B_CE438B,438B_CE438B,4278
221,dbbe8ee4864a2715a40749605395d584,Big Blue Bus Schedule,3554,Main St & Santa Monica Blvd/UCLA,1,,2023-03-15,3554_1_Main St & Santa Monica Blvd/UCLA,1_Main St & Santa Monica Blvd/UCLA,3629
222,dbbe8ee4864a2715a40749605395d584,Big Blue Bus Schedule,3555,Wilshire Blvd/UCLA,2,,2023-03-15,3555_2_Wilshire Blvd/UCLA,2_Wilshire Blvd/UCLA,3630
223,dbbe8ee4864a2715a40749605395d584,Big Blue Bus Schedule,3556,Lincoln Blvd/LAX,3,,2023-03-15,3556_3_Lincoln Blvd/LAX,3_Lincoln Blvd/LAX,3631
224,dbbe8ee4864a2715a40749605395d584,Big Blue Bus Schedule,3558,Olympic Blvd,5,,2023-03-15,3558_5_Olympic Blvd,5_Olympic Blvd,3633


### Parse for hyphens and underscores

In [26]:
# Distinct (operator name, route_id) pairs whose route_id contains a hyphen
# (`routes_to_parse`) or an underscore (`routes_to_parse2`).
routes_to_parse = (
    df.loc[df.route_id.str.contains("-"), ["name", "route_id"]]
    .drop_duplicates()
)
routes_to_parse2 = (
    df.loc[df.route_id.str.contains("_"), ["name", "route_id"]]
    .drop_duplicates()
)

In [27]:
# Substrings of the operator `name` for feeds whose route_id carries a
# hyphen-separated suffix. The same list is repeated inside `parse_route_id`.
OPERATORS_USE_HYPHENS = [
    "Monterey Salinas", "LA Metro",
    "BART", # Beige - N, Beige - S
    "MVGO", # B - AM, B - PM
]

# Substrings of the operator `name` for feeds whose route_id carries an
# underscore-separated suffix.
OPERATORS_USE_UNDERSCORES = [
    "Roseville", # 5_AM, 5_PM
]

# Scratch loop for eyeballing every operator's route_ids (left disabled):
#for i in df.name.unique():
#    print(i)
#    print(df[(df.name == i)].route_id.unique())

In [28]:
# Operators flagged for a closer look; they are not in the hyphen/underscore
# lists above. This list is not referenced elsewhere in the visible cells.
inspect_more = ["Bay Area 511 ACE Schedule",
                
                # These have dashes, which may make it harder to link across time?
                "Yolobus", "Santa Maria",
                "BruinBus",
                "TCRTA", # some dashes
                "B-Line", "Anaheim Resort"
                
               ] 

In [29]:
def parse_route_id(row, gtfs_name_col: str, route_col: str):
    """
    Return the route identifier in `row[route_col]` with any operator-specific
    suffix removed.
    
    If the operator name in `row[gtfs_name_col]` contains one of the
    hyphen-using operator names, keep only the text before the first "-".
    If it contains one of the underscore-using operator names, keep only the
    text before the first "_". Surrounding whitespace is stripped from the
    result. Other operators' route identifiers are only stripped.
    """
    parsed = row[route_col]
    feed_name = row[gtfs_name_col]
    
    hyphen_operators = (
        "Monterey Salinas", "LA Metro",
        "BART", # Beige - N, Beige - S
        "MVGO", # B - AM, B - PM
    )
    underscore_operators = (
        "Roseville", # 5_AM, 5_PM
    )
    
    # Hyphen rule is applied first, then the underscore rule
    separator_rules = (
        ("-", hyphen_operators),
        ("_", underscore_operators),
    )
    
    for separator, operators in separator_rules:
        if any(operator in feed_name for operator in operators):
            parsed = parsed.split(separator)[0]
    
    return parsed.strip()
    

In [30]:
# Add `route_id2`: the route_id with operator-specific suffixes removed,
# computed row by row from the operator `name` and `route_id`.
routes_to_parse["route_id2"] = routes_to_parse.apply(
    parse_route_id, axis=1, gtfs_name_col="name", route_col="route_id"
)


In [31]:
# Operators (by `name`) that have at least one route_id containing a hyphen.
routes_to_parse.name.unique()

array(['Monterey Salinas Schedule', 'Yolobus Schedule',
       'Bay Area 511 BART Schedule', 'Bay Area 511 MVGO Schedule',
       'LA Metro Bus Schedule', 'B-Line Schedule',
       'Anaheim Resort Schedule', 'Santa Maria Schedule',
       'BruinBus Schedule', 'Metrolink Schedule',
       'TCRTA TripShot Schedule',
       'Bay Area 511 Angel Island-Tiburon Ferry Schedule',
       'Bay Area 511 Treasure Island Ferry Schedule',
       'Bay Area 511 Santa Clara Transit Schedule', 'Lynwood Schedule'],
      dtype=object)