## Crosswalk
* Investigate why multiple `organization_name` values map to the same `name`, and vice versa.
* Figure out how to configure the YAML so that each organization matches exactly one name.

In [1]:
import pandas as pd
from shared_utils import catalog_utils

# Load the "gtfs_analytics_data" catalog; its `digest_tables` entries are used
# later in this notebook to build the parquet paths that get read.
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

In [2]:
# Notebook-wide display settings: show up to 100 columns, every row, full
# cell contents, and floats with two decimals.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

### Schd_VP

In [3]:
# Parquet path = digest_tables.dir + digest_tables.route_schedule_vp + ".parquet".
schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"

In [4]:
# Read the route_schedule_vp digest table into a DataFrame.
og = pd.read_parquet(schd_vp_url)

In [5]:
# Keep only the rows categorized as "schedule_and_vp".
# NOTE(review): this rebinds `og`, so the unfiltered frame is no longer
# available without re-reading the parquet.
og = og[og["sched_rt_category"] == "schedule_and_vp"]

In [6]:
# Build the crosswalk: take the identifying columns, order rows descending by
# district / organization / service_date, then drop service_date so rows that
# differ only by date collapse in drop_duplicates().
crosswalk = og.loc[
    :,
    [
        "schedule_gtfs_dataset_key",
        "caltrans_district",
        "organization_name",
        "name",
        "sched_rt_category",
        "service_date",
    ],
]
crosswalk = crosswalk.sort_values(
    ["caltrans_district", "organization_name", "service_date"],
    ascending=False,
)
crosswalk = crosswalk.drop(columns="service_date").drop_duplicates()

In [7]:
# Crosswalk rows per organization; a count above 1 means the organization
# appears in more than one distinct crosswalk row.
crosswalk["organization_name"].value_counts().head(40)

Long Beach Transit                                          4
Orange County Transportation Authority                      3
North County Transit District                               3
Butte County Association of Governments                     3
Stanislaus Regional Transit Authority                       3
Monterey-Salinas Transit                                    3
University of California, Davis                             3
Eastern Sierra Transit Authority                            3
Mountain View Transportation Management Association         2
City of Lawndale                                            2
Golden Empire Transit District                              2
Santa Cruz Metropolitan Transit District                    2
City of Santa Maria                                         2
Western Contra Costa Transit Authority                      2
Santa Clara Valley Transportation Authority                 2
Napa Valley Transportation Authority                        2
City and

In [8]:
# Crosswalk rows per feed `name` (top 10).
crosswalk["name"].value_counts().head(10)

VCTC GMV Schedule                              4
San Diego Schedule                             4
Long Beach Schedule                            4
Sacramento Schedule                            4
OCTA Schedule                                  3
Eastern Sierra Schedule                        3
Bay Area 511 Sonoma County Transit Schedule    3
Monterey Salinas Schedule                      3
Unitrans Schedule                              3
B-Line Schedule                                3
Name: name, dtype: int64

In [9]:
# (rows, columns) of the de-duplicated crosswalk.
crosswalk.shape

(134, 5)

In [10]:
# Distinct organizations vs. distinct feed names in the crosswalk.
crosswalk["organization_name"].nunique(), crosswalk["name"].nunique()

(89, 81)

#### Observations
* The same `organization_name` appears multiple times under different `schedule_gtfs_dataset_key` values. How do we know which one to use?
* The same `name` appears multiple times.
    * `San Diego Schedule` maps to two different `organization_name` values.
* The same `organization_name` and `name` combination appears multiple times, but under different `schedule_gtfs_dataset_key` values.
    * Santa Cruz Metropolitan Transit District & Santa Cruz Schedule

### Operator Profiles

In [11]:
# Parquet path = digest_tables.dir + digest_tables.operator_profiles + ".parquet".
op_profiles_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_profiles}.parquet"

In [12]:
# Read the operator_profiles digest table into a DataFrame.
op_profiles_df = pd.read_parquet(op_profiles_url)

In [13]:
# Keep the four identifying columns and order rows by `name` (A-Z) with the
# latest service_date first within each name, so a later de-duplication on
# `name` retains the most recent row.
op_profiles2 = op_profiles_df[
    ["organization_name", "name", "service_date", "schedule_gtfs_dataset_key"]
].sort_values(["name", "service_date"], ascending=[True, False])

In [14]:
# One row per `name`: the first row of each group, which is the most recent
# service_date given the ordering of op_profiles2.
op_profiles3 = op_profiles2.drop_duplicates(subset="name", keep="first")

In [15]:
# Every `name` should now occur exactly once.
op_profiles3["name"].value_counts().head(10)

Alhambra Schedule            1
Redwood Coast Schedule       1
Merced GMV Schedule          1
Merced Schedule              1
Mission Bay Schedule         1
Monterey Salinas Schedule    1
Morongo Basin Schedule       1
Mountain Transit Schedule    1
Needles Schedule             1
Nevada County Schedule       1
Name: name, dtype: int64

In [16]:
# Organizations that still map to more than one `name` after the dedup on name.
op_profiles3["organization_name"].value_counts().head(10)

City of Downey                                              2
Mission Bay Transportation Management Agency                2
Palo Verde Valley Transit Agency                            2
Transit Joint Powers Authority for Merced County            2
Los Angeles County Metropolitan Transportation Authority    2
Tahoe Transportation District                               2
Victor Valley Transit Authority                             2
City of Lawndale                                            2
City of Needles                                             1
Monterey-Salinas Transit                                    1
Name: organization_name, dtype: int64

In [17]:
# One row per organization, with a fresh 0..n-1 index.
# NOTE(review): op_profiles3 is ordered by `name`, so for an organization with
# several names this keeps the alphabetically first `name`, which is not
# necessarily the one with the most recent service_date -- confirm intent.
op_profiles4 = op_profiles3.drop_duplicates(
    subset="organization_name", ignore_index=True
)

In [18]:
# Number of distinct organizations kept.
op_profiles4["organization_name"].nunique()

155

In [19]:
# Row count; matches the distinct-organization count when each organization
# appears exactly once.
op_profiles4.shape[0]

155

In [20]:
# Each organization should now occur exactly once.
op_profiles4["organization_name"].value_counts().head(10)

City of Alhambra                                    1
North County Transit District                       1
Mendocino Transit Authority                         1
Transit Joint Powers Authority for Merced County    1
Monterey-Salinas Transit                            1
Basin Transit                                       1
Mountain Area Regional Transit Authority            1
City of Needles                                     1
Nevada County                                       1
City of Norwalk                                     1
Name: organization_name, dtype: int64

In [21]:
# Each `name` should still occur exactly once.
op_profiles4["name"].value_counts().head(10)

Alhambra Schedule            1
North County Schedule        1
Mendocino Schedule           1
Merced GMV Schedule          1
Monterey Salinas Schedule    1
Morongo Basin Schedule       1
Mountain Transit Schedule    1
Needles Schedule             1
Nevada County Schedule       1
Norwalk Avail Schedule       1
Name: name, dtype: int64

In [22]:
# Distribution of the retained service_date values.
op_profiles4["service_date"].describe()

  op_profiles4.service_date.describe()


count                     155
unique                      7
top       2024-03-13 00:00:00
freq                      142
first     2023-03-15 00:00:00
last      2024-03-13 00:00:00
Name: service_date, dtype: object

In [23]:
# Display the one-row-per-organization table. display.max_rows is set to None
# near the top of the notebook, so the whole frame is rendered.
op_profiles4

Unnamed: 0,organization_name,name,service_date,schedule_gtfs_dataset_key
0,City of Alhambra,Alhambra Schedule,2024-03-13,6894087758e4c76d3e591daee4c46dc9
1,Amador Regional Transit System,Amador Schedule,2024-03-13,36b8fbf12e4adc76b21651462b200860
2,Antelope Valley Transit Authority,Antelope Valley Transit Authority Schedule,2024-03-13,e681c3a8dafa2c80e5b8e2cdd01f917a
3,City of Arcadia,Arcadia Schedule,2023-12-13,a3a2a40ae51e523796f0be989b8b3493
4,City of Arvin,Arvin Schedule,2024-03-13,8a90fabefcbcbea614ed939a47cbe063
5,City of Auburn,Auburn Schedule,2024-03-13,83a293ce449a611b01f08929a7fcaab0
6,City of Avalon,Avalon Schedule,2024-03-13,1c10c6bdc6d306a5a05bba52c00e3755
7,Butte County Association of Governments,B-Line Schedule,2024-03-13,68aa06a25a32c83eb38c20c43977feff
8,City of Baldwin Park,Baldwin Park Schedule,2024-03-13,3a2aa5e411c107ea6867b5316f98000b
9,Banning Pass Transit,Banning Pass Schedule,2024-03-13,bc039937fdadd173bd3c3edc03b7a9c9


### Merge #1

In [38]:
# Outer-join operator profiles to the crosswalk on name + organization_name
# only. schedule_gtfs_dataset_key is NOT a join key here, so it can differ
# between the two sides of a matched row; `_merge` records where each row
# came from.
m1 = op_profiles4.merge(
    crosswalk,
    how="outer",
    on=["name", "organization_name"],
    indicator=True,
)

In [39]:
# How many rows matched on both sides vs. only one side.
m1["_merge"].value_counts()

both          115
left_only      80
right_only     19
Name: _merge, dtype: int64

In [42]:
# Keep only the rows found in both frames, with a fresh index.
final_organization_list = m1[m1["_merge"] == "both"].reset_index(drop=True)

In [43]:
# Shape, distinct organizations, distinct names of the matched rows.
(
    final_organization_list.shape,
    final_organization_list["organization_name"].nunique(),
    final_organization_list["name"].nunique(),
)

((115, 8), 75, 75)

In [46]:
# Collapse repeated (organization_name, name) pairs to their first row, then
# order the result by Caltrans district.
final_organization_list2 = final_organization_list.drop_duplicates(
    subset=["organization_name", "name"]
)
final_organization_list2 = final_organization_list2.sort_values("caltrans_district")

In [47]:
# Shape, distinct organizations, distinct names after de-duplication.
(
    final_organization_list2.shape,
    final_organization_list2["organization_name"].nunique(),
    final_organization_list2["name"].nunique(),
)

((75, 8), 75, 75)

### Merge #2 

In [49]:
# List the columns available as join keys on the operator-profiles side.
op_profiles4.columns

Index(['organization_name', 'name', 'service_date',
       'schedule_gtfs_dataset_key'],
      dtype='object')

In [50]:
# Outer-join operator profiles to the crosswalk, this time INCLUDING
# schedule_gtfs_dataset_key in the join keys, so a row only counts as "both"
# when name, organization_name and the dataset key all agree.
m2 = op_profiles4.merge(
    crosswalk,
    how="outer",
    on=["name", "organization_name", "schedule_gtfs_dataset_key"],
    indicator=True,
)

In [51]:
# Keep only the rows found in both frames, with a fresh index.
final_m2 = m2[m2["_merge"] == "both"].reset_index(drop=True)

In [52]:
# Shape, distinct organizations, distinct names of the matched rows.
(
    final_m2.shape,
    final_m2["organization_name"].nunique(),
    final_m2["name"].nunique(),
)

((75, 7), 75, 75)

In [66]:
# Matched rows, ordered alphabetically by organization for inspection.
final_m2.sort_values("organization_name")

Unnamed: 0,organization_name,name,service_date,schedule_gtfs_dataset_key,caltrans_district,sched_rt_category,_merge
2,Alameda-Contra Costa Transit District,Bay Area 511 AC Transit Schedule,2024-03-13,c499f905e33929a641f083dad55c521e,04 - Oakland,schedule_and_vp,both
0,Antelope Valley Transit Authority,Antelope Valley Transit Authority Schedule,2024-03-13,e681c3a8dafa2c80e5b8e2cdd01f917a,07 - Los Angeles,schedule_and_vp,both
1,Butte County Association of Governments,B-Line Schedule,2024-03-13,68aa06a25a32c83eb38c20c43977feff,03 - Marysville,schedule_and_vp,both
3,Central Contra Costa Transit Authority,Bay Area 511 County Connection Schedule,2024-03-13,587e730fac4db21d54037e0f12b0dd5d,04 - Oakland,schedule_and_vp,both
8,City and County of San Francisco,Bay Area 511 Muni Schedule,2024-03-13,7cc0cb1871dfd558f11a2885c145d144,04 - Oakland,schedule_and_vp,both
22,City of Beaumont,Beaumont Pass Schedule,2024-03-13,680328d33847441ab0037bc3861e5763,08 - San Bernardino,schedule_and_vp,both
25,City of Burbank,Burbank Schedule,2024-03-13,22b1fd6db336c11d8df960e58cf79d73,07 - Los Angeles,schedule_and_vp,both
26,City of Commerce,Commerce Schedule,2024-03-13,eaabdf2b0bb899b7953ea81047fdd00d,07 - Los Angeles,schedule_and_vp,both
27,City of Culver City,Culver City Schedule,2024-03-13,cf0f7df88da36cd9ca4248eb1d6a0f39,07 - Los Angeles,schedule_and_vp,both
31,City of Duarte,Foothill Schedule,2024-03-13,f74424acf8c41e4c1e9fd42838c4875c,07 - Los Angeles,schedule_and_vp,both


In [64]:
# First crosswalk row per organization, ordered alphabetically by organization.
(
    crosswalk
    .sort_values("organization_name")
    .drop_duplicates(subset="organization_name")
)

Unnamed: 0,schedule_gtfs_dataset_key,caltrans_district,organization_name,name,sched_rt_category
270307,444700afe086ed24e3cb888cecd3037c,04 - Oakland,Alameda-Contra Costa Transit District,Bay Area 511 AC Transit Schedule,schedule_and_vp
936044,e681c3a8dafa2c80e5b8e2cdd01f917a,07 - Los Angeles,Antelope Valley Transit Authority,Antelope Valley Transit Authority Schedule,schedule_and_vp
421891,6693efa56a541b6276da9b424f78a170,01 - Eureka,Blue Lake Rancheria,Humboldt Schedule,schedule_and_vp
1027463,f1cc580313b37ae0f853b2e469b27228,03 - Marysville,Butte County Association of Governments,B-Line Schedule,schedule_and_vp
368774,587e730fac4db21d54037e0f12b0dd5d,04 - Oakland,Central Contra Costa Transit Authority,Bay Area 511 County Connection Schedule,schedule_and_vp
488856,7cc0cb1871dfd558f11a2885c145d144,04 - Oakland,City and County of San Francisco,Bay Area 511 Muni Schedule,schedule_and_vp
422776,680328d33847441ab0037bc3861e5763,08 - San Bernardino,City of Beaumont,Beaumont Pass Schedule,schedule_and_vp
131246,22b1fd6db336c11d8df960e58cf79d73,07 - Los Angeles,City of Burbank,Burbank Schedule,schedule_and_vp
960953,eaabdf2b0bb899b7953ea81047fdd00d,07 - Los Angeles,City of Commerce,Commerce Schedule,schedule_and_vp
856345,cf0f7df88da36cd9ca4248eb1d6a0f39,07 - Los Angeles,City of Culver City,Culver City Schedule,schedule_and_vp


In [61]:
# One row per organization from the outer merge. `_merge` is sorted in
# descending order within each organization before de-duplicating, which
# decides which merge status is kept (presumably "both" first, per the
# indicator's category order -- TODO confirm).
(
    m2
    .sort_values(["organization_name", "_merge"], ascending=[True, False])
    .drop(columns="schedule_gtfs_dataset_key")
    .drop_duplicates(subset="organization_name")
)

Unnamed: 0,organization_name,name,service_date,caltrans_district,sched_rt_category,_merge
10,Alameda-Contra Costa Transit District,Bay Area 511 AC Transit Schedule,2024-03-13,04 - Oakland,schedule_and_vp,both
1,Amador Regional Transit System,Amador Schedule,2024-03-13,,,left_only
2,Antelope Valley Transit Authority,Antelope Valley Transit Authority Schedule,2024-03-13,07 - Los Angeles,schedule_and_vp,both
9,Banning Pass Transit,Banning Pass Schedule,2024-03-13,,,left_only
102,Basin Transit,Morongo Basin Schedule,2024-03-13,,,left_only
212,Blue Lake Rancheria,Humboldt Schedule,NaT,01 - Eureka,schedule_and_vp,right_only
7,Butte County Association of Governments,B-Line Schedule,2024-03-13,03 - Marysville,schedule_and_vp,both
47,Calaveras Transit Agency,Calaveras Schedule,2024-03-13,,,left_only
12,Capitol Corridor Joint Powers Authority,Bay Area 511 Capitol Corridor Schedule,2024-03-13,,,left_only
14,Central Contra Costa Transit Authority,Bay Area 511 County Connection Schedule,2024-03-13,04 - Oakland,schedule_and_vp,both


### See differences

In [54]:
# Organizations matched by each merge strategy, as sets.
m2_orgs = set(final_m2["organization_name"].unique().tolist())
m1_orgs = set(final_organization_list2["organization_name"].unique().tolist())
# Organizations matched by merge #2 but not by merge #1.
m2_orgs.difference(m1_orgs)

set()

In [55]:
# Organizations matched by merge #1 but not by merge #2.
m1_orgs.difference(m2_orgs)

set()