## Banning Transit is missing
[Issue #1254](https://github.com/cal-itp/data-analyses/issues/1254)

In [1]:
import geopandas as gpd
import merge_data
import numpy as np
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils
from shared_utils import catalog_utils, rt_dates, rt_utils
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS

In [2]:
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)

In [3]:
# Notebook-wide pandas display settings.
# Floats render with two decimals; up to 100 columns are shown.
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_columns", 100)
# `None` removes the limit: every row and the full cell contents are shown.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [4]:
import _operators_prep as op_prep

In [5]:
analysis_date_list = rt_dates.y2024_dates + rt_dates.y2023_dates

In [6]:
analysis_date_list

['2024-01-17',
 '2024-02-14',
 '2024-03-13',
 '2024-04-17',
 '2024-05-22',
 '2024-06-12',
 '2024-07-17',
 '2024-08-14',
 '2024-09-18',
 '2024-10-16',
 '2023-03-15',
 '2023-04-12',
 '2023-05-17',
 '2023-06-14',
 '2023-07-12',
 '2023-08-15',
 '2023-09-13',
 '2023-10-11',
 '2023-11-15',
 '2023-12-13']

### What's in `gtfs_funnel/crosswalk_gtfs.../`

In [7]:
from shared_utils import schedule_rt_utils

In [8]:
analysis_date = "2024-10-16"

In [9]:
# Load the `gtfs_dataset_key` and `name` columns of the scheduled trips for
# `analysis_date` as a pandas frame.
imported_trips_df = helpers.import_scheduled_trips(
    analysis_date,
    columns=["gtfs_dataset_key", "name"],
    get_pandas=True,
)
# Make sure the key column is called `gtfs_dataset_key` for the crosswalk step.
imported_trips_df = imported_trips_df.rename(
    columns={"schedule_gtfs_dataset_key": "gtfs_dataset_key"}
)

In [10]:
imported_trips_df.head(2)

Unnamed: 0,gtfs_dataset_key,name
0,1770249a5a2e770ca90628434d4934b1,VCTC GMV Schedule
1,bff13f8993ff18e43577db1f5596e014,Merced GMV Schedule


In [11]:
# Columns requested from the dataset and organization dimensions.
crosswalk_dataset_cols = ["key", "source_record_id", "base64_url"]
crosswalk_org_cols = [
    "source_record_id",
    "name",
    "itp_id",
    "caltrans_district",
    "ntd_id_2022",
]

# Map each schedule feed key to its organization(s) for the analysis date.
crosswalk = schedule_rt_utils.sample_gtfs_dataset_key_to_organization_crosswalk(
    imported_trips_df,
    analysis_date,
    quartet_data="schedule",
    dim_gtfs_dataset_cols=crosswalk_dataset_cols,
    dim_organization_cols=crosswalk_org_cols,
)

In [12]:
crosswalk.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,schedule_source_record_id,base64_url,organization_source_record_id,organization_name,itp_id,caltrans_district,ntd_id_2022
0,1770249a5a2e770ca90628434d4934b1,recrAG7e0oOiR6FiP,aHR0cHM6Ly9nb3ZjYnVzLmNvbS9ndGZz,reckQmUdXUzHFmlVf,City of Ojai,231.0,07 - Los Angeles,91058
1,1770249a5a2e770ca90628434d4934b1,recrAG7e0oOiR6FiP,aHR0cHM6Ly9nb3ZjYnVzLmNvbS9ndGZz,rec7EN71rsZxDFxZd,Ventura County Transportation Commission,380.0,07 - Los Angeles,90164


### Fix the crosswalk in `schedule_rt_utils.sample_gtfs_dataset_key_to_organization_crosswalk`, where `caltrans_district` is brought in

In [13]:
from shared_utils import gtfs_utils_v2

In [14]:
import siuba
from calitp_data_analysis.tables import tbls
from shared_utils import gtfs_utils_v2
# NOTE(review): the unqualified `filter`, `collect` and `_` used in the queries
# below are not imported anywhere else in this notebook, so they presumably come
# from this star import (which would also shadow the builtin `filter`) -- confirm.
from siuba import *  # supplies the query verbs used below, not only type hints

In [15]:
dim_orgs_cols = ["source_record_id", "name", "key"]

In [16]:
# Organization dimension, restricted to rows flagged `_is_current` and to the
# columns listed in `dim_orgs_cols`.
dim_orgs = (
    tbls.mart_transit_database.dim_organizations()
    >> filter(_._is_current == True)  # drop historical versions of each record
    >> gtfs_utils_v2.subset_cols(dim_orgs_cols)
    >> collect()  # result is used as a pandas DataFrame in the cells below
)

In [17]:
dim_orgs.loc[dim_orgs.name.str.contains("Banning")]

Unnamed: 0,source_record_id,name,key
726,recuGkFhN2WXGK67H,City of Banning,41f7aaa3446116fd1124b8ef1966ff14


In [18]:
bridge_tbl_cols = ["organization_key", "county_geography_name"]

In [19]:
# Bridge table linking an organization key to its headquarters county name;
# only rows flagged `_is_current` and the columns in `bridge_tbl_cols` are kept.
bridge_tbl = (
    tbls.mart_transit_database.bridge_organizations_x_headquarters_county_geography()
    >> filter(_._is_current == True)  # drop historical versions of each record
    >> gtfs_utils_v2.subset_cols(bridge_tbl_cols)
    >> collect()  # result is used as a pandas DataFrame in the cells below
)

In [20]:
bridge_tbl.head(1)

Unnamed: 0,organization_key,county_geography_name
0,a056b0e0242367463bd9d02f8ea4fd0b,Inyo


In [21]:
bridge_tbl = bridge_tbl.rename(columns={"organization_key": "key"})

In [22]:
# Attach the headquarters county to each organization. The inner join keeps
# only organizations that have a row in the bridge table.
m1 = dim_orgs.merge(bridge_tbl, how="inner", on="key")

In [23]:
district_keep = [
    "name",
    "caltrans_district",
    "caltrans_district_name",
]

In [24]:
# County dimension carrying the Caltrans district number and name; only rows
# flagged `_is_current` and the columns in `district_keep` are kept.
caltrans_dist = (
    tbls.mart_transit_database.dim_county_geography()
    >> filter(_._is_current == True)  # drop historical versions of each record
    >> gtfs_utils_v2.subset_cols(district_keep)
    >> collect()  # result is used as a pandas DataFrame in the cells below
)

In [25]:
caltrans_dist = caltrans_dist.rename(columns={"name": "county_geography_name"})

In [26]:
caltrans_dist["caltrans_district2"] = (
    caltrans_dist["caltrans_district"].astype(str).str.zfill(2)
)

In [27]:
caltrans_dist = caltrans_dist.drop(columns=["caltrans_district"])

In [28]:
m1.columns

Index(['source_record_id', 'name', 'key', 'county_geography_name'], dtype='object')

In [29]:
caltrans_dist.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58 entries, 0 to 57
Data columns (total 3 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   county_geography_name   58 non-null     object
 1   caltrans_district_name  58 non-null     object
 2   caltrans_district2      58 non-null     object
dtypes: object(3)
memory usage: 1.5+ KB


In [30]:
caltrans_dist.head(1)

Unnamed: 0,county_geography_name,caltrans_district_name,caltrans_district2
0,Lake,Eureka,1


In [31]:
caltrans_dist["caltrans_district"] = (
    caltrans_dist.caltrans_district2 + " - " + caltrans_dist.caltrans_district_name
)

In [32]:
caltrans_dist["caltrans_district"].unique()

array(['01 - Eureka', '02 - Redding', '03 - Marysville / Sacramento',
       '04 - Bay Area / Oakland', '05 - San Luis Obispo / Santa Barbara',
       '06 - Fresno / Bakersfield', '07 - Los Angeles',
       '08 - San Bernardino / Riverside', '09 - Bishop', '10 - Stockton',
       '11 - San Diego', '12 - Orange County'], dtype=object)

In [33]:
caltrans_dist = caltrans_dist.drop(
    columns=["caltrans_district_name", "caltrans_district2"]
)

In [34]:
caltrans_dist.sample(3)

Unnamed: 0,county_geography_name,caltrans_district
35,Santa Barbara,05 - San Luis Obispo / Santa Barbara
45,Mono,09 - Bishop
23,Sonoma,04 - Bay Area / Oakland


In [35]:
# Look up the formatted Caltrans district by county name. The left join keeps
# every organization row from `m1`.
m2 = m1.merge(caltrans_dist, how="left", on=["county_geography_name"])

In [36]:
m2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1290 entries, 0 to 1289
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   source_record_id       1290 non-null   object
 1   name                   1290 non-null   object
 2   key                    1290 non-null   object
 3   county_geography_name  1290 non-null   object
 4   caltrans_district      1290 non-null   object
dtypes: object(5)
memory usage: 60.5+ KB


In [37]:
m2.head(1)

Unnamed: 0,source_record_id,name,key,county_geography_name,caltrans_district
0,recsupkiKC6Y6fFfV,DAV,eea7326fc87a575ce26e24cf56b8ff37,Alameda,04 - Bay Area / Oakland


### Combine it into a function

In [39]:
def filter_dim_organizations(
    date: str,
    keep_cols: list[str] = None,
    custom_filtering: dict = None,
) -> pd.DataFrame:
    """
    Return current organizations together with their formatted Caltrans district.

    Three warehouse tables are combined:
    dim_organizations -> headquarters-county bridge -> dim_county_geography.
    The district is rendered as "<zero-padded number> - <district name>",
    e.g. "04 - Bay Area / Oakland".

    Args:
        date: not used by this function; kept so existing call sites keep working.
        keep_cols: columns to return. Defaults to
            ["source_record_id", "caltrans_district"]. Columns available after
            the merge: source_record_id, name, key, county_geography_name,
            caltrans_district.
        custom_filtering: passed to `gtfs_utils_v2.filter_custom_col` when
            querying dim_organizations.

    Returns:
        DataFrame restricted to `keep_cols`. Organizations without a row in the
        headquarters-county bridge are dropped by the inner join.
    """
    # A `None` sentinel replaces the former mutable list default.
    if keep_cols is None:
        keep_cols = ["source_record_id", "caltrans_district"]

    # Organization attributes; this table contains the source_record_id.
    dim_orgs = (
        tbls.mart_transit_database.dim_organizations()
        >> gtfs_utils_v2.filter_custom_col(custom_filtering)
        >> filter(_._is_current == True)
        >> gtfs_utils_v2.subset_cols(["source_record_id", "name", "key"])
        >> collect()
    )

    # Bridge: organization key -> county name (the key into the district table).
    org_to_county = (
        tbls.mart_transit_database.bridge_organizations_x_headquarters_county_geography()
        >> filter(_._is_current == True)
        >> gtfs_utils_v2.subset_cols(["organization_key", "county_geography_name"])
        >> collect()
    ).rename(columns={"organization_key": "key"})

    # County -> Caltrans district number and district name.
    county_to_district = (
        tbls.mart_transit_database.dim_county_geography()
        >> filter(_._is_current == True)
        >> gtfs_utils_v2.subset_cols(
            ["name", "caltrans_district", "caltrans_district_name"]
        )
        >> collect()
    ).rename(columns={"name": "county_geography_name"})

    # Build the "NN - Name" label in one step: pad single digits with 0, then
    # append the district name. Only the county and the label are kept.
    county_to_district = county_to_district.assign(
        caltrans_district=(
            county_to_district["caltrans_district"].astype(str).str.zfill(2)
            + " - "
            + county_to_district["caltrans_district_name"]
        )
    )[["county_geography_name", "caltrans_district"]]

    # Organization -> county (inner) -> district (left).
    merged = dim_orgs.merge(org_to_county, on="key", how="inner").merge(
        county_to_district,
        on=["county_geography_name"],
        how="left",
    )

    # Subset to the requested columns.
    return merged[keep_cols]

In [46]:
# Smoke-test the helper, requesting the organization id and district label.
test = filter_dim_organizations(
    analysis_date,
    keep_cols=["source_record_id", "caltrans_district"],
)

In [47]:
test.shape

(1290, 5)

In [48]:
test.head(1)

Unnamed: 0,source_record_id,name,key,county_geography_name,caltrans_district
0,recsupkiKC6Y6fFfV,DAV,eea7326fc87a575ce26e24cf56b8ff37,Alameda,04 - Bay Area / Oakland


### Edited crosswalk in `gtfs_funnel`: manually added Districts (temporary fix)

In [None]:
SCHED_GCS

In [None]:
GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk

In [None]:
url = "gs://calitp-analytics-data/data-analyses/gtfs_schedule/crosswalk/gtfs_key_organization_2024-10-21.parquet"

In [None]:
crosswalk_oct_31 = pd.read_parquet(url)

In [None]:
crosswalk_oct_31.loc[crosswalk_oct_31.name.str.contains("Ban")]

In [None]:
url2 = "gs://calitp-analytics-data/data-analyses/gtfs_schedule/crosswalk/gtfs_key_organization_2024-04-15.parquet"

In [None]:
crosswalk_apr_15 = pd.read_parquet(url2)

In [None]:
crosswalk_apr_15.loc[crosswalk_apr_15.name.str.contains("Ban")].organization_name

### `merge_data`
#### Stack all the new crosswalks

In [None]:
stack_crossawlk = merge_data.concatenate_crosswalk_organization(analysis_date_list)

In [None]:
stack_crossawlk.service_date.nunique()

In [None]:
banning_only = stack_crossawlk.loc[stack_crossawlk.name.str.contains("Banning")]

In [None]:
banning_only[
    ["service_date", "organization_name", "schedule_gtfs_dataset_key", "name"]
].drop_duplicates().sort_values(by=["service_date"])

#### Try to find Banning in `df_schedule`

In [None]:
df_sched = merge_data.concatenate_schedule_by_route_direction(analysis_date_list)

In [None]:
df_sched.service_date.nunique()

In [None]:
df_sched.head(1)

In [None]:
# Select the rows for one schedule feed by its exact key.
# Equality is used instead of `.str.contains`: the latter does a regex
# substring search and returns NaN for missing keys, which `.loc` rejects.
banning_only_df_sched = df_sched.loc[
    df_sched.schedule_gtfs_dataset_key == "ebc783bace70899492d6206c352547d6"
]

In [None]:
banning_only_df_sched[
    ["service_date", "schedule_gtfs_dataset_key"]
].drop_duplicates().sort_values(by=["service_date"])

#### Try to find Banning in `df_rt_sched`

In [None]:
df_rt_sched = merge_data.concatenate_rt_vs_schedule_by_route_direction(
    analysis_date_list
)

In [None]:
df_rt_sched.head(1)

In [None]:
banning_only_df_rt_sched = df_rt_sched.loc[df_rt_sched.name.str.contains("Banning")]

In [None]:
banning_only_df_rt_sched.head(1)

In [None]:
banning_only_df_rt_sched[
    ["service_date", "schedule_gtfs_dataset_key", "name"]
].drop_duplicates().sort_values(by=["service_date"])

## Read in file after rerunning `merge_data`

In [None]:
DIGEST_RT_SCHED = GTFS_DATA_DICT.digest_tables.route_schedule_vp
DIGEST_SEGMENT_SPEEDS = GTFS_DATA_DICT.digest_tables.route_segment_speeds

In [None]:
f"{RT_SCHED_GCS}{DIGEST_RT_SCHED}.parquet"

In [None]:
f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"

In [None]:
final = pd.read_parquet(f"{RT_SCHED_GCS}{DIGEST_RT_SCHED}.parquet")

In [None]:
final.head(1)

### Banning is here

In [None]:
final[
    [
        "organization_name",
        "caltrans_district",
        "name",
        "sched_rt_category",
        "service_date",
    ]
].drop_duplicates().sort_values(by=["organization_name"])

## Keep agencies that consistently produce RT or schedule data on the portfolio even if they miss a month or two

In [None]:
# Location of the route-level schedule/vehicle-positions digest.
digest_tables = GTFS_DATA_DICT.digest_tables
schd_vp_url = f"{digest_tables.dir}{digest_tables.route_schedule_vp}.parquet"

# Read only the columns needed to list operators, keeping rows whose
# `sched_rt_category` is "schedule_and_vp" or "schedule_only".
schd_vp_df = pd.read_parquet(
    schd_vp_url,
    columns=[
        "schedule_gtfs_dataset_key",
        "caltrans_district",
        "organization_name",
        "name",
        "sched_rt_category",
        "service_date",
    ],
    filters=[[("sched_rt_category", "in", ["schedule_and_vp", "schedule_only"])]],
)

In [None]:
# Order rows so that, within each district and organization, the most recent
# service_date comes first. Rows with no district are discarded beforehand.
latest_first = schd_vp_df.dropna(subset="caltrans_district").sort_values(
    by=["caltrans_district", "organization_name", "service_date"],
    ascending=[True, True, False],
)

# Keep the first (i.e. latest) row per organization/district pair.
schd_vp_df2 = latest_first.drop_duplicates(
    subset=["organization_name", "caltrans_district"]
).reset_index(drop=True)

In [None]:
schd_vp_df2.loc[schd_vp_df2.organization_name.str.contains("Banning")]

In [None]:
schd_vp_df2.organization_name.value_counts().describe()

In [None]:
schd_vp_df2.service_date.unique()

In [None]:
schd_vp_df2.service_date.value_counts()

In [None]:
schd_vp_df2.loc[schd_vp_df2.service_date == "2024-02-14"]

## Forgot why I need `operator_profiles`

In [None]:
operator_profiles = op_prep.operator_profiles()

In [None]:
operator_profiles