# Portfolio Organization Name YML -> Warehouse #1686
* `portfolio_utils.standardize_portfolio_organization_names` now gets organization names from Airtable (queried through the warehouse) instead of `portfolio_organization_name.yml`. Update every call to it in the scripts.

In [1]:
import geopandas as gpd
import intake
import merge_data
import merge_operator_data
import merge_operator_service
import pandas as pd
import yaml
from calitp_data_analysis import geography_utils, utils
from segment_speed_utils import (
    gtfs_schedule_wrangling,
    metrics,
    segment_calcs,
    time_series_utils,
)
from segment_speed_utils.project_vars import COMPILED_CACHED_VIEWS, weeks_available
from shared_utils import (
    dask_utils,
    gtfs_utils_v2,
    portfolio_utils,
    publish_utils,
    rt_dates,
    time_helpers,
)
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS

In [2]:
catalog = intake.open_catalog("../_shared_utils/shared_utils/shared_data_catalog.yml")

In [3]:
from calitp_data_analysis.sql import get_engine

db_engine = get_engine()

In [4]:
sort_cols = ["schedule_gtfs_dataset_key", "service_date"]

In [5]:
# Load the legacy yml lookup (feed `name` -> preferred organization name).
# It is kept so the `_og` functions in this notebook can be compared with the
# warehouse-backed lookup.
# Read explicitly as UTF-8 so organization names with non-ASCII characters do
# not depend on the platform's default encoding.
with open(
    "../_shared_utils/shared_utils/portfolio_organization_name.yml",
    "r",
    encoding="utf-8",
) as f:
    PORTFOLIO_ORGANIZATIONS_DICT = yaml.safe_load(f)

In [6]:
import google.auth

credentials, project = google.auth.default()

import gcsfs

fs = gcsfs.GCSFileSystem()

In [7]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [8]:
def standardize_portfolio_organization_names(
    df: pd.DataFrame, preferred_organization_name_dict: dict
) -> pd.DataFrame:
    """
    Add `portfolio_organization_name` by looking up each row's `name`
    in `preferred_organization_name_dict`, then keep only matched rows.

    The input frame is not modified; the original index labels of the
    surviving rows are preserved.
    """
    preferred_names = df.name.map(preferred_organization_name_dict)
    with_org_name = df.assign(portfolio_organization_name=preferred_names)

    # Names missing from the dict map to null. Those are the feeds that were
    # removed with duplicated feed info (create_portfolio_display_yaml.py).
    has_org_name = with_org_name["portfolio_organization_name"].notna()
    return with_org_name[has_org_name]

## `_route_dir_data_prep.py`

## `python quarterly_rollup.py`

In [None]:
import quarterly_rollup

In [None]:
DIGEST_RT_SCHED_MONTH = GTFS_DATA_DICT.digest_tables.monthly_route_schedule_vp

In [None]:
monthly_df = pd.read_parquet(f"{RT_SCHED_GCS}{DIGEST_RT_SCHED_MONTH}.parquet")

In [None]:
monthly_df[["schedule_source_record_id", "source_record_id"]].sample(3)

In [None]:
#monthly_df["source_id_same"] = (
#    monthly_df.schedule_source_record_id == monthly_df.source_record_id
#)

In [None]:
# monthly_df["source_id_same"].value_counts()

In [None]:
# quarter_df = quarterly_rollup.quarterly_metrics(monthly_df)

In [None]:
df = time_helpers.add_quarter(monthly_df, "service_date")

In [None]:
df.year_quarter = df.year_quarter.str.replace("_", " ")

In [None]:
# Schedule metrics; the cells below average these with `n_scheduled_trips`
# as the weight column.
schd_metric_cols = [
    "avg_scheduled_service_minutes",
    "avg_stop_miles",
    "frequency",
    "total_scheduled_service_minutes",
]

# Grain of the quarterly table: every aggregation and merge below keys on these.
groupby_cols = [
    "analysis_name",
    "source_record_id",
    "year_quarter",
    "direction_id",
    "time_period",
    "recent_combined_name",
]
# Realtime metrics; the cells below average these with `n_vp_trips`
# as the weight column.
rt_metric_cols = [
    "minutes_atleast1_vp",
    "minutes_atleast2_vp",
    "total_rt_service_minutes",
    "total_vp",
    "vp_in_shape",
    "avg_rt_service_minutes",
    "speed_mph",
]
# Timeliness columns; these are summed per group rather than weighted-averaged.
rt_metric_no_weighted_avg = [
    "is_early",
    "is_ontime",
    "is_late",
]

In [None]:
# Roll the monthly realtime metrics up to the quarterly grain with
# segment_calcs.calculate_weighted_averages, weighting by `n_vp_trips`.
# Only the grouping, metric and weight columns are passed in.
rt_metrics = segment_calcs.calculate_weighted_averages(
    df=df[groupby_cols + rt_metric_cols + ["n_vp_trips"]],
    group_cols=groupby_cols,
    metric_cols=rt_metric_cols,
    weight_col="n_vp_trips",
)

In [None]:
# Same rollup for the schedule metrics, weighting by `n_scheduled_trips`.
schd_metrics = segment_calcs.calculate_weighted_averages(
    df=df[groupby_cols + schd_metric_cols + ["n_scheduled_trips"]],
    group_cols=groupby_cols,
    metric_cols=schd_metric_cols,
    weight_col="n_scheduled_trips",
)


# Calculate trips by timeliness which doesn't need weighted average
timeliness_df = df[groupby_cols + rt_metric_no_weighted_avg]
# One row per `groupby_cols` group, holding the summed early / on-time / late columns.
timeliness_df2 = (
    timeliness_df.groupby(groupby_cols)
    .agg({"is_early": "sum", "is_ontime": "sum", "is_late": "sum"})
    .reset_index()
)

In [None]:
# Descriptive (non-metric) columns to carry into the quarterly table.
# The list includes every column in `groupby_cols`, which is what the
# merge further down joins on.
crosswalk_cols = [
    "base64_url",
    "caltrans_district",
    "combined_name",
    "is_coverage",
    "is_downtown_local",
    "is_express",
    "is_ferry",
    "is_local",
    "is_rail",
    "is_rapid",
    "name",
    "recent_route_id",
    "route_id",
    "route_primary_direction",
    "sched_rt_category",
    "schedule_gtfs_dataset_key",
    "schedule_source_record_id",
    "source_record_id",
    "typology",
    "analysis_name",
    "year_quarter",
    "direction_id",
    "time_period",
    "recent_combined_name" ,
]

In [None]:
crosswalk = df[crosswalk_cols].drop_duplicates()

In [None]:
# Join the realtime, schedule and timeliness aggregates, then the
# descriptive crosswalk columns, all on the quarterly grain.
m1 = rt_metrics.merge(schd_metrics, on=groupby_cols)
m1 = m1.merge(timeliness_df2, on=groupby_cols)
m1 = m1.merge(crosswalk, on=groupby_cols)

In [None]:
# The service-minute columns are renamed without the `total_` prefix before
# metrics.calculate_rt_vs_schedule_metrics runs and renamed back afterwards
# (presumably the helper expects the shorter names; confirm against it).
m2 = m1.rename(
    columns={
        "total_rt_service_minutes": "rt_service_minutes",
        "total_scheduled_service_minutes": "scheduled_service_minutes",
    }
)
m2 = metrics.calculate_rt_vs_schedule_metrics(m2)
m2 = m2.rename(
    columns={
        "rt_service_minutes": "total_rt_service_minutes",
        "scheduled_service_minutes": "total_scheduled_service_minutes",
    }
)

# Have to recalculate rt sched journey ratio
m2["rt_sched_journey_ratio"] = (
    m2["total_rt_service_minutes"] / m2["total_scheduled_service_minutes"]
)

In [None]:
# Reuse the monthly frame's column order for the quarterly table, minus the
# date columns that no longer apply at quarterly grain.
# `.remove` raises ValueError if any of these columns is missing from `df`.
col_proper_order = list(df.columns) 
col_proper_order.remove("service_date")
col_proper_order.remove("year")
col_proper_order.remove("quarter")

In [None]:
m2 = m2[col_proper_order]

## Comparing what's in airtable vs. the yml

In [None]:
# Pull every column of the current `dim_gtfs_datasets` rows to inspect
# what the warehouse holds.
# NOTE: this rebinds `df`, which the quarterly rollup cells above also use.
with db_engine.connect() as connection:
    query = """
            SELECT*
            FROM
            cal-itp-data-infra.mart_transit_database.dim_gtfs_datasets
            WHERE _is_current = TRUE
            """
    df = pd.read_sql(query, connection)

In [None]:
df.sample()

In [None]:
def load_portfolio_names() -> pd.DataFrame:
    """
    Query the warehouse for the current `dim_gtfs_datasets` rows.

    Uses the notebook-level `db_engine` for the connection.

    Returns
    -------
    pd.DataFrame
        Columns `name`, `analysis_name` and `source_record_id`,
        one row per record returned by the query.
    """
    with db_engine.connect() as connection:
        query = """
            SELECT
            name,
            analysis_name,
            source_record_id
            FROM
            cal-itp-data-infra.mart_transit_database.dim_gtfs_datasets
            WHERE _is_current = TRUE
            """
        df = pd.read_sql(query, connection)

    # No column renaming is needed: `key` is not selected by the query, so
    # there is nothing to rename to `schedule_gtfs_dataset_key`.
    return df

In [None]:
airtable_names = load_portfolio_names()

In [None]:
airtable_names.sort_values(by=["name"])

In [None]:
len(airtable_names.dropna(subset=["analysis_name"]))

In [None]:
len(PORTFOLIO_ORGANIZATIONS_DICT)

In [None]:
# Turn the yml dict into a two-column frame so it can be compared with the
# Airtable names: dict keys become `name`, and the single value column
# (labelled 0 by from_dict) becomes `analysis_name`.
original_crosswalk = (
    pd.DataFrame.from_dict(PORTFOLIO_ORGANIZATIONS_DICT, orient="index")
    .reset_index()
    .rename(columns={"index": "name", 0: "analysis_name"})
)

In [None]:
original_crosswalk.columns

In [None]:
len(original_crosswalk)

In [None]:
airtable_names_list = list(airtable_names.name.unique())

In [None]:
airtable_analysis_names_list = list(airtable_names.analysis_name.unique())

In [None]:
og_crosswalk_list = list(original_crosswalk.name.unique())

In [None]:
og_crosswalk_analysis_names_list = list(original_crosswalk.analysis_name.unique())

In [None]:
set(og_crosswalk_list) - set(airtable_names_list)

In [None]:
set(og_crosswalk_analysis_names_list) - set(airtable_analysis_names_list)

## `merge_data`

In [None]:
def concatenate_crosswalk_organization(date_list: list) -> pd.DataFrame:
    """
    Stack the gtfs_funnel crosswalk (gtfs_dataset_key -> organization
    and related columns) for every date in `date_list`.

    `caltrans_district` is passed through
    `portfolio_utils.CALTRANS_DISTRICT_DICT`, and the result goes through
    `portfolio_utils.standardize_portfolio_organization_names`.

    This is operator grain. `name` identifies individual feeds, so
    rolling up to organization means summing across `name` within a
    service_date and standardized organization name.
    """
    keep_cols = [
        "schedule_gtfs_dataset_key",
        "name",
        "schedule_source_record_id",
        "base64_url",
        "caltrans_district",
    ]

    crosswalk_all_dates = time_series_utils.concatenate_datasets_across_dates(
        SCHED_GCS,
        GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk,
        date_list,
        data_type="df",
        columns=keep_cols,
    )

    district_names = crosswalk_all_dates.caltrans_district.map(
        portfolio_utils.CALTRANS_DISTRICT_DICT
    )
    crosswalk_all_dates = crosswalk_all_dates.assign(
        caltrans_district=district_names
    )

    return portfolio_utils.standardize_portfolio_organization_names(
        crosswalk_all_dates
    )

In [None]:
analysis_date_list = rt_dates.y2024_dates + rt_dates.y2023_dates + rt_dates.y2025_dates

In [None]:
crosswalk = concatenate_crosswalk_organization(analysis_date_list)

In [None]:
crosswalk.shape

In [None]:
# crosswalk_og = concatenate_crosswalk_organization_og(analysis_date_list)

In [None]:
crosswalk.shape

In [None]:
# crosswalk_og.shape

## `merge_operator_data`

In [None]:
def concatenate_operator_routes(date_list: list) -> gpd.GeoDataFrame:
    """
    Stack the operator routes gdf (1 representative shape chosen)
    for every date in `date_list`, ordered by `sort_cols`, and run it
    through `portfolio_utils.standardize_portfolio_organization_names`.
    """
    routes_all_dates = time_series_utils.concatenate_datasets_across_dates(
        SCHED_GCS,
        GTFS_DATA_DICT.schedule_tables.operator_routes,
        date_list,
        data_type="gdf",
    )
    routes_sorted = routes_all_dates.sort_values(sort_cols).reset_index(drop=True)

    return portfolio_utils.standardize_portfolio_organization_names(routes_sorted)

In [None]:
operator_routes = concatenate_operator_routes(analysis_date_list)

In [None]:
def concatenate_operator_routes_og(date_list: list) -> gpd.GeoDataFrame:
    """
    Yml-based variant of `concatenate_operator_routes`, kept for comparison.

    Stack the operator routes gdf (1 representative shape chosen)
    for every date in `date_list`, ordered by `sort_cols`, and map the
    organization name with `PORTFOLIO_ORGANIZATIONS_DICT`.
    """
    routes_all_dates = time_series_utils.concatenate_datasets_across_dates(
        SCHED_GCS,
        GTFS_DATA_DICT.schedule_tables.operator_routes,
        date_list,
        data_type="gdf",
    )
    routes_sorted = routes_all_dates.sort_values(sort_cols).reset_index(drop=True)

    return standardize_portfolio_organization_names(
        routes_sorted, PORTFOLIO_ORGANIZATIONS_DICT
    )

In [None]:
operator_routes_og = concatenate_operator_routes_og(analysis_date_list)

In [None]:
operator_routes_og.shape

In [None]:
operator_routes.shape

In [None]:
def concatenate_crosswalks(
    date_list: list,
) -> pd.DataFrame:
    """
    Stack the gtfs_key_crosswalk table for every date in `date_list`,
    keeping the feed identifiers and the selected NTD columns.

    Rows are ordered by `sort_cols`, `caltrans_district` is passed through
    `portfolio_utils.CALTRANS_DISTRICT_DICT`, and the result goes through
    `portfolio_utils.standardize_portfolio_organization_names`.
    """
    keep_cols = [
        "schedule_gtfs_dataset_key",
        "name",
        "caltrans_district",
        "service_area_sq_miles",
        "hq_city",
        "service_area_pop",
        "organization_type",
        "primary_uza_name",
        "reporter_type",
    ]

    crosswalk_all_dates = time_series_utils.concatenate_datasets_across_dates(
        SCHED_GCS,
        GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk,
        date_list,
        data_type="df",
        columns=keep_cols,
    )
    crosswalk_sorted = crosswalk_all_dates.sort_values(sort_cols).reset_index(
        drop=True
    )

    district_names = crosswalk_sorted.caltrans_district.map(
        portfolio_utils.CALTRANS_DISTRICT_DICT
    )
    crosswalk_sorted = crosswalk_sorted.assign(caltrans_district=district_names)

    return portfolio_utils.standardize_portfolio_organization_names(crosswalk_sorted)

In [None]:
crosswalk = concatenate_crosswalks(analysis_date_list)

In [None]:
def concatenate_crosswalks_og(
    date_list: list,
) -> pd.DataFrame:
    """
    Yml-based variant of `concatenate_crosswalks`, kept for comparison.

    Stack the gtfs_key_crosswalk table for every date in `date_list`,
    keeping the feed identifiers and the selected NTD columns, ordered by
    `sort_cols`. The organization name is mapped with
    `PORTFOLIO_ORGANIZATIONS_DICT`.
    """
    keep_cols = [
        "schedule_gtfs_dataset_key",
        "name",
        "caltrans_district",
        "service_area_sq_miles",
        "hq_city",
        "service_area_pop",
        "organization_type",
        "primary_uza_name",
        "reporter_type",
    ]

    crosswalk_all_dates = time_series_utils.concatenate_datasets_across_dates(
        SCHED_GCS,
        GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk,
        date_list,
        data_type="df",
        columns=keep_cols,
    )
    crosswalk_sorted = crosswalk_all_dates.sort_values(sort_cols).reset_index(
        drop=True
    )

    district_names = crosswalk_sorted.caltrans_district.map(
        portfolio_utils.CALTRANS_DISTRICT_DICT
    )
    crosswalk_sorted = crosswalk_sorted.assign(caltrans_district=district_names)

    return standardize_portfolio_organization_names(
        crosswalk_sorted, PORTFOLIO_ORGANIZATIONS_DICT
    )

In [None]:
crosswalk_og = concatenate_crosswalks_og(analysis_date_list)

In [None]:
crosswalk_og.shape

In [None]:
crosswalk.shape

In [None]:
crosswalk.sample()

In [None]:
crosswalk_og.sample()

In [None]:
crosswalk.analysis_name.nunique()

In [None]:
crosswalk_og.portfolio_organization_name.nunique()

In [None]:
og_orgs = list(crosswalk_og.portfolio_organization_name.unique())

In [None]:
new_orgs = list(crosswalk.analysis_name.unique())

In [None]:
set(og_orgs) - set(new_orgs)

In [None]:
og_names = list(crosswalk_og.name.unique())

In [None]:
new_names = list(crosswalk.name.unique())

In [None]:
set(og_names) - set(new_names)

In [None]:
crosswalk_og[["portfolio_organization_name", "name"]].sort_values(
    by=["portfolio_organization_name"]
).drop_duplicates()

In [None]:
# crosswalk[["analysis_name", "name"]].sort_values(by=["analysis_name"]).drop_duplicates()

## `merge_operator_service`

In [None]:
def concatenate_trips(
    date_list: list,
) -> pd.DataFrame:
    """
    Stack the scheduled trips download for every date in `date_list`,
    keep only public feeds, and run the result through
    `portfolio_utils.standardize_portfolio_organization_names`.

    The feed `name` column drives both the public-feed filter and the
    organization lookup, and is dropped from the returned frame.
    """
    trip_cols = [
        "name",
        "service_date",
        "route_long_name",
        "trip_first_departure_datetime_pacific",
        "service_hours",
    ]

    trips_all_dates = time_series_utils.concatenate_datasets_across_dates(
        COMPILED_CACHED_VIEWS,
        GTFS_DATA_DICT.schedule_downloads.trips,
        date_list,
        data_type="df",
        columns=trip_cols,
    )
    trips_sorted = trips_all_dates.sort_values(["service_date"]).reset_index(
        drop=True
    )

    # Private datasets have to be removed before the organization lookup.
    public_datasets = gtfs_utils_v2.filter_to_public_schedule_gtfs_dataset_keys(
        get_df=True
    )
    public_feed_names = public_datasets.gtfs_dataset_name.unique().tolist()

    public_trips = publish_utils.exclude_private_datasets(
        trips_sorted,
        col="name",
        public_gtfs_dataset_keys=public_feed_names,
    )
    with_org_name = portfolio_utils.standardize_portfolio_organization_names(
        public_trips
    )

    return with_org_name.drop(columns=["name"])

In [None]:
def concatenate_trips_og(
    date_list: list,
) -> pd.DataFrame:
    """
    Yml-based variant of `concatenate_trips`, kept for comparison.

    Stack the scheduled trips download for every date in `date_list`,
    keep only public feeds, and map the organization name with
    `PORTFOLIO_ORGANIZATIONS_DICT`. The feed `name` column is dropped
    from the returned frame.
    """
    trip_cols = [
        "name",
        "service_date",
        "route_long_name",
        "trip_first_departure_datetime_pacific",
        "service_hours",
    ]

    trips_all_dates = time_series_utils.concatenate_datasets_across_dates(
        COMPILED_CACHED_VIEWS,
        GTFS_DATA_DICT.schedule_downloads.trips,
        date_list,
        data_type="df",
        columns=trip_cols,
    )
    trips_sorted = trips_all_dates.sort_values(["service_date"]).reset_index(
        drop=True
    )

    # Private datasets have to be removed before the organization lookup.
    public_datasets = gtfs_utils_v2.filter_to_public_schedule_gtfs_dataset_keys(
        get_df=True
    )
    public_feed_names = public_datasets.gtfs_dataset_name.unique().tolist()

    public_trips = publish_utils.exclude_private_datasets(
        trips_sorted,
        col="name",
        public_gtfs_dataset_keys=public_feed_names,
    )
    with_org_name = standardize_portfolio_organization_names(
        public_trips, PORTFOLIO_ORGANIZATIONS_DICT
    )

    return with_org_name.drop(columns=["name"])

In [None]:
trips_og = concatenate_trips_og(analysis_date_list)

In [None]:
trips = concatenate_trips(analysis_date_list)

In [None]:
trips_og.shape, trips.shape

## `open_data/create_stops_data`

In [9]:
ah_stops_test = (
    "gs://calitp-analytics-data/data-analyses/ah_testing/ca_transit_stops.parquet"
)

In [36]:
ah_stops_df = gpd.read_parquet(
    ah_stops_test,
    storage_options={"token": credentials.token},
)

In [11]:
og_stops_url = (
    "gs://calitp-analytics-data/data-analyses/traffic_ops/ca_transit_stops.parquet"
)

In [12]:
og_stops_df = gpd.read_parquet(
    og_stops_url,
    storage_options={"token": credentials.token},
)

### How did 30,000 more rows pop up?

In [13]:
og_stops_df.shape

(129391, 13)

In [14]:
ah_stops_df.shape

(159325, 13)

In [15]:
analysis_date = rt_dates.DATES["jun2025"]

In [16]:
def load_portfolio_names() -> pd.DataFrame:
    """
    Query the warehouse for the current `dim_gtfs_datasets` rows.

    Uses the notebook-level `db_engine` for the connection.

    Returns
    -------
    pd.DataFrame
        Columns `name`, `analysis_name` and `source_record_id`,
        one row per record returned by the query.
    """
    with db_engine.connect() as connection:
        query = """
            SELECT
            name,
            analysis_name,
            source_record_id,
            FROM
            cal-itp-data-infra.mart_transit_database.dim_gtfs_datasets
            WHERE _is_current = TRUE
            """
        df = pd.read_sql(query, connection)

    # No column renaming is needed: `key` is not selected by the query, so
    # there is nothing to rename to `schedule_gtfs_dataset_key`.
    return df


def standardize_portfolio_organization_names_new(df: pd.DataFrame) -> pd.DataFrame:
    """
    Attach `analysis_name` and `source_record_id` from
    `load_portfolio_names()` by matching on the feed `name`, and keep
    only the rows that end up with an `analysis_name`.
    """
    name_lookup = load_portfolio_names()

    # Left merge first, so unmatched names show up as null `analysis_name`.
    merged = pd.merge(left=df, right=name_lookup, on="name", how="left")

    # Null `analysis_name` marks the feeds removed with duplicated feed info.
    has_analysis_name = merged["analysis_name"].notna()
    return merged[has_analysis_name]

In [17]:
def standardize_operator_info_for_exports_og(
    df: pd.DataFrame, date: str
) -> pd.DataFrame:
    """
    Yml-based variant: attach the organization columns we publish on to
    `df`, using the gtfs_funnel crosswalk file for `date` and
    `PORTFOLIO_ORGANIZATIONS_DICT` for the organization name.

    Only crosswalk rows whose `schedule_gtfs_dataset_key` is in the
    public feed list are read. The merge with `df` is an inner merge on
    `schedule_gtfs_dataset_key`, followed by a full-row de-duplication.
    """
    crosswalk_file = GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk
    public_feeds = gtfs_utils_v2.filter_to_public_schedule_gtfs_dataset_keys()

    crosswalk = pd.read_parquet(
        f"{SCHED_GCS}{crosswalk_file}_{date}.parquet",
        columns=[
            "schedule_gtfs_dataset_key",
            "name",
            "base64_url",
            "caltrans_district",
        ],
        filters=[[("schedule_gtfs_dataset_key", "in", public_feeds)]],
    )

    # Add portfolio_organization_name, one row per feed key / name / organization.
    district_names = crosswalk.caltrans_district.map(
        portfolio_utils.CALTRANS_DISTRICT_DICT
    )
    crosswalk = crosswalk.assign(caltrans_district=district_names)
    crosswalk = standardize_portfolio_organization_names(
        crosswalk, PORTFOLIO_ORGANIZATIONS_DICT
    )
    crosswalk = crosswalk.drop_duplicates(
        subset=["schedule_gtfs_dataset_key", "name", "portfolio_organization_name"]
    )

    # An inner merge is enough here: all operators are assigned a
    # caltrans_district, so stops outside of CA (Amtrak / FlixBus) still
    # have values populated and are not lost.
    # Columns present on both sides keep the crosswalk's copy under the plain
    # name; the copy coming from `df` gets the `_original` suffix.
    merged = pd.merge(
        df,
        crosswalk,
        on=["schedule_gtfs_dataset_key"],
        suffixes=["_original", None],
        how="inner",
    )

    return merged.drop_duplicates()

In [18]:
def standardize_operator_info_for_exports_new(
    df: pd.DataFrame, date: str
) -> pd.DataFrame:
    """
    Warehouse-based variant: attach the organization columns we publish
    on to `df`, using the gtfs_funnel crosswalk file for `date` and
    `standardize_portfolio_organization_names_new` for `analysis_name`.

    Only crosswalk rows whose `schedule_gtfs_dataset_key` is in the
    public feed list are read. The merge with `df` is an inner merge on
    `schedule_gtfs_dataset_key`, followed by a full-row de-duplication.
    """
    crosswalk_file = GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk
    public_feeds = gtfs_utils_v2.filter_to_public_schedule_gtfs_dataset_keys()

    crosswalk = pd.read_parquet(
        f"{SCHED_GCS}{crosswalk_file}_{date}.parquet",
        columns=[
            "schedule_gtfs_dataset_key",
            "name",
            "base64_url",
            "caltrans_district",
        ],
        filters=[[("schedule_gtfs_dataset_key", "in", public_feeds)]],
    )

    # Add analysis_name, one row per feed key / name / analysis_name.
    district_names = crosswalk.caltrans_district.map(
        portfolio_utils.CALTRANS_DISTRICT_DICT
    )
    crosswalk = crosswalk.assign(caltrans_district=district_names)
    crosswalk = standardize_portfolio_organization_names_new(crosswalk)
    crosswalk = crosswalk.drop_duplicates(
        subset=["schedule_gtfs_dataset_key", "name", "analysis_name"]
    )

    # An inner merge is enough here: all operators are assigned a
    # caltrans_district, so stops outside of CA (Amtrak / FlixBus) still
    # have values populated and are not lost.
    # Columns present on both sides keep the crosswalk's copy under the plain
    # name; the copy coming from `df` gets the `_original` suffix.
    merged = pd.merge(
        df,
        crosswalk,
        on=["schedule_gtfs_dataset_key"],
        suffixes=["_original", None],
        how="inner",
    )

    return merged.drop_duplicates()

In [19]:
def create_stops_file_for_export_new(
    date: str,
) -> gpd.GeoDataFrame:
    """
    Load the scheduled stop metrics parquet for `date` and attach the
    organization columns for Geoportal through
    `standardize_operator_info_for_exports_new`.
    """
    stop_file = GTFS_DATA_DICT.rt_vs_schedule_tables.sched_stop_metrics
    parquet_path = f"{RT_SCHED_GCS}{stop_file}_{date}.parquet"

    raw_stops = gpd.read_parquet(
        parquet_path,
        storage_options={"token": credentials.token},
    )

    return standardize_operator_info_for_exports_new(raw_stops, date)

In [20]:
stops_new = create_stops_file_for_export_new(analysis_date)

### Move `.pipe(open_data_utils.standardize_operator_info_for_exports, one_date)` to the end of the pipeline?

In [21]:
def add_distance_to_state_highway(stops: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """
    Add `meters_to_shn`: the distance (in meters, rounded to 1 decimal)
    from each stop to the State Highway Network.

    For stops outside of CA the value is not that meaningful.
    A dissolve of the SHN takes a long time, so gpd.sjoin_nearest is used
    instead: it returns a distance column, and when a stop comes back with
    several rows only the closest one is kept.

    See discussion in:
    https://github.com/cal-itp/data-analyses/issues/1182
    https://github.com/cal-itp/data-analyses/issues/1321
    https://github.com/cal-itp/data-analyses/issues/1397
    """
    input_crs = stops.crs
    projected_crs = geography_utils.CA_NAD83Albers_m
    id_cols = ["schedule_gtfs_dataset_key", "stop_id"]

    highways = catalog.state_highway_network.read()[["District", "geometry"]].to_crs(
        projected_crs
    )
    stop_points = stops[id_cols + ["geometry"]].to_crs(projected_crs)

    nearest = gpd.sjoin_nearest(
        stop_points,
        highways,
        distance_col="meters_to_shn",
    )
    # Sorting by distance within each stop means the first row is the closest one.
    closest = (
        nearest.sort_values(id_cols + ["meters_to_shn"])
        .drop_duplicates(subset=id_cols)
        .reset_index(drop=True)
    )

    with_distance = pd.merge(
        stops,
        closest[id_cols + ["meters_to_shn"]],
        on=id_cols,
        how="inner",
    )
    with_distance = with_distance.assign(
        meters_to_shn=with_distance.meters_to_shn.round(1)
    )

    return with_distance.to_crs(input_crs)

In [22]:
def patch_previous_dates_new(
    current_stops: gpd.GeoDataFrame,
    current_date: str,
    published_operators_yaml: str = "../gtfs_funnel/published_operators.yml",
) -> gpd.GeoDataFrame:
    """
    Append stops from earlier dates for the operators listed in the
    published-operators yaml under a date other than `current_date`
    (the 10 or so operators without data for the current date), then add
    the distance to the State Highway Network.

    The patched-in stops are appended as read; no operator info is
    attached to them here.
    """
    with open(published_operators_yaml) as f:
        operators_by_date = yaml.safe_load(f)

    # Every yaml entry except the current date's, with dates as strings.
    earlier_dates = {
        str(date): operators
        for date, operators in operators_by_date.items()
        if str(date) != current_date
    }

    stop_file = GTFS_DATA_DICT.rt_vs_schedule_tables.sched_stop_metrics

    stops_by_date = [
        publish_utils.subset_table_from_previous_date(
            gcs_bucket=RT_SCHED_GCS,
            filename=stop_file,
            operator_and_dates_dict=earlier_dates,
            date=one_date,
            crosswalk_col="schedule_gtfs_dataset_key",
            data_type="gdf",
        )
        for one_date in earlier_dates
    ]

    patch_stops = pd.concat(stops_by_date, axis=0, ignore_index=True)
    all_stops = pd.concat([current_stops, patch_stops], axis=0, ignore_index=True)

    return add_distance_to_state_highway(all_stops)

In [23]:
def patch_previous_dates_og(
    current_stops: gpd.GeoDataFrame,
    current_date: str,
    published_operators_yaml: str = "../gtfs_funnel/published_operators.yml",
) -> gpd.GeoDataFrame:
    """
    Append stops from earlier dates for the operators listed in the
    published-operators yaml under a date other than `current_date`
    (the 10 or so operators without data for the current date), then add
    the distance to the State Highway Network.

    Each earlier date's stops go through
    `standardize_operator_info_for_exports_og` for that date before
    being appended.
    """
    with open(published_operators_yaml) as f:
        operators_by_date = yaml.safe_load(f)

    # Every yaml entry except the current date's, with dates as strings.
    earlier_dates = {
        str(date): operators
        for date, operators in operators_by_date.items()
        if str(date) != current_date
    }

    stop_file = GTFS_DATA_DICT.rt_vs_schedule_tables.sched_stop_metrics

    stops_by_date = [
        standardize_operator_info_for_exports_og(
            publish_utils.subset_table_from_previous_date(
                gcs_bucket=RT_SCHED_GCS,
                filename=stop_file,
                operator_and_dates_dict=earlier_dates,
                date=one_date,
                crosswalk_col="schedule_gtfs_dataset_key",
                data_type="gdf",
            ),
            one_date,
        )
        for one_date in earlier_dates
    ]

    patch_stops = pd.concat(stops_by_date, axis=0, ignore_index=True)
    all_stops = pd.concat([current_stops, patch_stops], axis=0, ignore_index=True)

    return add_distance_to_state_highway(all_stops)

In [24]:
stops_new_patch_new = patch_previous_dates_new(stops_new, analysis_date).pipe(
    standardize_operator_info_for_exports_og, analysis_date
)

In [25]:
len(stops_new_patch_new)

154089

In [26]:
stops_new_patch_og = patch_previous_dates_og(stops_new, analysis_date)

In [27]:
len(stops_new_patch_og)

160774

In [28]:
def finalize_export_df(df: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """
    Trim `df` to the columns that get exported, in export order, and
    rename them with `STANDARDIZED_COLUMNS_DICT`.

    Columns used only in our internal modeling are left out. Selecting
    the export columns raises KeyError if any of them is missing.
    """
    org_fields = ["source_record_id", "analysis_name"]
    stop_fields = [
        "stop_id",
        "stop_name",
        # GTFS stop-related metrics
        "n_routes",
        "route_ids_served",
        "route_types_served",
        "n_arrivals",
        "n_hours_in_service",
        # derived column
        "meters_to_shn",
    ]
    agency_fields = ["base64_url", "caltrans_district"]

    export_cols = [*org_fields, *stop_fields, *agency_fields, "geometry"]

    trimmed = df[export_cols].reindex(columns=export_cols)
    renamed = trimmed.rename(columns=STANDARDIZED_COLUMNS_DICT)

    return renamed.reset_index(drop=True)

In [29]:
# Internal column name -> published column name, applied by finalize_export_df.
# NOTE: `organization_name`, `portfolio_organization_name` and `analysis_name`
# all map to `agency`. A frame carrying more than one of them would end up
# with duplicate `agency` columns after the rename.
STANDARDIZED_COLUMNS_DICT = {
    "caltrans_district": "district_name",
    "organization_source_record_id": "org_id",
    "organization_name": "agency",
    "agency_name_primary": "agency_primary",
    "agency_name_secondary": "agency_secondary",
    "route_name_used": "route_name",
    "route_types_served": "routetypes",
    "meters_to_shn": "meters_to_ca_state_highway",
    "portfolio_organization_name": "agency",
    "analysis_name": "agency",
}

In [30]:
# Standardize info once again?
stops_new_patch_new2 = (stops_new_patch_new).pipe(standardize_operator_info_for_exports_new, analysis_date)

  crosswalk_input_merged = pd.merge(


In [31]:
stops_new_patch_new3 = finalize_export_df(stops_new_patch_new2)

In [32]:
stops_new_patch_og2 = (stops_new_patch_og).pipe(standardize_operator_info_for_exports_new, analysis_date)

In [33]:
stops_new_patch_og3 = finalize_export_df(stops_new_patch_og2)

In [34]:
len(stops_new_patch_og3)

153738

In [35]:
len(stops_new_patch_new3)

153738

In [37]:
len(ah_stops_df)

153738