# Area to redo portions of the report that need to be refactored

In [16]:
from datetime import datetime
from functools import cache
from pathlib import Path
from typing import Literal

import _sql_query
import geopandas as gpd
import numpy as np
import pandas as pd
from calitp_data_analysis.gcs_geopandas import GCSGeoPandas

In [2]:
import _portfolio_charts

In [3]:
from shared_utils import (
    bq_utils,
    geo_utils,
    gtfs_utils_v2,
    portfolio_utils,
    publish_utils,
    rt_dates,
)

In [4]:
import publish_public_data
from calitp_data_analysis import geography_utils
from shared_utils import geo_utils, gtfs_utils_v2, portfolio_utils, publish_utils
from update_vars import GTFS_DATA_DICT, analysis_month, file_name, previous_month

In [5]:
from calitp_data_analysis import geography_utils, utils

In [6]:
import google.auth

credentials, project = google.auth.default()

In [7]:
from calitp_data_analysis.sql import get_engine
from calitp_data_analysis.tables import tbls

db_engine = get_engine()

In [25]:
from omegaconf import OmegaConf

readable_dict = OmegaConf.load("new_readable.yml")

In [8]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [9]:
PROD_PROJECT = "cal-itp-data-infra"
STG_PROJECT = "cal-itp-data-infra-staging"
STG_MART = "tiffany_mart_gtfs_rollup"
MONTH_DATE_COL = "month_first_day"

## Route Length Chart

In [10]:
GCS_PATH = f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}processed/"

In [38]:
fct_monthly_routes_url = f"{GCS_PATH}{GTFS_DATA_DICT.gtfs_digest_rollup.route_map}processed/_{file_name}.parquet"

In [32]:
analysis_name = 'City and County of San Francisco'

In [39]:
fct_monthly_route_df = gpd.read_parquet(fct_monthly_routes_url,
                                        filters=[[("Analysis Name", "==", analysis_name)]],
                                        storage_options={"token": credentials.token}).reset_index()

FileNotFoundError: calitp-analytics-data/data-analyses/gtfs_digest/processed/fct_monthly_routesprocessed/_2025_11.parquet

In [35]:
fct_monthly_route_df

Unnamed: 0,index,Name,Year,Month,Month First Day,Route Name,Route Type,Geometry,Analysis Name,Route Length Miles
0,1103,Bay Area 511 Golden Gate Park Shuttle Schedule,2025,11,2025-11-01,GGPS__GGPS Golden Gate Park Shuttle,3,"LINESTRING (-122.47684 37.77084, -122.47659 37.77091, -122.47643 37.77096, -122.47624 37.77100, -122.47607 37.77104, -122.47591 37.77108, -122.47578 37.77113, -122.47561 37.77116, -122.47541 37.77116, -122.47529 37.77112, -122.47516 37.77110, -122.47504 37.77108, -122.47486 37.77107, -122.47452 37.77105, -122.47419 37.77103, -122.47408 37.77106, -122.47399 37.77109, -122.47390 37.77123, -122.47352 37.77110, -122.47302 37.77093, -122.47278 37.77085, -122.47258 37.77079, -122.47244 37.77075, -122.47224 37.77072, -122.47207 37.77071, -122.47195 37.77072, -122.47182 37.77074, -122.47167 37.77079, -122.47143 37.77088, -122.47107 37.77106, -122.47055 37.77137, -122.47055 37.77137, -122.47027 37.77156, -122.46951 37.77208, -122.46927 37.77222, -122.46903 37.77232, -122.46881 37.77239, -122.46844 37.77247, -122.46822 37.77250, -122.46822 37.77250, -122.46808 37.77251, -122.46744 37.77254, -122.46654 37.77257, -122.46661 37.77247, -122.46664 37.77223, -122.46672 37.77207, -122.46683 37.77194, -122.46891 37.77046, -122.46891 37.77046, -122.46915 37.77029, -122.46932 37.77013, -122.46945 37.76998, -122.46958 37.76973, -122.46901 37.76952, -122.46873 37.76944, -122.46873 37.76944, -122.46811 37.76925, -122.46806 37.76935, -122.46792 37.76947, -122.46617 37.77072, -122.46617 37.77072, -122.46610 37.77079, -122.46606 37.77087, -122.46604 37.77097, -122.46604 37.77108, -122.46607 37.77118, -122.46646 37.77208, -122.46648 37.77217, -122.46650 37.77229, -122.46649 37.77238, -122.46648 37.77254, -122.46617 37.77254, -122.46602 37.77255, -122.46589 37.77254, -122.46575 37.77252, -122.46571 37.77253, -122.46561 37.77253, -122.46539 37.77256, -122.46527 37.77257, -122.46505 37.77253, -122.46440 37.77240, -122.46326 37.77208, -122.46225 37.77182, -122.46225 37.77182, -122.45897 37.77128, -122.45665 37.77096, -122.45514 37.77085, -122.45514 37.77085, -122.45503 37.77092, -122.45476 37.77106, -122.45406 37.77135, -122.45404 37.77127, -122.45404 37.77127, -122.45401 37.77110, -122.45393 37.77101, -122.45337 37.76825, -122.45325 37.76825, -122.45172 37.76845, -122.45170 37.76845, -122.45189 37.76938, -122.45295 37.76925)",City and County of San Francisco,2.39


In [14]:

def find_percentiles(df: pd.DataFrame) -> pd.DataFrame:
    """
    Bin 'Route Length Miles' into percentile categories and merge
    human-readable group labels. Zeros are labeled 'Zero'.
    """
    col = 'Route Length Miles'

    # Compute quartiles once
    p25, p50, p75 = df[col].quantile([0.25, 0.50, 0.75])

    # Build bins: (-inf, 0], (0, p25], (p25, p50], (p50, p75], (p75, inf)
    bins = [-np.inf, 0, p25, p50, p75, np.inf]
    labels = ['Zero', '25th percentile', '50th percentile', '< 75th percentile', '> 75th percentile']

    out = df.copy()
    out['percentile_cat'] = pd.cut(
        out[col],
        bins=bins,
        labels=labels,
        right=True,                # include upper bound in each interval
        include_lowest=True        # include lowest value
    )

    # Build concise label text using the computed thresholds
    percentile_df = pd.DataFrame({
        'percentile_cat': labels[1:],  # exclude 'Zero' from the mapping table
        'Route Length Miles Percentile Group': [
            f"25 percentile (<= {p25:.1f} miles)",
            f"26-50th percentile ({p25:.1f}-{p50:.1f} miles)",
            f"51-75th percentile ({p50:.1f}-{p75:.1f} miles)",
            f"76th percentile (>= {p75:.1f} miles)",
        ],
    })

    # Merge and drop 'Geometry' if present
    m1 = out.merge(percentile_df, on='percentile_cat', how='left')
    if 'Geometry' in m1.columns:
        m1 = m1.drop(columns=['Geometry'])

    return m1

In [17]:
percentiles_df = find_percentiles(df = fct_monthly_route_df)

In [18]:
def reshape_percentile_groups(df: pd.DataFrame) -> pd.DataFrame:
    """
    Total number of routes by each
    the route_length_miles_percentile groups.
    """
    agg1 = (
        df.groupby(["Route Length Miles Percentile Group",])
        .agg({"Route Name": "nunique"})
        .reset_index()
    ).rename(
        columns={"Route Name": "Total Routes"}
    )
    return agg1

In [19]:
percentiles_df2 = reshape_percentile_groups(percentiles_df)

In [20]:
percentiles_df2

Unnamed: 0,Route Length Miles Percentile Group,Total Routes
0,25 percentile (<= 12.5 miles),4
1,26-50th percentile (12.5-18.8 miles),4
2,51-75th percentile (18.8-24.6 miles),3
3,76th percentile (>= 24.6 miles),4


In [22]:
_portfolio_charts.configure_chart(chart,
                                  width = 400,
                                  height = 250,
                                  title = "",
                                  subtitle = "")

In [23]:
def create_route_lengths(df: pd.DataFrame):
    df2 = find_percentiles(df)
    df3 = reshape_percentile_groups(df2)
    
    chart_dict = readable_dict.route_percentiles

    chart = _portfolio_charts.bar_chart(
    df = df3,
    x_col = "Route Length Miles Percentile Group",
    y_col = "Total Routes",
    color_col = "Route Length Miles Percentile Group",
    color_scheme = [*chart_dict.colors],
    tooltip_cols = list(chart_dict.tooltip),
    date_format = "",
    y_ticks = chart_dict.ticks,
)
    
    chart = (
        _portfolio_charts.configure_chart(
            chart,
            width=400,
            height=250,
            title=chart_dict.title,
            subtitle=chart_dict.subtitle,
        )
    )
    return chart

In [26]:
create_route_lengths(fct_monthly_route_df)

## NTD Info

In [None]:
ntd_query_sql = f"""
        SELECT 
        number_of_state_counties,
        primary_uza_name,
        density,
        number_of_counties_with_service,
        state_admin_funds_expended,
        service_area_sq_miles,
        population,
        service_area_pop,
        subrecipient_type,
        primary_uza_code,
        reporter_type,
        organization_type,
        agency_name,
        voms_pt,
        voms_do,
        ntd_id,
        year
        FROM `cal-itp-data-infra-staging`.`mart_ntd`.`dim_annual_agency_information`
        WHERE state = 'CA' AND _is_current = TRUE
    """

In [None]:
ntd_query_sql_all = f"""
        SELECT *
        FROM `cal-itp-data-infra-staging`.`mart_ntd`.`dim_annual_agency_information`
        WHERE state = 'CA' AND _is_current = TRUE
    """

In [None]:
type(ntd_query_sql)

In [None]:
def load_ntd(query:str)->pd.DataFrame:
    with db_engine.connect() as connection:
        df = pd.read_sql(query, connection)
    df2 = df.sort_values(by=df.columns.tolist(), na_position="last")
    df3 = df2.groupby("agency_name").first().reset_index()
    return df3

In [None]:
# ntd_df = load_ntd(ntd_query_sql_all)

In [None]:
# ntd_df.sample()

In [None]:
mobility_query_sql = f"""
            SELECT
            agency_name,
            counties_served,
            hq_city,
            hq_county,
            is_public_entity,
            is_publicly_operating,
            funding_sources,
            on_demand_vehicles_at_max_service,
            vehicles_at_max_service
            FROM
            cal-itp-data-infra.mart_transit_database.dim_mobility_mart_providers  
            """

In [None]:
mobility_query_sql_all = f"""
            SELECT *
            FROM
            cal-itp-data-infra.mart_transit_database.dim_mobility_mart_providers  
            """

In [None]:
def load_mobility(query:str)->pd.DataFrame:
    with db_engine.connect() as connection:
        df = pd.read_sql(query, connection)
    df2 = df.sort_values(
        by=["on_demand_vehicles_at_max_service","vehicles_at_max_service"], 
        ascending = [False, False]
    )
    df3 = df2.groupby('agency_name').first().reset_index()
    return df3

In [None]:
# mobility_df = load_ntd(mobility_query_sql_all)

In [None]:
# mobility_df.sample()

In [None]:
def load_crosswalk()->pd.DataFrame:
    crosswalk_query_sql = f"""
        SELECT 
        analysis_name,
        county_name,
        caltrans_district,
        caltrans_district_name,
        ntd_id,
        ntd_id_2022,
        FROM `cal-itp-data-infra-staging`.`tiffany_mart_transit_database`.`bridge_gtfs_analysis_name_x_ntd`
    """
    with db_engine.connect() as connection:
        df = pd.read_sql(crosswalk_query_sql, connection)
    df = (df
        .dropna(subset = ["ntd_id","ntd_id_2022"])
        .drop_duplicates()
        .reset_index()
         )
    return df

In [None]:
def merge_ntd_mobility(ntd_query:str,
                      mobility_query:str)->pd.DataFrame:
    """
    Merge NTD (dim_annual_ntd_agency_information) with 
    mobility providers (dim_mobility_mart_providers)
    and dedupe and keep 1 row per agency.
    """
    ntd = load_ntd(ntd_query)
    mobility = load_mobility(mobility_query)
    crosswalk = load_crosswalk()[["analysis_name","ntd_id_2022"]]
    m1 = pd.merge(
        mobility,
        ntd,
        how="inner",
        on="agency_name"
    )

    m1 = m1.drop_duplicates(
        subset="agency_name"
    ).reset_index(
        drop=True
    )
    
    # Wherever possible, allow nullable integers. These columns are integers, but can be
    # missing if we don't find corresponding NTD info
    integrify_cols = [
        "number_of_state_counties", "number_of_counties_with_service", 
        "service_area_sq_miles", "service_area_pop",
        "on_demand_vehicles_at_max_service", "vehicles_at_max_service",
        "voms_pt", "voms_do", "year",
    ]
    m1[integrify_cols] = m1[integrify_cols].astype("Int64")

    # Merge with crosswalk to get analysis_name
    m1 = pd.merge(m1, crosswalk,  
                  left_on = ["ntd_id"],
                  right_on = ["ntd_id_2022"], 
                  how = "left")
    return m1

In [None]:
df = merge_ntd_mobility(ntd_query_sql,
                       mobility_query_sql)

In [None]:
df.shape

In [None]:
df.sample().T

In [None]:
try:
    service_area = formatted(int(df.service_area_sq_miles.values[0]))
    service_pop = formatted(int(df.service_area_pop.values[0]))
except:
    pass

### Second merge attempt using `ntd_id` instead.

In [None]:
# m2 = pd.merge(ntd_crosswalk, df, left_on = ["ntd_id"], right_on = ["ntd_id"], how = "outer", indicator = True)

In [None]:
# m2._merge.value_counts()

### Third merge attempt using `analysis_name`

In [None]:
ntd_crosswalk.columns

In [None]:
df.columns

In [None]:
m3 = pd.merge(ntd_crosswalk, df, left_on = ["analysis_name"], right_on = ["agency_name"], how = "outer", indicator = True)

In [None]:
m3._merge.value_counts()