# Area to redo portions of the report that need to be refactored

In [1]:
from datetime import datetime
from functools import cache
from pathlib import Path
from typing import Literal

import _sql_query
import geopandas as gpd
import pandas as pd
from calitp_data_analysis.gcs_geopandas import GCSGeoPandas

In [2]:
import _portfolio_charts

In [3]:
from shared_utils import (
    bq_utils,
    geo_utils,
    gtfs_utils_v2,
    portfolio_utils,
    publish_utils,
    rt_dates,
)

In [4]:
import publish_public_data
from calitp_data_analysis import geography_utils
from shared_utils import geo_utils, gtfs_utils_v2, portfolio_utils, publish_utils
from update_vars import GTFS_DATA_DICT, analysis_month, file_name, previous_month

In [5]:
from calitp_data_analysis import geography_utils, utils

In [6]:
import google.auth

# Application-default Google credentials. `credentials.token` is passed to
# `gpd.read_parquet` via `storage_options` further down; `project` is not used
# in the visible cells.
credentials, project = google.auth.default()

In [7]:
from calitp_data_analysis.sql import get_engine
from calitp_data_analysis.tables import tbls

# Shared engine; the load_* helpers below open their connections from it.
db_engine = get_engine()

OAuthError: ('Error code invalid_grant: Refresh token has expired', '{"error":"invalid_grant","error_description":"Refresh token has expired"}')

In [None]:
# Notebook display settings: up to 100 columns, no row limit, untruncated
# cell text, and floats rendered with two decimals.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [None]:
# Project / dataset / column identifiers for this scratch area.
# NOTE(review): none of these constants is referenced in the visible cells; the
# queries below hard-code their project and dataset names instead. Confirm
# whether the queries should use these.
PROD_PROJECT = "cal-itp-data-infra"
STG_PROJECT = "cal-itp-data-infra-staging"
STG_MART = "tiffany_mart_gtfs_rollup"
MONTH_DATE_COL = "month_first_day"

## Route Length Chart

In [None]:
# Prefix for processed digest files: the configured DIGEST_GCS path + "processed/".
GCS_PATH = f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}processed/"

In [None]:
# Parquet location of the monthly route map:
# <GCS_PATH><configured route_map name>_<file_name>.parquet
fct_monthly_routes_url = f"{GCS_PATH}{GTFS_DATA_DICT.gtfs_digest_rollup.route_map}_{file_name}.parquet"

In [None]:
# Agency whose rows are loaded below (matched against the "Analysis Name" column).
analysis_name = 'Yolo County Transportation District'

In [None]:
# Read only the rows for `analysis_name` from the monthly routes parquet, then
# move the stored index into an ordinary column with reset_index().
fct_monthly_route_df = gpd.read_parquet(
    fct_monthly_routes_url,
    filters=[[("Analysis Name", "==", analysis_name)]],
    storage_options={"token": credentials.token},
).reset_index()

In [None]:

def find_percentiles(df: pd.DataFrame) -> pd.DataFrame:
    """
    Bin 'Route Length Miles' into quartile categories and attach
    human-readable group labels.

    The 25th/50th/75th percentiles are computed over every row of `df`
    (zeros included). Each row goes into the lowest bin whose upper bound
    it does not exceed:
    (-inf, 0], (0, p25], (p25, p50], (p50, p75], (p75, inf).
    Repeated thresholds (e.g. p25 == 0 when many routes have zero length)
    are tolerated; the bins between equal thresholds are simply empty.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a numeric 'Route Length Miles' column. Not modified.

    Returns
    -------
    pd.DataFrame
        Copy of `df` with a fresh 0..n-1 index and two extra columns:
        - 'percentile_cat': bin label; 'Zero' for lengths <= 0 and missing
          where the length is missing.
        - 'Route Length Miles Percentile Group': display label including
          the mile thresholds; missing for 'Zero' rows and for missing
          lengths.
        A 'Geometry' column, if present, is dropped.
    """
    col = 'Route Length Miles'

    # Compute quartiles once
    p25, p50, p75 = df[col].quantile([0.25, 0.50, 0.75])

    labels = ['Zero', '25th percentile', '50th percentile', '< 75th percentile', '> 75th percentile']

    out = df.copy()
    lengths = out[col]

    # Assign labels from the highest bin down so that each row ends up in the
    # lowest bin it qualifies for. Unlike pd.cut, this does not fail when two
    # thresholds are equal. Missing lengths match no comparison and stay NaN.
    category = pd.Series(index=out.index, dtype="object")
    category.loc[lengths > p75] = labels[4]
    for upper_bound, label in zip([p75, p50, p25, 0], reversed(labels[:4])):
        category.loc[lengths <= upper_bound] = label

    # Keep the ordered categorical dtype that pd.cut would have produced.
    out['percentile_cat'] = pd.Categorical(category, categories=labels, ordered=True)

    # Build concise label text using the computed thresholds
    percentile_df = pd.DataFrame({
        'percentile_cat': labels[1:],  # exclude 'Zero' from the mapping table
        'Route Length Miles Percentile Group': [
            f"25 percentile (<= {p25:.1f} miles)",
            f"26-50th percentile ({p25:.1f}-{p50:.1f} miles)",
            f"51-75th percentile ({p50:.1f}-{p75:.1f} miles)",
            f"76th percentile (>= {p75:.1f} miles)",
        ],
    })

    # Left merge keeps every input row, in order; 'Zero' rows get no group label.
    m1 = out.merge(percentile_df, on='percentile_cat', how='left')
    if 'Geometry' in m1.columns:
        m1 = m1.drop(columns=['Geometry'])

    return m1


In [None]:
# Categorize the loaded routes by route-length quartile.
percentiles_df = find_percentiles(df = fct_monthly_route_df)

In [None]:
def reshape_percentile_groups(df: pd.DataFrame) -> pd.DataFrame:
    """
    Count the distinct routes in each route-length percentile group.

    Returns one row per 'Route Length Miles Percentile Group' value, with
    the number of unique 'Route Name' values in a 'Total Routes' column.
    Rows whose group label is missing are not counted.
    """
    group_col = "Route Length Miles Percentile Group"
    routes_per_group = df.groupby([group_col])["Route Name"].nunique()
    return routes_per_group.rename("Total Routes").reset_index()

In [None]:
# Route counts per percentile group; this is the table the bar chart plots.
percentiles_df2 = reshape_percentile_groups(percentiles_df)

In [None]:
# Inspect the column names that the chart call below refers to.
percentiles_df2.columns

In [None]:
# Bar chart of total routes per route-length group; the group column is used
# for both the x-axis and the color encoding.
chart = _portfolio_charts.bar_chart(
    df=percentiles_df2,
    x_col="Route Length Miles Percentile Group",
    y_col="Total Routes",
    color_col="Route Length Miles Percentile Group",
    color_scheme=["#dd217d", "#fcb40e", "#ccbb44", "#5b8efd"],
    tooltip_cols=[],
    date_format="",
    y_ticks=[0, 5, 10, 15, 20],
)

In [None]:
# Apply sizing to the chart (400 x 250) with no title or subtitle; as the last
# expression in the cell, the result is displayed.
_portfolio_charts.configure_chart(
    chart,
    width=400,
    height=250,
    title="",
    subtitle="",
)

## NTD Info

In [None]:
# Columns pulled from the NTD annual agency information table for current
# California records. Plain string: the query has no interpolated values.
ntd_query_sql = """
        SELECT 
        number_of_state_counties,
        primary_uza_name,
        density,
        number_of_counties_with_service,
        state_admin_funds_expended,
        service_area_sq_miles,
        population,
        service_area_pop,
        subrecipient_type,
        primary_uza_code,
        reporter_type,
        organization_type,
        agency_name,
        voms_pt,
        voms_do,
        ntd_id,
        year
        FROM `cal-itp-data-infra-staging`.`mart_ntd`.`dim_annual_agency_information`
        WHERE state = 'CA' AND _is_current = TRUE
    """

In [None]:
# Same table and filter as `ntd_query_sql`, but selecting every column.
# Plain string: the query has no interpolated values.
ntd_query_sql_all = """
        SELECT *
        FROM `cal-itp-data-infra-staging`.`mart_ntd`.`dim_annual_agency_information`
        WHERE state = 'CA' AND _is_current = TRUE
    """

In [None]:
# Sanity check that the query is a plain string before handing it to load_ntd.
type(ntd_query_sql)

In [None]:
def load_ntd(query: str) -> pd.DataFrame:
    """
    Run `query` against `db_engine` and collapse the result to one row
    per agency_name.

    Rows are sorted ascending on every column (nulls last). Within each
    agency, each column then takes its first non-null value in that
    order, so a returned row can combine values from several source rows.
    Rows with a missing agency_name are dropped by the grouping.
    """
    with db_engine.connect() as conn:
        raw = pd.read_sql(query, conn)

    ordered = raw.sort_values(by=list(raw.columns), na_position="last")
    one_per_agency = ordered.groupby("agency_name").first()
    return one_per_agency.reset_index()

In [None]:
# ntd_df = load_ntd(ntd_query_sql_all)

In [None]:
# ntd_df.sample()

In [None]:
# Columns pulled from the mobility mart providers table.
# Plain string: the query has no interpolated values.
mobility_query_sql = """
            SELECT
            agency_name,
            counties_served,
            hq_city,
            hq_county,
            is_public_entity,
            is_publicly_operating,
            funding_sources,
            on_demand_vehicles_at_max_service,
            vehicles_at_max_service
            FROM
            cal-itp-data-infra.mart_transit_database.dim_mobility_mart_providers  
            """

In [None]:
# Same table as `mobility_query_sql`, but selecting every column.
# Plain string: the query has no interpolated values.
mobility_query_sql_all = """
            SELECT *
            FROM
            cal-itp-data-infra.mart_transit_database.dim_mobility_mart_providers  
            """

In [None]:
def load_mobility(query: str) -> pd.DataFrame:
    """
    Run `query` against `db_engine` and collapse the result to one row
    per agency_name.

    Rows are sorted by on_demand_vehicles_at_max_service and then
    vehicles_at_max_service, both descending. Within each agency, each
    column then takes its first non-null value in that order.
    """
    with db_engine.connect() as conn:
        raw = pd.read_sql(query, conn)

    vehicle_cols = ["on_demand_vehicles_at_max_service", "vehicles_at_max_service"]
    ordered = raw.sort_values(by=vehicle_cols, ascending=[False, False])
    one_per_agency = ordered.groupby('agency_name').first()
    return one_per_agency.reset_index()

In [None]:
# mobility_df = load_mobility(mobility_query_sql_all)

In [None]:
# mobility_df.sample()

In [None]:
def load_crosswalk() -> pd.DataFrame:
    """
    Load the GTFS analysis-name x NTD crosswalk from the staging bridge table.

    Rows missing either `ntd_id` or `ntd_id_2022` are dropped, exact
    duplicate rows are removed, and the index is renumbered from 0.

    Returns
    -------
    pd.DataFrame
        Columns: analysis_name, county_name, caltrans_district,
        caltrans_district_name, ntd_id, ntd_id_2022.
    """
    # Plain string: the query has no interpolated values.
    crosswalk_query_sql = """
        SELECT 
        analysis_name,
        county_name,
        caltrans_district,
        caltrans_district_name,
        ntd_id,
        ntd_id_2022,
        FROM `cal-itp-data-infra-staging`.`tiffany_mart_transit_database`.`bridge_gtfs_analysis_name_x_ntd`
    """
    with db_engine.connect() as connection:
        df = pd.read_sql(crosswalk_query_sql, connection)

    # drop=True: discard the pre-filter row numbers instead of leaking them
    # into the result as a stray "index" column.
    df = (
        df.dropna(subset=["ntd_id", "ntd_id_2022"])
        .drop_duplicates()
        .reset_index(drop=True)
    )
    return df

In [None]:
def merge_ntd_mobility(ntd_query: str,
                       mobility_query: str) -> pd.DataFrame:
    """
    Merge NTD agency information (dim_annual_agency_information) with
    mobility providers (dim_mobility_mart_providers), keep one row per
    agency, and attach `analysis_name` from the crosswalk.

    Parameters
    ----------
    ntd_query : str
        SQL passed to `load_ntd`.
    mobility_query : str
        SQL passed to `load_mobility`.

    Returns
    -------
    pd.DataFrame
        Agencies present in both sources (inner join on agency_name),
        left-joined to the crosswalk on ntd_id == ntd_id_2022. Agencies
        without a crosswalk match keep a missing analysis_name. An agency
        appears more than once only if its NTD ID maps to several distinct
        analysis names.
    """
    ntd = load_ntd(ntd_query)
    mobility = load_mobility(mobility_query)

    # The crosswalk is de-duplicated on all of its columns, so narrowing it to
    # these two can leave repeated (analysis_name, ntd_id_2022) pairs. Drop
    # them so the left merge below does not multiply agency rows.
    crosswalk = load_crosswalk()[["analysis_name", "ntd_id_2022"]].drop_duplicates()

    m1 = pd.merge(
        mobility,
        ntd,
        how="inner",
        on="agency_name"
    )

    m1 = m1.drop_duplicates(
        subset="agency_name"
    ).reset_index(
        drop=True
    )

    # Wherever possible, allow nullable integers. These columns are integers, but can be
    # missing if we don't find corresponding NTD info
    integrify_cols = [
        "number_of_state_counties", "number_of_counties_with_service",
        "service_area_sq_miles", "service_area_pop",
        "on_demand_vehicles_at_max_service", "vehicles_at_max_service",
        "voms_pt", "voms_do", "year",
    ]
    m1[integrify_cols] = m1[integrify_cols].astype("Int64")

    # Merge with crosswalk to get analysis_name
    m1 = pd.merge(m1, crosswalk,
                  left_on=["ntd_id"],
                  right_on=["ntd_id_2022"],
                  how="left")
    return m1

In [None]:
# Combined NTD + mobility-provider table, built from the column-limited queries.
df = merge_ntd_mobility(ntd_query_sql, mobility_query_sql)

In [None]:
# Row / column count of the merged table.
df.shape

In [None]:
# Show one random row, transposed so each column prints on its own line.
df.sample().T

In [None]:
# Format the service area and service population from the first row of `df`.
# Best effort: on failure the affected name stays None and the reason is
# printed, instead of the error being silently discarded and the names left
# undefined for later cells.
# NOTE(review): `formatted` is not defined in the visible cells; confirm it is
# defined or imported before this cell runs.
# NOTE(review): this reads row 0 of the all-agency table, not the row for
# `analysis_name`; confirm that is intended.
service_area = None
service_pop = None
try:
    service_area = formatted(int(df.service_area_sq_miles.values[0]))
    service_pop = formatted(int(df.service_area_pop.values[0]))
except Exception as err:  # deliberately broad: this cell must not stop the notebook
    print(f"Could not format service area / population: {err!r}")

### Second merge attempt using `ntd_id` instead.

In [None]:
# m2 = pd.merge(ntd_crosswalk, df, left_on = ["ntd_id"], right_on = ["ntd_id"], how = "outer", indicator = True)

In [None]:
# m2._merge.value_counts()

### Third merge attempt using `analysis_name`

In [None]:
# Inspect the crosswalk's columns before merging.
# NOTE(review): `ntd_crosswalk` is not assigned in the visible cells; confirm
# where it is defined (possibly the result of load_crosswalk()).
ntd_crosswalk.columns

In [None]:
# Inspect the merged NTD/mobility table's columns before merging.
df.columns

In [None]:
# Outer merge of the crosswalk's analysis_name against the merged table's
# agency_name; the `_merge` indicator column records which side each row came from.
m3 = pd.merge(
    ntd_crosswalk,
    df,
    left_on=["analysis_name"],
    right_on=["agency_name"],
    how="outer",
    indicator=True,
)

In [None]:
# Count rows matched on both sides versus left-only / right-only.
m3._merge.value_counts()