# Area to redo portions of the report that need to be refactored

In [1]:
from datetime import datetime
from functools import cache
from pathlib import Path
from typing import Literal

import _sql_query
import geopandas as gpd
import pandas as pd
from calitp_data_analysis.gcs_geopandas import GCSGeoPandas

In [2]:
from shared_utils import (
    bq_utils,
    geo_utils,
    gtfs_utils_v2,
    portfolio_utils,
    publish_utils,
    rt_dates,
)

In [3]:
import publish_public_data
from calitp_data_analysis import geography_utils
from shared_utils import geo_utils, gtfs_utils_v2, portfolio_utils, publish_utils
from update_vars import GTFS_DATA_DICT, analysis_month, file_name, previous_month

In [4]:
from calitp_data_analysis import geography_utils, utils

In [5]:
import google.auth

credentials, project = google.auth.default()

In [6]:
from calitp_data_analysis.sql import get_engine
from calitp_data_analysis.tables import tbls

db_engine = get_engine()

In [7]:
# Notebook display settings: show up to 100 columns, render floats with two
# decimals, and lift the row / column-width limits (None = no limit).
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [8]:
# Project names; the staging project is the production name plus "-staging".
PROD_PROJECT = "cal-itp-data-infra"
STG_PROJECT = f"{PROD_PROJECT}-staging"
# NOTE(review): presumably a dataset inside the staging project — confirm.
STG_MART = "tiffany_mart_gtfs_rollup"
# Name of the month column (first day of the month).
MONTH_DATE_COL = "month_first_day"

## NTD Info

In [9]:
# Column-limited NTD agency query: California rows flagged `_is_current`.
# Plain string literal: the query has no placeholders to interpolate.
ntd_query_sql = """
        SELECT 
        number_of_state_counties,
        primary_uza_name,
        density,
        number_of_counties_with_service,
        state_admin_funds_expended,
        service_area_sq_miles,
        population,
        service_area_pop,
        subrecipient_type,
        primary_uza_code,
        reporter_type,
        organization_type,
        agency_name,
        voms_pt,
        voms_do,
        ntd_id,
        year
        FROM `cal-itp-data-infra-staging`.`mart_ntd`.`dim_annual_agency_information`
        WHERE state = 'CA' AND _is_current = TRUE
    """

In [10]:
# Same NTD table and filter as the column-limited query, but selecting every column.
# Plain string literal: the query has no placeholders to interpolate.
ntd_query_sql_all = """
        SELECT *
        FROM `cal-itp-data-infra-staging`.`mart_ntd`.`dim_annual_agency_information`
        WHERE state = 'CA' AND _is_current = TRUE
    """

In [11]:
type(ntd_query_sql)

str

In [44]:
def load_ntd(query: str) -> pd.DataFrame:
    """
    Run `query` through `db_engine` and collapse the result to one row per agency.

    The rows are ordered by every column (ascending, nulls last) and then
    grouped by `agency_name`; `GroupBy.first()` keeps, for each column, the
    first non-null value in that order. `agency_name` is returned as a
    regular column.
    """
    with db_engine.connect() as conn:
        raw = pd.read_sql(query, conn)

    sort_cols = list(raw.columns)
    ordered = raw.sort_values(by=sort_cols, na_position="last")
    per_agency = ordered.groupby("agency_name").first()
    return per_agency.reset_index()

In [13]:
# ntd_df = load_ntd(ntd_query_sql_all)

In [14]:
# ntd_df.sample()

In [15]:
# Column-limited query against the mobility mart providers table.
# Plain string literal: the query has no placeholders to interpolate.
mobility_query_sql = """
            SELECT
            agency_name,
            counties_served,
            hq_city,
            hq_county,
            is_public_entity,
            is_publicly_operating,
            funding_sources,
            on_demand_vehicles_at_max_service,
            vehicles_at_max_service
            FROM
            cal-itp-data-infra.mart_transit_database.dim_mobility_mart_providers  
            """

In [16]:
# Every column of the mobility mart providers table.
# Plain string literal: the query has no placeholders to interpolate.
mobility_query_sql_all = """
            SELECT *
            FROM
            cal-itp-data-infra.mart_transit_database.dim_mobility_mart_providers  
            """

In [17]:
def load_mobility(query: str) -> pd.DataFrame:
    """
    Run `query` through `db_engine` and collapse the result to one row per agency.

    Rows are ordered by on-demand vehicles, then vehicles, at max service
    (both descending) before grouping by `agency_name`; `GroupBy.first()`
    then keeps, for each column, the first non-null value in that order.
    `agency_name` is returned as a regular column.
    """
    with db_engine.connect() as conn:
        raw = pd.read_sql(query, conn)

    vehicle_cols = ["on_demand_vehicles_at_max_service", "vehicles_at_max_service"]
    ranked = raw.sort_values(by=vehicle_cols, ascending=False)
    return ranked.groupby("agency_name").first().reset_index()

In [18]:
# mobility_df = load_mobility(mobility_query_sql_all)

In [19]:
# mobility_df.sample()

In [47]:
def load_crosswalk() -> pd.DataFrame:
    """
    Load the GTFS `analysis_name` <-> NTD ID crosswalk.

    Queries `bridge_gtfs_analysis_name_x_ntd` through `db_engine`, keeps only
    rows where both `ntd_id` and `ntd_id_2022` are populated, and removes
    exact duplicate rows.

    Returns:
        DataFrame with the selected columns (analysis_name, county_name,
        caltrans_district, caltrans_district_name, ntd_id, ntd_id_2022)
        and a fresh 0..n-1 index.
    """
    crosswalk_query_sql = """
        SELECT 
        analysis_name,
        county_name,
        caltrans_district,
        caltrans_district_name,
        ntd_id,
        ntd_id_2022,
        FROM `cal-itp-data-infra-staging`.`tiffany_mart_transit_database`.`bridge_gtfs_analysis_name_x_ntd`
    """
    with db_engine.connect() as connection:
        df = pd.read_sql(crosswalk_query_sql, connection)

    return (
        df.dropna(subset=["ntd_id", "ntd_id_2022"])
        .drop_duplicates()
        # drop=True: otherwise the pre-filter row labels come back as a
        # meaningless "index" column in the result.
        .reset_index(drop=True)
    )

In [45]:
def merge_ntd_mobility(ntd_query: str, mobility_query: str) -> pd.DataFrame:
    """
    Merge NTD agency info with mobility providers and attach `analysis_name`.

    The NTD result (e.g. from `dim_annual_agency_information`) and the
    mobility result (e.g. from `dim_mobility_mart_providers`) are inner-joined
    on `agency_name`, so agencies missing from either source are dropped, and
    the result is deduped to one row per agency. The crosswalk is then
    left-joined on `ntd_id` == `ntd_id_2022` to add `analysis_name`.

    Args:
        ntd_query: SQL passed to `load_ntd`.
        mobility_query: SQL passed to `load_mobility`.

    Returns:
        The merged DataFrame, including the crosswalk's `analysis_name` and
        `ntd_id_2022` columns (null where no crosswalk row matches). An NTD ID
        that maps to several distinct analysis names yields one row per name.
    """
    ntd = load_ntd(ntd_query)
    mobility = load_mobility(mobility_query)
    # Only these two crosswalk columns are used. The crosswalk is deduped on
    # all of its columns, so the same (analysis_name, ntd_id_2022) pair can
    # still repeat once the others are dropped; dedupe again so the left merge
    # below does not emit identical rows.
    crosswalk = load_crosswalk()[["analysis_name", "ntd_id_2022"]].drop_duplicates()

    m1 = pd.merge(
        mobility,
        ntd,
        how="inner",
        on="agency_name"
    )

    m1 = m1.drop_duplicates(
        subset="agency_name"
    ).reset_index(
        drop=True
    )

    # Wherever possible, allow nullable integers. These columns are integers, but can be
    # missing if we don't find corresponding NTD info
    integrify_cols = [
        "number_of_state_counties", "number_of_counties_with_service",
        "service_area_sq_miles", "service_area_pop",
        "on_demand_vehicles_at_max_service", "vehicles_at_max_service",
        "voms_pt", "voms_do", "year",
    ]
    m1[integrify_cols] = m1[integrify_cols].astype("Int64")

    # Merge with crosswalk to get analysis_name
    m1 = pd.merge(
        m1,
        crosswalk,
        left_on=["ntd_id"],
        right_on=["ntd_id_2022"],
        how="left",
    )
    return m1

In [48]:
# Build the merged NTD + mobility-provider table from the column-limited queries.
df = merge_ntd_mobility(
    ntd_query=ntd_query_sql,
    mobility_query=mobility_query_sql,
)

In [49]:
df.shape

(226, 27)

In [51]:
df.sample().T

Unnamed: 0,30
agency_name,City of Bellflower
counties_served,Los Angeles
hq_city,Bellflower
hq_county,Los Angeles
is_public_entity,True
is_publicly_operating,True
funding_sources,5309
on_demand_vehicles_at_max_service,
vehicles_at_max_service,4
number_of_state_counties,


In [52]:
# Best-effort formatting of the first row's service area and population.
# NOTE(review): `formatted` is not defined in the visible cells — confirm
# where it is supposed to come from.
try:
    service_area = formatted(int(df.service_area_sq_miles.values[0]))
    service_pop = formatted(int(df.service_area_pop.values[0]))
except Exception as err:
    # Still best-effort (e.g. a missing value cannot be cast to int), but
    # report the failure instead of silently swallowing it, and no longer
    # trap KeyboardInterrupt / SystemExit as the bare `except:` did.
    print(f"Could not format service area / population: {err!r}")

### Second merge attempt using `ntd_id` instead.

In [36]:
# m2 = pd.merge(ntd_crosswalk, df, left_on = ["ntd_id"], right_on = ["ntd_id"], how = "outer", indicator = True)

In [37]:
# m2._merge.value_counts()

### Third merge attempt using `analysis_name`

In [38]:
ntd_crosswalk.columns

Index(['index', 'analysis_name', 'county_name', 'caltrans_district',
       'caltrans_district_name', 'ntd_id', 'ntd_id_2022'],
      dtype='object')

In [39]:
df.columns

Index(['agency_name', 'counties_served', 'hq_city', 'hq_county',
       'is_public_entity', 'is_publicly_operating', 'funding_sources',
       'on_demand_vehicles_at_max_service', 'vehicles_at_max_service',
       'number_of_state_counties', 'primary_uza_name', 'density',
       'number_of_counties_with_service', 'state_admin_funds_expended',
       'service_area_sq_miles', 'population', 'service_area_pop',
       'subrecipient_type', 'primary_uza_code', 'reporter_type',
       'organization_type', 'voms_pt', 'voms_do', 'ntd_id', 'year'],
      dtype='object')

In [40]:
# Outer-join the crosswalk to the merged table by name; the `_merge` indicator
# column records which side(s) each row came from.
# NOTE(review): `ntd_crosswalk` is not assigned in the visible cells —
# presumably the result of load_crosswalk(); confirm before re-running.
m3 = pd.merge(
    ntd_crosswalk,
    df,
    how="outer",
    left_on=["analysis_name"],
    right_on=["agency_name"],
    indicator=True,
)

In [41]:
m3._merge.value_counts()

both          121
right_only    107
left_only      21
Name: _merge, dtype: int64