In [1]:
pip install shared_utils

Note: you may need to restart the kernel to use updated packages.


In [2]:
import sys

# Directory (relative to this notebook) that holds gtfs_key_ntd_crosswalk,
# which is imported in the next cell.
AHSC_GRANT_DIR = '../ahsc_grant'

# Only register the directory once so re-running this cell does not keep
# appending duplicate entries to sys.path.
if AHSC_GRANT_DIR not in sys.path:
    sys.path.append(AHSC_GRANT_DIR)

In [3]:
import pandas as pd 
import geopandas as gpd
import google.auth
import os
import gcsfs
import requests
from calitp_data_analysis.sql import get_engine
from shared_utils import schedule_rt_utils 
from gtfs_key_ntd_crosswalk import filter_to_valid_dates
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Engine used by the SQL query cells below.
db_engine = get_engine()
# Default Google credentials and project for this environment.
credentials, project = google.auth.default()
# GCS filesystem handle used to write the parquet outputs.
fs = gcsfs.GCSFileSystem()

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [4]:
# Root GCS location under which this notebook writes its parquet outputs.
GCS_FILE_PATH = 'gs://calitp-analytics-data/data-analyses'

# Date (YYYY-MM-DD) passed to filter_to_valid_dates for the organization records.
analysis_date = "2025-10-16"

In [5]:
# Load dim_organizations, localize its validity timestamps, and keep only the
# rows flagged as publicly operating and current.
query = """
        SELECT
            key, name, source_record_id, organization_type, ntd_id, ntd_id_2022, ntd_agency_info_key, 
            public_currently_operating, _is_current, _valid_from, _valid_to
        FROM 
            cal-itp-data-infra.mart_transit_database.dim_organizations
    """

with db_engine.connect() as connection:
    dim_orgs = (
        pd.read_sql(query, connection)
        # localize the validity-window timestamps
        .pipe(schedule_rt_utils.localize_timestamp_col, ["_valid_from", "_valid_to"])
        # NOTE(review): rows are restricted to `_is_current` records before the
        # date filter in the next cell — confirm this is intended if
        # analysis_date is ever set to a past date.
        .loc[
            lambda orgs: orgs['public_currently_operating'].eq(True)
            & orgs['_is_current'].eq(True)
        ]
        .reset_index(drop=True)
    )

In [6]:
# Restrict the organization records to those that filter_to_valid_dates keeps
# for analysis_date (presumably via _valid_from/_valid_to — confirm in
# gtfs_key_ntd_crosswalk).
valid_organization_full = filter_to_valid_dates(
    dim_orgs,
    [analysis_date],
)

In [7]:
# Inspect column dtypes and non-null counts of the date-filtered organizations.
valid_organization_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 218 entries, 0 to 217
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype              
---  ------                      --------------  -----              
 0   key                         218 non-null    object             
 1   name                        218 non-null    object             
 2   source_record_id            218 non-null    object             
 3   organization_type           218 non-null    object             
 4   ntd_id                      179 non-null    object             
 5   ntd_id_2022                 181 non-null    object             
 6   ntd_agency_info_key         159 non-null    object             
 7   public_currently_operating  218 non-null    object             
 8   _is_current                 218 non-null    bool               
 9   _valid_from                 218 non-null    datetime64[ns, UTC]
 10  _valid_to                   218 non-null    datetime64[ns, UTC]

In [9]:
# Pull the report-year 2023 rows for California from the NTD annual service
# agencies table.
query = """
        SELECT
            agency, ntd_id, reporter_type, report_year, primary_uza_name, unlinked_passenger_trips_upt, agency_voms
        FROM 
            cal-itp-data-infra.mart_ntd.dim_annual_service_agencies
        WHERE 
            state = 'CA' AND report_year = 2023
    """

with db_engine.connect() as connection:
    ridership_data = pd.read_sql(query, connection)

In [10]:
# Collapse the ridership rows to one row per (agency, ntd_id), summing
# unlinked passenger trips and agency VOMS, ordered by ntd_id.
ridership_group_keys = ["agency", "ntd_id"]
ridership_totals = {
    "unlinked_passenger_trips_upt": "sum",
    "agency_voms": "sum",
}

ridership_data_grouped = (
    ridership_data
    .groupby(ridership_group_keys)
    .agg(ridership_totals)
    .sort_values(by="ntd_id")
    .reset_index()
)

In [11]:
# Inspect column dtypes and non-null counts of the per-agency ridership totals.
ridership_data_grouped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209 entries, 0 to 208
Data columns (total 4 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   agency                        209 non-null    object 
 1   ntd_id                        209 non-null    object 
 2   unlinked_passenger_trips_upt  209 non-null    float64
 3   agency_voms                   209 non-null    float64
dtypes: float64(2), object(2)
memory usage: 6.7+ KB


In [12]:
# Write the per-agency ridership totals as parquet to the GCS bucket.
ridership_parquet_path = f"{GCS_FILE_PATH}/transit_provider_dashboard/ridership_data.parquet"
with fs.open(ridership_parquet_path, "wb") as parquet_file:
    ridership_data_grouped.to_parquet(parquet_file, index=False)

In [8]:
# Write the date-filtered organizations as parquet to the GCS bucket.
# The date suffix in the file name is derived from analysis_date
# (dashes -> underscores) so the stored snapshot always matches the date the
# data was filtered on, instead of a hard-coded value that can go stale.
organization_date_suffix = analysis_date.replace("-", "_")
organization_parquet_path = (
    f"{GCS_FILE_PATH}/transit_provider_dashboard/"
    f"organization_data_{organization_date_suffix}.parquet"
)
with fs.open(organization_parquet_path, "wb") as f:
    valid_organization_full.to_parquet(f, index=False)