In [1]:
pip install shared_utils

Note: you may need to restart the kernel to use updated packages.


In [2]:
import shared_utils
import pandas as pd
import geopandas as gpd
import re
import google.auth
import os
import gcsfs

from calitp_data_analysis import get_fs
from calitp_data_analysis import geography_utils, utils
from calitp_data_analysis.sql import get_engine

fs = get_fs()
db_engine = get_engine()

In [3]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [4]:
GCS_FILE_PATH  = 'gs://calitp-analytics-data/data-analyses'

In [5]:
# Querying NTD Ridership data 
with db_engine.connect() as connection:
    query = """
        SELECT
            agency_name, ntd_id, legacy_ntd_id, last_report_year, reporter_type, year, mode, primary_uza_name, upt
        FROM 
            cal-itp-data-infra.mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt
        WHERE 
            state = 'CA' AND year = 2023 AND agency_status = 'Active'
    """
    ridership_data= pd.read_sql(query, connection)

In [6]:
filtered_ridership_data = ridership_data[ridership_data['mode'].str.endswith('B')]

In [7]:
filtered_ridership_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 269 entries, 0 to 583
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   agency_name       269 non-null    object 
 1   ntd_id            269 non-null    object 
 2   legacy_ntd_id     221 non-null    object 
 3   last_report_year  269 non-null    int64  
 4   reporter_type     269 non-null    object 
 5   year              269 non-null    int64  
 6   mode              269 non-null    object 
 7   primary_uza_name  269 non-null    object 
 8   upt               208 non-null    float64
dtypes: float64(1), int64(2), object(6)
memory usage: 21.0+ KB


In [8]:
filtered_ridership_data.head(5)

Unnamed: 0,agency_name,ntd_id,legacy_ntd_id,last_report_year,reporter_type,year,mode,primary_uza_name,upt
0,City of Porterville,90198,9198,2023,Building Reporter,2023,MB,"Porterville, CA",
6,Altamont Corridor Express,90182,9182,2023,Full Reporter,2023,MB,"Stockton, CA",
9,City of Arcata,91018,9R02-108,2023,Rural Reporter,2023,MB,California Non-UZA,
11,City of Atascadero,90194,9194,2023,Reduced Reporter,2023,MB,"El Paso de Robles (Paso Robles)--Atascadero, CA",
14,City of Carson,90258,,2023,Reduced Reporter,2023,MB,"Los Angeles--Long Beach--Anaheim, CA",


In [9]:
ridership_data_grouped = filtered_ridership_data.groupby(
        [
            "agency_name",
            "ntd_id",
            'primary_uza_name',
            "reporter_type",
        ]
    ).agg({
        "upt":"sum"
    }).sort_values(by="ntd_id").reset_index()

In [10]:
ridership_data_grouped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 173 entries, 0 to 172
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   agency_name       173 non-null    object 
 1   ntd_id            173 non-null    object 
 2   primary_uza_name  173 non-null    object 
 3   reporter_type     173 non-null    object 
 4   upt               173 non-null    float64
dtypes: float64(1), object(4)
memory usage: 6.9+ KB


In [11]:
ridership_data_grouped.head(5)

Unnamed: 0,agency_name,ntd_id,primary_uza_name,reporter_type,upt
0,San Francisco Bay Area Rapid Transit District,90003,"San Francisco--Oakland, CA",Full Reporter,0.0
1,Golden Empire Transit District,90004,"Bakersfield, CA",Full Reporter,3130678.0
2,Santa Cruz Metropolitan Transit District,90006,"Santa Cruz, CA",Full Reporter,3275802.0
3,City of Santa Monica,90008,"Los Angeles--Long Beach--Anaheim, CA",Full Reporter,7741258.0
4,San Mateo County Transit District,90009,"San Francisco--Oakland, CA",Full Reporter,8568085.0


In [12]:
# Load the stored organization dataset from the specified GCS file path.
with fs.open(f"{GCS_FILE_PATH}/transit_provider_dashboard/agency_level_census_data.parquet.parquet", "rb") as f:
    agency_acs = pd.read_parquet(f)

In [13]:
agency_acs = agency_acs.dropna(subset=['ntd_id_2022'])

In [14]:
agency_acs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 77 entries, 1 to 82
Data columns (total 21 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   key                          77 non-null     object 
 1   name                         77 non-null     object 
 2   total_pop_adj                77 non-null     float64
 3   poverty_pop_adj              77 non-null     float64
 4   non_us_citizen_adj           77 non-null     float64
 5   workers_with_no_car_adj      77 non-null     float64
 6   households_with_no_cars_adj  77 non-null     float64
 7   disabled_pop_adj             77 non-null     float64
 8   public_asst_pop_adj          77 non-null     float64
 9   inc_extremelylow_adj         77 non-null     float64
 10  inc_verylow_adj              77 non-null     float64
 11  inc_low_adj                  77 non-null     float64
 12  male_seniors_adj             77 non-null     float64
 13  female_seniors_adj    

In [15]:
# Merge with an indicator column
merged_agency_ntd = (
    pd.merge(
        agency_acs,
        ridership_data_grouped,
        how='inner',
        left_on='ntd_id_2022',
        right_on='ntd_id',
        indicator=True
    )
    .sort_values(by='agency_name')
)


In [16]:
merged_agency_ntd.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 74 entries, 33 to 45
Data columns (total 27 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   key                          74 non-null     object  
 1   name                         74 non-null     object  
 2   total_pop_adj                74 non-null     float64 
 3   poverty_pop_adj              74 non-null     float64 
 4   non_us_citizen_adj           74 non-null     float64 
 5   workers_with_no_car_adj      74 non-null     float64 
 6   households_with_no_cars_adj  74 non-null     float64 
 7   disabled_pop_adj             74 non-null     float64 
 8   public_asst_pop_adj          74 non-null     float64 
 9   inc_extremelylow_adj         74 non-null     float64 
 10  inc_verylow_adj              74 non-null     float64 
 11  inc_low_adj                  74 non-null     float64 
 12  male_seniors_adj             74 non-null     float64 
 13  female

In [17]:
# Rows from agency_acs that didn't find a match
unmatched_agency_acs = agency_acs[
    ~agency_acs['ntd_id_2022'].isin(ridership_data_grouped['ntd_id'])
]


# Optionally view them
unmatched_agency_acs[['ntd_id_2022', 'name']]


Unnamed: 0,ntd_id_2022,name
21,90233,Yuma County Intergovernmental Public Transportation Authority
48,90151,Southern California Regional Rail Authority
50,90280,City of Lawndale


In [18]:
names_to_check = [
    'Yuma County Intergovernmental Public Transportation Authority',
    'Southern California Regional Rail Authority',
    'City of Lawndale'
]

# Exact matches
ridership_data_grouped[ridership_data_grouped['agency_name'].isin(names_to_check)]


Unnamed: 0,agency_name,ntd_id,primary_uza_name,reporter_type,upt


In [19]:
mask = ridership_data_grouped['agency_name'].str.contains(
    'Yuma|Southern California|Lawndale', case=False, na=False
)
ridership_data_grouped[mask][['agency_name', 'ntd_id']]

Unnamed: 0,agency_name,ntd_id


In [None]:
def export_gdf(gdf, filename: str):
    
    gdf.to_parquet(f"{filename}.parquet")
    
    fs.put(
        f"{filename}.parquet",
        f"{GCS_FILE_PATH}/transit_provider_dashboard/{filename}.parquet",
        token = credentials.token
    )
    
    os.remove(f"{filename}.parquet")
    print(f"saved {GCS_FILE_PATH}/transit_provider_dashboard/{filename}.parquet")
    
    return

In [None]:
# Store data in warehouse
export_gdf(agency_summary, "merged_agency_ntd")