In [1]:
pip install shared_utils

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
os.environ["CALITP_BQ_MAX_BYTES"] = str(800_000_000_000)

import shared_utils
import pandas as pd
import geopandas as gpd
from calitp_data_analysis.sql import get_engine
from shared_utils import gtfs_utils_v2
db_engine = get_engine()

import gcsfs
from calitp_data_analysis import get_fs
from calitp_data_analysis import geography_utils, utils
fs = get_fs()
import re
import google.auth
import os
credentials, project = google.auth.default()
import pytz
import datetime as dt

In [3]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [4]:
def read_parquet_from_gcs(filename):
    gcs_path = GCS_FILE_PATH.replace("gs://", "") + filename
    with fs.open(gcs_path, 'rb') as f:
        return gpd.read_parquet(f)

In [5]:
with db_engine.connect() as connection:
    query = """
        SELECT
            key, source_record_id, name, ntd_id_2022, ntd_agency_info_key, _is_current
        FROM 
            cal-itp-data-infra.mart_transit_database.dim_organizations
    """
    organizations= pd.read_sql(query, connection)

In [6]:
organizations = organizations[organizations['_is_current'] == True]
organizations = organizations.dropna(subset=["ntd_id_2022"])
organizations.shape

(245, 6)

In [7]:
has_duplicates = organizations['source_record_id'].duplicated().any()
print("Are there duplicates in 'source_record_id'? ->", has_duplicates)

Are there duplicates in 'source_record_id'? -> False


In [36]:
with db_engine.connect() as connection:
    query = """
        SELECT
            organization_source_record_id, schedule_gtfs_dataset_key, organization_key, schedule_gtfs_dataset_name, _valid_from, _valid_to
        FROM 
            cal-itp-data-infra.mart_transit_database.dim_provider_gtfs_data
    """
    dim_provider_gtfs_data= pd.read_sql(query, connection)

In [37]:
dim_provider_gtfs_data.shape

(29305, 6)

In [38]:
# dim_provider_gtfs_data['_valid_from'] = pd.to_datetime(dim_provider_gtfs_data['_valid_from']).dt.date
# dim_provider_gtfs_data['_valid_to'] = pd.to_datetime(dim_provider_gtfs_data['_valid_to']).dt.date

# # Step 2: Define your analysis window
# analysis_start = dt.date(2022, 11, 30)
# analysis_end   = dt.date(2022, 12, 4)

# # Step 3: Filter to only rows where the entire analysis period falls within valid range
# valid_gtfs_data = dim_provider_gtfs_data[
#     (dim_provider_gtfs_data['_valid_from'] <= analysis_start) &
#     (dim_provider_gtfs_data['_valid_to'] >= analysis_end)
# ]


In [42]:
dim_provider_gtfs_data.drop_duplicates(subset=['organization_source_record_id', 'schedule_gtfs_dataset_key'], inplace=True)

In [43]:
dim_provider_gtfs_data = dim_provider_gtfs_data.dropna(subset=['organization_source_record_id', 'schedule_gtfs_dataset_key'])

In [44]:
# Merge on organization_source_record_id and source_record_id
filtered_dimprovider_gtfs = dim_provider_gtfs_data.merge(
    organizations[['source_record_id', 'name', 'ntd_id_2022', 'ntd_agency_info_key']].drop_duplicates(), 
    left_on='organization_source_record_id', 
    right_on='source_record_id',
    how='inner'
)


In [45]:
filtered_dimprovider_gtfs.shape

(2867, 10)

In [26]:
filtered_dimprovider_gtfs.head(5)

Unnamed: 0,organization_source_record_id,schedule_gtfs_dataset_key,organization_key,schedule_gtfs_dataset_name,_valid_from,_valid_to,source_record_id,name,ntd_id_2022,ntd_agency_info_key
17,recOZgevYf7Jimm9L,032b7cc8e34084fecc58dfada5319251,ba7f7c075f7b127cb0e8ba7f487016c4,Bay Area 511 Dumbarton Express Schedule,2022-11-29,2022-12-01,recOZgevYf7Jimm9L,Alameda-Contra Costa Transit District,90014,rec0zt7fBmP2s3F3g
23,recOZgevYf7Jimm9L,00f473b71f24c8a1ca2a436345633ec6,4e5090566785ba92e39b427f3c7f53ec,Bay Area 511 Regional Schedule,2022-12-02,2022-12-05,recOZgevYf7Jimm9L,Alameda-Contra Costa Transit District,90014,rec0zt7fBmP2s3F3g
87,recpgYVeU3VePMeWx,a4aa135aad718bc59bf626828a37fbfb,b51eefed971b95670787b6d81b40d5ad,ACE Schedule,2022-11-29,2022-12-01,recpgYVeU3VePMeWx,San Joaquin Regional Rail Commission,90182,recSzmxZ9frQeLm2l
113,recpgYVeU3VePMeWx,24eda7b99f3e323e3498e1d8e44832d6,b51eefed971b95670787b6d81b40d5ad,Bay Area 511 ACE Schedule,2022-11-29,2022-12-01,recpgYVeU3VePMeWx,San Joaquin Regional Rail Commission,90182,recSzmxZ9frQeLm2l
133,recSBFiK95hJnJuYx,e1099a4938e0279f09d5bbc10a1702a4,f45204da25a4f2a031582a0d0bae0b18,Amador Schedule,2022-11-29,2022-12-01,recSBFiK95hJnJuYx,Amador Regional Transit System,91000,recyQ9Dp6JKnr3Lmr


In [46]:
with db_engine.connect() as connection:
    query = """
        SELECT
            gtfs_dataset_key, feed_key, service_date, trip_instance_key, trip_id, route_id, shape_id, direction_id, route_type, route_short_name
        FROM 
            cal-itp-data-infra.mart_gtfs.fct_scheduled_trips
        WHERE 
            service_date IN (DATE '2022-11-30', DATE '2022-12-03', DATE '2022-12-04')
        ORDER BY service_date DESC
    """
    gtfs_trips= pd.read_sql(query, connection)

In [47]:
# Merge on organization_source_record_id and source_record_id
trips_ntd_data = gtfs_trips.merge(
    filtered_dimprovider_gtfs[['schedule_gtfs_dataset_key', 'name', 'ntd_id_2022', 'ntd_agency_info_key']], 
    left_on='gtfs_dataset_key', 
    right_on='schedule_gtfs_dataset_key',
    how='inner'
)

In [48]:
trips_ntd_data.shape

(1829843, 14)

In [49]:
trips_ntd_data[['gtfs_dataset_key', 'ntd_id_2022']].nunique()

gtfs_dataset_key    178
ntd_id_2022         146
dtype: int64

In [50]:
gtfs_counts = trips_ntd_data.groupby('gtfs_dataset_key')['ntd_id_2022'].nunique()
ntd_counts = trips_ntd_data.groupby('ntd_id_2022')['gtfs_dataset_key'].nunique()

# Filter to get only 1:1 mappings
valid_gtfs_keys = gtfs_counts[gtfs_counts == 1].index
valid_ntd_ids = ntd_counts[ntd_counts == 1].index

# Now filter the original DataFrame
unique_pairs = trips_ntd_data[
    trips_ntd_data['gtfs_dataset_key'].isin(valid_gtfs_keys) &
    trips_ntd_data['ntd_id_2022'].isin(valid_ntd_ids)
]

# Drop duplicates just in case
unique_pairs = unique_pairs[['gtfs_dataset_key', 'ntd_id_2022']].drop_duplicates()


In [51]:
unique_pairs.shape

(98, 2)

In [19]:
def trips_data_summary(day_type, analysis_dt, trips_df):
    if trips_df.empty:
        return pd.DataFrame()

    # Get feed_keys from the provided trips_df
    feed_keys = trips_df["feed_key"].unique().tolist()

    trips_df = trips_df.drop_duplicates()

    # Add derived column
    trips_df["route_name_id"] = trips_df["route_short_name"].fillna("") + "_" + trips_df["route_id"]

    # Get stop_times
    stoptimes_df = gtfs_utils_v2.get_stop_times(
        selected_date=analysis_dt,
        operator_feeds=feed_keys,
        stop_time_cols=["trip_id", "feed_key", "stop_id", "arrival_sec", "departure_sec"],
        trip_df=trips_df,
        get_df=True
    )

    if stoptimes_df.empty:
        return pd.DataFrame()

    # Merge with trips
    stoptimes_df = stoptimes_df.merge(trips_df, on=["trip_id", "feed_key"])

    # Summarize stop-level trips
    grouped = stoptimes_df.groupby(["feed_key", "route_type", "stop_id"])
    summary = grouped.agg(
        n_trips=("trip_instance_key", pd.Series.nunique),
        n_routes=("route_id", pd.Series.nunique),
        route_list=("route_name_id", lambda x: [x.tolist()])
    ).reset_index()

    summary["daytype"] = day_type
    summary["analysis_date"] = analysis_dt
    summary["route_list_string"] = summary["route_list"].apply(lambda x: ",".join(x[0]))

    # Add gtfs_dataset_key and service_date from trips_df to summary
    feed_info = trips_df[["feed_key", "ntd_id_2022", "ntd_agency_info_key", "gtfs_dataset_key", "service_date"]].drop_duplicates()
    summary = summary.merge(feed_info, on="feed_key", how="left")

    # Get stop metadata
    stops_geo = gtfs_utils_v2.get_stops(
        selected_date=analysis_dt,
        operator_feeds=feed_keys,
        stop_cols=["feed_key", "stop_id", "stop_desc", "stop_name"],
        get_df=True
    )

    # Join with summarized data
    stoptimes_geo = stops_geo.merge(summary, on=["feed_key", "stop_id"])

    return stoptimes_geo


In [20]:
def stops_geo_acs_summary(stoptimes_geo, acs_ca, jobdata):
    if stoptimes_geo.empty:
        return pd.DataFrame()

    # Merge ACS with job data
    acs_ca = acs_ca.merge(jobdata, on='GEOID', how='left')
    acs_ca = acs_ca.to_crs(epsg=3347)

    # Spatial buffer (0.25 miles)
    stoptimes_geo = stoptimes_geo.to_crs(epsg=3347)
    stoptimes_geo["geometry"] = stoptimes_geo.buffer(402.336)

    # Spatial join with ACS
    stops_acs_joined = stoptimes_geo.sjoin(acs_ca, how="left", predicate="intersects")

    # Aggregate
    group_cols = [
        "gtfs_dataset_key", "ntd_id_2022", "ntd_agency_info_key", "stop_id", "stop_name", "n_trips", "n_routes",
        "daytype", "analysis_date", "route_list_string", "service_date"
    ]

    acs_summary = stops_acs_joined.groupby(group_cols).agg(
        sum_tracts=("GEOID", pd.Series.nunique),
        sum_total_pop=("total_pop", "sum"),
        sum_households=("households", "sum"),
        sum_not_us_citizen_pop=("not_us_citizen_pop", "sum"),
        sum_youth_pop=("youth_pop", "sum"),
        sum_seniors_pop=("seniors_pop", "sum"),
        sum_pop_determined_poverty_status=("pop_determined_poverty_status", "sum"),
        sum_poverty=("poverty", "sum"),
        sum_no_car=("workers_with_no_car", "sum"),
        sum_no_cars=("households_with_no_cars", "sum"),
        sum_land_area=("ALAND", "sum"),
        sum_jobs=("jobs_tot", "sum")
    ).reset_index()

    # Derived metrics
    acs_summary["land_area_sqkm"] = acs_summary["sum_land_area"] / 1_000_000
    acs_summary["pop_density"] = acs_summary["sum_total_pop"] / acs_summary["land_area_sqkm"]
    acs_summary["job_density"] = acs_summary["sum_jobs"] / acs_summary["land_area_sqkm"]
    acs_summary["pct_not_us_citizen_pop"] = (acs_summary["sum_not_us_citizen_pop"] / acs_summary["sum_total_pop"]) * 100
    acs_summary["pct_youth_pop"] = (acs_summary["sum_youth_pop"] / acs_summary["sum_total_pop"]) * 100
    acs_summary["pct_seniors_pop"] = (acs_summary["sum_seniors_pop"] / acs_summary["sum_total_pop"]) * 100
    acs_summary["pct_poverty"] = (acs_summary["sum_poverty"] / acs_summary["sum_pop_determined_poverty_status"]) * 100
    acs_summary["pct_pop_workers_no_car"] = (acs_summary["sum_no_car"] / acs_summary["sum_total_pop"]) * 100
    acs_summary["pct_hh_no_cars"] = (acs_summary["sum_no_cars"] / acs_summary["sum_households"]) * 100

    return acs_summary


In [21]:
# set date parameters
analysis_wkd = dt.date(2022,11,30)
analysis_sat = dt.date(2022,12,3)
analysis_sun = dt.date(2022,12,4)

In [22]:
GCS_FILE_PATH = 'gs://calitp-analytics-data/data-analyses/ahsc_grant/'
acs_ca = read_parquet_from_gcs("tracts_ca_acs.parquet")
jobdata = pd.read_parquet(f"{GCS_FILE_PATH}/job_density_2022.parquet")[['GEOID', 'jobs_tot']]

In [23]:
# List of (day_type, analysis_date) pairs
day_type_dates = [
    ("Weekday", analysis_wkd),
    ("Saturday", analysis_sat),
    ("Sunday", analysis_sun)
]

# List to collect each day's ACS summary
all_summaries = []

for day_type, analysis_dt in day_type_dates:
    print(f"Processing: {day_type} on {analysis_dt}")

    # Filter the trips for the current analysis date
    trips_df_filtered = trips_ntd_data[trips_ntd_data["service_date"] == analysis_dt]

    # Run trip summary
    stoptimes_geo = trips_data_summary(day_type, analysis_dt, trips_df_filtered)

    if stoptimes_geo.empty:
        print(f"No trip data for {day_type} on {analysis_dt}")
        continue

    # Run ACS summary
    acs_summary = stops_geo_acs_summary(stoptimes_geo, acs_ca, jobdata)

    if acs_summary.empty:
        print(f"No ACS data matched for {day_type} on {analysis_dt}")
        continue

    # Collect result
    all_summaries.append(acs_summary)

# Concatenate all into a single DataFrame
final_summary_df = pd.concat(all_summaries, ignore_index=True)



Processing: Weekday on 2022-11-30


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trips_df["route_name_id"] = trips_df["route_short_name"].fillna("") + "_" + trips_df["route_id"]
  sqlalchemy.util.warn(
  sqlalchemy.util.warn(
  sqlalchemy.util.warn(
  sqlalchemy.util.warn(


KeyboardInterrupt: 

In [None]:
final_summary_df.head(5)

In [None]:
final_summary_df.shape

In [None]:
final_summary_df['gtfs_dataset_key'].nunique()