# Agency-Grain Census Data Summary Table
- **Purpose:** To define and quantify the service population of each Cal-ITP partner transit agency using census and related demographic data.
- **Goal:**    Provide agency-level summaries that describe the characteristics of populations served, such as size, demographics, income, and travel behavior, to illustrate the reach and impact of Cal-ITP services.
- **Use:**     Support data-driven storytelling and performance reporting by supplying key statistics for communications about the benefits, adoption, and equity potential of Cal-ITP initiatives (e.g., open-loop payment systems).

- **Steps:**
  - Querying ACS data via the Census API and uploading the results to a GCS bucket for later use.
  - Census Tract Geometry Processing
  - Querying Organization Data from the Data Warehouse and Storing in GCS
  - Querying Bridge Organization GTFS Datasets and Merging with Dim Organizations Table
  - Loading Transit Stop Data and Merging Stop Data with Organization Information
  - Spatial Analysis: Stop Buffers and Census Tract Intersections
  - Adjusting Population and Demographic Metrics for Stop Service Areas


In [1]:
pip install shared_utils

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install pygris

Note: you may need to restart the kernel to use updated packages.


In [3]:
import sys
sys.path.append('../ahsc_grant')

In [4]:
# Importing necessary package 
import pandas as pd 
import geopandas as gpd
import google.auth
import os
import gcsfs
import requests
from pygris import tracts 
from calitp_data_analysis.sql import get_engine
from shared_utils import schedule_rt_utils 
from gtfs_key_ntd_crosswalk import filter_to_valid_dates
# Warehouse engine used by the SQL cells in this notebook.
db_engine = get_engine()
# Application-default Google credentials; credentials.token is passed to geopandas in prep_stops.
credentials, project = google.auth.default()
# GCS filesystem handle used for the parquet reads/writes under GCS_FILE_PATH.
fs = gcsfs.GCSFileSystem()

# Show every column when previewing the wide census and overlay tables.
pd.set_option('display.max_columns', None)

In [5]:
# Bucket prefix under which every dataset read by this notebook is stored.
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses"
# Service date of the stop data; a weekday is selected to account for most agencies.
analysis_date = '2025-08-20'

## Querying ACS data via the Census API and uploading the results to a GCS bucket for later use

Uncomment and run the cells below as needed to include additional ACS variables.

In [6]:
# with open ("ACS_apikey", "r") as file:
#     api_key = file.read().strip()

In [7]:
# # Tract-level metrics required (the request below uses for=tract:*): "Total Population", "Total Veteran Population", "Total Senior Population", "Total Low Income Population"
# variables = [
#     "B01003_001E",                                                                            # Total Population
#     "B17001_002E",                                                                            # Population with Income in the past 12 months below poverty level
#     "B16008_037E",                                                                            # Non US Citizen Population
#     "B01001_020E", "B01001_021E", "B01001_022E", "B01001_023E", "B01001_024E", "B01001_025E", # Male senior population : 65 and above
#     "B01001_044E", "B01001_045E", "B01001_046E", "B01001_047E", "B01001_048E", "B01001_049E", # Female senior population : 65 and above
#     "B06010_004E", "B06010_005E", "B06010_006E",                                              # Population with extremely low income
#     "B06010_007E", "B06010_008E",                                                             # Population with very low income
#     "B06010_009E", "B06010_010E",                                                             # Population with low income 
#     "B08014_002E", "B08201_002E",                                                             # Workers and Households with no cars
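#     "B18101_001E",                                                                            # NOTE(review): appears to be the B18101 universe total, not the population with a disability (stored disabled_pop matches total_pop) - confirm and use the with-a-disability cells instead
#     "B19058_001E",                                                                            # NOTE(review): _001E is presumably the household universe total for B19058; households with public assistance income or Food Stamps/SNAP would be B19058_002E - confirm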
#     "B21001_002E"                                                                             # Population with veteran status: 18 and above
# ]

             

In [53]:
# variable_str = "NAME," + ",".join(variables)
# url = f"https://api.census.gov/data/2023/acs/acs5?get={variable_str}&for=tract:*&in=state:06&key={api_key}"
# response = requests.get(url)

# if response.status_code == 200:
#     data = response.json()
#     census_data = pd.DataFrame(data[1:], columns=data[0])
    
#     # Create GEOID column
#     census_data["GEOID"] = census_data["state"] + census_data["county"] + census_data["tract"]

# census_data['county_name'] = census_data['NAME'].str.extract(r';\s*([A-Za-z\s]+) County;')
# census_data = census_data.drop(columns=['NAME'])


In [14]:
# census_data = census_data.rename(columns = {
#     'B01003_001E': 'total_pop',
#     'B17001_002E': 'poverty_pop',
#     'B16008_037E': 'non_us_citizen',
#     'B01001_020E': 'male_65_to_66', 'B01001_021E': 'male_67_to_69', 'B01001_022E': 'male_70_to_74', 
#     'B01001_023E': 'male_75_to_79', 'B01001_024E': 'male_80_to_84', 'B01001_025E': 'male_85_and_over',
#     'B01001_044E': 'female_65_to_66', 'B01001_045E': 'female_67_to_69', 'B01001_046E': 'female_70_to_74', 
#     'B01001_047E': 'female_75_to_79', 'B01001_048E': 'female_80_to_84', 'B01001_049E': 'female_85_and_over',
#     'B06010_004E': 'income_less_10000', 'B06010_005E': 'income_10000_14999', 'B06010_006E': 'income_15000_24999', 
#     'B06010_007E': 'income_25000_34999', 'B06010_008E': 'income_35000_49999',
#     'B06010_009E': 'income_50000_64999', 'B06010_010E': 'income_65000_74999',
#     'B08014_002E': 'workers_with_no_car', 'B08201_002E': 'households_with_no_cars',
#     'B18101_001E': 'disabled_pop',
#     'B19058_001E': 'public_asst_pop',
#     'B21001_002E': 'veteran_pop'
# })

In [15]:
# exclude = ['state', 'county', 'tract', 'county_name', 'GEOID']
# cols_to_numeric = [col for col in census_data.columns if col not in exclude]
# census_data[cols_to_numeric] = census_data[cols_to_numeric].apply(pd.to_numeric, errors='coerce')

In [16]:
# # Store data in warehouse
# with fs.open(f"{GCS_FILE_PATH}/transit_provider_dashboard/census_data_2023.parquet", "wb") as f:
#     census_data.to_parquet(f, index=False)

In [17]:
# Read the previously stored ACS extract back from GCS.
with fs.open(
    f"{GCS_FILE_PATH}/transit_provider_dashboard/census_data_2023.parquet",
    "rb",
) as census_file:
    census_data = pd.read_parquet(census_file)

In [18]:
# Preview the stored ACS table to confirm the renamed columns loaded as expected.
census_data.head(5)

Unnamed: 0,total_pop,poverty_pop,non_us_citizen,male_65_to_66,male_67_to_69,male_70_to_74,male_75_to_79,male_80_to_84,male_85_and_over,female_65_to_66,female_67_to_69,female_70_to_74,female_75_to_79,female_80_to_84,female_85_and_over,income_less_10000,income_10000_14999,income_15000_24999,income_25000_34999,income_35000_49999,income_50000_64999,income_65000_74999,workers_with_no_car,households_with_no_cars,disabled_pop,public_asst_pop,veteran_pop,state,county,tract,GEOID,county_name
0,3094,134,264,47,84,119,49,46,78,52,70,72,85,105,107,188,75,134,157,87,129,70,28,85,3094,1316,129,6,1,400100,6001400100,Alameda
1,2093,164,96,18,60,59,58,28,26,40,35,67,96,34,13,75,70,89,12,207,77,32,92,95,2093,861,38,6,1,400200,6001400200,Alameda
2,5727,310,306,23,47,113,100,24,25,108,62,194,158,13,142,383,201,300,251,400,148,291,157,416,5727,2713,80,6,1,400300,6001400300,Alameda
3,4395,343,185,31,70,89,19,26,36,55,105,104,43,23,30,187,105,287,215,207,178,87,134,204,4376,1803,88,6,1,400400,6001400400,Alameda
4,3822,397,231,41,32,56,41,4,0,19,47,51,50,60,203,256,91,244,213,385,387,244,74,169,3822,1655,115,6,1,400500,6001400500,Alameda


In [19]:
# Collapse the ACS income brackets into three broader groups.
# Extremely low income: under $25,000.
census_data['inc_extremelylow'] = (
    census_data['income_less_10000']
    .add(census_data['income_10000_14999'])
    .add(census_data['income_15000_24999'])
)
# Very low income: $25,000 to $49,999.
census_data['inc_verylow'] = census_data['income_25000_34999'].add(
    census_data['income_35000_49999']
)
# Low income: $50,000 to $74,999.
census_data['inc_low'] = census_data['income_50000_64999'].add(
    census_data['income_65000_74999']
)

In [20]:
# Sum all senior age brackets (65+) to calculate total male and female senior populations.
# The bracket columns are named explicitly rather than label-sliced ("a":"b"), so the
# totals do not depend on the column order of the stored parquet and cannot silently
# pick up an unrelated column that ends up between the two endpoints.
senior_age_brackets = ['65_to_66', '67_to_69', '70_to_74', '75_to_79', '80_to_84', '85_and_over']
for sex in ('male', 'female'):
    bracket_cols = [f'{sex}_{bracket}' for bracket in senior_age_brackets]
    census_data[f'{sex}_seniors'] = census_data[bracket_cols].sum(axis=1)

## Census Tract Geometry Processing

In [21]:
# Download the California census tract boundaries through pygris.
ca_tracts = tracts(
    state="CA",
    year=2023,
    cb=True,     # cartographic-boundary (generalized) file
    cache=True,  # reuse a previously downloaded copy when available
)

Using FIPS code '06' for input 'CA'


In [22]:
# Attach the ACS metrics to the tract polygons by GEOID; the inner join keeps
# only tracts present in both tables.
tracts_ca_acs = ca_tracts.merge(census_data, on="GEOID", how="inner")

In [23]:
# Project the tract polygons to EPSG:3310 (California Albers) so that areas and
# buffer distances are expressed in meters.
tracts_ca_acs = tracts_ca_acs.to_crs(epsg=3310)

In [24]:
# Full area of every tract in square meters; this is the denominator of the
# area-share weights computed after the overlay.
tracts_ca_acs["area_m2"] = tracts_ca_acs.area

## Querying Organization Data from the Data Warehouse and Storing in GCS

Uncomment and run the cells below as needed to include additional columns from the dim_organizations table.

In [25]:
# # Querying dim organization
# with db_engine.connect() as connection:
#     query = """
#         SELECT
#             key, name, organization_type, ntd_id, ntd_agency_info_key, 
#             public_currently_operating, _is_current, _valid_from, _valid_to
#         FROM 
#             cal-itp-data-infra.mart_transit_database.dim_organizations
#     """
    
#     #localize timestamps
#     dim_orgs = (
#         pd.read_sql(query, connection)
#         .pipe(schedule_rt_utils.localize_timestamp_col, ["_valid_from", "_valid_to"])
#     )
    
    
#     dim_orgs = dim_orgs[
#         (dim_orgs['public_currently_operating'] == True) & 
#         (dim_orgs['_is_current'] == True)
#     ].reset_index(drop=True)


In [26]:
# # Filtering the provider gtfs data to valid dates 
# valid_organization_full = filter_to_valid_dates(dim_orgs, [analysis_date])

In [27]:
# #Store data in warehouse
# with fs.open(f"{GCS_FILE_PATH}/transit_provider_dashboard/organization_data_2025_08_20.parquet", "wb") as f:
#     valid_organization_full.to_parquet(f, index=False)

In [28]:
# Read the previously stored organization snapshot back from GCS.
with fs.open(
    f"{GCS_FILE_PATH}/transit_provider_dashboard/organization_data_2025_08_20.parquet",
    "rb",
) as organization_file:
    valid_organization_full = pd.read_parquet(organization_file)

In [29]:
# Preview the stored organization table (keys, names, NTD ids and validity windows).
valid_organization_full.head(5)

Unnamed: 0,key,name,organization_type,ntd_id,ntd_agency_info_key,public_currently_operating,_is_current,_valid_from,_valid_to,_valid_from_local,_valid_to_local
0,9b5971d16d58e4fcafa694ee7fa33b12,Alpine County,County,9R02-91116,rec02Is8jSIBDkwM0,True,True,2025-03-06 00:00:00+00:00,2098-12-31 23:59:59.999999+00:00,2025-03-05 16:00:00,2098-12-31 15:59:59.999999
1,73ed19bf64f9ba305091973b3f45d553,Camarillo Health Care District,Independent Agency,,,True,True,2025-03-06 00:00:00+00:00,2098-12-31 23:59:59.999999+00:00,2025-03-05 16:00:00,2098-12-31 15:59:59.999999
2,402b2852ff46b95557801fbf3038ae7c,Chemehuevi Indian Tribe,Tribe,99316,reclUB9NcCQrSImfd,True,True,2025-03-06 00:00:00+00:00,2098-12-31 23:59:59.999999+00:00,2025-03-05 16:00:00,2098-12-31 15:59:59.999999
3,3a93c944381ee6c34646fa2dbf8b3d8f,City of Atascadero,City/Town,90194,recMmQSjQCzABlmh1,True,True,2025-03-06 00:00:00+00:00,2098-12-31 23:59:59.999999+00:00,2025-03-05 16:00:00,2098-12-31 15:59:59.999999
4,e56f748b8cf235ca2acee940b9f60d64,City of Azusa,City/Town,90250,recbLanAuzm5QituE,True,True,2025-03-06 00:00:00+00:00,2098-12-31 23:59:59.999999+00:00,2025-03-05 16:00:00,2098-12-31 15:59:59.999999


## Querying Bridge Organization GTFS Datasets and Merging with Dim Organizations Table

In [30]:
# Querying bridge organizations and gtfs_datasets
# Pulls the organization -> GTFS dataset pairs so each organization can later be
# linked to the dataset key(s) used in the stop data.
with db_engine.connect() as connection:
    query = """
        SELECT
            organization_key, gtfs_dataset_key, organization_name
        FROM
            cal-itp-data-infra.mart_transit_database.bridge_organizations_x_gtfs_datasets_produced
    """
    dim_orgs_GTFS= pd.read_sql(query, connection)

In [None]:
# Left-join the validated organizations to the organization/GTFS-dataset bridge,
# matching on both key and name. Rows missing either join field are removed from
# each side first; every remaining organization row survives this step even when
# it has no dataset in the bridge.
dim_orgs_merged = (
    valid_organization_full
    .dropna(subset=['key', 'name'])
    .merge(
        dim_orgs_GTFS.dropna(subset=['organization_key', 'organization_name']),
        how='left',
        left_on=['key', 'name'],
        right_on=['organization_key', 'organization_name'],
    )
)

In [33]:
# Keep only organizations that matched a GTFS dataset: both 'organization_key'
# and 'gtfs_dataset_key' must be present.
has_dataset = dim_orgs_merged[['organization_key', 'gtfs_dataset_key']].notna().all(axis=1)
dim_orgs_merged = dim_orgs_merged[has_dataset]

In [34]:
# Columns carried forward into the stop-level analysis.
org_columns = [
    'key',
    'name',
    'organization_type',
    'gtfs_dataset_key',
    'ntd_id',
    'ntd_agency_info_key',
]
dim_orgs_final = dim_orgs_merged.loc[:, org_columns]

In [35]:
# Preview the organization/dataset pairs; an organization appears once per GTFS dataset.
dim_orgs_final.head(5)

Unnamed: 0,key,name,organization_type,gtfs_dataset_key,ntd_id,ntd_agency_info_key
16,306bafde22fe614e0a6af2269625d8f6,City of Menlo Park,City/Town,b76861f44c68f440d922c54ac1231d31,,
32,1906a01d5cb664c5e898a95276912bfe,Town of Truckee,City/Town,6fda78099793184fe08dd78945d188c0,9R02-91101,receHP6eQInAo7sSP
33,1906a01d5cb664c5e898a95276912bfe,Town of Truckee,City/Town,683da99e57acc29ac600a24cbd96feda,9R02-91101,receHP6eQInAo7sSP
34,aad5befa7fcfce979f2113e373e48aa6,Yosemite National Park,Federal Government,31f91d59f493cbee9ae0eeb824f44d0e,,
35,aad5befa7fcfce979f2113e373e48aa6,Yosemite National Park,Federal Government,31152914d10e2d0977b8b2fabb167922,,


In [36]:
# Check the row count and how many rows lack an ntd_id or ntd_agency_info_key.
dim_orgs_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 441 entries, 16 to 522
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   key                  441 non-null    object
 1   name                 441 non-null    object
 2   organization_type    441 non-null    object
 3   gtfs_dataset_key     441 non-null    object
 4   ntd_id               417 non-null    object
 5   ntd_agency_info_key  370 non-null    object
dtypes: object(6)
memory usage: 24.1+ KB


## Loading Transit Stop Data and Merging Stop Data with Organization Information

In [38]:
def prep_stops(analysis_date: str):
    """Load the stop table for one analysis date from GCS.

    Args:
        analysis_date: Date string embedded in the parquet file name
            (``stop_times_direction_{analysis_date}.parquet``).

    Returns:
        The GeoDataFrame produced by ``gpd.read_parquet``, limited to the
        dataset key, feed key, stop id, stop name and geometry columns.
    """
    stop_columns = ["schedule_gtfs_dataset_key", "feed_key", "stop_id", "stop_name", "geometry"]
    parquet_path = f"{GCS_FILE_PATH}/rt_vs_schedule/stop_times_direction_{analysis_date}.parquet"

    return gpd.read_parquet(
        parquet_path,
        columns=stop_columns,
        storage_options={'token': credentials.token},
    )

In [39]:
# Load the stops for the analysis date, then check the row count and column types.
stops = prep_stops(analysis_date)
stops.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 4595211 entries, 0 to 4595210
Data columns (total 5 columns):
 #   Column                     Dtype   
---  ------                     -----   
 0   schedule_gtfs_dataset_key  object  
 1   feed_key                   object  
 2   stop_id                    object  
 3   stop_name                  object  
 4   geometry                   geometry
dtypes: geometry(1), object(4)
memory usage: 175.3+ MB


In [40]:
# Preview the raw stop rows before they are joined to organizations.
stops.head(5)

Unnamed: 0,schedule_gtfs_dataset_key,feed_key,stop_id,stop_name,geometry
0,723210f3a6d61ee3936df401e18a5636,15b542ef6dbfd2903710095179e84b25,TL-3,Terminal 1,POINT (147834.197 -450957.957)
1,723210f3a6d61ee3936df401e18a5636,15b542ef6dbfd2903710095179e84b25,TL-4,Terminal 2,POINT (147598.785 -450990.106)
2,723210f3a6d61ee3936df401e18a5636,15b542ef6dbfd2903710095179e84b25,TL-5,Terminal 3,POINT (147265.199 -451037.318)
3,723210f3a6d61ee3936df401e18a5636,15b542ef6dbfd2903710095179e84b25,TL-6,International Terminal,POINT (147144.316 -451145.363)
4,723210f3a6d61ee3936df401e18a5636,15b542ef6dbfd2903710095179e84b25,TL-7,Terminal 4,POINT (147272.606 -451317.665)


In [41]:
# Attach organization attributes to each stop through the GTFS dataset key.
# Stops missing an id or a name are discarded first, and the inner join keeps
# only stops whose dataset belongs to an organization in dim_orgs_final.
orgs_stops = (
    stops
    .dropna(subset=['stop_id', 'stop_name'])
    .merge(
        dim_orgs_final,
        how='inner',
        left_on='schedule_gtfs_dataset_key',
        right_on='gtfs_dataset_key',
    )
)

In [42]:
# The source file is named stop_times_direction, so the same stop presumably repeats
# many times (TODO confirm grain); collapse to one row per distinct stop/organization
# combination before buffering.
orgs_stops = orgs_stops.drop_duplicates()

In [43]:
# Check the row count and null counts after de-duplication.
orgs_stops.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 53511 entries, 0 to 2788870
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype   
---  ------                     --------------  -----   
 0   schedule_gtfs_dataset_key  53511 non-null  object  
 1   feed_key                   53511 non-null  object  
 2   stop_id                    53511 non-null  object  
 3   stop_name                  53511 non-null  object  
 4   geometry                   53511 non-null  geometry
 5   key                        53511 non-null  object  
 6   name                       53511 non-null  object  
 7   organization_type          53511 non-null  object  
 8   gtfs_dataset_key           53511 non-null  object  
 9   ntd_id                     52343 non-null  object  
 10  ntd_agency_info_key        48936 non-null  object  
dtypes: geometry(1), object(10)
memory usage: 4.9+ MB


## Spatial Analysis: Stop Buffers and Census Tract Intersections

In [44]:
# Reproject stops to match the CRS of California census tracts.
# Both layers need the same projected CRS before buffering by a distance and overlaying.
orgs_stops = orgs_stops.to_crs(tracts_ca_acs.crs)

In [45]:
# Replace each stop point with a 500-unit (meters in the projected CRS) circular
# buffer, on a copy so that orgs_stops keeps its point geometry.
orgs_stop_buffered = orgs_stops.set_geometry(orgs_stops.geometry.buffer(500))

In [46]:
# Compute the intersection between buffered stops and census tracts.
# Each output row is the piece of one stop buffer lying inside one tract and carries the
# attributes of both inputs; keep_geom_type=True keeps only results of the buffers' geometry type.
geometry_intersect = gpd.overlay(orgs_stop_buffered, tracts_ca_acs, how = 'intersection', keep_geom_type=True)

In [47]:
# Area of every buffer/tract piece in square meters; this is the numerator of
# the area-share weights.
geometry_intersect['area_2'] = geometry_intersect.area

In [48]:
# Preview the overlay result: stop and organization columns, tract and ACS columns, and both areas.
geometry_intersect.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,feed_key,stop_id,stop_name,key,name,organization_type,gtfs_dataset_key,ntd_id,ntd_agency_info_key,STATEFP,COUNTYFP,TRACTCE,GEOIDFQ,GEOID,NAME,NAMELSAD,STUSPS,NAMELSADCO,STATE_NAME,LSAD,ALAND,AWATER,total_pop,poverty_pop,non_us_citizen,male_65_to_66,male_67_to_69,male_70_to_74,male_75_to_79,male_80_to_84,male_85_and_over,female_65_to_66,female_67_to_69,female_70_to_74,female_75_to_79,female_80_to_84,female_85_and_over,income_less_10000,income_10000_14999,income_15000_24999,income_25000_34999,income_35000_49999,income_50000_64999,income_65000_74999,workers_with_no_car,households_with_no_cars,disabled_pop,public_asst_pop,veteran_pop,state,county,tract,county_name,inc_extremelylow,inc_verylow,inc_low,male_seniors,female_seniors,area_m2,geometry,area_2
0,cc53a0dbf5df90e3009b9cb5d89d80ba,49f469dcf8712b562e3c970aa1b89731,5961046,Echo Park Ave & Donaldson St (Southbound),123beaa13b8cfbd650a48cdfd4647088,City of Los Angeles,City/Town,cc53a0dbf5df90e3009b9cb5d89d80ba,90147,reccTizvO7pe1k1CS,6,37,197410,1400000US06037197410,6037197410,1974.1,Census Tract 1974.10,CA,Los Angeles County,California,CT,1393312,0,3805,239,429,25,25,53,75,8,32,0,49,71,77,35,27,288,138,288,226,426,402,151,29,51,3805,1701,95,6,37,197410,Los Angeles,714,652,553,218,259,1406431.0,"POLYGON ((162046.722 -434622.218, 162034.799 -...",725281.427873
1,cc53a0dbf5df90e3009b9cb5d89d80ba,49f469dcf8712b562e3c970aa1b89731,5797231,Echo Park Ave & Baxter St (Southbound) (4052),123beaa13b8cfbd650a48cdfd4647088,City of Los Angeles,City/Town,cc53a0dbf5df90e3009b9cb5d89d80ba,90147,reccTizvO7pe1k1CS,6,37,197410,1400000US06037197410,6037197410,1974.1,Census Tract 1974.10,CA,Los Angeles County,California,CT,1393312,0,3805,239,429,25,25,53,75,8,32,0,49,71,77,35,27,288,138,288,226,426,402,151,29,51,3805,1701,95,6,37,197410,Los Angeles,714,652,553,218,259,1406431.0,"POLYGON ((161975.618 -434794.073, 161968.418 -...",590112.226428


## Adjusting Population and Demographic Metrics for Stop Service Areas

In [49]:
# Scale the tract population by the share of the tract's area covered by the
# stop buffer (population is treated as evenly spread over the tract).
tract_share_in_buffer = geometry_intersect['area_2'].div(geometry_intersect['area_m2'])
geometry_intersect['adjusted_total_pop'] = geometry_intersect['total_pop'].mul(tract_share_in_buffer)

In [50]:
# Preview the overlay rows with the new adjusted_total_pop column.
geometry_intersect.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,feed_key,stop_id,stop_name,key,name,organization_type,gtfs_dataset_key,ntd_id,ntd_agency_info_key,STATEFP,COUNTYFP,TRACTCE,GEOIDFQ,GEOID,NAME,NAMELSAD,STUSPS,NAMELSADCO,STATE_NAME,LSAD,ALAND,AWATER,total_pop,poverty_pop,non_us_citizen,male_65_to_66,male_67_to_69,male_70_to_74,male_75_to_79,male_80_to_84,male_85_and_over,female_65_to_66,female_67_to_69,female_70_to_74,female_75_to_79,female_80_to_84,female_85_and_over,income_less_10000,income_10000_14999,income_15000_24999,income_25000_34999,income_35000_49999,income_50000_64999,income_65000_74999,workers_with_no_car,households_with_no_cars,disabled_pop,public_asst_pop,veteran_pop,state,county,tract,county_name,inc_extremelylow,inc_verylow,inc_low,male_seniors,female_seniors,area_m2,geometry,area_2,adjusted_total_pop
0,cc53a0dbf5df90e3009b9cb5d89d80ba,49f469dcf8712b562e3c970aa1b89731,5961046,Echo Park Ave & Donaldson St (Southbound),123beaa13b8cfbd650a48cdfd4647088,City of Los Angeles,City/Town,cc53a0dbf5df90e3009b9cb5d89d80ba,90147,reccTizvO7pe1k1CS,6,37,197410,1400000US06037197410,6037197410,1974.1,Census Tract 1974.10,CA,Los Angeles County,California,CT,1393312,0,3805,239,429,25,25,53,75,8,32,0,49,71,77,35,27,288,138,288,226,426,402,151,29,51,3805,1701,95,6,37,197410,Los Angeles,714,652,553,218,259,1406431.0,"POLYGON ((162046.722 -434622.218, 162034.799 -...",725281.427873,1962.197505
1,cc53a0dbf5df90e3009b9cb5d89d80ba,49f469dcf8712b562e3c970aa1b89731,5797231,Echo Park Ave & Baxter St (Southbound) (4052),123beaa13b8cfbd650a48cdfd4647088,City of Los Angeles,City/Town,cc53a0dbf5df90e3009b9cb5d89d80ba,90147,reccTizvO7pe1k1CS,6,37,197410,1400000US06037197410,6037197410,1974.1,Census Tract 1974.10,CA,Los Angeles County,California,CT,1393312,0,3805,239,429,25,25,53,75,8,32,0,49,71,77,35,27,288,138,288,226,426,402,151,29,51,3805,1701,95,6,37,197410,Los Angeles,714,652,553,218,259,1406431.0,"POLYGON ((161975.618 -434794.073, 161968.418 -...",590112.226428,1596.506809


In [51]:
# Weight applied to every tract-level count: the share of the tract's area that
# falls inside the stop buffer (counts are assumed to be spread evenly over the
# tract). It is computed straight from the two areas rather than backed out as
# adjusted_total_pop / total_pop, because that ratio is 0/0 = NaN for tracts
# with no residents and the NaN would propagate into every *_adj column.
geometry_intersect['pop_weight'] = geometry_intersect['area_2'] / geometry_intersect['area_m2']


# Define the demographic and socioeconomic columns to be adjusted based on the population weight.
cols_to_weight = ['poverty_pop', 'non_us_citizen', 'workers_with_no_car',
                  'households_with_no_cars', 'disabled_pop', 'public_asst_pop',
                  'inc_extremelylow', 'inc_verylow', 'inc_low', 'male_seniors', 'female_seniors',
                  'veteran_pop']

# Apply the weight to each selected metric to create "<metric>_adj" columns
# representing the portion of each population subgroup within the stop buffer.
adjusted_cols = [f'{col}_adj' for col in cols_to_weight]
geometry_intersect[adjusted_cols] = (
    geometry_intersect[cols_to_weight].multiply(geometry_intersect['pop_weight'], axis=0)
)

In [52]:
# Identifying columns for the organization, stop, feed and tract, plus the
# geometry and area of each buffer/tract piece.
identifier_columns = [
    'name', 'organization_type', 'ntd_id', 'ntd_agency_info_key',
    'stop_id', 'stop_name', 'schedule_gtfs_dataset_key', 'feed_key',
    'GEOIDFQ', 'geometry', 'area_2',
]
# Buffer-adjusted population, the weight used, and every weighted metric.
adjusted_metric_columns = [
    'adjusted_total_pop', 'pop_weight',
    'poverty_pop_adj', 'non_us_citizen_adj', 'workers_with_no_car_adj',
    'households_with_no_cars_adj', 'disabled_pop_adj', 'public_asst_pop_adj',
    'inc_extremelylow_adj', 'inc_verylow_adj', 'inc_low_adj',
    'male_seniors_adj', 'female_seniors_adj', 'veteran_pop_adj',
]
filtered_final_data = geometry_intersect[identifier_columns + adjusted_metric_columns]

filtered_final_data.head(2)

Unnamed: 0,name,organization_type,ntd_id,ntd_agency_info_key,stop_id,stop_name,schedule_gtfs_dataset_key,feed_key,GEOIDFQ,geometry,area_2,adjusted_total_pop,pop_weight,poverty_pop_adj,non_us_citizen_adj,workers_with_no_car_adj,households_with_no_cars_adj,disabled_pop_adj,public_asst_pop_adj,inc_extremelylow_adj,inc_verylow_adj,inc_low_adj,male_seniors_adj,female_seniors_adj,veteran_pop_adj
0,City of Los Angeles,City/Town,90147,reccTizvO7pe1k1CS,5961046,Echo Park Ave & Donaldson St (Southbound),cc53a0dbf5df90e3009b9cb5d89d80ba,49f469dcf8712b562e3c970aa1b89731,1400000US06037197410,"POLYGON ((162046.722 -434622.218, 162034.799 -...",725281.427873,1962.197505,0.515689,123.249725,221.230678,14.954988,26.300151,1962.197505,877.187373,368.202107,336.229375,285.176142,112.420251,133.56351,48.990476
1,City of Los Angeles,City/Town,90147,reccTizvO7pe1k1CS,5797231,Echo Park Ave & Baxter St (Southbound) (4052),cc53a0dbf5df90e3009b9cb5d89d80ba,49f469dcf8712b562e3c970aa1b89731,1400000US06037197410,"POLYGON ((161975.618 -434794.073, 161968.418 -...",590112.226428,1596.506809,0.419581,100.279928,180.000373,12.167857,21.398646,1596.506809,713.707775,299.581041,273.567001,232.028453,91.468721,108.671554,39.860223
