In [1]:
pip install pygris

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Import the required packages
import pandas as pd 
import geopandas as gpd
import google.auth
import os
import gcsfs
import requests
from pygris import tracts 
from pygris.utils import erase_water
# Show every column when a wide DataFrame is displayed.
pd.set_option("display.max_columns", None)
# Filesystem handle used further down to write the result to Google Cloud Storage.
fs = gcsfs.GCSFileSystem()

In [3]:
# Root GCS location under which this notebook stores its output.
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses"

In [4]:
# Read the Census API key from the local file "ACS_apikey", trimming surrounding whitespace.
with open("ACS_apikey") as key_file:
    api_key = key_file.read().strip()

In [5]:
# ACS variable codes requested for every California census tract (the request in
# fetch_acs uses `for=tract:*`, so these are tract-level values, not county-level).
# Each code is renamed to a readable column name in a later cell.
variables = [
    "B01003_001E",                                                                            # Total Population
    "B17001_002E",                                                                            # Population with Income in the past 12 months below poverty level
    "B16008_037E",                                                                            # Non US Citizen Population
    "B01001_020E", "B01001_021E", "B01001_022E", "B01001_023E", "B01001_024E", "B01001_025E", # Male senior population : 65 and above
    "B01001_044E", "B01001_045E", "B01001_046E", "B01001_047E", "B01001_048E", "B01001_049E", # Female senior population : 65 and above
    "B01001_006E", "B01001_007E", "B01001_008E", "B01001_009E", "B01001_010E",                # Male population : 15-24
    "B01001_030E", "B01001_031E", "B01001_032E", "B01001_033E", "B01001_034E",                # Female population: 15-24
    "B19013_001E",                                                                            # Median household income in the past 12 months (2023 Inflation adjusted dollars)
    "B06010_004E", "B06010_005E", "B06010_006E",                                              # Income brackets below $25,000 (summed later as "extremely low income")
    "B06010_007E", "B06010_008E",                                                             # Income brackets $25,000-$49,999 (summed later as "very low income")
    "B06010_009E", "B06010_010E",                                                             # Income brackets $50,000-$74,999 (summed later as "low income")
    "B08014_002E", "B08201_002E",                                                             # Workers and Households with no cars
    "B18101_001E",                                                                            # Disability table total. NOTE(review): in the sample output this matches total_pop, so it appears to be the table universe, not the disabled count; it is never renamed and disabled_pop is derived from the per-age/sex codes below
    "B19058_001E",                                                                            # Public Assistance Income or Food Stamps/SNAP in past 12 months for Households. NOTE(review): _001E looks like the table total (all households) — confirm whether _002E (households receiving assistance) was intended
    "B21001_002E",                                                                            # Population with veteran status: 18 and above
    "B18101_004E", "B18101_007E", "B18101_010E", "B18101_013E", "B18101_016E",                # Population with disability by age band, male then female (summed into disabled_pop later)
    "B18101_019E", "B18101_023E", "B18101_026E", "B18101_029E", "B18101_032E", 
    "B18101_035E", "B18101_038E"
]

In [6]:
def fetch_acs(vars_subset, api_key, timeout=60):
    """Download 2023 ACS 5-year values for every census tract in state 06.

    Parameters
    ----------
    vars_subset : iterable of str
        ACS variable codes to request (for example ``"B01003_001E"``).
    api_key : str
        Census API key sent with the request.
    timeout : float, optional
        Seconds to wait for the Census API before giving up. Defaults to 60.

    Returns
    -------
    pandas.DataFrame
        One row per tract containing ``NAME``, the requested variables exactly
        as returned by the API, and ``GEOID`` (the state, county and tract codes
        concatenated). The separate ``state``/``county``/``tract`` columns are
        dropped.

    Raises
    ------
    requests.HTTPError
        If the API responds with an error status.
    requests.Timeout
        If no response arrives within ``timeout`` seconds.
    ValueError
        If the response body is not the expected "header row + data rows" JSON
        table.
    """
    # Joining a single list avoids a dangling comma when vars_subset is empty.
    var_str = ",".join(["NAME", *vars_subset])
    url = (
        "https://api.census.gov/data/2023/acs/acs5"
        f"?get={var_str}&for=tract:*&in=state:06&key={api_key}"
    )

    # Without a timeout a stalled connection would hang the notebook indefinitely.
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()

    data = r.json()
    # The first row is the header; anything else is not a usable table.
    if not isinstance(data, list) or not data:
        raise ValueError("Unexpected response from the Census API: expected a non-empty JSON array")

    df = pd.DataFrame(data[1:], columns=data[0])

    # Build the tract identifier used as the join key in the rest of the notebook.
    df["GEOID"] = df["state"] + df["county"] + df["tract"]
    return df.drop(columns=["state", "county", "tract"])


In [7]:
def chunk(lst, size=35):
    """Split ``lst`` into consecutive slices holding at most ``size`` items.

    Parameters
    ----------
    lst : sequence
        Any sliceable sequence (list, tuple, ...).
    size : int, optional
        Maximum number of items per slice. Must be positive. Defaults to 35.

    Returns
    -------
    list
        The slices in their original order; the final slice may be shorter.
        An empty ``lst`` yields an empty list.

    Raises
    ------
    ValueError
        If ``size`` is smaller than 1. A negative size would otherwise
        silently produce no slices at all.
    """
    if size < 1:
        raise ValueError(f"size must be a positive integer, got {size!r}")
    return [lst[start:start + size] for start in range(0, len(lst), size)]

# Split the variable codes into batches of at most 35; each batch is fetched with its own request.
# NOTE(review): presumably this keeps each call under the Census API's per-request variable limit — confirm.
chunks = chunk(variables, size=35)


In [8]:
# Fetch each batch of variables with its own request. The loop variable is named
# var_batch so it does not reuse the name of the chunk() helper.
dfs = [fetch_acs(var_batch, api_key) for var_batch in chunks]

# Join the batches side by side on the tract identifiers. validate="one_to_one"
# makes the merge raise if a tract occurs more than once in either frame instead
# of silently multiplying rows.
census_data = dfs[0]
for batch_df in dfs[1:]:
    census_data = census_data.merge(
        batch_df,
        on=["GEOID", "NAME"],
        how="inner",
        validate="one_to_one",
    )


In [9]:
# Derive county_name from NAME: take the text after the first semicolon up to the
# next semicolon, without a trailing " County". NAME is dropped afterwards.
census_data = census_data.assign(
    county_name=census_data["NAME"].str.extract(r';\s*(.*?)(?: County)?;')
).drop(columns=["NAME"])


In [10]:
# Every column other than the two identifiers holds an ACS value; cast them to int.
# NOTE(review): ACS is understood to encode unavailable estimates as large negative
# sentinels (e.g. -666666666). If present they survive this cast — confirm whether
# they should be masked before the data is used.
num_cols = census_data.columns.difference(["GEOID", "county_name"])
census_data = census_data.astype({name: int for name in num_cols})


In [11]:
# Readable column names for the ACS variable codes.
# B18101_001E is deliberately absent: it keeps its raw code, and disabled_pop is
# derived later from the per-age/sex disability columns.
acs_column_names = {
    # Headline counts
    'B01003_001E': 'total_pop',
    'B17001_002E': 'poverty_pop',
    'B16008_037E': 'non_us_citizen',
    # Male, 65 and over
    'B01001_020E': 'male_65_to_66',
    'B01001_021E': 'male_67_to_69',
    'B01001_022E': 'male_70_to_74',
    'B01001_023E': 'male_75_to_79',
    'B01001_024E': 'male_80_to_84',
    'B01001_025E': 'male_85_and_over',
    # Female, 65 and over
    'B01001_044E': 'female_65_to_66',
    'B01001_045E': 'female_67_to_69',
    'B01001_046E': 'female_70_to_74',
    'B01001_047E': 'female_75_to_79',
    'B01001_048E': 'female_80_to_84',
    'B01001_049E': 'female_85_and_over',
    # Male, 15 to 24
    'B01001_006E': 'male_15_17',
    'B01001_007E': 'male_18_19',
    'B01001_008E': 'male_20',
    'B01001_009E': 'male_21',
    'B01001_010E': 'male_22_24',
    # Female, 15 to 24
    'B01001_030E': 'female_15_17',
    'B01001_031E': 'female_18_19',
    'B01001_032E': 'female_20',
    'B01001_033E': 'female_21',
    'B01001_034E': 'female_22_24',
    # Income
    'B19013_001E': 'median_household_income',
    'B06010_004E': 'income_less_10000',
    'B06010_005E': 'income_10000_14999',
    'B06010_006E': 'income_15000_24999',
    'B06010_007E': 'income_25000_34999',
    'B06010_008E': 'income_35000_49999',
    'B06010_009E': 'income_50000_64999',
    'B06010_010E': 'income_65000_74999',
    # Vehicle availability
    'B08014_002E': 'workers_with_no_car',
    'B08201_002E': 'households_with_no_cars',
    # Public assistance and veterans
    'B19058_001E': 'public_asst_pop',
    'B21001_002E': 'veteran_pop',
    # Disability by sex and age band
    'B18101_004E': 'male_under5_with_disability',
    'B18101_007E': 'male_5_17_with_disability',
    'B18101_010E': 'male_18_34_with_disability',
    'B18101_013E': 'male_35_64_with_disability',
    'B18101_016E': 'male_65_74_with_disability',
    'B18101_019E': 'male_75_plus_with_disability',
    'B18101_023E': 'female_under5_with_disability',
    'B18101_026E': 'female_5_17_with_disability',
    'B18101_029E': 'female_18_34_with_disability',
    'B18101_032E': 'female_35_64_with_disability',
    'B18101_035E': 'female_65_74_with_disability',
    'B18101_038E': 'female_75_plus_with_disability',
}
census_data = census_data.rename(columns=acs_column_names)

In [12]:
# Identifier columns are left alone; every other column is converted to a numeric
# dtype, and values that cannot be parsed become NaN.
exclude = ['state', 'county', 'tract', 'county_name', 'GEOID']
cols_to_numeric = census_data.columns[~census_data.columns.isin(exclude)].tolist()
census_data[cols_to_numeric] = census_data[cols_to_numeric].apply(
    lambda series: pd.to_numeric(series, errors='coerce')
)

In [13]:
## Roll the ACS income brackets up into three broader groups:
## extremely low (under $25,000), very low ($25,000-$49,999) and low ($50,000-$74,999).
census_data['inc_extremelylow'] = (
    census_data['income_less_10000']
    .add(census_data['income_10000_14999'])
    .add(census_data['income_15000_24999'])
)
census_data['inc_verylow'] = census_data['income_25000_34999'].add(
    census_data['income_35000_49999']
)
census_data['inc_low'] = census_data['income_50000_64999'].add(
    census_data['income_65000_74999']
)

In [14]:
# Sum all senior age brackets (65+) to get total male and female senior populations.
# The columns are listed explicitly rather than taken with a label slice, so the
# result does not depend on the order in which the columns happen to sit in the frame.
senior_age_bands = ['65_to_66', '67_to_69', '70_to_74', '75_to_79', '80_to_84', '85_and_over']
male_senior_cols = [f'male_{band}' for band in senior_age_bands]
female_senior_cols = [f'female_{band}' for band in senior_age_bands]

census_data['male_seniors'] = census_data[male_senior_cols].sum(axis=1)
census_data['female_seniors'] = census_data[female_senior_cols].sum(axis=1)

In [15]:
# Sum the 15-24 age brackets to get total male and female youth populations.
# The columns are listed explicitly rather than taken with a label slice, so the
# result does not depend on the order in which the columns happen to sit in the frame.
youth_age_bands = ['15_17', '18_19', '20', '21', '22_24']
male_youth_cols = [f'male_{band}' for band in youth_age_bands]
female_youth_cols = [f'female_{band}' for band in youth_age_bands]

census_data['male_youth'] = census_data[male_youth_cols].sum(axis=1)
census_data['female_youth'] = census_data[female_youth_cols].sum(axis=1)

In [16]:
# Add up the per-sex, per-age-band disability counts into a single disabled_pop
# column, then remove the detailed columns.
disability_cols = [
    f'{sex}_{age_band}_with_disability'
    for sex in ('male', 'female')
    for age_band in ('under5', '5_17', '18_34', '35_64', '65_74', '75_plus')
]
census_data['disabled_pop'] = census_data.loc[:, disability_cols].sum(axis=1)
census_data = census_data.drop(columns=disability_cols)

In [17]:
# Preview the first five rows of the prepared tract table.
census_data.head()

Unnamed: 0,total_pop,poverty_pop,non_us_citizen,male_65_to_66,male_67_to_69,male_70_to_74,male_75_to_79,male_80_to_84,male_85_and_over,female_65_to_66,female_67_to_69,female_70_to_74,female_75_to_79,female_80_to_84,female_85_and_over,male_15_17,male_18_19,male_20,male_21,male_22_24,female_15_17,female_18_19,female_20,female_21,female_22_24,median_household_income,income_less_10000,income_10000_14999,income_15000_24999,income_25000_34999,income_35000_49999,income_50000_64999,income_65000_74999,workers_with_no_car,households_with_no_cars,GEOID,B18101_001E,public_asst_pop,veteran_pop,county_name,inc_extremelylow,inc_verylow,inc_low,male_seniors,female_seniors,male_youth,female_youth,disabled_pop
0,3094,134,264,47,84,119,49,46,78,52,70,72,85,105,107,19,0,0,0,26,26,13,0,14,19,250001,188,75,134,157,87,129,70,28,85,6001400100,3094,1316,129,Alameda,397,244,199,423,491,45,72,313
1,2093,164,96,18,60,59,58,28,26,40,35,67,96,34,13,33,0,3,0,10,28,9,0,0,0,225880,75,70,89,12,207,77,32,92,95,6001400200,2093,861,38,Alameda,234,219,109,249,285,46,37,168
2,5727,310,306,23,47,113,100,24,25,108,62,194,158,13,142,94,88,0,6,93,55,33,0,26,43,157731,383,201,300,251,400,148,291,157,416,6001400300,5727,2713,80,Alameda,884,651,439,332,677,281,157,459
3,4395,343,185,31,70,89,19,26,36,55,105,104,43,23,30,30,41,0,0,0,27,9,0,0,38,159612,187,105,287,215,207,178,87,134,204,6001400400,4376,1803,88,Alameda,579,422,265,271,360,71,74,339
4,3822,397,231,41,32,56,41,4,0,19,47,51,50,60,203,26,81,9,0,115,7,0,0,98,46,96250,256,91,244,213,385,387,244,74,169,6001400500,3822,1655,115,Alameda,591,598,631,174,430,231,151,270


In [18]:
# Check column dtypes and non-null counts after the conversions above.
census_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9129 entries, 0 to 9128
Data columns (total 48 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   total_pop                9129 non-null   int64 
 1   poverty_pop              9129 non-null   int64 
 2   non_us_citizen           9129 non-null   int64 
 3   male_65_to_66            9129 non-null   int64 
 4   male_67_to_69            9129 non-null   int64 
 5   male_70_to_74            9129 non-null   int64 
 6   male_75_to_79            9129 non-null   int64 
 7   male_80_to_84            9129 non-null   int64 
 8   male_85_and_over         9129 non-null   int64 
 9   female_65_to_66          9129 non-null   int64 
 10  female_67_to_69          9129 non-null   int64 
 11  female_70_to_74          9129 non-null   int64 
 12  female_75_to_79          9129 non-null   int64 
 13  female_80_to_84          9129 non-null   int64 
 14  female_85_and_over       9129 non-null  

In [19]:
# Retrieve the 2023 tract geometries for California (cb=True), reproject them to
# EPSG:3310, split multi-part geometries into one row per part, and drop empty
# geometries.
# NOTE(review): after explode(), a tract made of several parts occupies several rows
# that share one GEOID, so the later merge on GEOID repeats that tract's ACS values
# on every part — confirm downstream consumers account for this.
ca_tracts_full = (
    tracts(state="CA", cb=True, year=2023, cache=True)
    .to_crs(3310)
    .explode(index_parts=False)
    .reset_index(drop=True)
)
ca_tracts_full = ca_tracts_full.loc[~ca_tracts_full.is_empty]

Using FIPS code '06' for input 'CA'


In [20]:
# Run pygris' erase_water on a copy of the tract geometries (presumably removing
# water area from each polygon), leaving ca_tracts_full itself unchanged.
# NOTE(review): ca_tracts_land is not referenced again in the cells shown here — the
# merge with the ACS data uses ca_tracts_full. Confirm which geometry set is intended.
ca_tracts_land = erase_water(ca_tracts_full.copy())

  return geopandas.overlay(


In [21]:
# Attach the ACS values to the tract geometries by GEOID, keeping only tracts present
# in both. Geometry rows may share a GEOID (one row per polygon part), so the census
# side must be unique: validate="many_to_one" raises if census_data ever contains a
# duplicated GEOID instead of silently multiplying rows.
tracts_ca_acs = ca_tracts_full.merge(
    census_data,
    how="inner",
    on="GEOID",
    validate="many_to_one",
)

In [22]:
# Store the area of each geometry row. The value is in the units of the layer's CRS
# (EPSG:3310 was set when the geometries were loaded; assumed to be metres, hence m2).
tracts_ca_acs["area_m2"] = tracts_ca_acs.area

In [23]:
# Write the merged tract table to the warehouse bucket as a Parquet file.
output_path = f"{GCS_FILE_PATH}/transit_provider_dashboard/census_tracts_data.parquet"
with fs.open(output_path, "wb") as parquet_file:
    tracts_ca_acs.to_parquet(parquet_file, index=False)