In [1]:
import google.auth
import numpy as np
import pandas as pd
import pytidycensus as tc

pd.set_option("display.max_columns", None)

  import pytidycensus as tc


In [2]:
import google.auth
import pandas_gbq

credentials, project = google.auth.default()
from functools import cache

from calitp_data_analysis.gcs_pandas import GCSPandas

In [3]:
@cache
def gcs_pandas():
    return GCSPandas()

In [4]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [5]:
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/equity_index"

In [6]:
analysis_year = 2023

In [7]:
ca_fips = "06"

In [8]:
survey_variable = "acs5"

In [9]:
with open("config.txt", "r") as file:
    api_key = file.read().strip()

In [10]:
tc.set_census_api_key(api_key)

Census API key has been set for this session.


In [180]:
def load_county_fips(analysis_year:str)->pd.DataFrame:
    # Get CA county FIPS list via the API (or use your own list)
    df = tc.get_acs(
    geography="county",
    variables=["B01001_001E"],
    year=analysis_year,
    survey='acs5',
    state='06',
    output="wide",
)
    df["NAME"] = df["NAME"].str.replace(", California", "")

    df = df.rename(columns={"county": "COUNTYFP"})
    
    return df

In [181]:
county_codes = load_county_fips(analysis_year)

Getting data from the 2019-2023 5-year ACS


In [182]:
county_codes.sample()

Unnamed: 0,GEOID,B01001_001E,state,COUNTYFP,NAME,B01001_001_moe
20,6041,258765,6,41,Marin County,


In [15]:
def load_median_state_hhi(analysis_year:int)->int:
    # State median household income
    df = tc.get_acs(
    geography="state",
    variables=["B19013_001E"],  # Median household income
    year=analysis_year,
    state=ca_fips,  # California FIPS
    output="wide",
)
    state_cutoff = int(0.8 * df["B19013_001E"].iloc[0])
    return state_cutoff

In [16]:
state_cutoff = load_median_state_hhi(analysis_year = analysis_year)

Getting data from the 2019-2023 5-year ACS


In [17]:
state_cutoff

77067

In [18]:
def load_acs_data(
    geography = str,
    variable = str,
    analysis_year = int,
    county_codes = list,
)->pd.DataFrame:

    df_list = []
    for county in county_codes:
        df = tc.get_acs(
        geography=geography,
        variables=[variable],
        year=analysis_year,
        survey='acs5',
        state='06',
        county=county,
        tract="*",
        output="wide",
    )
        df_list.append(df)

    df2 = pd.concat(df_list, ignore_index=True)[["GEOID", variable + "E"]]
    # Ensure GEOID is treated as a string (important for slicing)
    df2["GEOID"] = df2["GEOID"].astype(str)

    # Create COUNTYFP
    df2["COUNTYFP"] = df2.GEOID.str.slice(2,5)
    return df2

In [99]:
def load_hh_size_data(geography: str,
                      variable:str,
                      analysis_year:int,
                      county_codes:list)->pd.DataFrame:
    df = load_acs_data(
    geography = geography,
    variable = "B25010_001",
    analysis_year = analysis_year,
    county_codes = county_codes
).rename(columns={"B25010_001E": "rounded_hh_size"})

    
    df["rounded_hh_size"] = (pd.to_numeric(df["rounded_hh_size"], errors="coerce")
                               .round()
                               .fillna(0) 
                               .astype(int)
                               )
    return df

In [34]:
def replace_words_with_numbers(df, column="hh_size"):
    # Mapping dictionary
    mapping = {
        "one": 1,
        "two": 2,
        "three": 3,
        "four": 4,
        "five": 5,
        "six": 6,
        "seven": 7,
        "eight": 8
    }
    # Replace using map
    df[column] = df[column].map(mapping)
    return df

In [27]:
def load_income_limits(analysis_year:str)->pd.DataFrame:
    ca_counties = load_county_fips(analysis_year=analysis_year,
                              )
    df = pd.read_csv(
    "gs://calitp-analytics-data/data-analyses/equity_index/hcd_low_income_limits_2024_cleaned.csv"
)
    df2 = pd.merge(
    df, ca_counties, left_on=["county"], right_on=["NAME"], how="inner"
)
    df2 = df2[['county', 'one', 'two', 'three', 'four', 'five', 'six', 'seven',
       'eight',  'COUNTYFP']]

    # Melt the dataframe from wide to long 
    df3 = df2.melt(id_vars=["COUNTYFP", "county"], value_vars=[ 'one', 'two', 'three', 'four', 'five', 'six', 'seven',
       'eight']).rename(columns = {"value":"local_low_income_threshold", "variable":"hh_size"})

    # Make sure the strings are replaced by integers 
    df3 = replace_words_with_numbers(df3)
    df3["COUNTYFP"] = df3["COUNTYFP"].astype(str)
    return df3

In [24]:
income_lims = load_income_limits(analysis_year = analysis_year)


Getting data from the 2019-2023 5-year ACS


In [25]:
income_lims.head()

Unnamed: 0,COUNTYFP,county,variable,local_low_income_threshold
0,1,Alameda County,1,84600
1,3,Alpine County,1,59200
2,5,Amador County,1,56450
3,7,Butte County,1,50750
4,9,Calaveras County,1,53400


In [26]:
income_lims.shape

(464, 4)

In [108]:
def create_flags(
    geography:str,
    analysis_year:int,
    county_codes: list,
    state_cutoff: int,
    suffix: str

) -> pd.DataFrame:
  # Load Census Data
    hh_income_data = load_acs_data(
    geography = geography,
    variable = "B19013_001",
    analysis_year = analysis_year,
    county_codes = county_codes).rename(columns = {"B19013_001E":"median_hh_income"})
    
    hh_size_data = load_hh_size_data(
    geography = geography,
    variable = "B25010_001",
    analysis_year = analysis_year,
    county_codes = county_codes
)

    # Load income limits
    income_lims = load_income_limits(analysis_year = analysis_year)
    
    # Merge
    m1 = (
    hh_income_data
    # full outer join on GEOID (R's all = TRUE)
    .merge(hh_size_data, on=["GEOID", "COUNTYFP"], how="outer")
    .merge(income_lims, left_on = ["COUNTYFP","rounded_hh_size"], right_on = ["COUNTYFP", "hh_size"], how = "left")
)

    
    # Create Flags
    # Flag 1: flag whether a geography's median household income is equal to or lower than the local low income threshold
    m1["localized_income_screen"] = np.select(
    [
        m1["local_low_income_threshold"].notna() &
        (m1["median_hh_income"] <= m1["local_low_income_threshold"]),
        m1["local_low_income_threshold"].notna() &
        (m1["median_hh_income"] >  m1["local_low_income_threshold"]),
    ],
    [1, 0],
    default=np.nan)

    # Flag 2: flag whether a geography's median household income is equal or lower to the statewide income limit
    m1["state_income_screen"] = np.select(
    [
        m1["median_hh_income"] <= state_cutoff,
        m1["median_hh_income"] >  state_cutoff,
    ],
    [1, 0],
    default=np.nan
    ).astype(int)

    # Flag 3: if a geography's median household income is lower than the local_low_income_threshold and state_income_screen, flag as 1. Else flag as 0
    m1["income_screen"] = np.select(
    [
        (m1["localized_income_screen"] == 1) | (m1["state_income_screen"] == 1),
        (m1["localized_income_screen"] == 0) & (m1["state_income_screen"] == 0),
    ],
    [1, 0],
    default=np.nan).astype(int)

    # Clean 
    m1['state_income_screen'] = m1['state_income_screen'].clip(lower=0)
    m1['localized_income_screen'] = m1['localized_income_screen'].clip(lower=0)
    m1['income_screen'] = m1['income_screen'].clip(lower=0)
    m1 = m1[["GEOID", "median_hh_income", "rounded_hh_size", "localized_income_screen", "state_income_screen", "income_screen"]]

    m1 = m1.rename(columns=lambda col: col if col == "GEOID" else f"{col}{suffix}")

    # Save
    m1.to_csv(f"gs://calitp-analytics-data/data-analyses/equity_index/low_income{suffix}.csv")

    # Make we aren't missing rows
    print("merge shape")
    display(m1.shape)

    print("hh size shape")
    display(hh_size_data.shape)

    print("hh income shape")
    display(hh_income_data.shape)
    return m1 


In [171]:
income_blockgroup = create_flags(
    geography = "block group",
    analysis_year = analysis_year,
    county_codes = county_codes,
    state_cutoff = state_cutoff,
    suffix = "_bg"
)

Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS



KeyboardInterrupt



In [37]:
type(income_blockgroup)

pandas.core.frame.DataFrame

In [39]:
income_blockgroup.sample(3)

Unnamed: 0,GEOID,median_hh_income_bg,rounded_hh_size_bg,localized_income_screen_bg,state_income_screen_bg,income_screen_bg
1038,6001450608,250001,4,0.0,0,0
5232,6037195802,85357,3,1.0,0,1
23446,6095250900,46801,2,1.0,1,1


In [71]:
income_blockgroup.groupby(["localized_income_screen_bg","state_income_screen_bg", "income_screen_bg"]).agg({"GEOID":"nunique"})

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,GEOID
localized_income_screen_bg,state_income_screen_bg,income_screen_bg,Unnamed: 3_level_1
0.0,0,0,12676
0.0,1,1,678
1.0,0,1,3845
1.0,1,1,6824


In [102]:
income_blockgroup.shape

(25607, 6)

In [127]:
income_tract = create_flags(
    geography = "tract",
    analysis_year = analysis_year,
    county_codes = county_codes,
    state_cutoff = state_cutoff,
    suffix = "_tract"
)

Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting dat

  ).astype(int)
  default=np.nan).astype(int)


merge shape


(9129, 6)

hh size shape


(9129, 3)

hh income shape


(9129, 3)

In [112]:
income_tract.groupby(["localized_income_screen_tract","state_income_screen_tract", "income_screen_tract"]).agg({"GEOID":"nunique"})

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,GEOID
localized_income_screen_tract,state_income_screen_tract,income_screen_tract,Unnamed: 3_level_1
0.0,0,0,4498
0.0,1,1,311
1.0,0,1,1609
1.0,1,1,2585


In [111]:
income_tract.head(3)

Unnamed: 0,GEOID,median_hh_income_tract,rounded_hh_size_tract,localized_income_screen_tract,state_income_screen_tract,income_screen_tract
0,6001400100,250001,2,0.0,0,0
1,6001400200,225880,2,0.0,0,0
2,6001400300,157731,2,0.0,0,0


In [143]:

def compute_income_screen_final(income_screen_bg, income_screen_tract):
    # Check conditions in the same order as your R case_when
    if income_screen_bg == 1:
        return 1
    elif income_screen_bg == 0:
        return 0
    elif income_screen_bg is None and income_screen_tract == 1:
        return 1
    elif income_screen_bg is None and income_screen_tract == 0:
        return 0
    else:
        None


In [151]:
def compute_income_screen_geo(income_screen_bg, income_screen_tract):
    if pd.notna(income_screen_bg):
        return "block group"
    elif pd.isna(income_screen_bg) and pd.notna(income_screen_tract):
        return "tract"
    else:
        return "missing data"

In [160]:
county_codes = load_county_fips(analysis_year)

Getting data from the 2019-2023 5-year ACS


In [183]:
def create_eqi_demographic_overlay(
    analysis_year:int,) -> pd.DataFrame:

    # Load counties
    county_codes_df = load_county_fips(analysis_year)
    county_codes_list = county_codes_df["COUNTYFP"].unique().tolist()
    state_cutoff = load_median_state_hhi(analysis_year = analysis_year)
    
    # Create  blockgroup dataframe
    income_blockgroup = create_flags(
    geography = "block group",
    analysis_year = analysis_year,
    county_codes = county_codes_list,
    state_cutoff = state_cutoff,
    suffix = "_bg")
    
    income_blockgroup['TRACT_GEOID'] = income_blockgroup['GEOID'].astype(str).str.slice(0, 11)
    
    # Create tract dataframe
    income_tract = create_flags(
    geography = "tract",
    analysis_year = analysis_year,
    county_codes = county_codes_list,
    state_cutoff = state_cutoff,
    suffix = "_tract"
)

    # Merge them
    combined_income = pd.merge(income_blockgroup.drop(columns = ["GEOID"]), income_tract, left_on = ["TRACT_GEOID"], right_on = ["GEOID"], how = "left", indicator = True)

    # Create final columns 
    combined_income['income_screen_final'] = combined_income.apply(
    lambda row: compute_income_screen_final(row['income_screen_bg'], row['income_screen_tract']),
    axis=1)
    
    combined_income["income_screen_geo"] = combined_income.apply(
    lambda row: compute_income_screen_geo(row["income_screen_bg"], row["income_screen_tract"]),
    axis=1
)

    # Save
    m1.to_csv(f"gs://calitp-analytics-data/data-analyses/equity_index/low_income_blockgroups_updated.csv")
    return combined_income
    

In [175]:
analysis_year

2023

In [185]:
# final = create_eqi_demographic_overlay(analysis_year = analysis_year)