In [1]:
import google.auth
import numpy as np
import pandas as pd
import pytidycensus as tc

pd.set_option("display.max_columns", None)

  import pytidycensus as tc


In [2]:
import google.auth
import pandas_gbq

credentials, project = google.auth.default()
from functools import cache

from calitp_data_analysis.gcs_pandas import GCSPandas

In [3]:
@cache
def gcs_pandas():
    return GCSPandas()

In [4]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [5]:
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/equity_index"

In [6]:
analysis_year = 2023

In [7]:
ca_fips = "06"

In [8]:
survey_variable = "acs5"

In [9]:
with open("config.txt", "r") as file:
    api_key = file.read().strip()

In [10]:
tc.set_census_api_key(api_key)

Census API key has been set for this session.


In [11]:
def load_county_fips(analysis_year:str)->pd.DataFrame:
    # Get CA county FIPS list via the API (or use your own list)
    df = tc.get_acs(
    geography="county",
    variables=["B01001_001E"],
    year=analysis_year,
    survey='acs5',
    state='06',
    output="wide",
)
    df["NAME"] = df["NAME"].str.replace(", California", "")

    df = df.rename(columns={"county": "COUNTYFP"})
    return df

In [12]:
# Get CA county FIPS list via the API (or use your own list)
ca_counties = load_county_fips(analysis_year=analysis_year,
                              )

Getting data from the 2019-2023 5-year ACS


In [13]:
ca_counties.head(2)

Unnamed: 0,GEOID,B01001_001E,state,COUNTYFP,NAME,B01001_001_moe
0,6001,1651949,6,1,Alameda County,
1,6003,1695,6,3,Alpine County,234.0


In [14]:
county_codes = ca_counties["COUNTYFP"].unique().tolist()

In [15]:
def load_median_state_hhi(analysis_year:int)->int:
    # State median household income
    df = tc.get_acs(
    geography="state",
    variables=["B19013_001E"],  # Median household income
    year=analysis_year,
    state=ca_fips,  # California FIPS
    output="wide",
)
    state_cutoff = int(0.8 * df["B19013_001E"].iloc[0])
    return state_cutoff

In [16]:
state_cutoff = load_median_state_hhi(analysis_year = analysis_year)

Getting data from the 2019-2023 5-year ACS


In [17]:
state_cutoff

77067

In [18]:
survey_variable + "E"

'acs5E'

In [19]:
def load_acs_data(
    geography = str,
    variable = str,
    analysis_year = int,
    county_codes = list,
)->pd.DataFrame:

    df_list = []
    for county in county_codes:
        df = tc.get_acs(
        geography=geography,
        variables=[variable],
        year=analysis_year,
        survey='acs5',
        state='06',
        county=county,
        tract="*",
        output="wide",
    )
        df_list.append(df)

    df2 = pd.concat(df_list, ignore_index=True)[["GEOID", variable + "E"]]
    # Ensure GEOID is treated as a string (important for slicing)
    df2["GEOID"] = df2["GEOID"].astype(str)

    # Create COUNTYFP
    df2["COUNTYFP"] = df2.GEOID.str.slice(2,5)
    return df2

In [20]:
hh_income_data_blockgroup = load_acs_data(
    geography = "block group",
    variable = "B19013_001",
    analysis_year = analysis_year,
    county_codes = county_codes
).rename(columns = {"B19013_001E":"median_hh_income"})

Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting dat

In [21]:
def load_hh_size_data(geography: str,
                      variable:str,
                      analysis_year:int,
                      county_codes:list)->pd.DataFrame:
    df = load_acs_data(
    geography = "block group",
    variable = "B25010_001",
    analysis_year = analysis_year,
    county_codes = county_codes
).rename(columns={"B25010_001E": "rounded_hh_size"})

    
    df["rounded_hh_size"] = (pd.to_numeric(df["rounded_hh_size"], errors="coerce")
                               .round()
                               .fillna(0) 
                               .astype(int)
                               )
    return df

In [22]:
hh_size_data_blockgroup = load_hh_size_data(
    geography = "block group",
    variable = "B25010_001",
    analysis_year = analysis_year,
    county_codes = county_codes
)

Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting dat

In [23]:
hh_size_data_blockgroup.sample()

Unnamed: 0,GEOID,rounded_hh_size,COUNTYFP
15136,60650464034,3,65


In [24]:
def replace_words_with_numbers(df, column="variable"):
    # Mapping dictionary
    mapping = {
        "one": 1,
        "two": 2,
        "three": 3,
        "four": 4,
        "five": 5,
        "six": 6,
        "seven": 7,
        "eight": 8
    }
    # Replace using map
    df[column] = df[column].map(mapping)
    return df

In [25]:
def load_income_limits(analysis_year:str)->pd.DataFrame:
    ca_counties = load_county_fips(analysis_year=analysis_year,
                              )
    df = pd.read_csv(
    "gs://calitp-analytics-data/data-analyses/equity_index/hcd_low_income_limits_2024_cleaned.csv"
)
    df2 = pd.merge(
    df, ca_counties, left_on=["county"], right_on=["NAME"], how="inner"
)
    df2 = df2[['county', 'one', 'two', 'three', 'four', 'five', 'six', 'seven',
       'eight',  'COUNTYFP']]

    # Melt the dataframe from wide to long 
    df3 = df2.melt(id_vars=["COUNTYFP", "county"], value_vars=[ 'one', 'two', 'three', 'four', 'five', 'six', 'seven',
       'eight']).rename(columns = {"value":"local_low_income_threshold"})

    # Make sure the strings are replaced by integers 
    df3 = replace_words_with_numbers(df3)
    df3["COUNTYFP"] = df3["COUNTYFP"].astype(str)
    return df3

In [26]:
income_lims = load_income_limits(analysis_year = analysis_year)

Getting data from the 2019-2023 5-year ACS


In [27]:
income_lims.sample()

Unnamed: 0,COUNTYFP,county,variable,local_low_income_threshold
24,49,Modoc County,1,49250


In [28]:
m1 = hh_income_data_blockgroup.merge(hh_size_data_blockgroup, on=["GEOID", "COUNTYFP"], how="outer")

In [29]:
def create_flags(
    geography:str,
    analysis_year:int,
    county_codes: list,
    state_cutoff: int,
    suffix: str

) -> pd.DataFrame:
  # Load Census Data
    hh_income_data = load_acs_data(
    geography = geography,
    variable = "B19013_001",
    analysis_year = analysis_year,
    county_codes = county_codes).rename(columns = {"B19013_001E":"median_hh_income"})

    display(hh_income_data.shape)

    print("hh income shape")
    hh_size_data = load_hh_size_data(
    geography = geography,
    variable = "B25010_001",
    analysis_year = analysis_year,
    county_codes = county_codes
)

    print("hh size shape")
    display(hh_size_data.shape)
    
    # Load income limits
    income_lims = load_income_limits(analysis_year = analysis_year)

    # Merge
    m1 = (
    hh_income_data
    # full outer join on GEOID (R's all = TRUE)
    .merge(hh_size_data, on=["GEOID", "COUNTYFP"], how="outer")
    .merge(income_lims, left_on = ["COUNTYFP","rounded_hh_size"], right_on = ["COUNTYFP", "variable"], how = "left")
)

    print("merge shape")
    display(m1.shape)
    # Create Flags
    # Flag 1: flag whether a geography's median household income is equal to or lower than the local low income threshold
    m1["localized_income_screen"] = np.select(
    [
        m1["local_low_income_threshold"].notna() &
        (m1["median_hh_income"] <= m1["local_low_income_threshold"]),
        m1["local_low_income_threshold"].notna() &
        (m1["median_hh_income"] >  m1["local_low_income_threshold"]),
    ],
    [1, 0],
    default=np.nan)

    # Flag 2: flag whether a geography's median household income is equal or lower to the statewide income limit
    m1["state_income_screen"] = np.select(
    [
        m1["median_hh_income"] <= state_cutoff,
        m1["median_hh_income"] >  state_cutoff,
    ],
    [1, 0],
    default=np.nan
    ).astype(int)

    # Flag 3: if a geography's median household income is lower than the local_low_income_threshold and state_income_screen, flag as 1. Else flag as 0
    m1["income_screen"] = np.select(
    [
        (m1["localized_income_screen"] == 1) | (m1["state_income_screen"] == 1),
        (m1["localized_income_screen"] == 0) & (m1["state_income_screen"] == 0),
    ],
    [1, 0],
    default=np.nan).astype(int)

    m1 = m1[["GEOID", "median_hh_income", "rounded_hh_size", "localized_income_screen", "state_income_screen", "income_screen"]]

    m1 = m1.rename(columns=lambda col: col if col == "GEOID" else f"{col}{suffix}")

    # Save
    m1.to_csv(f"gs://calitp-analytics-data/data-analyses/equity_index/low_income{suffix}.csv")
    return m1 


In [30]:
income_blockgroup = create_flags(
    geography = "block group",
    analysis_year = analysis_year,
    county_codes = county_codes,
    state_cutoff = state_cutoff,
    suffix = "_bg"
)

Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting dat

(25607, 3)

hh income shape
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year

(25607, 3)

Getting data from the 2019-2023 5-year ACS
merge shape


(25607, 7)

  ).astype(int)
  default=np.nan).astype(int)


In [31]:
income_blockgroup["GEOID"] = income_blockgroup.GEOID.str[:11]

In [32]:
income_tract = create_flags(
    geography = "tract",
    analysis_year = analysis_year,
    county_codes = county_codes,
    state_cutoff = state_cutoff,
    suffix = "_tract"
)

Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting dat

(9129, 3)

hh income shape
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year ACS
Getting data from the 2019-2023 5-year

(25607, 3)

Getting data from the 2019-2023 5-year ACS
merge shape


(34736, 7)

  ).astype(int)
  default=np.nan).astype(int)


In [33]:
combined_income = pd.merge(income_blockgroup, income_tract, on = ["GEOID"],
             how = "left")

In [34]:

def compute_income_screen_final(income_screen_bg, income_screen_tract):
    # Check conditions in the same order as your R case_when
    if income_screen_bg == 1:
        return 1
    elif income_screen_bg == 0:
        return 0
    elif income_screen_bg is None and income_screen_tract == 1:
        return 1
    elif income_screen_bg is None and income_screen_tract == 0:
        return 0
    else:
        None


In [35]:
def compute_income_screen_geo(income_screen_bg, income_screen_tract):
    if pd.notna(income_screen_bg):
        return "block group"
    elif pd.isna(income_screen_bg) and pd.notna(income_screen_tract):
        return "tract"
    else:
        return "missing data"


In [36]:

combined_income['income_screen_final'] = combined_income.apply(
    lambda row: compute_income_screen_final(row['income_screen_bg'], row['income_screen_tract']),
    axis=1)


In [37]:

combined_income["income_screen_geo"] = combined_income.apply(
    lambda row: compute_income_screen_geo(row["income_screen_bg"], row["income_screen_tract"]),
    axis=1
)


In [38]:
combined_income.sample(3)

Unnamed: 0,GEOID,median_hh_income_bg,rounded_hh_size_bg,localized_income_screen_bg,state_income_screen_bg,income_screen_bg,median_hh_income_tract,rounded_hh_size_tract,localized_income_screen_tract,state_income_screen_tract,income_screen_tract,income_screen_final,income_screen_geo
7746,6037462301,85655,2,1.0,0,1,89018,,,0,-9223372036854775808,1.0,block group
11763,6059011710,151607,3,0.0,0,0,145476,,,0,-9223372036854775808,0.0,block group
15136,6065046403,95774,3,0.0,0,0,94803,,,0,-9223372036854775808,0.0,block group


In [39]:
combined_income.income_screen_final.value_counts()

income_screen_final
0.00    12676
1.00    11349
Name: count, dtype: int64

In [41]:
combined_income.income_screen_geo.value_counts()

income_screen_geo
block group    25607
Name: count, dtype: int64

In [40]:
combined_income.income_screen_tract.describe()

count                  25607.00
mean    -6387960443379523584.00
std      4255961049333877248.00
min     -9223372036854775808.00
25%     -9223372036854775808.00
50%     -9223372036854775808.00
75%                        1.00
max                        1.00
Name: income_screen_tract, dtype: float64