In [1]:
import re
import pandas as pd
import pdfplumber

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [15]:

def pdf_to_text(path: str) -> str:
    """
    Read all pages of a PDF and return a single string of text.
    Skips pages where text extraction returns None.
    """
    all_text = []

    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            # Try extracting text; tune tolerances if needed
            text = (page.extract_text())
            all_text.append(text)

    return "".join(all_text)


In [16]:
text = pdf_to_text("./hcd_income_limits_2024.pdf")

In [40]:
text

"STATE OF CALIFORNIA - BUSINESS, CONSUMER SERVICES AND HOUSING AGENCY GAVIN NEWSOM, Governor\nDEPARTMENT OF HOUSING AND COMMUNITY DEVELOPMENT\nDIVISION OF HOUSING POLICY DEVELOPMENT\n2020 W. El Camino Avenue, Suite 500\nSacramento, CA 95833\n(916) 263-2911 / FAX (916) 263-7453\nwww.hcd.ca.gov\nMay 9, 2024\nMEMORANDUM FOR: Interested Parties\nFROM: Megan Kirkeby, Deputy Director\nDivision of Housing Policy Development\nSUBJECT: 2024 State Income Limits\nAttached are briefing materials and 2024 State Income Limits that are now in effect, replacing the\nprevious 2023 State Income Limits. Income limits reflect updated median income and household\nincome levels for acutely low -, extremely low-, very low-, low-, and moderate-income\nhouseholds for California’s 58 counties. The 2024 State Income Limits are on the Department of\nHousing and Community Development (HCD) website at https://www.hcd.ca.gov/grants-and-\nfunding/income-limits/state-and-federal-income-rent-and-loan-value-limits.\nSta

In [35]:
def extract_low_income_by_county(text: str) -> pd.DataFrame:
    """
    Parse a mixed text block to extract the eight 'Low Income' numbers per county.

    Returns a DataFrame with columns:
    county, one, two, three, four, five, six, seven, eight

    Assumptions:
    - County header lines end with "County" (e.g., "Alameda County")
    - The target line starts with "Low Income" (case-insensitive),
      followed by eight numeric values (commas allowed).
    - Ignores other lines (e.g., Area Median Income:, $..., etc.)
    """

    # Prepare storage for extracted records
    records = []

    # Split text into lines and iterate, tracking the current county
    current_county = None
    for raw_line in text.splitlines():
        line = raw_line.strip()

        # Identify county header lines (e.g., "Amador County")
        if line and line.endswith("County"):
            current_county = line
            continue

        # Look for the "Low Income ..." line
        if current_county and re.match(r'(?i)^low\s+income\b', line):
            # Extract all numbers (allowing commas)
            nums = re.findall(r'\d[\d,]*', line)

            # Normalize and convert to integers
            values = [int(n.replace(',', '')) for n in nums]

            # We expect exactly 8 numbers for household sizes 1..8
            if len(values) >= 8:
                values = values[:8]  # take the first eight if extras appear
                record = {
                    'county': current_county,
                    'one':   values[0],
                    'two':   values[1],
                    'three': values[2],
                    'four':  values[3],
                    'five':  values[4],
                    'six':   values[5],
                    'seven': values[6],
                    'eight': values[7],
                }
                records.append(record)
            else:
                # If a county lacks 8 numbers, skip or log; here we skip but you can raise/print a warning
                print(f"Warning: '{current_county}' Low Income line has {len(values)} values: {values}")

    # Build DataFrame
    df = pd.DataFrame.from_records(
        records,
        columns=['county', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight']
    )

    # Optional: enforce data types
    for col in ['one','two','three','four','five','six','seven','eight']:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    return df


In [36]:
county_low_income = extract_low_income_by_county(text)

In [37]:
county_low_income.shape

(56, 9)

In [41]:
def _extract_eight_low_income_values(block: str) -> list[int] | None:
    """
    Given a text BLOCK that begins near the county header, find the 'Low Income' line
    and accumulate numbers across subsequent lines until we have eight values.
    Returns a list of 8 integers, or None if not found.
    """
    # Find 'Low Income' (case-insensitive), capture everything after it
    m = re.search(r'(?is)\blow\s+income\b(.*)', block)
    if not m:
        return None

    tail = m.group(1)
    # Numbers can wrap across lines; start with numbers found right after 'Low Income'
    nums = re.findall(r'\d[\d,]*', tail)

    # If fewer than 8, append numbers from the next few lines as a fallback
    if len(nums) < 8:
        lines = block.splitlines()
        # Find line index containing 'Low Income'
        try:
            li_idx = next(i for i, ln in enumerate(lines) if re.search(r'(?i)\blow\s+income\b', ln))
        except StopIteration:
            li_idx = None

        if li_idx is not None:
            lookahead_text = "\n".join(lines[li_idx + 1 : li_idx + 6])  # examine next ~5 lines
            nums_more = re.findall(r'\d[\d,]*', lookahead_text)
            nums = nums + nums_more

    if len(nums) >= 8:
        # Normalize commas and convert
        values = [int(n.replace(',', '')) for n in nums[:8]]
        return values

    return None


def extract_low_income_SLO_SB(text: str) -> pd.DataFrame:
    """
    Rescan the full text to specifically extract 'Low Income' values for:
      - San Luis Obispo (with or without 'County')
      - San Bernardino   (with or without 'County')

    Returns a DataFrame with columns:
      county, one, two, three, four, five, six, seven, eight

    If a county is not found or values cannot be parsed, the row is included
    with NaNs for the numeric columns.
    """
    targets = [
        ("San Luis Obispo",  re.compile(r'(?mi)^(San\s+Luis\s+Obispo)(?:\s+County)?\s*$')),
        ("San Bernardino",   re.compile(r'(?mi)^(San\s+Bernardino)(?:\s+County)?\s*$')),
    ]

    # To bound each county block, we’ll consider the next "header-like" line.
    # Header-like means a line that ends with 'County' OR matches either target without 'County'.
    header_like = re.compile(
        r'(?mi)^[A-Za-z .&-]+(?:\s+County)?\s*$'
    )

    rows = []
    for canonical_name, county_pat in targets:
        # Find the county header location
        match = county_pat.search(text)
        if not match:
            # County header not found; include row with NaNs
            rows.append({
                "county": f"{canonical_name} County",
                "one": np.nan, "two": np.nan, "three": np.nan, "four": np.nan,
                "five": np.nan, "six": np.nan, "seven": np.nan, "eight": np.nan,
            })
            continue

        start_idx = match.start()

        # Find the end of this county block as the start of the next header-like line after start_idx
        next_header = header_like.search(text, pos=start_idx + 1)
        end_idx = next_header.start() if next_header else len(text)

        block = text[start_idx:end_idx]

        values = _extract_eight_low_income_values(block)
        if values is None:
            # Couldn’t parse; include row with NaNs
            rows.append({
                "county": f"{canonical_name} County",
                "one": np.nan, "two": np.nan, "three": np.nan, "four": np.nan,
                "five": np.nan, "six": np.nan, "seven": np.nan, "eight": np.nan,
            })
        else:
            rows.append({
                "county": f"{canonical_name} County",
                "one": values[0], "two": values[1], "three": values[2], "four": values[3],
                "five": values[4], "six": values[5], "seven": values[6], "eight": values[7],
            })

    df = pd.DataFrame(rows, columns=["county","one","two","three","four","five","six","seven","eight"])
    return df


In [42]:
extract_low_income_SLO_SB(text)

Unnamed: 0,county,one,two,three,four,five,six,seven,eight
0,San Luis Obispo County,,,,,,,,
1,San Bernardino County,,,,,,,,
