In [24]:
import re
import pandas as pd
import pdfplumber

In [9]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [39]:
def extract_low_income_limits(pdf_path: str,
                              start_page: int = 6,
                              end_page: int = 14) -> pd.DataFrame:
    """
    Extract the 'Low Income' row for each county from pages 6–14 of the HCD PDF.

    Returns a DataFrame with 58 rows (one per county) and these columns:
      ['county','one','two','three','four','five','six','seven','eight'].

    Behavior:
      1) Detects county headers of the form:
         '<County Name> County Area Median Income: $<amount>'
      2) For each county block, finds the line that starts with 'Low Income'
         and parses 8 numbers (1–8 persons).
      3) Stores 'county' as '<County Name> County' exactly as printed.

    Notes:
      - Uses layout-preserving text extraction (`layout=True`) so the
        'Low Income' line remains intact.
      - Ignores instructional example tables on page 14 by requiring a
        county header before capturing any 'Low Income' line.
    """

    # Header example: "Yolo County Area Median Income: $117,000"
    COUNTY_HEADER_RE = re.compile(
        r"(?P<county>[A-Za-z][A-Za-z\s'-]+?\sCounty)\s+Area\s+Median\s+Income:\s*\$\s*[\d,]+",
        re.IGNORECASE
    )

    # Exactly the "Low Income" label at the start of a line, followed by 8 numbers
    LOW_LINE_RE = re.compile(
        r"^Low\s+Income\s+"
        r"([\d,]+)\s+([\d,]+)\s+([\d,]+)\s+([\d,]+)\s+"
        r"([\d,]+)\s+([\d,]+)\s+([\d,]+)\s+([\d,]+)\s*$",
        re.IGNORECASE
    )

    records = []

    with pdfplumber.open(pdf_path) as pdf:
        display(pdf)
        # Convert 1-based page numbers to 0-based indices
        for pidx in range(start_page - 1, end_page):
            page = pdf.pages[pidx]
            display(page)
            text = page.extract_text(layout=True) or ""
            # display(text)
            # Keep the line structure, but normalize internal whitespace
            lines = [re.sub(r"\s+", " ", ln).strip()
                     for ln in text.splitlines() if ln.strip()]

            i = 0
            while i < len(lines):
                m_header = COUNTY_HEADER_RE.search(lines[i])
                if not m_header:
                    i += 1
                    continue

                county_name = m_header.group("county").strip()

                display("county names")
                display(county_name)
                # Capture lines in this county's block until the next county header or end of page
                block_lines = []
                j = i + 1
                while j < len(lines) and not COUNTY_HEADER_RE.search(lines[j]):
                    block_lines.append(lines[j])
                    j += 1

                display(block_lines)
                # Find the single line that starts with "Low Income"
                low_vals = None
                for ln in block_lines:
                    m_low = LOW_LINE_RE.match(ln)
                    if m_low:
                        low_vals = [int(x.replace(",", "")) for x in m_low.groups()]
                        break

                if low_vals:
                    records.append({
                        "county": county_name,   # e.g., "Yolo County"
                        "one":   low_vals[0],
                        "two":   low_vals[1],
                        "three": low_vals[2],
                        "four":  low_vals[3],
                        "five":  low_vals[4],
                        "six":   low_vals[5],
                        "seven": low_vals[6],
                        "eight": low_vals[7],
                    })

                # Advance to the next county header (if any)
                i = j

    # Build DataFrame and ensure one row per county
    df = (pd.DataFrame.from_records(records,
                                    columns=["county","one","two","three","four","five","six","seven","eight"])
            .drop_duplicates(subset=["county"])
            .sort_values("county")
            .reset_index(drop=True))

    # Optional guard: expect 58 counties
    if len(df) != 58:
        print(f"Warning: expected 58 counties, found {len(df)}. "
              f"If you see 57, double-check that page bounds include both 13 and 14.")

    return df


In [40]:
extract_low_income_limits(pdf_path = "./hcd_income_limits_2024.pdf",
                         start_page = 6,
                         end_page = 6)

<pdfplumber.pdf.PDF at 0x7c97c8b39490>

<Page:6>



Unnamed: 0,county,one,two,three,four,five,six,seven,eight
