In [6]:
import re
from pathlib import Path

import numpy as np
import pandas as pd
import pdfplumber

In [2]:
# Notebook-wide pandas display settings: show up to 100 columns, render floats
# with two decimals, and never truncate rows or cell contents.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [47]:

def extract_tables(pdf_path: str, start_page: int = 6, end_page: int = 14) -> pd.DataFrame:
    """
    Extract the income tables for each county from pages start_page to end_page of the PDF.

    Every table returned by ``page.extract_tables()`` on the selected pages is
    converted to a DataFrame (default integer column labels) and the frames are
    stacked vertically in page order.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF file opened with ``pdfplumber.open``.
    start_page : int, default 6
        First page to read, 1-based and inclusive.
    end_page : int, default 14
        Last page to read, 1-based and inclusive.

    Returns
    -------
    pd.DataFrame
        All rows from all extracted tables, re-indexed from 0.

    Raises
    ------
    ValueError
        If ``start_page`` is below 1, ``end_page`` is before ``start_page``,
        or no table is found on the requested pages.
    IndexError
        If the requested range runs past the last page of the PDF.
    """
    # A start_page below 1 would become a negative index and silently read
    # pages from the END of the document, so reject it up front.
    if start_page < 1 or end_page < start_page:
        raise ValueError(
            f"Invalid page range {start_page}-{end_page}: pages are 1-based "
            "and end_page must be >= start_page."
        )

    with pdfplumber.open(pdf_path) as pdf:
        pages = pdf.pages
        # pdf.pages is 0-indexed, so 1-based page p lives at index p - 1.
        frames = [
            pd.DataFrame(table)
            for page_index in range(start_page - 1, end_page)
            for table in pages[page_index].extract_tables()
        ]

    # pd.concat raises an uninformative ValueError on an empty list; raise the
    # same exception type with a message that points at the actual problem.
    if not frames:
        raise ValueError(
            f"No tables found on pages {start_page}-{end_page} of {pdf_path!r}."
        )

    return pd.concat(frames, ignore_index=True)

In [21]:
# Input for the cells below: the PDF file, given relative to the current
# working directory.
pdf_path = "./hcd_income_limits_2024.pdf"

# Bare string expression: it is not assigned to anything and has no effect
# beyond being echoed as the cell's output. It equals the CSV file name that
# create_low_income_hcd derives from pdf_path.
'hcd_income_limits_2024.csv'

In [54]:
def create_low_income_hcd(pdf_path: str, GCS_PATH: str) -> pd.DataFrame:
    """
    Build the "Low Income" rows from the tables in a PDF and save them as CSV.

    The tables are read with ``extract_tables``, forward-filled, given named
    columns, filtered to rows whose ``category`` equals "Low Income", and
    written to ``f"{GCS_PATH}{csv_name}"`` where ``csv_name`` is ``pdf_path``
    with its extension replaced by ``.csv`` and any leading ``./`` dropped.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF to parse; also determines the name of the CSV.
    GCS_PATH : str
        Destination prefix. It is concatenated directly with the CSV name
        (no separator is inserted), so it should end with "/".

    Returns
    -------
    pd.DataFrame
        The "Low Income" rows with columns ``county``, ``category`` and
        ``one`` .. ``eight``, re-indexed from 0.
    """
    # Use the extractor defined in this notebook; the previous name
    # (extract_income_tables) is not defined anywhere in the visible cells.
    income_tables = extract_tables(pdf_path)

    # Forward-fill every column so that empty cells inherit the value above.
    # NOTE(review): presumably this is for county cells that span several
    # category rows; it would also fill genuinely missing amounts — confirm.
    income_tables = income_tables.ffill()

    # Positional columns 0..9 -> names. "one".."eight" are presumably
    # household sizes — TODO confirm against the PDF layout.
    column_names = [
        "county", "category",
        "one", "two", "three", "four", "five", "six", "seven", "eight",
    ]
    income_tables = income_tables.rename(columns=dict(enumerate(column_names)))

    # Keep only the text before the first "Area" in the county cell.
    income_tables["county"] = (
        income_tables["county"].str.split("Area", n=1).str.get(0).str.strip()
    )

    low_income = income_tables.loc[
        income_tables["category"] == "Low Income"
    ].reset_index(drop=True)

    # Swap only the file extension. The previous str.replace("pdf", "csv")
    # rewrote every "pdf" substring in the path, and replace("./", "") also
    # mangled "../" segments.
    csv_name = Path(pdf_path).with_suffix(".csv").as_posix()
    low_income.to_csv(f"{GCS_PATH}{csv_name}")
    return low_income

In [55]:
# Parse the PDF at pdf_path, keep the "Low Income" rows, and write the CSV
# under the equity_index folder of the analytics bucket.
low_income_df = create_low_income_hcd(
    pdf_path=pdf_path,
    GCS_PATH="gs://calitp-analytics-data/data-analyses/equity_index/",
)

In [41]:
# NOTE(review): df_income is not defined in any cell visible here, so this
# cell presumably relies on leftover kernel state — confirm or remove.
# It applies the same positional-to-name column mapping (0..9) that
# create_low_income_hcd uses.
df_income = df_income.rename(
    columns=dict(
        enumerate(
            [
                "county",
                "category",
                "one",
                "two",
                "three",
                "four",
                "five",
                "six",
                "seven",
                "eight",
            ]
        )
    )
)