In [1]:
import os
import pandas as pd
import pdfplumber

# -----------------------------
# Config
# -----------------------------
PDF_FILE = "Budget Speech 2025-rp.pdf"   # Change if file name differs
OUTPUT_FOLDER = "data_raw/tables"        # Receives one CSV per extracted table
INDEX_FILE = "data_raw/budget_tables_index.csv"  # CSV mapping table file -> page

# -----------------------------
# Create output folders
# -----------------------------
# Create the index file's parent directory explicitly instead of relying on it
# being an ancestor of OUTPUT_FOLDER, so either path can be changed on its own.
for folder in (OUTPUT_FOLDER, os.path.dirname(INDEX_FILE)):
    if folder:  # dirname is "" when the path has no directory component
        os.makedirs(folder, exist_ok=True)

# -----------------------------
# Extract tables
# -----------------------------
# Walk the PDF page by page (page numbers start at 1) and turn every table
# returned by pdfplumber into a DataFrame. Each frame gets a "page" column so
# the originating page number travels with the table's rows.
tables = []
with pdfplumber.open(PDF_FILE) as pdf:
    for page_num, page in enumerate(pdf.pages, start=1):
        for table in page.extract_tables():
            df = pd.DataFrame(table)
            df["page"] = page_num
            tables.append(df)

print(f"✅ Extracted {len(tables)} tables from {PDF_FILE}")

# -----------------------------
# Save each table as separate CSV
# -----------------------------
# Files are numbered from 1 in extraction order. For every CSV written, an
# index record pairs the file name with the page number read from the first
# row of the frame's "page" column.
index_records = []
for i, df in enumerate(tables, start=1):
    file_name = f"table_{i}.csv"
    csv_path = os.path.join(OUTPUT_FOLDER, file_name)
    df.to_csv(csv_path, index=False)
    index_records.append({"table": file_name, "page": df["page"].iloc[0]})
    print(f"💾 Saved {csv_path}")

# -----------------------------
# Save index file
# -----------------------------
# Pin the column list so the index CSV still gets its "table,page" header
# when no tables were extracted and index_records is empty.
index_df = pd.DataFrame(index_records, columns=["table", "page"])
index_df.to_csv(INDEX_FILE, index=False)
print(f"\n📑 Index of tables saved at: {INDEX_FILE}")


✅ Extracted 116 tables from Budget Speech 2025-rp.pdf
💾 Saved data_raw/tables\table_1.csv
💾 Saved data_raw/tables\table_2.csv
💾 Saved data_raw/tables\table_3.csv
💾 Saved data_raw/tables\table_4.csv
💾 Saved data_raw/tables\table_5.csv
💾 Saved data_raw/tables\table_6.csv
💾 Saved data_raw/tables\table_7.csv
💾 Saved data_raw/tables\table_8.csv
💾 Saved data_raw/tables\table_9.csv
💾 Saved data_raw/tables\table_10.csv
💾 Saved data_raw/tables\table_11.csv
💾 Saved data_raw/tables\table_12.csv
💾 Saved data_raw/tables\table_13.csv
💾 Saved data_raw/tables\table_14.csv
💾 Saved data_raw/tables\table_15.csv
💾 Saved data_raw/tables\table_16.csv
💾 Saved data_raw/tables\table_17.csv
💾 Saved data_raw/tables\table_18.csv
💾 Saved data_raw/tables\table_19.csv
💾 Saved data_raw/tables\table_20.csv
💾 Saved data_raw/tables\table_21.csv
💾 Saved data_raw/tables\table_22.csv
💾 Saved data_raw/tables\table_23.csv
ğŸ’¾ Saved data_raw/tables\table_