# Rule-Based Extraction for Structured Documents

Notes about document
- want to extract Goals, Objectives, Policies

- formatting: 
    - GOAL #(number)
    - OBJECTIVE (ACRONYM)-(number):
    - Policy (ACRONYM)-(num).(num):
    - DevStd (ACRONYM)-(num)(letter)



- acronyms: EGV, LUR-EGV, LUDS-EGV, LUC-EGV, LUA-EGV, SF-EGV, FIRE-EGV, POL-EGV, etc.

In [1]:
# import libraries/tools needed

import re                # used for pattern matching (regex) (like "Goal #1", "Policy EGV-1.1", etc.)
import pdfplumber        # used to read text from PDF files
import csv               # used to save extracted data to a spreadsheet

In [2]:
# Heading patterns: each one recognizes the text that opens a new section.
# They are compiled once here and matched case-insensitively against the
# start of every line read from the PDF.

# goals, e.g. "GOAL #1", "GOAL #2" (the space before "#" is optional)
goal_pattern = re.compile(r"GOAL\s?#\d+", flags=re.IGNORECASE)

# objectives, e.g. "OBJECTIVE EGV-1"
objective_pattern = re.compile(r"OBJECTIVE\s+[A-Z\-]+-\d+", flags=re.IGNORECASE)

# policies, e.g. "Policy EGV-1.1"
policy_pattern = re.compile(r"Policy\s+[A-Z\-]+-\d+\.\d+", flags=re.IGNORECASE)

# development standards, e.g. "DevStd LUDS-EGV-1a"
devstd_pattern = re.compile(r"DevStd\s+[A-Z\-]+-\d+[a-z]", flags=re.IGNORECASE)

In [3]:
# accumulator for every extracted section (Goals, Objectives, Policies, DevStd);
# each entry is a dict with the keys "page", "type", "id", and "text"

extracted: list = []

In [4]:
# read in the pdf and extract goals, objectives, policies, and development standards


def _section_record(page_num, section_type, section_id, text):
    """Build one output row for a finished section.

    Args:
        page_num: 1-based number of the page the section was read from.
        section_type: "Goal", "Objective", "Policy", or "DevStd".
        section_id: heading identifier matched by the section's pattern,
            e.g. "Policy EGV-1.1".
        text: accumulated section text; surrounding whitespace is stripped.

    Returns:
        A dict with the keys "page", "type", "id", and "text".
    """
    return {
        "page": page_num,
        "type": section_type,
        "id": section_id,
        "text": text.strip(),
    }


# label written to the "type" column, paired with the pattern that detects
# the heading of that kind of section
section_patterns = (
    ("Goal", goal_pattern),
    ("Objective", objective_pattern),
    ("Policy", policy_pattern),
    ("DevStd", devstd_pattern),
)

with pdfplumber.open("01 Eastern Goleta Valley Community Plan (PDF) (1).pdf") as pdf:
    for page_num, page in enumerate(pdf.pages, start=1):  # read pdf page by page
        text = page.extract_text()
        if not text:
            continue  # skip pages with no text

        # State of the section currently being collected. It is reset on every
        # page, so a section that runs past the end of a page is saved with
        # only the text found on the page where it started.
        context_type = None  # "Goal", "Objective", "Policy", "DevStd", or None
        current_id = None    # identifier of the open section only
        temp_text = ""       # text accumulated for the open section

        for line in text.split("\n"):
            line = line.strip()

            # --- does this line start a new section? ---
            heading_type = None
            heading_match = None
            for label, pattern in section_patterns:
                heading_match = pattern.match(line)
                if heading_match:
                    heading_type = label
                    break

            if heading_match:
                if temp_text and context_type:
                    # save the previous section before starting a new one
                    extracted.append(
                        _section_record(page_num, context_type, current_id, temp_text)
                    )
                # Keep a single id for the open section. Keeping one id per
                # section type and picking the first non-empty one labelled an
                # objective with the goal that preceded it.
                context_type = heading_type
                current_id = heading_match.group(0)
                temp_text = line
                continue

            if not context_type:
                continue  # text outside any section is ignored

            # --- continue collecting multi-line text ---
            temp_text += " " + line
            if line.endswith("."):
                # heuristic: a line ending with a period closes the section
                extracted.append(
                    _section_record(page_num, context_type, current_id, temp_text)
                )
                # reset for next block
                temp_text = ""
                context_type = None
                current_id = None

        # === save anything left at the end of the page ===
        if temp_text and context_type:
            extracted.append(
                _section_record(page_num, context_type, current_id, temp_text)
            )

In [5]:
# write every extracted section to a CSV file: a header row, then one row per section

csv_columns = ["page", "type", "id", "text"]
with open("egv_plan_extracted.csv", "w", newline="", encoding="utf-8") as out_file:
    csv_writer = csv.DictWriter(out_file, fieldnames=csv_columns)
    csv_writer.writeheader()
    csv_writer.writerows(extracted)

print("Saved to 'egv_plan_extracted.csv'")

Saved to 'egv_plan_extracted.csv'
