In [None]:
!sudo apt-get update
!sudo apt-get install -y poppler-utils tesseract-ocr
!tesseract --version # Verify tesseract install
!pip install -U "unstructured[ocr, local-inference]" pymupdf pytesseract pdfminer.six pi-heif

Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,607 kB]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease [24.3 kB]
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:11 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:12 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [8,863 kB]
Get:13 https://ppa.launchpadcontent.net/

In [None]:
# Mount Google Drive at /content/drive so that files stored on Drive
# (such as the input PDF referenced below) are readable from this runtime.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Path of the input PDF on the mounted Drive; the processing cell reads it as PDF_PATH.
pdf_path = "/content/drive/MyDrive/RCA_ABSgroup.pdf"

In [None]:
# NOTE(review): this cell repeats the `apt-get update` and the poppler-utils
# install already performed by the first setup cell; it looks redundant —
# confirm nothing relies on it before removing.
!sudo apt-get update
!sudo apt-get install -y poppler-utils

0% [Working]            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building depe

In [None]:
import os
import json
import re
import fitz  # Optional: For validation or page count
from unstructured.partition.pdf import partition_pdf

# --- Configuration ---
# Input document (path assigned in an earlier cell) and output locations.
PDF_PATH = pdf_path
OUTPUT_BASE_DIR = "output_grouped_text_chunk"
OUTPUT_JSON_DIR = os.path.join(OUTPUT_BASE_DIR, "data")

# --- Define Start and End Markers ---
# Heading that opens a new block; compared for equality with the cleaned
# element text.
START_MARKER_HEADING = "Typical Issues"
# Text that closes the block being collected; matched as a substring of the
# cleaned element text, so it must agree with what OCR actually produces.
END_MARKER_TEXT = "Primary Difficulty SOURCE"
# Headings that appear inside a block but must not end up in its text_chunk.
# The start heading is listed as well so it is never collected as content.
INTERMEDIATE_HEADINGS_TO_SKIP = [
    START_MARKER_HEADING,
    "Typical Recommendations",
    "Examples",
]

# --- Helper Functions ---

def clean_text_for_final_chunk(text):
    """Flatten a block of text into one space-separated string.

    The input is split on newlines. Lines that contain nothing but digits
    (optionally surrounded by whitespace) are dropped as page numbers, runs of
    whitespace inside the remaining lines collapse to a single space, and
    lines that end up empty are discarded.

    Args:
        text: Raw text to clean; any falsy value (``None``, ``""``) is accepted.

    Returns:
        The surviving lines joined by single spaces, or ``""`` when nothing
        survives or the input is falsy.
    """
    if not text:
        return ""

    def _surviving_lines():
        for raw_line in text.split('\n'):
            # A line made up only of digits is treated as a page number.
            if re.fullmatch(r'\s*\d+\s*', raw_line.strip()) is not None:
                continue
            squeezed = re.sub(r'\s+', ' ', raw_line).strip()
            if squeezed:
                yield squeezed

    return " ".join(_surviving_lines())

def get_base_filename(filepath):
    """Return the last path component of ``filepath`` without its final extension."""
    leaf_name = os.path.basename(filepath)
    stem, _extension = os.path.splitext(leaf_name)
    return stem

# --- Main Processing Logic ---

def _finalize_topic(topic_objects, content_lines, start_page):
    """Append one grouped-topic record to ``topic_objects`` if text was collected.

    Args:
        topic_objects: List receiving the finished record (mutated in place).
        content_lines: Cleaned element texts accumulated for the current topic.
        start_page: Page number stored when the topic's START marker was seen.

    Returns:
        True when a record was appended, False when no text had been collected.
    """
    text_chunk = " ".join(content_lines)
    if not text_chunk:
        return False
    topic_objects.append({
        "source_document": os.path.basename(PDF_PATH),
        "start_page": start_page,
        "text_chunk": text_chunk,
    })
    return True


# Element categories that are dropped before any marker matching happens.
_NOISE_CATEGORIES = ("Header", "Footer", "PageNumber", "Image", "Figure", "FigureCaption")

print("--- Starting Grouped Text Chunk Processing ---")
print(f"PDF: {PDF_PATH}")
print(f"Output Base Directory: {OUTPUT_BASE_DIR}")

if not os.path.exists(PDF_PATH):
    # Raise instead of calling exit(): in a notebook, exit() asks the kernel to
    # shut down (losing all in-memory state) rather than simply aborting this
    # cell with a clear error.
    raise FileNotFoundError(f"PDF File not found at '{PDF_PATH}'")
print(f"Found PDF file. Size: {os.path.getsize(PDF_PATH)} bytes.")

os.makedirs(OUTPUT_JSON_DIR, exist_ok=True)
filename_base = get_base_filename(PDF_PATH)

print("\n--- Starting Text Partitioning (using OCR) ---")
final_topic_objects = []  # Final list of topic objects
current_topic_content_lines = []  # Cleaned texts collected for the current topic
current_topic_start_page = None
processing_active_topic = False  # True while between a START and an END marker
end_marker_hits = 0  # Number of elements that contained END_MARKER_TEXT

try:
    elements = partition_pdf(filename=PDF_PATH, strategy="ocr_only")
    print(f"Text partitioning complete (using OCR). Found {len(elements)} elements.")
    if not elements:
        print("WARNING: No elements were extracted via OCR.")

    print("\n--- Processing Elements and Grouping Text Chunks ---")

    for element in elements:
        element_category = element.category
        metadata = element.metadata.to_dict()
        page_number = metadata.get('page_number', None)
        element_original_text = element.text

        # Skip common noise elements explicitly
        if element_category in _NOISE_CATEGORIES:
            continue

        # Clean the text once; the cleaned form is used for matching and output.
        cleaned_element_text = clean_text_for_final_chunk(element_original_text)
        if not cleaned_element_text:
            continue

        # --- START marker: close any open topic, then open a new one ---
        if cleaned_element_text == START_MARKER_HEADING:
            if processing_active_topic and _finalize_topic(
                final_topic_objects, current_topic_content_lines, current_topic_start_page
            ):
                print(f"DEBUG: Finalized topic starting on page {current_topic_start_page}")

            print(f"DEBUG: Found START marker '{START_MARKER_HEADING}' on page {page_number}. Starting new topic.")
            current_topic_content_lines = []
            current_topic_start_page = page_number
            processing_active_topic = True
            # The heading itself is not part of the content.
            continue

        # --- END marker: substring match, tolerant of surrounding text ---
        if END_MARKER_TEXT in cleaned_element_text:
            end_marker_hits += 1
            if processing_active_topic:
                print(f"DEBUG: Found END marker containing '{END_MARKER_TEXT}' on page {page_number}. Finalizing current topic.")
                _finalize_topic(
                    final_topic_objects, current_topic_content_lines, current_topic_start_page
                )
                # Stop collecting until the next START marker.
                processing_active_topic = False
                current_topic_content_lines = []
                current_topic_start_page = None
            # The end marker text itself is never collected.
            continue

        if processing_active_topic:
            # --- Intermediate headings are dropped from the text chunk ---
            if cleaned_element_text in INTERMEDIATE_HEADINGS_TO_SKIP:
                print(f"DEBUG: Skipping intermediate heading '{cleaned_element_text}' on page {page_number}")
                continue
            # --- Everything else inside an active topic is content ---
            current_topic_content_lines.append(cleaned_element_text)

    # --- Final Step: Add the last topic if file ended mid-topic ---
    if processing_active_topic and current_topic_content_lines:
        print(f"DEBUG: Reached end of PDF while processing topic starting on page {current_topic_start_page}. Finalizing.")
        _finalize_topic(
            final_topic_objects, current_topic_content_lines, current_topic_start_page
        )

    print("\nProcessing complete.")
    print(f"Generated {len(final_topic_objects)} grouped text chunk objects.")
    if final_topic_objects and end_marker_hits == 0:
        # Without this, a non-matching END marker fails silently: every chunk
        # simply runs on until the next START heading or the end of the PDF.
        print(f"WARNING: END marker '{END_MARKER_TEXT}' never matched any element; "
              "chunks extend to the next START marker. Check the marker text against the OCR output.")

    # --- Save the Final Structured Data ---
    output_filename = os.path.join(OUTPUT_JSON_DIR, f"{filename_base}_grouped_text_chunks.json")
    # Write as UTF-8 and keep non-ASCII characters readable in the file.
    with open(output_filename, 'w', encoding='utf-8') as f:
        json.dump(final_topic_objects, f, indent=2, ensure_ascii=False)

    print("\n--- Output ---")
    print(f"Grouped text chunk data saved to: {output_filename}")
    print("\n--- Note ---")
    print("Each object in the JSON groups text from 'Typical Issues' until 'Primary Difficulty SOURCE'.")
    print("Includes only 'source_document', 'start_page', and 'text_chunk' keys.")
    print("This format is for specific structured viewing/processing.")

except ImportError as e:
     print(f"\nERROR: Import failed. Ensure all libraries and system dependencies are installed. {e}")
except Exception as e:
    # Report the failure with a full traceback but keep the notebook running.
    print(f"\nAn error occurred during processing: {e}")
    import traceback
    traceback.print_exc()

--- Starting Grouped Text Chunk Processing ---
PDF: /content/drive/MyDrive/RCA_ABSgroup.pdf
Output Base Directory: output_grouped_text_chunk
Found PDF file. Size: 66191092 bytes.

--- Starting Text Partitioning (using OCR) ---
Text partitioning complete (using OCR). Found 4267 elements.

--- Processing Elements and Grouping Text Chunks ---
DEBUG: Found START marker 'Typical Issues' on page 1. Starting new topic.
DEBUG: Finalized topic starting on page 1
DEBUG: Found START marker 'Typical Issues' on page 2. Starting new topic.
DEBUG: Skipping intermediate heading 'Examples' on page 2
DEBUG: Finalized topic starting on page 2
DEBUG: Found START marker 'Typical Issues' on page 4. Starting new topic.
DEBUG: Skipping intermediate heading 'Examples' on page 4
DEBUG: Finalized topic starting on page 4
DEBUG: Found START marker 'Typical Issues' on page 5. Starting new topic.
DEBUG: Skipping intermediate heading 'Examples' on page 5
DEBUG: Finalized topic starting on page 5
DEBUG: Found START m