**Raw text extraction from PDFs with Google Cloud Document AI**
Used for:
- cancellation notices
- ADs

In [7]:
import os
import json
from google.cloud import documentai
from google.api_core.client_options import ClientOptions

# --- Configuration ---
# These module-level values are read by the functions defined further down.
project_id = "mthesis-450913"  # Your Google Cloud Project ID
location = "us"  # Processor Location (e.g., "us" or "eu")
processor_id = "9aed4672acfd8a7a"  # Your Document AI Processor ID
mime_type = "application/pdf"  # MIME type sent along with every document

# --- Define Input and Output Directories ---
# IMPORTANT: Replace these with the actual paths to your directories
input_directory = r"C:\Users\zdrop\PycharmProjects\BRUMBRUMWEEEE\sample"  # Directory containing your PDF files
output_directory = r"C:\Users\zdrop\PycharmProjects\BRUMBRUMWEEEE\sample\output_raw" # Directory where JSON files will be saved

# Ensure output directory exists, create if it doesn't.
# exist_ok=True keeps makedirs from raising FileExistsError if the directory
# is created by something else between the existence check and this call.
if not os.path.exists(output_directory):
    os.makedirs(output_directory, exist_ok=True)
    print(f"Created output directory: {output_directory}")

def process_document_ocr_to_text(project_id: str, location: str, processor_id: str, file_path: str, mime_type: str) -> str | None:
    """
    Processes a single document using Document AI OCR and returns the extracted text.
    """
    try:
        # You must set the api_endpoint if you use a location other than "us".
        opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
        client = documentai.DocumentProcessorServiceClient(client_options=opts)

        name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"

        with open(file_path, "rb") as f:
            image_content = f.read()

        raw_document = documentai.RawDocument(content=image_content, mime_type=mime_type)
        request = documentai.ProcessRequest(name=name, raw_document=raw_document)

        result = client.process_document(request=request)
        print(f"Successfully processed: {os.path.basename(file_path)}")
        return result.document.text
    except Exception as e:
        print(f"Error processing document {os.path.basename(file_path)}: {e}")
        return None

def main() -> None:
    """
    OCR every PDF in ``input_directory`` and save each result as JSON.

    For each entry of ``input_directory`` whose name ends in ``.pdf``
    (case-insensitive), the file is passed to ``process_document_ocr_to_text``
    and the returned text is written to ``output_directory`` as
    ``<base name>.json`` containing ``{"text": ...}``. Other entries are
    reported and skipped. A summary of successes and failures is printed at
    the end.

    Reads the module-level settings ``input_directory``, ``output_directory``,
    ``project_id``, ``location``, ``processor_id`` and ``mime_type``.
    """
    processed_files = 0
    failed_files = 0

    print(f"Starting to process PDF files from: {input_directory}")
    print(f"Output JSON files will be saved to: {output_directory}")

    # os.listdir returns entries in arbitrary order; sort so that runs (and
    # their logs) are reproducible.
    for filename in sorted(os.listdir(input_directory)):
        if not filename.lower().endswith(".pdf"):
            print(f"Skipping non-PDF file: {filename}")
            continue

        pdf_file_path = os.path.join(input_directory, filename)

        print(f"\nProcessing: {filename}...")
        extracted_text = process_document_ocr_to_text(
            project_id, location, processor_id, pdf_file_path, mime_type
        )

        # Only None signals a failed OCR call. An empty string is a valid
        # result (e.g. a blank page) and must still be saved, not counted as
        # a failure.
        if extracted_text is None:
            print(f"Failed to extract text from: {filename}")
            failed_files += 1
            continue

        # Output file keeps the PDF's base name with a .json extension.
        base_filename = os.path.splitext(filename)[0]
        json_file_path = os.path.join(output_directory, f"{base_filename}.json")

        try:
            with open(json_file_path, "w", encoding="utf-8") as json_file:
                # ensure_ascii=False keeps non-ASCII characters readable
                # in the saved file instead of \u escapes.
                json.dump({"text": extracted_text}, json_file, ensure_ascii=False, indent=4)
        except OSError as e:
            print(f"Error writing JSON file {json_file_path}: {e}")
            failed_files += 1
        else:
            print(f"Successfully saved extracted text to: {json_file_path}")
            processed_files += 1

    print("\n--- Processing Complete ---")
    print(f"Successfully processed and saved: {processed_files} files.")
    print(f"Failed to process or save: {failed_files} files.")
main()

Created output directory: C:\Users\zdrop\PycharmProjects\BRUMBRUMWEEEE\sample\output_raw
Starting to process PDF files from: C:\Users\zdrop\PycharmProjects\BRUMBRUMWEEEE\sample
Output JSON files will be saved to: C:\Users\zdrop\PycharmProjects\BRUMBRUMWEEEE\sample\output_raw

Processing: AD_2006-0069R1_1.pdf...
Successfully processed: AD_2006-0069R1_1.pdf
Successfully saved extracted text to: C:\Users\zdrop\PycharmProjects\BRUMBRUMWEEEE\sample\output_raw\AD_2006-0069R1_1.json

Processing: AD_2007-0036R1_1.pdf...
Successfully processed: AD_2007-0036R1_1.pdf
Successfully saved extracted text to: C:\Users\zdrop\PycharmProjects\BRUMBRUMWEEEE\sample\output_raw\AD_2007-0036R1_1.json

Processing: AD_2014-0270R1_1.pdf...
Successfully processed: AD_2014-0270R1_1.pdf
Successfully saved extracted text to: C:\Users\zdrop\PycharmProjects\BRUMBRUMWEEEE\sample\output_raw\AD_2014-0270R1_1.json

Processing: AD_2023-0205_1.pdf...
Successfully processed: AD_2023-0205_1.pdf
Successfully saved extracted te