# Document AI Exploration

**Purpose:** General extraction from PDF using Google Document AI Layout Parser. We explore the full API response without assumptions, then tune based on actual output.

**Approach:** Extract EVERYTHING the API returns first, examine it, and only then decide how to process it.


---
## Section 1: Setup
---


### 1.1 Install Dependencies


In [None]:
# Notebook magic (not plain Python): installs the Document AI client library
# and python-dotenv quietly (-q).
# NOTE(review): python-dotenv is not imported by any cell visible in this
# notebook - confirm it is still needed.
%pip install -q google-cloud-documentai python-dotenv
print("Dependencies installed!")


### 1.2 Upload Credentials

Upload your Google Cloud service account JSON file.


In [None]:
from google.colab import files
import json
import os

# Ask the user for the service-account key, persist it locally, and point the
# Google client libraries at it via GOOGLE_APPLICATION_CREDENTIALS.
print("Upload your Google Cloud credentials JSON file...")
uploaded = files.upload()

# Nothing uploaded (e.g. the dialog was dismissed): fail with a clear message
# instead of an IndexError from indexing an empty key list.
if not uploaded:
    raise RuntimeError(
        "No credentials file was uploaded - re-run this cell and select the service account JSON."
    )

# Only the first uploaded file is used.
creds_filename = next(iter(uploaded))
# Round-trip through json so that a non-JSON upload fails here, not later.
credentials_content = json.loads(uploaded[creds_filename].decode('utf-8'))

# Absolute path so the env var stays valid if the working directory changes.
credentials_path = os.path.abspath('docai-credentials.json')
with open(credentials_path, 'w') as f:
    json.dump(credentials_content, f)
# The file holds a private key: restrict it to the current user.
os.chmod(credentials_path, 0o600)

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials_path
print(f"Credentials saved: {creds_filename}")


### 1.3 Configure Project

Update these values with your Google Cloud project and processor IDs.


In [None]:
# EDIT THESE THREE VALUES before running the rest of the notebook.
DOCAI_PROJECT_ID = "your-project-id-here"
DOCAI_PROCESSOR_ID = "your-processor-id-here"
DOCAI_LOCATION = "us"  # region of the processor, e.g. "us" or "eu"

# Mirror the settings into the process environment under the same names.
os.environ.update(
    DOCAI_PROJECT_ID=DOCAI_PROJECT_ID,
    DOCAI_PROCESSOR_ID=DOCAI_PROCESSOR_ID,
    DOCAI_LOCATION=DOCAI_LOCATION,
)

# Echo the configuration so a wrong value is easy to spot.
for label, setting in (
    ("Project", DOCAI_PROJECT_ID),
    ("Processor", DOCAI_PROCESSOR_ID),
    ("Location", DOCAI_LOCATION),
):
    print(f"{label}: {setting}")


### 1.4 Initialize Document AI Client


In [None]:
from google.cloud import documentai_v1 as documentai

# Fully-qualified resource name of the processor configured above.
processor_name = f"projects/{DOCAI_PROJECT_ID}/locations/{DOCAI_LOCATION}/processors/{DOCAI_PROCESSOR_ID}"

# Document AI is served from per-location endpoints. Without an explicit
# api_endpoint the client talks to the default endpoint, which breaks any
# DOCAI_LOCATION other than "us". Deriving it from DOCAI_LOCATION keeps the
# endpoint and the processor name in agreement.
client = documentai.DocumentProcessorServiceClient(
    client_options={"api_endpoint": f"{DOCAI_LOCATION}-documentai.googleapis.com"}
)

print("Client initialized")
print(f"Processor: {processor_name}")


### 1.5 Verify Setup


In [None]:
# Smoke test: fetch the processor's metadata to prove that the credentials,
# project, location and processor ID all line up.
print("Verifying Document AI setup...")

try:
    processor = client.get_processor(name=processor_name)
    print(f"Processor found: {processor.display_name}")
    print(f"Type: {processor.type_}")
    # Turn the state value into its symbolic enum name for display.
    state_name = documentai.Processor.State(processor.state).name
    print(f"State: {state_name}")
    print("Setup verified!")
except Exception as exc:
    # Report instead of raising so the cell output shows the cause directly.
    print(f"Setup failed: {exc}")


### 1.6 Upload PDF


In [None]:
# Ask the user for the PDF to analyse and keep its name and raw bytes.
print("Upload your PDF file...")
uploaded_pdfs = files.upload()

# Nothing uploaded (e.g. the dialog was dismissed): fail with a clear message
# instead of an IndexError from indexing an empty key list.
if not uploaded_pdfs:
    raise RuntimeError("No PDF was uploaded - re-run this cell and select a file.")

# Only the first uploaded file is used.
pdf_filename = next(iter(uploaded_pdfs))
pdf_content = uploaded_pdfs[pdf_filename]

if not pdf_content:
    raise RuntimeError(f"Uploaded file is empty: {pdf_filename}")

# The processing request later declares application/pdf, so warn early when
# the bytes do not carry a PDF header near the start of the file.
if b"%PDF-" not in pdf_content[:1024]:
    print(f"Warning: {pdf_filename} does not look like a PDF (no %PDF- header found)")

print(f"PDF uploaded: {pdf_filename}")
print(f"Size: {len(pdf_content) / 1024:.1f} KB")


---
## Section 2: Process Document
---


### 2.1 Send to Document AI


In [None]:
# Send the uploaded PDF bytes inline to the configured processor and keep the
# returned Document for the exploration cells below.
print(f"Processing: {pdf_filename}")

raw_pdf = documentai.RawDocument(content=pdf_content, mime_type="application/pdf")
request = documentai.ProcessRequest(
    name=processor_name,
    raw_document=raw_pdf,
    skip_human_review=True,
)

result = client.process_document(request=request)
document = result.document

print("Document processed!")
print(f"Document type: {type(document)}")


---
## Section 3: Explore Raw Response
---


### 3.1 Document Top-Level Attributes

See what fields are available on the document object.


In [None]:
# List every public attribute of the returned document and summarise each one
# as a method, a populated value (with its length when it has one), or empty.
print("=" * 60)
print("DOCUMENT TOP-LEVEL ATTRIBUTES")
print("=" * 60)

# List all non-private attributes
attrs = [a for a in dir(document) if not a.startswith('_')]
print(f"\nTotal attributes: {len(attrs)}\n")

for attr in attrs:
    try:
        value = getattr(document, attr)
        if callable(value):
            print(f"  {attr}() - method")
        elif value:
            val_type = type(value).__name__
            if hasattr(value, '__len__'):
                print(f"  {attr}: {val_type} (len={len(value)})")
            else:
                print(f"  {attr}: {val_type}")
        else:
            print(f"  {attr}: (empty/None)")
    except Exception as exc:
        # Catch Exception rather than everything so Ctrl-C / kernel interrupt
        # still stops the cell, and show what failed instead of hiding it.
        print(f"  {attr}: (error reading: {type(exc).__name__}: {exc})")


### 3.2 Check document.text (Full OCR Text)


In [None]:
# Preview the document-level text: total length plus the first 2000 characters.
rule = "=" * 60
print(rule)
print("DOCUMENT.TEXT (Full OCR)")
print(rule)

ocr_text = document.text
if not ocr_text:
    print("document.text is empty")
else:
    preview_limit = 2000
    print(f"\nTotal characters: {len(ocr_text)}")
    print("\nFirst 2000 characters:")
    print("-" * 40)
    print(ocr_text[:preview_limit])
    if len(ocr_text) > preview_limit:
        print("\n... (truncated)")


### 3.3 Check document.pages


In [None]:
# Per-page overview: page number, dimensions and how many layout elements of
# each kind the page carries. Only the first three pages are shown.
rule = "=" * 60
print(rule)
print("DOCUMENT.PAGES")
print(rule)

if not document.pages:
    print("document.pages is empty")
else:
    print(f"\nTotal pages: {len(document.pages)}")

    for page_no, page in enumerate(document.pages[:3], start=1):
        print(f"\n--- Page {page_no} ---")
        print(f"  page_number: {page.page_number}")

        if page.dimension:
            print(f"  dimension: {page.dimension.width}x{page.dimension.height}")
        else:
            print("  dimension: None")

        # Same "count or 0" report for each repeated element on the page.
        for field in ("paragraphs", "lines", "tokens", "tables", "blocks"):
            items = getattr(page, field)
            print(f"  {field}: {len(items) if items else 0}")


### 3.4 Check document.document_layout (Layout Parser specific)


In [None]:
# Summarise document.document_layout: total number of top-level blocks and a
# histogram of block kinds. Every block is counted, so the histogram adds up
# to the total.
print("=" * 60)
print("DOCUMENT.DOCUMENT_LAYOUT")
print("=" * 60)

doc_layout = document.document_layout

if doc_layout:
    print(f"\nType: {type(doc_layout)}")

    # Check blocks
    if doc_layout.blocks:
        print(f"Total blocks: {len(doc_layout.blocks)}")

        # Show block type distribution
        block_types = {}
        for block in doc_layout.blocks:
            if block.text_block:
                btype = str(block.text_block.type_) if block.text_block.type_ else "unknown"
            # Blocks without a text_block used to be skipped silently; label
            # them so tables and lists show up in the exploration output.
            # getattr keeps this safe if the installed library lacks a field.
            elif getattr(block, "table_block", None):
                btype = "(table_block)"
            elif getattr(block, "list_block", None):
                btype = "(list_block)"
            else:
                btype = "(empty/other)"
            block_types[btype] = block_types.get(btype, 0) + 1

        print(f"\nBlock type distribution:")
        for btype, count in sorted(block_types.items()):
            print(f"  {btype}: {count}")
    else:
        print("No blocks in document_layout")
else:
    print("document.document_layout is empty")


### 3.5 Sample Blocks from document_layout


In [None]:
# Print the id, type, a 200-character text preview, the page span and the
# nested-block count for the first ten layout blocks.
rule = "=" * 60
print(rule)
print("SAMPLE BLOCKS (First 10)")
print(rule)

if not (doc_layout and doc_layout.blocks):
    print("No blocks to show")
else:
    for idx, block in enumerate(doc_layout.blocks[:10], start=1):
        print(f"\n--- Block {idx} ---")
        print(f"  block_id: {block.block_id}")

        tb = block.text_block
        if tb:
            print(f"  type: {tb.type_}")
            text = tb.text or ""
            suffix = '...' if len(text) > 200 else ''
            print(f"  text ({len(text)} chars): {text[:200]}{suffix}")

        span = block.page_span
        if span:
            print(f"  pages: {span.page_start} - {span.page_end}")

        # Child blocks, when the text block has any.
        nested = getattr(tb, 'blocks', None) if tb else None
        if nested:
            print(f"  nested_blocks: {len(nested)}")


### 3.6 Explore Page Paragraphs (Alternative to Layout Blocks)


In [None]:
# Rebuild paragraph text from document.text using each paragraph's text-anchor
# segments; previews five paragraphs on each of the first two pages.
rule = "=" * 60
print(rule)
print("PAGE PARAGRAPHS (from document.pages)")
print(rule)

if not (document.pages and document.text):
    print("No pages or text available")
else:
    full_text = document.text

    for page_no, page in enumerate(document.pages[:2], start=1):
        print(f"\n--- Page {page_no} ---")

        if not page.paragraphs:
            print("No paragraphs on this page")
            continue

        print(f"Paragraphs: {len(page.paragraphs)}")

        for para_no, para in enumerate(page.paragraphs[:5], start=1):
            # Walk layout -> text_anchor -> text_segments, stopping at the
            # first missing level; a paragraph without segments yields "".
            anchor = para.layout.text_anchor if para.layout else None
            segments = anchor.text_segments if anchor else None
            para_text = "".join(
                full_text[int(seg.start_index or 0):int(seg.end_index or 0)]
                for seg in segments or ()
            )

            suffix = '...' if len(para_text) > 150 else ''
            print(f"\n  Paragraph {para_no}: {para_text[:150]}{suffix}")


### 3.7 Check for Tables


In [None]:
# Count the tables on every page and show header/body row counts for each.
rule = "=" * 60
print(rule)
print("TABLES")
print(rule)

total_tables = 0

for page_no, page in enumerate(document.pages or (), start=1):
    if not page.tables:
        continue

    table_count = len(page.tables)
    print(f"\nPage {page_no}: {table_count} table(s)")
    total_tables += table_count

    for table_no, table in enumerate(page.tables, start=1):
        print(f"  Table {table_no}:")
        # Report each row collection only when the table object exposes it.
        for field in ("header_rows", "body_rows"):
            if hasattr(table, field):
                rows = getattr(table, field)
                print(f"    {field}: {len(rows) if rows else 0}")

print(f"\nTotal tables found: {total_tables}")


---
## Section 4: Save Raw Output
---


### 4.1 Build Complete Output


In [None]:
def _layout_block_to_dict(block):
    """Convert one document_layout block into a JSON-serialisable dict.

    Keeps the flat keys (block_id, type, text, page_start, page_end) and adds
    a "blocks" list when the block's text_block carries nested child blocks,
    so nested content is no longer dropped from the saved output.
    """
    text_block = block.text_block
    page_span = block.page_span
    block_data = {
        "block_id": block.block_id,
        "type": str(text_block.type_) if text_block and text_block.type_ else None,
        "text": text_block.text if text_block else None,
        "page_start": page_span.page_start if page_span else None,
        "page_end": page_span.page_end if page_span else None,
    }
    # NOTE(review): blocks without a text_block keep type/text = None; their
    # own payload is still not serialised here.
    children = getattr(text_block, "blocks", None) if text_block else None
    if children:
        block_data["blocks"] = [_layout_block_to_dict(child) for child in children]
    return block_data


def _anchor_text(layout, full_text):
    """Return the text that layout.text_anchor points at in full_text.

    Returns "" when the layout, its anchor or its segments are missing.
    """
    if not (layout and layout.text_anchor and layout.text_anchor.text_segments):
        return ""
    # A missing/zero index is treated as 0, matching the preview cell.
    return "".join(
        full_text[int(seg.start_index or 0):int(seg.end_index or 0)]
        for seg in layout.text_anchor.text_segments
    )


# Everything gathered for later inspection, in one JSON-friendly dict.
output = {
    "pdf_file": pdf_filename,
    "total_pages": len(document.pages) if document.pages else 0,
    "full_text_length": len(document.text) if document.text else 0,
    "full_text": document.text if document.text else "",

    # Layout blocks
    "layout_blocks": [],

    # Page-level data
    "pages": []
}

# Extract layout blocks (recursing into nested blocks)
if doc_layout and doc_layout.blocks:
    output["layout_blocks"] = [_layout_block_to_dict(block) for block in doc_layout.blocks]

# Extract page-level paragraph data
if document.pages and document.text:
    full_text = document.text

    for page in document.pages:
        output["pages"].append({
            "page_number": page.page_number,
            "paragraphs": [
                _anchor_text(para.layout, full_text) for para in (page.paragraphs or ())
            ],
            "tables": len(page.tables) if page.tables else 0
        })

print(f"Output built:")
print(f"  Layout blocks: {len(output['layout_blocks'])}")
print(f"  Pages: {len(output['pages'])}")
print(f"  Full text: {output['full_text_length']} chars")


### 4.2 Save to JSON


In [None]:
import json

# Persist the collected exploration data as pretty-printed UTF-8 JSON.
output_path = "exploration_output.json"

# ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
with open(output_path, mode='w', encoding='utf-8') as json_file:
    json.dump(output, json_file, ensure_ascii=False, indent=2)

print(f"Saved to: {output_path}")


### 4.3 Download Results


In [None]:
# Ask Colab's file helper to download the JSON file written to output_path.
# The message below is printed as soon as the call returns.
files.download(output_path)
print("Download complete!")


---
## Section 5: Quick Summary
---


In [None]:
# One-screen recap of what the processor returned for the uploaded PDF.
rule = "=" * 60
print(rule)
print("EXTRACTION SUMMARY")
print(rule)

pages = document.pages

print(f"\nPDF: {pdf_filename}")
print(f"Pages: {len(pages) if pages else 0}")
print(f"Full text: {len(document.text) if document.text else 0} characters")
print(f"Layout blocks: {len(doc_layout.blocks) if doc_layout and doc_layout.blocks else 0}")

if pages:
    # Totals across all pages; pages without the element contribute nothing.
    total_paragraphs = sum(len(p.paragraphs) for p in pages if p.paragraphs)
    total_tables = sum(len(p.tables) for p in pages if p.tables)
    print(f"Total paragraphs (from pages): {total_paragraphs}")
    print(f"Total tables: {total_tables}")

print(f"\n{rule}")
print("Next: Review exploration_output.json and decide what to extract")
print(rule)
