# Test 6: Universal Document Parser

This notebook tests the `UniversalParser` class, which implements a context-aware hierarchical extraction strategy.

**Goals:**
1. Extract document structure as a hierarchical JSON tree.
2. Preserve table structures as data grids.
3. Extract and save images/charts as separate files.

## 1. Setup

In [None]:
# Install the packages this notebook needs into the kernel's environment (-q quiets pip).
# NOTE(review): versions are unpinned, so behavior may drift between runs — consider pinning.
%pip install -q google-cloud-documentai python-dotenv pdf2image Pillow
print("Dependencies installed.")

In [None]:
# Clone repository if running in Colab to get utils
import os
if not os.path.exists("utils"):
    !git clone https://github.com/abhii-01/docai-extraction-test.git temp_repo
    !mv temp_repo/* .
    !rm -rf temp_repo
    print("Repository cloned.")
else:
    print("Utils already present.")

In [None]:
# Setup Credentials
# Make sure a credentials JSON file exists on disk (uploading it through the
# Colab file picker if needed) and expose its path through the
# GOOGLE_APPLICATION_CREDENTIALS environment variable.
import json
import os

from google.colab import files

# Single source of truth for the credentials location used below.
CREDENTIALS_PATH = "docai-credentials.json"

if not os.path.exists(CREDENTIALS_PATH):
    print("Upload your Google Cloud credentials JSON file...")
    uploaded = files.upload()
    # An empty result (e.g. a cancelled dialog) would otherwise surface as a
    # bare IndexError; fail with an actionable message instead.
    if not uploaded:
        raise RuntimeError("No credentials file was uploaded.")
    # Only the first uploaded file is used.
    creds_filename = next(iter(uploaded))
    with open(CREDENTIALS_PATH, "wb") as f:
        f.write(uploaded[creds_filename])
    print("Credentials saved.")
else:
    print("Credentials found.")

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = CREDENTIALS_PATH

In [None]:
# Configuration
# Document AI settings; edit the two values flagged below for your project.
DOCAI_PROJECT_ID = "vudr0311" # UPDATE THIS
DOCAI_PROCESSOR_ID = "91f4e596a0b1c39d" # UPDATE THIS
DOCAI_LOCATION = "us"

# Export all three settings as environment variables in one step.
os.environ.update(
    DOCAI_PROJECT_ID=DOCAI_PROJECT_ID,
    DOCAI_PROCESSOR_ID=DOCAI_PROCESSOR_ID,
    DOCAI_LOCATION=DOCAI_LOCATION,
)

## 2. Initialize Parser

In [None]:
# Create the Document AI client and wrap it in a UniversalParser configured
# with output_dir="universal_output".
# NOTE(review): get_client_from_env presumably reads the DOCAI_* environment
# variables set in the configuration cell — confirm in utils/docai_client.
from utils.docai_client import get_client_from_env
from utils.universal_parser import UniversalParser

try:
    client = get_client_from_env()
    parser = UniversalParser(client, output_dir="universal_output")
except Exception as e:
    print(f"Error initializing: {e}")
    # Re-raise: swallowing the error would leave `client` / `parser` undefined
    # and every later cell would fail with an unrelated NameError.
    raise
else:
    print("UniversalParser initialized successfully!")

## 3. Upload and Parse PDF

In [None]:
# Ask for a PDF through the Colab file picker and remember its filename.
print("Upload a PDF file to test (preferably one with headings, tables, and images)...")
uploaded = files.upload()
# An empty result (e.g. a cancelled dialog) would otherwise surface as a bare
# IndexError; fail with a clear message instead.
if not uploaded:
    raise RuntimeError("No PDF file was uploaded.")
# Only the first uploaded file is used by the cells below.
pdf_filename = next(iter(uploaded))

## 3.1 DEBUG: Inspect Raw Document AI Response

The following cells inspect the raw Document AI response to find where image/visual element data is stored.


In [None]:
# DEBUG: Get raw Document AI response for inspection
# The response is kept in `raw_doc`, which the debug cells below read.
# NOTE(review): parser.parse(pdf_filename) is called again further down; if it
# processes the file itself this is a second request for the same PDF — confirm.
raw_doc = client.process_document(pdf_filename)
print("Raw document captured for debug inspection.")


In [None]:
# DEBUG: Print every public (non-underscore) attribute name of the Document object
print("=== TOP-LEVEL DOCUMENT ATTRIBUTES ===")
public_names = [name for name in dir(raw_doc) if not name.startswith('_')]
for name in public_names:
    print(f"  - {name}")


In [None]:
# DEBUG: Report the visual_elements found on each page of raw_doc
print("=== VISUAL ELEMENTS CHECK ===")
doc_pages = getattr(raw_doc, 'pages', None)
if not doc_pages:
    print("No pages found in document.")
else:
    for page_no, page in enumerate(doc_pages, start=1):
        print(f"\nPage {page_no}:")
        elements = getattr(page, 'visual_elements', None)
        if not elements:
            print("  visual_elements: EMPTY or NOT FOUND")
            continue
        print(f"  Found {len(elements)} visual_elements!")
        # Only the first five elements per page are listed.
        for idx, element in enumerate(elements[:5]):
            print(f"    [{idx}] type: {getattr(element, 'type_', 'N/A')}")
            element_layout = getattr(element, 'layout', None)
            if element_layout and hasattr(element_layout, 'bounding_poly'):
                print("        bbox: YES")


In [None]:
# DEBUG: Check doc.pages[i].blocks
# For each page, report how many blocks exist and, for the first five,
# whether each has a layout bounding box and any text segments.
print("=== PAGE-LEVEL BLOCKS CHECK ===")
pages = getattr(raw_doc, 'pages', None)
if pages:
    for i, page in enumerate(pages):
        print(f"\nPage {i+1}:")
        page_blocks = getattr(page, 'blocks', None)
        if page_blocks:
            print(f"  Found {len(page_blocks)} blocks")
            # Check for image-like blocks (have bbox but no text)
            for j, block in enumerate(page_blocks[:5]):
                layout = getattr(block, 'layout', None)
                text_anchor = getattr(layout, 'text_anchor', None) if layout else None
                # Coerce to bool so the report prints True/False instead of
                # dumping the layout / text_segments objects (or None).
                has_bbox = bool(layout) and hasattr(layout, 'bounding_poly')
                has_text = bool(text_anchor and getattr(text_anchor, 'text_segments', None))
                print(f"    [{j}] bbox: {has_bbox}, text: {has_text}")
        else:
            print("  blocks: EMPTY or NOT FOUND")
else:
    # Say so explicitly rather than printing only the header.
    print("No pages found in document.")


In [None]:
# DEBUG: Report whether each page of raw_doc carries an embedded page image
print("=== PAGE IMAGE CHECK ===")
for page_no, page in enumerate(getattr(raw_doc, 'pages', None) or [], start=1):
    print(f"\nPage {page_no}:")
    page_image = getattr(page, 'image', None)
    if not page_image:
        print("  image: NOT FOUND")
        continue
    content_size = len(getattr(page_image, 'content', b''))
    print(f"  image found! content_length: {content_size} bytes")
    print(f"  mime_type: {getattr(page_image, 'mime_type', 'N/A')}")


In [None]:
# DEBUG: Check doc.document_layout.blocks for image types
# Counts the top-level layout blocks per type label and prints the distribution.
print("=== DOCUMENT_LAYOUT.BLOCKS TYPE ANALYSIS ===")
document_layout = getattr(raw_doc, 'document_layout', None)
if document_layout:
    blocks = getattr(document_layout, 'blocks', None) or []
    print(f"Total blocks: {len(blocks)}")

    type_counts = {}
    for block in blocks:
        # Truthiness, not `is not None`: an unset sub-message may be returned
        # as an empty (falsy) object rather than None, which would otherwise
        # label every block IMAGE_BLOCK.
        # NOTE(review): confirm against the type client.process_document returns.
        if getattr(block, 'image_block', None):
            block_type = "IMAGE_BLOCK"
        else:
            text_block = getattr(block, 'text_block', None)
            block_type = str(getattr(text_block, 'type_', 'unknown')) if text_block else 'unknown'

        type_counts[block_type] = type_counts.get(block_type, 0) + 1

    print("Block type distribution:")
    for t, count in sorted(type_counts.items()):
        print(f"  {t}: {count}")
else:
    print("document_layout: NOT FOUND")


In [None]:
# ============================================================
# DEBUG CELL: COMPREHENSIVE IMAGE DETECTION DIAGNOSIS
# ============================================================
# Run this AFTER the cell that assigns raw_doc (raw_doc = client.process_document(...)).
# This will exhaustively check ALL possible places images could hide

print("="*70)
print("COMPREHENSIVE IMAGE DETECTION DIAGNOSIS")
print("="*70)

# Resolve the document parts once; sections 1 and 2 both report on them.
doc_pages = getattr(raw_doc, 'pages', None) or []
doc_layout = getattr(raw_doc, 'document_layout', None)
root_blocks = (getattr(doc_layout, 'blocks', None) or []) if doc_layout else []

# ----------------- SECTION 1: Basic Document Stats -----------------
print("\n📊 SECTION 1: BASIC DOCUMENT STATS")
print("-"*50)
print(f"  doc.text length: {len(getattr(raw_doc, 'text', '') or '')}")
print(f"  doc.pages count: {len(doc_pages)}")
print(f"  doc.document_layout exists: {doc_layout is not None}")

if doc_layout:
    print(f"  document_layout.blocks count: {len(root_blocks)}")

# ----------------- SECTION 2: All Block Attributes -----------------
print("\n🔍 SECTION 2: SAMPLE BLOCK ATTRIBUTES (First 3 blocks)")
print("-"*50)

if doc_layout:
    for i, block in enumerate(root_blocks[:3]):
        print(f"\n  Block [{i}] attributes:")
        for attr in dir(block):
            if attr.startswith('_'):
                continue
            val = getattr(block, attr, None)
            # Skip unset values and methods; only data attributes are shown.
            if val is None or callable(val):
                continue
            # Truncate long values (stringify once, not three times).
            val_str = str(val)
            if len(val_str) > 80:
                val_str = val_str[:80] + "..."
            print(f"    - {attr}: {val_str}")

# ----------------- SECTION 3: Recursive Block Inspection -----------------
print("\n🔎 SECTION 3: RECURSIVE BLOCK INSPECTION (Option C)")
print("-"*50)

def inspect_blocks_recursive(blocks, depth=0, path="root", stats=None):
    """Recursively inspect ALL blocks to find hidden images.

    Walks the layout block tree, descending into text blocks, table cells
    and list entries, and tallies what it finds.

    Args:
        blocks: Iterable of layout blocks at the current nesting level.
        depth: Nesting depth of ``blocks`` (0 for the root list).
        path: Locator prefix for ``blocks``; each block is reported as
            ``f"{path}[{index}]"``.
        stats: Accumulator shared across the recursion. Created on the
            first call and mutated in place.

    Returns:
        The stats dict with keys ``total_blocks``, ``image_blocks`` (paths),
        ``visual_types`` ((path, type) pairs), ``all_types`` (type -> count),
        ``max_depth``, ``blocks_with_layout`` and ``blocks_with_bbox``.
    """
    if stats is None:
        stats = {
            "total_blocks": 0,
            "image_blocks": [],
            "visual_types": [],
            "all_types": {},
            "max_depth": 0,
            "blocks_with_layout": 0,
            "blocks_with_bbox": 0
        }

    stats["max_depth"] = max(stats["max_depth"], depth)
    type_counts = stats["all_types"]

    for i, block in enumerate(blocks):
        block_path = f"{path}[{i}]"
        stats["total_blocks"] += 1

        # Check for explicit image_block
        if getattr(block, 'image_block', None):
            stats["image_blocks"].append(block_path)

        # Check layout/bbox: a bbox only counts when it has vertices.
        layout = getattr(block, 'layout', None)
        if layout:
            stats["blocks_with_layout"] += 1
            bbox = getattr(layout, 'bounding_poly', None)
            if bbox and getattr(bbox, 'normalized_vertices', None):
                stats["blocks_with_bbox"] += 1

        # Check text_block type
        text_block = getattr(block, 'text_block', None)
        if text_block:
            block_type = str(getattr(text_block, 'type_', 'unknown'))
            type_counts[block_type] = type_counts.get(block_type, 0) + 1

            # Check for visual-related types
            if block_type.lower() in ['image', 'figure', 'chart', 'diagram', 'picture', 'visual', 'graphic']:
                stats["visual_types"].append((block_path, block_type))

            # RECURSE into nested blocks
            nested = getattr(text_block, 'blocks', None) or []
            if nested:
                inspect_blocks_recursive(nested, depth + 1, block_path, stats)

        # Check table_block and descend into the blocks held by its cells.
        # Field names (header_rows / body_rows / cells / blocks) follow the
        # Document AI layout schema; missing attributes are treated as empty.
        table_block = getattr(block, 'table_block', None)
        if table_block:
            type_counts["table"] = type_counts.get("table", 0) + 1
            rows = list(getattr(table_block, 'header_rows', None) or [])
            rows += list(getattr(table_block, 'body_rows', None) or [])
            for r, row in enumerate(rows):
                for c, cell in enumerate(getattr(row, 'cells', None) or []):
                    cell_blocks = getattr(cell, 'blocks', None) or []
                    if cell_blocks:
                        inspect_blocks_recursive(
                            cell_blocks, depth + 1, f"{block_path}.table[{r}][{c}]", stats
                        )

        # Check list_block and descend into the blocks of each list entry.
        list_block = getattr(block, 'list_block', None)
        if list_block:
            type_counts["list"] = type_counts.get("list", 0) + 1
            for e, entry in enumerate(getattr(list_block, 'list_entries', None) or []):
                entry_blocks = getattr(entry, 'blocks', None) or []
                if entry_blocks:
                    inspect_blocks_recursive(
                        entry_blocks, depth + 1, f"{block_path}.list[{e}]", stats
                    )

    return stats

# Section 3 output: run the recursive inspection and print its tallies.
if hasattr(raw_doc, 'document_layout') and raw_doc.document_layout:
    root_blocks = getattr(raw_doc.document_layout, 'blocks', []) or []
    stats = inspect_blocks_recursive(root_blocks)

    print(f"  Total blocks (all depths): {stats['total_blocks']}")
    print(f"  Max nesting depth: {stats['max_depth']}")
    print(f"  Blocks with layout: {stats['blocks_with_layout']}")
    print(f"  Blocks with valid bbox: {stats['blocks_with_bbox']}")
    print("\n  Block type distribution:")
    # Most frequent type first.
    for t, count in sorted(stats['all_types'].items(), key=lambda x: -x[1]):
        print(f"    - {t}: {count}")

    print(f"\n  🎯 IMAGE_BLOCK found: {len(stats['image_blocks'])}")
    for path in stats['image_blocks']:
        print(f"      → {path}")

    print(f"\n  🎯 Visual-type blocks found: {len(stats['visual_types'])}")
    for path, btype in stats['visual_types']:
        print(f"      → {path} (type: {btype})")
else:
    print("  ⚠️ No document_layout found!")

# ----------------- SECTION 4: Check pages array -----------------
print("\n📄 SECTION 4: PAGES ARRAY CHECK")
print("-"*50)

doc_pages = getattr(raw_doc, 'pages', None)
if doc_pages:
    print(f"  Pages found: {len(doc_pages)}")
    for i, page in enumerate(doc_pages[:3]):  # Limit to first 3
        visual_elements = getattr(page, 'visual_elements', None) or []
        page_blocks = getattr(page, 'blocks', None) or []
        print(f"\n  Page {i+1}:")
        print(f"    - visual_elements: {len(visual_elements)}")
        print(f"    - blocks: {len(page_blocks)}")
        print(f"    - image: {'YES' if getattr(page, 'image', None) else 'NO'}")

        # Check visual elements (first five per page)
        if visual_elements:
            print("    📸 VISUAL ELEMENTS FOUND!")
            for j, ve in enumerate(visual_elements[:5]):
                ve_type = getattr(ve, 'type_', 'unknown')
                print(f"        [{j}] type: {ve_type}")
else:
    print("  ⚠️ doc.pages is EMPTY (Layout Parser behavior)")

# ----------------- SECTION 5: Check for figures/images in text -----------------
print("\n📝 SECTION 5: IMAGE INDICATORS IN TEXT CONTENT")
print("-"*50)

def find_image_indicators(blocks, found=None):
    """Collect short text fragments that mention image-related keywords.

    Args:
        blocks: Iterable of layout blocks; only those with a truthy
            ``text_block`` are examined, and their nested blocks are
            searched recursively.
        found: Optional accumulator list; extended in place when given.

    Returns:
        A list of ``(keyword, text[:60])`` tuples, one per keyword found in
        a lower-cased text shorter than 100 characters.
    """
    found = [] if found is None else found

    image_keywords = ('figure', 'diagram', 'chart', 'graph', 'image', 'illustration',
                      'infographic', 'flowchart', 'fig.', 'fig:', 'source:', 'scan qr')

    for block in blocks:
        text_block = getattr(block, 'text_block', None)
        if not text_block:
            continue

        lowered = (getattr(text_block, 'text', '') or '').lower()
        # Only short fragments are treated as likely captions/labels.
        if len(lowered) < 100:
            found.extend((kw, lowered[:60]) for kw in image_keywords if kw in lowered)

        # Descend into nested blocks, sharing the same accumulator.
        find_image_indicators(getattr(text_block, 'blocks', None) or [], found)

    return found

# Section 5 output: list up to ten short text fragments that mention images.
if hasattr(raw_doc, 'document_layout') and raw_doc.document_layout:
    root_blocks = getattr(raw_doc.document_layout, 'blocks', []) or []
    indicators = find_image_indicators(root_blocks)

    print(f"  Found {len(indicators)} potential image-related text fragments:")
    for kw, text in indicators[:10]:  # Limit to 10
        print(f"    [{kw}] → \"{text}...\"")

# ----------------- SECTION 6: Summary & Recommendation -----------------
print("\n" + "="*70)
print("📋 DIAGNOSIS SUMMARY")
print("="*70)

if hasattr(raw_doc, 'document_layout') and raw_doc.document_layout:
    root_blocks = getattr(raw_doc.document_layout, 'blocks', []) or []
    # Recomputed here so this section does not depend on section 3's variables.
    stats = inspect_blocks_recursive(root_blocks)

    image_count = len(stats['image_blocks'])
    visual_count = len(stats['visual_types'])
    bbox_count = stats['blocks_with_bbox']
    has_images = image_count > 0 or visual_count > 0
    has_pages = bool(getattr(raw_doc, 'pages', None))

    result_cell = '✅ Images FOUND' if has_images else '❌ NO images detected'
    pages_cell = '✅ YES' if has_pages else '❌ NO (Layout Parser behavior)'
    image_cell = f"✅ YES ({image_count})" if image_count else '❌ NO'
    visual_cell = f"✅ YES ({visual_count})" if visual_count else '❌ NO'
    bbox_cell = f"✅ YES ({bbox_count}/{stats['total_blocks']})" if bbox_count > 0 else '❌ NO'

    # None marks the horizontal separator under the headline row.
    summary_rows = [
        f"RESULT: {result_cell}",
        None,
        f"doc.pages populated:     {pages_cell}",
        f"image_block found:       {image_cell}",
        f"visual-type blocks:      {visual_cell}",
        f"bbox available:          {bbox_cell}",
    ]

    # Pad every row to one width so the right border does not depend on
    # which branch produced the cell text.
    box_width = 58
    print()
    print("  ┌" + "─" * box_width + "┐")
    for row in summary_rows:
        if row is None:
            print("  ├" + "─" * box_width + "┤")
        else:
            print("  │ " + row.ljust(box_width - 1) + "│")
    print("  └" + "─" * box_width + "┘")
    print()

    if has_images:
        print("  → RECOMMENDATION: Option C works! Update parser to extract these.")
    elif bbox_count > 0:
        print("  → RECOMMENDATION: Try Option D (pdf2image + CV) - bboxes exist but no image types.")
    else:
        print("  → RECOMMENDATION: Move to Option D (pdf2image + Computer Vision).")
        print("     Layout Parser does NOT detect images - they're being OCR'd as text.")
else:
    print("  ⚠️ Cannot analyze - document_layout is missing!")

print("\n" + "="*70)
print("END OF DIAGNOSIS - Copy this output and share for analysis")
print("="*70)


In [None]:
# Run the parser
# The returned `result` is read by the cells below through its "metadata"
# and "structure" keys.
result = parser.parse(pdf_filename)

## 4. Explore Results

In [None]:
import json

# Headline numbers for the parse: the metadata and the number of top-level nodes.
print("Processing Complete.")
# Inner subscripts use single quotes: reusing the f-string's own quote
# character inside a replacement field is a SyntaxError before Python 3.12.
print(f"Metadata: {result['metadata']}")
print(f"Top-level blocks found: {len(result['structure'])}")

# Function to print tree summary
def print_tree(nodes, level=0):
    """Print an indented one-line summary for every node in the tree.

    Args:
        nodes: Iterable of node dicts. Each must have "type" and "id";
            "text", "file_path", "data" and "children" are optional.
        level: Current nesting depth; each level indents by two spaces.
    """
    indent = "  " * level
    for node in nodes:
        info = f"{indent}- [{node['type']}] (ID: {node['id']})"
        if node.get("text"):
            # Flatten newlines so every node stays on a single output line.
            preview = node["text"][:50].replace("\n", " ") + "..."
            info += f" : {preview}"
        if node.get("file_path"):
            info += f" [Saved Image: {node['file_path']}]"
        if node.get("type") == "table":
            # Tolerate tables without a data grid: report zero rows.
            rows = len(node.get("data", {}).get("simple_matrix", []))
            info += f" [Table: {rows} rows]"

        print(info)

        if node.get("children"):
            print_tree(node["children"], level + 1)

# The leading "\n" (an escape, not a raw line break inside the literal)
# leaves a blank line before the banner.
print("\n--- Document Structure ---")
# Summarize only the first 20 top-level nodes; their children are still walked.
print_tree(result["structure"][:20])

## 5. View Extracted Tables

In [None]:
# Helper to find tables recursively
def find_tables(nodes):
    """Return every node whose "type" is "table", in document (pre-order) order.

    Args:
        nodes: Iterable of node dicts; each must have a "type" key and may
            have a "children" list that is searched as well.
    """
    tables = []
    # Explicit stack instead of recursion; items are pushed reversed so that
    # pop() visits them in their original order.
    pending = list(nodes)[::-1]
    while pending:
        node = pending.pop()
        if node["type"] == "table":
            tables.append(node)
        children = node.get("children")
        if children:
            pending.extend(list(children)[::-1])
    return tables

# Collect every table in the parsed structure and print it row by row.
tables = find_tables(result["structure"])
print(f"Found {len(tables)} tables.")

for i, table in enumerate(tables):
    # "\n" escape instead of a raw line break inside the f-string literal.
    print(f"\nTable {i+1}:")
    # Tolerant lookup, matching print_tree: a table node without a data grid
    # prints no rows instead of raising KeyError.
    matrix = table.get("data", {}).get("simple_matrix", [])
    for row in matrix:
        print(f"  {row}")

## 6. View Extracted Images

In [None]:
from IPython.display import Image, display

def find_images(nodes):
    """Return every node with a truthy "file_path", in document (pre-order) order.

    Args:
        nodes: Iterable of node dicts; a node's "children" list, when
            present and non-empty, is searched as well.
    """
    def _walk(items):
        # Yield the node itself before its descendants.
        for item in items:
            if item.get("file_path"):
                yield item
            kids = item.get("children")
            if kids:
                yield from _walk(kids)

    return list(_walk(nodes))

# Collect every node that was saved as an image file and preview each inline.
extracted_images = find_images(result["structure"])
print(f"Found {len(extracted_images)} images.")

for img in extracted_images:
    # "\n" escape and single-quoted subscripts: the raw line break was a
    # SyntaxError, and nested double quotes fail before Python 3.12.
    print(f"\n[{img['type']}] {img['file_path']}")
    try:
        display(Image(filename=img["file_path"]))
    except Exception as e:
        # Best-effort preview: report the failure and continue with the rest.
        print(f"Could not display image: {e}")

## 7. Save Full JSON Result

In [None]:
# Persist the full parse result as JSON and hand it to the browser for download.
import os
import shutil

output_file = "universal_parsed_result.json"
# Explicit encoding so the file does not depend on the platform default.
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(result, f, indent=2)

print(f"Full JSON saved to {output_file}")
files.download(output_file)

# Also zip and download images if any
if extracted_images:
    images_dir = "universal_output/images"
    if os.path.isdir(images_dir):
        # make_archive rebuilds the zip on every run; `zip -r` would update an
        # existing archive and keep stale images from earlier runs.
        archive_path = shutil.make_archive(
            "extracted_images", "zip", root_dir=".", base_dir=images_dir
        )
        files.download(archive_path)
    else:
        print(f"Image directory not found, nothing to zip: {images_dir}")