# Test 6: Universal Document Parser

This notebook tests the UniversalParser class, which implements a context-aware hierarchical extraction strategy.

**Goals:**
1. Extract document structure as a hierarchical JSON tree.
2. Preserve table structures as data grids.
3. Extract and save images/charts as separate files.

## 1. Setup

In [None]:
# Install the notebook's third-party dependencies (-q keeps pip's output quiet).
# NOTE(review): versions are unpinned, so a later run may pick up different releases — consider pinning.
# NOTE(review): pdf2image relies on the Poppler binaries being present on the system — confirm they are available in this runtime.
%pip install -q google-cloud-documentai python-dotenv pdf2image Pillow
print("Dependencies installed.")

In [None]:
# Clone repository if running in Colab to get utils
# The presence of a local `utils` path is used as the marker: when it is missing,
# the repository is cloned into a scratch directory and its contents are moved
# into the current working directory.
import os
if not os.path.exists('utils'):
    !git clone https://github.com/abhii-01/docai-extraction-test.git temp_repo
    # The `*` glob skips hidden entries by default, so dotfiles stay in temp_repo
    # and are discarded by the cleanup on the next line.
    !mv temp_repo/* .
    !rm -rf temp_repo
    print("Repository cloned.")
else:
    print("Utils already present.")

In [None]:
# Setup Credentials
from google.colab import files
import json

# Ask for the service-account key only when it is not already on disk, so
# re-running this cell does not prompt for another upload.
if not os.path.exists('docai-credentials.json'):
    print("Upload your Google Cloud credentials JSON file...")
    uploaded = files.upload()
    # An empty result (e.g. a cancelled dialog) would otherwise surface as an
    # opaque IndexError when picking the first filename.
    if not uploaded:
        raise RuntimeError(
            "No credentials file was uploaded. Re-run this cell and select your JSON key file."
        )
    # Only the first uploaded file is used; it is stored under a fixed name so
    # the rest of the notebook does not depend on the original filename.
    creds_filename = next(iter(uploaded))
    with open('docai-credentials.json', 'wb') as f:
        f.write(uploaded[creds_filename])
    print("Credentials saved.")
else:
    print("Credentials found.")

# Point Google's client libraries at the key file. The path is relative, so it
# resolves against the current working directory.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'docai-credentials.json'

In [None]:
# Configuration
# Document AI settings. Replace the two placeholder IDs with real values before
# running the cells below.
DOCAI_PROJECT_ID = "your-project-id" # UPDATE THIS
DOCAI_PROCESSOR_ID = "your-layout-parser-id" # UPDATE THIS
DOCAI_LOCATION = "us"

# Mirror the settings into the process environment under the same names
# (presumably read back by get_client_from_env — confirm in utils.docai_client).
os.environ.update(
    DOCAI_PROJECT_ID=DOCAI_PROJECT_ID,
    DOCAI_PROCESSOR_ID=DOCAI_PROCESSOR_ID,
    DOCAI_LOCATION=DOCAI_LOCATION,
)

## 2. Initialize Parser

In [None]:
from utils.docai_client import get_client_from_env
from utils.universal_parser import UniversalParser

# Build the Document AI client from the environment and wrap it in the parser;
# extracted artifacts are directed to the "universal_output" directory.
try:
    client = get_client_from_env()
    parser = UniversalParser(client, output_dir="universal_output")
    print("UniversalParser initialized successfully!")
except Exception as e:
    print(f"Error initializing: {e}")
    # Re-raise so execution stops here: every later cell needs `parser`, and
    # carrying on would only fail further down with a less helpful NameError
    # (or silently reuse a stale `parser` left over from an earlier run).
    raise

## 3. Upload and Parse PDF

In [None]:
print("Upload a PDF file to test (preferably one with headings, tables, and images)...")
uploaded = files.upload()
# An empty result (e.g. a cancelled dialog) would otherwise surface as an opaque
# IndexError when picking the first filename.
if not uploaded:
    raise RuntimeError("No PDF was uploaded. Re-run this cell and select a file.")
# Only the first uploaded file is parsed; any additional files are ignored.
pdf_filename = next(iter(uploaded))

In [None]:
# Run the parser
# `result` is consumed by the cells below, which read result['metadata'] and
# result['structure'].
result = parser.parse(pdf_filename)

## 4. Explore Results

In [None]:
import json

# Quick summary of the parse: completion notice, document metadata, and the
# number of top-level nodes in the structure tree.
print("Processing Complete.")
print(f"Metadata: {result['metadata']}")
print(f"Top-level blocks found: {len(result['structure'])}")

# Function to print tree summary
def print_tree(nodes, level=0):
    """Print an indented, one-line-per-node outline of a parsed document tree.

    Args:
        nodes: Iterable of node dicts. Every node must provide 'type' and 'id';
            'text', 'file_path', 'data' and 'children' are optional.
        level: Nesting depth of ``nodes``; each level indents by two spaces.

    Returns:
        None. The outline is written to stdout.

    Raises:
        KeyError: If a node lacks 'type' or 'id'.
    """
    max_preview = 50  # characters of node text shown per line
    indent = "  " * level
    for node in nodes:
        info = f"{indent}- [{node['type']}] (ID: {node['id']})"
        text = node.get('text')
        if text:
            preview = text[:max_preview].replace('\n', ' ')
            # Mark the preview as truncated only when text was actually cut off.
            if len(text) > max_preview:
                preview += "..."
            info += f" : {preview}"
        if node.get('file_path'):
            info += f" [Saved Image: {node['file_path']}]"
        if node.get('type') == 'table':
            # Tolerate a table whose 'data' or 'simple_matrix' is missing or None.
            matrix = (node.get('data') or {}).get('simple_matrix') or []
            info += f" [Table: {len(matrix)} rows]"
        print(info)
        if node.get('children'):
            print_tree(node['children'], level + 1)

# Only the first 20 top-level nodes are passed in; the children of those nodes
# are still printed in full.
print("\n--- Document Structure ---")
print_tree(result['structure'][:20])

## 5. View Extracted Tables

In [None]:
# Helper to find tables recursively
def find_tables(nodes):
    """Collect every node whose 'type' is 'table' from a nested node tree.

    Nodes are visited depth-first: a matching node is listed before any tables
    found among its own children. Each node must provide a 'type' key.
    """
    found = []
    for entry in nodes:
        if entry['type'] == 'table':
            found.append(entry)
        children = entry.get('children')
        if children:
            found += find_tables(children)
    return found

# Collect every table in the document and print each one row by row.
tables = find_tables(result['structure'])
print(f"Found {len(tables)} tables.")

for i, table in enumerate(tables):
    print(f"\nTable {i+1}:")
    # Tolerant lookup: a table without 'data' or 'simple_matrix' prints no rows
    # instead of aborting the whole listing with a KeyError.
    matrix = (table.get('data') or {}).get('simple_matrix') or []
    for row in matrix:
        print(f"  {row}")

## 6. View Extracted Images

In [None]:
from IPython.display import Image, display

def find_images(nodes):
    """Return every node with a non-empty 'file_path' from a nested node tree.

    Nodes are visited depth-first: a matching node is listed before any matches
    found among its own children.
    """
    def _walk(level_nodes):
        # Yield matches at this level, descending into each node's children
        # right after the node itself.
        for entry in level_nodes:
            if entry.get('file_path'):
                yield entry
            yield from _walk(entry.get('children') or ())

    return list(_walk(nodes))

# Collect every node that carries a 'file_path' and render each saved file inline.
extracted_images = find_images(result['structure'])
print(f"Found {len(extracted_images)} images.")

for img in extracted_images:
    print(f"\n[{img['type']}] {img['file_path']}")
    # Best-effort display: a file that cannot be rendered is reported and
    # skipped so the remaining images are still shown.
    try:
        display(Image(filename=img['file_path']))
    except Exception as e:
        print(f"Could not display image: {e}")

## 7. Save Full JSON Result

In [None]:
# Write the full parse result to a JSON file in the working directory, then
# hand it to the browser as a download.
output_file = "universal_parsed_result.json"
with open(output_file, 'w') as f:
    json.dump(result, f, indent=2)

print(f"Full JSON saved to {output_file}")
files.download(output_file)

# Also zip and download images if any
# NOTE(review): the zip path assumes the parser writes images under
# universal_output/images (the parser was created with
# output_dir="universal_output") — confirm against UniversalParser.
if extracted_images:
    !zip -r extracted_images.zip universal_output/images
    files.download('extracted_images.zip')