## Step 5: Import Libraries


In [None]:
# Standard library: Path is used for file-name handling, json for writing results.
from pathlib import Path
import json
# Project-local factory that builds the Document AI client used by later cells.
from utils.docai_client import get_client_from_env

# Confirmation message so the notebook user can see the cell ran.
print("‚úÖ Libraries imported successfully")


## Step 6: Upload PDF File


In [None]:
print("üì§ Upload your PDF file (should have tables/structure):")
pdf_uploaded = files.upload()

pdf_filename = list(pdf_uploaded.keys())[0]
os.makedirs('sample_pdfs', exist_ok=True)
pdf_path = f'sample_pdfs/{pdf_filename}'

with open(pdf_path, 'wb') as f:
    f.write(pdf_uploaded[pdf_filename])

print(f"‚úÖ PDF saved to: {pdf_path}")


## Step 7: Define Helper Functions


In [None]:
def get_bounding_box(bounding_poly):
    """Return the axis-aligned box enclosing a polygon's normalized vertices.

    The result is a dict with the keys ``x_min``, ``y_min``, ``x_max`` and
    ``y_max``. An all-zero box is returned when ``bounding_poly`` is falsy,
    has no ``normalized_vertices`` attribute, or has no vertices.
    """
    vertices = None
    if bounding_poly and hasattr(bounding_poly, 'normalized_vertices'):
        vertices = bounding_poly.normalized_vertices

    if not vertices:
        return dict(x_min=0, y_min=0, x_max=0, y_max=0)

    # Split the vertex list into parallel x / y sequences.
    xs, ys = zip(*((vertex.x, vertex.y) for vertex in vertices))
    return dict(x_min=min(xs), y_min=min(ys), x_max=max(xs), y_max=max(ys))

def extract_table_cells(table, full_text):
    """Flatten a table's header and body rows into a list of cell dicts.

    Args:
        table: Object exposing ``header_rows`` and/or ``body_rows``; each row
            exposes ``cells``.
        full_text: Full document text that the cells' text anchors index into.

    Returns:
        A list of dicts with the keys ``row``, ``col``, ``text`` and
        ``is_header``, header rows first, then body rows.
    """
    cells = []
    row_offset = 0

    for rows_attr, is_header in (("header_rows", True), ("body_rows", False)):
        rows = getattr(table, rows_attr, None) or []
        for row_idx, row in enumerate(rows, start=row_offset):
            for col_idx, cell in enumerate(row.cells):
                layout = getattr(cell, "layout", None)
                cells.append({
                    # Prefer explicit indices when the layout carries them.
                    # Otherwise fall back to the cell's position in the table
                    # (the previous fallback of 0 put every cell at row 0 /
                    # col 0). The positional column ignores column spans.
                    "row": getattr(layout, "table_row_index", row_idx),
                    "col": getattr(layout, "table_col_index", col_idx),
                    "text": extract_cell_text(cell, full_text),
                    "is_header": is_header
                })
        # Body rows are numbered after the header rows.
        row_offset += len(rows)

    return cells

def extract_cell_text(cell, full_text):
    """Return the text a table cell points to inside ``full_text``.

    Each text segment of the cell's layout text anchor is sliced out of
    ``full_text``; the slices are joined with single spaces and the result is
    stripped. An empty string is returned when the cell has no ``layout``
    attribute or its layout has no text anchor.
    """
    if not hasattr(cell, 'layout'):
        return ""

    anchor = cell.layout.text_anchor
    if not anchor:
        return ""

    pieces = (
        full_text[segment.start_index:segment.end_index]
        for segment in anchor.text_segments
    )
    return " ".join(pieces).strip()

# Confirmation message so the notebook user can see the helper cell ran.
print("‚úÖ Helper functions defined")


## Step 8: Process Document and Detect Structure

In [None]:
print("="*60)
print("TEST 2: STRUCTURE DETECTION")
print("="*60)
print()

# Initialize client
print("üì° Initializing Document AI client...")
client = get_client_from_env()

# Process document
print(f"\nüìÑ Processing PDF: {pdf_path}")
document = client.process_document(pdf_path)

# Extract structured elements
print("\nüîç Detecting document structure...")

# Flat list of every detected element, in page order.
all_elements = []
# NOTE: "headers" and "footers" are reported but nothing in this cell
# increments them, so they always stay at 0.
stats = {
    "paragraphs": 0,
    "tables": 0,
    "images": 0,
    "headers": 0,
    "footers": 0
}

for page_num, page in enumerate(document.pages, 1):
    print(f"  Processing page {page_num}...")

    # Extract paragraphs (paragraphs without a text anchor are skipped)
    for para_idx, paragraph in enumerate(page.paragraphs):
        text_anchor = paragraph.layout.text_anchor
        if not text_anchor:
            continue

        # Each segment is a start/end slice of the full document text.
        para_text = " ".join(
            document.text[segment.start_index:segment.end_index]
            for segment in text_anchor.text_segments
        ).strip()

        all_elements.append({
            "type": "paragraph",
            "page": page_num,
            "index": para_idx,
            "text": para_text,
            "bbox": get_bounding_box(paragraph.layout.bounding_poly),
            "char_count": len(para_text)
        })
        stats["paragraphs"] += 1

    # Extract tables
    for table_idx, table in enumerate(page.tables):
        all_elements.append({
            "type": "table",
            "page": page_num,
            "index": table_idx,
            "rows": len(table.body_rows) if hasattr(table, 'body_rows') else 0,
            "table_data": extract_table_cells(table, document.text),
            "bbox": get_bounding_box(table.layout.bounding_poly)
        })
        stats["tables"] += 1

    # Extract images
    # NOTE(review): in the google-cloud-documentai types, Document.Page.image
    # is a single rendered-page Image message (not a list, no `.layout`), so
    # iterating it directly raises TypeError. Only iterate when the value is
    # really iterable -- confirm what client.process_document returns, and
    # consider page.visual_elements as the source of embedded images.
    page_images = getattr(page, 'image', None)
    try:
        page_images = list(page_images) if page_images is not None else []
    except TypeError:
        page_images = []

    for img_idx, image in enumerate(page_images):
        bbox = get_bounding_box(image.layout.bounding_poly)
        all_elements.append({
            "type": "image",
            "page": page_num,
            "index": img_idx,
            "bbox": bbox,
            # Normalized coordinates, so this is the fraction of the page covered.
            "area": (bbox['x_max'] - bbox['x_min']) * (bbox['y_max'] - bbox['y_min'])
        })
        stats["images"] += 1

print("‚úÖ Structure detection complete!")


In [None]:
# Banner followed by one line per counter collected during detection.
print("=" * 60, "‚úÖ STRUCTURE DETECTION COMPLETE", "=" * 60, sep="\n")
print("üìä Statistics:")
for kind, total in stats.items():
    print("  " + kind.capitalize() + f": {total}")

print(f"\nTotal elements detected: {len(all_elements)}")


## Step 10: Visualize Sample Elements


In [None]:
# Split the detected elements by type once, then print a short sample of each.
paragraphs, tables, images = (
    [item for item in all_elements if item['type'] == kind]
    for kind in ('paragraph', 'table', 'image')
)

# First paragraph, truncated to 300 characters
if paragraphs:
    divider = "-" * 60
    print("\nüìÑ Sample Paragraph:")
    print(divider)
    print(paragraphs[0]['text'][:300])
    print(divider)

# Table count plus the size of the first table
if tables:
    first_table = tables[0]
    print(f"\nüìä Found {len(tables)} table(s)")
    print(f"   First table: {first_table['rows']} rows, {len(first_table['table_data'])} cells")

# Up to three images with the share of the page they cover
if images:
    print(f"\nüñºÔ∏è  Found {len(images)} image(s)")
    for position, picture in enumerate(images[:3], start=1):
        print(f"   Image {position}: {picture['area']*100:.1f}% of page")


In [None]:
# Bundle everything collected so far into one JSON-serializable payload.
results = dict(
    pdf_file=Path(pdf_path).name,
    total_pages=len(document.pages),
    statistics=stats,
    elements=all_elements,
)

# The output directory is created on demand.
output_path = "output/test2_structured.json"
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with open(output_path, mode='w', encoding='utf-8') as out_file:
    json.dump(results, out_file, indent=2, ensure_ascii=False)

print(f"üíæ Results saved to: {output_path}")


## Step 12: Download Results


In [None]:
# Hand the saved JSON file to the Colab download helper.
files.download(output_path)
print("üì• Download started!")


# Test 2: Structure Detection with Layout Parser

**Goal:** Detect and analyze document structure (paragraphs, tables, images, headers)

**What this test does:**
- Uses Layout Parser to identify structural elements
- Extracts paragraphs with bounding boxes
- Detects tables and extracts table data
- Identifies images and diagrams
- Provides element-level statistics

**Layout Parser Advantages:**
- Superior structure detection compared to basic OCR
- Accurate table boundary detection
- Multi-column layout support
- Better handling of complex document layouts


## Setup Steps (Run cells 2-8 same as Test 1)


In [None]:
# Install dependencies
# (%pip is an IPython magic; -q keeps the installer output quiet.)
%pip install -q google-cloud-documentai python-dotenv openai anthropic pdf2image Pillow
print("‚úÖ All dependencies installed!")


In [None]:
# Upload credentials
from google.colab import files
import json
import os

print("üì§ Please upload your Google Cloud credentials JSON file...")
uploaded = files.upload()

# Only the first uploaded file is used; parsing it verifies it is valid JSON.
creds_filename = list(uploaded.keys())[0]
credentials_content = json.loads(uploaded[creds_filename].decode('utf-8'))

# Store the absolute path: a later cell changes the working directory
# (%cd docai-extraction-test), after which a relative path would no longer
# point at this file.
credentials_path = os.path.abspath('docai-credentials.json')

with open(credentials_path, 'w') as f:
    json.dump(credentials_content, f)

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials_path
print(f"‚úÖ Credentials saved: {creds_filename}")


In [None]:
# Configure environment - UPDATE THESE VALUES
DOCAI_PROJECT_ID = "your-project-id-here"
DOCAI_PROCESSOR_ID = "your-processor-id-here"
DOCAI_LOCATION = "us"

# Export the settings as environment variables of the same names.
os.environ.update(
    DOCAI_PROJECT_ID=DOCAI_PROJECT_ID,
    DOCAI_PROCESSOR_ID=DOCAI_PROCESSOR_ID,
    DOCAI_LOCATION=DOCAI_LOCATION,
)

print("‚úÖ Configuration set for Layout Parser")


In [None]:
# Clone repository
# (!git and %cd are IPython shell/magic commands; %cd changes the notebook's
# working directory to the cloned repository for all later cells.)
!git clone https://github.com/abhii-01/docai-extraction-test.git
%cd docai-extraction-test

import sys
from pathlib import Path
# Put the repository root on sys.path so the project-local `utils` package imports.
sys.path.append(str(Path.cwd()))

from utils.docai_client import get_client_from_env

print("‚úÖ Repository cloned and utilities loaded")


In [None]:
# Verify setup
# Build the client from the environment variables set in the earlier cells.
client = get_client_from_env()
# NOTE(review): verify_setup() is defined in the project's utils package;
# presumably it checks credentials/processor access -- confirm there.
client.verify_setup()
print("\n‚úÖ Ready to detect document structure!")


In [None]:
# Upload PDF (preferably one with tables and images)
print("üì§ Please upload your PDF file (with tables/structure)...")
uploaded_pdfs = files.upload()

# Only the first uploaded file is used; its name doubles as the path.
pdf_filename = list(uploaded_pdfs)[0]
pdf_path = pdf_filename

print("‚úÖ PDF uploaded: " + f"{pdf_filename}")


## Process Document and Detect Structure


In [None]:
# Banner for this test run
print("=" * 60)
print("TEST 2: STRUCTURE DETECTION")
print("=" * 60 + "\n")

# Process document with Layout Parser
print(f"üìÑ Processing PDF with Layout Parser: {pdf_path}")
document = client.process_document(pdf_path)

print("‚úÖ Document processed!")
print(f"   Total pages: {len(document.pages)}")


## Extract Structured Elements

Define helper functions for extracting bounding boxes and table cells.


In [None]:
def get_bounding_box(bounding_poly):
    """Compute the min/max extent of a polygon's normalized vertices.

    Returns a dict keyed ``x_min``, ``y_min``, ``x_max``, ``y_max``. When the
    polygon is falsy, lacks ``normalized_vertices``, or has no vertices, every
    value is 0.
    """
    has_vertices = bool(bounding_poly) and hasattr(bounding_poly, 'normalized_vertices')
    points = bounding_poly.normalized_vertices if has_vertices else None

    if not points:
        return {key: 0 for key in ("x_min", "y_min", "x_max", "y_max")}

    xs = [point.x for point in points]
    ys = [point.y for point in points]
    low_x, high_x = min(xs), max(xs)
    low_y, high_y = min(ys), max(ys)
    return {"x_min": low_x, "y_min": low_y, "x_max": high_x, "y_max": high_y}

def extract_cell_text(cell, full_text):
    """Look up a table cell's text inside the full document text.

    The cell's layout text anchor lists segments with ``start_index`` and
    ``end_index``; the matching slices of ``full_text`` are joined with a
    space and stripped. Cells without a ``layout`` attribute or without a
    text anchor yield an empty string.
    """
    if hasattr(cell, 'layout') and cell.layout.text_anchor:
        slices = [
            full_text[part.start_index:part.end_index]
            for part in cell.layout.text_anchor.text_segments
        ]
        return " ".join(slices).strip()

    return ""

def extract_table_cells(table, full_text):
    """Flatten a table's header and body rows into a list of cell dicts.

    Args:
        table: Object exposing ``header_rows`` and/or ``body_rows``; each row
            exposes ``cells``.
        full_text: Full document text that the cells' text anchors index into.

    Returns:
        A list of dicts with the keys ``row``, ``col``, ``text`` and
        ``is_header``, header rows first, then body rows.
    """
    cells = []
    row_offset = 0

    for rows_attr, is_header in (("header_rows", True), ("body_rows", False)):
        rows = getattr(table, rows_attr, None) or []
        for row_idx, row in enumerate(rows, start=row_offset):
            for col_idx, cell in enumerate(row.cells):
                layout = getattr(cell, "layout", None)
                cells.append({
                    # Prefer explicit indices when the layout carries them.
                    # Otherwise fall back to the cell's position in the table
                    # (the previous fallback of 0 put every cell at row 0 /
                    # col 0). The positional column ignores column spans.
                    "row": getattr(layout, "table_row_index", row_idx),
                    "col": getattr(layout, "table_col_index", col_idx),
                    "text": extract_cell_text(cell, full_text),
                    "is_header": is_header
                })
        # Body rows are numbered after the header rows.
        row_offset += len(rows)

    return cells

# Confirmation message so the notebook user can see the helper cell ran.
print("‚úÖ Helper functions defined")


## Extract All Structural Elements

Extract paragraphs, tables, and images from all pages.


In [None]:
print("üîç Detecting document structure...\n")

all_elements = []
stats = {
    "paragraphs": 0,
    "tables": 0,
    "images": 0,
    "headers": 0,
    "footers": 0
}

for page_num, page in enumerate(document.pages, 1):
    print(f"  Processing page {page_num}...")
    
    # Extract paragraphs
    for para_idx, paragraph in enumerate(page.paragraphs):
        if paragraph.layout.text_anchor:
            # Get text
            text_segments = []
            for segment in paragraph.layout.text_anchor.text_segments:
                text = document.text[segment.start_index:segment.end_index]
                text_segments.append(text)
            
            para_text = " ".join(text_segments).strip()
            
            # Get bounding box
            bbox = get_bounding_box(paragraph.layout.bounding_poly)
            
            element = {
                "type": "paragraph",
                "page": page_num,
                "index": para_idx,
                "text": para_text,
                "bbox": bbox,
                "char_count": len(para_text)
            }
            
            all_elements.append(element)
            stats["paragraphs"] += 1
    
    # Extract tables
    for table_idx, table in enumerate(page.tables):
        # Extract table data
        table_data = extract_table_cells(table, document.text)
        
        bbox = get_bounding_box(table.layout.bounding_poly)
        
        element = {
            "type": "table",
            "page": page_num,
            "index": table_idx,
            "rows": len(table.body_rows) if hasattr(table, 'body_rows') else 0,
            "table_data": table_data,
            "bbox": bbox
        }
        
        all_elements.append(element)
        stats["tables"] += 1
    
    # Extract images
    if hasattr(page, 'image'):
        for img_idx, image in enumerate(page.image):
            bbox = get_bounding_box(image.layout.bounding_poly)
            
            element = {
                "type": "image",
                "page": page_num,
                "index": img_idx,
                "bbox": bbox,
                "area": (bbox['x_max'] - bbox['x_min']) * (bbox['y_max'] - bbox['y_min'])
            }
            
            all_elements.append(element)
            stats["images"] += 1

print(f"\n‚úÖ Structure detection complete!")


## Display Statistics

Show summary of detected elements.


In [None]:
# Banner, then one line per counter, then overall totals.
print("=" * 60)
print("üìä STRUCTURE DETECTION STATISTICS")
print("=" * 60 + "\n")

for kind, total in stats.items():
    print("  " + kind.capitalize() + f": {total}")

print(f"\n  Total elements: {len(all_elements)}")
print(f"  Total pages: {len(document.pages)}")


## Preview Elements

Show samples of detected elements.


In [None]:
# Split the detected elements by type once, then preview each group.
paragraphs, tables, images = (
    [item for item in all_elements if item['type'] == kind]
    for kind in ('paragraph', 'table', 'image')
)

# First paragraph, cut to 300 characters (with an ellipsis when truncated)
if paragraphs:
    first_text = paragraphs[0]['text']
    divider = "-" * 60
    print("\nüìÑ First Paragraph:")
    print(divider)
    print(first_text[:300])
    if len(first_text) > 300:
        print("...")
    print(divider)

# Up to three tables with row count and page
if tables:
    print(f"\nüìä Tables Found: {len(tables)}")
    for position, entry in enumerate(tables[:3], start=1):
        print(f"  Table {position}: {entry['rows']} rows, Page {entry['page']}")

# Up to three images with page share and page
if images:
    print(f"\nüñºÔ∏è  Images Found: {len(images)}")
    for position, picture in enumerate(images[:3], start=1):
        print(f"  Image {position}: {picture['area']:.2%} of page, Page {picture['page']}")


## Save Results


In [None]:
# Build results
results = dict(
    pdf_file=Path(pdf_path).name,
    total_pages=len(document.pages),
    statistics=stats,
    elements=all_elements,
)

# Save to JSON (written into the current working directory)
output_path = "test2_structured.json"
with open(output_path, mode='w', encoding='utf-8') as out_file:
    json.dump(results, out_file, indent=2, ensure_ascii=False)

print(f"\nüíæ Results saved to: {output_path}")

# Download results
files.download(output_path)
print("‚úÖ Test 2 complete! Results downloaded.")
