# Test 5: Full Pipeline - Complete Textbook Processing

**Goal:** Run end-to-end extraction pipeline matching your textbook taxonomy architecture

This notebook combines **ALL features** for complete textbook processing with Layout Parser.

## What This Test Does:
- ✅ Processes entire PDF with Layout Parser
- ✅ Extracts paragraphs as text chunks
- ✅ Detects and converts tables to narrative paragraphs
- ✅ Identifies flowcharts and generates descriptions
- ✅ Tags each chunk with structure type (paragraph/table/flowchart)
- ✅ Creates ready-to-use chunks for taxonomy matching

**Complete Pipeline:**
1. Layout Parser detection (paragraphs, tables, images)
2. Table → narrative conversion (LLM)
3. Flowchart → description (Vision LLM)
4. Structure tagging for each chunk
5. Export chunks ready for semantic search


## Step 1: Install Dependencies


In [None]:
# Quietly (-q) install the third-party packages this notebook imports or that
# the cloned helper utilities are expected to need: the Document AI client,
# python-dotenv, the OpenAI and Anthropic SDKs, and PDF/image handling libraries.
# NOTE(review): pdf2image normally also needs the poppler system package -
# confirm it is present on the runtime if image extraction fails.
%pip install -q google-cloud-documentai python-dotenv openai anthropic pdf2image Pillow
print("‚úÖ All dependencies installed!")


## Step 2: Upload Credentials


In [None]:
from google.colab import files
import json
import os

# Ask the user for the service-account key and register it for Google's client
# libraries through GOOGLE_APPLICATION_CREDENTIALS.
print("üì§ Please upload your Google Cloud credentials JSON file...")
uploaded = files.upload()

# Nothing uploaded (e.g. the dialog was dismissed): fail with a clear message
# instead of an opaque IndexError on the lookup below.
if not uploaded:
    raise ValueError(
        "No credentials file was uploaded; re-run this cell and select "
        "your service-account JSON file."
    )

# Only the first uploaded file is used. Parsing it validates that it is JSON
# before it is written back out under a fixed, known filename.
creds_filename = next(iter(uploaded))
credentials_content = json.loads(uploaded[creds_filename].decode('utf-8'))

# Store an absolute path: a later cell changes the working directory with
# `%cd`, after which a relative 'docai-credentials.json' would no longer point
# at the file written here.
credentials_path = os.path.abspath('docai-credentials.json')
with open(credentials_path, 'w') as f:
    json.dump(credentials_content, f)

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials_path
print(f"‚úÖ Credentials saved: {creds_filename}")


## Step 3: Configure Environment


In [None]:
# Configuration - UPDATE THESE VALUES
# Document AI processor coordinates.
DOCAI_PROJECT_ID = "your-project-id-here"
DOCAI_PROCESSOR_ID = "your-layout-parser-processor-id"
DOCAI_LOCATION = "us"

# LLM Configuration
OPENAI_API_KEY = "sk-your-openai-key-here"
LLM_PROVIDER = "openai"
LLM_MODEL = "gpt-4o"

# Mirror every setting into the process environment in one step
# (presumably where the cloned helper utilities look them up - verify).
os.environ.update({
    'DOCAI_PROJECT_ID': DOCAI_PROJECT_ID,
    'DOCAI_PROCESSOR_ID': DOCAI_PROCESSOR_ID,
    'DOCAI_LOCATION': DOCAI_LOCATION,
    'OPENAI_API_KEY': OPENAI_API_KEY,
    'LLM_PROVIDER': LLM_PROVIDER,
    'LLM_MODEL': LLM_MODEL,
})

print("‚úÖ Configuration set")


## Step 4: Clone Repository and Load Utils


In [None]:
# Fetch the helper repository and make it the working directory.
# NOTE: %cd changes the working directory for every later cell, so relative
# paths used from here on resolve inside python-automation/.
!git clone https://github.com/abhii-01/python-automation.git
%cd python-automation

import sys
from pathlib import Path
# Put the repository root (now the cwd) on the import path so the `utils`
# package imported below can be found.
sys.path.append(str(Path.cwd()))

# Project helpers used by the pipeline cells: Document AI client factory,
# table conversion utilities, and vision-LLM image utilities.
from utils.docai_client import get_client_from_env
from utils.table_converter import table_to_markdown, table_to_narrative, detect_table_type
from utils.vision_llm import describe_image_with_llm, extract_image_from_pdf, is_likely_diagram

print("‚úÖ Repository cloned and utilities loaded")


## Step 5: Verify Setup


In [None]:
print("üîç Verifying Document AI setup...\n")

# Create the Document AI client and run its self-check. `client` is reused by
# the processing cell further down.
try:
    client = get_client_from_env()
    client.verify_setup()
except Exception as e:
    print(f"\n‚ùå Setup verification failed: {e}")
    # Re-raise so execution stops here. Swallowing the error would let later
    # cells run with `client` missing or misconfigured and fail with a less
    # helpful error far from the real cause.
    raise
else:
    print("\n‚úÖ Setup verified! Ready to process documents.")


## Step 6: Upload PDF for Testing


In [None]:
# Ask the user for the PDF to run through the pipeline.
print("üì§ Please upload your PDF file...")
uploaded_pdfs = files.upload()

# Nothing uploaded (e.g. the dialog was dismissed): fail with a clear message
# instead of an opaque IndexError on the lookup below.
if not uploaded_pdfs:
    raise ValueError("No PDF was uploaded; re-run this cell and choose a PDF file.")

# Only the first uploaded file is processed; its name doubles as the path
# handed to the later processing cells.
pdf_filename = next(iter(uploaded_pdfs))
pdf_path = pdf_filename

print(f"‚úÖ PDF uploaded: {pdf_filename}")
print(f"   Size: {len(uploaded_pdfs[pdf_filename]) / 1024:.1f} KB")


## Step 7: Process Document with Layout Parser


In [None]:
# Banner for the test run.
_rule = '=' * 60
print(_rule)
print("TEST 5: FULL PIPELINE")
print(_rule + "\n")

# Send the uploaded PDF through the client; the returned `document` is what
# every later cell reads pages and text from.
print(f"üìÑ Processing PDF with Layout Parser: {pdf_path}")
document = client.process_document(pdf_path)

page_total = len(document.pages)
print("‚úÖ Document processed!")
print(f"   Total pages: {page_total}")


## Step 8: Define Helper Functions


In [None]:
def extract_table_simple(table, full_text):
    """Extract a table into a rectangular 2D list of cell strings.

    Header rows are read first, then body rows. Each cell's text is rebuilt
    from the ``text_segments`` of its layout's text anchor, joined with single
    spaces and stripped.

    A cell's grid position is taken from ``table_row_index`` /
    ``table_col_index`` on its layout when those attributes exist. Otherwise
    it falls back to the cell's position: the row's index across header+body
    rows and the cell's index within its row. Without that fallback every
    cell would land on ``(0, 0)`` and overwrite the previous one.

    Args:
        table: Object optionally exposing ``header_rows`` and ``body_rows``;
            each row has ``cells`` and each cell a ``layout`` with a
            ``text_anchor``.
        full_text: The complete document text that the segments' start/end
            offsets index into.

    Returns:
        A list of rows, each a list of strings. Positions with no extracted
        text are filled with ``""``. Returns ``[]`` when no cell yields text.
    """
    rows = []
    for attr in ('header_rows', 'body_rows'):
        rows.extend(getattr(table, attr, None) or [])

    cells_dict = {}
    max_row = max_col = 0

    for row_pos, row in enumerate(rows):
        for col_pos, cell in enumerate(row.cells):
            layout = getattr(cell, 'layout', None)
            # Cells without a layout or text anchor carry no text; they still
            # occupy a position, which is filled with "" when the grid is built.
            if not layout or not layout.text_anchor:
                continue

            # Explicit indices win when present; position is the fallback.
            row_idx = getattr(layout, 'table_row_index', row_pos)
            col_idx = getattr(layout, 'table_col_index', col_pos)

            cells_dict[(row_idx, col_idx)] = " ".join(
                full_text[segment.start_index:segment.end_index]
                for segment in layout.text_anchor.text_segments
            ).strip()
            max_row = max(max_row, row_idx)
            max_col = max(max_col, col_idx)

    if not cells_dict:
        return []

    # Densify the sparse {(row, col): text} mapping into a rectangular grid.
    return [
        [cells_dict.get((row, col), "") for col in range(max_col + 1)]
        for row in range(max_row + 1)
    ]

def get_bbox(bounding_poly):
    """Return the axis-aligned box enclosing a polygon's normalized vertices.

    The result is a dict with keys ``x_min``, ``y_min``, ``x_max`` and
    ``y_max``, computed as the min/max of the ``x`` and ``y`` attributes of
    ``bounding_poly.normalized_vertices``. An all-zero box is returned when
    the polygon is falsy, has no ``normalized_vertices`` attribute, or has no
    vertices.
    """
    points = getattr(bounding_poly, 'normalized_vertices', None) if bounding_poly else None
    if not points:
        return dict.fromkeys(("x_min", "y_min", "x_max", "y_max"), 0)

    xs = [pt.x for pt in points]
    ys = [pt.y for pt in points]
    return {"x_min": min(xs), "y_min": min(ys), "x_max": max(xs), "y_max": max(ys)}

def get_page_text(page, full_text):
    """Return the text of every paragraph on ``page`` as one string.

    Each text segment of each paragraph that has a text anchor is sliced out
    of ``full_text`` by its start/end offsets; the slices are joined with
    single spaces. Paragraphs without a text anchor contribute nothing.
    """
    pieces = [
        full_text[seg.start_index:seg.end_index]
        for paragraph in page.paragraphs
        if paragraph.layout.text_anchor
        for seg in paragraph.layout.text_anchor.text_segments
    ]
    return " ".join(pieces)

# Confirmation for the notebook user that this cell ran and the helper
# functions are now defined.
print("‚úÖ Helper functions defined")


In [None]:
# Full extraction pipeline: walk every page of `document` and build
# `all_chunks`, a flat list of chunk dicts with the keys
#   chunk_id, structure_type ("paragraph" | "table" | "flowchart"), page,
#   text, char_count, metadata
# while counting each kind in `stats`.
from collections.abc import Iterable

# Paragraphs whose stripped text is shorter than this are skipped.
MIN_PARAGRAPH_CHARS = 30


def _make_chunk(seq, structure_type, page_number, text, metadata):
    """Build one chunk record in the schema shared by all structure types."""
    return {
        "chunk_id": f"chunk_{seq:04d}",
        "structure_type": structure_type,
        "page": page_number,
        "text": text,
        "char_count": len(text),
        "metadata": metadata,
    }


print("\nüîÑ Running full extraction pipeline...\n")

all_chunks = []
chunk_id = 0

stats = {
    "paragraphs": 0,
    "tables": 0,
    "flowcharts": 0,
    "total_chunks": 0
}

# Ensures the "page.image is not iterable" notice is printed only once.
image_notice_shown = False

for page_num, page in enumerate(document.pages, 1):
    print(f"  üìÑ Page {page_num}")

    # Extract paragraphs
    for para in page.paragraphs:
        if not para.layout.text_anchor:
            continue

        para_text = " ".join(
            document.text[segment.start_index:segment.end_index]
            for segment in para.layout.text_anchor.text_segments
        ).strip()

        if len(para_text) < MIN_PARAGRAPH_CHARS:  # Skip short paragraphs
            continue

        chunk_id += 1
        all_chunks.append(_make_chunk(
            chunk_id, "paragraph", page_num, para_text,
            {"extraction_method": "layout_parser"},
        ))
        stats["paragraphs"] += 1

    # Extract and convert tables
    for table_idx, table in enumerate(getattr(page, 'tables', None) or []):
        table_data = extract_table_simple(table, document.text)

        if not table_data:
            continue

        markdown = table_to_markdown(table_data)
        table_type = detect_table_type(table_data)

        try:
            narrative = table_to_narrative(markdown, method=table_type)
        except Exception as e:
            # Best effort: keep a placeholder chunk so the table is not lost,
            # and flag it so downstream code can tell it from a real narrative.
            print(f"    ‚ö†Ô∏è  Table conversion failed: {e}")
            narrative = f"[Table with {len(table_data)} rows and {len(table_data[0])} columns]"
            conversion_failed = True
        else:
            print(f"    ‚úÖ Converted table {table_idx + 1} to narrative")
            conversion_failed = False

        chunk_id += 1
        all_chunks.append(_make_chunk(
            chunk_id, "table", page_num, narrative,
            {
                "extraction_method": "layout_parser_table",
                "conversion_method": "llm_narrative",
                "table_type": table_type,
                "original_markdown": markdown,
                "conversion_failed": conversion_failed,
            },
        ))
        stats["tables"] += 1

    # Extract and describe flowcharts/diagrams
    page_images = getattr(page, 'image', None)
    if page_images is not None and not isinstance(page_images, Iterable):
        # NOTE(review): in the Document AI Document.Page schema `image` looks
        # like a single rendered-page image, not a list of embedded figures -
        # confirm which field the client exposes for detected figures.
        # Iterating a non-iterable value would raise TypeError and abort the
        # whole run, so diagram detection is skipped instead.
        if not image_notice_shown:
            print("    [warn] page.image is not a collection of images; skipping diagram detection")
            image_notice_shown = True
        page_images = ()

    page_text = None  # computed at most once per page, on the first image
    for img_idx, image in enumerate(page_images or ()):
        bbox = get_bbox(image.layout.bounding_poly)
        if page_text is None:
            page_text = get_page_text(page, document.text)

        if not is_likely_diagram(bbox, page_text):
            continue

        try:
            # extract_image_from_pdf is given a 0-based page index.
            image_bytes = extract_image_from_pdf(pdf_path, page_num - 1, bbox)
            description = describe_image_with_llm(image_bytes, image_type="flowchart")
        except Exception as e:
            # Best effort: keep a flagged placeholder rather than dropping it.
            print(f"    ‚ö†Ô∏è  Diagram description failed: {e}")
            description = "[Flowchart description unavailable]"
            description_failed = True
        else:
            print(f"    ‚úÖ Described diagram {img_idx + 1}")
            description_failed = False

        chunk_id += 1
        all_chunks.append(_make_chunk(
            chunk_id, "flowchart", page_num, description,
            {
                "extraction_method": "vision_llm",
                "bbox": bbox,
                "description_failed": description_failed,
            },
        ))
        stats["flowcharts"] += 1

stats["total_chunks"] = len(all_chunks)

print(f"\n‚úÖ Pipeline complete!")


In [None]:
# Summary banner followed by one line per counter in `stats`.
_rule = "=" * 60
print(_rule)
print("‚úÖ FULL PIPELINE COMPLETE")
print(_rule)
print("üìä Statistics:")
for stat_name, stat_value in stats.items():
    # "total_chunks" -> "Total chunks"
    label = stat_name.replace('_', ' ').capitalize()
    print(f"  {label}: {stat_value}")

print("\nüí° These chunks are ready for taxonomy matching!")
print("   Each chunk has structure_type tag and clean text.")


## Step 11: View Sample Chunks


In [None]:
# Show first chunk of each type
divider = "-" * 60
for chunk_type in ("paragraph", "table", "flowchart"):
    # First chunk of this structure type, or None when the document had none.
    example = next(
        (c for c in all_chunks if c['structure_type'] == chunk_type),
        None,
    )
    if example is None:
        continue

    # Preview: id plus the first 200 characters of the chunk text.
    print(f"\nüìù Sample {chunk_type.upper()}:")
    print(divider)
    print(f"ID: {example['chunk_id']}")
    print(f"Text: {example['text'][:200]}...")
    print(divider)


## Step 12: Save Results to JSON


In [None]:
# Static description of how the chunks were produced, stored with the output.
pipeline_config = {
    "extraction_tool": "google_document_ai_layout_parser",
    "table_conversion": "llm_narrative",
    "flowchart_handling": "vision_llm",
    "structure_tagging": True,
}

# Everything a downstream consumer needs: source file name, page count,
# per-type counters, the chunks themselves and the pipeline description.
results = {
    "pdf_file": Path(pdf_path).name,
    "total_pages": len(document.pages),
    "statistics": stats,
    "chunks": all_chunks,
    "pipeline_config": pipeline_config,
}

# ensure_ascii=False keeps non-ASCII text readable in the file, which is why
# the file is opened explicitly as UTF-8.
output_path = "test5_full_pipeline.json"
with open(output_path, 'w', encoding='utf-8') as out_file:
    json.dump(results, out_file, indent=2, ensure_ascii=False)

print(f"\nüíæ Results saved to: {output_path}")


## Step 13: Download Results


In [None]:
# Hand the JSON file written in the previous step to the browser as a
# download, using the google.colab `files` helper imported earlier.
files.download(output_path)
print(f"‚úÖ Test 5 complete! Results downloaded.")
