# Document Processing Demo

This notebook demonstrates how to use the document processing functionality to:
1. Process a single PDF file
2. Process a directory of files
3. Save the extracted content as JSON files

In [None]:
import os
import json
from pathlib import Path
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from docai.data.models.document import Base
from docai.data.repositories.document_repository import DocumentRepository
from docai.services.document_processor.document_processor import DocumentProcessor

## Setup

First, let's set up the database connection, the repository and processor, and the output directory.

In [None]:
# Open (or create) the SQLite demo database and create the tables
# registered on Base's metadata
engine = create_engine("sqlite:///./demo.db")
Base.metadata.create_all(bind=engine)

# Session used by the repository below and closed in the cleanup cell
Session = sessionmaker(bind=engine)
db = Session()

# The processor is handed a repository that is backed by that session
document_repository = DocumentRepository(db)
document_processor = DocumentProcessor(document_repository)

# Directory that receives the exported JSON files
output_dir = Path("./output")
output_dir.mkdir(exist_ok=True)

## Process Single PDF File

Let's process a single PDF file and save its content as JSON

In [None]:
def save_as_json(doc_id, output_dir):
    """Export a stored document to ``<output_dir>/<doc.id>.json``.

    The document is looked up through the module-level
    ``document_repository``.

    Args:
        doc_id: Identifier passed to ``document_repository.get_by_id``.
        output_dir: Existing directory to write into; either a ``Path`` or a
            plain string path.

    Returns:
        The ``Path`` of the written JSON file, or ``None`` when the
        repository returns no document for ``doc_id``.
    """
    doc = document_repository.get_by_id(doc_id)
    if not doc:
        return None

    created_at = doc.created_at
    doc_data = {
        "id": doc.id,
        "filename": doc.filename,
        "content": doc.content,
        "storage_path": str(doc.storage_path),
        # A record without a timestamp is exported as JSON null instead of
        # raising AttributeError on None.
        "created_at": created_at.isoformat() if created_at is not None else None,
    }

    # Coerce so callers may pass a string path as well as a Path.
    output_file = Path(output_dir) / f"{doc.id}.json"
    # Explicit encoding keeps the file independent of the platform's locale
    # encoding.
    output_file.write_text(json.dumps(doc_data, indent=2), encoding="utf-8")
    return output_file

# Process a single PDF file and export it as JSON
pdf_path = Path("../tests/fixtures/sample.pdf")
if not pdf_path.exists():
    print(f"File not found: {pdf_path}")
else:
    doc_id = document_processor.process_file(pdf_path)
    # save_as_json returns None when the document cannot be found, so only
    # report success once an output file really exists (the directory cell
    # applies the same check).
    output_file = save_as_json(doc_id, output_dir) if doc_id else None
    if output_file:
        print(f"Processed {pdf_path.name} -> {output_file}")
    elif doc_id:
        print(f"Processed {pdf_path.name}, but document {doc_id} could not be exported")
    else:
        # Previously this case printed nothing at all
        print(f"Processing {pdf_path.name} did not produce a document")

## Process Directory

Now let's process all files in a directory

In [None]:
# Run the processor over the fixtures directory and export each document
input_dir = Path("../tests/fixtures")
if not (input_dir.exists() and input_dir.is_dir()):
    print(f"Directory not found: {input_dir}")
else:
    doc_ids = document_processor.process_directory(input_dir)
    for doc_id in doc_ids:
        output_file = save_as_json(doc_id, output_dir)
        if not output_file:
            # Nothing was exported for this id; move on without reporting
            continue
        print(f"Processed document {doc_id} -> {output_file}")

## View Processed Results

Let's list every exported JSON file and print a short summary of each one

In [None]:
# Print a short summary of every exported JSON file
print("Processed files:")
# glob() makes no ordering promise; sort so the listing is stable between runs
for json_file in sorted(output_dir.glob("*.json")):
    print(f"\n{json_file.name}:")
    data = json.loads(json_file.read_text(encoding="utf-8"))
    # The export stores the document's content as-is, so it can be null;
    # count that as zero characters instead of failing in len()
    content = data["content"] or ""
    print(f"- ID: {data['id']}")
    print(f"- Filename: {data['filename']}")
    print(f"- Content length: {len(content)} characters")
    print(f"- Created at: {data['created_at']}")

## Cleanup

Finally, let's close the database session and delete the demo database file

In [None]:
# Close database connection
db.close()
# Closing the session only releases its connection back to the engine, whose
# pool may keep it open. Dispose of the pool so no handle to demo.db remains
# before the file is deleted (an open handle makes the delete fail on Windows).
engine.dispose()

# Remove demo database; missing_ok makes re-running this cell harmless
Path("./demo.db").unlink(missing_ok=True)

print("Cleanup complete!")