This cell processes all PDFs in the input folder and extracts structured markdown text using Docling. Key variables include:

- `input_dir`: The folder containing source PDF files.
- `output_dir`: The folder where each PDF's outputs are saved: the chunks (`.json`), the extracted markdown (`.md`), and the chunk statistics (`_stats.json`).
- `base_name`: The name of the file (without extension), used to generate matching output filenames.

Before processing, the script checks whether a file has already been chunked by looking for a `.json` file in the output directory with the same base name. If it exists, the PDF is skipped to avoid redundant processing.


In [1]:
import os
import json
import hashlib
from typing import List, Dict, Any
from langchain_core.documents import Document as LCDocument
from langchain.text_splitter import RecursiveCharacterTextSplitter
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import InputFormat
# Import the Docling pipeline option classes, tolerating layout changes between
# Docling releases. The fallbacks are nested because a second `except` clause on
# the same `try` never sees an ImportError raised inside the first handler.
try:
    from docling.datamodel.pipeline_options import PdfPipelineOptions, AcceleratorOptions, AcceleratorDevice
except ImportError:
    try:
        # Fallback for older versions
        # NOTE(review): confirm this module path against the targeted Docling release.
        from docling.datamodel.pipeline_options import PdfPipelineOptions, AcceleratorOptions
        from docling.datamodel.acceleration_options import AcceleratorDevice
    except ImportError:
        # Fallback for even older versions or different structure:
        # run without accelerator settings (both names are left as None).
        from docling.datamodel.pipeline_options import PdfPipelineOptions
        AcceleratorOptions = None
        AcceleratorDevice = None

class DoclingBookLoader:
    """Convert one PDF to markdown through a Docling ``DocumentConverter``.

    The converter is built once, at construction time, with OCR and
    table-structure recognition requested.
    """

    def __init__(self, file_path: str) -> None:
        """Store *file_path* and configure the PDF converter."""
        self.file_path = file_path

        # Options requested regardless of the installed Docling version.
        option_kwargs = {"do_ocr": True, "do_table_structure": True}

        # Accelerator settings are only passed when both names were importable.
        if AcceleratorOptions and AcceleratorDevice:
            option_kwargs["accelerator_options"] = AcceleratorOptions(
                num_threads=8,
                device=AcceleratorDevice.AUTO,
            )

        pipeline_options = PdfPipelineOptions(**option_kwargs)

        # Cell matching is switched on only when this options object exposes
        # table-structure settings.
        if hasattr(pipeline_options, "table_structure_options"):
            table_options = pipeline_options.table_structure_options
            table_options.do_cell_matching = True

        pdf_format = PdfFormatOption(pipeline_options=pipeline_options)
        self.converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format})

    def extract_text(self) -> str:
        """Convert the stored PDF and return the document exported as markdown."""
        conversion = self.converter.convert(self.file_path)
        return conversion.document.export_to_markdown()

def create_smart_chunks(text: str, metadata: dict, target_chunk_size=3000, overlap_size=300) -> List[LCDocument]:
    """
    Split markdown text into chunks of roughly ``target_chunk_size`` characters.

    Strategy:
      1. Split on the shallowest markdown header level ("# ", "## ", "### ")
         that occurs after a newline, then pack consecutive sections into a
         chunk until adding the next section would exceed the target size.
         Every chunk after the first is prefixed with an overlap taken from
         the end of the previous chunk via ``get_smart_overlap``.
      2. If no such header exists, fall back to
         ``RecursiveCharacterTextSplitter`` over the whole text.
      3. Section-based chunks longer than 1.5x the target are split again with
         the recursive splitter and tagged as sub-chunks of their parent.

    Args:
        text: Markdown text to chunk.
        metadata: Base metadata attached to every chunk.
        target_chunk_size: Desired chunk length in characters.
        overlap_size: Desired overlap in characters; capped at 20% of
            ``target_chunk_size``.

    Returns:
        The chunks in document order. Each chunk's metadata carries
        ``chunk_id``, ``content_hash``, ``chunk_size``, ``word_count``,
        ``is_sub_chunk`` and ``chunking_method``.
    """

    # Keep the overlap at no more than 20% of the chunk size.
    if overlap_size / target_chunk_size > 0.2:
        overlap_size = int(target_chunk_size * 0.2)
        print(f"⚠️  Capped overlap to 20% of chunk size: {overlap_size} chars")

    # Pick the shallowest header level that actually splits the text. The
    # marker is remembered so it can be re-attached to each section, because
    # str.split() removes it.
    header_marker = None
    major_sections = [text]
    for marker in ('# ', '## ', '### '):
        candidate_sections = text.split('\n' + marker)
        if len(candidate_sections) > 1:
            header_marker = marker
            major_sections = candidate_sections
            break

    if header_marker is None:
        # No headers at all: fall back to recursive splitting with overlap.
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=target_chunk_size,
            chunk_overlap=overlap_size,
            separators=["\n\n", "\n", ". ", "! ", "? ", " ", ""],
            keep_separator=True
        )
        doc = LCDocument(page_content=text, metadata=metadata)
        split_chunks = splitter.split_documents([doc])

        chunks = []
        for i, chunk in enumerate(split_chunks, 1):
            chunk.metadata.update({
                "chunk_id": f"chunk_{i:03d}",
                "content_hash": hashlib.md5(chunk.page_content.encode()).hexdigest()[:12],
                "chunk_size": len(chunk.page_content),
                "word_count": len(chunk.page_content.split()),
                "is_sub_chunk": False,
                "chunking_method": "recursive_with_overlap"
            })
            chunks.append(chunk)

        return chunks

    # Section-based packing.
    chunks = []
    previous_chunk_end = ""
    chunk_id = 1
    current_chunk = ""

    for i, section in enumerate(major_sections):
        # Re-add the header marker (the first piece precedes the first header).
        if i > 0:
            section = header_marker + section

        # If adding this section would exceed the target size, emit the chunk
        # accumulated so far and start a new one with this section.
        if len(current_chunk) + len(section) > target_chunk_size and current_chunk:
            chunk_content = previous_chunk_end + current_chunk
            chunks.append(create_chunk_document(chunk_content, metadata, chunk_id))

            # The overlap comes from the chunk's own text, not from the
            # overlap that was prepended to it.
            previous_chunk_end = get_smart_overlap(current_chunk, overlap_size)
            current_chunk = section
            chunk_id += 1
        else:
            current_chunk += ("\n" if current_chunk else "") + section

    # Emit whatever is left.
    if current_chunk.strip():
        chunk_content = previous_chunk_end + current_chunk
        chunks.append(create_chunk_document(chunk_content, metadata, chunk_id))

    # A single section can still be far larger than the target; split those.
    oversize_splitter = RecursiveCharacterTextSplitter(
        chunk_size=target_chunk_size,
        chunk_overlap=overlap_size,
        separators=["\n\n", "\n", ". ", " ", ""],
        keep_separator=True
    )

    final_chunks = []
    for chunk in chunks:
        if len(chunk.page_content) > target_chunk_size * 1.5:
            parent_id = chunk.metadata['chunk_id']
            sub_chunks = oversize_splitter.split_documents([chunk])

            for j, sub_chunk in enumerate(sub_chunks, 1):
                piece = sub_chunk.page_content
                # Sub-chunks start from the parent's metadata, so every
                # content-derived field must be recomputed for the piece.
                sub_chunk.metadata.update({
                    "chunk_id": f"{parent_id}.{j}",
                    "parent_chunk": parent_id,
                    "content_hash": hashlib.md5(piece.encode()).hexdigest()[:12],
                    "chunk_size": len(piece),
                    "word_count": len(piece.split()),
                    "starts_with": piece[:100].replace('\n', ' '),
                    "is_sub_chunk": True,
                    "chunking_method": "hybrid_section_recursive"
                })
                final_chunks.append(sub_chunk)
        else:
            chunk.metadata["chunking_method"] = "section_based"
            final_chunks.append(chunk)

    return final_chunks

def get_smart_overlap(text: str, overlap_size: int) -> str:
    """
    Return the tail of ``text`` to carry into the next chunk as overlap.

    The candidate overlap is the last ``overlap_size`` characters. If a
    sentence or paragraph separator lies in the second half of that window,
    only the text after the last such separator is returned, so the overlap
    starts on a sentence boundary. Otherwise the same is tried with a single
    newline located past 30% of the window. Failing both, the raw window is
    returned.

    Args:
        text: Text of the chunk that was just closed.
        overlap_size: Maximum overlap length in characters.

    Returns:
        The overlap text: ``""`` when ``overlap_size`` is not positive, and
        the whole ``text`` when it is no longer than ``overlap_size``.
    """
    # text[-0:] is the *whole* string, so "no overlap" must be handled
    # explicitly instead of falling through to the slice below.
    if overlap_size <= 0:
        return ""

    if len(text) <= overlap_size:
        return text

    overlap_text = text[-overlap_size:]

    # Prefer starting right after the last sentence/paragraph break, as long
    # as that break sits in the second half of the window.
    for separator in ['. ', '! ', '? ', '\n\n']:
        last_sep = overlap_text.rfind(separator)
        if last_sep > overlap_size * 0.5:
            return overlap_text[last_sep + len(separator):]

    # Otherwise start after the last line break, if it is past 30% of the window.
    last_para = overlap_text.rfind('\n')
    if last_para > overlap_size * 0.3:
        return overlap_text[last_para + 1:]

    # Fall back to a plain character boundary.
    return overlap_text

def create_chunk_document(content: str, base_metadata: dict, chunk_id: int) -> LCDocument:
    """
    Wrap chunk text in an ``LCDocument`` with descriptive metadata.

    All content-derived metadata (hash, size, word count, preview) is computed
    from the same stripped text that is stored as ``page_content``, so the
    metadata always describes exactly what the document contains.

    Args:
        content: Raw chunk text; surrounding whitespace is removed.
        base_metadata: Metadata shared by every chunk of the source document.
            Keys set here override same-named keys from it.
        chunk_id: Sequence number, rendered as ``chunk_NNN``.

    Returns:
        The document for this chunk.
    """
    body = content.strip()

    # Short content fingerprint (first 12 hex digits of the MD5 digest).
    content_hash = hashlib.md5(body.encode()).hexdigest()[:12]

    # Use the first markdown header found in the first three lines as the title.
    section_title = ""
    for line in body.split('\n')[:3]:
        if line.startswith('#'):
            section_title = line.strip('#').strip()
            break

    enhanced_metadata = {
        **base_metadata,
        "chunk_id": f"chunk_{chunk_id:03d}",
        "content_hash": content_hash,
        "chunk_size": len(body),
        "word_count": len(body.split()),
        "section_title": section_title,
        "starts_with": body[:100].replace('\n', ' '),
        "is_sub_chunk": False,
        "chunking_method": "section_based_with_overlap"
    }

    return LCDocument(page_content=body, metadata=enhanced_metadata)

def analyze_chunks(chunks: List[LCDocument]) -> Dict[str, Any]:
    """Summarise chunk sizes and word counts; returns ``{}`` for no chunks.

    Sizes are measured on ``page_content``. Word counts are read from each
    chunk's ``word_count`` metadata entry (0 when absent).
    """
    if not chunks:
        return {}

    sizes = []
    word_counts = []
    titled = 0
    for doc in chunks:
        sizes.append(len(doc.page_content))
        word_counts.append(doc.metadata.get('word_count', 0))
        if doc.metadata.get('section_title'):
            titled += 1

    chunk_total = len(chunks)
    char_total = sum(sizes)
    word_total = sum(word_counts)

    return {
        "total_chunks": chunk_total,
        "avg_chunk_size": char_total / len(sizes),
        "min_chunk_size": min(sizes),
        "max_chunk_size": max(sizes),
        "avg_word_count": word_total / len(word_counts),
        "sections_with_titles": titled,
        "total_characters": char_total,
        "total_words": word_total
    }

def save_outputs(base_name: str, output_dir: str, markdown: str, chunks: List[LCDocument]):
    """
    Write the markdown, the chunk list and the chunk statistics for one document.

    Three files are written into ``output_dir``: ``<base_name>.md``,
    ``<base_name>.json`` (a list of ``{"content", "metadata"}`` objects) and
    ``<base_name>_stats.json``.

    Args:
        base_name: Output file name without extension.
        output_dir: Existing directory to write into.
        markdown: Full markdown text of the document.
        chunks: Chunks to serialise; may be empty.
    """
    json_path = os.path.join(output_dir, f"{base_name}.json")
    md_path = os.path.join(output_dir, f"{base_name}.md")
    stats_path = os.path.join(output_dir, f"{base_name}_stats.json")

    # Serialise everything before touching the disk, so a value that cannot be
    # encoded as JSON fails here instead of leaving a truncated file behind.
    chunk_payload = json.dumps(
        [{"content": c.page_content, "metadata": c.metadata} for c in chunks],
        indent=2,
        ensure_ascii=False
    )
    stats = analyze_chunks(chunks)
    stats_payload = json.dumps(stats, indent=2)

    # Save markdown
    with open(md_path, "w", encoding="utf-8") as f_md:
        f_md.write(markdown)

    # Save chunks
    with open(json_path, "w", encoding="utf-8") as f_json:
        f_json.write(chunk_payload)

    # Save analytics
    with open(stats_path, "w", encoding="utf-8") as f_stats:
        f_stats.write(stats_payload)

    # The statistics dict is empty when there are no chunks, so read it defensively.
    total_chunks = stats.get("total_chunks", 0)
    avg_chunk_size = stats.get("avg_chunk_size", 0)
    print(f"✅ Saved {base_name} - {total_chunks} chunks, avg size: {avg_chunk_size:.0f} chars")

def process_pdf_folder(input_folder: str, output_folder: str, chunk_size: int = 3000, overlap_size: int = 300):
    """
    Convert every PDF in ``input_folder`` to markdown, chunk it and save the results.

    A PDF is skipped when ``<output_folder>/<base name>.json`` already exists.
    A failure on one file is reported and does not stop the remaining files.

    Args:
        input_folder: Directory containing the source ``.pdf`` files.
        output_folder: Directory for the outputs; created if missing.
        chunk_size: Target chunk size in characters; must be positive.
        overlap_size: Overlap between consecutive chunks, in characters.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        # Fail with a clear message instead of a ZeroDivisionError further down.
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    os.makedirs(output_folder, exist_ok=True)

    # os.listdir() order is arbitrary; sort so runs are reproducible.
    pdf_files = sorted(f for f in os.listdir(input_folder) if f.lower().endswith(".pdf"))
    print(f"📂 Found {len(pdf_files)} PDFs to process.")
    print(f"🎯 Target chunk size: {chunk_size} characters")
    print(f"🔄 Smart overlap: {overlap_size} characters ({overlap_size/chunk_size*100:.1f}%)")

    total_stats = {
        "files_processed": 0,
        "total_chunks": 0,
        "total_characters": 0,
        "avg_chunk_size_across_files": []
    }

    for filename in pdf_files:
        base_name = os.path.splitext(filename)[0]
        json_output = os.path.join(output_folder, f"{base_name}.json")

        # The chunk JSON doubles as the "already processed" marker.
        if os.path.exists(json_output):
            print(f"⏩ Skipping {filename} (already processed)")
            continue

        pdf_path = os.path.join(input_folder, filename)
        print(f"\n📄 Processing: {filename}")

        try:
            loader = DoclingBookLoader(pdf_path)
            markdown = loader.extract_text()
            metadata = {
                "source": filename,
                # NOTE: despite the key name, this is the PDF's modification time.
                "processing_timestamp": str(os.path.getmtime(pdf_path)),
                "chunk_strategy": "smart_semantic_with_overlap",
                "chunk_size": chunk_size,
                "overlap_size": overlap_size
            }

            chunks = create_smart_chunks(markdown, metadata, chunk_size, overlap_size)
            save_outputs(base_name, output_folder, markdown, chunks)

            # Update total stats. The dict is empty for a document that
            # produced no chunks, so read it defensively.
            stats = analyze_chunks(chunks)
            total_stats["files_processed"] += 1
            total_stats["total_chunks"] += stats.get("total_chunks", 0)
            total_stats["total_characters"] += stats.get("total_characters", 0)
            if stats:
                total_stats["avg_chunk_size_across_files"].append(stats["avg_chunk_size"])

        except Exception as e:
            # Best effort: report the failure and carry on with the next file.
            print(f"❌ Error processing {filename}: {str(e)}")

    # Print summary
    if total_stats["files_processed"] > 0:
        per_file_averages = total_stats["avg_chunk_size_across_files"]
        # Mean of the per-file averages; 0 when no file produced any chunk.
        overall_avg = sum(per_file_averages) / len(per_file_averages) if per_file_averages else 0
        print(f"\n📊 Processing Summary:")
        print(f"   Files processed: {total_stats['files_processed']}")
        print(f"   Total chunks created: {total_stats['total_chunks']}")
        print(f"   Average chunk size: {overall_avg:.0f} characters")
        print(f"   Total content: {total_stats['total_characters']:,} characters")
        print(f"   Overlap strategy: {overlap_size} chars ({overlap_size/chunk_size*100:.1f}%)")

if __name__ == "__main__":
    # Overlap sizing guidance:
    #   200-300 chars ≈ 1-2 sentences (conservative, good for most cases)
    #   400-500 chars ≈ 2-3 sentences (more context preservation)
    #   10-15% of chunk_size is typically optimal
    chunk_size = 5000      # Main chunk size
    overlap_size = 500     # Smart overlap (10% of chunk size)

    # PDFs are read from `pdfs/`; results are written to `outputs/`.
    input_dir = "pdfs/"
    output_dir = "outputs/"

    process_pdf_folder(
        input_folder=input_dir,
        output_folder=output_dir,
        chunk_size=chunk_size,
        overlap_size=overlap_size,
    )

  from .autonotebook import tqdm as notebook_tqdm


📂 Found 1 PDFs to process.
🎯 Target chunk size: 5000 characters
🔄 Smart overlap: 500 characters (10.0%)
⏩ Skipping curry_et_al-2018-british_journal_of_haematology.pdf (already processed)


### Embedding & Uploading to Pinecone

This cell loads environment variables, initializes the Pinecone client, and uploads document embeddings to a Pinecone vector index:

- It loads the `PINECONE_API_KEY` from a `.env` file and connects to a Pinecone index named `medical-rag-index`, creating it as a serverless index in the `us-east-1` AWS region if it does not already exist.
- It uses the `intfloat/e5-base` model from SentenceTransformers to embed each document chunk.
- Chunks are read from JSON files in the `outputs/` folder, embedded, and uploaded in batches to the Pinecone index under the `thera-rag` namespace.


In [2]:
import os
import json
import uuid
from tqdm import tqdm
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec

In [3]:
import os
import json
import uuid
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec
from tqdm import tqdm

# Load env variables
load_dotenv()
pinecone_api_key = os.getenv("PINECONE_API_KEY")

# Raise explicitly instead of using `assert`: assertions are stripped when
# Python runs with -O, which would let a missing key through to the client.
if not pinecone_api_key:
    raise RuntimeError("Missing Pinecone env vars in .env")

# ✅ Initialize Pinecone client
pc = Pinecone(api_key=pinecone_api_key)

# Settings
index_name = "medical-rag-index"
namespace = "thera-rag"
embed_dim = 768  # e5-base

# 🔧 Create index if it doesn't exist
if index_name not in pc.list_indexes().names():
    cloud = "aws"
    region = "us-east-1"

    pc.create_index(
        name=index_name,
        dimension=embed_dim,
        metric="cosine",
        spec=ServerlessSpec(cloud=cloud, region=region)
    )
    print(f"✅ Created index '{index_name}' with serverless backend in {cloud}/{region}")

index = pc.Index(index_name)

# Load model
embedder = SentenceTransformer("intfloat/e5-base")
print("🔍 Model loaded")

def extract_chunk_data(entry):
    """
    Pull ``(content, metadata)`` out of one chunk entry, whatever its shape.

    Supported shapes, in order of precedence:
      * dict with "content" or "page_content": that value, plus the dict's
        "metadata" entry (``{}`` when absent);
      * plain string: the string itself with empty metadata;
      * dict with "text", "body" or "chunk": that value, with every other
        key of the dict used as metadata;
      * anything else: ``str(entry)`` with empty metadata.
    """
    if isinstance(entry, str):
        return entry, {}

    if isinstance(entry, dict):
        # Native output format first, then the LangChain-style key.
        for primary_key in ("content", "page_content"):
            if primary_key in entry:
                return entry[primary_key], entry.get("metadata", {})

        # Other common text fields: the remaining keys become the metadata.
        for text_key in ("text", "body", "chunk"):
            if text_key in entry:
                leftovers = {k: v for k, v in entry.items() if k != text_key}
                return entry[text_key], leftovers

    # Unknown shape: fall back to its string form.
    return str(entry), {}

def inspect_json_structure(file_path, max_entries=3):
    """Load a JSON file, print a short description of its shape, and return the data.

    For a non-empty list, the type and keys of the first entry are printed,
    followed by a 100-character preview of up to ``max_entries`` entries.
    For a dict, its keys are printed.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        data = json.load(handle)

    is_list = isinstance(data, list)
    is_dict = isinstance(data, dict)
    length = len(data) if (is_list or is_dict) else 'N/A'

    print(f"\n🔍 Inspecting {file_path}:")
    print(f"   Type: {type(data)}")
    print(f"   Length: {length}")

    if is_list and data:
        first = data[0]
        first_keys = list(first.keys()) if isinstance(first, dict) else 'Not a dict'
        print(f"   First entry type: {type(first)}")
        print(f"   First entry keys: {first_keys}")

        # Preview the leading entries.
        for position, entry in enumerate(data[:max_entries], start=1):
            print(f"   Entry {position}: {str(entry)[:100]}...")
    elif is_dict:
        print(f"   Dict keys: {list(data.keys())}")

    return data

# Upload configuration and running state
chunk_dir = "outputs"
batch_size = 100
vectors = []
total_processed = 0

print(f"\n📁 Reading chunks from: {chunk_dir}")

# Chunk files are every *.json in the folder except the *_stats.json files.
json_files = [
    name for name in os.listdir(chunk_dir)
    if name.endswith(".json") and not name.endswith("_stats.json")
]
print(f"Found {len(json_files)} JSON files to process")

for file in json_files:
    file_path = os.path.join(chunk_dir, file)
    file_stem = os.path.splitext(file)[0]

    try:
        # Load the file (this also prints a short structure report).
        chunks = inspect_json_structure(file_path)

        print(f"\n🧠 Embedding chunks from {file}...")

        # Only a list of entries, or a dict that wraps one, can be processed.
        if not isinstance(chunks, (list, dict)):
            print(f"⚠️ Unexpected data type in {file}: {type(chunks)}")
            continue

        if isinstance(chunks, list):
            chunk_list = chunks
        else:
            # Look under "chunks", then "data"; otherwise treat the dict
            # itself as a single entry.
            chunk_list = chunks.get('chunks', chunks.get('data', [chunks]))

        for i, entry in enumerate(tqdm(chunk_list, desc=f"Processing {file}")):
            try:
                content, metadata = extract_chunk_data(entry)

                # Nothing to embed for empty or whitespace-only content.
                if not (content and content.strip()):
                    continue

                # Vector id: file stem plus the chunk's own id (or a positional one).
                chunk_id = metadata.get("chunk_id", f"{file}_{i:03d}")
                unique_id = f"{file_stem}_{chunk_id}"

                # E5 models expect a "passage: " prefix on indexed text.
                embedding = embedder.encode(
                    f"passage: {content.strip()}",
                    normalize_embeddings=True,
                )

                # Fields derived here come first; the chunk's own metadata is
                # merged on top, with non-scalar values stringified.
                pinecone_metadata = {
                    "text": content[:500],  # Truncate for Pinecone metadata limits
                    "source_file": file,
                    "chunk_index": i,
                    "chunk_size": len(content),
                    "word_count": len(content.split())
                }
                pinecone_metadata.update(
                    (key, value if isinstance(value, (str, int, float, bool)) else str(value))
                    for key, value in metadata.items()
                )

                vectors.append({
                    "id": unique_id,
                    "values": embedding.tolist(),
                    "metadata": pinecone_metadata
                })
                total_processed += 1

                # Flush a full batch.
                if len(vectors) >= batch_size:
                    index.upsert(vectors=vectors, namespace=namespace)
                    print(f"⬆️  Uploaded batch: {len(vectors)} vectors (Total: {total_processed})")
                    vectors = []

            except Exception as e:
                # Report the failure and move on to the next entry.
                print(f"❌ Error processing entry {i} in {file}: {str(e)}")

    except Exception as e:
        # Report the failure and move on to the next file.
        print(f"❌ Error processing file {file}: {str(e)}")

# Upload whatever is left over from the last partial batch.
if vectors:
    index.upsert(vectors=vectors, namespace=namespace)
    print(f"⬆️  Final batch: Uploaded {len(vectors)} vectors.")

# Print summary
stats = index.describe_index_stats()
print(f"\n✅ Done! Processed {total_processed} chunks total.")
print(f"📊 Pinecone index stats:")
print(f"   Total vectors: {stats.total_vector_count}")

# Look the namespace up without a `{}` default: a plain dict has no
# `vector_count` attribute, so the old default raised AttributeError whenever
# the index had other namespaces but not this one.
namespace_summary = (stats.namespaces or {}).get(namespace)
namespace_count = namespace_summary.vector_count if namespace_summary is not None else 'Not found'
print(f"   Namespace '{namespace}': {namespace_count}")

# Test a quick query to verify everything works
if total_processed > 0:
    print(f"\n🧪 Testing query...")
    test_query = "medical treatment"
    # E5 models expect a "query: " prefix on search text.
    test_embedding = embedder.encode(f"query: {test_query}", normalize_embeddings=True)

    results = index.query(
        vector=test_embedding.tolist(),
        top_k=3,
        namespace=namespace,
        include_metadata=True
    )

    print(f"✅ Query test successful! Found {len(results.matches)} results for '{test_query}'")

🔍 Model loaded

📁 Reading chunks from: outputs
Found 1 JSON files to process

🔍 Inspecting outputs\curry_et_al-2018-british_journal_of_haematology.json:
   Type: <class 'list'>
   Length: 24
   First entry type: <class 'dict'>
   First entry keys: ['content', 'metadata']
   Entry 1: {'content': '<!-- image -->\n\n## The use of viscoelastic haemostatic assays in the management of ma...
   Entry 2: {'content': 'The Association of Anaesthetists of Great Britain and Ireland endorsed the document.\n#...
   Entry 3: {'content': "<!-- image -->\n\n|                 | TEG 5000 (cup and pin method)         | TEG 6s (c...

🧠 Embedding chunks from curry_et_al-2018-british_journal_of_haematology.json...


Processing curry_et_al-2018-british_journal_of_haematology.json: 100%|██████████| 24/24 [00:26<00:00,  1.09s/it]


⬆️  Final batch: Uploaded 24 vectors.

✅ Done! Processed 24 chunks total.
📊 Pinecone index stats:
   Total vectors: 24
   Namespace 'thera-rag': 24

🧪 Testing query...
✅ Query test successful! Found 3 results for 'medical treatment'
