# Docling Document Ingestion Pipeline

This notebook demonstrates document ingestion using Docling with LangChain integration. Docling provides state-of-the-art AI models for layout analysis and table structure recognition, enabling superior PDF processing compared to traditional loaders like PyPDFLoader.

## Key Features:
- **DoclingLoader**: Advanced PDF processing with AI-powered layout analysis
- **Intelligent Processing**: Superior text extraction and document structure recognition
- **Simple Integration**: Easy-to-use interface for document ingestion

In [None]:
# Section 1: Install and Import Required Libraries
import os
from pathlib import Path

# Import Docling and LangChain components
from langchain_docling import DoclingLoader
from langchain_docling.loader import ExportType

# Import utilities
from dotenv import load_dotenv

print("✓ Successfully imported Docling and LangChain libraries")
print(f"Available export types: {[export_type.name for export_type in ExportType]}")

# Load environment variables
load_dotenv()

# Configure tokenizers to avoid warnings
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
# Section 2: Setup Configuration

# Configuration parameters
EXPORT_TYPE = ExportType.DOC_CHUNKS  # Use DOC_CHUNKS for automatic intelligent chunking
DOCS_FOLDER = "docs"

print("Configuration Settings:")
print(f"  📄 Export Type: {EXPORT_TYPE.name}")
print(f"  📁 Documents Folder: {DOCS_FOLDER}")

In [None]:
# Section 2.5: Docling Installation Fix (Run if needed)

def check_docling_installation():
    """Check if Docling is properly installed and try to fix common issues."""
    try:
        from docling_parse.pdf_parser import pdf_parser_v2
        parser = pdf_parser_v2(level="fatal")
        print("✅ Docling installation appears to be working correctly")
        return True
    except RuntimeError as e:
        if "filename does not exists" in str(e):
            print("❌ Docling installation issue detected:")
            print(f"   Missing file: {str(e).split(':')[-1].strip()}")
            print("\n🔧 Attempting to fix...")
            
            # Try to reinstall docling-parse
            import subprocess
            import sys
            
            try:
                subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", "--force-reinstall", "docling-parse"])
                print("✅ Reinstalled docling-parse")
                return True
            except Exception as install_error:
                print(f"❌ Failed to reinstall: {install_error}")
                return False
        else:
            print(f"❌ Unknown Docling error: {e}")
            return False
    except Exception as e:
        print(f"❌ General error checking Docling: {e}")
        return False

# Check Docling installation
docling_working = check_docling_installation()
if not docling_working:
    print("\n⚠️  Docling has installation issues. The notebook will use PyPDFLoader as fallback.")
    print("   You may want to run: pip install --upgrade --force-reinstall docling-parse")

In [None]:
# Section 3: Scan for PDF Documents

pdf_files = []

# Check if docs folder exists and scan for PDF files
if os.path.exists(DOCS_FOLDER):
    # Get all PDF files in the docs folder
    for file in os.listdir(DOCS_FOLDER):
        if file.lower().endswith('.pdf'):
            pdf_files.append(os.path.join(DOCS_FOLDER, file))
    
    print(f"📁 Found {len(pdf_files)} PDF files in '{DOCS_FOLDER}' folder:")
    for i, pdf_file in enumerate(pdf_files, 1):
        file_size = os.path.getsize(pdf_file) / (1024 * 1024)  # Size in MB
        print(f"  {i}. {os.path.basename(pdf_file)} ({file_size:.1f} MB)")
        print(f"     Full path: {pdf_file}")
    
    if not pdf_files:
        print(f"⚠️  No PDF files found in '{DOCS_FOLDER}' folder")
else:
    print(f"❌ Error: '{DOCS_FOLDER}' folder not found!")
    print("Please ensure the docs folder exists with PDF files.")

print(f"\n📋 Total files to process: {len(pdf_files)}")

In [26]:
# Section 4: Load Documents with DoclingLoader (with Fallback)

def load_documents_with_docling(file_paths, export_type=ExportType.DOC_CHUNKS):
    """
    Load PDF documents using DoclingLoader with specified configuration.
    Includes fallback to PyPDFLoader if Docling fails.
    
    Args:
        file_paths: List of PDF file paths
        export_type: ExportType.DOC_CHUNKS or ExportType.MARKDOWN
    
    Returns:
        List of loaded documents and list of failed files
    """
    all_documents = []
    failed_files = []
    
    print(f"🚀 Starting Docling ingestion with {export_type.name} mode...")
    print("=" * 60)
    
    for pdf_file in file_paths:
        try:
            print(f"📖 Processing: {os.path.basename(pdf_file)}")
            
            # Try DoclingLoader first
            try:
                loader = DoclingLoader(
                    file_path=pdf_file,
                    export_type=export_type
                )
                documents = loader.load()
                print(f"  ✅ Docling: Successfully loaded {len(documents)} {'chunks' if export_type == ExportType.DOC_CHUNKS else 'documents'}")
                
            except Exception as docling_error:
                # Fallback to PyPDFLoader if Docling fails
                print(f"  ⚠️  Docling failed, trying PyPDFLoader fallback...")
                from langchain_community.document_loaders import PyPDFLoader
                
                loader = PyPDFLoader(pdf_file)
                documents = loader.load()
                print(f"  ✅ PyPDF: Successfully loaded {len(documents)} pages as fallback")
            
            # Add custom metadata
            for doc in documents:
                doc.metadata['source_file'] = pdf_file
                doc.metadata['file_name'] = os.path.basename(pdf_file)
                doc.metadata['processing_mode'] = export_type.name
                doc.metadata['loader_used'] = 'DoclingLoader' if 'DoclingLoader' in str(type(loader)) else 'PyPDFLoader'
            
            all_documents.extend(documents)
            
        except Exception as e:
            failed_files.append((pdf_file, str(e)))
            print(f"  ❌ Complete failure for {os.path.basename(pdf_file)}: {e}")
    
    print("=" * 60)
    print(f"🎯 Ingestion Summary:")
    print(f"  ✅ Successfully processed: {len(file_paths) - len(failed_files)} files")
    print(f"  ❌ Failed: {len(failed_files)} files")
    print(f"  📄 Total documents/chunks: {len(all_documents)}")
    
    if failed_files:
        print(f"\n⚠️  Failed files:")
        for file, error in failed_files:
            print(f"  - {os.path.basename(file)}: {error}")
    
    return all_documents, failed_files

# Load documents using DOC_CHUNKS mode with fallback support
if pdf_files:
    docling_documents, failed_files = load_documents_with_docling(pdf_files, EXPORT_TYPE)
    print(f"\n✅ Ingestion completed! Loaded {len(docling_documents)} document chunks.")
else:
    print("⚠️  No PDF files available for processing")

🚀 Starting Docling ingestion with DOC_CHUNKS mode...
📖 Processing: aml-ctf-statement-attention-of-cbl-transfer-agent-data.pdf


2025-09-24 16:21:32,101 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-09-24 16:21:32,105 - ERROR - An unexpected error occurred while opening the document aml-ctf-statement-attention-of-cbl-transfer-agent-data.pdf
Traceback (most recent call last):
  File "c:\workspace-new\Deutsche Börse\.venv\Lib\site-packages\docling\datamodel\document.py", line 152, in __init__
    self._init_doc(backend, path_or_stream)
    ~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\workspace-new\Deutsche Börse\.venv\Lib\site-packages\docling\datamodel\document.py", line 188, in _init_doc
    self._backend = backend(self, path_or_stream=path_or_stream)
                    ~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\workspace-new\Deutsche Börse\.venv\Lib\site-packages\docling\backend\docling_parse_v4_backend.py", line 197, in __init__
    self.parser = DoclingPdfParser(loglevel="fatal")
                  ~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^
  File "c:\workspace-new\Deutsche Börse\.venv

  ⚠️  Docling failed, trying PyPDFLoader fallback...
  ✅ PyPDF: Successfully loaded 7 pages as fallback
📖 Processing: Canadian Collateral Management Services (CCMS).pdf
  ✅ PyPDF: Successfully loaded 7 pages as fallback
📖 Processing: Canadian Collateral Management Services (CCMS).pdf


2025-09-24 16:21:32,961 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-09-24 16:21:32,966 - ERROR - An unexpected error occurred while opening the document Canadian Collateral Management Services (CCMS).pdf
Traceback (most recent call last):
  File "c:\workspace-new\Deutsche Börse\.venv\Lib\site-packages\docling\datamodel\document.py", line 152, in __init__
    self._init_doc(backend, path_or_stream)
    ~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\workspace-new\Deutsche Börse\.venv\Lib\site-packages\docling\datamodel\document.py", line 188, in _init_doc
    self._backend = backend(self, path_or_stream=path_or_stream)
                    ~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\workspace-new\Deutsche Börse\.venv\Lib\site-packages\docling\backend\docling_parse_v4_backend.py", line 197, in __init__
    self.parser = DoclingPdfParser(loglevel="fatal")
                  ~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^
  File "c:\workspace-new\Deutsche Börse\.venv\Lib\sit

  ⚠️  Docling failed, trying PyPDFLoader fallback...
  ✅ PyPDF: Successfully loaded 3 pages as fallback
📖 Processing: cbl-aml-questionnaire-data.pdf
  ✅ PyPDF: Successfully loaded 3 pages as fallback
📖 Processing: cbl-aml-questionnaire-data.pdf


2025-09-24 16:21:33,719 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-09-24 16:21:33,725 - ERROR - An unexpected error occurred while opening the document cbl-aml-questionnaire-data.pdf
Traceback (most recent call last):
  File "c:\workspace-new\Deutsche Börse\.venv\Lib\site-packages\docling\datamodel\document.py", line 152, in __init__
    self._init_doc(backend, path_or_stream)
    ~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\workspace-new\Deutsche Börse\.venv\Lib\site-packages\docling\datamodel\document.py", line 188, in _init_doc
    self._backend = backend(self, path_or_stream=path_or_stream)
                    ~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\workspace-new\Deutsche Börse\.venv\Lib\site-packages\docling\backend\docling_parse_v4_backend.py", line 197, in __init__
    self.parser = DoclingPdfParser(loglevel="fatal")
                  ~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^
  File "c:\workspace-new\Deutsche Börse\.venv\Lib\site-packages\docling_p

  ⚠️  Docling failed, trying PyPDFLoader fallback...
  ✅ PyPDF: Successfully loaded 13 pages as fallback
📖 Processing: Disclosure Requirements – Investment Funds –Denmark.pdf
  ✅ PyPDF: Successfully loaded 13 pages as fallback
📖 Processing: Disclosure Requirements – Investment Funds –Denmark.pdf


2025-09-24 16:21:35,136 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-09-24 16:21:35,138 - ERROR - An unexpected error occurred while opening the document Disclosure Requirements – Investment Funds –Denmark.pdf
Traceback (most recent call last):
  File "c:\workspace-new\Deutsche Börse\.venv\Lib\site-packages\docling\datamodel\document.py", line 152, in __init__
    self._init_doc(backend, path_or_stream)
    ~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\workspace-new\Deutsche Börse\.venv\Lib\site-packages\docling\datamodel\document.py", line 188, in _init_doc
    self._backend = backend(self, path_or_stream=path_or_stream)
                    ~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\workspace-new\Deutsche Börse\.venv\Lib\site-packages\docling\backend\docling_parse_v4_backend.py", line 197, in __init__
    self.parser = DoclingPdfParser(loglevel="fatal")
                  ~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^
  File "c:\workspace-new\Deutsche Börse\.venv\Li

  ⚠️  Docling failed, trying PyPDFLoader fallback...
  ✅ PyPDF: Successfully loaded 3 pages as fallback
📖 Processing: Holding Restrictions – Investment Funds – Ireland.pdf


2025-09-24 16:21:35,772 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-09-24 16:21:35,774 - ERROR - An unexpected error occurred while opening the document Holding Restrictions – Investment Funds – Ireland.pdf
Traceback (most recent call last):
  File "c:\workspace-new\Deutsche Börse\.venv\Lib\site-packages\docling\datamodel\document.py", line 152, in __init__
    self._init_doc(backend, path_or_stream)
    ~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\workspace-new\Deutsche Börse\.venv\Lib\site-packages\docling\datamodel\document.py", line 188, in _init_doc
    self._backend = backend(self, path_or_stream=path_or_stream)
                    ~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\workspace-new\Deutsche Börse\.venv\Lib\site-packages\docling\backend\docling_parse_v4_backend.py", line 197, in __init__
    self.parser = DoclingPdfParser(loglevel="fatal")
                  ~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^
  File "c:\workspace-new\Deutsche Börse\.venv\Lib\

  ⚠️  Docling failed, trying PyPDFLoader fallback...
  ✅ PyPDF: Successfully loaded 2 pages as fallback
📖 Processing: Holding Restrictions – Investment Funds –Denmark.pdf


2025-09-24 16:21:36,336 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-09-24 16:21:36,339 - ERROR - An unexpected error occurred while opening the document Holding Restrictions – Investment Funds –Denmark.pdf
Traceback (most recent call last):
  File "c:\workspace-new\Deutsche Börse\.venv\Lib\site-packages\docling\datamodel\document.py", line 152, in __init__
    self._init_doc(backend, path_or_stream)
    ~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\workspace-new\Deutsche Börse\.venv\Lib\site-packages\docling\datamodel\document.py", line 188, in _init_doc
    self._backend = backend(self, path_or_stream=path_or_stream)
                    ~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\workspace-new\Deutsche Börse\.venv\Lib\site-packages\docling\backend\docling_parse_v4_backend.py", line 197, in __init__
    self.parser = DoclingPdfParser(loglevel="fatal")
                  ~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^
  File "c:\workspace-new\Deutsche Börse\.venv\Lib\s

  ⚠️  Docling failed, trying PyPDFLoader fallback...
  ✅ PyPDF: Successfully loaded 2 pages as fallback
🎯 Ingestion Summary:
  ✅ Successfully processed: 6 files
  ❌ Failed: 0 files
  📄 Total documents/chunks: 30

✅ Ingestion completed! Loaded 30 document chunks.


In [None]:
# Section 5: Display Basic Document Information

if 'docling_documents' in locals() and docling_documents:
    print("📋 Document Processing Results")
    print("=" * 50)
    
    # Group documents by source file
    files_info = {}
    loader_stats = {}
    
    for doc in docling_documents:
        file_name = doc.metadata.get('file_name', 'Unknown')
        loader_used = doc.metadata.get('loader_used', 'Unknown')
        
        if file_name not in files_info:
            files_info[file_name] = []
        files_info[file_name].append(doc)
        
        # Count loader usage
        if loader_used not in loader_stats:
            loader_stats[loader_used] = 0
        loader_stats[loader_used] += 1
    
    # Display loader statistics
    print("🔧 Loader Usage Statistics:")
    for loader, count in loader_stats.items():
        print(f"  {loader}: {count} documents")
    
    # Display summary for each file
    for file_name, docs in files_info.items():
        total_chars = sum(len(doc.page_content) for doc in docs)
        avg_chunk_size = total_chars // len(docs) if docs else 0
        loader_used = docs[0].metadata.get('loader_used', 'Unknown')
        
        print(f"\n📄 {file_name}")
        print(f"  Documents/Chunks: {len(docs)}")
        print(f"  Total characters: {total_chars:,}")
        print(f"  Average chunk size: {avg_chunk_size:,} characters")
        print(f"  Processed with: {loader_used}")
        
        # Show sample content from first document
        if docs:
            sample_content = docs[0].page_content[:150].strip().replace('\n', ' ')
            print(f"  Sample: {sample_content}...")
    
    print(f"\n🎯 Total processed: {len(docling_documents)} document chunks ready for use!")
else:
    print("⚠️  No documents processed. Please run the document loading cell first.")