In [1]:
"""
IFRS Document Processing Pipeline
=================================

This pipeline processes IFRS PDF documents using Docling for parsing and LangChain for processing,
creating embeddings with OpenAI, and storing the results in a Pandas DataFrame saved as Parquet.

Pipeline Flow:
PDFs → Docling (with VLM enrichment) → Smart Chunking → Dual Categorization → 
OpenAI Embeddings → Pandas DataFrame → Parquet Storage

Chunking Strategy:
-----------------
The pipeline uses MarkdownHeaderTextSplitter for hierarchical chunking, which is ideal for IFRS documents:
- Preserves document hierarchy (Standard > Section > Subsection > Paragraph)
- Splits on semantic boundaries (headers) rather than arbitrary token counts
- Maintains section context in metadata for better categorization
- Respects the natural structure of financial standards

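Installation Requirements:
-------------------------
pip install docling
pip install langchain              # provides langchain.schema.Document
pip install langchain-openai
pip install langchain-text-splitters
pip install langchain-docling
pip install pandas
pip install pyarrow                # Parquet engine for pandas (fastparquet also works)
pip install numpy
pip install tqdm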

Important Import Notes:
----------------------
Based on LangChain documentation, the correct imports are:
    from langchain_docling import DoclingLoader
    from langchain_docling.loader import ExportType

Key Features:
- Advanced PDF parsing with table and figure extraction
- Vision Language Model (VLM) support for chart descriptions
- Hierarchical-aware chunking that respects IFRS document structure
- Dual categorization system using embedding similarity
- Rich metadata preservation
- Comprehensive logging and progress tracking
"""

import logging
from pathlib import Path
from typing import List, Dict, Tuple, Optional, Any
import pandas as pd
import numpy as np
from dataclasses import dataclass
from datetime import datetime
import json
import time
import sys
from tqdm import tqdm
from enum import Enum

# Docling imports
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.base_models import InputFormat
from docling.chunking import HybridChunker

# LangChain imports
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter, MarkdownHeaderTextSplitter
from langchain.schema import Document

# Optional integration: langchain_docling supplies DoclingLoader and ExportType.
# IMPORTANT: ExportType must be imported from langchain_docling.loader, not langchain_docling.
#
# When the import fails, LANGCHAIN_DOCLING_AVAILABLE is set to False and a stand-in
# ExportType enum is defined so that later references to the name still resolve.
# DoclingLoader is NOT defined in that case, so it may only be used behind the flag.
try:
    from langchain_docling import DoclingLoader
    from langchain_docling.loader import ExportType
    LANGCHAIN_DOCLING_AVAILABLE = True
    print("[OK] langchain_docling is available")
except ImportError as e:
    LANGCHAIN_DOCLING_AVAILABLE = False
    print(f"Warning: langchain_docling not available: {e}")
    print("Using direct Docling integration instead.")
    print("\nTo fix, install with: pip install langchain-docling")
    # Stand-in enum used only when langchain_docling could not be imported.
    class ExportType(Enum):
        MARKDOWN = "markdown"
        DOC_CHUNKS = "doc_chunks"
        HTML = "html"


@dataclass
class ChunkingConfig:
    """Tunable settings that control how converted documents are split into chunks.

    Token counts are approximate budgets; the character-based fallback splitter
    converts them to characters with a fixed 4-characters-per-token estimate.
    """
    # Lower bound on chunk size. NOTE(review): not read by any code visible in this section.
    min_chunk_tokens: int = 300
    # Upper bound on chunk size, passed to the chunker / text splitter.
    max_chunk_tokens: int = 500
    # Overlap between neighbouring chunks, passed to the chunker / text splitter.
    overlap_tokens: int = 50
    respect_sections: bool = True  # Use hierarchical header-based splitting
    # NOTE(review): the two flags below are not read by any code visible in this section.
    keep_tables_intact: bool = True
    include_figure_descriptions: bool = True


@dataclass
class ProcessingConfig:
    """Main configuration for the processing pipeline.

    Groups the Docling conversion switches, the embedding model settings, the
    chunking settings and the logging settings. ``chunking`` may be left as
    ``None``; ``__post_init__`` then substitutes a default ``ChunkingConfig``.
    """
    # Docling settings
    enable_table_extraction: bool = True
    enable_figure_descriptions: bool = True
    figure_description_model: str = "smolvlm"  # or "granite" for better quality
    images_scale: int = 2  # Quality of extracted images
    
    # Embedding settings
    embedding_model: str = "text-embedding-3-large"
    # Forwarded to OpenAIEmbeddings as `dimensions`.
    embedding_dimensions: int = 3072
    # Number of chunk texts sent per embed_documents() call.
    batch_size: int = 50
    
    # Chunking settings (None -> replaced by ChunkingConfig() in __post_init__)
    chunking: Optional[ChunkingConfig] = None
    
    # Logging settings
    # Name of a level attribute on the logging module, e.g. "INFO" or "DEBUG".
    log_level: str = "INFO"
    # Path of the log file; a falsy value disables file logging.
    log_file: Optional[str] = "ifrs_processing.log"
    
    def __post_init__(self):
        """Fill in the default chunking configuration when none was supplied."""
        # A dataclass default of ChunkingConfig() would be shared between instances,
        # so the default is created per instance here instead.
        if self.chunking is None:
            self.chunking = ChunkingConfig()


class ProcessingStats:
    """Mutable counters and timings gathered while a set of PDFs is processed."""

    # Integer counters, all starting at zero.
    _COUNTER_NAMES = (
        'total_pdfs',
        'processed_pdfs',
        'failed_pdfs',
        'total_chunks',
        'total_pages',
        'tables_found',
        'figures_found',
    )

    def __init__(self):
        # Reference point for the elapsed-time figures reported by get_summary().
        self.start_time = time.time()
        for counter_name in self._COUNTER_NAMES:
            setattr(self, counter_name, 0)
        # Per-document durations in seconds, keyed by the caller (file name).
        self.processing_times = {}

    def get_summary(self) -> Dict[str, Any]:
        """Return a snapshot of the counters plus derived rates and averages.

        Ratios whose denominator is zero are reported as ``0`` instead of
        raising ``ZeroDivisionError``.
        """
        elapsed = time.time() - self.start_time
        completed = self.processed_pdfs

        if self.total_pdfs > 0:
            success_rate = self.processed_pdfs / self.total_pdfs * 100
        else:
            success_rate = 0

        if completed > 0:
            chunks_per_doc = self.total_chunks / completed
            seconds_per_doc = elapsed / completed
        else:
            chunks_per_doc = 0
            seconds_per_doc = 0

        summary = {}
        summary['total_pdfs'] = self.total_pdfs
        summary['processed_pdfs'] = self.processed_pdfs
        summary['failed_pdfs'] = self.failed_pdfs
        summary['success_rate'] = success_rate
        summary['total_chunks'] = self.total_chunks
        summary['total_pages'] = self.total_pages
        summary['avg_chunks_per_doc'] = chunks_per_doc
        summary['tables_found'] = self.tables_found
        summary['figures_found'] = self.figures_found
        summary['total_time_seconds'] = elapsed
        summary['avg_time_per_doc'] = seconds_per_doc
        return summary


class IFRSDocumentProcessor:
    """
    Main processor for IFRS documents with intelligent chunking and categorization.
    
    Uses embedding-based categorization for semantic understanding of content.
    """
    
    def __init__(
        self,
        category_list_1: List[str],
        category_list_2: List[str],
        config: Optional[ProcessingConfig] = None
    ):
        """
        Initialize the processor with categorization lists and configuration.

        Construction is not lightweight: it configures logging, creates the
        embeddings client (which issues a test embedding request), builds the
        Docling converter and chunker, and embeds every category description.
        
        Args:
            category_list_1: First categorization scheme (e.g., ['Recognition', 'Measurement', 'Disclosure']).
                Stored as ``self.categories_1``. NOTE(review): the classification code
                visible in this class scores chunks against the keys of
                ``self.category_descriptions`` rather than this list - confirm intent.
            category_list_2: Second categorization scheme (e.g., ['IFRS 15', 'IFRS 16', 'IAS 12']).
                Stored as ``self.categories_2`` and matched against headers,
                file names and chunk text.
            config: Processing configuration; a default ``ProcessingConfig`` is
                used when omitted.
        """
        self.categories_1 = category_list_1
        self.categories_2 = category_list_2
        self.config = config or ProcessingConfig()
        self.stats = ProcessingStats()
        
        # Setup logging first: every later setup step writes to self.logger.
        self._setup_logging()
        
        # Initialize components
        self._setup_embeddings()
        self._setup_docling()
        self._setup_chunking()
        
        # Category descriptions for embedding-based classification.
        # Each description is embedded once and compared with chunk embeddings.
        self.category_descriptions = {
            'Recognition': """
                This text discusses when and how to recognize items in financial statements,
                including initial recognition criteria, derecognition requirements, and the
                timing of when transactions should be recorded in the accounts.
            """,
            'Measurement': """
                This text covers how to measure financial items, including fair value measurement,
                cost basis, amortized cost, present value calculations, carrying amounts,
                revaluation methods, and subsequent measurement requirements.
            """,
            'Disclosure': """
                This text relates to disclosure requirements, notes to financial statements,
                required disclosures, presentation of information, and what information
                entities must provide to users of financial statements.
            """,
            'Presentation': """
                This text addresses how items should be presented in financial statements,
                including line items, classification of items, statement formats, and
                requirements for separate presentation of different elements.
            """,
            'Transition': """
                This text covers transition provisions, effective dates, first-time adoption
                requirements, retrospective or prospective application, and grandfathering
                provisions for new or amended standards.
            """
        }
        
        # Pre-compute category embeddings (requires the embeddings client and the
        # descriptions above, so this must stay last).
        self.category_embeddings = self._compute_category_embeddings()
    
    def _setup_logging(self):
        """Configure comprehensive logging with UTF-8 encoding support"""
        log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
        
        # Create handlers with UTF-8 encoding
        handlers = []
        
        # Console handler with UTF-8 encoding
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(logging.Formatter(log_format))
        handlers.append(console_handler)
        
        # File handler with UTF-8 encoding
        if self.config.log_file:
            file_handler = logging.FileHandler(self.config.log_file, encoding='utf-8')
            file_handler.setFormatter(logging.Formatter(log_format))
            handlers.append(file_handler)
        
        # Configure root logger
        logging.basicConfig(
            level=getattr(logging, self.config.log_level),
            format=log_format,
            handlers=handlers,
            encoding='utf-8',  # Set default encoding
            force=True  # Force reconfiguration
        )
        
        self.logger = logging.getLogger(__name__)
        
        # For Windows compatibility, detect if Unicode is supported
        try:
            import sys
            if sys.platform == 'win32':
                # Try to enable Unicode in Windows console
                import os
                os.system('chcp 65001 >nul 2>&1')  # Set console to UTF-8
        except:
            pass
        
        self.logger.info("="*80)
    
    def _get_check_mark(self) -> str:
        """Get appropriate check mark based on system encoding support"""
        try:
            # Test if we can encode Unicode
            '✓'.encode(sys.stdout.encoding or 'utf-8')
            return '✓'
        except (UnicodeEncodeError, AttributeError):
            # Fallback to ASCII
            return '[OK]'
    
    def _get_cross_mark(self) -> str:
        """Get appropriate cross mark based on system encoding support"""
        try:
            '✗'.encode(sys.stdout.encoding or 'utf-8')
            return '✗'
        except (UnicodeEncodeError, AttributeError):
            # Fallback to ASCII
            return '[FAIL]'
        self.logger.info("IFRS Document Processing Pipeline Started")
        self.logger.info(f"Timestamp: {datetime.now().isoformat()}")
        self.logger.info(f"Configuration: {json.dumps(self.config.__dict__, indent=2, default=str)}")
        self.logger.info("="*80)
    
    def _setup_embeddings(self):
        """Create the OpenAI embeddings client and verify it with one test request.

        The client is stored on ``self.embeddings_model``. Any failure while
        creating or probing it is logged and then re-raised unchanged.
        """
        model_name = self.config.embedding_model
        self.logger.info(f"Initializing embeddings model: {model_name}")
        try:
            client = OpenAIEmbeddings(
                model=model_name,
                dimensions=self.config.embedding_dimensions
            )
            self.embeddings_model = client
            # Probe request: surfaces credential/model problems at start-up and
            # reveals the dimensionality actually returned.
            probe_vector = self.embeddings_model.embed_query("test")
            self.logger.info(f"{self._get_check_mark()} Embeddings model initialized successfully (dim: {len(probe_vector)})")
        except Exception as e:
            self.logger.error(f"Failed to initialize embeddings model: {e}")
            raise
    
    def _setup_docling(self):
        """Build the Docling ``DocumentConverter`` from the processing configuration.

        Table structure recognition and picture description are switched on
        according to ``config``. The pipeline options are kept on
        ``self.pipeline_options`` and the converter on ``self.converter``.

        ``config.figure_description_model`` is now applied to the pipeline:
        previously the name was only logged, so selecting "granite" had no effect.
        """
        # Function-scope import: gives access to Docling's picture-description
        # presets without requiring them to exist in every Docling version.
        import docling.datamodel.pipeline_options as docling_pipeline_options

        self.logger.info("Configuring Docling pipeline...")
        self.pipeline_options = PdfPipelineOptions()

        # Enable table extraction
        if self.config.enable_table_extraction:
            self.pipeline_options.do_table_structure = True
            self.logger.info(f"  {self._get_check_mark()} Table extraction enabled")

        # Enable figure description with VLM
        if self.config.enable_figure_descriptions:
            self.pipeline_options.generate_picture_images = True
            self.pipeline_options.images_scale = self.config.images_scale
            self.pipeline_options.do_picture_classification = True
            self.pipeline_options.do_picture_description = True

            # Map the configured name onto the preset objects Docling documents
            # for picture description; unknown names keep Docling's default.
            requested_model = str(self.config.figure_description_model).strip().lower()
            preset_attr = {
                'smolvlm': 'smolvlm_picture_description',
                'granite': 'granite_picture_description',
            }.get(requested_model)
            preset = getattr(docling_pipeline_options, preset_attr, None) if preset_attr else None

            if preset is not None:
                self.pipeline_options.picture_description_options = preset
                self.logger.info(f"  {self._get_check_mark()} Figure description enabled (VLM: {self.config.figure_description_model})")
            else:
                self.logger.warning(
                    f"  Figure description enabled, but no Docling preset was found for "
                    f"'{self.config.figure_description_model}'; using Docling's default description model"
                )

        # Create converter
        self.converter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(pipeline_options=self.pipeline_options)
            }
        )
        self.logger.info(f"{self._get_check_mark()} Docling pipeline configured")
    
    def _setup_chunking(self):
        """Create the chunking component that matches the installed packages.

        With langchain_docling available a Docling ``HybridChunker`` is stored on
        ``self.chunker``; otherwise a ``RecursiveCharacterTextSplitter`` is stored
        on ``self.text_splitter``. Only one of the two attributes is created.
        """
        self.logger.info("Setting up chunking strategy...")
        chunk_cfg = self.config.chunking

        if not LANGCHAIN_DOCLING_AVAILABLE:
            # Fallback to LangChain splitter. It counts characters, so the token
            # budgets are scaled by 4 (approximate chars per token).
            self.text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=chunk_cfg.max_chunk_tokens * 4,
                chunk_overlap=chunk_cfg.overlap_tokens * 4,
                separators=["\n\n", "\n", ". ", " ", ""]
            )
            self.logger.info(f"  {self._get_check_mark()} Using LangChain RecursiveCharacterTextSplitter")
            return

        # NOTE(review): `min_overlap_tokens` and `split_on_sentences` do not appear
        # among HybridChunker's documented parameters - confirm they have an effect
        # in the installed Docling version.
        chunker_options = dict(
            tokenizer="sentence-transformers/all-MiniLM-L6-v2",
            max_tokens=chunk_cfg.max_chunk_tokens,
            min_overlap_tokens=chunk_cfg.overlap_tokens,
            split_on_sentences=True,
        )
        self.chunker = HybridChunker(**chunker_options)
        self.logger.info(f"  {self._get_check_mark()} Using Docling HybridChunker")
    
    def _compute_category_embeddings(self) -> Dict[str, np.ndarray]:
        """Pre-compute embeddings for each category based on their descriptions"""
        self.logger.info("Computing category embeddings...")
        category_embeddings = {}
        
        for category, description in self.category_descriptions.items():
            embedding = self.embeddings_model.embed_query(description.strip())
            category_embeddings[category] = np.array(embedding)
            self.logger.info(f"  {self._get_check_mark()} Computed embedding for '{category}'")
            
        self.logger.info(f"{self._get_check_mark()} Computed embeddings for {len(category_embeddings)} categories")
        return category_embeddings
    
    def _compute_cosine_similarity(self, vec1: np.ndarray, vec2: np.ndarray) -> float:
        """Compute cosine similarity between two vectors"""
        dot_product = np.dot(vec1, vec2)
        norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
        return dot_product / norm_product if norm_product > 0 else 0.0
    
    def process_documents(self, pdf_paths: List[Path]) -> pd.DataFrame:
        """
        Main processing pipeline for IFRS documents.

        Each PDF is converted, chunked, embedded and categorized. A failure in
        one document is logged with its traceback and counted; the remaining
        documents are still processed.

        Statistics are reset at the start of every call. Previously the counters
        kept accumulating while ``total_pdfs`` was overwritten, so a second call
        on the same processor reported success rates above 100%.

        Args:
            pdf_paths: Paths to PDF files. Any iterable of ``Path`` or ``str`` is
                accepted; entries are normalised to ``Path``.

        Returns:
            DataFrame with one row per chunk (content, embedding, categories and
            metadata). It is empty, with no columns, when no chunk was produced.
        """
        # Normalise up front: `.name` is used outside the per-document try block,
        # so a plain string used to abort the whole run.
        pdf_paths = [Path(p) for p in pdf_paths]
        total = len(pdf_paths)

        # Fresh counters and a fresh clock for this run.
        self.stats = ProcessingStats()
        self.stats.total_pdfs = total

        self.logger.info(f"\nStarting processing of {total} PDF documents")
        self.logger.info("="*80)

        all_data = []

        with tqdm(total=total, desc="Processing PDFs", unit="pdf") as pbar:
            for i, pdf_path in enumerate(pdf_paths, 1):
                self.logger.info(f"\n[{i}/{total}] Processing: {pdf_path.name}")
                doc_start_time = time.time()

                try:
                    # Process single document
                    chunks_data = self._process_single_document(pdf_path)
                    all_data.extend(chunks_data)

                    # Update stats
                    self.stats.processed_pdfs += 1
                    doc_time = time.time() - doc_start_time
                    self.stats.processing_times[pdf_path.name] = doc_time

                    self.logger.info(f"  {self._get_check_mark()} Successfully processed in {doc_time:.2f}s")
                    self.logger.info(f"  {self._get_check_mark()} Generated {len(chunks_data)} chunks")

                except Exception as e:
                    # Deliberately broad: one bad PDF must not stop the batch.
                    self.stats.failed_pdfs += 1
                    self.logger.error(f"  {self._get_cross_mark()} Error processing {pdf_path.name}: {str(e)}")
                    self.logger.exception("Full traceback:")

                pbar.update(1)
                pbar.set_postfix({
                    'Success': self.stats.processed_pdfs,
                    'Failed': self.stats.failed_pdfs,
                    'Chunks': self.stats.total_chunks
                })

        # Build the frame once from the collected records.
        df = pd.DataFrame(all_data)

        # Log final statistics
        self._log_final_statistics(df)

        return df
    
    def _process_single_document(self, pdf_path: Path) -> List[Dict]:
        """Process a single PDF document"""
        doc_start_time = time.time()
        
        # Convert document with Docling
        self.logger.info(f"  -> Converting with Docling...")
        conversion_result = self.converter.convert(str(pdf_path))
        doc = conversion_result.document
        
        # Log document info
        num_pages = len(doc.pages) if hasattr(doc, 'pages') else 0
        self.stats.total_pages += num_pages
        self.logger.info(f"  -> Document has {num_pages} pages")
        
        # Create chunks
        chunks = self._create_chunks(doc, pdf_path)
        self.logger.info(f"  -> Created {len(chunks)} chunks")
        
        # Process chunks with embeddings
        chunks_data = self._process_chunks_with_embeddings(chunks, pdf_path)
        
        doc_time = time.time() - doc_start_time
        self.logger.info(f"  -> Document processing completed in {doc_time:.2f}s")
        
        return chunks_data
    
    def _create_chunks(self, doc, pdf_path: Path) -> List[Document]:
        """Split a converted Docling document into LangChain ``Document`` chunks.

        Strategy, in order of precedence:

        1. ``config.chunking.respect_sections`` is true: export ``doc`` to Markdown
           and split it on header levels 1-4. The already-converted ``doc`` is used
           directly; this path used to go through ``DoclingLoader``, which
           converted the PDF a second time.
        2. langchain_docling is available: let ``DoclingLoader`` produce
           DOC_CHUNKS, now passing the configured ``self.chunker`` so that the
           chunking configuration is honoured instead of the loader's default.
        3. Otherwise: export to Markdown and use ``self.text_splitter``.

        Args:
            doc: Converted Docling document for ``pdf_path``.
            pdf_path: Path of the source PDF; only its file name is stored.

        Returns:
            Chunks whose metadata always contains ``source``, ``chunk_index`` and
            ``export_type``; header-based chunks additionally carry the header
            keys (``Standard``/``Section``/``Subsection``/``Paragraph``) and
            ``num_pages``.
        """
        source_name = str(pdf_path.name)
        num_pages = len(doc.pages) if hasattr(doc, 'pages') else 0

        if self.config.chunking.respect_sections:
            self.logger.info("  -> Using MARKDOWN export with hierarchical splitting")

            # NOTE(review): DoclingLoader's MARKDOWN export is understood to blank
            # out image placeholders by default; the same option is used here so
            # chunk text stays as it was on the loader path - confirm against the
            # installed langchain-docling version.
            export_kwargs = {'image_placeholder': ''} if LANGCHAIN_DOCLING_AVAILABLE else {}
            markdown_content = doc.export_to_markdown(**export_kwargs)

            # Header levels mapped onto the IFRS document hierarchy.
            splitter = MarkdownHeaderTextSplitter(
                headers_to_split_on=[
                    ("#", "Standard"),      # IFRS 15
                    ("##", "Section"),      # Objective, Scope
                    ("###", "Subsection"),  # Core principle
                    ("####", "Paragraph"),  # Detailed requirements
                ],
                strip_headers=False  # Keep headers in content for context
            )

            chunks = []
            for i, split in enumerate(splitter.split_text(markdown_content)):
                # Header metadata from the splitter first, then our bookkeeping keys.
                metadata = dict(split.metadata)
                metadata.update({
                    'source': source_name,
                    'num_pages': num_pages,
                    'chunk_index': i,
                    'export_type': 'markdown'
                })
                chunks.append(Document(page_content=split.page_content, metadata=metadata))

            self.logger.info(f"  -> Created {len(chunks)} hierarchical chunks")
            return chunks

        if LANGCHAIN_DOCLING_AVAILABLE:
            self.logger.info("  -> Using DOC_CHUNKS export")
            export_type = ExportType.DOC_CHUNKS

            loader_kwargs = {
                'file_path': str(pdf_path),
                'export_type': export_type,
                'converter': self.converter
            }
            # Hand over the chunker built in _setup_chunking when there is one.
            configured_chunker = getattr(self, 'chunker', None)
            if configured_chunker is not None:
                loader_kwargs['chunker'] = configured_chunker

            # NOTE: the loader works from the file path, so the PDF is converted
            # again here; `doc` cannot be handed to it.
            splits = DoclingLoader(**loader_kwargs).load()
            self.logger.info(f"  -> Using {len(splits)} pre-chunked documents")

            export_label = export_type.value if hasattr(export_type, 'value') else str(export_type)
            for i, split in enumerate(splits):
                split.metadata.update({
                    'source': source_name,
                    'chunk_index': i,
                    'export_type': export_label
                })
            return splits

        # Fallback: plain Markdown export split by character count.
        self.logger.info("  -> Using direct Docling conversion (fallback mode)")
        markdown_content = doc.export_to_markdown()
        return [
            Document(
                page_content=text,
                metadata={
                    'source': source_name,
                    'chunk_index': i,
                    'export_type': 'text_split',
                    'num_pages': num_pages
                }
            )
            for i, text in enumerate(self.text_splitter.split_text(markdown_content))
        ]
    
    def _process_chunks_with_embeddings(
        self, 
        chunks: List[Document], 
        pdf_path: Path
    ) -> List[Dict]:
        """Process chunks in batches to create embeddings and categorize"""
        chunks_data = []
        batch_texts = []
        batch_metadata = []
        
        self.logger.info(f"  -> Creating embeddings for {len(chunks)} chunks...")
        
        for i, chunk in enumerate(chunks):
            batch_texts.append(chunk.page_content)
            batch_metadata.append(chunk.metadata)
            
            # Process batch when full or at end
            if len(batch_texts) >= self.config.batch_size or i == len(chunks) - 1:
                # Create embeddings for batch
                embeddings = self.embeddings_model.embed_documents(batch_texts)
                
                # Process each chunk in batch
                for text, embedding, metadata in zip(batch_texts, embeddings, batch_metadata):
                    # Categorize using embedding similarity
                    category_1, similarity_score = self._categorize_by_embedding_similarity(
                        embedding, self.category_embeddings
                    )
                    
                    # Extract IFRS standard from metadata
                    category_2 = self._extract_standard_from_metadata(metadata, text)
                    
                    # Check for special content
                    has_table = self._contains_table(metadata, text)
                    has_figure = self._contains_figure(metadata, text)
                    
                    if has_table:
                        self.stats.tables_found += 1
                    if has_figure:
                        self.stats.figures_found += 1
                    
                    # Create chunk data
                    chunk_data = {
                        'content': text,
                        'embedding': embedding,
                        'category_1': category_1,
                        'category_1_similarity': similarity_score,
                        'category_2': category_2,
                        'source_file': str(pdf_path.name),
                        'page_no': metadata.get('page_no', -1),
                        'section': self._extract_section(metadata),
                        'has_table': has_table,
                        'has_figure': has_figure,
                        'chunk_index': i,
                        'chunk_length': len(text),
                        'metadata': metadata
                    }
                    
                    chunks_data.append(chunk_data)
                    self.stats.total_chunks += 1
                
                # Clear batch
                batch_texts = []
                batch_metadata = []
                
                # Log progress
                if (i + 1) % 100 == 0:
                    self.logger.info(f"    Processed {i + 1}/{len(chunks)} chunks")
        
        return chunks_data
    
    def _categorize_by_embedding_similarity(
        self, 
        chunk_embedding: np.ndarray, 
        category_embeddings: Dict[str, np.ndarray]
    ) -> Tuple[str, float]:
        """
        Categorize chunk based on embedding similarity to category descriptions.
        
        Returns:
            Tuple of (category, similarity_score)
        """
        similarities = {}
        chunk_embedding = np.array(chunk_embedding)
        
        for category, cat_embedding in category_embeddings.items():
            similarity = self._compute_cosine_similarity(chunk_embedding, cat_embedding)
            similarities[category] = similarity
        
        # Get category with highest similarity
        best_category = max(similarities, key=similarities.get)
        best_score = similarities[best_category]
        
        # Log if similarity is low
        if best_score < 0.5:
            self.logger.debug(
                f"    Low similarity score ({best_score:.3f}) for category {best_category}"
            )
        
        return best_category, best_score
    
    def _extract_standard_from_metadata(self, metadata: Dict, text: str) -> str:
        """Extract IFRS standard from metadata or text"""
        # First check hierarchical metadata from MarkdownHeaderTextSplitter
        if 'Standard' in metadata:
            standard_text = metadata['Standard']
            for standard in self.categories_2:
                if standard.upper() in standard_text.upper():
                    return standard
        
        # Check all header levels
        for header_level in ['Section', 'Subsection', 'Paragraph']:
            if header_level in metadata:
                header_text = metadata[header_level]
                for standard in self.categories_2:
                    if standard.upper() in header_text.upper():
                        return standard
        
        # Check metadata headings
        headings = metadata.get('headings', [])
        for heading in headings:
            for standard in self.categories_2:
                if standard.upper() in str(heading).upper():
                    return standard
        
        # Check source filename
        source = metadata.get('source', '')
        for standard in self.categories_2:
            if standard.replace(' ', '').upper() in source.upper():
                return standard
        
        # Check text content
        text_upper = text[:500].upper()  # Check first 500 chars
        for standard in self.categories_2:
            if standard.upper() in text_upper:
                return standard
        
        return 'General'
    
    def _extract_section(self, metadata: Dict) -> str:
        """Extract section hierarchy from metadata"""
        # First check for headers from MarkdownHeaderTextSplitter
        if 'Standard' in metadata:
            sections = []
            for level in ['Standard', 'Section', 'Subsection', 'Paragraph']:
                if level in metadata:
                    sections.append(metadata[level])
            if sections:
                return ' > '.join(sections)
        
        # Fallback to other metadata
        headings = metadata.get('headings', [])
        if headings:
            return ' > '.join(str(h) for h in headings)
        return metadata.get('section', 'Root')
    
    def _contains_table(self, metadata: Dict, text: str) -> bool:
        """Check if chunk contains table"""
        # Check metadata
        doc_items = metadata.get('dl_meta', {}).get('doc_items', [])
        if any(item.get('label') == 'table' for item in doc_items):
            return True
        
        # Fallback: check text patterns
        table_indicators = ['|', '┌', '├', '│', 'Table ', 'TABLE ']
        return any(indicator in text for indicator in table_indicators)
    
    def _contains_figure(self, metadata: Dict, text: str) -> bool:
        """Check if chunk contains figure/picture"""
        # Check metadata
        doc_items = metadata.get('dl_meta', {}).get('doc_items', [])
        if any(item.get('label') in ['picture', 'figure'] for item in doc_items):
            return True
        
        # Fallback: check text patterns
        figure_indicators = ['Figure ', 'FIGURE ', 'Exhibit ', 'Chart ', 'Diagram ']
        return any(indicator in text for indicator in figure_indicators)
    
    def _log_final_statistics(self, df: pd.DataFrame):
        """Log comprehensive final statistics"""
        stats = self.stats.get_summary()
        
        self.logger.info("\n" + "="*80)
        self.logger.info("PROCESSING COMPLETE - FINAL STATISTICS")
        self.logger.info("="*80)
        
        self.logger.info(f"\nDocument Processing:")
        self.logger.info(f"  Total PDFs: {stats['total_pdfs']}")
        self.logger.info(f"  Successfully processed: {stats['processed_pdfs']}")
        self.logger.info(f"  Failed: {stats['failed_pdfs']}")
        self.logger.info(f"  Success rate: {stats['success_rate']:.1f}%")
        
        self.logger.info(f"\nContent Statistics:")
        self.logger.info(f"  Total chunks created: {stats['total_chunks']}")
        self.logger.info(f"  Total pages processed: {stats['total_pages']}")
        self.logger.info(f"  Average chunks per document: {stats['avg_chunks_per_doc']:.1f}")
        self.logger.info(f"  Tables found: {stats['tables_found']}")
        self.logger.info(f"  Figures found: {stats['figures_found']}")
        
        self.logger.info(f"\nPerformance:")
        self.logger.info(f"  Total processing time: {stats['total_time_seconds']:.2f}s")
        self.logger.info(f"  Average time per document: {stats['avg_time_per_doc']:.2f}s")
        
        if len(df) > 0:
            self.logger.info(f"\nCategorization Results:")
            self.logger.info("  Category 1 distribution:")
            for cat, count in df['category_1'].value_counts().items():
                pct = count/len(df)*100
                avg_sim = df[df['category_1']==cat]['category_1_similarity'].mean()
                self.logger.info(f"    {cat}: {count} ({pct:.1f}%) - avg similarity: {avg_sim:.3f}")
            
            self.logger.info("\n  Category 2 distribution (top 10):")
            for cat, count in df['category_2'].value_counts().head(10).items():
                pct = count/len(df)*100
                self.logger.info(f"    {cat}: {count} ({pct:.1f}%)")
            
            self.logger.info(f"\nContent Analysis:")
            self.logger.info(f"  Average chunk length: {df['chunk_length'].mean():.0f} characters")
            self.logger.info(f"  Chunks with tables: {df['has_table'].sum()} ({df['has_table'].sum()/len(df)*100:.1f}%)")
            self.logger.info(f"  Chunks with figures: {df['has_figure'].sum()} ({df['has_figure'].sum()/len(df)*100:.1f}%)")
            
            # Show sample hierarchical structure
            self.logger.info(f"\nSample Document Structure:")
            sample_sections = df[df['section'] != 'Root']['section'].head(5)
            for section in sample_sections:
                self.logger.info(f"  {section}")
        
        self.logger.info("\n" + "="*80)


def test_imports():
    """Print a diagnostic report of which pipeline dependencies can be imported.

    Each dependency is imported inside its own ``try`` block so that one
    missing package does not hide the status of the others. Results are
    written to stdout as ``[OK]`` / ``[FAIL]`` lines; nothing is returned and
    ``ImportError`` is never propagated to the caller.
    """
    print("\n=== Testing Imports ===")

    def safe_print(msg, success=True):
        """Print ``msg``, degrading to pure ASCII if stdout cannot encode it.

        Args:
            msg: The line to print.
            success: Selects which status glyph is swapped for its ASCII
                marker in the fallback path ('✓' when True, '✗' when False).
        """
        try:
            print(msg)
        except UnicodeEncodeError:
            # Fallback for consoles with a narrow encoding (e.g. Windows cp1252).
            glyph, marker = ('✓', '[OK]') if success else ('✗', '[FAIL]')
            ascii_msg = msg.replace(glyph, marker)
            # Swapping the glyph alone is not enough: interpolated exception
            # text may contain other non-encodable characters, which would make
            # the retry raise again. Escape whatever is left.
            print(ascii_msg.encode('ascii', errors='backslashreplace').decode('ascii'))

    # Test Docling
    try:
        import docling  # noqa: F401 - imported only to probe availability
        safe_print("[OK] docling is installed")
    except ImportError:
        safe_print("[FAIL] docling is not installed - run: pip install docling", False)

    # Test langchain_docling with correct import paths
    try:
        import langchain_docling  # noqa: F401 - imported only to probe availability
        safe_print("[OK] langchain_docling is installed")

        # Test DoclingLoader import
        try:
            from langchain_docling import DoclingLoader  # noqa: F401
            safe_print("  [OK] DoclingLoader imported successfully")
        except ImportError as e:
            safe_print(f"  [FAIL] DoclingLoader import failed: {e}", False)

        # Test ExportType import from correct location
        try:
            from langchain_docling.loader import ExportType
            safe_print("  [OK] ExportType imported from langchain_docling.loader")
            safe_print(f"    Available export types: {[e.name for e in ExportType]}")
        except ImportError as e:
            safe_print(f"  [FAIL] ExportType import failed: {e}", False)

    except ImportError:
        safe_print("[FAIL] langchain_docling is not installed - run: pip install langchain-docling", False)

    # Test other dependencies
    try:
        from langchain_openai import OpenAIEmbeddings  # noqa: F401
        safe_print("[OK] langchain_openai is installed")
    except ImportError:
        safe_print("[FAIL] langchain_openai is not installed - run: pip install langchain-openai", False)

    try:
        from langchain_text_splitters import MarkdownHeaderTextSplitter  # noqa: F401
        safe_print("[OK] langchain_text_splitters is installed")
    except ImportError:
        safe_print("[FAIL] langchain_text_splitters is not installed - run: pip install langchain-text-splitters", False)

    print("======================\n")


"""Example usage of the IFRS Document Processor"""

# Example driver cell: checks dependencies, configures the processor, runs it
# over every PDF in `pdf_dir`, and writes the embeddings plus summary CSVs to
# `output_dir`.

# Test imports first
test_imports()

# Define categorization schemes
# category_list_1: the accounting topic a chunk is about.
categories_1 = [
    'Recognition',
    'Measurement',
    'Disclosure',
    'Presentation',
    'Transition'
]

# category_list_2: the standard a chunk belongs to ('General' as catch-all).
categories_2 = [
    'IFRS 15', 'IFRS 16', 'IFRS 9', 'IFRS 13', 'IFRS 3',
    'IAS 12', 'IAS 19', 'IAS 16', 'IAS 36', 'IAS 37',
    'General'
]

# Configure processing
config = ProcessingConfig(
    enable_table_extraction=True,
    enable_figure_descriptions=True,
    embedding_model="text-embedding-3-large",
    embedding_dimensions=3072,
    batch_size=50,
    log_level="INFO",
    log_file="ifrs_processing.log"
)

# Initialize processor
processor = IFRSDocumentProcessor(
    category_list_1=categories_1,
    category_list_2=categories_2,
    config=config
)

# Process documents
pdf_dir = Path("../data/knowledge_base/ifrs_test")
# glob() yields entries in arbitrary filesystem order; sort so the document
# order is the same on every run.
pdf_files = sorted(pdf_dir.glob("*.pdf"))

if not pdf_files:
    # Fail fast: continuing would run the pipeline on nothing and still log
    # a "completed successfully" message at the end of the cell.
    no_input_msg = f"No PDF files found in {pdf_dir}"
    logging.error(no_input_msg)
    raise FileNotFoundError(no_input_msg)

logging.info(f"Found {len(pdf_files)} PDF files to process")

# Process and create dataframe
df = processor.process_documents(pdf_files)

# Save to parquet with partitioning
output_dir = Path("output")
output_dir.mkdir(parents=True, exist_ok=True)

# NOTE: because partition_cols is used below, this path is written as a
# directory containing one sub-directory per category_2 value.
output_path = output_dir / "ifrs_embeddings.parquet"

if len(df) > 0:
    df.to_parquet(
        output_path,
        partition_cols=['category_2'],  # Partition by IFRS standard
        compression='snappy',
        index=False
    )

    logging.info(f"\n[OK] Saved processed data to {output_path}")

    # Save category statistics
    category_stats = df.groupby(['category_1', 'category_2']).agg({
        'content': 'count',
        'category_1_similarity': 'mean',
        'has_table': 'sum',
        'has_figure': 'sum'
    }).round(3)
    # Rename in the same order as the aggregation dict above.
    category_stats.columns = ['chunk_count', 'avg_similarity', 'tables', 'figures']
    category_stats.to_csv(output_dir / "category_statistics.csv")

    # Save processing statistics
    stats_df = pd.DataFrame([processor.stats.get_summary()])
    stats_df.to_csv(output_dir / "processing_statistics.csv", index=False)

    # Save a sample for inspection (embedding vectors dropped to keep the CSV readable)
    sample_df = df.drop(columns=['embedding']).head(20)
    sample_df.to_csv(output_dir / "sample_chunks.csv", index=False)

    logging.info("[OK] Saved statistics and sample files")
else:
    logging.warning("No data to save - DataFrame is empty")

logging.info("\n[COMPLETE] Processing pipeline completed successfully!")

2025-06-25 13:05:38,223 - __main__ - INFO - Initializing embeddings model: text-embedding-3-large


[OK] langchain_docling is available

=== Testing Imports ===
[OK] docling is installed
[OK] langchain_docling is installed
  [OK] DoclingLoader imported successfully
  [OK] ExportType imported from langchain_docling.loader
    Available export types: ['MARKDOWN', 'DOC_CHUNKS']
[OK] langchain_openai is installed
[OK] langchain_text_splitters is installed



2025-06-25 13:05:40,217 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-06-25 13:05:40,303 - __main__ - INFO - ✓ Embeddings model initialized successfully (dim: 3072)
2025-06-25 13:05:40,305 - __main__ - INFO - Configuring Docling pipeline...
2025-06-25 13:05:40,308 - __main__ - INFO -   ✓ Table extraction enabled
2025-06-25 13:05:40,309 - __main__ - INFO -   ✓ Figure description enabled (VLM: smolvlm)
2025-06-25 13:05:40,316 - __main__ - INFO - ✓ Docling pipeline configured
2025-06-25 13:05:40,318 - __main__ - INFO - Setting up chunking strategy...
2025-06-25 13:05:40,538 - __main__ - INFO -   ✓ Using Docling HybridChunker
2025-06-25 13:05:40,539 - __main__ - INFO - Computing category embeddings...
2025-06-25 13:05:40,750 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-06-25 13:05:40,836 - __main__ - INFO -   ✓ Computed embedding for 'Recognition'
2025-06-25 13:05:41,044 - httpx - INFO - HT

In [2]:
# Preview the first rows of `df`; as the last expression in the cell, the
# result is rendered by the notebook rather than printed.
df.head()

Unnamed: 0,content,embedding,category_1,category_1_similarity,category_2,source_file,page_no,section,has_table,has_figure,chunk_index,chunk_length,metadata
0,Luxembourg: Publications Office of the Europea...,"[-0.005151908844709396, 0.03033466637134552, -...",Transition,0.19928,IFRS 9,IFRS9 monitoring report.pdf,-1,Root,False,False,49,59,"{'source': 'IFRS9 monitoring report.pdf', 'chu..."
1,"## © European Banking Authority, 2021 \nRepro...","[-0.026282820850610733, -0.012836170382797718,...",Disclosure,0.235516,IFRS 9,IFRS9 monitoring report.pdf,-1,Root,False,False,49,291,"{'Section': '© European Banking Authority, 202..."
2,## IFRS9 IMPLEMENTATION BY EU INSTITUTIONS \n...,"[-0.02926705777645111, 0.018509436398744583, -...",Measurement,0.359255,IFRS 9,IFRS9 monitoring report.pdf,-1,Root,False,False,49,62,{'Section': 'IFRS9 IMPLEMENTATION BY EU INSTIT...
3,## Contents \n| List of figures ...,"[-0.005802014376968145, 0.00991261750459671, -...",Transition,0.350552,IFRS 9,IFRS9 monitoring report.pdf,-1,Root,True,False,49,12230,"{'Section': 'Contents', 'source': 'IFRS9 monit..."
4,## List of Figures \n| Figure 1. | Represen...,"[-0.01747412420809269, 0.02101113460958004, -0...",Transition,0.371055,IFRS 9,IFRS9 monitoring report.pdf,-1,Root,True,True,49,18705,"{'Section': 'List of Figures', 'source': 'IFRS..."
