In [10]:
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption, ImageFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline
from docling.datamodel.pipeline_options import (
    VlmPipelineOptions,
    PdfPipelineOptions
)
from docling.datamodel.pipeline_options import (TesseractOcrOptions, EasyOcrOptions)
from docling.datamodel import vlm_model_specs

# Candidate VLM presets to benchmark, stored as (spec, label) pairs.
# Each label is the attribute name of the preset on `vlm_model_specs`, so the
# pair is derived from a single list of names; the label is later used by
# `test_vlm` to name the output file.
vlm_models = [
    (getattr(vlm_model_specs, spec_name), spec_name)
    for spec_name in (
        "SMOLDOCLING_MLX",
        "SMOLDOCLING_TRANSFORMERS",
        "GRANITE_VISION_TRANSFORMERS",
        "GRANITE_VISION_OLLAMA",
        "PIXTRAL_12B_TRANSFORMERS",
        "PIXTRAL_12B_MLX",
        "PHI4_TRANSFORMERS",
        "QWEN25_VL_3B_MLX",
        "GEMMA3_12B_MLX",
        "GEMMA3_27B_MLX",
    )
]

def test_vlm(source, model):
    """Convert `source` with a VLM pipeline and write the markdown to disk.

    Args:
        source: Document to convert; passed unchanged to
            `DocumentConverter.convert(source=...)`.
        model: A `(vlm_options, label)` pair such as an entry of
            `vlm_models`. `model[0]` configures the VLM pipeline and
            `model[1]` is used in the output file name.

    Side effects:
        Writes the exported markdown to `markdown_output_<label>.txt` in the
        current working directory, overwriting any existing file.
    """
    vlm_options, label = model[0], model[1]
    pipeline_options = VlmPipelineOptions(vlm_options=vlm_options)

    # The same VLM pipeline configuration is registered for both PDF and
    # image inputs.
    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=VlmPipeline,
                pipeline_options=pipeline_options,
            ),
            InputFormat.IMAGE: ImageFormatOption(
                pipeline_cls=VlmPipeline,
                pipeline_options=pipeline_options,
            ),
        }
    )

    markdown = converter.convert(source=source).document.export_to_markdown()
    # Explicit UTF-8: the exported markdown may contain non-ASCII characters,
    # and the platform default encoding is not guaranteed to represent them.
    with open(f"markdown_output_{label}.txt", "w", encoding="utf-8") as f:
        f.write(markdown)
    

def test_ocr(source, use_easyocr=True):
    """Convert `source` with the OCR pipeline and write the markdown to disk.

    Args:
        source: Document to convert; passed unchanged to
            `DocumentConverter.convert(source=...)`.
        use_easyocr: When True the pipeline uses `EasyOcrOptions`, otherwise
            `TesseractOcrOptions`.

    Side effects:
        Writes the exported markdown to `markdown_output_easyocr.txt` or
        `markdown_output_tesseract.txt` in the current working directory,
        overwriting any existing file.
    """
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = True
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True
    if use_easyocr:
        pipeline_options.ocr_options = EasyOcrOptions()
    else:
        pipeline_options.ocr_options = TesseractOcrOptions()

    # The same OCR configuration is registered for both PDF and image inputs.
    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
            InputFormat.IMAGE: ImageFormatOption(pipeline_options=pipeline_options)
        }
    )

    markdown = converter.convert(source=source).document.export_to_markdown()
    engine_label = 'easyocr' if use_easyocr else 'tesseract'
    # Explicit UTF-8: the exported markdown may contain non-ASCII characters,
    # and the platform default encoding is not guaranteed to represent them.
    with open(f"markdown_output_{engine_label}.txt", "w", encoding="utf-8") as f:
        f.write(markdown)



In [None]:
# Run the same sample invoice through both OCR engines so that the two
# markdown outputs can be compared side by side.
source = "sample_invoice_1.png"
test_ocr(source=source, use_easyocr=True)    # EasyOCR engine
test_ocr(source=source, use_easyocr=False)   # Tesseract engine

In [None]:
source = "sample_invoice_1.png"
test_vlm(source, vlm_models[2])  # Index 2 is the third entry of vlm_models: GRANITE_VISION_TRANSFORMERS

# Testing Multimodal LLM

In [7]:
import base64
import os
from pathlib import Path
from typing import Union, Dict, Any, Optional
import json
from openai import OpenAI
from src.constants import MULTIMODAL_EXTRACTION_TEMPLATE, EXTRACTION_SCHEMA

class MultimodalInvoiceProcessor:
    """
    Extract structured invoice data from image and PDF files using an
    OpenAI multimodal chat model.

    The file is base64-encoded, sent together with
    MULTIMODAL_EXTRACTION_TEMPLATE as a single user message, and the model is
    asked to answer in the JSON shape described by EXTRACTION_SCHEMA.
    """

    def __init__(self, api_key: Optional[str] = None, model: str = "gpt-4o"):
        """
        Initialize the processor with an OpenAI client.

        Args:
            api_key: OpenAI API key (if None, uses environment variable)
            model: OpenAI model to use for extraction
        """
        self.client = OpenAI(api_key=api_key)
        self.model = model
        self.supported_image_formats = {'.png', '.jpg', '.jpeg', '.gif', '.webp'}
        self.supported_pdf_formats = {'.pdf'}

    def _get_file_type(self, file_path: Union[str, Path]) -> str:
        """
        Classify a file as 'image' or 'pdf' from its (case-insensitive) extension.

        Raises:
            ValueError: If the extension is in neither supported set.
        """
        extension = Path(file_path).suffix.lower()
        if extension in self.supported_image_formats:
            return 'image'
        elif extension in self.supported_pdf_formats:
            return 'pdf'
        else:
            raise ValueError(f"Unsupported file format: {extension}")

    def _encode_file_to_base64(self, file_path: Union[str, Path]) -> str:
        """Read the whole file in binary mode and return it as a base64 string."""
        with open(file_path, "rb") as file:
            return base64.b64encode(file.read()).decode("utf-8")

    def _get_mime_type(self, file_path: Union[str, Path]) -> str:
        """
        Return the MIME type for the file's extension.

        Unknown extensions fall back to 'application/octet-stream'.
        """
        extension = Path(file_path).suffix.lower()
        mime_types = {
            '.png': 'image/png',
            '.jpg': 'image/jpeg',
            '.jpeg': 'image/jpeg',
            '.gif': 'image/gif',
            '.webp': 'image/webp',
            '.pdf': 'application/pdf'
        }
        return mime_types.get(extension, 'application/octet-stream')

    def _create_content_for_image(self, file_path: Union[str, Path], base64_data: str) -> list:
        """Build the message content (prompt text + data-URL image) for an image file."""
        mime_type = self._get_mime_type(file_path)
        return [
            {"type": "text", "text": MULTIMODAL_EXTRACTION_TEMPLATE},
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:{mime_type};base64,{base64_data}"
                }
            }
        ]

    def _create_content_for_pdf(self, file_path: Union[str, Path], base64_data: str) -> list:
        """Build the message content (prompt text + inline file part) for a PDF file."""
        filename = Path(file_path).name
        return [
            {"type": "text", "text": MULTIMODAL_EXTRACTION_TEMPLATE},
            {
                "type": "file",
                "file": {
                    "filename": filename,
                    "file_data": f"data:application/pdf;base64,{base64_data}"
                }
            }
        ]

    def extract_invoice_data(self, file_path: Union[str, Path]) -> Dict[str, Any]:
        """
        Extract invoice data from an image or PDF file.

        Args:
            file_path: Path to the invoice file (image or PDF)

        Returns:
            Dictionary containing the extracted invoice data plus a
            '_metadata' entry with the file path, detected file type, model
            name and token counts (token counts are None when the response
            carries no usage information).

        Raises:
            ValueError: If file format is not supported
            FileNotFoundError: If file doesn't exist
            RuntimeError: If the API call fails or the response cannot be
                turned into a result dictionary. The original exception is
                attached as ``__cause__``.
        """
        file_path = Path(file_path)

        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        # Determine file type and create appropriate content
        file_type = self._get_file_type(file_path)
        base64_data = self._encode_file_to_base64(file_path)

        if file_type == 'image':
            content = self._create_content_for_image(file_path, base64_data)
        elif file_type == 'pdf':
            content = self._create_content_for_pdf(file_path, base64_data)
        else:
            raise ValueError(f"Unsupported file type: {file_type}")

        # Make API call. Only the request itself is reported as an API error;
        # response handling is wrapped separately below so the message points
        # at the real failing step.
        try:
            completion = self.client.chat.completions.create(
                model=self.model,
                messages=[{
                    "role": "user",
                    "content": content
                }],
                response_format={
                    "type": "json_schema",
                    "json_schema": {
                        "name": "invoice_extraction_schema",
                        "schema": EXTRACTION_SCHEMA
                    }
                }
            )
        except Exception as e:
            raise RuntimeError(f"Error during API call: {str(e)}") from e

        # Parse and return result
        try:
            message = completion.choices[0].message
            raw_content = message.content
            if raw_content is None:
                # NOTE(review): the `refusal` attribute is read defensively;
                # confirm it exists on the installed client's message type.
                refusal = getattr(message, "refusal", None)
                detail = f": {refusal}" if refusal else ""
                raise ValueError(f"model returned no content{detail}")

            result = json.loads(raw_content)
            if not isinstance(result, dict):
                raise ValueError(
                    f"expected a JSON object, got {type(result).__name__}"
                )

            usage = completion.usage
            result['_metadata'] = {
                'file_path': str(file_path),
                'file_type': file_type,
                'model_used': self.model,
                'completion_tokens': getattr(usage, 'completion_tokens', None),
                'prompt_tokens': getattr(usage, 'prompt_tokens', None),
                'total_tokens': getattr(usage, 'total_tokens', None)
            }
        except Exception as e:
            raise RuntimeError(f"Error parsing model response: {str(e)}") from e

        return result

    def extract_from_multiple_files(self, file_paths: list) -> Dict[str, Dict[str, Any]]:
        """
        Extract invoice data from multiple files.

        A failure on one file does not stop the batch: its entry holds an
        'error' message and a '_metadata' block with status 'failed'.

        Args:
            file_paths: List of file paths to process

        Returns:
            Dictionary with file paths (as strings) as keys and extraction
            results as values
        """
        results = {}

        for file_path in file_paths:
            try:
                results[str(file_path)] = self.extract_invoice_data(file_path)
            except Exception as e:
                results[str(file_path)] = {
                    'error': str(e),
                    '_metadata': {
                        'file_path': str(file_path),
                        'status': 'failed'
                    }
                }

        return results

    def save_results(self, results: Dict[str, Any], output_path: Union[str, Path]) -> None:
        """Save extraction results to a UTF-8 JSON file (non-ASCII kept as-is)."""
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2, ensure_ascii=False)

# Initialize the shared processor used by the example cells below.
# No api_key is passed, so per the class docstring the key is taken from the
# environment; the model defaults to "gpt-4o".
processor = MultimodalInvoiceProcessor()

In [8]:
# Example 1: Process a single image file
# The output path is held in one variable so the "saved to" message can never
# drift from the file that is actually written.
image_output_path = "sample_invoice_1_extraction.json"

try:
    image_result = processor.extract_invoice_data("sample_invoice_1.png")
    print("✅ Successfully extracted data from image:")
    print(json.dumps(image_result, indent=2))

    # Save the result
    processor.save_results(image_result, image_output_path)
    print(f"💾 Results saved to {image_output_path}")

except Exception as e:
    # Demo cell: report the failure instead of aborting the notebook run.
    print(f"❌ Error processing image: {e}")

✅ Successfully extracted data from image:
{
  "issuing_company_name": "Your Business Name",
  "issuing_company_address": "5 Martin Pl\nSydney NSW 2000\nAustralia",
  "issuing_company_phone": "+61200000000",
  "issuing_company_website": "www.yourbusinessname.com.au",
  "issuing_company_email": "email@yourbusinessname.com.au",
  "invoice_number": "2022435",
  "issue_date": "19/7/2022",
  "due_date": "3/8/2022",
  "reference_number": "2022435",
  "currency": "AUD",
  "line_items": [
    {
      "item_name": "Services & products",
      "description": "Services & products",
      "quantity": 1,
      "unit_price": 100.0,
      "amount": 100.0
    },
    {
      "item_name": "More services & products",
      "description": "More services & products",
      "quantity": 1,
      "unit_price": 2000.0,
      "amount": 2000.0
    }
  ],
  "gst_information": {
    "gst_description": "GST 10% from $100.00, GST 20% from $2,000.00",
    "amount": 410.0
  },
  "total_amount_due": 2510.0,
  "_metada

In [9]:
# Example 2: Process multiple files (both images and PDFs)
import glob

# Find all supported files in the current directory.
# The patterns are derived from the processor's own supported-extension sets,
# so every format it accepts (including .gif and .webp) is discovered, and the
# matches are sorted so the processing order is reproducible between runs.
image_files = sorted(
    path
    for extension in processor.supported_image_formats
    for path in glob.glob(f"*{extension}")
)
pdf_files = sorted(
    path
    for extension in processor.supported_pdf_formats
    for path in glob.glob(f"*{extension}")
)
all_files = image_files + pdf_files

print(f"Found files to process: {all_files}")

if all_files:
    try:
        # Process all files; per-file failures are captured in the result
        # dict under an 'error' key rather than raised.
        multiple_results = processor.extract_from_multiple_files(all_files)

        print("\n📊 Processing Summary:")
        successful = sum(1 for result in multiple_results.values() if 'error' not in result)
        failed = len(multiple_results) - successful
        print(f"✅ Successful: {successful}")
        print(f"❌ Failed: {failed}")

        # Save all results
        processor.save_results(multiple_results, "all_extracted_data.json")
        print("💾 All results saved to all_extracted_data.json")

        # Show summary of extracted data
        for file_path, result in multiple_results.items():
            if 'error' not in result:
                print(f"\n📄 {file_path}:")
                print(f"  Invoice Number: {result.get('invoice_number', 'N/A')}")
                print(f"  Company: {result.get('issuing_company_name', 'N/A')}")
                print(f"  Total Amount: {result.get('total_amount_due', 'N/A')} {result.get('currency', 'N/A')}")
                print(f"  Line Items: {len(result.get('line_items', []))}")
            else:
                print(f"\n❌ {file_path}: {result['error']}")

    except Exception as e:
        # Reached only for failures outside per-file extraction (e.g. saving).
        print(f"❌ Error processing multiple files: {e}")
else:
    print("No supported files found in the current directory.")

Found files to process: ['sample_invoice_1.png', 'sample_invoice_pdf.pdf']

📊 Processing Summary:
✅ Successful: 2
❌ Failed: 0
💾 All results saved to all_extracted_data.json

📄 sample_invoice_1.png:
  Invoice Number: 2022435
  Company: Your Business Name
  Total Amount: 2510.0 AUD
  Line Items: 2

📄 sample_invoice_pdf.pdf:
  Invoice Number: 123100401
  Company: CPB Software (Germany) GmbH
  Total Amount: 453.53 EUR
  Line Items: 3

📊 Processing Summary:
✅ Successful: 2
❌ Failed: 0
💾 All results saved to all_extracted_data.json

📄 sample_invoice_1.png:
  Invoice Number: 2022435
  Company: Your Business Name
  Total Amount: 2510.0 AUD
  Line Items: 2

📄 sample_invoice_pdf.pdf:
  Invoice Number: 123100401
  Company: CPB Software (Germany) GmbH
  Total Amount: 453.53 EUR
  Line Items: 3


In [None]:
# Example 3: Advanced usage with different models and custom processing
print("🔧 Advanced Usage Examples:")

# Use a different model
processor_advanced = MultimodalInvoiceProcessor(model="gpt-4o-mini")
print(f"Created processor with model: {processor_advanced.model}")

# Check supported formats. The attributes are sets, whose iteration order for
# strings can change between interpreter runs, so they are sorted to keep the
# printed output reproducible.
print(f"Supported image formats: {sorted(processor_advanced.supported_image_formats)}")
print(f"Supported PDF formats: {sorted(processor_advanced.supported_pdf_formats)}")

# Example of processing with error handling and metadata extraction
def process_with_detailed_logging(file_path):
    """Run `processor_advanced` on one file and print a step-by-step report.

    Args:
        file_path: Path of the invoice file to process.

    Returns:
        The extraction result dictionary on success, or None when extraction
        raised (the error is printed rather than propagated).
    """
    try:
        print(f"\n🔄 Processing: {file_path}")
        result = processor_advanced.extract_invoice_data(file_path)

        # Extract metadata. The processor stores the token count under
        # 'total_tokens'.
        metadata = result.get('_metadata', {})
        print(f"✅ Success! Tokens used: {metadata.get('total_tokens', 'Unknown')}")
        print(f"   File type detected: {metadata.get('file_type', 'Unknown')}")

        # Validate required fields. Only an absent, None or empty-string value
        # counts as missing, so a legitimate numeric 0 (e.g. a zero total) is
        # not reported as missing.
        required_fields = ['invoice_number', 'total_amount_due']
        missing_fields = [
            field for field in required_fields
            if result.get(field) is None or result.get(field) == ""
        ]

        if missing_fields:
            print(f"⚠️  Warning: Missing required fields: {missing_fields}")
        else:
            print("✅ All required fields present")

        return result

    except FileNotFoundError:
        print(f"❌ File not found: {file_path}")
    except ValueError as e:
        print(f"❌ File format error: {e}")
    except RuntimeError as e:
        print(f"❌ API error: {e}")
    except Exception as e:
        print(f"❌ Unexpected error: {e}")

    return None

# Test with the sample image.
# `detailed_result` is bound on both branches so that later cells can rely on
# the name existing even when the sample file is absent.
if os.path.exists("sample_invoice_1.png"):
    detailed_result = process_with_detailed_logging("sample_invoice_1.png")
else:
    detailed_result = None
    print("❌ sample_invoice_1.png not found for detailed processing example")