# PDF to Markdown Converter with AI-Enhanced Image Descriptions

This notebook converts PDF documents to markdown format with extracted images and AI-generated descriptions.

Features:
- Uses Mistral OCR for text and image extraction
- Uses GPT-4.1 to generate a title and description for each extracted image
- Saves output as markdown with embedded images
- Includes AI-generated alt text for accessibility

Requirements:
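- `.env` file with Azure API credentials (`AZURE_MISTRAL_OCR_ENDPOINT`, `AZURE_MISTRAL_OCR_API_KEY`, `AZURE_OPENAI_ENDPOINT`, `AZURE_OPENAI_API_KEY`)
- PDF documents of at most 1000 pages and 50 MB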

## 1. Install Required Packages

In [None]:
# Third-party packages used below: pymupdf (page counting), httpx (HTTP calls),
# python-dotenv (loading the .env file) and pillow (decoding/saving images).
%pip install pymupdf httpx python-dotenv pillow

## 2. Import Libraries and Load Environment

In [4]:
import base64
import httpx
import json
import pymupdf
import os
import re
from pathlib import Path
from typing import Dict, Any, Optional, List, Tuple
from datetime import datetime
from dotenv import load_dotenv
from PIL import Image
import io

# Read settings from the local .env file into the process environment
load_dotenv()

# --- Mistral OCR (hosted on Azure) ---
AZURE_MISTRAL_OCR_ENDPOINT = os.environ.get("AZURE_MISTRAL_OCR_ENDPOINT")
AZURE_MISTRAL_OCR_API_KEY = os.environ.get("AZURE_MISTRAL_OCR_API_KEY")

# --- Azure OpenAI (GPT-4.1) ---
AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_API_KEY = os.environ.get("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_API_VERSION = os.environ.get("AZURE_OPENAI_API_VERSION")
AZURE_OPENAI_DEPLOYMENT_NAME = os.environ.get("AZURE_OPENAI_DEPLOYMENT_NAME")

# (name, loaded value) pairs for the settings the notebook cannot run without.
# The API version and deployment name are optional and therefore not listed.
required_vars = [
    ("AZURE_MISTRAL_OCR_ENDPOINT", AZURE_MISTRAL_OCR_ENDPOINT),
    ("AZURE_MISTRAL_OCR_API_KEY", AZURE_MISTRAL_OCR_API_KEY),
    ("AZURE_OPENAI_ENDPOINT", AZURE_OPENAI_ENDPOINT),
    ("AZURE_OPENAI_API_KEY", AZURE_OPENAI_API_KEY),
]

# Fail fast, naming every unset or empty setting in a single error
missing_vars = [name for name, value in required_vars if not value]
if missing_vars:
    raise ValueError(f"Missing required environment variables: {', '.join(missing_vars)}")

print("Environment loaded successfully!")

Environment loaded successfully!


## 3. Helper Functions

In [5]:
def _encode_document_to_base64(document_path: str) -> str:
    """Encode a document file to base64 string."""
    with Path(document_path).open(mode="rb") as f_in:
        doc_encoded = base64.b64encode(f_in.read()).decode("utf-8")
        return doc_encoded


def _call_mistral_ocr(base64_input_data: str) -> Dict[str, Any]:
    """Call the Mistral OCR API to extract text and images from a document.

    Args:
        base64_input_data: The document to process, sent as the request's
            ``document_url`` value. The caller in this notebook passes a
            ``data:application/pdf;base64,...`` URL.

    Returns:
        The decoded JSON body of the OCR response.

    Raises:
        httpx.HTTPError: On transport failures or a non-2xx response status.
        Exception: Any other error is printed and re-raised unchanged.
    """
    # Tolerate a configured endpoint with a trailing slash; otherwise the
    # request would go to ".../​/v1/ocr", which some gateways reject.
    endpoint_url = f"{AZURE_MISTRAL_OCR_ENDPOINT.rstrip('/')}/v1/ocr"
    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json",
        "Authorization": f"Bearer {AZURE_MISTRAL_OCR_API_KEY}",
    }
    payload = {
        "model": "mistral-ocr-2503",
        "document": {"type": "document_url", "document_url": base64_input_data},
        # Ask for the extracted images inline so they can be saved locally
        "include_image_base64": True,
    }

    try:
        with httpx.Client() as client:
            ocr_resp = client.post(
                url=endpoint_url, headers=headers, json=payload, timeout=120.0
            )
            ocr_resp.raise_for_status()
            return ocr_resp.json()
    except httpx.HTTPError as e:
        print(f"HTTP error occurred: {e}")
        raise
    except Exception as e:
        print(f"An error occurred: {e}")
        raise


def _call_gpt4_vision(user_message: str, image_base64: Optional[str] = None, 
                      system_message: str = "You are a helpful assistant.") -> str:
    """Send one chat request to GPT-4.1 and return the text of its reply.

    Args:
        user_message: Text of the user turn.
        image_base64: Optional image, passed through unchanged as the
            ``image_url`` of the user turn. When omitted (or empty) the
            request is text-only.
        system_message: Text of the system turn.

    Returns:
        The ``content`` of the first choice in the response.

    Raises:
        Exception: Any failure is printed and re-raised unchanged.
    """
    # NOTE(review): the request is posted to AZURE_OPENAI_ENDPOINT as-is, so
    # that setting presumably has to be the full chat-completions URL; the
    # API-version and deployment-name settings are not used here - confirm.
    if image_base64:
        # Multimodal turn: the prompt text followed by the image
        user_content = [
            {"type": "text", "text": user_message},
            {"type": "image_url", "image_url": {"url": image_base64}},
        ]
    else:
        # Plain text turn
        user_content = user_message

    request_headers = {
        "Content-Type": "application/json",
        "api-key": AZURE_OPENAI_API_KEY,
    }
    request_body = {
        "messages": [
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_content},
        ],
        "temperature": 0.3,
        "max_tokens": 500,
    }

    try:
        with httpx.Client() as http_client:
            reply = http_client.post(
                AZURE_OPENAI_ENDPOINT,
                headers=request_headers,
                json=request_body,
                timeout=60.0,
            )
            reply.raise_for_status()
            first_choice = reply.json()["choices"][0]
            return first_choice["message"]["content"]
    except Exception as e:
        print(f"Error calling GPT-4.1: {e}")
        raise


def save_image_from_base64(base64_string: str, output_path: Path) -> None:
    """Decode a base64-encoded image and write it to ``output_path`` as PNG.

    Args:
        base64_string: The image data, either bare base64 or a
            ``data:<mime>;base64,<payload>`` URL.
        output_path: Destination file. Its parent directory must exist.

    Images with an alpha channel (modes ``RGBA`` and ``LA``) are flattened
    onto a white background before saving.
    """
    # Remove data URL prefix if present (everything up to the first comma)
    if base64_string.startswith("data:"):
        base64_string = base64_string.split(",", 1)[1]

    # Decode the payload and let Pillow detect the actual image format
    image_data = base64.b64decode(base64_string)
    image = Image.open(io.BytesIO(image_data))

    if image.mode in ('RGBA', 'LA'):
        # Composite through the alpha band for BOTH modes. Pasting an LA image
        # without a mask would ignore its transparency, so normalise to RGBA
        # first and always use the alpha band (the last band) as the mask.
        rgba_image = image.convert('RGBA')
        rgb_image = Image.new('RGB', image.size, (255, 255, 255))
        rgb_image.paste(rgba_image, mask=rgba_image.split()[-1])
        image = rgb_image

    # Save as PNG
    image.save(output_path, 'PNG')


# Reached only if every helper definition in this cell executed without error.
print("Helper functions loaded successfully!")

Helper functions loaded successfully!


## 4. DocumentConverter Class

In [6]:
class DocumentConverter:
    """Convert a PDF document to markdown with AI-enhanced image descriptions.

    Typical use is ``DocumentConverter(path).convert()``, which runs
    ``parse()`` followed by ``save_output()``. The individual steps
    (``validate_document``, ``parse``, ``save_images``, ``describe_images``,
    ``generate_markdown``) can also be called one by one.

    Attributes set by ``__init__``:
        source_file: Path of the input PDF.
        output_dir: Root output directory.
        document_name: Stem of the input file name; used as output sub-folder
            name and as the markdown title.
        doc_output_dir: ``output_dir / document_name``.
        images_dir: ``doc_output_dir / "images"``.
        parsed_doc: OCR response, ``None`` until ``parse()`` has run.
        image_mapping: OCR image name -> ``"images/<saved file name>"``.
        image_descriptions: Saved file name -> description data for the image.
    """

    # Limits enforced by validate_document()
    MAX_FILE_SIZE_MB = 50
    MAX_PAGE_COUNT = 1000

    # Markdown image reference: ![alt text](path)
    _IMAGE_REF_PATTERN = re.compile(r'!\[([^\]]*)\]\(([^\)]+)\)')
    # A reply that consists of one fenced code block, e.g. ```json ... ```
    _CODE_FENCE_PATTERN = re.compile(r"^```[A-Za-z0-9_-]*\s*(.*?)\s*```$", re.DOTALL)

    def __init__(self, source_file: str, output_dir: str = "output"):
        """Record input/output locations; nothing is read or created here.

        Args:
            source_file: Path of the PDF to convert.
            output_dir: Root directory under which a folder named after the
                document is used for all output.
        """
        self.source_file = Path(source_file)
        self.output_dir = Path(output_dir)
        self.parsed_doc: Optional[Dict[str, Any]] = None
        self.image_descriptions: Dict[str, Dict[str, str]] = {}
        self.document_name = self.source_file.stem

        # Output directory structure (created later, on demand)
        self.doc_output_dir = self.output_dir / self.document_name
        self.images_dir = self.doc_output_dir / "images"

        # Image mapping: OCR filename -> our filename
        self.image_mapping: Dict[str, str] = {}

    def validate_document(self) -> Dict[str, Any]:
        """Check that the source file exists and is within the size limits.

        Returns:
            ``{"pages": <page count>, "size_mb": <size in MB>, "valid": True}``.

        Raises:
            FileNotFoundError: If the source file does not exist.
            ValueError: If the file is larger than ``MAX_FILE_SIZE_MB`` or has
                more than ``MAX_PAGE_COUNT`` pages.
        """
        if not self.source_file.exists():
            raise FileNotFoundError(f"File not found: {self.source_file}")

        # Check file size
        file_size_mb = self.source_file.stat().st_size / (1024 * 1024)
        if file_size_mb > self.MAX_FILE_SIZE_MB:
            raise ValueError(
                f"File size ({file_size_mb:.1f}MB) exceeds {self.MAX_FILE_SIZE_MB}MB limit"
            )

        # Check page count; the context manager closes the document even if
        # reading the page count fails.
        with pymupdf.open(str(self.source_file)) as doc:
            page_count = len(doc)

        if page_count > self.MAX_PAGE_COUNT:
            raise ValueError(
                f"Document has {page_count} pages, exceeds {self.MAX_PAGE_COUNT} page limit"
            )

        return {
            "pages": page_count,
            "size_mb": file_size_mb,
            "valid": True
        }

    def parse(self) -> None:
        """Validate the document, run it through Mistral OCR and store the result.

        The OCR response is stored in ``self.parsed_doc``.

        Raises:
            FileNotFoundError, ValueError: From ``validate_document()``.
            Exception: Whatever the OCR call raises.
        """
        print(f"Parsing document: {self.source_file}")

        # Validate first
        validation = self.validate_document()
        print(f"Document validated: {validation['pages']} pages, {validation['size_mb']:.1f}MB")

        # Encode the PDF and send it as a data URL
        encoded_doc = _encode_document_to_base64(str(self.source_file))
        self.parsed_doc = _call_mistral_ocr(
            base64_input_data=f"data:application/pdf;base64,{encoded_doc}"
        )

        print("Document parsed successfully!")

    def extract_image_references(self, markdown_text: str) -> List[Tuple[str, str, str]]:
        """Extract image references from markdown text.

        Args:
            markdown_text: Markdown to scan for ``![alt text](path)``.

        Returns:
            One ``(full_match, alt_text, image_path)`` tuple per reference, in
            order of appearance.
        """
        return [
            (match.group(0), match.group(1), match.group(2))
            for match in self._IMAGE_REF_PATTERN.finditer(markdown_text)
        ]

    def save_images(self) -> List[Dict[str, Any]]:
        """Save every image of the parsed document and build ``image_mapping``.

        Images are written to ``images_dir`` as
        ``page_<page>_img_<index>.png``. Each image is mapped from the name the
        OCR markdown uses for it to the saved file. The image's own ``id`` is
        preferred as that name; images without an ``id`` are paired, in order,
        with the markdown references that no ``id`` accounts for.

        Returns:
            One dict per image with keys ``page``, ``index``, ``filename``,
            ``filepath`` and ``base64``.

        Raises:
            ValueError: If ``parse()`` has not been called yet.
        """
        if not self.parsed_doc:
            raise ValueError("Document not parsed yet. Call parse() first.")

        # Create images directory
        self.images_dir.mkdir(parents=True, exist_ok=True)

        pages = self.parsed_doc["pages"]

        # Names the markdown refers to, in document order and without
        # duplicates. Sorting them alphabetically would put "img-10" before
        # "img-2" and pair images with the wrong references.
        referenced_names: List[str] = []
        seen_names = set()
        for page in pages:
            for _, _, img_path in self.extract_image_references(page.get("markdown") or ""):
                img_name = os.path.basename(img_path)
                if img_name not in seen_names:
                    seen_names.add(img_name)
                    referenced_names.append(img_name)

        # Names already accounted for by an image id
        id_names = {
            os.path.basename(str(img["id"]))
            for page in pages
            for img in page.get("images") or []
            if img.get("id")
        }
        # Fallback pool for images that carry no id
        unclaimed_names = iter([name for name in referenced_names if name not in id_names])

        image_info: List[Dict[str, Any]] = []
        for page_idx, page in enumerate(pages, 1):
            for img_idx, img in enumerate(page.get("images") or [], 1):
                # Generate our filename
                our_filename = f"page_{page_idx:03d}_img_{img_idx:03d}.png"
                filepath = self.images_dir / our_filename

                # Prefer the id reported with the image; otherwise take the
                # next unclaimed markdown reference.
                if img.get("id"):
                    ocr_name = os.path.basename(str(img["id"]))
                else:
                    ocr_name = next(unclaimed_names, None)
                if ocr_name is not None:
                    self.image_mapping[ocr_name] = f"images/{our_filename}"

                # Save image
                save_image_from_base64(img["image_base64"], filepath)

                # Store info for later use
                image_info.append({
                    "page": page_idx,
                    "index": img_idx,
                    "filename": our_filename,
                    "filepath": str(filepath),
                    "base64": img["image_base64"]
                })

        print(f"Saved {len(image_info)} images to {self.images_dir}")
        print(f"Built image mapping for {len(self.image_mapping)} images")
        return image_info

    @classmethod
    def _parse_description_response(cls, response: str) -> Dict[str, Any]:
        """Turn a model reply into description data, tolerating fenced or non-JSON text."""
        candidate = response.strip()
        fenced = cls._CODE_FENCE_PATTERN.match(candidate)
        if fenced:
            # Models often wrap JSON in a ```json ... ``` block
            candidate = fenced.group(1)
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, dict):
            return parsed
        # Not a JSON object: keep the raw reply as the description
        return {
            "title": "Image from document",
            "description": response
        }

    def describe_images(self, image_info: List[Dict[str, Any]]) -> None:
        """Generate a title and description for each image using GPT-4.1.

        Results are stored in ``self.image_descriptions`` keyed by the image's
        saved file name. This is best-effort: if describing an image fails, a
        placeholder entry is stored and processing continues.

        Args:
            image_info: Image records as returned by ``save_images()``; the
                keys ``filename``, ``page`` and ``base64`` are read.
        """
        system_message = """You are an expert at analyzing technical diagrams and images from enterprise documents.
        Provide a concise, informative description and a short title for each image.
        Focus on the key information, relationships, and purpose of the diagram.
        Return your response as JSON with 'title' and 'description' fields."""

        print(f"Generating descriptions for {len(image_info)} images...")

        for idx, img in enumerate(image_info, 1):
            print(f"Processing image {idx}/{len(image_info)}: {img['filename']}")

            prompt = """Analyze this image and provide:
            1. A short, descriptive title (5-10 words)
            2. A detailed description of what the image shows, including key components, relationships, and purpose
            
            Return as JSON with 'title' and 'description' fields."""

            try:
                response = _call_gpt4_vision(
                    user_message=prompt,
                    image_base64=img["base64"],
                    system_message=system_message
                )
                self.image_descriptions[img["filename"]] = self._parse_description_response(response)
            except Exception as e:
                # Deliberately broad: one failed image must not abort the run
                print(f"Error describing image {img['filename']}: {e}")
                self.image_descriptions[img["filename"]] = {
                    "title": f"Figure from page {img['page']}",
                    "description": "Image description unavailable"
                }

        print("Image descriptions generated successfully!")

    @staticmethod
    def _escape_alt_text(text: Any) -> str:
        """Make text safe inside ``![...]``: single line, brackets escaped."""
        flat = " ".join(str(text).split())
        return flat.replace("\\", "\\\\").replace("[", "\\[").replace("]", "\\]")

    @staticmethod
    def _escape_title(text: Any) -> str:
        """Make text safe inside a double-quoted link title: single line, quotes escaped."""
        flat = " ".join(str(text).split())
        return flat.replace("\\", "\\\\").replace('"', '\\"')

    def generate_markdown(self, include_alt_text: bool = True) -> str:
        """Build the markdown document with image references rewritten inline.

        Each page is preceded by a ``**Page N**`` marker. Image references
        whose file name is in ``image_mapping`` are pointed at the saved file;
        other references are left untouched.

        Args:
            include_alt_text: When true, images that have an entry in
                ``image_descriptions`` get the AI description as alt text and
                the AI title as link title. Otherwise the original alt text is
                kept.

        Returns:
            The complete markdown text.

        Raises:
            ValueError: If ``parse()`` has not been called yet.
        """
        if not self.parsed_doc:
            raise ValueError("Document not parsed yet. Call parse() first.")

        def rewrite_reference(match: "re.Match[str]") -> str:
            alt_text = match.group(1)
            new_path = self.image_mapping.get(os.path.basename(match.group(2)))
            if new_path is None:
                # Unknown image: leave the reference exactly as it was
                return match.group(0)

            desc = None
            if include_alt_text:
                desc = self.image_descriptions.get(os.path.basename(new_path))
            if not isinstance(desc, dict):
                # Keep original alt text but update path
                return f'![{alt_text}]({new_path})'

            # AI text may contain newlines, brackets or quotes, any of which
            # would break the image syntax if written verbatim.
            new_alt_text = self._escape_alt_text(desc.get("description") or alt_text)
            title = self._escape_title(desc.get("title") or "")
            if title:
                return f'![{new_alt_text}]({new_path} "{title}")'
            return f'![{new_alt_text}]({new_path})'

        # Document title and conversion timestamp
        markdown_lines = [
            f"# {self.document_name}\n",
            f"*Converted from PDF on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*\n",
        ]

        for page_idx, page in enumerate(self.parsed_doc["pages"], 1):
            # Add page marker
            markdown_lines.append(f"\n---\n**Page {page_idx}**\n")

            page_content = page.get("markdown")
            if page_content is None:
                continue
            # Single pass over the page so every reference is rewritten once
            markdown_lines.append(
                self._IMAGE_REF_PATTERN.sub(rewrite_reference, page_content)
            )

        return "\n".join(markdown_lines)

    def save_output(self, include_alt_text: bool = True) -> Dict[str, str]:
        """Write images, ``document.md`` and ``metadata.json`` to ``doc_output_dir``.

        Args:
            include_alt_text: When true, image descriptions are generated with
                GPT-4.1 and used in the markdown and the metadata.

        Returns:
            Paths as strings under the keys ``output_dir``, ``markdown_path``,
            ``images_dir`` and ``metadata_path``.

        Raises:
            ValueError: If ``parse()`` has not been called yet.
        """
        # Check before touching the file system so a premature call leaves
        # no empty output directory behind.
        if not self.parsed_doc:
            raise ValueError("Document not parsed yet. Call parse() first.")

        # Create output directory
        self.doc_output_dir.mkdir(parents=True, exist_ok=True)

        # Save images and build mapping
        image_info = self.save_images()

        # Generate image descriptions
        if include_alt_text:
            self.describe_images(image_info)

        # Generate and save markdown
        markdown_content = self.generate_markdown(include_alt_text)
        markdown_path = self.doc_output_dir / "document.md"
        markdown_path.write_text(markdown_content, encoding="utf-8")

        # Save metadata
        metadata = {
            "source_file": str(self.source_file),
            "conversion_date": datetime.now().isoformat(),
            "page_count": len(self.parsed_doc["pages"]),
            "image_count": len(image_info),
            "models_used": {
                "ocr": "mistral-ocr-2503",
                "image_description": "gpt-4.1" if include_alt_text else None
            },
            "image_mapping": self.image_mapping,
            "image_descriptions": self.image_descriptions if include_alt_text else {}
        }

        metadata_path = self.doc_output_dir / "metadata.json"
        metadata_path.write_text(json.dumps(metadata, indent=2), encoding="utf-8")

        print(f"\nOutput saved to: {self.doc_output_dir}")
        print(f"- Markdown: {markdown_path}")
        print(f"- Images: {self.images_dir} ({len(image_info)} files)")
        print(f"- Metadata: {metadata_path}")

        return {
            "output_dir": str(self.doc_output_dir),
            "markdown_path": str(markdown_path),
            "images_dir": str(self.images_dir),
            "metadata_path": str(metadata_path)
        }

    def convert(self, include_alt_text: bool = True) -> Dict[str, str]:
        """Run the complete conversion: ``parse()`` then ``save_output()``.

        Args:
            include_alt_text: Passed through to ``save_output()``.

        Returns:
            The path dictionary returned by ``save_output()``.
        """
        print(f"\nStarting conversion of: {self.source_file}")
        print("=" * 50)

        # Parse document
        self.parse()

        # Save output
        result = self.save_output(include_alt_text)

        print("\nConversion completed successfully!")
        return result


# Reached only if the class definition in this cell executed without error.
print("DocumentConverter class loaded successfully!")

DocumentConverter class loaded successfully!


## 5. Example Usage

In [None]:
# Example: Convert a PDF document
# PDF_FILE is the input for the example cells below; replace it with the
# path of your own PDF file.
PDF_FILE = "resources/code_researcher.pdf"

In [9]:
# Set up a converter for PDF_FILE that writes below ./output
converter = DocumentConverter(source_file=PDF_FILE, output_dir="output")

# Full pipeline: OCR, image extraction and AI-generated image descriptions
result = converter.convert(include_alt_text=True)

print("\nConversion complete! Files saved to:", result["output_dir"])


Starting conversion of: guide-main-market-pdf.shortened.pdf
Parsing document: guide-main-market-pdf.shortened.pdf
Document validated: 4 pages, 0.4MB
Document parsed successfully!
Saved 3 images to output/guide-main-market-pdf.shortened/images
Built image mapping for 3 images
Generating descriptions for 3 images...
Processing image 1/3: page_001_img_001.png
Processing image 2/3: page_002_img_001.png
Processing image 3/3: page_004_img_001.png
Image descriptions generated successfully!

Output saved to: output/guide-main-market-pdf.shortened
- Markdown: output/guide-main-market-pdf.shortened/document.md
- Images: output/guide-main-market-pdf.shortened/images (3 files)
- Metadata: output/guide-main-market-pdf.shortened/metadata.json

Conversion completed successfully!

Conversion complete! Files saved to: output/guide-main-market-pdf.shortened


## 6. Advanced Usage Examples

In [None]:
# Example 1: Skip the AI image descriptions (faster; original alt text is kept)
converter_fast = DocumentConverter(source_file=PDF_FILE)
result_fast = converter_fast.convert(include_alt_text=False)
print("Fast conversion complete!")

In [None]:
# Example 2: Validate document before conversion
test_converter = DocumentConverter("lseg-foundry-intro.pdf")
try:
    validation = test_converter.validate_document()
    print(f"Document is valid: {validation}")
except (FileNotFoundError, ValueError) as e:
    # validate_document() raises FileNotFoundError for a missing file and
    # ValueError when the size or page limit is exceeded; report both.
    print(f"Document validation failed: {e}")

In [None]:
# Example 3: Custom processing with progress tracking
def convert_with_progress(pdf_file: str):
    """Run the conversion step by step, printing progress after each stage.

    Unlike ``DocumentConverter.convert()``, this writes only the images and
    ``document.md``; no metadata file is produced.

    Args:
        pdf_file: Path of the PDF to convert.

    Returns:
        The path of the written markdown file, as a string.
    """
    conv = DocumentConverter(pdf_file)

    # Stage 1: size and page-count checks
    print("Step 1/4: Validating document...")
    report = conv.validate_document()
    print(f"  ✓ Valid: {report['pages']} pages")

    # Stage 2: OCR (validates again internally before calling the service)
    print("\nStep 2/4: Parsing with OCR...")
    conv.parse()
    print("  ✓ OCR complete")

    # Stage 3: write the extracted images to disk
    print("\nStep 3/4: Extracting images...")
    extracted = conv.save_images()
    print(f"  ✓ {len(extracted)} images extracted")

    # Stage 4: AI titles and descriptions
    print("\nStep 4/4: Generating descriptions...")
    conv.describe_images(extracted)
    print("  ✓ Descriptions generated")

    # Assemble the markdown and write it next to the images folder
    md_text = conv.generate_markdown()
    md_file = conv.doc_output_dir / "document.md"
    md_file.parent.mkdir(parents=True, exist_ok=True)
    md_file.write_text(md_text, encoding="utf-8")

    print(f"\n✅ Conversion complete! Output: {md_file}")
    return str(md_file)

# Run with progress tracking; output_file receives the path of the written
# markdown file.
output_file = convert_with_progress(PDF_FILE)

## 7. Batch Processing (TODO: Check how to use the Batch Processing API directly)

In [None]:
def batch_convert_pdfs(pdf_folder: str, output_folder: str = "output") -> List[Dict[str, str]]:
    """Convert all PDF files in a folder, continuing past individual failures.

    Only files matching ``*.pdf`` directly inside ``pdf_folder`` are
    processed, in sorted order so that runs are reproducible.

    Args:
        pdf_folder: Folder to search for PDF files.
        output_folder: Root output directory handed to each converter.

    Returns:
        One dict per file with keys ``file`` and ``status`` plus ``output``
        (on success) or ``error`` (on failure). An empty list is returned
        when the folder contains no PDF files.
    """
    pdf_files = sorted(Path(pdf_folder).glob("*.pdf"))

    if not pdf_files:
        print(f"No PDF files found in {pdf_folder}")
        # Same type as the normal return so callers can always iterate
        return []

    print(f"Found {len(pdf_files)} PDF files to convert")
    separator = "=" * 60
    results: List[Dict[str, str]] = []

    for idx, pdf_file in enumerate(pdf_files, 1):
        print(f"\n{separator}")
        print(f"Processing {idx}/{len(pdf_files)}: {pdf_file.name}")
        print(separator)

        try:
            converter = DocumentConverter(
                source_file=str(pdf_file),
                output_dir=output_folder
            )
            result = converter.convert(include_alt_text=True)
            results.append({
                "file": pdf_file.name,
                "status": "success",
                "output": result["output_dir"]
            })
        except Exception as e:
            # Deliberately broad: one bad file must not stop the batch
            print(f"Error processing {pdf_file.name}: {e}")
            results.append({
                "file": pdf_file.name,
                "status": "failed",
                "error": str(e)
            })

    # Summary
    print(f"\n{separator}")
    print("Batch conversion summary:")
    print(separator)

    successful = [r for r in results if r["status"] == "success"]
    failed = [r for r in results if r["status"] == "failed"]

    print(f"Total: {len(results)} files")
    print(f"Successful: {len(successful)} files")
    print(f"Failed: {len(failed)} files")

    if failed:
        print("\nFailed files:")
        for f in failed:
            print(f"  - {f['file']}: {f['error']}")

    return results

# Example: Convert all PDFs in current directory
# results = batch_convert_pdfs(".", "batch_output")

## 8. Summary

This notebook provides a complete solution for converting PDF documents to markdown format with:

- **Mistral OCR** for accurate text and image extraction
- **GPT-4.1** for intelligent image descriptions
- **Structured output** with markdown files and extracted images
- **AI-generated alt text** for accessibility
- **Metadata tracking** for audit trails

The output is ideal for:
- AI agent consumption
- Documentation workflows
- Knowledge base creation
- Content migration projects

Remember to ensure your `.env` file contains all required API credentials before running the converter.