# Document Processing with Azure OpenAI Multimodal Model

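This notebook handles:
1. **Document Upload** - Upload policy, claims, and statement documents to Azure Blob Storage
2. **Text Processing** - Read .md files (policies and statements) as-is and prepare them for direct vectorization; no GPT-4o call is made for text
3. **Image Processing** - Generate detailed descriptions of claim images using GPT-4o vision capabilities
4. **Output Preparation** - Attach metadata to each document and save the results so they are ready for vectorization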

## Prerequisites
- Azure Blob Storage account created
- Azure OpenAI service with GPT-4o model deployed
- Environment variables configured in `.env` file (required: `AZURE_STORAGE_CONNECTION_STRING`, `AZURE_OPENAI_ENDPOINT`, `AZURE_OPENAI_KEY`)

## 1. Setup and Configuration

In [13]:
import os
import json
import base64
from pathlib import Path
from typing import Dict, List, Optional
import pandas as pd
from tqdm import tqdm

# Azure SDK imports
from azure.storage.blob import BlobServiceClient
from azure.core.exceptions import ResourceExistsError

# OpenAI imports
from openai import AzureOpenAI

# Load environment variables
from dotenv import load_dotenv
load_dotenv()

print("✅ All imports successful!")

✅ All imports successful!


In [14]:
# Configuration
class Config:
    """Central settings for this notebook, read once from environment variables.

    Values come from the process environment (populated by load_dotenv() in the
    imports cell). Attributes are None when the variable is not set, unless a
    default is given below.
    """

    # Storage configuration
    AZURE_STORAGE_CONNECTION_STRING = os.getenv('AZURE_STORAGE_CONNECTION_STRING')
    AZURE_STORAGE_ACCOUNT_NAME = os.getenv('AZURE_STORAGE_ACCOUNT_NAME')
    AZURE_STORAGE_ACCOUNT_KEY = os.getenv('AZURE_STORAGE_ACCOUNT_KEY')

    # Azure OpenAI configuration
    AZURE_OPENAI_ENDPOINT = os.getenv('AZURE_OPENAI_ENDPOINT')
    # The validation message below asks for AZURE_OPENAI_API_KEY, so read that
    # name first; fall back to the legacy AZURE_OPENAI_KEY so existing .env
    # files keep working.
    AZURE_OPENAI_API_KEY = os.getenv('AZURE_OPENAI_API_KEY') or os.getenv('AZURE_OPENAI_KEY')
    AZURE_OPENAI_API_VERSION = os.getenv('AZURE_OPENAI_API_VERSION', '2024-02-15-preview')
    AZURE_OPENAI_DEPLOYMENT_NAME = os.getenv('AZURE_OPENAI_DEPLOYMENT_NAME', 'gpt-4o-mini')

    # Container names
    POLICIES_CONTAINER = 'policies'
    CLAIMS_CONTAINER = 'claims'
    PROCESSED_CONTAINER = 'processed-documents'
    STATEMENTS_CONTAINER = 'statements'

    # Local data paths
    DATA_DIR = Path('data')
    POLICIES_DIR = DATA_DIR / 'policies'
    CLAIMS_DIR = DATA_DIR / 'claims'
    STATEMENTS_DIR = DATA_DIR / 'statements'

# Validate configuration: map each required variable name to the value that was
# loaded, so the list of missing names is derived from one place.
required_vars = {
    'AZURE_OPENAI_ENDPOINT': Config.AZURE_OPENAI_ENDPOINT,
    'AZURE_OPENAI_API_KEY': Config.AZURE_OPENAI_API_KEY,
    'AZURE_STORAGE_CONNECTION_STRING': Config.AZURE_STORAGE_CONNECTION_STRING,
}

# Names (not values) of the required variables that are unset or empty
missing_vars = [name for name, value in required_vars.items() if not value]
if missing_vars:
    print("❌ Missing environment variables. Please check your .env file.")
    print("Missing variables - please add these to your .env file:")
    for name in missing_vars:
        print(f"  - {name}")
else:
    print("✅ Configuration loaded successfully!")
    print(f"📁 Policies directory: {Config.POLICIES_DIR}")
    print(f"📁 Claims directory: {Config.CLAIMS_DIR}")
    print(f"🤖 OpenAI Deployment: {Config.AZURE_OPENAI_DEPLOYMENT_NAME}")

✅ Configuration loaded successfully!
📁 Policies directory: data\policies
📁 Claims directory: data\claims
🤖 OpenAI Deployment: gpt-4o-mini


## 2. Azure Services Setup

In [15]:
# Initialize Azure clients
def initialize_clients():
    """Build the Blob Storage and Azure OpenAI clients from Config.

    Returns:
        A (blob_service_client, openai_client) tuple. If constructing either
        client raises, the error is printed and (None, None) is returned.
    """
    try:
        # Blob Storage: authenticated through the connection string
        storage_client = BlobServiceClient.from_connection_string(
            Config.AZURE_STORAGE_CONNECTION_STRING
        )

        # Azure OpenAI: endpoint, key and API version all come from Config
        aoai_settings = {
            "azure_endpoint": Config.AZURE_OPENAI_ENDPOINT,
            "api_key": Config.AZURE_OPENAI_API_KEY,
            "api_version": Config.AZURE_OPENAI_API_VERSION,
        }
        aoai_client = AzureOpenAI(**aoai_settings)

        print("✅ Azure clients initialized successfully!")
        return storage_client, aoai_client

    except Exception as e:
        print(f"❌ Error initializing clients: {e}")
        return None, None

blob_service_client, openai_client = initialize_clients()

✅ Azure clients initialized successfully!


In [16]:
# Enhanced container creation with multiple authentication methods and diagnostics
def create_containers_enhanced(blob_service_client):
    """Ensure the notebook's blob containers exist, printing diagnostics along the way.

    Steps: verify the storage account is reachable, list the containers that
    already exist, then create whichever of the policies / claims / statements /
    processed containers are missing.

    Args:
        blob_service_client: Storage client providing get_account_information(),
            list_containers() and get_container_client().

    Returns:
        True when every required container exists afterwards; False when the
        connection test fails or at least one container could not be created.
    """

    # First, test the connection
    try:
        print("🔍 Testing storage account connection...")
        account_info = blob_service_client.get_account_information()
        print("✅ Connected to storage account successfully")
        print(f"   Account kind: {account_info.get('account_kind', 'Unknown')}")
        print(f"   SKU name: {account_info.get('sku_name', 'Unknown')}")
    except Exception as e:
        print(f"❌ Failed to connect to storage account: {e}")
        return False

    # Test listing existing containers. A failure here is not fatal: the list
    # stays empty and creation below falls back on ResourceExistsError.
    existing_containers = []
    try:
        print("\n🔍 Checking existing containers...")
        existing_containers = [container.name for container in blob_service_client.list_containers()]
        print(f"✅ Found {len(existing_containers)} existing containers: {existing_containers}")
    except Exception as e:
        print(f"❌ Failed to list containers: {e}")
        print("   This might indicate insufficient permissions")

    # Try to create containers
    containers = [
        Config.POLICIES_CONTAINER,
        Config.CLAIMS_CONTAINER,
        Config.STATEMENTS_CONTAINER,  # Added statements container
        Config.PROCESSED_CONTAINER
    ]

    created_containers = []
    failed_containers = []

    for container_name in containers:
        try:
            if container_name in existing_containers:
                # Already reported by the listing above; nothing to create
                print(f"ℹ️ Container '{container_name}' already exists")
                created_containers.append(container_name)
                continue

            container_client = blob_service_client.get_container_client(container_name)
            print(f"🔨 Creating container '{container_name}'...")
            try:
                container_client.create_container()
            except ResourceExistsError:
                # Listing failed or the container appeared in the meantime;
                # either way it exists, which is the desired end state.
                print(f"ℹ️ Container '{container_name}' already exists")
            else:
                print(f"✅ Container '{container_name}' created successfully")
            created_containers.append(container_name)

        except Exception as e:
            print(f"❌ Error with container '{container_name}': {e}")
            failed_containers.append((container_name, str(e)))

            # Additional diagnostics for authorization errors
            if "AuthorizationFailure" in str(e):
                print(f"   🔍 Authorization issue detected for '{container_name}'")
                print("   This could be due to:")
                print("   - Storage account access keys disabled")
                print("   - Network access restrictions")
                print("   - Storage account permissions")

    print("\n📊 Container Creation Summary:")
    print(f"   ✅ Successful: {len(created_containers)} - {created_containers}")
    print(f"   ❌ Failed: {len(failed_containers)} - {[name for name, _ in failed_containers]}")

    return len(failed_containers) == 0

def try_alternative_authentication():
    """Build a storage client from the account name and key instead of the connection string.

    The fallback path below calls this function, but no definition appears
    earlier in the notebook, so that path failed with NameError on a fresh kernel.

    Returns:
        A BlobServiceClient, or None when Config.AZURE_STORAGE_ACCOUNT_NAME /
        Config.AZURE_STORAGE_ACCOUNT_KEY are not set or the client cannot be built.
    """
    account_name = Config.AZURE_STORAGE_ACCOUNT_NAME
    account_key = Config.AZURE_STORAGE_ACCOUNT_KEY
    if not (account_name and account_key):
        print("⚠️ AZURE_STORAGE_ACCOUNT_NAME / AZURE_STORAGE_ACCOUNT_KEY not set - no alternative authentication available")
        return None

    try:
        print("🔑 Trying storage account name + key authentication...")
        # NOTE(review): assumes the public-cloud endpoint suffix; sovereign
        # clouds use a different host name - confirm for your environment.
        return BlobServiceClient(
            account_url=f"https://{account_name}.blob.core.windows.net",
            credential=account_key,
        )
    except Exception as e:
        print(f"❌ Alternative authentication failed: {e}")
        return None

if blob_service_client:
    print("🚀 Running enhanced container creation...")
    success = create_containers_enhanced(blob_service_client)

    if not success:
        print("\n🔄 Primary method failed, trying alternatives...")

        # Try alternative authentication
        alt_client = try_alternative_authentication()
        if alt_client:
            blob_service_client = alt_client
            success = create_containers_enhanced(blob_service_client)

        if not success:
            print("\n❌ Containers could not be created - fix storage access before continuing")
        

🚀 Running enhanced container creation...
🔍 Testing storage account connection...
✅ Connected to storage account successfully
   Account kind: StorageV2
   SKU name: Standard_LRS

🔍 Checking existing containers...
✅ Found 4 existing containers: ['claims', 'policies', 'processed-documents', 'statements']
ℹ️ Container 'policies' already exists
ℹ️ Container 'claims' already exists
ℹ️ Container 'statements' already exists
ℹ️ Container 'processed-documents' already exists

📊 Container Creation Summary:
   ✅ Successful: 4 - ['policies', 'claims', 'statements', 'processed-documents']
   ❌ Failed: 0 - []


## 3. Document Upload Functions

In [17]:
class DocumentUploader:
    """Uploads local files to blob storage and lists the blobs in a container."""

    def __init__(self, blob_service_client):
        """Keep the storage client used for every upload and listing call."""
        self.blob_service_client = blob_service_client

    def upload_file(self, file_path: Path, container_name: str, blob_name: Optional[str] = None) -> bool:
        """Upload a single file to blob storage, overwriting any existing blob.

        Args:
            file_path: Local file to upload (a Path or a path string).
            container_name: Target container.
            blob_name: Name of the blob; defaults to the file's own name.

        Returns:
            True on success, False if the upload raised (the error is printed).
        """
        # Accept plain strings too, so .name below cannot fail outside the try
        file_path = Path(file_path)
        if blob_name is None:
            blob_name = file_path.name

        try:
            blob_client = self.blob_service_client.get_blob_client(
                container=container_name,
                blob=blob_name
            )

            with open(file_path, 'rb') as data:
                blob_client.upload_blob(data, overwrite=True)

            print(f"✅ Uploaded: {file_path.name} → {container_name}/{blob_name}")
            return True

        except Exception as e:
            print(f"❌ Error uploading {file_path.name}: {e}")
            return False

    def upload_directory(self, directory_path: Path, container_name: str) -> Dict[str, bool]:
        """Upload every regular file directly inside a directory (not recursive).

        Args:
            directory_path: Local directory to scan.
            container_name: Target container.

        Returns:
            Mapping of file name to upload success. Empty when the directory is
            missing or contains no files.
        """
        results = {}

        if not directory_path.exists():
            print(f"❌ Directory not found: {directory_path}")
            return results

        # Keep regular files only, so sub-directories are neither counted in the
        # progress total nor mistaken for content; sort for a reproducible order.
        files = sorted(path for path in directory_path.glob('*') if path.is_file())
        if not files:
            print(f"ℹ️ No files found in {directory_path}")
            return results

        print(f"📤 Uploading {len(files)} files from {directory_path} to {container_name}...")

        for file_path in tqdm(files, desc="Uploading files"):
            results[file_path.name] = self.upload_file(file_path, container_name)

        successful_uploads = sum(results.values())
        print(f"\n📊 Upload Summary: {successful_uploads}/{len(results)} files uploaded successfully")

        return results

    def list_blobs(self, container_name: str) -> List[str]:
        """Return the names of all blobs in a container, or [] if listing fails."""
        try:
            container_client = self.blob_service_client.get_container_client(container_name)
            return [blob.name for blob in container_client.list_blobs()]
        except Exception as e:
            print(f"❌ Error listing blobs in {container_name}: {e}")
            return []

# Initialize uploader (requires the storage client created in section 2)
if blob_service_client:
    uploader = DocumentUploader(blob_service_client)
    print("✅ Document uploader initialized!")
else:
    # Say so explicitly: the cells below use `uploader` and would otherwise
    # fail later with a NameError that does not point at the real cause.
    print("❌ Cannot initialize uploader - missing blob storage client")

✅ Document uploader initialized!


## 4. Upload Documents to Blob Storage

In [18]:
# Push every local policy file to the policies container
print("📄 Uploading Policy Documents...")
print("=" * 50)

policy_results = uploader.upload_directory(
    directory_path=Config.POLICIES_DIR,
    container_name=Config.POLICIES_CONTAINER,
)

# Read the container back to show what is stored now
policy_blobs = uploader.list_blobs(container_name=Config.POLICIES_CONTAINER)
print(f"\n📋 Policies in storage ({len(policy_blobs)} files):")
for blob in policy_blobs:
    print(f"  • {blob}")

📄 Uploading Policy Documents...
📤 Uploading 5 files from data\policies to policies...


Uploading files:   0%|          | 0/5 [00:00<?, ?it/s]

✅ Uploaded: commercial_auto_policy.md → policies/commercial_auto_policy.md

Uploading files:  20%|██        | 1/5 [00:00<00:00,  8.09it/s]




Uploading files:  40%|████      | 2/5 [00:00<00:00,  8.30it/s]

✅ Uploaded: comprehensive_auto_policy.md → policies/comprehensive_auto_policy.md


Uploading files:  60%|██████    | 3/5 [00:00<00:00,  5.46it/s]

✅ Uploaded: high_value_vehicle_policy.md → policies/high_value_vehicle_policy.md


Uploading files:  80%|████████  | 4/5 [00:00<00:00,  4.78it/s]

✅ Uploaded: liability_only_policy.md → policies/liability_only_policy.md


Uploading files: 100%|██████████| 5/5 [00:01<00:00,  4.66it/s]

✅ Uploaded: motorcycle_policy.md → policies/motorcycle_policy.md

📊 Upload Summary: 5/5 files uploaded successfully

📋 Policies in storage (5 files):
  • commercial_auto_policy.md
  • comprehensive_auto_policy.md
  • high_value_vehicle_policy.md
  • liability_only_policy.md
  • motorcycle_policy.md





In [19]:
# Push every local claims file (images) to the claims container
print("\n🖼️ Uploading Claims Documents...")
print("=" * 50)

claims_results = uploader.upload_directory(
    directory_path=Config.CLAIMS_DIR,
    container_name=Config.CLAIMS_CONTAINER,
)

# Read the container back to show what is stored now
claims_blobs = uploader.list_blobs(container_name=Config.CLAIMS_CONTAINER)
print(f"\n📋 Claims in storage ({len(claims_blobs)} files):")
for blob in claims_blobs:
    print(f"  • {blob}")


🖼️ Uploading Claims Documents...
📤 Uploading 6 files from data\claims to claims...


Uploading files:  33%|███▎      | 2/6 [00:00<00:01,  3.03it/s]

✅ Uploaded: crash1.jpg → claims/crash1.jpg
✅ Uploaded: crash2.jpg → claims/crash2.jpg


Uploading files:  67%|██████▋   | 4/6 [00:01<00:00,  4.26it/s]

✅ Uploaded: crash3.jpg → claims/crash3.jpg
✅ Uploaded: crash4.jpeg → claims/crash4.jpeg


Uploading files:  83%|████████▎ | 5/6 [00:01<00:00,  3.26it/s]

✅ Uploaded: crash5.jpg → claims/crash5.jpg


Uploading files: 100%|██████████| 6/6 [00:01<00:00,  3.35it/s]

✅ Uploaded: invoice.png → claims/invoice.png

📊 Upload Summary: 6/6 files uploaded successfully

📋 Claims in storage (6 files):
  • crash1.jpg
  • crash2.jpg
  • crash3.jpg
  • crash4.jpeg
  • crash5.jpg
  • invoice.png





In [20]:
# Push every local statement file to the statements container
print("\n📄 Uploading Statements Documents...")
print("=" * 50)

statements_results = uploader.upload_directory(
    directory_path=Config.STATEMENTS_DIR,
    container_name=Config.STATEMENTS_CONTAINER,
)

# Read the container back to show what is stored now
statements_blobs = uploader.list_blobs(container_name=Config.STATEMENTS_CONTAINER)
print(f"\n📋 Statements in storage ({len(statements_blobs)} files):")
for blob in statements_blobs:
    print(f"  • {blob}")


📄 Uploading Statements Documents...
📤 Uploading 5 files from data\statements to statements...


Uploading files:  40%|████      | 2/5 [00:00<00:00,  6.44it/s]

✅ Uploaded: crash1.md → statements/crash1.md
✅ Uploaded: crash2.md → statements/crash2.md


Uploading files:  80%|████████  | 4/5 [00:00<00:00,  7.86it/s]

✅ Uploaded: crash3.md → statements/crash3.md
✅ Uploaded: crash4.md → statements/crash4.md


Uploading files: 100%|██████████| 5/5 [00:00<00:00,  5.15it/s]

✅ Uploaded: crash5.md → statements/crash5.md

📊 Upload Summary: 5/5 files uploaded successfully

📋 Statements in storage (5 files):
  • crash1.md
  • crash2.md
  • crash3.md
  • crash4.md
  • crash5.md





## 5. Document Processing with Azure OpenAI GPT-4o

In [21]:
class DocumentProcessor:
    """Turns uploaded blobs into records that are ready for embedding.

    Markdown blobs are passed through unchanged; image blobs are described by
    the chat-completions deployment named in Config.AZURE_OPENAI_DEPLOYMENT_NAME.
    """

    # File extension -> image subtype used in the data URL sent to the model
    _IMAGE_FORMATS = {".jpg": "jpeg", ".jpeg": "jpeg", ".png": "png"}

    def __init__(self, openai_client, blob_service_client, uploader=None):
        """Store the clients this processor works with.

        Args:
            openai_client: Client used for chat.completions.create().
            blob_service_client: Storage client used to download blobs.
            uploader: Object providing list_blobs() and upload_file(). When
                omitted, a DocumentUploader is built on blob_service_client, so
                the processor no longer depends on a notebook-global `uploader`.
        """
        self.openai_client = openai_client
        self.blob_service_client = blob_service_client
        self.uploader = uploader if uploader is not None else DocumentUploader(blob_service_client)

    def get_blob_content(self, container_name: str, blob_name: str) -> bytes:
        """Download blob content as bytes"""
        blob_client = self.blob_service_client.get_blob_client(
            container=container_name,
            blob=blob_name
        )
        return blob_client.download_blob().readall()

    def encode_image_to_base64(self, image_bytes: bytes) -> str:
        """Encode image bytes to base64 string"""
        return base64.b64encode(image_bytes).decode('utf-8')

    def process_markdown_for_vectorization(self, container_name: str, blob_name: str) -> Dict:
        """Process markdown file for direct vectorization (no GPT-4o processing)

        Returns:
            On success: {"success": True, "text": <decoded content>, "metadata": {...}}.
            On failure: {"success": False, "error": <message>, "metadata": {...}}.
        """
        try:
            print(f"📄 Preparing markdown for vectorization: {blob_name}...")

            # Download and decode content
            content = self.get_blob_content(container_name, blob_name).decode('utf-8')

            metadata = {
                "file_name": blob_name,
                "container": container_name,
                "file_type": "markdown",
                "text_length": len(content),
                "processing_date": pd.Timestamp.now().isoformat(),
                "processing_method": "direct_vectorization",
                "ready_for_embedding": True
            }

            return {
                "success": True,
                "text": content,  # Original markdown content for vectorization
                "metadata": metadata
            }

        except Exception as e:
            print(f"❌ Error processing {blob_name}: {e}")
            return {
                "success": False,
                "error": str(e),
                "metadata": {"file_name": blob_name, "container": container_name, "file_type": "markdown"}
            }

    def generate_image_description_with_gpt4o(self, container_name: str, blob_name: str) -> Dict:
        """Describe an image blob with the configured vision-capable deployment.

        The image is downloaded, base64-encoded and sent as a data URL together
        with an insurance-claims analysis prompt.

        Returns:
            On success: {"success": True, "description": <text>, "metadata": {...}}.
            On failure (including an empty model reply):
            {"success": False, "error": <message>, "metadata": {...}}.
        """
        try:
            print(f"🖼️ Generating description for image: {blob_name}...")

            # Download image content
            image_bytes = self.get_blob_content(container_name, blob_name)
            base64_image = self.encode_image_to_base64(image_bytes)

            # Determine image format from file extension (unknown -> jpeg)
            image_format = self._IMAGE_FORMATS.get(Path(blob_name).suffix.lower(), "jpeg")

            # Process with GPT-4o vision for description generation
            response = self.openai_client.chat.completions.create(
                model=Config.AZURE_OPENAI_DEPLOYMENT_NAME,
                messages=[
                    {
                        "role": "system",
                        "content": """You are an expert insurance claims analyst with advanced image analysis capabilities. 
                        Your task is to provide detailed, professional descriptions of insurance-related images, particularly vehicle damage and accident scenes.
                        
                        Focus on:
                        - Type of vehicle and visible damage
                        - Location and extent of damage (scratches, dents, broken parts, etc.)
                        - Environmental context (road conditions, weather signs, location type)
                        - Any visible people, other vehicles, or relevant objects
                        - Overall severity assessment
                        - Any safety concerns or hazards visible
                        
                        Provide clear, objective descriptions that would be useful for insurance claim processing and risk assessment."""
                    },
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": "Please provide a detailed description of this insurance claim image. Focus on damage assessment, environmental factors, and any relevant details for insurance processing."
                            },
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/{image_format};base64,{base64_image}"
                                }
                            }
                        ]
                    }
                ],
                max_tokens=4000,
                temperature=0.3  # Slightly higher for more descriptive language
            )
            description = response.choices[0].message.content
            if not description:
                # Report a clear reason instead of letting len(None) fail below
                raise ValueError("Model returned no description for this image")

            metadata = {
                "file_name": blob_name,
                "container": container_name,
                "file_type": "image",
                "image_format": image_format,
                "image_size_bytes": len(image_bytes),
                "description_length": len(description),
                "processing_date": pd.Timestamp.now().isoformat(),
                "model_used": Config.AZURE_OPENAI_DEPLOYMENT_NAME,
                "processing_type": "image_description",
                "ready_for_embedding": True
            }

            return {
                "success": True,
                "description": description,  # Changed from "text" to "description"
                "metadata": metadata
            }

        except Exception as e:
            print(f"❌ Error processing {blob_name}: {e}")
            return {
                "success": False,
                "error": str(e),
                "metadata": {"file_name": blob_name, "container": container_name, "file_type": "image"}
            }

    def _process_container(self, container_name: str, extensions: tuple, handler,
                           progress_label: str, skipped_kind: str) -> List[Dict]:
        """Run handler on each blob in a container whose name ends with one of extensions."""
        processed = []
        for blob_name in tqdm(self.uploader.list_blobs(container_name), desc=progress_label):
            # Case-insensitive match, so ".MD" / ".JPG" are treated like lower case
            if blob_name.lower().endswith(extensions):
                processed.append(handler(container_name, blob_name))
            else:
                print(f"⚠️ Skipping non-{skipped_kind} file: {blob_name}")
        return processed

    def process_all_documents(self) -> Dict[str, List[Dict]]:
        """Process documents: prepare markdown for vectorization, generate descriptions for images

        Returns:
            {"policies": [...], "claims": [...], "statements": [...]}, each a
            list of per-blob result dictionaries.
        """
        results = {
            "policies": [],
            "claims": [],
            "statements": []  # Added statements to results
        }

        # Process policy documents (markdown files) - prepare for vectorization only
        print("📄 Preparing Policy Documents for Vectorization...")
        print("=" * 50)
        results["policies"] = self._process_container(
            Config.POLICIES_CONTAINER, (".md",),
            self.process_markdown_for_vectorization, "Preparing policies", "markdown"
        )

        # Process statements documents (markdown files) - prepare for vectorization only
        print("\n📄 Preparing Statements Documents for Vectorization...")
        print("=" * 50)
        results["statements"] = self._process_container(
            Config.STATEMENTS_CONTAINER, (".md",),
            self.process_markdown_for_vectorization, "Preparing statements", "markdown"
        )

        # Process claims documents (images) - generate descriptions with GPT-4o Vision
        print("\n🖼️ Generating Image Descriptions with GPT-4o Vision...")
        print("=" * 50)
        results["claims"] = self._process_container(
            Config.CLAIMS_CONTAINER, tuple(self._IMAGE_FORMATS),
            self.generate_image_description_with_gpt4o, "Generating descriptions", "image"
        )

        return results

    def save_processed_results(self, results: Dict, output_file: str = "processed_documents_for_vectorization.json"):
        """Save processed results to JSON file and upload to blob storage

        Errors are printed rather than raised; nothing is returned.
        """
        try:
            # Save locally
            with open(output_file, "w", encoding="utf-8") as f:
                json.dump(results, f, indent=2, ensure_ascii=False)

            print(f"💾 Results saved locally: {output_file}")

            # Upload to blob storage under the same name
            success = self.uploader.upload_file(
                Path(output_file),
                Config.PROCESSED_CONTAINER,
                output_file
            )

            if success:
                print(f"☁️ Results uploaded to blob storage: {Config.PROCESSED_CONTAINER}/{output_file}")

        except Exception as e:
            print(f"❌ Error saving results: {e}")

## 6. Process All Documents with GPT-4o

In [22]:
# Build the processor and run it over every uploaded document
if not (openai_client and blob_service_client):
    # Either client failed to initialize in section 2
    print("❌ Cannot initialize processor - missing clients")
else:
    processor = DocumentProcessor(openai_client, blob_service_client)
    print("✅ Document processor initialized with GPT-4o!")

    # Markdown is prepared as-is; images are described by the model
    print("\n🚀 Starting document processing with GPT-4o...")
    print("=" * 60)

    processing_results = processor.process_all_documents()

    print("\n✅ Document processing completed!")

✅ Document processor initialized with GPT-4o!

🚀 Starting document processing with GPT-4o...
📄 Preparing Policy Documents for Vectorization...


Preparing policies:   0%|          | 0/5 [00:00<?, ?it/s]

📄 Preparing markdown for vectorization: commercial_auto_policy.md...


Preparing policies:  40%|████      | 2/5 [00:00<00:00, 13.13it/s]

📄 Preparing markdown for vectorization: comprehensive_auto_policy.md...
📄 Preparing markdown for vectorization: high_value_vehicle_policy.md...
📄 Preparing markdown for vectorization: liability_only_policy.md...


Preparing policies: 100%|██████████| 5/5 [00:00<00:00, 12.91it/s]

📄 Preparing markdown for vectorization: motorcycle_policy.md...

📄 Preparing Statements Documents for Vectorization...



Preparing statements:  40%|████      | 2/5 [00:00<00:00, 13.29it/s]

📄 Preparing markdown for vectorization: crash1.md...
📄 Preparing markdown for vectorization: crash2.md...
📄 Preparing markdown for vectorization: crash3.md...


Preparing statements: 100%|██████████| 5/5 [00:00<00:00, 12.75it/s]

📄 Preparing markdown for vectorization: crash4.md...
📄 Preparing markdown for vectorization: crash5.md...

🖼️ Generating Image Descriptions with GPT-4o Vision...



Generating descriptions:   0%|          | 0/6 [00:00<?, ?it/s]

🖼️ Generating description for image: crash1.jpg...


Generating descriptions:  17%|█▋        | 1/6 [00:06<00:32,  6.41s/it]

🖼️ Generating description for image: crash2.jpg...


Generating descriptions:  33%|███▎      | 2/6 [00:13<00:27,  6.79s/it]

🖼️ Generating description for image: crash3.jpg...


Generating descriptions:  50%|█████     | 3/6 [00:19<00:19,  6.36s/it]

🖼️ Generating description for image: crash4.jpeg...


Generating descriptions:  67%|██████▋   | 4/6 [00:28<00:14,  7.31s/it]

🖼️ Generating description for image: crash5.jpg...


Generating descriptions:  83%|████████▎ | 5/6 [00:35<00:07,  7.19s/it]

🖼️ Generating description for image: invoice.png...


Generating descriptions: 100%|██████████| 6/6 [00:39<00:00,  6.65s/it]


✅ Document processing completed!





## 7. Results Analysis and Summary

In [23]:
# Save processing results
processor.save_processed_results(processing_results)

# Create a summary report. Successful files are collected for every category
# in processing_results (policies, claims and statements), not a fixed subset.
report = {
    "processing_date": pd.Timestamp.now().isoformat(),
    "model_used": Config.AZURE_OPENAI_DEPLOYMENT_NAME,
    "processing_method": "gpt-4o_multimodal",
    "successful_files": {
        category: [r["metadata"]["file_name"] for r in records if r["success"]]
        for category, records in processing_results.items()
    }
}

summary_file = "processing_summary_gpt4o.json"
with open(summary_file, "w", encoding="utf-8") as f:
    json.dump(report, f, indent=2)

# Upload summary to blob storage; upload_file returns False on failure, so only
# claim success when the upload actually went through.
if uploader.upload_file(Path(summary_file), Config.PROCESSED_CONTAINER):
    print("\n💾 Summary report saved and uploaded!")
else:
    print("\n⚠️ Summary report saved locally, but the upload to blob storage failed")

💾 Results saved locally: processed_documents_for_vectorization.json
✅ Uploaded: processed_documents_for_vectorization.json → processed-documents/processed_documents_for_vectorization.json
☁️ Results uploaded to blob storage: processed-documents/processed_documents_for_vectorization.json
✅ Uploaded: processing_summary_gpt4o.json → processed-documents/processing_summary_gpt4o.json

💾 Summary report saved and uploaded!
