# Document Processing with Azure AI Foundry

This notebook handles:
1. **Document Upload** - Upload policy and claims documents to Azure Blob Storage
2. **Text Processing** - Process .md files using AI Foundry GPT-4o-mini
3. **OCR Processing** - Extract text from images using AI Foundry vision capabilities
4. **Text Enhancement** - Clean and prepare documents for vectorization

## Prerequisites
- Azure Blob Storage account created
- Azure AI Foundry service with GPT-4o-mini model deployed
- Environment variables configured in `.env` file

In [84]:
import os
import json
import base64
from pathlib import Path
from typing import Dict, List, Optional
import pandas as pd
from tqdm import tqdm

# Azure SDK imports
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from azure.core.exceptions import ResourceExistsError, ResourceNotFoundError
from azure.identity import DefaultAzureCredential

# Azure AI Foundry imports

from azure.core.credentials import AzureKeyCredential

# Load environment variables
from dotenv import load_dotenv
load_dotenv()

print("✅ All imports successful!")

✅ All imports successful!


In [85]:
class Config:
    # Storage configuration - Clean up environment variables
    AZURE_STORAGE_CONNECTION_STRING = os.getenv('AZURE_STORAGE_CONNECTION_STRING', '').strip('"')
    AZURE_STORAGE_ACCOUNT_NAME = os.getenv('AZURE_STORAGE_ACCOUNT_NAME', '').strip('"').strip()
    AZURE_STORAGE_ACCOUNT_KEY = os.getenv('AZURE_STORAGE_ACCOUNT_KEY', '').strip('"')
    
    # Azure AI Foundry configuration
    AI_FOUNDRY_ENDPOINT = os.getenv('AI_FOUNDRY_ENDPOINT', '').strip('"')
    AI_FOUNDRY_KEY = os.getenv('AI_FOUNDRY_KEY', '').strip('"')
    AI_FOUNDRY_HUB_NAME = os.getenv('AI_FOUNDRY_HUB_NAME', '').strip('"')
    
    # Model deployment names in AI Foundry
    CHAT_MODEL_DEPLOYMENT = 'gpt-4o-mini'  # This should match your deployment name
    EMBEDDING_MODEL_DEPLOYMENT = 'text-embedding-ada-002'  # This should match your deployment name
    
    # Container names
    POLICIES_CONTAINER = 'policies'
    CLAIMS_CONTAINER = 'claims'
    PROCESSED_CONTAINER = 'processed-documents'
    
    # Local data paths
    DATA_DIR = Path('data')
    POLICIES_DIR = DATA_DIR / 'policies'
    CLAIMS_DIR = DATA_DIR / 'claims'

In [86]:
# Clean and validate configuration
def clean_config():
    """Clean and validate configuration values"""
    # Remove any extra quotes or whitespace
    for attr_name in dir(Config):
        if not attr_name.startswith('_') and isinstance(getattr(Config, attr_name), str):
            value = getattr(Config, attr_name).strip().strip('"').strip("'")
            setattr(Config, attr_name, value)

clean_config()

# Validate configuration
required_vars = [
    ('AZURE_STORAGE_CONNECTION_STRING', Config.AZURE_STORAGE_CONNECTION_STRING),
    ('AI_FOUNDRY_ENDPOINT', Config.AI_FOUNDRY_ENDPOINT),
    ('AI_FOUNDRY_KEY', Config.AI_FOUNDRY_KEY)
]

missing_vars = []
for var_name, var_value in required_vars:
    if not var_value or var_value == '':
        missing_vars.append(var_name)

if missing_vars:
    print("❌ Missing environment variables. Please check your .env file.")
    print("Missing variables:")
    for var in missing_vars:
        print(f"  - {var}")
else:
    print("✅ Configuration loaded successfully!")
    print(f"📁 Policies directory: {Config.POLICIES_DIR}")
    print(f"📁 Claims directory: {Config.CLAIMS_DIR}")
    print(f"🤖 AI Foundry Hub: {Config.AI_FOUNDRY_HUB_NAME}")
    print(f"🔗 AI Foundry Endpoint: {Config.AI_FOUNDRY_ENDPOINT}")

✅ Configuration loaded successfully!
📁 Policies directory: data\policies
📁 Claims directory: data\claims
🤖 AI Foundry Hub: msagthack-aifoundry-wuwcap2zryvge
🔗 AI Foundry Endpoint: https://msagthack-aifoundry-wuwcap2zryvge.cognitiveservices.azure.com/


In [None]:
from azure.ai.inference import ChatCompletionsClient


# Initialize Azure clients with improved error handling
def initialize_clients():
    """Initialize Azure service clients with comprehensive error handling"""
    try:
        print("🔄 Initializing Azure clients...")
        
        # Initialize Blob Storage client with multiple fallback methods
        blob_service_client = None
        
        # Method 1: Try connection string
        if Config.AZURE_STORAGE_CONNECTION_STRING:
            try:
                print("🔑 Trying connection string authentication...")
                blob_service_client = BlobServiceClient.from_connection_string(
                    Config.AZURE_STORAGE_CONNECTION_STRING
                )
                # Test the connection
                account_info = blob_service_client.get_account_information()
                print("✅ Storage client initialized with connection string!")
            except Exception as e:
                print(f"❌ Connection string failed: {e}")
                blob_service_client = None
        
        # Method 2: Try account name and key
        if not blob_service_client and Config.AZURE_STORAGE_ACCOUNT_NAME and Config.AZURE_STORAGE_ACCOUNT_KEY:
            try:
                print("🔑 Trying account name and key authentication...")
                blob_service_client = BlobServiceClient(
                    account_url=f"https://{Config.AZURE_STORAGE_ACCOUNT_NAME}.blob.core.windows.net",
                    credential=Config.AZURE_STORAGE_ACCOUNT_KEY
                )
                # Test the connection
                account_info = blob_service_client.get_account_information()
                print("✅ Storage client initialized with account key!")
            except Exception as e:
                print(f"❌ Account key failed: {e}")
                blob_service_client = None
        
        # Method 3: Try default credentials (if managed identity is available)
        if not blob_service_client and Config.AZURE_STORAGE_ACCOUNT_NAME:
            try:
                print("🔑 Trying default Azure credentials...")
                credential = DefaultAzureCredential()
                blob_service_client = BlobServiceClient(
                    account_url=f"https://{Config.AZURE_STORAGE_ACCOUNT_NAME}.blob.core.windows.net",
                    credential=credential
                )
                # Test the connection
                account_info = blob_service_client.get_account_information()
                print("✅ Storage client initialized with default credentials!")
            except Exception as e:
                print(f"❌ Default credentials failed: {e}")
                blob_service_client = None
        
        if not blob_service_client:
            print("❌ Failed to initialize storage client with any method")
        
        # Initialize AI Foundry client
        ai_client = None
        if Config.AI_FOUNDRY_ENDPOINT and Config.AI_FOUNDRY_KEY:
            try:
                print("🤖 Initializing AI Foundry client...")
                ai_client = ChatCompletionsClient(
                    endpoint=Config.AI_FOUNDRY_ENDPOINT,
                    credential=AzureKeyCredential(Config.AI_FOUNDRY_KEY)
                )
                print("✅ AI Foundry client initialized successfully!")
            except Exception as e:
                print(f"❌ Error initializing AI Foundry client: {e}")
                ai_client = None
        
        return blob_service_client, ai_client
        
    except Exception as e:
        print(f"❌ Error during client initialization: {e}")
        return None, None

blob_service_client, ai_client = initialize_clients()

🔄 Initializing Azure clients...
🔑 Trying connection string authentication...
❌ Connection string failed: <urllib3.connection.HTTPSConnection object at 0x00000191E55DE990>: Failed to resolve 'msagthacksawuwcap2zryvge.blob.core.windows.net' ([Errno 11001] getaddrinfo failed)
🔑 Trying account name and key authentication...


In [None]:
# Enhanced container creation with better error handling
def create_containers_enhanced(blob_service_client):
    """Create blob storage containers with enhanced error handling"""
    
    if not blob_service_client:
        print("❌ No blob service client available")
        return False
    
    # First, test the connection
    try:
        print("🔍 Testing storage account connection...")
        account_info = blob_service_client.get_account_information()
        print(f"✅ Connected to storage account successfully")
        print(f"   Account kind: {account_info.get('account_kind', 'Unknown')}")
        print(f"   SKU name: {account_info.get('sku_name', 'Unknown')}")
    except Exception as e:
        print(f"❌ Failed to connect to storage account: {e}")
        return False
    
    # Test listing existing containers
    try:
        print("\n🔍 Checking existing containers...")
        existing_containers = []
        for container in blob_service_client.list_containers():
            existing_containers.append(container.name)
        print(f"✅ Found {len(existing_containers)} existing containers: {existing_containers}")
    except Exception as e:
        print(f"❌ Failed to list containers: {e}")
        print("   This might indicate insufficient permissions")
    
    # Try to create containers
    containers = [
        Config.POLICIES_CONTAINER,
        Config.CLAIMS_CONTAINER,
        Config.PROCESSED_CONTAINER
    ]
    
    created_containers = []
    failed_containers = []
    
    for container_name in containers:
        try:
            # Check if container already exists first
            container_client = blob_service_client.get_container_client(container_name)
            
            try:
                # Try to get container properties (this will fail if it doesn't exist)
                properties = container_client.get_container_properties()
                print(f"ℹ️ Container '{container_name}' already exists")
                created_containers.append(container_name)
                continue
            except ResourceNotFoundError:
                # Container doesn't exist, try to create it
                pass
            except Exception as e:
                # Other error, container might exist but we can't access it
                print(f"⚠️ Container '{container_name}' exists but access check failed: {e}")
                created_containers.append(container_name)
                continue
            
            # Create the container
            print(f"🔨 Creating container '{container_name}'...")
            container_client.create_container()
            print(f"✅ Container '{container_name}' created successfully")
            created_containers.append(container_name)
            
        except ResourceExistsError:
            print(f"ℹ️ Container '{container_name}' already exists")
            created_containers.append(container_name)
        except Exception as e:
            print(f"❌ Error with container '{container_name}': {e}")
            failed_containers.append((container_name, str(e)))
            
            # Additional diagnostics for authorization errors
            if "AuthorizationFailure" in str(e) or "Forbidden" in str(e):
                print(f"   🔍 Authorization issue detected for '{container_name}'")
                print(f"   This could be due to:")
                print(f"   - Storage account access keys disabled")
                print(f"   - Network access restrictions")
                print(f"   - Storage account permissions")
                print(f"   - Try using Azure CLI: az storage container create --name {container_name} --account-name {Config.AZURE_STORAGE_ACCOUNT_NAME}")
    
    print(f"\n📊 Container Creation Summary:")
    print(f"   ✅ Successful: {len(created_containers)} - {created_containers}")
    print(f"   ❌ Failed: {len(failed_containers)} - {[name for name, _ in failed_containers]}")
    
    return len(failed_containers) == 0

# Run the enhanced container creation
if blob_service_client:
    print("🚀 Running enhanced container creation...")
    success = create_containers_enhanced(blob_service_client)
    
    if not success:
        print("\n📝 CONTAINER CREATION SUGGESTIONS")
        print("=" * 50)
        print("If container creation failed, try these solutions:")
        print("\n1. **Azure Portal Method:**")
        print("   - Go to https://portal.azure.com")
        print(f"   - Navigate to storage account: {Config.AZURE_STORAGE_ACCOUNT_NAME}")
        print("   - Go to 'Containers' in the left menu")
        print("   - Click '+ Container' and create these containers:")
        print("     • policies")
        print("     • claims") 
        print("     • processed-documents")
        print("   - Set 'Public access level' to 'Private' for all containers")
        
        print("\n2. **Azure CLI Method:**")
        print("   Run these commands in Azure CLI:")
        for container in [Config.POLICIES_CONTAINER, Config.CLAIMS_CONTAINER, Config.PROCESSED_CONTAINER]:
            print(f"   az storage container create --name {container} --account-name {Config.AZURE_STORAGE_ACCOUNT_NAME} --account-key {Config.AZURE_STORAGE_ACCOUNT_KEY}")
else:
    print("❌ No blob service client available - please check your configuration")

🚀 Running enhanced container creation...
🔍 Testing storage account connection...
✅ Connected to storage account successfully
   Account kind: StorageV2
   SKU name: Standard_RAGRS

🔍 Checking existing containers...
✅ Found 3 existing containers: ['claims', 'policies', 'processed-documents']
ℹ️ Container 'policies' already exists
ℹ️ Container 'claims' already exists
ℹ️ Container 'processed-documents' already exists

📊 Container Creation Summary:
   ✅ Successful: 3 - ['policies', 'claims', 'processed-documents']
   ❌ Failed: 0 - []


## 3. Document Upload Functions

In [None]:
class DocumentUploader:
    def __init__(self, blob_service_client):
        self.blob_service_client = blob_service_client
    
    def upload_file(self, file_path: Path, container_name: str, blob_name: str = None) -> bool:
        """Upload a single file to blob storage"""
        if not self.blob_service_client:
            print("❌ No blob service client available")
            return False
            
        if blob_name is None:
            blob_name = file_path.name
            
        try:
            blob_client = self.blob_service_client.get_blob_client(
                container=container_name, 
                blob=blob_name
            )
            
            with open(file_path, 'rb') as data:
                blob_client.upload_blob(data, overwrite=True)
            
            print(f"✅ Uploaded: {file_path.name} → {container_name}/{blob_name}")
            return True
            
        except Exception as e:
            print(f"❌ Error uploading {file_path.name}: {e}")
            return False
    
    def upload_directory(self, directory_path: Path, container_name: str) -> Dict[str, bool]:
        """Upload all files from a directory to blob storage"""
        results = {}
        
        if not directory_path.exists():
            print(f"❌ Directory not found: {directory_path}")
            return results
        
        files = list(directory_path.glob('*'))
        if not files:
            print(f"ℹ️ No files found in {directory_path}")
            return results
        
        print(f"📤 Uploading {len(files)} files from {directory_path} to {container_name}...")
        
        for file_path in tqdm(files, desc="Uploading files"):
            if file_path.is_file():
                success = self.upload_file(file_path, container_name)
                results[file_path.name] = success
        
        successful_uploads = sum(results.values())
        print(f"\n📊 Upload Summary: {successful_uploads}/{len(results)} files uploaded successfully")
        
        return results
    
    def list_blobs(self, container_name: str) -> List[str]:
        """List all blobs in a container"""
        if not self.blob_service_client:
            print("❌ No blob service client available")
            return []
            
        try:
            container_client = self.blob_service_client.get_container_client(container_name)
            blob_list = container_client.list_blobs()
            return [blob.name for blob in blob_list]
        except Exception as e:
            print(f"❌ Error listing blobs in {container_name}: {e}")
            return []

# Initialize uploader
uploader = None
if blob_service_client:
    uploader = DocumentUploader(blob_service_client)
    print("✅ Document uploader initialized!")
else:
    print("❌ Cannot initialize uploader - no blob service client")

✅ Document uploader initialized!


## 4. Upload Documents to Blob Storage

In [None]:
# Upload policy documents
if uploader:
    print("📄 Uploading Policy Documents...")
    print("=" * 50)

    policy_results = uploader.upload_directory(Config.POLICIES_DIR, Config.POLICIES_CONTAINER)

    # Show uploaded policies
    policy_blobs = uploader.list_blobs(Config.POLICIES_CONTAINER)
    print(f"\n📋 Policies in storage ({len(policy_blobs)} files):")
    for blob in policy_blobs:
        print(f"  • {blob}")
else:
    print("❌ Cannot upload policies - no uploader available")

📄 Uploading Policy Documents...
📤 Uploading 5 files from data\policies to policies...


Uploading files:   0%|          | 0/5 [00:00<?, ?it/s]

✅ Uploaded: commercial_auto_policy.md → policies/commercial_auto_policy.md


Uploading files:  40%|████      | 2/5 [00:00<00:00, 11.04it/s]

✅ Uploaded: comprehensive_auto_policy.md → policies/comprehensive_auto_policy.md
✅ Uploaded: high_value_vehicle_policy.md → policies/high_value_vehicle_policy.md


Uploading files:  80%|████████  | 4/5 [00:00<00:00, 11.42it/s]

✅ Uploaded: liability_only_policy.md → policies/liability_only_policy.md


Uploading files: 100%|██████████| 5/5 [00:00<00:00, 11.33it/s]

✅ Uploaded: motorcycle_policy.md → policies/motorcycle_policy.md

📊 Upload Summary: 5/5 files uploaded successfully






📋 Policies in storage (5 files):
  • commercial_auto_policy.md
  • comprehensive_auto_policy.md
  • high_value_vehicle_policy.md
  • liability_only_policy.md
  • motorcycle_policy.md


In [None]:
# Upload claims documents
if uploader:
    print("\n🖼️ Uploading Claims Documents...")
    print("=" * 50)

    claims_results = uploader.upload_directory(Config.CLAIMS_DIR, Config.CLAIMS_CONTAINER)

    # Show uploaded claims
    claims_blobs = uploader.list_blobs(Config.CLAIMS_CONTAINER)
    print(f"\n📋 Claims in storage ({len(claims_blobs)} files):")
    for blob in claims_blobs:
        print(f"  • {blob}")
else:
    print("❌ Cannot upload claims - no uploader available")


🖼️ Uploading Claims Documents...
📤 Uploading 6 files from data\claims to claims...


Uploading files:  50%|█████     | 3/6 [00:00<00:00,  7.71it/s]

✅ Uploaded: crash1.jpg → claims/crash1.jpg
✅ Uploaded: crash2.jpg → claims/crash2.jpg
✅ Uploaded: crash3.jpg → claims/crash3.jpg


Uploading files: 100%|██████████| 6/6 [00:00<00:00,  7.50it/s]

✅ Uploaded: crash4.jpeg → claims/crash4.jpeg
✅ Uploaded: crash5.jpg → claims/crash5.jpg
✅ Uploaded: invoice.png → claims/invoice.png

📊 Upload Summary: 6/6 files uploaded successfully






📋 Claims in storage (6 files):
  • crash1.jpg
  • crash2.jpg
  • crash3.jpg
  • crash4.jpeg
  • crash5.jpg
  • invoice.png


## 5. Document Processing with Azure AI Foundry

## 5A. Image Processing with Azure AI Foundry Vision
Process .jpg files from claims container using multimodal GPT-4o

In [None]:
class ImageProcessor:
    def __init__(self, ai_client, blob_service_client):
        self.ai_client = ai_client
        self.blob_service_client = blob_service_client
    
    def get_blob_content(self, container_name: str, blob_name: str) -> bytes:
        """Download blob content as bytes"""
        if not self.blob_service_client:
            raise Exception("No blob service client available")
            
        blob_client = self.blob_service_client.get_blob_client(
            container=container_name, 
            blob=blob_name
        )
        blob_data = blob_client.download_blob()
        return blob_data.readall()
    
    def encode_image_to_base64(self, image_bytes: bytes) -> str:
        """Encode image bytes to base64 string"""
        return base64.b64encode(image_bytes).decode('utf-8')
    
    def extract_text_from_image(self, container_name: str, blob_name: str) -> Dict:
        """Extract text from image using AI Foundry GPT-4o vision capabilities"""
        try:
            print(f"🖼️ Processing image: {blob_name}...")
            
            # Download image content
            image_bytes = self.get_blob_content(container_name, blob_name)
            base64_image = self.encode_image_to_base64(image_bytes)
            # Determine image format from file extension
            file_extension = Path(blob_name).suffix.lower()
            if file_extension == ".jpg" or file_extension == ".jpeg":
                image_format = "jpeg"
            elif file_extension == ".png":
                image_format = "png"
            else:
                image_format = "jpeg"  # default
            
            # Process with AI Foundry GPT-4o vision
            from azure.ai.inference.models import SystemMessage, UserMessage, ImageContentItem, TextContentItem
            
            messages = [
                SystemMessage(content="You are an expert insurance document analyzer. Extract ALL visible text from this insurance claim image with high accuracy. Structure the extracted information into clear categories: claim details, dates, amounts, policy numbers, damage descriptions, and any other relevant information. Preserve formatting and be thorough."),
                UserMessage(content=[
                    TextContentItem(text="Please extract and structure all text from this insurance claim image. Focus on claim numbers, dates, amounts, policy details, and damage descriptions."),
                    ImageContentItem(image_url=f"data:image/{image_format};base64,{base64_image}")
                ])
            ]
            
            response = self.ai_client.complete(
                model=Config.CHAT_MODEL_DEPLOYMENT,  # This should be gpt-4o-mini
                messages=messages,
                max_tokens=4000,
                temperature=0.1
            )
            
            extracted_text = response.choices[0].message.content
            
            metadata = {
                "file_name": blob_name,
                "container": container_name,
                "file_type": "image",
                "image_format": image_format,
                "image_size_bytes": len(image_bytes),
                "text_length": len(extracted_text),
                "processing_date": pd.Timestamp.now().isoformat(),
                "model_used": Config.CHAT_MODEL_DEPLOYMENT,
                "processing_type": "image_ocr"
            }
                        
            return {
                "success": True,
                "text": extracted_text,
                "metadata": metadata
            }
        except Exception as e:
            print(f"❌ Error processing image {blob_name}: {e}")
            return {
                "success": False,
                "error": str(e),
                "metadata": {"file_name": blob_name, "container": container_name, "file_type": "image"}
            }
    
    def process_claim_images(self) -> List[Dict]:
        """Process all .jpg images in claims container"""
        results = []
        
        if not uploader:
            print("❌ No uploader available - cannot process images")
            return results
        
        print("🖼️ Processing Claim Images with AI Foundry GPT-4o Vision...")
        print("=" * 60)
        
        claims_blobs = uploader.list_blobs(Config.CLAIMS_CONTAINER)
        image_blobs = [blob for blob in claims_blobs if blob.lower().endswith(('.jpg', '.jpeg'))]
        
        if not image_blobs:
            print("ℹ️ No .jpg images found in claims container")
            return results
        
        for blob_name in tqdm(image_blobs, desc="Processing claim images"):
            result = self.extract_text_from_image(Config.CLAIMS_CONTAINER, blob_name)
            results.append(result)
        
        print(f"\n✅ Processed {len(image_blobs)} claim images")
        return results

In [None]:
# Initialize image processor
image_processor = None
if ai_client and blob_service_client:
    image_processor = ImageProcessor(ai_client, blob_service_client)
    print("✅ Image processor initialized!")
else:
    print("❌ Cannot initialize image processor - missing clients")

✅ Image processor initialized!


## 5B. Text Document Processing
Process .md files from policies container as text documents

In [None]:
class TextProcessor:
    def __init__(self, blob_service_client):
        self.blob_service_client = blob_service_client
    
    def get_blob_content(self, container_name: str, blob_name: str) -> bytes:
        """Download blob content as bytes"""
        if not self.blob_service_client:
            raise Exception("No blob service client available")
            
        blob_client = self.blob_service_client.get_blob_client(
            container=container_name, 
            blob=blob_name
        )
        blob_data = blob_client.download_blob()
        return blob_data.readall()
    
    def process_text_document(self, container_name: str, blob_name: str) -> Dict:
        """Process text document (markdown) for direct vectorization"""
        try:
            print(f"📄 Processing text document: {blob_name}...")
            
            # Download and decode content
            blob_content = self.get_blob_content(container_name, blob_name)
            content = blob_content.decode('utf-8')
            
            # Clean and prepare text for vectorization
            # Remove excessive whitespace and normalize
            cleaned_text = ' '.join(content.split())
            
            metadata = {
                "file_name": blob_name,
                "container": container_name,
                "file_type": "markdown",
                "original_length": len(content),
                "cleaned_length": len(cleaned_text),
                "processing_date": pd.Timestamp.now().isoformat(),
                "processing_type": "text_cleaning"
            }
            
            return {
                "success": True,
                "text": cleaned_text,
                "original_text": content,
                "metadata": metadata
            }
            
        except Exception as e:
            print(f"❌ Error processing text document {blob_name}: {e}")
            return {
                "success": False,
                "error": str(e),
                "metadata": {"file_name": blob_name, "container": container_name, "file_type": "markdown"}
            }
    
    def process_policy_documents(self) -> List[Dict]:
        """Process all .md documents in policies container"""
        results = []
        
        if not uploader:
            print("❌ No uploader available - cannot process documents")
            return results
        
        print("📄 Processing Policy Documents as Text...")
        print("=" * 60)
        
        policy_blobs = uploader.list_blobs(Config.POLICIES_CONTAINER)
        text_blobs = [blob for blob in policy_blobs if blob.lower().endswith('.md')]
        
        if not text_blobs:
            print("ℹ️ No .md files found in policies container")
            return results
        
        for blob_name in tqdm(text_blobs, desc="Processing policy documents"):
            result = self.process_text_document(Config.POLICIES_CONTAINER, blob_name)
            results.append(result)
        
        print(f"\n✅ Processed {len(text_blobs)} policy documents")
        return results

# Initialize text processor
text_processor = None
if blob_service_client:
    text_processor = TextProcessor(blob_service_client)
    print("✅ Text processor initialized!")
else:
    print("❌ Cannot initialize text processor - missing blob client")

✅ Text processor initialized!


## 5C. Azure AI Search Integration
Set up Azure AI Search for vectorization using text-embedding-ada-002

In [None]:
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.models import VectorizedQuery
from azure.search.documents.indexes.models import (
    SearchIndex,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
    VectorSearch,
    VectorSearchProfile,
    HnswAlgorithmConfiguration,
)
from azure.core.credentials import AzureKeyCredential as SearchKeyCredential

class VectorSearchManager:
    def __init__(self, ai_client):
        self.ai_client = ai_client
        self.search_endpoint = os.getenv('SEARCH_SERVICE_ENDPOINT', '').strip('"')
        self.search_key = os.getenv('SEARCH_ADMIN_KEY', '').strip('"')
        
        if not self.search_endpoint or not self.search_key:
            raise Exception("Missing Azure AI Search configuration")
        
        self.search_credential = SearchKeyCredential(self.search_key)
        self.index_client = SearchIndexClient(
            endpoint=self.search_endpoint,
            credential=self.search_credential
        )
        
        # Index names
        self.policies_index_name = "insurance-policies-index"
        self.claims_index_name = "insurance-claims-index"
    
    def generate_embeddings(self, text: str) -> List[float]:
        """Generate embeddings using text-embedding-ada-002"""
        try:
            from azure.ai.inference.models import EmbeddingsInput
            
            response = self.ai_client.embed(
                model=Config.EMBEDDING_MODEL_DEPLOYMENT,  # text-embedding-ada-002
                input=[text]
            )
            
            return response.data[0].embedding
        except Exception as e:
            print(f"❌ Error generating embeddings: {e}")
            return []
    
    def create_search_index(self, index_name: str, index_type: str):
        """Create search index for documents"""
        try:
            print(f"🔍 Creating search index: {index_name}...")
            
            # Define fields based on index type
            if index_type == "policies":
                fields = [
                    SimpleField(name="id", type=SearchFieldDataType.String, key=True),
                    SearchableField(name="title", type=SearchFieldDataType.String),
                    SearchableField(name="content", type=SearchFieldDataType.String),
                    SimpleField(name="file_name", type=SearchFieldDataType.String),
                    SimpleField(name="container", type=SearchFieldDataType.String),
                    SimpleField(name="file_type", type=SearchFieldDataType.String),
                    SimpleField(name="processing_date", type=SearchFieldDataType.DateTimeOffset),
                    SearchField(
                        name="content_vector",
                        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                        searchable=True,
                        vector_search_dimensions=1536,  # Ada-002 embedding size
                        vector_search_profile_name="my-vector-config"
                    )
                ]
            else:  # claims
                fields = [
                    SimpleField(name="id", type=SearchFieldDataType.String, key=True),
                    SearchableField(name="title", type=SearchFieldDataType.String),
                    SearchableField(name="content", type=SearchFieldDataType.String),
                    SimpleField(name="file_name", type=SearchFieldDataType.String),
                    SimpleField(name="container", type=SearchFieldDataType.String),
                    SimpleField(name="file_type", type=SearchFieldDataType.String),
                    SimpleField(name="image_format", type=SearchFieldDataType.String),
                    SimpleField(name="processing_date", type=SearchFieldDataType.DateTimeOffset),
                    SearchField(
                        name="content_vector",
                        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                        searchable=True,
                        vector_search_dimensions=1536,  # Ada-002 embedding size
                        vector_search_profile_name="my-vector-config"
                    )
                ]
            
            # Configure vector search
            vector_search = VectorSearch(
                profiles=[
                    VectorSearchProfile(
                        name="my-vector-config",
                        algorithm_configuration_name="my-hnsw"
                    )
                ],
                algorithms=[
                    HnswAlgorithmConfiguration(name="my-hnsw")
                ]
            )
            
            # Create the search index
            index = SearchIndex(
                name=index_name,
                fields=fields,
                vector_search=vector_search
            )
            
            result = self.index_client.create_or_update_index(index)
            print(f"✅ Search index '{index_name}' created successfully")
            return True
            
        except Exception as e:
            print(f"❌ Error creating search index: {e}")
            return False
    
    def upload_documents_to_search(self, documents: List[Dict], index_name: str):
        """Upload documents with embeddings to Azure AI Search"""
        try:
            print(f"📤 Uploading documents to search index: {index_name}...")
            
            search_client = SearchClient(
                endpoint=self.search_endpoint,
                index_name=index_name,
                credential=self.search_credential
            )
            
            search_documents = []
            successful_uploads = 0
            
            for i, doc in enumerate(tqdm(documents, desc="Generating embeddings and uploading")):
                if not doc["success"]:
                    continue
                
                try:
                    # Generate embeddings for the text content
                    embeddings = self.generate_embeddings(doc["text"])
                    
                    if not embeddings:
                        print(f"⚠️ Failed to generate embeddings for {doc['metadata']['file_name']}")
                        continue
                    
                    # Prepare document for search index
                    search_doc = {
                        "id": f"{doc['metadata']['file_name']}_{i}",
                        "title": doc['metadata']['file_name'],
                        "content": doc["text"],
                        "file_name": doc['metadata']['file_name'],
                        "container": doc['metadata']['container'],
                        "file_type": doc['metadata']['file_type'],
                        "processing_date": doc['metadata']['processing_date'],
                        "content_vector": embeddings
                    }
                    
                    # Add image-specific fields for claims
                    if index_name == self.claims_index_name and 'image_format' in doc['metadata']:
                        search_doc["image_format"] = doc['metadata']['image_format']
                    
                    search_documents.append(search_doc)
                    successful_uploads += 1
                    
                except Exception as e:
                    print(f"❌ Error processing document {doc['metadata']['file_name']}: {e}")
                    continue
            
            if search_documents:
                # Upload documents in batches
                batch_size = 10
                for i in range(0, len(search_documents), batch_size):
                    batch = search_documents[i:i + batch_size]
                    result = search_client.upload_documents(documents=batch)
                    print(f"📤 Uploaded batch {i//batch_size + 1}: {len(batch)} documents")
                
                print(f"✅ Successfully uploaded {successful_uploads} documents to search index")
            else:
                print("❌ No documents to upload")
                
        except Exception as e:
            print(f"❌ Error uploading documents to search: {e}")

# Initialize vector search manager
vector_search_manager = None
if ai_client:
    try:
        vector_search_manager = VectorSearchManager(ai_client)
        print("✅ Vector search manager initialized!")
    except Exception as e:
        print(f"❌ Cannot initialize vector search manager: {e}")
else:
    print("❌ Cannot initialize vector search manager - missing AI client")

✅ Vector search manager initialized!


## 6. Execute Processing Pipeline
Process images and text documents, then vectorize into Azure AI Search

In [None]:
# Execute the complete processing pipeline
print("🚀 Starting Complete Document Processing Pipeline...")
print("=" * 70)

# Step 1: Process claim images with GPT-4o
print("\n" + "="*50)
print("STEP 1: Processing Claim Images")
print("="*50)

claim_results = []
if image_processor:
    claim_results = image_processor.process_claim_images()
else:
    print("❌ Image processor not available")

# Step 2: Process policy text documents
print("\n" + "="*50)
print("STEP 2: Processing Policy Text Documents")
print("="*50)

policy_results = []
if text_processor:
    policy_results = text_processor.process_policy_documents()
else:
    print("❌ Text processor not available")

# Step 3: Create search indexes and upload documents
print("\n" + "="*50)
print("STEP 3: Creating Search Indexes and Vectorizing")
print("="*50)

if vector_search_manager:
    # Create indexes
    policies_index_created = vector_search_manager.create_search_index(
        vector_search_manager.policies_index_name, 
        "policies"
    )
    
    claims_index_created = vector_search_manager.create_search_index(
        vector_search_manager.claims_index_name, 
        "claims"
    )
    
    # Upload policy documents if index was created successfully
    if policies_index_created and policy_results:
        print(f"\n📚 Vectorizing {len(policy_results)} policy documents...")
        vector_search_manager.upload_documents_to_search(
            policy_results, 
            vector_search_manager.policies_index_name
        )
    
    # Upload claim documents if index was created successfully
    if claims_index_created and claim_results:
        print(f"\n🖼️ Vectorizing {len(claim_results)} claim documents...")
        vector_search_manager.upload_documents_to_search(
            claim_results, 
            vector_search_manager.claims_index_name
        )
else:
    print("❌ Vector search manager not available")

print("\n" + "="*70)
print("🎉 PROCESSING PIPELINE COMPLETED!")
print("="*70)

🚀 Starting Complete Document Processing Pipeline...

STEP 1: Processing Claim Images
🖼️ Processing Claim Images with AI Foundry GPT-4o Vision...


Processing claim images:   0%|          | 0/5 [00:00<?, ?it/s]

🖼️ Processing image: crash1.jpg...


Processing claim images:  40%|████      | 2/5 [00:00<00:01,  2.44it/s]

❌ Error processing image crash1.jpg: (404) Resource not found
Code: 404
Message: Resource not found
🖼️ Processing image: crash2.jpg...
❌ Error processing image crash2.jpg: (404) Resource not found
Code: 404
Message: Resource not found
🖼️ Processing image: crash3.jpg...


Processing claim images:  60%|██████    | 3/5 [00:01<00:00,  3.15it/s]

❌ Error processing image crash3.jpg: (404) Resource not found
Code: 404
Message: Resource not found
🖼️ Processing image: crash4.jpeg...


Processing claim images: 100%|██████████| 5/5 [00:01<00:00,  3.11it/s]

❌ Error processing image crash4.jpeg: (404) Resource not found
Code: 404
Message: Resource not found
🖼️ Processing image: crash5.jpg...
❌ Error processing image crash5.jpg: (404) Resource not found
Code: 404
Message: Resource not found

✅ Processed 5 claim images

STEP 2: Processing Policy Text Documents
📄 Processing Policy Documents as Text...



Processing policy documents:  40%|████      | 2/5 [00:00<00:00, 14.27it/s]

📄 Processing text document: commercial_auto_policy.md...
📄 Processing text document: comprehensive_auto_policy.md...
📄 Processing text document: high_value_vehicle_policy.md...


Processing policy documents: 100%|██████████| 5/5 [00:00<00:00, 14.22it/s]

📄 Processing text document: liability_only_policy.md...
📄 Processing text document: motorcycle_policy.md...

✅ Processed 5 policy documents

STEP 3: Creating Search Indexes and Vectorizing
🔍 Creating search index: insurance-policies-index...





✅ Search index 'insurance-policies-index' created successfully
🔍 Creating search index: insurance-claims-index...
✅ Search index 'insurance-claims-index' created successfully

📚 Vectorizing 5 policy documents...
📤 Uploading documents to search index: insurance-policies-index...


Generating embeddings and uploading: 100%|██████████| 5/5 [00:00<00:00, 1173.10it/s]


❌ Error generating embeddings: cannot import name 'EmbeddingsInput' from 'azure.ai.inference.models' (c:\Users\martasantos\OneDrive - Microsoft\FY26\agentic-ai-hack\.venv\Lib\site-packages\azure\ai\inference\models\__init__.py)
⚠️ Failed to generate embeddings for commercial_auto_policy.md
❌ Error generating embeddings: cannot import name 'EmbeddingsInput' from 'azure.ai.inference.models' (c:\Users\martasantos\OneDrive - Microsoft\FY26\agentic-ai-hack\.venv\Lib\site-packages\azure\ai\inference\models\__init__.py)
⚠️ Failed to generate embeddings for comprehensive_auto_policy.md
❌ Error generating embeddings: cannot import name 'EmbeddingsInput' from 'azure.ai.inference.models' (c:\Users\martasantos\OneDrive - Microsoft\FY26\agentic-ai-hack\.venv\Lib\site-packages\azure\ai\inference\models\__init__.py)
⚠️ Failed to generate embeddings for high_value_vehicle_policy.md
❌ Error generating embeddings: cannot import name 'EmbeddingsInput' from 'azure.ai.inference.models' (c:\Users\martasanto

Generating embeddings and uploading: 100%|██████████| 5/5 [00:00<00:00, 28966.19it/s]

❌ No documents to upload

🎉 PROCESSING PIPELINE COMPLETED!





In [None]:
# Save processing results
def save_processing_results():
    """Save all processing results to files"""
    try:
        # Combine results
        all_results = {
            "policies": policy_results,
            "claims": claim_results,
            "processing_summary": {
                "policies_processed": len(policy_results),
                "policies_successful": sum(1 for r in policy_results if r["success"]),
                "claims_processed": len(claim_results),
                "claims_successful": sum(1 for r in claim_results if r["success"]),
                "processing_date": pd.Timestamp.now().isoformat(),
                "text_model": Config.CHAT_MODEL_DEPLOYMENT,
                "embedding_model": Config.EMBEDDING_MODEL_DEPLOYMENT
            }
        }
        
        # Save locally
        output_file = "complete_processing_results.json"
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(all_results, f, indent=2, ensure_ascii=False)
        
        print(f"💾 Results saved locally: {output_file}")
        
        # Upload to blob storage
        if uploader:
            success = uploader.upload_file(
                Path(output_file), 
                Config.PROCESSED_CONTAINER, 
                output_file
            )
            
            if success:
                print(f"☁️ Results uploaded to blob storage: {Config.PROCESSED_CONTAINER}/{output_file}")
        
        return all_results
        
    except Exception as e:
        print(f"❌ Error saving results: {e}")
        return None

# Save the results
final_results = save_processing_results()

# Print summary
if final_results:
    summary = final_results["processing_summary"]
    print(f"\n📊 FINAL PROCESSING SUMMARY:")
    print(f"   📄 Policy Documents: {summary['policies_successful']}/{summary['policies_processed']} successful")
    print(f"   🖼️ Claim Images: {summary['claims_successful']}/{summary['claims_processed']} successful")
    print(f"   🤖 Text Model: {summary['text_model']}")
    print(f"   🔍 Embedding Model: {summary['embedding_model']}")

💾 Results saved locally: complete_processing_results.json
✅ Uploaded: complete_processing_results.json → processed-documents/complete_processing_results.json
☁️ Results uploaded to blob storage: processed-documents/complete_processing_results.json

📊 FINAL PROCESSING SUMMARY:
   📄 Policy Documents: 5/5 successful
   🖼️ Claim Images: 0/5 successful
   🤖 Text Model: gpt-4o-mini
   🔍 Embedding Model: text-embedding-ada-002


## 7. Test Search Functionality
Test the vectorized search capabilities

In [None]:
def test_search_functionality():
    """Test the search functionality with sample queries"""
    if not vector_search_manager:
        print("❌ Vector search manager not available")
        return
    
    print("🔍 Testing Search Functionality...")
    print("=" * 50)
    
    try:
        from azure.search.documents import SearchClient
        
        # Test policy search
        policies_search_client = SearchClient(
            endpoint=vector_search_manager.search_endpoint,
            index_name=vector_search_manager.policies_index_name,
            credential=vector_search_manager.search_credential
        )
        
        # Test claims search
        claims_search_client = SearchClient(
            endpoint=vector_search_manager.search_endpoint,
            index_name=vector_search_manager.claims_index_name,
            credential=vector_search_manager.search_credential
        )
        
        # Sample queries
        test_queries = [
            "motorcycle insurance coverage",
            "liability policy details",
            "comprehensive coverage benefits"
        ]
        
        for query in test_queries:
            print(f"\n🔎 Testing query: '{query}'")
            
            # Generate embedding for the query
            query_embeddings = vector_search_manager.generate_embeddings(query)
            
            if query_embeddings:
                # Search policies
                try:
                    vector_query = VectorizedQuery(vector=query_embeddings, k_nearest_neighbors=3, fields="content_vector")
                    policy_results = policies_search_client.search(
                        search_text="",
                        vector_queries=[vector_query],
                        select=["title", "content", "file_name"],
                        top=3
                    )
                    
                    print(f"   📄 Policy Results:")
                    for result in policy_results:
                        print(f"     • {result['file_name']}: {result['content'][:100]}...")
                        
                except Exception as e:
                    print(f"   ❌ Policy search failed: {e}")
                
                # Search claims
                try:
                    vector_query = VectorizedQuery(vector=query_embeddings, k_nearest_neighbors=3, fields="content_vector")
                    claim_results = claims_search_client.search(
                        search_text="",
                        vector_queries=[vector_query],
                        select=["title", "content", "file_name"],
                        top=3
                    )
                    
                    print(f"   🖼️ Claim Results:")
                    for result in claim_results:
                        print(f"     • {result['file_name']}: {result['content'][:100]}...")
                        
                except Exception as e:
                    print(f"   ❌ Claim search failed: {e}")
            else:
                print(f"   ❌ Failed to generate embeddings for query")
        
        print(f"\n✅ Search functionality test completed!")
        
    except Exception as e:
        print(f"❌ Error testing search functionality: {e}")

# Run the test
test_search_functionality()

🔍 Testing Search Functionality...

🔎 Testing query: 'motorcycle insurance coverage'
❌ Error generating embeddings: cannot import name 'EmbeddingsInput' from 'azure.ai.inference.models' (c:\Users\martasantos\OneDrive - Microsoft\FY26\agentic-ai-hack\.venv\Lib\site-packages\azure\ai\inference\models\__init__.py)
   ❌ Failed to generate embeddings for query

🔎 Testing query: 'liability policy details'
❌ Error generating embeddings: cannot import name 'EmbeddingsInput' from 'azure.ai.inference.models' (c:\Users\martasantos\OneDrive - Microsoft\FY26\agentic-ai-hack\.venv\Lib\site-packages\azure\ai\inference\models\__init__.py)
   ❌ Failed to generate embeddings for query

🔎 Testing query: 'comprehensive coverage benefits'
❌ Error generating embeddings: cannot import name 'EmbeddingsInput' from 'azure.ai.inference.models' (c:\Users\martasantos\OneDrive - Microsoft\FY26\agentic-ai-hack\.venv\Lib\site-packages\azure\ai\inference\models\__init__.py)
   ❌ Failed to generate embeddings for query
