In [1]:
# Install the packages listed in requirements.txt (the %pip magic targets the running kernel's environment).
%pip install -r requirements.txt

Collecting azure-storage-blob (from -r requirements.txt (line 3))
  Using cached azure_storage_blob-12.27.1-py3-none-any.whl.metadata (26 kB)
Collecting isodate>=0.6.1 (from azure-storage-blob->-r requirements.txt (line 3))
  Using cached isodate-0.7.2-py3-none-any.whl.metadata (11 kB)
Using cached azure_storage_blob-12.27.1-py3-none-any.whl (428 kB)
Using cached isodate-0.7.2-py3-none-any.whl (22 kB)
Installing collected packages: isodate, azure-storage-blob

   ---------------------------------------- 0/2 [isodate]
   ---------------------------------------- 0/2 [isodate]
   -------------------- ------------------- 1/2 [azure-storage-blob]
   -------------------- ------------------- 1/2 [azure-storage-blob]
   -------------------- ------------------- 1/2 [azure-storage-blob]
   -------------------- ------------------- 1/2 [azure-storage-blob]
   -------------------- ------------------- 1/2 [azure-storage-blob]
   -------------------- ------------------- 1/2 [azure-storage-blob]
   --

In [None]:
import logging
import json
import os
import sys
import uuid
from pathlib import Path
from dotenv import find_dotenv, load_dotenv
from azure.identity import DefaultAzureCredential, get_bearer_token_provider

# Load settings from a .env file (when one is found) and turn on INFO-level logging.
load_dotenv(find_dotenv())
logging.basicConfig(level=logging.INFO)

# For authentication, you can use either token-based authentication or a subscription key; only one method is required.
AZURE_AI_ENDPOINT = os.getenv("CONTENT_UNDERSTANDING_ENDPOINT", "").strip()
AZURE_AI_API_KEY = os.getenv("CONTENT_UNDERSTANDING_API_KEY", "").strip()
# The `or` fallback also covers a variable that is set but blank, so this
# constant is never an empty string and always matches what the client uses.
AZURE_AI_API_VERSION = (
    os.getenv("CONTENT_UNDERSTANDING_API_VERSION", "").strip() or "2025-05-01-preview"
)

if not AZURE_AI_ENDPOINT:
    raise ValueError("Set CONTENT_UNDERSTANDING_ENDPOINT in your environment or .env file.")

# Add the parent directory to the path to use shared modules.
# Guarded so that re-running this cell does not pile up duplicate entries.
parent_dir = Path.cwd().parent
if str(parent_dir) not in sys.path:
    sys.path.append(str(parent_dir))
from python.content_understanding_client import AzureContentUnderstandingClient

client_kwargs = {
    "endpoint": AZURE_AI_ENDPOINT,
    "api_version": AZURE_AI_API_VERSION,
    "x_ms_useragent": "azure-ai-content-understanding-python/content_extraction",
}

# Prefer the subscription key when one is configured; otherwise fall back to
# token-based authentication through DefaultAzureCredential.
if AZURE_AI_API_KEY:
    client_kwargs["subscription_key"] = AZURE_AI_API_KEY
else:
    credential = DefaultAzureCredential()
    token_provider = get_bearer_token_provider(credential, "https://cognitiveservices.azure.com/.default")
    client_kwargs["token_provider"] = token_provider

client = AzureContentUnderstandingClient(**client_kwargs)

# Utility function to save images
from PIL import Image
from io import BytesIO
import re

def save_image(image_id: str, response) -> None:
    """Fetch one image from an analyze operation and save it as a JPEG.

    The image bytes are requested from the module-level ``client`` and the
    decoded image is written to ``.cache/<image_id>.jpg`` (the ``.cache``
    directory is created when it does not exist yet).

    Args:
        image_id: Identifier of the image to fetch; also used as the stem of
            the output file name.
        response: Analyze response, passed through unchanged to
            ``client.get_image_from_analyze_operation``.

    Raises:
        Any exception raised by the client call, by ``Image.open`` when the
        returned bytes cannot be decoded, or by ``Image.save``.
    """
    raw_image = client.get_image_from_analyze_operation(
        analyze_response=response,
        image_id=image_id,
    )
    image = Image.open(BytesIO(raw_image))
    # To display the image, uncomment the following line:
    # image.show()

    # The JPEG writer only accepts these modes; anything else (for example
    # RGBA or palette images) would raise OSError on save, so convert first.
    jpeg_writable_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
    if image.mode not in jpeg_writable_modes:
        image = image.convert("RGB")

    cache_dir = Path(".cache")
    cache_dir.mkdir(exist_ok=True)
    image.save(cache_dir / f"{image_id}.jpg", "JPEG")


INFO:azure.identity._credentials.environment:No environment configuration found.
INFO:azure.identity._credentials.managed_identity:ManagedIdentityCredential will use IMDS
INFO:azure.core.pipeline.policies.http_logging_policy:Request URL: 'http://169.254.169.254/metadata/identity/oauth2/token?api-version=REDACTED&resource=REDACTED'
Request method: 'GET'
Request headers:
    'User-Agent': 'azsdk-python-identity/1.25.1 Python/3.12.10 (Windows-11-10.0.26200-SP0)'
No body was attached to the request
INFO:azure.identity._credentials.chained:DefaultAzureCredential acquired a token from AzureCliCredential


In [3]:
# Analyzer and input file used for the invoice extraction example.
INVOICE_ANALYZER_ID = "prebuilt-invoice"
INVOICE_IMAGE_PATH = "../data/invoice_sample.jpg"  # Replace with your uploaded invoice image

# Fail fast with a clear message before submitting the analysis request.
invoice_image_file = Path(INVOICE_IMAGE_PATH)
if not invoice_image_file.exists():
    raise FileNotFoundError(f"Invoice image not found: {invoice_image_file}")

# Submit the file to the analyzer, then poll until the result is ready.
invoice_response = client.begin_analyze(
    INVOICE_ANALYZER_ID,
    file_location=str(invoice_image_file),
)
invoice_result = client.poll_result(invoice_response)

# Show the complete response payload as indented JSON.
print(json.dumps(invoice_result, indent=2))

INFO:python.content_understanding_client:Analyzing file ..\data\invoice_sample.jpg with analyzer: prebuilt-invoice
INFO:python.content_understanding_client:Request 7ca9d092-a423-48c1-8f0a-38746fd6a667 in progress ...
INFO:python.content_understanding_client:Request 7ca9d092-a423-48c1-8f0a-38746fd6a667 in progress ...
INFO:python.content_understanding_client:Request 7ca9d092-a423-48c1-8f0a-38746fd6a667 in progress ...
INFO:python.content_understanding_client:Request result is ready after 9.24 seconds.


{
  "id": "7ca9d092-a423-48c1-8f0a-38746fd6a667",
  "status": "Succeeded",
  "result": {
    "analyzerId": "prebuilt-invoice",
    "apiVersion": "2025-05-01-preview",
    "createdAt": "2025-12-04T21:40:41Z",
    "contents": [
      {
        "markdown": "Contoso, Ltd.\n\n\n# Project Statement\n\nDate: 8/25/2020\n\nReceipt # No .: 3579\n\nSold To Bellows College\n892 Hemlock Street\nSpruce City, NY, 84112\nID#: 1234567\n\nLive Delivery:\n\u2610\n\nOnline Delivery (Livestream):\n\u2612\n\nVideo Delivery (On Demand):\n\n\u2610\n\n\n<table>\n<tr>\n<th>Training Date</th>\n<th>Description</th>\n<th>Price</th>\n<th>Discount</th>\n<th>Line Total</th>\n</tr>\n<tr>\n<td>10/12/2020</td>\n<td>Leadership Training</td>\n<td>$750</td>\n<td>10%</td>\n<td>$675</td>\n</tr>\n<tr>\n<td>10/13/2020</td>\n<td>Leadership Training</td>\n<td>$750</td>\n<td>10%</td>\n<td>$675</td>\n</tr>\n<tr>\n<td>10/14/2020</td>\n<td>Leadership Training</td>\n<td>$750</td>\n<td>10%</td>\n<td rowspan=\"2\">$675</td>\n</tr>\n<tr

In [4]:
# Pull the extracted fields of the first content item out of the analysis result.
# The `or` fallbacks also cover keys that are present but hold null, which a
# plain `.get(key, default)` would pass through and then fail on.
invoice_contents = (invoice_result.get("result") or {}).get("contents") or []
first_invoice = invoice_contents[0] if invoice_contents else {}
invoice_fields = first_invoice.get("fields") or {}

# Print the fields when there are any; otherwise say so explicitly instead of
# printing an empty JSON object.
if invoice_fields:
    print(json.dumps(invoice_fields, indent=2))
else:
    print("No invoice fields returned in the analyzer response.")

{
  "CustomerName": {
    "type": "string",
    "valueString": "Bellows College",
    "spans": [
      {
        "offset": 84,
        "length": 15
      }
    ],
    "confidence": 0.534,
    "source": "D(1,311.0198,233.8308,439.3728,236.0063,438.9971,258.1694,310.6442,255.9940)"
  },
  "InvoiceDate": {
    "type": "date",
    "valueDate": "2020-08-25",
    "spans": [
      {
        "offset": 43,
        "length": 9
      }
    ],
    "confidence": 0.883,
    "source": "D(1,1107.0000,157.0000,1191.0000,157.0000,1191.0000,178.0000,1107.0000,178.0000)"
  },
  "CustomerId": {
    "type": "string",
    "valueString": "1234567",
    "spans": [
      {
        "offset": 147,
        "length": 7
      }
    ],
    "confidence": 0.594,
    "source": "D(1,348.0000,362.0000,414.0000,362.0000,414.0000,380.0000,348.0000,380.0000)"
  },
  "PurchaseOrder": {
    "type": "string",
    "confidence": 0.741
  },
  "InvoiceId": {
    "type": "string",
    "valueString": "3579",
    "spans": [
      {
  