In [9]:
import os
from dotenv import find_dotenv, load_dotenv
from azure.storage.blob import BlobServiceClient, generate_blob_sas, BlobSasPermissions
from datetime import datetime, timedelta
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeResult, AnalyzeDocumentRequest
import json
from azure.ai.formrecognizer import DocumentAnalysisClient as OldDocumentIntelligenceClient, AnalyzeResult as OldAnalyzeResult
import httpx

In [10]:
from azure.core.exceptions import ResourceExistsError
data_folder = "data2"
container_name = "data"
connection_string = os.getenv("STORAGE_CONNECTION_STRING")

# Ensure the connection string, data folder, and container name are not None
if connection_string is None:
    raise ValueError("The connection string environment variable is not set.")
if data_folder is None:
    raise ValueError("The data folder environment variable is not set.")
if container_name is None:
    raise ValueError("The container name environment variable is not set.")

# Ensure the data folder exists
if not os.path.isdir(data_folder):
    raise FileNotFoundError(f"The specified data folder does not exist: {data_folder}")

# Create a BlobServiceClient
blob_service_client = BlobServiceClient.from_connection_string(connection_string)

# Check if the container exists, and create it if it does not
container_client = blob_service_client.get_container_client(container_name)
try:
    container_client.create_container()
    print(f"Container '{container_name}' created.")
except ResourceExistsError:
    print(f"Container '{container_name}' already exists.")

# Upload files in the data folder and its subdirectories to the blob container
for root, dirs, files in os.walk(data_folder):
    for filename in files:
        file_path = os.path.join(root, filename)
        if os.path.isfile(file_path):
            # Create a blob path that maintains the directory structure
            blob_path = os.path.relpath(file_path, data_folder).replace("\\", "/")
            blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob_path)
            with open(file_path, "rb") as data:
                blob_client.upload_blob(data, overwrite=True)
            print(f"Uploaded {blob_path} to blob storage.")

Container 'data' already exists.
Uploaded houseloan/houseloan.pdf to blob storage.


In [11]:
def generate_sas_url(blob_service_client, container_name, blob_name, expiry_hours=1):
    """
    Generate a SAS URL for a blob in Azure Blob Storage.

    :param blob_service_client: BlobServiceClient instance
    :param container_name: Name of the container
    :param blob_name: Name of the blob
    :param expiry_hours: Expiry time in hours for the SAS token
    :return: SAS URL for the blob
    """
    sas_token = generate_blob_sas(
        account_name=blob_service_client.account_name,
        container_name=container_name,
        blob_name=blob_name,
        account_key=blob_service_client.credential.account_key,
        permission=BlobSasPermissions(read=True),
        expiry=datetime.utcnow() + timedelta(hours=expiry_hours)
    )

    sas_url = f"https://{blob_service_client.account_name}.blob.core.windows.net/{container_name}/{blob_name}?{sas_token}"
    return sas_url

In [12]:
def get_words(page, line):
    result = []
    for word in page.words:
        if _in_span(word, line.spans):
            result.append(word)
    return result

def _in_span(word, spans):
    for span in spans:
        if word.span.offset >= span.offset and (word.span.offset + word.span.length) <= (span.offset + span.length):
            return True
    return False

In [13]:
load_dotenv()

endpoint = os.getenv("DOC_AI_ENDPOINT")
api_key = os.getenv("DOC_AI_KEY")
   
if not endpoint or not isinstance(endpoint, str):
    raise ValueError("The DOCUMENTINTELLIGENCE_ENDPOINT environment variable is not set or is not a string.")
if not api_key or not isinstance(api_key, str):
    raise ValueError("The DOCUMENTINTELLIGENCE_API_KEY environment variable is not set or is not a string.")

def analyze_layout(sas_url):
    document_intelligence_client = OldDocumentIntelligenceClient(
        endpoint=endpoint, credential=AzureKeyCredential(api_key)
    )

    poller = document_intelligence_client.begin_analyze_document(
        "prebuilt-layout", httpx.Client().get(sas_url).read()
    )

    result: OldAnalyzeResult = poller.result()

    analysis_result = {
        "handwritten": any([style.is_handwritten for style in result.styles]) if result.styles else False,
        "pages": [],
        "tables": []
    }

    for page in result.pages:
        page_info = {
            "page_number": page.page_number,
            "width": page.width,
            "height": page.height,
            "unit": page.unit,
            "lines": [],
            "selection_marks": []
        }

        if page.lines:
            for line in page.lines:
                line_info = {
                    "text": line.content,
                    "polygon": line.polygon,
                    "words": [{"content": word.content, "confidence": word.confidence} for word in get_words(page, line)]
                }
                page_info["lines"].append(line_info)

        if page.selection_marks:
            for selection_mark in page.selection_marks:
                selection_mark_info = {
                    "state": selection_mark.state,
                    "polygon": selection_mark.polygon,
                    "confidence": selection_mark.confidence
                }
                page_info["selection_marks"].append(selection_mark_info)

        analysis_result["pages"].append(page_info)

    if result.tables:
        for table in result.tables:
            table_info = {
                "row_count": table.row_count,
                "column_count": table.column_count,
                "bounding_regions": [{"page_number": region.page_number, "polygon": region.polygon} for region in table.bounding_regions] if table.bounding_regions else [],
                "cells": [{"row_index": cell.row_index, "column_index": cell.column_index, "content": cell.content, "bounding_regions": [{"page_number": region.page_number, "polygon": region.polygon} for region in cell.bounding_regions] if cell.bounding_regions else []} for cell in table.cells]
            }
            analysis_result["tables"].append(table_info)

    return analysis_result

In [14]:
def save_analysis_results(blob_service_client, container_name, blob_name, analysis_results):
    if analysis_results is None:
        print(f"No analysis results for {blob_name}. Skipping save.")
        return

    # Define the name for the results file
    results_blob_name = f"{blob_name}_results.json"

    # Convert the analysis results to JSON
    results_json = json.dumps(analysis_results, indent=2)

    # Upload the results to the blob
    blob_client = blob_service_client.get_blob_client(container=container_name, blob=results_blob_name)
    blob_client.upload_blob(results_json, overwrite=True)

    print(f"Saved analysis results to {results_blob_name}")

In [15]:
load_dotenv()

if __name__ == "__main__":
    # Retrieve the connection string and container name from the environment variables
    connection_string = os.getenv('STORAGE_CONNECTION_STRING')
    container_name = "data"

    # Ensure the connection string is not None
    if connection_string is None:
        raise ValueError("The connection string environment variable is not set.")

    # Create a BlobServiceClient
    blob_service_client = BlobServiceClient.from_connection_string(connection_string)

    # List all blobs in the container
    blob_list = blob_service_client.get_container_client(container_name).list_blobs()

    # Iterate over each blob
    for blob in blob_list:
        blob_name = blob.name
        print(f"Processing blob: {blob_name}")

        # Ensure the file format is supported
        supported_formats = ['.pdf', '.jpeg', '.jpg', '.png', '.tiff']
        if not any(blob_name.lower().endswith(ext) for ext in supported_formats):
            print(f"Skipping unsupported file format: {blob_name}")
            continue

        # Generate the SAS URL
        sas_url = generate_sas_url(blob_service_client, container_name, blob_name)
        print(f"Generated SAS URL: {sas_url}")

        # Call the analyze_layout function with the SAS URL
        analysis_results = analyze_layout(sas_url)

        # Save the analysis results
        save_analysis_results(blob_service_client, container_name, blob_name, analysis_results)

Processing blob: houseloan/houseloan.pdf
Generated SAS URL: https://hackdocssa7rwdk5tmoogam.blob.core.windows.net/data/houseloan/houseloan.pdf?se=2024-12-13T14%3A26%3A09Z&sp=r&sv=2025-01-05&sr=b&sig=XpBIO9rSj4agR6DSMakKQbmtJBxDUip46050tuUHOEY%3D
Saved analysis results to houseloan/houseloan.pdf_results.json
Processing blob: loanagreements/la_janesmith.pdf
Generated SAS URL: https://hackdocssa7rwdk5tmoogam.blob.core.windows.net/data/loanagreements/la_janesmith.pdf?se=2024-12-13T14%3A26%3A15Z&sp=r&sv=2025-01-05&sr=b&sig=X5%2BINhC%2BoIBqSl4RdQ%2BfzLTD852smlVN2yKDNj9tM8k%3D
Saved analysis results to loanagreements/la_janesmith.pdf_results.json
Processing blob: loanagreements/la_janesmith.pdf_results.json
Skipping unsupported file format: loanagreements/la_janesmith.pdf_results.json
Processing blob: loanform/lp_janesmith.pdf
Generated SAS URL: https://hackdocssa7rwdk5tmoogam.blob.core.windows.net/data/loanform/lp_janesmith.pdf?se=2024-12-13T14%3A26%3A21Z&sp=r&sv=2025-01-05&sr=b&sig=OKSo3zN9