In [41]:
import logging
import os
import json
import pandas as pd
from requests import get, post
from dotenv import load_dotenv  
import time
import ast
import re
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import (
        DocumentModelAdministrationClient,
        ClassifierDocumentTypeDetails,
        BlobSource,
        BlobFileListSource,
    )
from azure.ai.documentintelligence.models import (
                AzureBlobFileListContentSource,
                ClassifierDocumentTypeDetails,
                BuildDocumentClassifierRequest,
)
from azure.storage.blob import BlobServiceClient, ContainerSasPermissions, generate_container_sas
from datetime import datetime, timezone, timedelta
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence import DocumentIntelligenceAdministrationClient
from azure.ai.documentintelligence.models import AnalyzeResult
import uuid

In [2]:
# Configure environment variables  
load_dotenv(dotenv_path='./.env')

True

In [26]:
def createContainerSasUrl(container_client):
# [START create_container_sas_url]
    # Define the SAS token permissions
    sas_permissions=ContainerSasPermissions(read=True, list=True)

    # Define the expiry time and start time for the SAS token
    start_time = datetime.now(timezone.utc) - timedelta(minutes=1)
    expiry_time = datetime.now(timezone.utc) + timedelta(minutes=5)

    # Generate the container SAS token
    container_sas_token = generate_container_sas(
        container_client.account_name,
        container_client.container_name,
        account_key=container_client.credential.account_key,
        permission=sas_permissions,
        expiry=expiry_time,
        start=start_time,
    )
    # Create the container sas URL by appending the token to the container url
    container_sas_url = f"{container_client.url}?{container_sas_token}"

    return container_sas_url

In [27]:
def getDocTypes(container_client, container_sas_url):
# [START get_doctypes]
    doc_types = {}
    doc_types_list = []

    blob_list = container_client.walk_blobs()
    for blob in blob_list:
        if blob.name.endswith(".jsonl"):
            doc_type = os.path.splitext(blob.name)[0]
            doc_types_list.append(doc_type)

    for doc_type in doc_types_list:
        doc_types[doc_type] = ClassifierDocumentTypeDetails(
            azure_blob_file_list_source=AzureBlobFileListContentSource(
                container_url=container_sas_url, 
                file_list=f"{doc_type}.jsonl"
            )
        )
    return doc_types

In [44]:
def createDocClient():
# [START create_clients]
    FormRecognizerEndPoint = os.getenv('FormRecognizerEndPoint')
    FormRecognizerKey = os.getenv('FormRecognizerKey')
    connect_str = os.getenv("ClassifierConnectionString")
    container_name = "classifier"

    document_model_admin_client = DocumentIntelligenceAdministrationClient(endpoint=FormRecognizerEndPoint, 
                                                                           credential=AzureKeyCredential(FormRecognizerKey))
    blob_service_client = BlobServiceClient.from_connection_string(connect_str)
    container_client = blob_service_client.get_container_client(container_name)
    
    return document_model_admin_client, container_client

In [35]:
def printClassifierResult(result):
# [START print_classifier_results]
    classifierId = result.classifier_id
    print(f"Classifier ID: {result.classifier_id}")
    print(f"API version used to build the classifier model: {result.api_version}")
    print(f"Classifier description: {result.description}")
    print(f"Document classes used for training the model:")
    for doc_type in result.doc_types.items():
        print(f"Document type: {doc_type}")

In [33]:
def buildClassifier():
# [START build_classifier]
    document_model_admin_client, container_client = createDocClient()
    container_sas_url = createContainerSasUrl(container_client)

    poller = document_model_admin_client.begin_build_classifier(
        BuildDocumentClassifierRequest(
            classifier_id=str(uuid.uuid4()),
            doc_types= getDocTypes(container_client, container_sas_url),
        )
        #description=os.environ["CLASSIFIER_DESCRIPTION"]
    )
    result = poller.result()
    printClassifierResult(result)

In [34]:
buildClassifier()

Classifier ID: 07b4841c-455a-4e39-9c3f-c6460b4f5a61
API version used to build the classifier model: 2024-02-29-preview
Classifier description: None
Document classes used for training the model:
Document type: ('1040A', {'kind': 'azureBlobFileList', 'azureBlobFileListSource': {'containerUrl': 'https://dataaidocumentstor.blob.core.windows.net/classifier', 'fileList': '1040A.jsonl'}})
Document type: ('1040B', {'kind': 'azureBlobFileList', 'azureBlobFileListSource': {'containerUrl': 'https://dataaidocumentstor.blob.core.windows.net/classifier', 'fileList': '1040B.jsonl'}})


In [42]:
def classifyDocument(classifier_id, doc_path):
    docClient = DocumentIntelligenceClient(endpoint=FormRecognizerEndPoint, credential=AzureKeyCredential(FormRecognizerKey))
    with open(doc_path, "rb") as f:
        poller = docClient.begin_classify_document(
            classifier_id, classify_request=f, content_type="application/pdf"
        )
    result: AnalyzeResult = poller.result()

    print("----Classified documents----")
    if result.documents:
        for doc in result.documents:
            if doc.bounding_regions:
                print(
                    f"Found document of type '{doc.doc_type or 'N/A'}' with a confidence of {doc.confidence} contained on "
                    f"the following pages: {[region.page_number for region in doc.bounding_regions]}"
                )
    # [END classify_document]

In [43]:
doc_path = "Data/Classifier/1040A/IRS_1040_1_01.pdf"
print(f"Classifying document {doc_path}...")
request = classifyDocument(classifierId, doc_path)
print(result)

Classifying document Data/Classifier/1040A/IRS_1040_1_01.pdf...


----Classified documents----
Found document of type '1040A' with a confidence of 0.498 contained on the following pages: [1, 2]
DocumentClassifierDetails(classifier_id=13f642b9-a0b2-4375-b6b6-119cf1e0cbd2, description=IRS document classifier, created_on=2024-05-09 15:06:53+00:00, expires_on=2026-05-09 15:06:53+00:00, api_version=2023-07-31, doc_types={'IRS-1040-A': ClassifierDocumentTypeDetails(source_kind=azureBlobFileList, source=BlobFileListSource(container_url=https://dataaidocumentstor.blob.core.windows.net/classifier, file_list=IRS-1040-A.jsonl)), 'IRS-1040-B': ClassifierDocumentTypeDetails(source_kind=azureBlobFileList, source=BlobFileListSource(container_url=https://dataaidocumentstor.blob.core.windows.net/classifier, file_list=IRS-1040-B.jsonl))})
