In [1]:
# %pip install azure-ai-documentintelligence

# Azure AI Document Intelligence - Layout Model
Sample code using the `2024-11-30` API

## Environment Setup

In [38]:
import os
import json

from dotenv import load_dotenv
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest, AnalyzeResult, AnalyzeOutputOption, DocumentAnalysisFeature, DocumentContentFormat

In [39]:
# Load settings from a local .env file. override=True makes values from .env
# take precedence over variables already present in the process environment.
load_dotenv(override=True)

DOC_AI_ENDPOINT   = os.getenv("DOC_AI_ENDPOINT")
DOC_AI_API_KEY    = os.getenv("DOC_AI_API_KEY")

# Fail fast with an actionable message rather than handing None (or an empty
# string) to the SDK client and credential constructors below.
missing_settings = [
  name
  for name, value in (
    ("DOC_AI_ENDPOINT", DOC_AI_ENDPOINT),
    ("DOC_AI_API_KEY", DOC_AI_API_KEY),
  )
  if not value
]
if missing_settings:
  raise EnvironmentError(
    "Missing required environment variable(s): " + ", ".join(missing_settings)
  )

# Client shared by every later cell that talks to the Document Intelligence service.
doc_client: DocumentIntelligenceClient = DocumentIntelligenceClient(
  DOC_AI_ENDPOINT,
  AzureKeyCredential(DOC_AI_API_KEY),
)

## Extract Content

In [40]:
# Load file from local path

file_path = "./data/sample-text-lorem-ipsum.pdf"

# Read the whole document first so the file handle is closed before the
# remote analysis (a blocking, long-running call) starts.
with open(file_path, "rb") as f:
  document_bytes = f.read()

# Submit the document to the prebuilt layout model, asking for Markdown
# content and for figure images to be generated as additional output.
poller = doc_client.begin_analyze_document(
  "prebuilt-layout",
  AnalyzeDocumentRequest(bytes_source=document_bytes),
  output_content_format=DocumentContentFormat.MARKDOWN,
  output=[AnalyzeOutputOption.FIGURES],
)

# Block until the operation completes. `poller` stays in scope because the
# figure-extraction cell reads the operation id from `poller.details`.
result: AnalyzeResult = poller.result()

In [41]:
# Save extracted content as Markdown file
# Create the output directory first: open(..., "w") does not create missing
# parent directories and would raise FileNotFoundError on a fresh checkout.
os.makedirs("./output", exist_ok=True)
with open("./output/result.md", "w", encoding="utf-8") as f:
  f.write(result.content)

In [43]:
from IPython.display import Markdown, display

# Render the extracted Markdown as rich output instead of raw text.
rendered_markdown = Markdown(result.content)
display(rendered_markdown)

# Lorem Ipsum Dolor Sit Amet


## 1. Introduction

Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla facilisi. Donec consectetur,
metus a feugiat aliquam, quam justo tincidunt est, vel molestie risus lacus in orci.


### Key Points

☒
Nulla facilisi et malesuada fames ac ante ipsum primis in faucibus.

☐
Donec elementum massa sit amet augue suscipit, et molestie leo tristique.

☐
Fusce id justo vel lorem pharetra accumsan non et nisi.


## 2. Data Table

Below is a sample table with Lorem Ipsum data:


<table>
<tr>
<th>Category</th>
<th>Description</th>
<th>Status</th>
</tr>
<tr>
<td>Alpha</td>
<td>Lorem ipsum dolor sit amet</td>
<td>☒ Completed</td>
</tr>
<tr>
<td>Beta</td>
<td>Consectetur adipiscing elit</td>
<td>☒ In Progress</td>
</tr>
<tr>
<td>Gamma</td>
<td>Sed do eiusmod tempor incididunt</td>
<td>☒ Not Started</td>
</tr>
<tr>
<td>Delta</td>
<td>Ut enim ad minim veniam</td>
<td>☒ Completed</td>
</tr>
</table>


<!-- PageBreak -->


## 3. Checklist for Success

Phase 1 - Planning

1\.
Define project scope

2\.
Identify stakeholders

3\.
Set timelines

Phase 2 - Development

1\.
Write initial drafts

2\.
Review and revise

3\.
Validate requirements

Phase 3 - Finalization

1\.
Conduct final testing

2\.
Deploy project

3\.
Gather feedback


## 4. Conclusion

Praesent tristique fermentum massa vel congue. Suspendisse potenti. Integer eu mi nec nisl
feugiat luctus at non mi. Quisque dapibus mi nec malesuada facilisis.

<!-- PageBreak -->


<figure>

90

20

10

0.4

0.0

30

::

00

1

J

₡

.

A

6

.

5

0

10

1

30

</figure>

## Extract Images from file

In [None]:
# Download every figure image produced by the analysis and collect
# JSON-serialisable metadata about each one in `figures_list`.
figures_list = []

# Make sure the target directory exists before any figure is written.
figures_dir = "./output/figures"
os.makedirs(figures_dir, exist_ok=True)

# Nothing to do when `result.figures` is None or empty.
for figure in result.figures or []:
  # Figures without an id cannot be requested from the service.
  if not figure.id:
    continue

  figure_path = f"{figures_dir}/{figure.id}.png"

  # Extract the image data from the file
  image_data = doc_client.get_analyze_result_figure(
    model_id=result.model_id,
    result_id=poller.details["operation_id"],
    figure_id=figure.id,
  )

  # Save as PNG (writelines consumes the iterable returned by the SDK)
  with open(figure_path, "wb") as writer:
    writer.writelines(image_data)

  figures_list.append(
    {
      "figure_id": figure.id,
      "figure_path": figure_path,
      # Keep the caption text rather than the SDK caption object so the
      # list can be passed to json.dumps even when a caption is present.
      "figure_caption": figure.caption.content if figure.caption else None,
      # Guard against a figure that reports no spans.
      "span": figure.spans[0].as_dict() if figure.spans else None,
    }
  )

In [51]:
# Serialize the collected figure metadata as a JSON string with 2-space indentation.
figures_list_json = json.dumps(figures_list, indent=2)

In [53]:
# Show the figure metadata JSON.
print(figures_list_json)

[
  {
    "figure_id": "3.1",
    "figure_path": "./output/figures/3.1.png",
    "figure_caption": null,
    "span": {
      "offset": 1480,
      "length": 91
    }
  }
]


## Extract Checkboxes

In [56]:
# One record per selection mark (checkbox) across all pages: the page it was
# found on, its state, and its span as a plain dict. Pages without selection
# marks contribute nothing.
checkbox_list = [
  {
    "page_number": page.page_number,
    "state": mark.state.value,
    "span": mark.span.as_dict(),
  }
  for page in result.pages
  for mark in (page.selection_marks or [])
]

In [57]:
# Serialize the collected selection-mark records as a JSON string with 2-space indentation.
checkbox_list_json = json.dumps(checkbox_list, indent=2)

In [58]:
# Show the selection-mark JSON.
print(checkbox_list_json)

[
  {
    "page_number": 1,
    "state": "selected",
    "span": {
      "offset": 246,
      "length": 1
    }
  },
  {
    "page_number": 1,
    "state": "unselected",
    "span": {
      "offset": 317,
      "length": 1
    }
  },
  {
    "page_number": 1,
    "state": "unselected",
    "span": {
      "offset": 394,
      "length": 1
    }
  },
  {
    "page_number": 1,
    "state": "selected",
    "span": {
      "offset": 655,
      "length": 1
    }
  },
  {
    "page_number": 1,
    "state": "selected",
    "span": {
      "offset": 738,
      "length": 1
    }
  },
  {
    "page_number": 1,
    "state": "selected",
    "span": {
      "offset": 829,
      "length": 1
    }
  },
  {
    "page_number": 1,
    "state": "selected",
    "span": {
      "offset": 911,
      "length": 1
    }
  }
]
