In [47]:
import os
import re
import json
from pathlib import Path
from dotenv import load_dotenv
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence.models import DocumentAnalysisFeature, AnalyzeResult
from azure.core.exceptions import HttpResponseError

In [48]:
# Configuration: read the service endpoint and key from the environment
# (populated from a local .env file, if present) and build the client.
load_dotenv()

endpoint = os.getenv("DOCUMENT_INTELLIGENCE_ENDPOINT")
key = os.getenv("DOCUMENT_INTELLIGENCE_SUBSCRIPTION_KEY")

# os.getenv returns None for unset variables; fail here with a message that
# names the missing setting instead of letting None reach the SDK.
missing_settings = [
    name
    for name, value in (
        ("DOCUMENT_INTELLIGENCE_ENDPOINT", endpoint),
        ("DOCUMENT_INTELLIGENCE_SUBSCRIPTION_KEY", key),
    )
    if not value
]
if missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(missing_settings)
        + ". Define them in the environment or in a .env file."
    )

client = DocumentIntelligenceClient(
    endpoint=endpoint,
    credential=AzureKeyCredential(key)
)

In [None]:
# Analyze the PDF with the prebuilt read model, requesting the formula and
# font-style add-ons, then print a short summary of what was found.
pdf_file_path = "test_descriptive_text.pdf"

try:
    with open(pdf_file_path, "rb") as pdf_file:
        poller = client.begin_analyze_document(
            model_id="prebuilt-read",
            body=pdf_file,
            features=[DocumentAnalysisFeature.FORMULAS, DocumentAnalysisFeature.STYLE_FONT]
        )
    result = poller.result()
except HttpResponseError as error:
    print(f"Error analyzing document: {error}")
    # Re-raise: later cells read `result`, and continuing would either fail
    # with a NameError or silently reuse a stale `result` from an earlier run.
    raise

# `formulas` and `paragraphs` may be absent/None, so fall back to empty lists
# rather than calling len() on None.
pages = result.pages or []
total_formulas = sum(len(getattr(page, 'formulas', None) or []) for page in pages)

print(f"Number of pages: {len(pages)}")
print(f"Paragraphs: {len(result.paragraphs or [])}")
print(f"Formulas: {total_formulas}")

	Number of pages: 2
	Paragraphs: 14
	Formulas: 18


In [50]:
# Report every detected font style together with the text fragments it covers.
# Style entries without a font style are skipped.
for detected_style in result.styles or []:
    if not detected_style.font_style:
        continue
    print(f"The document contains '{detected_style.font_style}' font style, applied to the following text: ")
    # Each span is an (offset, length) window into the document content.
    styled_fragments = [
        result.content[span.offset : span.offset + span.length]
        for span in detected_style.spans
    ]
    print(",".join(styled_fragments))

# Collect every detected formula, then rebuild the document text by replacing
# each ":formula:" placeholder in result.content with the formula's LaTeX.
FORMULA_PLACEHOLDER = ":formula:"

all_formulas = []
for page in result.pages:
    page_formulas = getattr(page, 'formulas', None) or []
    if not page_formulas:
        continue
    print(f"Page #{page.page_number}: {len(page_formulas)} formula(s) detected")
    for formula in page_formulas:
        formula_value = getattr(formula, 'value', None)
        formula_span = getattr(formula, 'span', None)
        all_formulas.append({
            # A None value would break the text substitution below.
            'value': 'Unknown' if formula_value is None else formula_value,
            'page': page.page_number,
            'kind': getattr(formula, 'kind', 'N/A'),
            # Position of the formula in the content, when a span is provided.
            'offset': getattr(formula_span, 'offset', None),
        })
formula_count = len(all_formulas)

print(result.content)

# Placeholders are filled in text order. Nothing here guarantees that the
# per-page formula lists are already in that order, so order by span offset
# when every formula has one; otherwise keep the collection order.
# NOTE(review): assumes formula span offsets index into result.content the
# same way style span offsets do - confirm against the service docs.
if all(entry['offset'] is not None for entry in all_formulas):
    formulas_in_text_order = sorted(all_formulas, key=lambda entry: entry['offset'])
else:
    formulas_in_text_order = all_formulas

# Single left-to-right pass: inserted LaTeX is never rescanned, and using a
# function as the replacement keeps backslashes in the LaTeX literal.
# Placeholders beyond the last formula are left as they are.
pending_values = iter([entry['value'] for entry in formulas_in_text_order])
reconstructed_content = re.sub(
    re.escape(FORMULA_PLACEHOLDER),
    lambda match: next(pending_values, match.group(0)),
    result.content,
)
print(reconstructed_content)
print(f"\nNumber of total formulas found: {formula_count}")

# Numbered listing of the collected formulas: page, LaTeX source and kind,
# each entry preceded by a blank line.
for position, entry in enumerate(all_formulas, start=1):
    listing = "\n".join([
        "",
        f"{position}. Formula (Page #{entry['page']}):",
        f"\tLaTeX: {entry['value']}",
        f"\tKind: {entry['kind']}",
    ])
    print(listing)

The document contains 'DocumentFontStyle.NORMAL' font style, applied to the following text: 
Taylor Series,Limit of Arctangent,as,Approaches Negative Infinity,Standing-wave function,Relationship between Energy and Principal Quantum Number,Z-transform time domain multiplication (z domain convolution) property,2
Page #1: 8 formula(s) detected
Page #2: 10 formula(s) detected
Taylor Series
:formula: :formula: :formula: :formula:
:formula:
:formula: :formula: :formula:
:formula: :formula: :formula:
Limit of Arctangent :formula: as :formula: Approaches Negative Infinity
:formula:
Standing-wave function
:formula: :formula:
Relationship between Energy and Principal Quantum Number
:formula:
Z-transform time domain multiplication (z domain convolution) property
:formula:
2
Taylor Series
f \left( x \right) = \sum _ { n = 0 } ^ { \infty } \frac { f ^ { \left( n \right) } \left( a \right) } { n ! } \left( x - a \right) ^ { n } x ^ { n } + y ^ { n } = z ^ { n } \frac { \left( x - x _ { 0 } \right) ^