# Document Ingestion

In [None]:
%pip install -r requirements.txt

## Parse the Document

In [7]:
import os
from pathlib import Path
import pickle

from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient

# Load the environment variables
from dotenv import load_dotenv

# override=True lets values from the .env file replace variables that are
# already present in the process environment.
load_dotenv(override=True)

FORM_RECOGNIZER_ENDPOINT = os.getenv("AZURE_FORM_RECOGNIZER_ENDPOINT")
FORM_RECOGNIZER_KEY = os.getenv("AZURE_FORM_RECOGNIZER_KEY")

# Fail fast when either setting is missing or empty. The message names the
# variables exactly as they are read above (with the AZURE_ prefix) so it can
# be acted on directly; the key's value is never echoed, only whether it is set.
if not FORM_RECOGNIZER_ENDPOINT or not FORM_RECOGNIZER_KEY:
    raise ValueError(
        "Please provide AZURE_FORM_RECOGNIZER_ENDPOINT "
        f"({'set' if FORM_RECOGNIZER_ENDPOINT else 'None'}) and "
        "AZURE_FORM_RECOGNIZER_KEY "
        f"({'set' if FORM_RECOGNIZER_KEY else 'None'}) as environment variables."
    )

In [3]:
# Client used to submit documents for analysis, authenticated with the
# endpoint/key pair held in FORM_RECOGNIZER_ENDPOINT / FORM_RECOGNIZER_KEY.
docint_credential = AzureKeyCredential(FORM_RECOGNIZER_KEY)
docint_client = DocumentIntelligenceClient(
    endpoint=FORM_RECOGNIZER_ENDPOINT,
    credential=docint_credential,
)

# Source documents are read from pdfs_dir; pickled analysis results are
# written to pickled_pdfs_dir, which is created here if it does not exist.
pdfs_dir = Path("..") / "public" / "pdfs"
pickled_pdfs_dir = Path("data")
pickled_pdfs_dir.mkdir(exist_ok=True)

### Analyze the documents and pickle the results

In [None]:
from azure.ai.documentintelligence.models import ContentFormat

# Run every document in pdfs_dir through the "prebuilt-layout" model, asking
# for Markdown output, and pickle each result so later cells can reload it
# without calling the service again.
# sorted() makes the processing order deterministic; iterdir() order is not.
for file_name in sorted(pdfs_dir.iterdir()):
    # Skip sub-directories (open() would fail on them) and hidden files such
    # as .DS_Store, which are not documents to analyze.
    if not file_name.is_file() or file_name.name.startswith('.'):
        continue

    print('Analyzing:', file_name.stem)
    with file_name.open('rb') as doc:
        handler = docint_client.begin_analyze_document(
            model_id="prebuilt-layout",
            analyze_request=doc,
            content_type="application/octet-stream",
            output_content_format=ContentFormat.MARKDOWN,
        )

    # Wait for the long-running analysis operation to finish.
    parsed_doc = handler.result()

    # Pickle the value and save it to the file
    output_file_path = pickled_pdfs_dir / f'{file_name.stem}_markdown.pkl'
    with output_file_path.open('wb') as output_file:
        pickle.dump(parsed_doc, output_file)
    print('Done!')

### Load the pickled results

In [None]:
# Load every pickled analysis result found in pickled_pdfs_dir into a dict
# keyed by the pickle's file stem (e.g. "<pdf name>_markdown").
# NOTE(security): pickle.load can execute arbitrary code while loading; only
# load files that were produced by this notebook.

parsed_pdfs = {}
# Restrict to *.pkl so stray files in the directory (.DS_Store, .gitkeep, ...)
# are not handed to pickle.load; sorted() keeps the load order deterministic.
for file_path in sorted(pickled_pdfs_dir.glob('*.pkl')):
    with file_path.open('rb') as file:
        parsed_pdfs[file_path.stem] = pickle.load(file)

print('Parsed pdfs:\n ', "\n  ".join(parsed_pdfs.keys()))

In [2]:
import os

from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    AzureOpenAIVectorizer,
    AzureOpenAIVectorizerParameters,
    HnswAlgorithmConfiguration,
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SearchIndex,
    SemanticConfiguration,
    SemanticField,
    SemanticPrioritizedFields,
    SemanticSearch,
    SimpleField,
    VectorSearch,
    VectorSearchProfile,
)
from openai import AzureOpenAI

# --- Azure AI Search connection -------------------------------------------
# Both variables are required: indexing os.environ raises KeyError when one
# of them is missing.
endpoint = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"]
credential = AzureKeyCredential(os.environ["AZURE_SEARCH_API_KEY"])

# --- Azure OpenAI connection (required) -----------------------------------
azure_openai_endpoint = os.environ["AZURE_OPENAI_API_ENDPOINT"]
azure_openai_key = os.environ["AZURE_OPENAI_API_KEY"]

# --- Embedding settings ---------------------------------------------------
# The optional variables below fall back to the given defaults when unset.
azure_openai_embedding_deployment = os.environ.get(
    "AZURE_OPENAI_EMBEDDING_DEPLOYMENT",
    "text-embedding-3-large",
)
# Vector length requested from the embedding model; the index's vector field
# is declared with the same number of dimensions.
azure_openai_embedding_dimensions = 1536
embedding_model_name = os.environ.get(
    "AZURE_OPENAI_EMBEDDING_MODEL",
    "text-embedding-3-large",
)
azure_openai_api_version = os.environ.get("AZURE_OPENAI_API_VERSION", "2024-10-21")


# Name of the search index created and populated by this notebook.
INDEX_NAME = "index001"

# One client to manage index definitions, one to read/write documents in
# INDEX_NAME, and one to call the Azure OpenAI embeddings API.
index_client = SearchIndexClient(credential=credential, endpoint=endpoint)
search_client = SearchClient(
    credential=credential,
    endpoint=endpoint,
    index_name=INDEX_NAME,
)
aoai_client = AzureOpenAI(
    azure_endpoint=azure_openai_endpoint,
    api_key=azure_openai_key,
    api_version=azure_openai_api_version,
)

In [3]:
# Names that link the vector field, its search profile, the HNSW algorithm
# configuration and the vectorizer; each must match wherever it is referenced.
HNSW_ALGORITHM_NAME = "myHnsw"
HNSW_PROFILE_NAME = "myHnswProfile"
VECTORIZER_NAME = "myVectorizer"

# --- Index schema -----------------------------------------------------------
# Document key.
id_field = SimpleField(
    name="id",
    type=SearchFieldDataType.String,
    key=True,
    filterable=True,
    sortable=True,
    facetable=True,
)
# Metadata about where the chunk came from.
filepath_field = SimpleField(name="filepath", type=SearchFieldDataType.String)
page_field = SimpleField(name="page", type=SearchFieldDataType.String)
polygon_field = SimpleField(
    name="polygon",
    type=SearchFieldDataType.Collection(SearchFieldDataType.Double),
)
# Full-text searchable chunk text.
content_field = SearchableField(
    name="content",
    type=SearchFieldDataType.String,
    searchable=True,
)
# Embedding of the chunk text, searched through the HNSW profile.
content_vector_field = SearchField(
    name="contentVector",
    type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
    searchable=True,
    vector_search_dimensions=azure_openai_embedding_dimensions,
    vector_search_profile_name=HNSW_PROFILE_NAME,
)

fields = [
    id_field,
    filepath_field,
    page_field,
    polygon_field,
    content_field,
    content_vector_field,
]

# --- Vector search ----------------------------------------------------------
# The vectorizer lets the search service call the Azure OpenAI embedding
# deployment itself, using the endpoint, deployment, model and key given here.
openai_vectorizer = AzureOpenAIVectorizer(
    vectorizer_name=VECTORIZER_NAME,
    parameters=AzureOpenAIVectorizerParameters(
        resource_url=azure_openai_endpoint,
        deployment_name=azure_openai_embedding_deployment,
        model_name=embedding_model_name,
        api_key=azure_openai_key,
    ),
)
hnsw_profile = VectorSearchProfile(
    name=HNSW_PROFILE_NAME,
    algorithm_configuration_name=HNSW_ALGORITHM_NAME,
    vectorizer_name=VECTORIZER_NAME,
)
vector_search = VectorSearch(
    algorithms=[HnswAlgorithmConfiguration(name=HNSW_ALGORITHM_NAME)],
    profiles=[hnsw_profile],
    vectorizers=[openai_vectorizer],
)

# --- Semantic ranking -------------------------------------------------------
# Only the "content" field is offered to the semantic ranker.
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=SemanticPrioritizedFields(
        content_fields=[SemanticField(field_name="content")],
    ),
)

# Wrap the single configuration in the index-level semantic settings.
semantic_search = SemanticSearch(configurations=[semantic_config])

In [None]:
# Assemble the index definition from the schema, vector-search and semantic
# settings defined above, then create it on the service (or update it if an
# index with this name already exists).
index = SearchIndex(
    name=INDEX_NAME,
    fields=fields,
    semantic_search=semantic_search,
    vector_search=vector_search,
)
result = index_client.create_or_update_index(index)
print(result.name, "created or updated")

## Populate the index

In [None]:
from azure.ai.documentintelligence.models import DocumentTable, DocumentParagraph
from datetime import datetime

def get_element(doc, pointer):
    """Print the content referenced by an element pointer.

    Args:
        doc: Object exposing element collections (for example ``paragraphs``
            or ``tables``) as attributes that can be indexed by position.
        pointer: Reference of the form ``"/<collection>/<index>"``, for
            example ``"/paragraphs/3"``.

    Returns:
        None. Tables and paragraphs are written to stdout; pointers into
        ``sections`` are ignored; any other leaf element type prints nothing.
    """
    content_type, index = pointer.split('/')[1:]
    if content_type == 'sections':
        return
    element = getattr(doc, content_type)[int(index)]

    if hasattr(element, 'elements'):
        # Container element: recurse into its children. The attribute can be
        # present but None (no children), which must not raise.
        for child_pointer in element.elements or []:
            get_element(doc, child_pointer)
        return

    if isinstance(element, DocumentTable):
        # Print cells in fixed-width columns, starting a new output line
        # whenever the row index increases.
        last_row = 0
        for item in element.cells:
            if item.row_index > last_row:
                print()
                last_row = item.row_index
            print(f"{item.content:<80}", end="")
    elif isinstance(element, DocumentParagraph):
        print(f"{'':>10} {element.content}")

def generate_embedding(text):
    """Return the embedding of *text* produced by the Azure OpenAI client.

    The request goes to the deployment named by
    ``azure_openai_embedding_deployment`` and asks for vectors of
    ``azure_openai_embedding_dimensions`` dimensions.

    Args:
        text: The string to embed.

    Returns:
        The ``embedding`` of the first (and only) item in the response.
    """
    response = aoai_client.embeddings.create(
        model=azure_openai_embedding_deployment,
        dimensions=azure_openai_embedding_dimensions,
        input=[text],
    )
    # A single input was sent, so the result of interest is the first item.
    first_item = response.data[0]
    return first_item.embedding

def chunk_document_whole(document):
    """Yield the entire document as one chunk.

    Args:
        document: Object whose ``content`` attribute holds the full text.

    Yields:
        A single dict with ``id`` 0, ``page`` "1" and the document's
        ``content``.
    """
    yield dict(id=0, page="1", content=document.content)

def chunk_document_by_page(document):
    """Yield one chunk per page, with the page's lines joined by newlines.

    Args:
        document: Object whose ``pages`` attribute is a sequence of pages;
            each page has a ``lines`` attribute holding objects with a
            ``content`` string, or ``None``/empty when the page has no lines.

    Yields:
        A dict per page with ``id`` (0-based position), ``page`` (1-based
        position as a string) and ``content`` (the joined line text, which is
        the empty string for a page without lines).
    """
    for i, page in enumerate(document.pages):
        # `lines` can be None for a page without text; treat that the same
        # as an empty list instead of raising TypeError.
        page_lines = page.lines or []
        yield {
            'id': i,
            'page': str(i + 1),
            'content': "\n".join(line.content for line in page_lines),
        }


In [None]:
import re

# Build one search document per page chunk of every parsed PDF, embedding the
# chunk text so it can be stored in the "contentVector" field.
documents = []
for pdf_name, pdf_doc in parsed_pdfs.items():
    # Index chunks
    for chunk in chunk_document_by_page(pdf_doc):
        # The embeddings endpoint rejects empty input, and a blank page has
        # nothing to search for, so skip it instead of failing the whole run.
        if not chunk['content'].strip():
            print(f"Skipping empty page {chunk['page']} of {pdf_name}")
            continue
        embedding = generate_embedding(chunk['content'])
        # Azure AI Search keys may only contain letters, digits, '_', '-' and
        # '='. File names often contain spaces, dots or parentheses, so replace
        # anything else with '_'. Keys that were already valid are unchanged.
        # NOTE(review): two names differing only in replaced characters would
        # collide - confirm this cannot happen with the PDFs in use.
        doc_id = re.sub(r"[^\w\-=]", "_", f"{pdf_name}_{chunk['id']}")
        doc = {
            "id": doc_id,
            "filepath": pdf_name,
            "page": chunk['page'],
            "polygon": [],
            "content": chunk['content'],
            "contentVector": embedding,
        }
        documents.append(doc)
    
print(f"Indexing {len(documents)} document chunks")

In [None]:
# Upload in bounded batches: a single indexing request is limited in both
# document count and payload size, and every document carries a large vector.
UPLOAD_BATCH_SIZE = 500

upload_results = []
for start in range(0, len(documents), UPLOAD_BATCH_SIZE):
    batch = documents[start:start + UPLOAD_BATCH_SIZE]
    upload_results.extend(search_client.upload_documents(documents=batch))

# upload_documents reports per-document outcomes instead of raising for each
# rejected document, so surface failures explicitly rather than ignoring them.
failed_uploads = [item for item in upload_results if not item.succeeded]
print(
    f"Uploaded {len(upload_results) - len(failed_uploads)} "
    f"of {len(documents)} document chunks"
)
for item in failed_uploads:
    print(f"  FAILED {item.key}: {item.error_message}")