### Install packages

In [1]:
! pip install -r requirements-nb.txt --quiet


[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


### Load .env file (Copy .env-sample to .env and update accordingly)

Set the appropriate environment variables below:

1. Use the [Document Layout Skill](https://learn.microsoft.com/en-us/azure/search/cognitive-search-skill-document-intelligence-layout) to convert PDFs and other compatible documents to markdown. It requires an [AI Services account](https://learn.microsoft.com/en-us/azure/search/cognitive-search-attach-cognitive-services) and a search service in a [supported region](https://learn.microsoft.com/en-us/azure/search/cognitive-search-attach-cognitive-services)
   1. Specify `AZURE_AI_SERVICES_KEY` if using key-based authentication, and specify `AZURE_AI_SERVICES_ENDPOINT`.


In [None]:
from dotenv import load_dotenv
from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential
import os

load_dotenv(override=True) # take environment variables from .env.

endpoint = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"]
credential = AzureKeyCredential(os.getenv("AZURE_SEARCH_ADMIN_KEY")) if os.getenv("AZURE_SEARCH_ADMIN_KEY") else DefaultAzureCredential()
index_namespace = os.getenv("AZURE_SEARCH_INDEX_NAMESPACE", "index-and-chat")
blob_connection_string = os.environ["BLOB_CONNECTION_STRING"]
search_blob_connection_string = os.getenv("SEARCH_BLOB_DATASOURCE_CONNECTION_STRING", blob_connection_string)
blob_container_name = os.getenv("BLOB_CONTAINER_NAME", "index-and-chat")
azure_ai_services_endpoint = os.environ["AZURE_AI_SERVICES_ENDPOINT"]
azure_ai_services_key = os.getenv("AZURE_AI_SERVICES_KEY", "")
document_layout_depth = os.getenv("LAYOUT_MARKDOWN_HEADER_DEPTH", "h3")

## Connect to Blob Storage and load documents

Retrieve documents from Blob Storage. You can use the sample documents in the data/documents folder.  

In [None]:
from azure.storage.blob import BlobServiceClient  
import glob

def upload_sample_documents(
        blob_connection_string: str,
        blob_container_name: str,
        documents_directory: str,
        # Set to false if you want to use credentials included in the blob connection string
        # Otherwise your identity will be used as credentials
        use_user_identity: bool = True,
    ):
        # Connect to Blob Storage
        blob_service_client = BlobServiceClient.from_connection_string(logging_enable=True, conn_str=blob_connection_string, credential=DefaultAzureCredential() if use_user_identity else None)
        container_client = blob_service_client.get_container_client(blob_container_name)
        if not container_client.exists():
            container_client.create_container()

        pdf_files = glob.glob(os.path.join(documents_directory, '*.pdf'))
        for file in pdf_files:
            with open(file, "rb") as data:
                name = os.path.basename(file)
                if not container_client.get_blob_client(name).exists():
                    container_client.upload_blob(name=name, data=data)

upload_sample_documents(
    blob_connection_string=blob_connection_string,
    blob_container_name=blob_container_name,
    # documents_directory = os.path.join("..", "..", "..", "data", "layoutdocuments")
    documents_directory=r"your path to the documents directory",
)

print(f"Setup sample data in {blob_container_name}")

Setup sample data in index-and-chat


## Create a blob data source connector on Azure AI Search

In [None]:
from azure.search.documents.indexes import SearchIndexerClient
from azure.search.documents.indexes.models import (
    SearchIndexerDataContainer,
    SearchIndexerDataSourceConnection
)
from azure.search.documents.indexes.models import NativeBlobSoftDeleteDeletionDetectionPolicy

# Create a data source 
indexer_client = SearchIndexerClient(endpoint, credential)
container = SearchIndexerDataContainer(name=blob_container_name)
data_source_connection = SearchIndexerDataSourceConnection(
    name=f"{index_namespace}-blob",
    type="azureblob",
    connection_string=search_blob_connection_string,
    container=container,
    data_deletion_detection_policy=NativeBlobSoftDeleteDeletionDetectionPolicy()
)
data_source = indexer_client.create_or_update_data_source_connection(data_source_connection)

print(f"Data source '{data_source.name}' created or updated")

Data source 'test-blob' created or updated


## Create search index

Index created for storing markdown text.

In [None]:
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SearchField,
    SearchFieldDataType,
    SearchIndex
)

# Create a search index  
index_client = SearchIndexClient(endpoint=endpoint, credential=credential)  

index_fields = [  
    SearchField(name="parent_id", type=SearchFieldDataType.String, key=True, sortable=True, filterable=True, facetable=True),  
    SearchField(name="title", type=SearchFieldDataType.String, searchable=True, filterable=True, sortable=False, facetable=True),  
    SearchField(name="content", type=SearchFieldDataType.String, searchable=True, filterable=False, sortable=False, facetable=False), 
    SearchField(name="metadata_storage_path", type=SearchFieldDataType.String, filterable=True, sortable=False, facetable=True)
]


# Create the search indexes
index = SearchIndex(name=f"{index_namespace}-index", fields=index_fields)  
result = index_client.create_or_update_index(index)  
print(f"{result.name} created")

test-parent created


## Create a skillset

Skills drive integrated vectorization. [Document Layout](https://learn.microsoft.com/en-us/azure/search/cognitive-search-skill-document-intelligence-layout) analyzes a document to extract regions of interest and their inter-relationships to produce a syntactical representation of the document in Markdown format. This skill uses the [Document Intelligence layout model](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/concept-layout) provided in [Azure AI Document Intelligence](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/overview). [Text Split](https://learn.microsoft.com/azure/search/cognitive-search-skill-textsplit) provides data chunking. \

In [None]:
from azure.search.documents.indexes.models import (
    SplitSkill,
    InputFieldMappingEntry,
    OutputFieldMappingEntry,
    MergeSkill,
    SearchIndexerSkillset,
    AIServicesAccountKey,
    AIServicesAccountIdentity,
    DocumentIntelligenceLayoutSkill
)

# Create a skillset name 
skillset_name = f"{index_namespace}-skillset"


layout_skill = DocumentIntelligenceLayoutSkill(
    description="Layout skill to read documents",
    context="/document",
    output_mode="oneToMany",
    markdown_header_depth="h3",
    inputs=[
        InputFieldMappingEntry(name="file_data", source="/document/file_data")
    ],
    outputs=[
        OutputFieldMappingEntry(name="markdown_document", target_name="markdownDocument")
    ]
)

split_skill = SplitSkill(  
    description="Split skill to chunk documents",  
    text_split_mode="pages",  
    context="/document/markdownDocument/*",  
    maximum_page_length=2000,  
    page_overlap_length=500,  
    inputs=[  
        InputFieldMappingEntry(name="text", source="/document/markdownDocument/*/content"),  
    ],  
    outputs=[  
        OutputFieldMappingEntry(name="textItems", target_name="pages")  
    ]
)

merge_skill = MergeSkill(
    description="Merge skill to get full document content",
    insert_pre_tag="",
    insert_post_tag="\n",
    context="/document",
    inputs=[
        InputFieldMappingEntry(name="itemsToInsert", source="/document/markdownDocument/*/content")
    ],
    outputs=[
        OutputFieldMappingEntry(name="mergedText", target_name="content")
    ]
)


skills = [layout_skill, split_skill, merge_skill]

skillset = SearchIndexerSkillset(  
    name=skillset_name,  
    description="Skillset to process documents",  
    skills=skills,
    cognitive_services_account=AIServicesAccountKey(key=azure_ai_services_key, subdomain_url=azure_ai_services_endpoint) if azure_ai_services_key else AIServicesAccountIdentity(identity=None, subdomain_url=azure_ai_services_endpoint)
)

client = SearchIndexerClient(endpoint, credential)  
client.create_or_update_skillset(skillset)  
print(f"{skillset.name} created")  


test-skillset created


## Create an indexer

In [None]:
from azure.search.documents.indexes.models import (
    SearchIndexer,
    IndexingParameters,
    IndexingParametersConfiguration,
    FieldMapping
)

# Create an indexer  
indexer_name = f"{index_namespace}-indexer"  

indexer_parameters = IndexingParameters(
    configuration=IndexingParametersConfiguration(
        allow_skillset_to_read_file_data=True,
        data_to_extract="storageMetadata",
        query_timeout=None))

indexer = SearchIndexer(  
    name=indexer_name,  
    description="Indexer to index documents",  
    skillset_name=skillset_name,  
    target_index_name=index.name,  
    data_source_name=data_source.name,
    parameters=indexer_parameters,
    field_mappings=[
        FieldMapping(source_field_name="metadata_storage_name", target_field_name="title"),
    ],
    output_field_mappings=[
        FieldMapping(source_field_name="/document/content", target_field_name="content"),
    ]
)  

indexer_client = SearchIndexerClient(endpoint, credential)  
indexer_result = indexer_client.create_or_update_indexer(indexer)  
  
# Run the indexer  
indexer_client.run_indexer(indexer_name)  
print(f' {indexer_name} is created and running. If queries return no results, please wait a bit and try again.')  


 test-indexer is created and running. If queries return no results, please wait a bit and try again.
