# Azure AI Search Multimodal Retrieval Demo

## Introduction
In this demo, we will show you how to create a multimodal (text + images) vector index in Azure AI Search.

## Prerequisites
- 🐍 Python 3.9 or higher
- ☁️ Azure Blob Storage
- 🔗 Azure AI Vision Service or Azure AI Multi-Service Account
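- 🔗 Azure AI Search Service
- 🔗 OpenAI or Azure OpenAI access to a vision-capable chat model (used below to generate detailed image descriptions)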

## Features Covered
This demo covers the following features:
- ✅ Stored=False
- ✅ Scalar Quantization to int8
- ✅ Reranking w/full precision vectors
- ✅ Oversampling
- ✅ Integrated Vectorization
- ✅ Multi-Vector Search
- ✅ Generate Dense Captions with AI Vision Image Analysis API
- ✅ **[NEW]** Azure AI Vision Embedding Skill
- ✅ **[NEW]** Azure AI Vision Vectorizer
- ✅ **[NEW]** Azure AI Vision Latest Multilingual Model
- ✅ **[NEW]** Vector Weighting

Let's get started!

In [None]:
#%pip install azure-search-documents --pre --quiet
#%pip install azure-search-documents==11.6.0b4
#%pip install openai python-dotenv azure-identity cohere azure-ai-vision-imageanalysis --quiet
#%pip install azure-storage-blob

In [None]:
from dotenv import load_dotenv
from pathlib import Path 
import os

# Settings and keys are loaded from a secrets.env file in the current working directory.
env_path = Path("secrets.env")
load_dotenv(dotenv_path=env_path)

In [None]:
from openai import AzureOpenAI, OpenAI
import base64


# Azure OpenAI settings. Indexing os.environ raises KeyError immediately
# when one of these variables is not set.
endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
api_key = os.environ["AZURE_OPENAI_KEY"]
# Deployment name for the chat model we want to use.
deployment = os.environ["AZURE_OPENAI_CHATGPT_DEPLOYMENT"]
model: str = "text-embedding-ada-002"

# Client for the Azure-hosted OpenAI endpoint (reuses the values read above).
azure_client = AzureOpenAI(
    azure_endpoint=endpoint,
    api_key=api_key,
    api_version="2023-12-01-preview",
)

# Client for the public OpenAI API; the key comes from OPENAI_KEY.
openai_client = OpenAI(api_key=os.getenv("OPENAI_KEY"))

def encode_image(image_path):
    """Return the contents of the file at image_path as a base64 text string.

    The file is opened in binary mode and read completely; the base64 bytes
    are decoded as UTF-8 so the result can be embedded in text.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")
  


def get_chatgpt_base_response_with_image(userQuery, system_message, image_path, client_type="openai", base64_image=None, model=None, max_tokens=4000, mime_type="image/jpeg"):
    """Send a text prompt plus one image to a chat model and return its reply.

    Args:
        userQuery: Text placed in the user message next to the image.
        system_message: Text placed in the system message.
        image_path: Path of the image file to send. Only read when
            base64_image is None; may be None when base64_image is given.
        client_type: "openai" selects openai_client; any other value selects
            azure_client.
        base64_image: Already base64-encoded image data. When provided,
            image_path is not read.
        model: Model (or deployment) name. Defaults to the DEPLOYMENT_NAME
            environment variable.
        max_tokens: Upper bound on the number of tokens in the reply.
        mime_type: MIME type written into the image data URL.

    Returns:
        The content of the first choice's message. The request sets
        response_format to json_object, so this is expected to be JSON text.

    Raises:
        ValueError: If neither image_path nor base64_image is provided.
    """
    if base64_image is None:
        # Fail with a clear message instead of letting open(None) raise an
        # unrelated TypeError inside encode_image.
        if image_path is None:
            raise ValueError("Either image_path or base64_image must be provided.")
        base64_image = encode_image(image_path)

    if model is None:
        model = os.getenv('DEPLOYMENT_NAME')

    print(f"model used:{model}")

    if client_type == "openai":
        client = openai_client
        print("openai client used")
    else:
        client = azure_client
        print("azure client used")

    # The image travels inline as a data URL instead of a fetchable link.
    image_part = {
        "type": "image_url",
        "image_url": {
            "url": f"data:{mime_type};base64,{base64_image}",
            "detail": "high",
        },
    }
    messages = [
        {"role": "system", "content": system_message},
        {
            "role": "user",
            "content": [
                {"type": "text", "text": userQuery},
                image_part,
            ],
        },
    ]

    completion = client.chat.completions.create(
        model=model,
        messages=messages,
        response_format={"type": "json_object"},
        temperature=0,
        max_tokens=max_tokens,
        stream=False,
    )

    return completion.choices[0].message.content

In [None]:
# list files in path downloads
import os
from os import listdir
from os.path import isfile, join
# Folder containing the local sample images.
dir_path = "downloads"

# System prompt shared by every image-description request in this notebook.
system_message = """You are an AI assistant that describe images in detail
Sample Json format:
{
    "image": "image_path",
    "description": "image description in detail." 
    
}

"""




# Describe only the first image file as a quick check that the model call works.
for f in listdir(dir_path):
    image_path = join(dir_path, f)
    # Sub-directories cannot be opened as images; move on to the next entry.
    if not isfile(image_path):
        continue
    # Keyword arguments keep the system prompt and the user prompt in their
    # intended roles (the helper's first positional parameter is userQuery).
    response = get_chatgpt_base_response_with_image(
        userQuery=f"describe the attached image in detail in json format. image_path: {image_path}",
        system_message=system_message,
        image_path=image_path,
        client_type="openai",
        model="gpt-4o",
        max_tokens=200,
    )
    print(response)
    break



def get_image_description(base64_img, image_name=None):
    """Ask the gpt-4o model for a detailed JSON description of an image.

    Args:
        base64_img: Base64-encoded image data sent inline with the request.
        image_name: Optional label appended to the prompt as "image_path: ..."
            so the model can echo it back. Nothing is appended when omitted.

    Returns:
        The model reply as returned by get_chatgpt_base_response_with_image.
    """
    # Build the prompt from the arguments only; it must not depend on a
    # notebook-global image_path left over from an unrelated loop.
    user_query = "describe the attached image in detail in json format."
    if image_name is not None:
        user_query += f" image_path: {image_name}"

    # NOTE(review): max_tokens=200 may cut a long description short and leave
    # incomplete JSON - confirm this limit is intended.
    return get_chatgpt_base_response_with_image(
        userQuery=user_query,
        system_message=system_message,
        image_path=None,
        client_type="openai",
        base64_image=base64_img,
        model="gpt-4o",
        max_tokens=200,
    )


In [None]:
# Load environment variables
#load_dotenv()

# Configuration: every value is None when its environment variable is not set.

# Azure AI Vision
AZURE_AI_VISION_API_KEY = os.environ.get("AZURE_AI_VISION_KEY")
AZURE_AI_VISION_ENDPOINT = os.environ.get("AZURE_AI_VISION_ENDPOINT")

# Azure OpenAI
AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")

# Azure Blob Storage
BLOB_CONNECTION_STRING = os.environ.get("BLOB_CONNECTION_STRING")
BLOB_CONTAINER_NAME = os.environ.get("BLOB_CONTAINER_NAME")
SEARCH_BLOB_CONTAINER = os.environ.get("SEARCH_BLOB_CONTAINER")

# Azure AI Search
INDEX_NAME = os.environ.get("AZURE_SEARCH_INDEX")
SEARCH_SERVICE_API_KEY = os.environ.get("AZURE_SEARCH_ADMIN_KEY")
SEARCH_SERVICE_ENDPOINT = os.environ.get("AZURE_SEARCH_SERVICE_ENDPOINT")

## Generate JSON with text and images via Azure AI Vision Studio
https://portal.vision.cognitive.azure.com/demo/image-captioning

In [None]:
import json
import os
from uuid import uuid4

from azure.ai.vision.imageanalysis import ImageAnalysisClient
from azure.ai.vision.imageanalysis.models import VisualFeatures
from azure.core.credentials import AzureKeyCredential
from azure.storage.blob import BlobServiceClient

# Get environment variables for Azure AI Vision and Blob Storage.
# os.environ[...] raises KeyError for a missing variable (os.getenv would
# silently return None), so the handler below actually runs.
try:
    endpoint = os.environ["AZURE_AI_VISION_ENDPOINT"]
    key = os.environ["AZURE_AI_VISION_KEY"]
    connection_string = os.environ["BLOB_CONNECTION_STRING"]
    container_name = os.environ["BLOB_CONTAINER_NAME"]
    #container_name = "vector-sandbox"
except KeyError as e:
    print(f"Missing environment variable: {str(e)}")
    print("Set them before running this sample.")
    exit()

# Create an Image Analysis client (looked up by get_caption under the name `client`)
client = ImageAnalysisClient(endpoint=endpoint, credential=AzureKeyCredential(key))

# Setup for Azure Blob Storage
blob_service_client = BlobServiceClient.from_connection_string(connection_string)
container_client = blob_service_client.get_container_client(container_name)

In [None]:
# upload files to blob storage from local blob_files folder

local_images_folder = "blob_files"


def upload_files_to_blob_storage():
    """Upload every regular file in local_images_folder to the blob container.

    Each file is uploaded under its own file name through container_client,
    overwriting any blob that already has that name. Sub-directories are
    ignored.
    """
    file_names = [
        entry
        for entry in listdir(local_images_folder)
        if isfile(join(local_images_folder, entry))
    ]

    for file_name in file_names:
        target_blob = container_client.get_blob_client(file_name)
        with open(f"{local_images_folder}/{file_name}", "rb") as stream:
            target_blob.upload_blob(stream, overwrite=True)
            print(f"Uploaded {file_name} to blob storage")


upload_files_to_blob_storage()

In [None]:
import base64
# Smoke test: describe only the first blob in the container (note the `break`).
for blob in container_client.list_blobs():
    print(f"Analyzing {blob.name}")
    # read the image from the blob storage
    blob_client = container_client.get_blob_client(blob.name)
    image = blob_client.download_blob().readall()
    # The description helper takes the image as base64 text.
    base64_img = base64.b64encode(image).decode('utf-8')
    img_description = get_image_description(base64_img)
    # The reply is parsed as JSON and its "description" key is printed.
    print(json.loads(img_description)["description"])
    break
   

In [None]:
sas_token = "?" # Add the Azure Blob Storage SAS token here


def get_caption(image_url):
    """Return an Azure AI Vision caption for the image at image_url.

    Returns the caption text, "No caption available" when the analysis
    result has no caption, or "Error generating caption" when anything
    raises during the analysis (the error is printed, not re-raised).
    """
    try:
        analysis = client.analyze_from_url(
            image_url=image_url,
            visual_features=[VisualFeatures.CAPTION, VisualFeatures.READ],
            gender_neutral_caption=False,
        )
        caption = analysis.caption
        return caption.text if caption is not None else "No caption available"
    except Exception as err:
        # Best effort: one failing image must not abort the caller's batch.
        print(f"An error occurred: {err}")
        return "Error generating caption"

def generate_json_objects():
    """Build one search document (a dict) per blob in the image container.

    For every blob this requests a short caption from Azure AI Vision via
    get_caption and a detailed description from the chat model via
    get_image_description.

    Returns:
        A list of dicts with the keys "id", "imageUrl", "caption" and
        "imageDescription".
    """
    json_objects = []

    # Iterate over the blobs in the container
    for blob in container_client.list_blobs():
        print(f"Analyzing {blob.name}")

        # NOTE(review): the SAS token is part of imageUrl and therefore ends up
        # in the generated JSON file - confirm that this is acceptable.
        image_url = f"https://{blob_service_client.account_name}.blob.core.windows.net/{container_name}/{blob.name}{sas_token}"
        caption = get_caption(image_url)

        # read the image from the blob storage and hand it over as base64
        image = container_client.get_blob_client(blob.name).download_blob().readall()
        base64_img = base64.b64encode(image).decode('utf-8')
        img_description = get_image_description(base64_img)

        # The model reply is free text that is only expected to be JSON. A
        # truncated or malformed reply must not abort the whole batch, so fall
        # back to the short caption for this one image.
        try:
            description = json.loads(img_description)["description"]
        except (TypeError, ValueError, KeyError) as e:
            print(f"Could not parse the description for {blob.name}: {e}")
            description = caption

        json_objects.append(
            {
                "id": str(uuid4()),
                "imageUrl": image_url,
                "caption": caption,
                "imageDescription": description,
            }
        )

    return json_objects

def write_to_file(json_objects, output_path="build-demo.json"):
    """Serialize json_objects to output_path as indented JSON.

    Args:
        json_objects: JSON-serializable data, here the list of documents.
        output_path: Destination file. Defaults to "build-demo.json" in the
            current working directory; an existing file is overwritten.
    """
    # An explicit encoding keeps the output independent of the platform default.
    with open(output_path, "w", encoding="utf-8") as json_file:
        json.dump(json_objects, json_file, indent=4)

# Build one document per blob, then persist them all to build-demo.json.
json_objects = generate_json_objects()
write_to_file(json_objects)

In [None]:
from azure.storage.blob import BlobServiceClient  
import glob

def upload_sample_documents(
        blob_connection_string: str,
        blob_container_name: str,
        use_user_identity: bool = True,
        documents_directory: str = ".",
    ):
    """Upload every *.json file in documents_directory to a blob container.

    Args:
        blob_connection_string: Connection string of the storage account.
        blob_container_name: Target container; created when it does not exist.
        use_user_identity: When True, a DefaultAzureCredential is passed as the
            credential; when False, only the connection string is used.
        documents_directory: Folder searched (non-recursively) for *.json files.
    """
    credential = None
    if use_user_identity:
        # Imported here because the notebook's azure.identity import lives in a
        # later cell; without this the True branch raises NameError.
        from azure.identity import DefaultAzureCredential
        credential = DefaultAzureCredential()

    # Connect to Blob Storage
    blob_service_client = BlobServiceClient.from_connection_string(conn_str=blob_connection_string, credential=credential)
    container_client = blob_service_client.get_container_client(blob_container_name)
    if not container_client.exists():
        container_client.create_container()

    json_files = glob.glob(os.path.join(documents_directory, '*.json'))
    for file in json_files:
        with open(file, "rb") as data:
            # Blobs are named after the file and replaced when they already exist.
            name = os.path.basename(file)
            container_client.upload_blob(name=name, data=data, overwrite=True)

upload_sample_documents(
    blob_connection_string=BLOB_CONNECTION_STRING,
    blob_container_name=SEARCH_BLOB_CONTAINER,
    # Set to false if you want to use credentials included in the blob connection string
    # Otherwise your identity will be used as credentials
    use_user_identity=False
)
print(f"Setup sample data in {SEARCH_BLOB_CONTAINER}")

In [None]:
import os

from azure.core.credentials import AzureKeyCredential
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient, SearchIndexerClient
from azure.search.documents.indexes.models import (
    AIServicesVisionParameters,
    AIServicesVisionVectorizer,
    AIStudioModelCatalogName,
    AzureMachineLearningVectorizer,
    AzureOpenAIVectorizer,
    AzureOpenAIModelName,
    AzureOpenAIParameters,
    BlobIndexerDataToExtract,
    BlobIndexerParsingMode,
    CognitiveServicesAccountKey,
    DefaultCognitiveServicesAccount,
    ExhaustiveKnnAlgorithmConfiguration,
    ExhaustiveKnnParameters,
    FieldMapping,
    HnswAlgorithmConfiguration,
    HnswParameters,
    IndexerExecutionStatus,
    IndexingParameters,
    IndexingParametersConfiguration,
    InputFieldMappingEntry,
    OutputFieldMappingEntry,
    ScalarQuantizationCompressionConfiguration,
    ScalarQuantizationParameters,
    SearchField,
    SearchFieldDataType,
    SearchIndex,
    SearchIndexer,
    SearchIndexerDataContainer,
    SearchIndexerDataIdentity,
    SearchIndexerDataSourceConnection,
    SearchIndexerSkillset,
    SemanticConfiguration,
    SemanticField,
    SemanticPrioritizedFields,
    SemanticSearch,
    SimpleField,
    VectorSearch,
    VectorSearchAlgorithmKind,
    VectorSearchAlgorithmMetric,
    VectorSearchProfile,
    VisionVectorizeSkill
)
from azure.search.documents.models import (
    HybridCountAndFacetMode,
    HybridSearch,
    SearchScoreThreshold,
    VectorizableTextQuery,
    VectorizableImageBinaryQuery,
    VectorizableImageUrlQuery,
    VectorSimilarityThreshold,
)
from azure.storage.blob import BlobServiceClient
from dotenv import load_dotenv
from IPython.display import Image, display, HTML
from openai import AzureOpenAI

In [None]:
print(AZURE_AI_VISION_ENDPOINT)
# Only report whether the key is present: cell output is saved with the
# notebook, so printing the raw key would leak the secret.
print("AZURE_AI_VISION_API_KEY is set" if AZURE_AI_VISION_API_KEY else "AZURE_AI_VISION_API_KEY is missing")

In [None]:
# User-specified parameter
# True  -> authenticate with DefaultAzureCredential (AAD)
# False -> authenticate with the search service API key
USE_AAD_FOR_SEARCH = False  # Set this to False to use API key for authentication

def authenticate_azure_search(api_key=None, use_aad_for_search=False):
    """Return the credential object used to talk to Azure AI Search.

    Args:
        api_key: Search service key; required when use_aad_for_search is False.
        use_aad_for_search: When True, a DefaultAzureCredential is returned and
            api_key is ignored.

    Returns:
        A DefaultAzureCredential or an AzureKeyCredential wrapping api_key.

    Raises:
        ValueError: If key authentication is selected but api_key is None.
    """
    if use_aad_for_search:
        print("Using AAD for authentication.")
        return DefaultAzureCredential()

    print("Using API keys for authentication.")
    if api_key is None:
        raise ValueError("API key must be provided if not using AAD for authentication.")
    return AzureKeyCredential(api_key)

# Credential shared by the search index, indexer and query clients created below.
azure_search_credential = authenticate_azure_search(api_key=SEARCH_SERVICE_API_KEY, use_aad_for_search=USE_AAD_FOR_SEARCH)


## Create a blob data source connector on Azure AI Search

In [None]:
from azure.search.documents.indexes import SearchIndexerClient
from azure.search.documents.indexes.models import (
    SearchIndexerDataContainer,
    SearchIndexerDataSourceConnection,
    SoftDeleteColumnDeletionDetectionPolicy
)

# Use the admin key when one is configured; otherwise fall back to DefaultAzureCredential.
if os.getenv("AZURE_SEARCH_ADMIN_KEY"):
    credential = AzureKeyCredential(os.getenv("AZURE_SEARCH_ADMIN_KEY"))
else:
    credential = DefaultAzureCredential()


# Create a data source
# NOTE: To remove records from a search index, add a column "IsDeleted" to the row and set it to "True".
# The next indexer run will then remove this record.
# To learn more please visit https://learn.microsoft.com/en-us/azure/search/search-howto-index-one-to-many-blobs
indexer_client = SearchIndexerClient(SEARCH_SERVICE_ENDPOINT, credential)
container = SearchIndexerDataContainer(name=SEARCH_BLOB_CONTAINER)
data_source_connection = SearchIndexerDataSourceConnection(
    container=container,
    connection_string=BLOB_CONNECTION_STRING,
    type="azureblob",
    name=f"{INDEX_NAME}-blob",
    data_deletion_detection_policy=SoftDeleteColumnDeletionDetectionPolicy(
        soft_delete_marker_value="True",
        soft_delete_column_name="IsDeleted",
    ),
)
data_source = indexer_client.create_or_update_data_source_connection(data_source_connection)

print(f"Data source '{data_source.name}' created or updated")

## Create a search index

In [None]:
def create_fields():
    """Return the list of field definitions for the search index.

    The schema consists of a string key ("id"), three searchable string
    fields and three 1024-dimension vector fields that all use the
    "myHnswProfile" vector search profile.
    """
    vector_type = SearchFieldDataType.Collection(SearchFieldDataType.Single)

    def vector_field(field_name):
        # NOTE(review): stored=False combined with retrievable=True looks
        # contradictory - confirm how the SDK/service treats this pair.
        return SearchField(
            name=field_name,
            type=vector_type,
            vector_search_dimensions=1024,
            vector_search_profile_name="myHnswProfile",
            stored=False,
            retrievable=True,
        )

    key_field = SimpleField(
        name="id", type=SearchFieldDataType.String, key=True, filterable=True
    )
    text_fields = [
        SearchField(name=field_name, type=SearchFieldDataType.String, searchable=True)
        for field_name in ("caption", "imageUrl", "imageDescription")
    ]
    vector_fields = [
        vector_field(field_name)
        for field_name in ("imageDescriptionVector", "captionVector", "imageVector")
    ]
    return [key_field, *text_fields, *vector_fields]


def create_vector_search_configuration():
    """Return the VectorSearch settings used by the index.

    Combines one HNSW algorithm (cosine metric), one int8 scalar-quantization
    compression that reranks with the original vectors, one Azure AI Vision
    vectorizer, and the "myHnswProfile" profile that ties the three together.
    """
    hnsw_algorithm = HnswAlgorithmConfiguration(
        name="myHnsw",
        parameters=HnswParameters(
            m=4,
            ef_construction=400,
            ef_search=500,
            metric=VectorSearchAlgorithmMetric.COSINE,
        ),
    )

    scalar_quantization = ScalarQuantizationCompressionConfiguration(
        name="myScalarQuantization",
        rerank_with_original_vectors=True,
        default_oversampling=10,
        parameters=ScalarQuantizationParameters(quantized_data_type="int8"),
    )

    # The vectorizer is configured with the Azure AI Vision endpoint and key.
    vision_vectorizer = AIServicesVisionVectorizer(
        name="myAIServicesVectorizer",
        kind="aiServicesVision",
        ai_services_vision_parameters=AIServicesVisionParameters(
            model_version="2023-04-15",
            resource_uri=AZURE_AI_VISION_ENDPOINT,
            api_key=AZURE_AI_VISION_API_KEY,
        ),
    )

    hnsw_profile = VectorSearchProfile(
        name="myHnswProfile",
        algorithm_configuration_name="myHnsw",
        compression_configuration_name="myScalarQuantization",
        vectorizer="myAIServicesVectorizer",
    )

    return VectorSearch(
        algorithms=[hnsw_algorithm],
        compressions=[scalar_quantization],
        vectorizers=[vision_vectorizer],
        profiles=[hnsw_profile],
    )


def create_search_index(index_client, index_name, fields, vector_search):
    """Build a SearchIndex and submit it via index_client.create_or_update_index.

    Args:
        index_client: Client exposing create_or_update_index.
        index_name: Name of the index.
        fields: Field definitions for the index.
        vector_search: Vector search configuration for the index.
    """
    index_client.create_or_update_index(
        index=SearchIndex(name=index_name, fields=fields, vector_search=vector_search)
    )


# Client for index management, authenticated with the credential chosen earlier.
index_client = SearchIndexClient(
    endpoint=SEARCH_SERVICE_ENDPOINT, credential=azure_search_credential
)
# Schema and vector search settings are built by the helper functions above.
fields = create_fields()
vector_search = create_vector_search_configuration()

# Create the search index with the adjusted schema
create_search_index(index_client, INDEX_NAME, fields, vector_search)
print(f"Created index: {INDEX_NAME}")

## Create a Skillset    

In [None]:
# Only report whether the key is present: cell output is saved with the
# notebook, so printing the raw key would leak the secret.
print("AZURE_AI_VISION_API_KEY is set" if AZURE_AI_VISION_API_KEY else "AZURE_AI_VISION_API_KEY is missing")

In [None]:
def create_text_embedding_skill():
    """Return the skill that embeds /document/caption into captionVector."""
    caption_input = InputFieldMappingEntry(name="text", source="/document/caption")
    vector_output = OutputFieldMappingEntry(name="vector", target_name="captionVector")
    return VisionVectorizeSkill(
        name="text-embedding-skill",
        description="Skill to generate embeddings for text via Azure AI Vision",
        context="/document",
        model_version="2023-04-15",
        inputs=[caption_input],
        outputs=[vector_output],
    )

def create_image_description_text_embedding_skill():
    """Return the skill that embeds /document/imageDescription into imageDescriptionVector."""
    description_input = InputFieldMappingEntry(name="text", source="/document/imageDescription")
    vector_output = OutputFieldMappingEntry(name="vector", target_name="imageDescriptionVector")
    return VisionVectorizeSkill(
        name="image-description-text-embedding-skill",
        description="Skill to generate embeddings for text via Azure AI Vision",
        context="/document",
        model_version="2023-04-15",
        inputs=[description_input],
        outputs=[vector_output],
    )

def create_image_embedding_skill():
    """Return the skill that embeds the image at /document/imageUrl into imageVector."""
    url_input = InputFieldMappingEntry(name="url", source="/document/imageUrl")
    vector_output = OutputFieldMappingEntry(name="vector", target_name="imageVector")
    return VisionVectorizeSkill(
        name="image-embedding-skill",
        description="Skill to generate embeddings for image via Azure AI Vision",
        context="/document",
        model_version="2023-04-15",
        inputs=[url_input],
        outputs=[vector_output],
    )

def create_skillset(client, skillset_name, text_embedding_skill, image_description_text_embedding_skill, image_embedding_skill):
    """Assemble the three embedding skills into a skillset and submit it.

    Args:
        client: Client exposing create_or_update_skillset.
        skillset_name: Name of the skillset.
        text_embedding_skill: Skill that embeds the caption text.
        image_description_text_embedding_skill: Skill that embeds the
            description text.
        image_embedding_skill: Skill that embeds the image itself.
    """
    # The skills are billed against the AI Vision key from the configuration.
    vision_account = CognitiveServicesAccountKey(
        key=AZURE_AI_VISION_API_KEY,
        description="AI Vision Multi Service Account in West US",
    )
    embedding_skills = [
        text_embedding_skill,
        image_description_text_embedding_skill,
        image_embedding_skill,
    ]
    client.create_or_update_skillset(
        SearchIndexerSkillset(
            name=skillset_name,
            description="Skillset for generating embeddings",
            skills=embedding_skills,
            cognitive_services_account=vision_account,
        )
    )

# Use a dedicated name for this client. Rebinding the notebook-wide `client`
# would replace the ImageAnalysisClient that get_caption() looks up at call
# time, so re-running the captioning cell afterwards would fail.
skillset_client = SearchIndexerClient(
    endpoint=SEARCH_SERVICE_ENDPOINT, credential=azure_search_credential
)
skillset_name = f"{INDEX_NAME}-skillset"
text_embedding_skill = create_text_embedding_skill()
image_description_text_embedding_skill = create_image_description_text_embedding_skill()
image_embedding_skill = create_image_embedding_skill()

create_skillset(skillset_client, skillset_name, text_embedding_skill, image_description_text_embedding_skill, image_embedding_skill)
print(f"Created skillset: {skillset_name}")

## Run Indexer

In [None]:
from azure.search.documents.indexes.models import (
    SearchIndexer,
    FieldMapping,
    FieldMappingFunction,
    IndexingParameters,
    IndexingParametersConfiguration,
    BlobIndexerParsingMode
)

# Create an indexer
indexer_name = f"{INDEX_NAME}-indexer"

# The indexer is configured to parse each blob as a JSON array.
indexer_parameters = IndexingParameters(
    configuration=IndexingParametersConfiguration(
        query_timeout=None,
        parsing_mode=BlobIndexerParsingMode.JSON_ARRAY,
    )
)

indexer = SearchIndexer(
    name=indexer_name,
    description="Indexer to index documents and generate embeddings",
    data_source_name=data_source.name,
    skillset_name=skillset_name,
    target_index_name=INDEX_NAME,
    parameters=indexer_parameters,
    field_mappings=[FieldMapping(source_field_name="id", target_field_name="id")],
    # Each skill output at /document/<name> is mapped to the index field of the same name.
    output_field_mappings=[
        FieldMapping(source_field_name=f"/document/{vector_field}", target_field_name=vector_field)
        for vector_field in ("captionVector", "imageDescriptionVector", "imageVector")
    ],
)

indexer_client = SearchIndexerClient(SEARCH_SERVICE_ENDPOINT, credential)
indexer_result = indexer_client.create_or_update_indexer(indexer)

# Run the indexer
indexer_client.run_indexer(indexer_name)
print(f'{indexer_name} is created and running. If queries return no results, please wait a bit and try again.')

## Simple vector search (text to text)

In [None]:
# Initialize the SearchClient
search_client = SearchClient(
    SEARCH_SERVICE_ENDPOINT,
    index_name=INDEX_NAME,
    credential=azure_search_credential,
)

# Define the query
# query = "sunglasses for holiday"
# query = "休日のサングラス" # Japanese query
query = "trees with buildings" # English query

# The raw text is passed in the query; presumably it is vectorized service-side
# by the vectorizer attached to the field's profile - TODO confirm.
vector_query = VectorizableTextQuery(
    text=query,
    k_nearest_neighbors=3,
    fields="captionVector",
    # fields="imageVector",
)

# Perform the search (pure vector search: no keyword text is supplied)
results = search_client.search(
    search_text=None,
    vector_queries=[vector_query],
    top=3
)


# Print the results and render each image inline from its stored URL
for result in results:
    print(f"Caption: {result['caption']}")
    print(f"Score: {result['@search.score']}")
    display(HTML(f'<img src="{result["imageUrl"]}" style="width:200px;"/>'))
    print("-" * 50) 

## Multi-vector search (text query + image query)

In [None]:
# Define the text query (matched against the caption embeddings)
query = "city with buildings"
text_vector_query = VectorizableTextQuery(
    text=query,
    k_nearest_neighbors=10,
    fields="captionVector",
)
# Define the image query (matched against the image embeddings)
image_vector_query = VectorizableImageUrlQuery(  # Alternatively, use VectorizableImageBinaryQuery
    url="https://media.gettyimages.com/id/155422469/photo/office-skysraper-in-the-sun.jpg?s=1024x1024&w=gi&k=20&c=E32XYAydthNC2NY59OqU2PzGes_i40E8aywKIgtnSBI=",  # skyscraper
    k_nearest_neighbors=10,
    fields="imageVector",
)

# Perform the search with both vector queries in a single request
results = search_client.search(
    search_text=None, vector_queries=[text_vector_query, image_vector_query], top=3
)

# Print the results
for result in results:
    print(f"Caption: {result['caption']}")
    print(f"Score: {result['@search.score']}")
    print(f"URL: {result['imageUrl']}")
    display(HTML(f'<img src="{result["imageUrl"]}" style="width:200px;"/>'))
    print("-" * 50)  

## Multi-modal vector search with weighting images 100x more than captions

In [None]:
# Define the text query (no weight is set on this query)
query = "city with trees an buildings"
text_vector_query = VectorizableTextQuery(
    text=query,
    k_nearest_neighbors=5,
    fields="captionVector",
)
# Define the image query; weight=100 makes it count far more than the text query
image_vector_query = VectorizableImageUrlQuery(  # Alternatively, use VectorizableImageBinaryQuery
    url="https://media.gettyimages.com/id/1326704523/photo/sunrise-skyline-view-of-midtown-manhattan-and-lower-manhattan.jpg?s=1024x1024&w=gi&k=20&c=VWOJfHBYc0YGRYqN1vAysD6KXsYJqf3s-afHa8tl9dY=",  # New York skyline
    k_nearest_neighbors=5,
    fields="imageVector",
    weight=100,
)

# Perform the search with both vector queries in a single request
results = search_client.search(
    search_text=None, vector_queries=[text_vector_query, image_vector_query], top=3
)

# Print the results
for result in results:
    print(f"Caption: {result['caption']}")
    print(f"Score: {result['@search.score']}")
    print(f"URL: {result['imageUrl']}")
    display(HTML(f'<img src="{result["imageUrl"]}" style="width:200px;"/>'))
    print("-" * 50)  