# Azure Cognitive Search Vector Search Code Sample with Azure OpenAI
This code demonstrates how to use Azure Cognitive Search with Azure OpenAI and the Azure SDK for Python.
## Prerequisites
To run the code, install the following packages. Use the latest pre-release version of the search SDK: `pip install azure-search-documents --pre`.

In [None]:
! pip install azure-search-documents --pre
! pip install openai
! pip install python-dotenv

## Import required libraries and environment variables

In [52]:
# Import required libraries  
import os  
import json  
import openai  
from dotenv import load_dotenv  
from tenacity import retry, wait_random_exponential, stop_after_attempt  
from azure.core.credentials import AzureKeyCredential  
from azure.search.documents import SearchClient  
from azure.search.documents.indexes import SearchIndexClient  
from azure.search.documents.models import Vector  
from azure.search.documents.indexes.models import (  
    SearchIndex,  
    SearchField,  
    SearchFieldDataType,  
    SimpleField,  
    SearchableField,  
    SearchIndex,  
    SemanticConfiguration,  
    PrioritizedFields,  
    SemanticField,  
    SearchField,  
    SemanticSettings,  
    VectorSearch,  
    VectorSearchAlgorithmConfiguration,  
)  
  
# Load settings from a .env file into the process environment
load_dotenv()

# Azure OpenAI: these are module-level settings on the openai package
openai.api_type = "azure"
openai.api_base = os.environ.get("AZURE_OPENAI_ENDPOINT")
openai.api_version = os.environ.get("AZURE_OPENAI_API_VERSION")
openai.api_key = os.environ.get("AZURE_OPENAI_API_KEY")

# Azure Cognitive Search: endpoint, index name and admin/query key
service_endpoint = os.environ.get("AZURE_SEARCH_SERVICE_ENDPOINT")
index_name = os.environ.get("AZURE_SEARCH_INDEX_NAME")
key = os.environ.get("AZURE_SEARCH_API_KEY")
credential = AzureKeyCredential(key)

# Client for index-level operations; used below to create the index
index_client = SearchIndexClient(
    endpoint=service_endpoint,
    credential=credential,
)

## Create functions to create embeddings and calculate similarities

In [53]:
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def generate_embeddings(text, engine="text-embedding-ada-002"):
    """Return the embedding vector for ``text`` from Azure OpenAI.

    Used for the documents' content field and for query text, so both are
    embedded by the same deployment.

    The ``retry`` decorator re-invokes the call when it raises, waiting a
    randomized exponential interval (``min=1``, ``max=20``) between tries and
    giving up after 6 attempts.

    Args:
        text: The string to embed.
        engine: Name of the embedding deployment to call. Defaults to
            "text-embedding-ada-002", the value that used to be hard-coded;
            pass another name if your deployment is named differently.

    Returns:
        The value at ``response['data'][0]['embedding']`` of the API response.
    """
    response = openai.Embedding.create(input=text, engine=engine)
    return response['data'][0]['embedding']

import numpy as np


def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two equal-length vectors.

    Args:
        vec1: First vector (any sequence of numbers NumPy can convert).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (|vec1| * |vec2|)``. If either vector has zero
        magnitude the similarity is undefined; 0.0 is returned in that case
        instead of dividing by zero and yielding ``nan`` with a warning.
    """
    a = np.asarray(vec1)
    b = np.asarray(vec2)
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    # Guard the degenerate case before dividing.
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

## Manually calculate similarities

Generate some vectors

In [54]:
# Embed two sample passages so they can be compared against a query below.
enchilada_text = "An enchilada is a dish consisting of a corn tortilla rolled around a filling and covered with a savory sauce."
pizza_text = "Pizza is a dish consisting of a usually round, flattened base of leavened wheat-based dough topped with tomatoes, cheese, and often various other ingredients, which is then baked at a high temperature, traditionally in a wood-fired oven."

vec1 = generate_embeddings(enchilada_text)
vec2 = generate_embeddings(pizza_text)

Perform cosine similarity

In [56]:
# Embed the question once, then score each passage vector against it.
query_vector = generate_embeddings("What is a good Mexican food")
for passage_vector in (vec1, vec2):
    print(cosine_similarity(passage_vector, query_vector))

0.8218928550909893
0.7714511010492089


## Create Vector Search Index

Create some data

In [57]:
# Sample documents to index. Each one carries a string id, a category and the
# free-text content that is embedded and searched.
data = [
    {
        'id': '1',
        'category': 'standard',
        'content': 'An enchilada is a dish consisting of a corn tortilla rolled around a filling and covered with a savory sauce.',
    },
    {
        'id': '2',
        'category': 'standard',
        'content': 'Pizza is a dish consisting of a usually round, flattened base of leavened wheat-based dough topped with tomatoes, cheese, and often various other ingredients, which is then baked at a high temperature, traditionally in a wood-fired oven.',
    },
    {
        'id': '3',
        'category': 'premium',
        'content': 'Our secrete ingredient to making the perfect pizza sauce is to add Marzano tomoatoes and Pecorino parmesan cheese to the sauce.',
    },
    {
        'id': '4',
        'category': 'standard',
        'content': 'I know its cheesy but I love to cook',
    },
]

Generate embeddings for content field

In [58]:
# Store an embedding of each document's content under 'contentEmbeddings'
# (one embedding request per document).
for doc in data:
    doc['contentEmbeddings'] = generate_embeddings(doc['content'])

In [59]:
# Dump every document, including its full embedding list, to check the result.
for doc in data:
    print(doc)

{'id': '1', 'category': 'standard', 'content': 'An enchilada is a dish consisting of a corn tortilla rolled around a filling and covered with a savory sauce.', 'contentEmbeddings': [-0.01027632039040327, 0.0014435856137424707, 0.0022509892005473375, -0.004100178834050894, -0.015974564477801323, -0.008583893068134785, -0.004355869255959988, -0.02900260128080845, 0.0009048093343153596, -0.0243027675896883, 0.024193186312913895, 0.020893562585115433, 0.006775796879082918, 0.010154563002288342, -0.01613285019993782, 0.023523520678281784, 0.02404707670211792, -0.00299523095600307, -0.0105441864579916, -0.03613758459687233, -0.03433557227253914, 0.007134980987757444, -0.00353705114684999, 0.013551593758165836, -0.03616193309426308, 0.01168870646506548, -0.0009725368581712246, -0.009655358269810677, 0.023474818095564842, 0.006410524714738131, 0.010026718489825726, 0.015329251065850258, -0.026664860546588898, -0.03801264613866806, -0.044197920709848404, -0.022330299019813538, 0.015524062328040

Create the index fields and the search configuration

In [60]:
# Name that ties the vector field to the algorithm configuration it uses.
vector_config_name = "my-vector-config"

# --- Index schema -----------------------------------------------------------
id_field = SimpleField(
    name="id",
    type=SearchFieldDataType.String,
    key=True,
    sortable=True,
    filterable=True,
    facetable=True,
)
content_field = SearchableField(name="content", type=SearchFieldDataType.String)
category_field = SearchableField(
    name="category",
    type=SearchFieldDataType.String,
    filterable=True,
)
# vector_search_dimensions must equal the length of the stored embeddings
# (1536 for text-embedding-ada-002).
embedding_field = SearchField(
    name="contentEmbeddings",
    type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
    searchable=True,
    vector_search_dimensions=1536,
    vector_search_configuration=vector_config_name,
)
fields = [id_field, content_field, category_field, embedding_field]

# --- Vector search: HNSW with the cosine metric ------------------------------
hnsw_config = VectorSearchAlgorithmConfiguration(
    name=vector_config_name,
    kind="hnsw",
    hnsw_parameters={
        "m": 4,
        "efConstruction": 400,
        "efSearch": 500,
        "metric": "cosine",
    },
)
vector_search = VectorSearch(algorithm_configurations=[hnsw_config])

# --- Semantic configuration --------------------------------------------------
# No title_field is set: the schema above has no title field.
prioritized = PrioritizedFields(
    prioritized_keywords_fields=[SemanticField(field_name="category")],
    prioritized_content_fields=[SemanticField(field_name="content")],
)
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=prioritized,
)
semantic_settings = SemanticSettings(configurations=[semantic_config])

# --- Create (or update) the index --------------------------------------------
# This fixed name replaces whatever index_name held before this cell ran.
index_name = "vector-demo-index"

index = SearchIndex(
    name=index_name,
    fields=fields,
    vector_search=vector_search,
    semantic_settings=semantic_settings,
)
result = index_client.create_or_update_index(index)
print(f' {result.name} created')

 vector-demo-index created


Load data into Index

In [61]:
# Client bound to the index; used for uploading documents and for queries.
search_client = SearchClient(endpoint=service_endpoint, index_name=index_name, credential=credential)

# upload_documents returns one result per document. Count the ones that
# actually succeeded instead of assuming the whole batch was accepted, and
# report any that were rejected.
result = search_client.upload_documents(data)
failures = [doc_result for doc_result in result if not doc_result.succeeded]
print(f"Uploaded {len(result) - len(failures)} documents")
for failure in failures:
    print(f"Failed to upload document {failure.key}: {failure.error_message}")

Uploaded 4 documents


## Perform Vector/Hybrid Search Query

In [66]:
# Hybrid query: the same text is sent as the keyword query and, embedded, as
# the vector query. Only documents in the 'standard' category are considered.
# Other queries to try: "italian", "What is an encilada?"
query = "cheese food"
query_embedding = generate_embeddings(query)

results = search_client.search(
    search_text=query,
    vector=query_embedding,
    top_k=3,
    vector_fields="contentEmbeddings",
    filter="category eq 'standard'",
    select=["content", "category"],
    include_total_count=True,
)

# Print each hit followed by a blank line.
for hit in results:
    print(f"Score: {hit['@search.score']}")
    print(f"Content: {hit['content']}")
    print(f"Category: {hit['category']}\n")

Score: 0.03306011110544205
Content: Pizza is a dish consisting of a usually round, flattened base of leavened wheat-based dough topped with tomatoes, cheese, and often various other ingredients, which is then baked at a high temperature, traditionally in a wood-fired oven.
Category: standard

Score: 0.01666666753590107
Content: I know its cheesy but I love to cook
Category: standard

Score: 0.016129031777381897
Content: An enchilada is a dish consisting of a corn tortilla rolled around a filling and covered with a savory sauce.
Category: standard



## Semantic Hybrid Search

In [None]:
# Semantic hybrid query: keyword + vector search, re-ranked with the
# 'my-semantic-config' semantic configuration, requesting extractive answers
# and captions.
# Another query to try: "What is an encilada?"
query = "cheese food"

results = search_client.search(
    search_text=query,
    vector=generate_embeddings(query),
    top_k=3,
    vector_fields="contentEmbeddings",
    # No category filter here, so 'premium' documents are eligible as well;
    # add filter="category eq 'standard'" to restrict the results.
    select=["content", "category"],
    query_type="semantic",
    query_language="en-us",
    semantic_configuration_name='my-semantic-config',
    query_caption="extractive",
    query_answer="extractive",
    top=3,
)

# get_answers() may return None when the response carries no answers; fall
# back to an empty list so the loop below does not raise TypeError.
semantic_answers = results.get_answers() or []
for answer in semantic_answers:
    # Prefer the highlighted form of the answer when one is provided.
    print(f"Semantic Answer: {answer.highlights or answer.text}")
    print(f"Semantic Answer Score: {answer.score}\n")

for result in results:
    print(f"Score: {result['@search.score']}")
    print(f"Content: {result['content']}")
    print(f"Category: {result['category']}")

    captions = result["@search.captions"]
    if captions:
        caption = captions[0]
        print(f"Caption: {caption.highlights or caption.text}\n")
    else:
        # Keep results visually separated even when there is no caption.
        print()