# BEIR Benchmarking for Azure AI Search (Part 1)

## Preparation

### Configuration

In [None]:
import os

In [None]:
# Set to True to delete the existing search index before it is used again
recreate_index: bool = False

# Name of the BEIR dataset to evaluate (used for the download URL and the index name)
dataset_name: str = "scifact"

### Environment variables

In [None]:
# Load environment variables from the .env file.
# The search admin key and the Azure OpenAI settings are read from the
# environment further down in this notebook.
from dotenv import load_dotenv
load_dotenv()

### Download BEIR datasets

In [None]:
from beir import util, LoggingHandler
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval.search.lexical import BM25Search as BM25


In [None]:
# Directory handed to download_and_unzip as the output location
out_dir = "./datasets"
# Archive URL for the selected dataset
url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(dataset_name)
# data_path is whatever download_and_unzip returns; the data loader reads from it
data_path = util.download_and_unzip(url, out_dir)

In [None]:
# Load the "test" split: the corpus, the queries and the qrels
corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")

## Azure AI Search

### Connect to Azure AI Search

In [None]:
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient 
from azure.search.documents import SearchClient

# Azure AI Search service used for the benchmark
service_name = "benchmark-ai-search"
# One index per dataset, named after the dataset with a "-vector" suffix
index_name = dataset_name + "-vector"

# The admin key must be present in the environment (e.g. loaded from .env);
# a missing key raises KeyError here instead of failing on the first request.
admin_key = os.environ["SEARCH_ADMIN_KEY"]
endpoint = "https://{}.search.windows.net/".format(service_name)
# Both clients authenticate with the same key, so share one credential object
credential = AzureKeyCredential(admin_key)

# Service-level client for index management. It is not bound to a single
# index, so no index_name is passed; the index name is given per operation.
admin_client = SearchIndexClient(endpoint=endpoint, credential=credential)

# Index-level client used to upload and query documents in `index_name`
search_client = SearchClient(endpoint=endpoint,
                             index_name=index_name,
                             credential=credential)

### Create Index

In [None]:
from azure.search.documents.models import (
    QueryAnswerType,
    QueryCaptionType,
    QueryCaptionResult,
    QueryAnswerResult,
    SemanticErrorMode,
    SemanticErrorReason,
    SemanticSearchResultsType,
    QueryType,
    VectorizedQuery,
    VectorQuery,
    VectorFilterMode,    
)
from azure.search.documents.indexes.models import (  
    ExhaustiveKnnAlgorithmConfiguration,
    ExhaustiveKnnParameters,
    HnswAlgorithmConfiguration,
    HnswParameters,
    SemanticConfiguration,  
    SemanticField,  
    VectorSearch,  
    VectorSearchAlgorithmKind,  
    VectorSearchAlgorithmMetric,  
    VectorSearchProfile,  
    SemanticSearch,
    SemanticPrioritizedFields,
    CorsOptions
) 

In [None]:
# Exploratory: list the attributes exposed by QueryType (the result is not assigned)
dir(QueryType)

In [None]:
# Delete the existing index only when a rebuild was requested via recreate_index.
if recreate_index:
    try:
        admin_client.delete_index(index_name)
    except Exception as e:
        # Best effort: report the failure and let the notebook continue
        print(e)

#### Configure vector search

In [None]:
# Vector search configuration: two algorithm configurations, both using the
# cosine metric, and one profile per algorithm.

# HNSW algorithm configuration
hnsw_algorithm = HnswAlgorithmConfiguration(
    name="myHnsw",
    kind=VectorSearchAlgorithmKind.HNSW,
    parameters=HnswParameters(
        m=4,
        ef_construction=400,
        ef_search=500,
        metric=VectorSearchAlgorithmMetric.COSINE,
    ),
)

# Exhaustive KNN algorithm configuration
exhaustive_knn_algorithm = ExhaustiveKnnAlgorithmConfiguration(
    name="myExhaustiveKnn",
    kind=VectorSearchAlgorithmKind.EXHAUSTIVE_KNN,
    parameters=ExhaustiveKnnParameters(
        metric=VectorSearchAlgorithmMetric.COSINE,
    ),
)

# Each profile refers to its algorithm configuration by name
hnsw_profile = VectorSearchProfile(
    name="myHnswProfile",
    algorithm_configuration_name="myHnsw",
)
exhaustive_knn_profile = VectorSearchProfile(
    name="myExhaustiveKnnProfile",
    algorithm_configuration_name="myExhaustiveKnn",
)

vector_search = VectorSearch(
    algorithms=[hnsw_algorithm, exhaustive_knn_algorithm],
    profiles=[hnsw_profile, exhaustive_knn_profile],
)


#### Configure semantic search

In [None]:
# Semantic configuration: "title" is the title field and "text" is the only
# content field. No keyword fields are configured.
prioritized_fields = SemanticPrioritizedFields(
    title_field=SemanticField(field_name="title"),
    content_fields=[SemanticField(field_name="text")],
)
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=prioritized_fields,
)

# Container holding the single semantic configuration defined above
semantic_settings = SemanticSearch(configurations=[semantic_config])

#### Other configuration


In [None]:
# CORS options: any origin is allowed, max_age_in_seconds is 60
cors_options = CorsOptions(max_age_in_seconds=60, allowed_origins=["*"])
# No scoring profiles are defined
scoring_profiles: list = []

#### Embedding data using OpenAI model

In [None]:
from openai import AzureOpenAI
from tenacity import retry, wait_random_exponential, stop_after_attempt  

# Azure OpenAI client; the key and endpoint come from environment variables
# (os.getenv yields None when a variable is not set).
openai_client = AzureOpenAI(
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version="2023-05-15",
)

# Default value of the `model` argument for embedding requests
# NOTE(review): with Azure OpenAI this is presumably the deployment name - confirm.
model = "text-embedding-ada-002-v2"

# Embedding helper shared by the document fields (title, text) and the queries
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def generate_embeddings(text, model=model):
    """Return the embedding vector of a single text.

    The text is sent as a one-element input list and the embedding of the
    first (only) result is returned. The call is wrapped in a tenacity retry:
    randomized exponential wait (min=1, max=20) and at most 6 attempts.

    Args:
        text: Text to embed.
        model: Model name passed to the embeddings API; defaults to the
            module-level ``model`` value at definition time.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    response = openai_client.embeddings.create(input=[text], model=model)
    first_item = response.data[0]
    return first_item.embedding

# Build one index document per corpus entry. Title and text are embedded
# separately, so every entry costs two embedding requests.
# The whole corpus is processed: an unconditional `break` at the end of the
# loop body used to stop after the first entry, leaving a single document.
documents = []
for corpus_id, entry in corpus.items():
    documents.append({
        "corpusId": corpus_id,
        "title": entry["title"],
        "text": entry["text"],
        "titleVector": generate_embeddings(entry["title"]),
        "textVector": generate_embeddings(entry["text"]),
    })

#### Upload dataset into Index

In [None]:
# Upload the documents in batches of `batch_size` and report the outcome of each batch.
batch_size = 1000
try:
    print("documents size is", len(documents))
    # A single loop covers every size: an empty list uploads nothing and a
    # list shorter than batch_size is sent as one batch.
    for start in range(0, len(documents), batch_size):
        batch = documents[start:start + batch_size]
        result = search_client.upload_documents(documents=batch)
        # Each document gets its own indexing result, so check all of them
        # rather than only the first one.
        failed_keys = [item.key for item in result if not item.succeeded]
        print("Upload of new document succeeded: {}".format(not failed_keys))
        if failed_keys:
            print("Failed document keys: {}".format(failed_keys))
except Exception as e:
    # Best effort: report the error and let the notebook continue
    print(e)

## Test

### Full text search (simple)

In [None]:
# Keyword query with query_type "simple"; print the score of up to 5 results.
query = "Microstructural development of human newborn cerebral white matter assessed in vivo by diffusion tensor magnetic resonance imaging."

results = search_client.search(
    search_text=query,
    query_type="simple",
    top=5,
    include_total_count=True,
)
for result in results:
    print(result["@search.score"])

### Semantic search

In [None]:
# Same query text with query_type "semantic" and the "my-semantic-config"
# configuration; print the score of up to 50 results.
query = "Microstructural development of human newborn cerebral white matter assessed in vivo by diffusion tensor magnetic resonance imaging."

results = search_client.search(
    search_text=query,
    query_type="semantic",
    semantic_configuration_name='my-semantic-config',
    top=50,
    include_total_count=True,
)
for result in results:
    print(result["@search.score"])

### Vector search

#### HNSW

In [None]:
# Pure vector query: the query text is embedded and matched against both
# vector fields; no search_text is sent.
# NOTE(review): no `exhaustive` flag is set, so the algorithm is presumably the
# one chosen by the fields' vector profile - confirm against the index schema.
query = "Microstructural development of human newborn cerebral white matter assessed in vivo by diffusion tensor magnetic resonance imaging."
vector_query = VectorizedQuery(
    vector=generate_embeddings(query),
    fields="titleVector, textVector",
    k_nearest_neighbors=3,
)

results = search_client.search(
    search_text=None,
    vector_queries=[vector_query],
    top=50,
    include_total_count=True,
)
for result in results:
    print(result["@search.score"])

#### Exhaustive KNN

In [None]:
# Pure vector query with exhaustive=True set on the vector query;
# no search_text is sent.
query = "Microstructural development of human newborn cerebral white matter assessed in vivo by diffusion tensor magnetic resonance imaging."
vector_query = VectorizedQuery(
    vector=generate_embeddings(query),
    fields="titleVector, textVector",
    k_nearest_neighbors=3,
    exhaustive=True,
)

results = search_client.search(
    search_text=None,
    vector_queries=[vector_query],
    top=50,
    include_total_count=True,
)
for result in results:
    print(result["@search.score"])

### Hybrid search

In [None]:
# Hybrid query: the same text is sent as search_text and, embedded, as a
# vector query. query_type is "semantic", so both the search score and the
# reranker score are printed for up to 5 results.
query = "Microstructural development of human newborn cerebral white matter assessed in vivo by diffusion tensor magnetic resonance imaging."
vector_query = VectorizedQuery(
    vector=generate_embeddings(query),
    fields="titleVector, textVector",
    k_nearest_neighbors=3,
)

results = search_client.search(
    search_text=query,
    vector_queries=[vector_query],
    query_type="semantic",
    semantic_configuration_name='my-semantic-config',
    top=5,
    include_total_count=True,
)
for result in results:
    print(result["@search.score"])
    print(result["@search.reranker_score"])