# AI Search performance benchmarking tool

In [None]:
import argparse
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
import os
from collections import Counter
import requests
import uuid
from datetime import datetime
import time
import openai
from azure.identity import DefaultAzureCredential
import dotenv
import logging
import json
from openai import AzureOpenAI
import pandas as pd
dotenv.load_dotenv()

In [None]:
EMBEDDING_DIMS=1536
logging.basicConfig(level=logging.INFO)
AZURE_OPENAI_SERVICE = os.environ.get("AZURE_OPENAI_SERVICE")
AZURE_OPENAI_SERVICE = f"https://{AZURE_OPENAI_SERVICE}.openai.azure.com"
AZURE_OPENAI_API_KEY = os.environ.get("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_DEPLOYMENT_NAME = (
    os.environ.get("AZURE_OPENAI_DEPLOYMENT_NAME") or "embedding"
)
AZURE_SEARCH_SERVICE_ENDPOINT = os.environ.get("AZURE_SEARCH_SERVICE_ENDPOINT_S2")
AZURE_SEARCH_SERVICE_API_KEY = os.environ.get("AZURE_SEARCH_SERVICE_API_KEY_S2")


In [None]:
search_endpoint = AZURE_SEARCH_SERVICE_ENDPOINT
search_index_name = "index-product-poc-ada02-suggester"
search_api_key = AZURE_SEARCH_SERVICE_API_KEY
search_api_version = "2023-11-01" #"2023-10-01-Preview"
search_headers = {"Content-Type": "application/json", "api-key": search_api_key}

In [None]:
def query_ai_search(query, search_type, search_index_name, text_fields, top=10, semantic_config_name="", vector=None, select_fields=None, vector_fields=None, application_name = None):
        """
        Query the search index
        """
        search_payload = {
            "top": top,
        }
        if search_type in ["text","hybrid"]:
            search_payload["search"] = query
            search_payload["searchFields"] = ",".join(text_fields)
            search_payload["queryType"] = "simple"
            if len(semantic_config_name) > 0:
                search_payload["queryType"] = "semantic"
                search_payload["semanticConfiguration"] = semantic_config_name                
        if application_name:
            search_payload["filter"] = f"application_name eq '{application_name}'"
        if select_fields:
            search_payload["select"] = ",".join(select_fields)
        if vector_fields and search_type in ["vector","hybrid"]:
            search_payload["vectorQueries"] = [
                {
                    "fields": ",".join(vector_fields),
                    "vector": vector,
                    "k": top,
                    "kind": "vector"
                }
            ]
            search_payload["vectorFilterMode"] = "preFilter"
        #logging.info(search_payload)
        response = requests.post(
            f"{search_endpoint}/indexes('{search_index_name}')/docs/search?api-version={search_api_version}",
            headers=search_headers,
            json=search_payload,
        )
        if response.status_code == 200:
            return response.json()
        else:
            logging.error(f"ERROR: {response.status_code}")
            return None

In [None]:
client = AzureOpenAI(
  api_key = AZURE_OPENAI_API_KEY,  
  api_version = "2024-02-01",
  azure_endpoint = AZURE_OPENAI_SERVICE
)

def generate_embeddings(text, model="embedding"): # model = "deployment_name"
    return client.embeddings.create(input = [text], model=model).data[0].embedding
# model should be set to the deployment name you chose when you deployed the text-embedding-ada-002 (Version 2) model
generate_embeddings ("test", model = 'embedding') 


In [None]:
QUERY = "packet roaming ke bangkok"
EMBEDDING = generate_embeddings(QUERY, model="embedding")
TOP_K=10

In [None]:
%%time
query_ai_search(query=QUERY, 
                search_type="text", 
                search_index_name="index-product-poc-ada02-suggester", 
                text_fields=["product_description"], 
                top=TOP_K, 
                vector=EMBEDDING, 
                select_fields=["product_id","product_name","product_description"], 
                vector_fields=["product_description_vector"], 
                application_name = None
                )

In [None]:
%%time
response = query_ai_search(query=QUERY, 
                search_type="hybrid", 
                search_index_name="index-product-poc-ada02-suggester", 
                text_fields=["product_description"], 
                top=TOP_K, 
                vector=EMBEDDING, 
                select_fields=["product_id","product_name","product_description"], 
                vector_fields=["product_description_vector"], 
                application_name = None
                )

In [None]:
%%timeit -n 5 -r 5
response = query_ai_search(query=QUERY, 
                search_type="vector", 
                search_index_name="index-product-poc-ada02-suggester", 
                text_fields=["product_description"], 
                top=TOP_K, 
                vector=EMBEDDING, 
                select_fields=["product_id","product_name","product_description"], 
                vector_fields=["product_description_vector"], 
                application_name = None
                )

# Using Python Multiprocessing tool

In [None]:
import time
import multiprocessing
import requests
import logging

def measure_latency(pool_size=100, search_type="text", search_index_name=search_index_name):
    start_time = time.time()

    with multiprocessing.Pool(processes=pool_size) as pool:
        # Define the parameters for query_ai_search function
        query = QUERY
        search_type = search_type
        search_index_name = search_index_name
        text_fields = ["product_description"]
        top = 10
        semantic_config_name = ""
        vector = EMBEDDING
        select_fields = ["product_id","product_name","product_description"]
        vector_fields = ["product_description_vector"]
        application_name = None

        # Create a list of tuples for the parameters
        params = [(query, search_type, search_index_name, text_fields, top, semantic_config_name, vector, select_fields, vector_fields, application_name) for _ in range(50)]

        results = pool.starmap(query_ai_search, params)

    end_time = time.time()

    latency = end_time - start_time
    return latency

In [None]:
POOL_SIZE=200
search_type="text"
latency = measure_latency(pool_size=POOL_SIZE, search_type=search_type)
print(f"Latency for {POOL_SIZE} concurrent requests with search_type={search_type}: {latency:.2f} seconds")
print(f"Throughput for {POOL_SIZE} concurrent requests with search_type={search_type}: {POOL_SIZE/latency:.2f} requests per second")

In [None]:
POOL_SIZE=200
search_type="vector"
latency = measure_latency(pool_size=POOL_SIZE, search_type=search_type)
print(f"Latency for {POOL_SIZE} concurrent requests with search_type={search_type}: {latency:.2f} seconds")
print(f"Throughput for {POOL_SIZE} concurrent requests with search_type={search_type}: {POOL_SIZE/latency:.2f} requests per second")

In [None]:
POOL_SIZE=200
search_type="hybrid"
latency = measure_latency(pool_size=POOL_SIZE, search_type=search_type)
print(f"Latency for {POOL_SIZE} concurrent requests with search_type={search_type}: {latency:.2f} seconds")
print(f"Throughput for {POOL_SIZE} concurrent requests with search_type={search_type}: {POOL_SIZE/latency:.2f} requests per second")

# Using Async io requests

In [None]:
import time
import asyncio
import aiohttp

async def query_ai_search_async(session, query, search_type, search_index_name, text_fields, top=10, semantic_config_name="", vector=None, select_fields=None, vector_fields=None, application_name = None):
    """
    Query the search index
    """
    search_payload = {
        "top": top,
    }
    if search_type in ["text","hybrid"]:
        search_payload["search"] = query
        search_payload["searchFields"] = ",".join(text_fields)
        search_payload["queryType"] = "simple"
        if len(semantic_config_name) > 0:
            search_payload["queryType"] = "semantic"
            search_payload["semanticConfiguration"] = semantic_config_name                
    if application_name:
        search_payload["filter"] = f"application_name eq '{application_name}'"
    if select_fields:
        search_payload["select"] = ",".join(select_fields)
    if vector_fields and search_type in ["vector","hybrid"]:
        search_payload["vectorQueries"] = [
            {
                "fields": ",".join(vector_fields),
                "vector": vector,
                "k": top,
                "kind": "vector"
            }
        ]
        search_payload["vectorFilterMode"] = "preFilter"
    async with session.post(
        f"{search_endpoint}/indexes('{search_index_name}')/docs/search?api-version={search_api_version}",
        headers=search_headers,
        json=search_payload,
    ) as response:
        if response.status == 200:
            return await response.json()
        else:
            logging.error(f"ERROR: {response.status}")
            return None

async def measure_latency_async(pool_size=100, search_type="text", search_index_name=search_index_name):
    start_time = time.time()

    async with aiohttp.ClientSession() as session:
        # Define the parameters for query_ai_search function
        query = QUERY
        search_type = search_type
        search_index_name = search_index_name
        text_fields = ["product_description"]
        top = 10
        semantic_config_name = ""
        vector = EMBEDDING
        select_fields = ["product_id","product_name","product_description"]
        vector_fields = ["product_description_vector"]
        application_name = None

        # Create a list of tasks for the parameters
        tasks = [query_ai_search_async(session, query, search_type, search_index_name, text_fields, top, semantic_config_name, vector, select_fields, vector_fields, application_name) for _ in range(pool_size)]

        responses = await asyncio.gather(*tasks)

    end_time = time.time()
    total_time = end_time - start_time
    average_latency = total_time / pool_size
    throughput = pool_size / total_time

    return  {"total_time": total_time, "average_latency": average_latency, "throughput": throughput}

In [19]:
# Define your parameters
df_res = pd.DataFrame()
for pool_size in [100,200,300,400,500]:
    for search_type in ["text", "vector", "hybrid"]:
        TOTAL_QUERIES = 5000
        result = []
        print("*"*50)
        print(f"Running with pool_size={pool_size} and search_type={search_type}")

        # Run the function
        while TOTAL_QUERIES > 0:
            stats = await measure_latency_async(pool_size, search_type, search_index_name)
            result.append(stats)
            print(stats)
            TOTAL_QUERIES -= pool_size
        _df_tmp = pd.DataFrame.from_dict(result)
        _df_tmp['search_type'] = search_type
        _df_tmp['concurrency'] = pool_size
        df_res = pd.concat([df_res, _df_tmp])

**************************************************
Running with pool_size=100 and search_type=text
{'total_time': 0.4126734733581543, 'average_latency': 0.004126734733581543, 'throughput': 242.3223358318726}
{'total_time': 0.4307575225830078, 'average_latency': 0.004307575225830078, 'throughput': 232.14916689175126}
{'total_time': 0.3902745246887207, 'average_latency': 0.003902745246887207, 'throughput': 256.22989376454706}
{'total_time': 0.38953089714050293, 'average_latency': 0.0038953089714050294, 'throughput': 256.7190452261614}
{'total_time': 0.39194154739379883, 'average_latency': 0.0039194154739379886, 'throughput': 255.14008572182865}
{'total_time': 1.142974853515625, 'average_latency': 0.01142974853515625, 'throughput': 87.49098870584466}
{'total_time': 0.5302259922027588, 'average_latency': 0.005302259922027588, 'throughput': 188.5988266711752}
{'total_time': 0.4132554531097412, 'average_latency': 0.004132554531097412, 'throughput': 241.98107792045204}
{'total_time': 0.409380

In [21]:
df_res.groupby(['search_type','concurrency'])[['average_latency','throughput']].agg(['mean','std'])

Unnamed: 0_level_0,Unnamed: 1_level_0,average_latency,average_latency,throughput,throughput
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std
search_type,concurrency,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
hybrid,100,0.011735,0.002223,87.835691,14.477368
hybrid,200,0.007854,0.000734,128.386047,11.845754
hybrid,300,0.00693,0.000451,144.844925,8.899013
hybrid,400,0.006464,0.000385,155.216363,9.206685
hybrid,500,0.006076,0.000379,165.140976,10.184316
text,100,0.007068,0.003889,173.549535,65.722342
text,200,0.003931,0.001511,288.609333,100.39655
text,300,0.002497,0.000603,421.647892,96.400895
text,400,0.001834,0.000368,563.577649,101.140582
text,500,0.001501,0.000203,677.955798,97.558207


In [None]:
await measure_latency_async(pool_size=10, search_type="text", search_index_name=search_index_name)