In [1]:
import os
from dotenv import load_dotenv
# Load settings from a local .env file before any os.environ lookups below.
# override=True lets values from the .env file replace variables that are
# already set in the process environment.
load_dotenv(override=True)

True

In [2]:
# Azure AI Search connection settings. These are mandatory: a missing
# variable raises KeyError right here rather than failing later.
search_service_endpoint = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"]
search_service_key = os.environ["AZURE_SEARCH_API_KEY"]

# Azure OpenAI settings. Only the endpoint is mandatory; the rest fall back
# to the defaults shown.
azure_openai_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
azure_openai_key = os.environ.get("AZURE_OPENAI_KEY", "")
azure_openai_embedding_deployment = os.environ.get(
    "AZURE_OPENAI_EMBEDDING_DEPLOYMENT", "text-embedding-3-large"
)
azure_openai_embedding_dimensions = int(
    os.environ.get("AZURE_OPENAI_EMBEDDING_DIMENSIONS", "1024")
)
# The model name sent with each embeddings request is the deployment name.
embedding_model_name = azure_openai_embedding_deployment
azure_openai_api_version = os.environ.get("AZURE_OPENAI_API_VERSION", "2024-06-01")

# Cohere key (empty string when unset) and the prefix shared by every index name.
cohere_api_key = os.environ.get("COHERE_API_KEY", "")
base_index_name = "scorp-opt"

# EDM collection types used for the Cohere int8 and ubinary vector fields.
cohere_int8 = "Collection(Edm.SByte)"
cohere_ubinary = "Collection(Edm.Byte)"

In [3]:
import cohere

# Shared Cohere client used by get_embeddings_cohere. The key comes from
# COHERE_API_KEY and is an empty string when that variable is unset.
cohere_client = cohere.Client(cohere_api_key)

In [4]:
def get_embeddings_cohere(texts, input_type="search_document", embedding_type="ubinary", model="embed-english-v3.0"):
    """Embed text with Cohere and return a single embedding vector.

    Args:
        texts: A string, or a list of strings. A bare string is wrapped in a
            one-element list before being sent.
        input_type: Forwarded to the embed call as ``input_type``.
        embedding_type: Name of the embedding representation to request
            (for example ``"ubinary"`` or ``"int8"``). It is sent as the only
            entry of ``embedding_types`` and is also the attribute read from
            ``response.embeddings``.
        model: Cohere embedding model id. Defaults to the model this function
            previously hard-coded, so existing callers are unaffected.

    Returns:
        The embedding of the FIRST text only. When a list with several texts
        is passed, the remaining embeddings are discarded.
    """
    if isinstance(texts, str):
        texts = [texts]

    response = cohere_client.embed(
        texts=texts,
        model=model,
        input_type=input_type,
        embedding_types=[embedding_type],
    )

    # The response groups vectors by embedding type; pick the requested group
    # and return its first vector.
    vectors = getattr(response.embeddings, embedding_type)
    return vectors[0]

In [6]:
from openai import AzureOpenAI

# Azure OpenAI client used by get_embeddings, bound to the embedding
# deployment, endpoint, key and API version read from the environment.
openai_client = AzureOpenAI(
    azure_endpoint=azure_openai_endpoint,
    azure_deployment=azure_openai_embedding_deployment,
    api_key=azure_openai_key,
    api_version=azure_openai_api_version,
)

def get_embeddings(text):
    """Embed ``text`` with the Azure OpenAI client and return the vector.

    The request uses the module-level ``embedding_model_name`` and
    ``azure_openai_embedding_dimensions``. Only the first item of the
    response's ``data`` list is returned.
    """
    result = openai_client.embeddings.create(
        input=text,
        model=embedding_model_name,
        dimensions=azure_openai_embedding_dimensions,
    )
    first_item = result.data[0]
    return first_item.embedding

In [7]:
from pypdf import PdfReader

# Source PDF whose pages are embedded and indexed in the cells below.
# The path is relative, so it depends on the notebook's working directory.
file_path = '../files/QML-DS.pdf'

# Reader over the PDF; later cells iterate pdf_content.pages.
pdf_content = PdfReader(file_path)

  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4


In [18]:
# Build one search document per PDF page, with the page text embedded by
# Azure OpenAI. These documents feed every non-Cohere index.
documents = []

for i, page in enumerate(pdf_content.pages):
    text = page.extract_text()

    # A page with no extractable text (blank or image-only) cannot be
    # embedded: the embeddings endpoint rejects empty input, which would
    # abort the whole run. Skip it. Ids remain the 1-based page number, so
    # skipping a page leaves a gap rather than renumbering.
    if not text:
        continue

    documents.append({
        "id": str(i + 1),
        # NOTE(review): title reads "QLM-DS" while the file is QML-DS.pdf —
        # confirm whether this is a typo before changing indexed data.
        "title": "QLM-DS",
        "category": "QML",
        "content": text,
        "contentVector": get_embeddings(text)
    })

In [37]:
def get_cohere_embedded_documents(embedding_type):
    """Build one search document per PDF page using Cohere embeddings.

    Args:
        embedding_type: Passed to ``get_embeddings_cohere`` as
            ``embedding_type`` (the notebook uses ``"ubinary"`` and ``"int8"``).

    Returns:
        A list of dicts with ``id``, ``title``, ``category``, ``content`` and
        ``contentVector`` keys. ``id`` is the 1-based page number as a string.
        Pages with no extractable text are skipped, so ids may have gaps.
    """
    cohere_documents = []

    for i, page in enumerate(pdf_content.pages):
        text = page.extract_text()

        # An empty page has nothing to embed or search; skip it instead of
        # sending an empty string to the embedding service.
        if not text:
            continue

        cohere_documents.append({
            "id": str(i + 1),
            # NOTE(review): title reads "QLM-DS" while the file is QML-DS.pdf —
            # confirm whether this is a typo before changing indexed data.
            "title": "QLM-DS",
            "category": "QML",
            "content": text,
            "contentVector": get_embeddings_cohere(text, embedding_type=embedding_type)
        })

    return cohere_documents

In [40]:
from typing import List
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SimpleField,
    SearchFieldDataType,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchProfile,
    SemanticConfiguration,
    SemanticPrioritizedFields,
    SemanticField,
    SemanticSearch,
    SearchIndex,
    SearchField,
    ScalarQuantizationCompression,
    BinaryQuantizationCompression,
    VectorSearchCompression,
    VectorSearchAlgorithmKind,
    HnswParameters,
    VectorSearchAlgorithmMetric,
    VectorEncodingFormat
)

def create_index(index_name, dimensions, cohere_vector_type, use_scalar_compression=False, use_binary_compression=False, use_float16=False, use_stored=True, use_cohere=False, truncate_dimensions=None):
    """Build the SearchIndex definition for one storage-optimization variant.

    Nothing is sent to the search service here; the caller submits the
    returned definition.

    Args:
        index_name: Name of the index.
        dimensions: Value for the vector field's ``vector_search_dimensions``.
        cohere_vector_type: Explicit EDM collection type for ``contentVector``
            (for example ``"Collection(Edm.Byte)"``), or ``None`` to use a
            float type chosen by ``use_float16``. ``"Collection(Edm.Byte)"``
            also switches the field to packed-bit encoding and the HNSW
            metric to Hamming; every other type uses cosine.
        use_scalar_compression: Attach a scalar quantization compression.
            Takes precedence over ``use_binary_compression`` if both are set.
        use_binary_compression: Attach a binary quantization compression that
            reranks with the original vectors.
        use_float16: Use ``Collection(Edm.Half)`` instead of
            ``Collection(Edm.Single)`` when no explicit type is given.
        use_stored: Passed through as the vector field's ``stored`` flag.
        use_cohere: Accepted for backward compatibility only. The field type
            is decided by ``cohere_vector_type`` alone.
        truncate_dimensions: Optional ``truncation_dimension`` for the
            compression configuration; unused when no compression is enabled.

    Returns:
        A ``SearchIndex`` with id, title, category, content and contentVector
        fields, one HNSW vector search profile and one semantic configuration.
    """
    # Always resolve the float fallback. It used to be computed only when
    # use_cohere was False, so use_cohere=True combined with
    # cohere_vector_type=None failed with UnboundLocalError.
    default_vector_type = "Collection(Edm.Half)" if use_float16 else "Collection(Edm.Single)"
    vector_type = default_vector_type if cohere_vector_type is None else cohere_vector_type

    # Byte vectors are bit-packed binary embeddings: they need the packed-bit
    # encoding on the field and the Hamming metric on the algorithm.
    is_packed_binary = cohere_vector_type == "Collection(Edm.Byte)"

    fields = [
        SimpleField(
            name="id",
            type=SearchFieldDataType.String,
            key=True,
            sortable=True,
            filterable=True,
            facetable=True
        ),
        SearchField(
            name="title",
            type=SearchFieldDataType.String
        ),
        SearchField(
            name="category",
            type=SearchFieldDataType.String,
            filterable=True
        ),
        SearchField(
            name="content",
            type=SearchFieldDataType.String
        ),
        SearchField(
            name="contentVector",
            type=vector_type,
            searchable=True,
            stored=use_stored,
            vector_search_dimensions=dimensions,
            vector_search_profile_name="myHnswProfile",
            vector_encoding_format=(
                VectorEncodingFormat.PACKED_BIT if is_packed_binary else None
            )
        )
    ]

    # At most one compression is configured; scalar wins when both flags are set.
    compression_name = None
    compression_configurations: List[VectorSearchCompression] = []
    if use_scalar_compression:
        compression_name = "myCompression"
        compression_configurations = [
            ScalarQuantizationCompression(
                compression_name=compression_name,
                truncation_dimension=truncate_dimensions
            )
        ]
    elif use_binary_compression:
        compression_name = "myCompression"
        compression_configurations = [
            BinaryQuantizationCompression(
                compression_name=compression_name,
                rerank_with_original_vectors=True,
                truncation_dimension=truncate_dimensions
            )
        ]

    vector_search = VectorSearch(
        algorithms=[
            HnswAlgorithmConfiguration(
                name="myHnsw",
                kind=VectorSearchAlgorithmKind.HNSW,
                parameters=HnswParameters(
                    metric=(
                        VectorSearchAlgorithmMetric.HAMMING
                        if is_packed_binary
                        else VectorSearchAlgorithmMetric.COSINE
                    )
                ),
            )
        ],
        profiles=[
            VectorSearchProfile(
                name="myHnswProfile",
                algorithm_configuration_name="myHnsw",
                compression_name=compression_name
            )
        ],
        compressions=compression_configurations
    )

    semantic_config = SemanticConfiguration(
        name="my-semantic-config",
        prioritized_fields=SemanticPrioritizedFields(
            title_field=SemanticField(field_name="title"),
            content_fields=[SemanticField(field_name="content")]
        )
    )
    semantic_search = SemanticSearch(configurations=[semantic_config])

    return SearchIndex(name=index_name, fields=fields, vector_search=vector_search, semantic_search=semantic_search)

In [56]:
from azure.core.credentials import AzureKeyCredential

# One index per storage-optimization variant; all names share the base prefix.
baseline_index = f"{base_index_name}-baseline"
scalar_compression_index = f"{base_index_name}-scalar-compression"
binary_compression_index = f"{base_index_name}-binary-compression"
narrow_index = f"{base_index_name}-narrow"
no_stored_index = f"{base_index_name}-no-stored"
cohere_index_ubinary = f"{base_index_name}-cohere-ubinary"
cohere_index_int8 = f"{base_index_name}-cohere-int8"
all_index_scalar = f"{base_index_name}-all-options-with-scalar"
all_index_binary = f"{base_index_name}-all-options-with-binary"

search_index_client = SearchIndexClient(search_service_endpoint, AzureKeyCredential(search_service_key))

# (index name, dimensions, explicit vector type, create_index options),
# listed in the order the indexes are created or updated.
index_variants = [
    (scalar_compression_index, azure_openai_embedding_dimensions, None,
     {"use_scalar_compression": True}),
    (binary_compression_index, azure_openai_embedding_dimensions, None,
     {"use_binary_compression": True}),
    (narrow_index, azure_openai_embedding_dimensions, None,
     {"use_float16": True}),
    (no_stored_index, azure_openai_embedding_dimensions, None,
     {"use_stored": False}),
    (all_index_scalar, azure_openai_embedding_dimensions, None,
     {"use_scalar_compression": True, "use_float16": True, "use_stored": False}),
    (all_index_binary, azure_openai_embedding_dimensions, None,
     {"use_binary_compression": True, "use_float16": True, "use_stored": False}),
    (baseline_index, azure_openai_embedding_dimensions, None,
     {}),
    (cohere_index_ubinary, 1024, cohere_ubinary,
     {"use_cohere": True, "use_stored": True}),
    (cohere_index_int8, 1024, cohere_int8,
     {"use_cohere": True, "use_stored": True}),
]

for variant_name, variant_dimensions, variant_vector_type, variant_options in index_variants:
    search_index_client.create_or_update_index(
        create_index(variant_name, variant_dimensions, variant_vector_type, **variant_options)
    )

print("Created indexes")

Created indexes


In [51]:
from azure.search.documents import SearchIndexingBufferedSender

def upload_embeddings(index_name, documents):
    """Upload ``documents`` to the index ``index_name``.

    A buffered sender is opened against the configured search endpoint for
    the duration of the upload and closed again when the ``with`` block exits.
    """
    credential = AzureKeyCredential(search_service_key)
    sender = SearchIndexingBufferedSender(search_service_endpoint, index_name, credential)
    with sender as active_sender:
        active_sender.upload_documents(documents=documents)

In [57]:
# Upload ada embedded documents
upload_embeddings(scalar_compression_index, documents)
upload_embeddings(binary_compression_index, documents)
upload_embeddings(narrow_index, documents)
upload_embeddings(no_stored_index, documents)
upload_embeddings(all_index_scalar, documents)
upload_embeddings(all_index_binary, documents)
upload_embeddings(baseline_index, documents)

# Upload cohere embedded documents
upload_embeddings(cohere_index_ubinary, get_cohere_embedded_documents("ubinary"))
upload_embeddings(cohere_index_int8, get_cohere_embedded_documents("int8"))

In [58]:
def bytes_to_mb(bytes):
    """Convert a byte count to megabytes (1 MB = 1024 * 1024 bytes), rounded to 4 decimals."""
    bytes_per_mb = 1024 * 1024
    megabytes = bytes / bytes_per_mb
    return round(megabytes, 4)

def find_storage_size_mb(index_name):
    """Return ``(storage_size_mb, vector_index_size_mb)`` for ``index_name``.

    Both values are read from the index statistics reported by the search
    service and converted from bytes with ``bytes_to_mb``.
    """
    stats = search_index_client.get_index_statistics(index_name)
    storage_mb = bytes_to_mb(stats["storage_size"])
    vector_mb = bytes_to_mb(stats["vector_index_size"])
    return storage_mb, vector_mb

# Indexes to measure, in the order their statistics are fetched.
measured_index_names = [
    scalar_compression_index,
    binary_compression_index,
    baseline_index,
    no_stored_index,
    narrow_index,
    cohere_index_ubinary,
    cohere_index_int8,
    all_index_scalar,
    all_index_binary,
]

# Each entry is ((storage_mb, vector_mb), index_name).
index_sizes = [(find_storage_size_mb(name), name) for name in measured_index_names]

# Largest total storage first.
index_sizes.sort(key=lambda entry: entry[0][0], reverse=True)

separator = "*" * 40
for (storage_size, vector_size), index_name in index_sizes:
    print(separator)
    print(f"Index Name: {index_name}\nStorage Size: {storage_size}MB\nVector Size: {vector_size}MB")

****************************************
Index Name: scorp-opt-scalar-compression
Storage Size: 1.6637MB
Vector Size: 0.2732MB
****************************************
Index Name: scorp-opt-baseline
Storage Size: 1.6628MB
Vector Size: 0.2732MB
****************************************
Index Name: scorp-opt-binary-compression
Storage Size: 1.4031MB
Vector Size: 0.0121MB
****************************************
Index Name: scorp-opt-narrow
Storage Size: 1.3933MB
Vector Size: 0.1384MB
****************************************
Index Name: scorp-opt-no-stored
Storage Size: 1.096MB
Vector Size: 0.2732MB
****************************************
Index Name: scorp-opt-all-options-with-scalar
Storage Size: 0.8275MB
Vector Size: 0.1384MB
****************************************
Index Name: scorp-opt-cohere-int8
Storage Size: 0.758MB
Vector Size: 0.0486MB
****************************************
Index Name: scorp-opt-all-options-with-binary
Storage Size: 0.7016MB
Vector Size: 0.0121MB
**************