In [1]:
import os
import dotenv

# (Re)load the `dotenv` IPython extension and run its `%dotenv` line magic.
# NOTE(review): presumably this loads a local .env file into os.environ, which
# the configuration cell below reads its settings from — confirm.
%reload_ext dotenv
%dotenv

In [2]:
import os
import json
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.models import (
    VectorizedQuery
)
from azure.search.documents.indexes.models import (
    SearchIndex,
    ScoringProfile,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
    SearchField,
    SemanticConfiguration,
    SemanticField,
    VectorSearchProfile,
    HnswAlgorithmConfiguration,
    VectorSearch,
    HnswParameters,
    SemanticPrioritizedFields,
    SemanticSearch,
)
from azure.search.documents.indexes import SearchIndexClient
import os.path

# ---------------------------------------------------------------------------
# Notebook configuration, read from environment variables.
# `os.environ[...]` entries are mandatory (KeyError when unset);
# `os.environ.get(...)` entries are optional (None when unset).
# ---------------------------------------------------------------------------

# Disabled settings, kept commented out:
# subscription_id = os.environ["subscription_id"]
# resource_group_name = os.environ["resource_group_name"]
# workspace_name = os.environ["workspace_name"]

# the endpoint of your Azure Cognitive Search service
service_endpoint = os.environ["service_endpoint"]
key = os.environ["search_key"]

# Azure OpenAI settings
# aoai_connection_name = os.environ['aoai_connection_name']
aoi_deployment_name = os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"]
aoi_api_key = os.environ["aoi_api_key"]
aoai_endpoint = os.environ["aoai_endpoint"]
embedding_model_name = os.environ["embeddingModelName"]

# Index name and the credential built from the search key read above
search_index_name = "index_chunks_2"
search_index_key = os.environ.get("AZURE_SEARCH_ADMIN_KEY")
credential = AzureKeyCredential(key)
storage_account_connection_string = os.environ.get("storage_account_connection_string")
# Same environment variable as `embedding_model_name`, under its camelCase name.
embeddingModelName = os.environ.get("embeddingModelName")

# Create Index Function


In [7]:
def create_index(search_index_name):
    """Create or update the chunk index, including its vector search setup.

    The index has four fields: ``chunkId`` (key), ``source``, ``chunkContent``
    (full-text searchable) and ``chunkContentVector`` (a 1536-dimensional
    vector field bound to an HNSW vector search profile).

    Args:
        search_index_name: Name of the index to create or update.

    Prints a confirmation with the index name returned by the service.
    """
    index_client = SearchIndexClient(service_endpoint, AzureKeyCredential(key))

    # Names that tie the vector field, the profile and the algorithm together.
    vector_profile_name = "my-vector-config"
    algorithm_config_name = "my-algorithms-config"

    # --- Fields ------------------------------------------------------------
    chunk_id_field = SimpleField(
        name="chunkId",
        type=SearchFieldDataType.String,
        sortable=True,
        filterable=True,
        key=True,
    )
    source_field = SimpleField(
        name="source",
        type=SearchFieldDataType.String,
        sortable=True,
        filterable=True,
    )
    content_field = SearchableField(name="chunkContent", type=SearchFieldDataType.String)
    content_vector_field = SearchField(
        name="chunkContentVector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        # Must equal the length of the embedding vectors stored in this field.
        vector_search_dimensions=1536,
        vector_search_profile_name=vector_profile_name,
    )

    # --- Vector search (HNSW approximate nearest neighbours) ---------------
    # Parameter reference:
    # https://learn.microsoft.com/en-us/python/api/azure-search-documents/azure.search.documents.indexes.models.hnswparameters?view=azure-python-preview#variables
    hnsw_parameters = HnswParameters(
        m=4,
        # Size of the nearest-neighbour candidate list at index time:
        # larger may improve index quality but slows indexing.
        ef_construction=400,
        # Size of the nearest-neighbour candidate list at query time:
        # larger may improve results but slows search.
        ef_search=500,
        # Similarity metric; known values: "cosine", "euclidean", "dotProduct".
        metric="cosine",
    )
    hnsw_algorithm = HnswAlgorithmConfiguration(
        name=algorithm_config_name,
        kind="hnsw",
        parameters=hnsw_parameters,
    )
    vector_profile = VectorSearchProfile(
        name=vector_profile_name,
        algorithm_configuration_name=algorithm_config_name,
    )
    vector_search = VectorSearch(
        profiles=[vector_profile],
        algorithms=[hnsw_algorithm],
    )

    # --- Create / update ---------------------------------------------------
    index_definition = SearchIndex(
        name=search_index_name,
        fields=[chunk_id_field, source_field, content_field, content_vector_field],
        vector_search=vector_search,
    )
    created_index = index_client.create_or_update_index(index_definition)
    print(f"Index: '{created_index.name}' created or updated")

# Create Embeddings Function


In [4]:
import requests

def get_query_embedding(
    query,
    endpoint=aoai_endpoint,
    api_key=aoi_api_key,
    api_version="2023-07-01-preview",
    embedding_model_deployment=embedding_model_name,
    timeout=None,
):
    """Request embedding vectors for ``query`` from an Azure OpenAI deployment.

    Args:
        query: Value sent as the ``input`` field of the request payload.
        endpoint: Base URL of the Azure OpenAI resource; a trailing ``/`` is
            ignored.
        api_key: Key sent in the ``api-key`` request header.
        api_version: Value of the ``api-version`` query parameter.
        embedding_model_deployment: Deployment name placed in the URL path.
        timeout: Forwarded to ``requests.post``. The default ``None`` keeps
            the previous behaviour of waiting indefinitely.

    Returns:
        A list with the ``embedding`` of every item in the response's
        ``data`` array.

    Raises:
        RuntimeError: If the service answers with a status other than 200.
    """
    # Avoid "//openai/..." when the configured endpoint ends with a slash.
    base_url = endpoint.rstrip("/")
    request_url = (
        f"{base_url}/openai/deployments/{embedding_model_deployment}"
        f"/embeddings?api-version={api_version}"
    )
    headers = {"Content-Type": "application/json", "api-key": api_key}
    request_payload = {"input": query}

    embedding_response = requests.post(
        request_url, json=request_payload, headers=headers, timeout=timeout
    )

    if embedding_response.status_code != 200:
        # Use the raw text: error bodies are not guaranteed to be JSON, and
        # calling .json() here would raise and hide the real HTTP failure.
        raise RuntimeError(
            "failed to get embedding: "
            f"HTTP {embedding_response.status_code}: {embedding_response.text}"
        )

    data_values = embedding_response.json()["data"]
    return [data_value["embedding"] for data_value in data_values]

In [5]:
def generate_embeddings_for_chunks_and_save_to_file(path_to_chunks_file, path_to_output):
    """Embed every chunk of a JSON file and write the enriched chunks to disk.

    Reads a JSON array of chunk objects from ``path_to_chunks_file``, stores
    the embedding of each chunk's ``chunkContent`` under ``chunkContentVector``
    and dumps the resulting array to ``path_to_output``.

    If ``path_to_output`` already exists the function returns immediately, so
    embeddings are not requested again.

    Args:
        path_to_chunks_file: Path of the input JSON file with the chunks.
        path_to_output: Path of the JSON file to write the embedded chunks to.
    """
    # The cache check has to look at the OUTPUT file. Checking the input file
    # (as before) made the function return early whenever there was work to do.
    if os.path.exists(path_to_output):
        print(f"Embeddings were already created for chunked data at: {path_to_chunks_file} ")
        return

    with open(path_to_chunks_file, "r", encoding="utf-8") as file:
        input_data = json.load(file)

    for chunk in input_data:
        # get_query_embedding returns a list of vectors; one input -> first item.
        chunk["chunkContentVector"] = get_query_embedding(chunk["chunkContent"])[0]

    print(f"Created {len(input_data)} chunks")
    if input_data:
        # Show the second chunk when there is one, otherwise fall back to the
        # first instead of raising IndexError on a single-chunk file.
        example_chunk = input_data[1] if len(input_data) > 1 else input_data[0]
        print(f"Example of one chunk: {example_chunk}")

    with open(path_to_output, "w", encoding="utf-8") as f:
        json.dump(input_data, f)

# Upload data to the Index


In [9]:
import json

# Upload documents to the index
def upload_data(file_path, search_index_name):
    """Upload the documents stored in a JSON file to a search index.

    Args:
        file_path: Path to a JSON file containing a list of documents, e.g.
            f"./output/chunks-solution-ops-embedded-{totalNumberOfDocuments}.json".
        search_index_name: Name of the index the documents are uploaded to.

    Failures are reported on stdout instead of being raised, so a failed
    upload does not abort the notebook run. Nothing is returned.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            documents = json.load(file)

        if not documents:
            # Nothing to send; skip the service call entirely.
            print(f"No documents found in {file_path}; nothing to upload")
            return

        search_client = SearchClient(
            endpoint=service_endpoint, index_name=search_index_name, credential=credential
        )
        # upload_documents returns one result per document; a document can be
        # rejected without the call raising, so count real successes.
        results = search_client.upload_documents(documents)
        failed_results = [result for result in results if not result.succeeded]
        print(f"Uploaded {len(results) - len(failed_results)} documents")
        for result in failed_results:
            print(f"Failed to upload document '{result.key}': {result.error_message}")
    except Exception as e:
        print(f"Error uploading documents: {e}")