In [1]:
import openai
from azure.storage.blob import BlobServiceClient
import pickle
import faiss
import numpy as np
import os

In [3]:
#Step 1: Generate Embeddings and Save to Blob Storage

import openai
from azure.storage.blob import BlobServiceClient
import pickle

# Define your Azure Blob Storage connection details
account_name = "XXX"
account_key = "XXX"
AZURE_STORAGE_CONNECTION_STRING = f"DefaultEndpointsProtocol=https;AccountName={account_name};AccountKey={account_key};EndpointSuffix=core.windows.net"
CONTAINER_NAME = 'test-asif-container'

# Initialize Azure Blob Storage client
blob_service_client = BlobServiceClient.from_connection_string(AZURE_STORAGE_CONNECTION_STRING)
container_client = blob_service_client.get_container_client(CONTAINER_NAME)

# Example data with metadata
texts = [
    "Artificial intelligence is transforming various industries.",
    "Climate change is one of the most pressing global challenges.",
    "Recent advancements in space technology are remarkable.",
    "The impact of quantum computing on cryptography is profound.",
    "Sustainable energy solutions are crucial for combating climate change."
]

metadata = [
    {
        "title": "Artificial Intelligence and Its Impact",
        "author": "Dr. Jane Smith",
        "date": "2024-01-15",
        "category": "Technology",
        "summary": "An overview of how artificial intelligence is transforming industries and daily life."
    },
    {
        "title": "The Urgency of Climate Change",
        "author": "John Doe",
        "date": "2024-02-22",
        "category": "Environment",
        "summary": "A detailed look at the effects of climate change on our planet and potential solutions."
    },
    {
        "title": "Recent Advances in Space Exploration",
        "author": "Dr. Emily Johnson",
        "date": "2024-03-10",
        "category": "Science",
        "summary": "A summary of recent achievements in space missions and technological advancements."
    },
    {
        "title": "Quantum Computing and Cryptography",
        "author": "Dr. Alice Brown",
        "date": "2024-04-05",
        "category": "Technology",
        "summary": "An exploration of how quantum computing could revolutionize cryptographic methods and data security."
    },
    {
        "title": "Sustainable Energy Solutions",
        "author": "Michael Green",
        "date": "2024-05-20",
        "category": "Environment",
        "summary": "An examination of various sustainable energy technologies and their potential to reduce carbon emissions."
    }
]

# Azure OpenAI Service client
client = openai.AzureOpenAI(
        azure_endpoint="XXX",
        api_key="XXX",
        api_version="2023-09-01-preview"
    )

model = "demo-wrmg-txt-ada-02"

# Generate embeddings using Azure OpenAI Service
def generate_embeddings(texts):
    try:
        response = client.embeddings.create(input=texts, model=model).data
        embeddings = [item.embedding for item in response]
        
        # Print the number of embeddings and their shapes
        print(f"Number of embeddings: {len(embeddings)}")
        for i, embedding in enumerate(embeddings):
            print(f"Embedding {i} shape: {len(embedding)}")
        
        return embeddings
    
    except Exception as e:
        print(f"Error generating embeddings: {e}")
        raise

# Upload text files and embeddings to Blob Storage
def upload_to_blob(documents, embeddings):
    try:
        # Upload documents and their embeddings
        for i, (text, meta) in enumerate(zip(documents, metadata)):
            doc_name = f"document_{i}.txt"
            emb_name = f"embedding_{i}.pkl"
            
            # Upload text document
            blob_client = container_client.get_blob_client(doc_name)
            blob_client.upload_blob(text, overwrite=True)
            
            # Upload embedding
            blob_client = container_client.get_blob_client(emb_name)
            blob_client.upload_blob(pickle.dumps(embeddings[i]), overwrite=True)
            
            print(f"Uploaded {doc_name} and {emb_name} to Blob Storage.")
        
        # Save metadata as a separate file
        meta_name = "metadata.pkl"
        blob_client = container_client.get_blob_client(meta_name)
        blob_client.upload_blob(pickle.dumps(metadata), overwrite=True)
        print(f"Uploaded {meta_name} to Blob Storage.")
    except Exception as e:
        print(f"Error uploading documents and embeddings: {e}")
        raise

# Generate embeddings and upload everything to Blob Storage
try:
    embeddings = generate_embeddings(texts)
    upload_to_blob(texts, embeddings)
except Exception as e:
    print(f"Error generating embeddings or uploading to Blob Storage: {e}")


Number of embeddings: 5
Embedding 0 shape: 1536
Embedding 1 shape: 1536
Embedding 2 shape: 1536
Embedding 3 shape: 1536
Embedding 4 shape: 1536
Uploaded document_0.txt and embedding_0.pkl to Blob Storage.
Uploaded document_1.txt and embedding_1.pkl to Blob Storage.
Uploaded document_2.txt and embedding_2.pkl to Blob Storage.
Uploaded document_3.txt and embedding_3.pkl to Blob Storage.
Uploaded document_4.txt and embedding_4.pkl to Blob Storage.
Uploaded metadata.pkl to Blob Storage.


In [None]:
# pip install azure-search-documents==11.4.0

# https://stackoverflow.com/questions/77613936/how-to-create-a-vector-search-index-in-azure-ai-search-using-v11-4-0

In [5]:
#Step 2: Create a Vector-Compatible Index in Azure Cognitive Search

from azure.search.documents.indexes import SearchIndexClient
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes.models import (  
    SearchIndex,  
    SearchField,  
    SearchFieldDataType,  
    SimpleField,  
    SearchableField,  
    SearchIndex,
    SearchField,  
    VectorSearch,
    VectorSearchProfile,
    HnswAlgorithmConfiguration
) 

# Azure Cognitive Search details
search_service_name = "XXXX"
admin_api_key = "XXXX"
index_name = "XXXXX" #define new name

# Initialize Search Index client
endpoint = f"https://{search_service_name}.search.windows.net"
credential = AzureKeyCredential(admin_api_key)
index_client = SearchIndexClient(endpoint=endpoint, credential=credential)

# Define the index schema with embeddings
def create_vector_search_index():
    try:
        fields = [
            SimpleField(name="id", type=SearchFieldDataType.String, key=True),
            SearchableField(name="content", type=SearchFieldDataType.String),
            SearchableField(name="title", type=SearchFieldDataType.String),
            SearchableField(name="author", type=SearchFieldDataType.String),
            SimpleField(name="date", type=SearchFieldDataType.String),
            SearchableField(name="category", type=SearchFieldDataType.String),
            SearchableField(name="summary", type=SearchFieldDataType.String),
            #VectorField(name="embeddings", type=SearchFieldDataType.Collection(SearchFieldDataType.Single))
            SearchField(name="content_vector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
            searchable=True, vector_search_dimensions=1536, vector_search_profile_name="my-vector-config")
            ]

        vector_search = VectorSearch(
            profiles=[VectorSearchProfile(name="my-vector-config", algorithm_configuration_name="my-algorithms-config")],
            algorithms=[HnswAlgorithmConfiguration(name="my-algorithms-config")],
    )
        index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search)
        index_client.create_index(index)
        print("Vector search index created successfully.")
    except Exception as e:
        print(f"Error creating vector search index: {e}")
        raise

create_vector_search_index()


Vector search index created successfully.


In [15]:
from azure.search.documents import SearchClient

# Initialize Search client
search_client = SearchClient(endpoint=endpoint, index_name=index_name, credential=AzureKeyCredential(admin_api_key))

# Upload documents with embeddings to Azure Cognitive Search
def index_documents_with_embeddings(index_data):
    try:
        documents = []
        for i, text in enumerate(index_data['texts']):
            doc = {
                "id": str(i),
                "content": text,
                "title": index_data['metadata'][i]['title'],
                "author": index_data['metadata'][i]['author'],
                "date": index_data['metadata'][i]['date'],
                "category": index_data['metadata'][i]['category'],
                "summary": index_data['metadata'][i]['summary'],
                "content_vector": index_data['embeddings'][i]
            }
            documents.append(doc)
        
        search_client.upload_documents(documents)
        print("Documents with embeddings indexed successfully.")
    except Exception as e:
        print(f"Error indexing documents with embeddings: {e}")
        raise

# Load index data and upload documents with embeddings
def load_index_from_blob():
    try:
        index_data = {
            "texts": [],
            "metadata": [],
            "embeddings": []
        }
        
        for blob in container_client.list_blobs():
            if "document_" in blob.name and blob.name.endswith(".txt"):
                blob_client = container_client.get_blob_client(blob.name)
                index_data["texts"].append(blob_client.download_blob().readall().decode("utf-8"))
            elif "embedding_" in blob.name and blob.name.endswith(".pkl"):
                blob_client = container_client.get_blob_client(blob.name)
                index_data["embeddings"].append(pickle.loads(blob_client.download_blob().readall()))
            elif blob.name == "metadata.pkl":
                blob_client = container_client.get_blob_client(blob.name)
                index_data["metadata"] = pickle.loads(blob_client.download_blob().readall())
        
        return index_data
    except Exception as e:
        print(f"Error loading index data from Blob Storage: {e}")
        raise

try:
    index_data = load_index_from_blob()
    index_documents_with_embeddings(index_data)
except Exception as e:
    print(e)


Documents with embeddings indexed successfully.


In [None]:
#https://stackoverflow.com/questions/76419780/azure-cognitive-search-in-python-using-vector-embeddings-error

In [38]:
from azure.search.documents import SearchClient
from azure.core.credentials import AzureKeyCredential

# Initialize Search client
search_client = SearchClient(endpoint=endpoint, index_name=index_name, credential=AzureKeyCredential(admin_api_key))

# Generate embeddings for the query
def generate_query_embedding(query_text):
    try:
        response = client.embeddings.create(input=[query_text], model=model).data
        query_embedding = response[0].embedding
        print(f"Query embedding shape: {len(query_embedding)}")
        return query_embedding
    except Exception as e:
        print(f"Error generating query embedding: {e}")
        raise

# Perform search using the query embedding
def perform_search(query_text, top_k=2):
    try:
        query_embedding = generate_query_embedding(query_text)
        print("query_embedding",query_embedding)
        #vector_query = VectorizedQuery(vector=get_embeddings(query), k_nearest_neighbors=3, fields="descriptionVector")

         # Convert query embedding to the required format
        # vector_query = {
        #     "kind": "vector",  # Specify the kind of query
        #     "value": query_embedding,  # Ensure this is a list of floats
        #     "fields": "content_vector",
        #     "k": top_k
        # }
        from azure.search.documents.models import VectorizedQuery

        vector_query = VectorizedQuery(vector=query_embedding, k_nearest_neighbors=2, fields="content_vector")


        search_results = search_client.search(search_text="", vector_queries=[vector_query], top=top_k)
        print(search_results)
        
        results = []
        for result in search_results:
            print("")

            results.append({
                "text": result.get("content"),
                "metadata": {
                    "title": result.get("title"),
                    "author": result.get("author"),
                    "date": result.get("date"),
                    "category": result.get("category"),
                    "summary": result.get("summary")
                },
                "score": result.get("@search.score")  # Document relevance score
            })
        
        return results
    except Exception as e:  
        print(f"Error performing search: {e}")
        raise

# Example usage
if __name__ == "__main__":
    try:
        query_text = "climate change impact"
        print(f"Performing search for query: '{query_text}'")
        results = perform_search(query_text)
        print("Search results:")
        for result in results:
            print(f"Text: {result['text']}")
            print(f"Title: {result['metadata']['title']}")
            print(f"Author: {result['metadata']['author']}")
            print(f"Date: {result['metadata']['date']}")
            print(f"Category: {result['metadata']['category']}")
            print(f"Summary: {result['metadata']['summary']}")
            print(f"Score: {result['score']}\n")
    except Exception as e:
        print(f"Error in main execution: {e}")


Performing search for query: 'climate change impact'
Query embedding shape: 1536
query_embedding [0.007770773023366928, -0.02070867456495762, 0.013707617297768593, -0.035206086933612823, -0.0012323803966864944, 0.016398271545767784, -0.03860621899366379, 0.0054214694537222385, -0.011900460347533226, -0.0008023440605029464, 0.004136380273848772, 0.024162352085113525, -0.01615731790661812, 0.02227487787604332, -0.011846914887428284, 0.01145871076732874, 0.032876864075660706, -0.03713371977210045, 0.018607018515467644, -0.020440949127078056, -0.017094362527132034, 0.03137759119272232, 0.007804238703101873, 0.002253925893455744, -0.000978876487351954, 0.0002495298394933343, 0.007911330088973045, -0.03970389813184738, -0.012375676073133945, -0.006087440066039562, 0.02872709557414055, 0.007262092083692551, -0.018861358985304832, -0.014604502357542515, -0.014992705546319485, -0.019490517675876617, 0.004527931101620197, -0.003969051409512758, 0.001275049289688468, -0.023908011615276337, 0.0073