<a href="https://colab.research.google.com/github/birajchoudhury/birajchoudhury.github.io/blob/master/hesemanticsearch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Encrypted Semantic Search using Homomorphic Encryption

This is a tutorial for implementing encrypted semantic search using Homomorphic Encryption.

# Install requirements



In [None]:
pip install tenseal sentence_transformers

In [None]:
import numpy as np
import tenseal as ts # pip install tenseal
from sentence_transformers import SentenceTransformer

# Setup Embedding Model

In [None]:
# Model identifier handed to SentenceTransformer; named so it is easy to swap.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

NameError: name 'SentenceTransformer' is not defined

In [None]:
# Toy corpus for the demo: these are the texts that get embedded and then
# ranked against the query further down.
sentences = [
    "The quick brown fox jumps over the lazy dog.",
    "I watched the sunset over the ocean.",
    "Artificial intelligence is transforming industries.",
    "The library was quiet and smelled like old books.",
    "He dreamed of traveling to distant galaxies.",
    "Innovation drives progress in the tech world.",
    "The chef prepared a delicious meal for the guests.",
    "Climate change poses a significant threat to global biodiversity.",
    "The athlete trained rigorously for the upcoming marathon.",
    "Music has the power to evoke deep emotional responses."
]

# Get and Store Sentence Embeddings

In [None]:
# Embed the whole corpus in one call. Later cells read `embeddings[i]`
# alongside `sentences[i]`, so the result must stay in corpus order.
embeddings = model.encode(sentences)
# Printed only for inspection; nothing downstream depends on this output.
print(embeddings)

# Get Query Embedding

In [None]:
# The search term to look up in the corpus.
query_text = "fox"
# The query is wrapped in a one-element list, so the result is indexed with
# [0] when it is encrypted later.
query_embedding = model.encode([query_text])
# Printed only for inspection.
print(query_embedding)

[[ 1.10053448e-02 -2.71984432e-02 -4.23476994e-02  2.68690772e-02
   1.24752568e-02 -8.15262552e-03  4.80136722e-02 -2.42797434e-02
  -2.33290507e-03 -5.91702238e-02 -2.49941424e-02  1.63972601e-02
   2.21027294e-03  3.52101140e-02 -6.22175336e-02 -2.45897863e-02
  -4.41655591e-02 -9.53475311e-02  7.82251637e-03 -5.37793562e-02
  -1.33924574e-01 -2.80652903e-02  3.92268151e-02  2.02562846e-02
   1.93843842e-02 -1.97538938e-02 -1.83801502e-02  1.11274617e-02
   4.36563939e-02 -1.61914945e-01  3.24179232e-02  7.76520669e-02
   1.56489275e-02  1.42848510e-02 -1.09123990e-01 -3.61411311e-02
   4.34143357e-02 -3.68130766e-02 -2.02566609e-02  6.56031817e-02
  -1.88652258e-02 -4.23035473e-02 -1.20622693e-02  4.05694209e-02
  -6.23182766e-02 -3.29615772e-02 -4.26615179e-02 -1.31064188e-02
   6.30795732e-02  5.93835749e-02 -3.97166461e-02 -1.18213529e-02
  -5.47252446e-02  6.31510541e-02  6.42526001e-02  8.26242659e-03
  -4.80821263e-03 -1.55517664e-02 -1.95879806e-02  2.20050365e-02
   3.07974

# Setup CKKS Homomorphic Encryption

In [None]:
# CKKS parameters, named so they can be tuned in one place.
POLY_MODULUS_DEGREE = 8192
COEFF_MOD_BIT_SIZES = [60, 40, 40, 60]
GLOBAL_SCALE = 2**40

context = ts.context(
    ts.SCHEME_TYPE.CKKS,
    poly_modulus_degree=POLY_MODULUS_DEGREE,
    coeff_mod_bit_sizes=COEFF_MOD_BIT_SIZES,
)

# NOTE(review): Galois keys are presumably what the encrypted .dot() used
# later needs for its rotations -- confirm against the TenSEAL docs.
context.generate_galois_keys()
# Default scale applied when plain values are encoded into CKKS vectors.
context.global_scale = GLOBAL_SCALE

In [None]:
# Serialize the context together with its secret key. This has to run while
# `context` still holds the secret key.
secret_context = context.serialize(save_secret_key=True)

In [None]:
# Build the shareable public context from a separate copy restored from the
# serialized secret context. `context` itself is left untouched, so it keeps
# its secret key and this cell no longer mutates state other cells rely on.
public_ctx = ts.context_from(secret_context)
public_ctx.make_context_public()  # drop the secret key from the copy only
public_context = public_ctx.serialize()

In [None]:
# Load a context from the serialized bytes that include the secret key, so
# that `context` can decrypt the results computed below.
context = ts.context_from(secret_context)

# Setup Vector Database

In [None]:
class VectorStore:
    """
    Minimal in-memory vector store with brute-force cosine-similarity search.

    Vectors are kept in a dict keyed by ``vector_id``. A pairwise similarity
    index is maintained on insertion, and queries scan every stored vector.
    """

    def __init__(self):
        self.vector_data = {}  # vector_id -> stored vector
        self.vector_index = {}  # vector_id -> {other_vector_id: cosine similarity}

    @staticmethod
    def _cosine_similarity(vector_a, vector_b):
        """
        Cosine similarity of two vectors, defined as 0.0 if either has zero norm.

        Args:
            vector_a (numpy.ndarray): First vector.
            vector_b (numpy.ndarray): Second vector.

        Returns:
            float: dot(a, b) / (|a| * |b|), or 0.0 when that denominator is 0.
        """
        norm_product = np.linalg.norm(vector_a) * np.linalg.norm(vector_b)
        if norm_product == 0:
            # Dividing by zero would yield NaN, which compares false against
            # everything and would leave the sorted ranking in arbitrary order.
            return 0.0
        return np.dot(vector_a, vector_b) / norm_product

    def add_vector(self, vector_id, vector):
        """
        Add a vector to the store.

        Re-using an existing ``vector_id`` replaces the stored vector and
        refreshes its index entries.

        Args:
            vector_id (str or int): A unique identifier for the vector.
            vector (numpy.ndarray): The vector data to be stored.
        """
        self.vector_data[vector_id] = vector
        self._update_index(vector_id, vector)

    def get_vector(self, vector_id):
        """
        Retrieve a vector from the store.

        Args:
            vector_id (str or int): The identifier of the vector to retrieve.

        Returns:
            numpy.ndarray: The vector data if found, or None if not found.
        """
        return self.vector_data.get(vector_id)

    def _update_index(self, vector_id, vector):
        """
        Update the pairwise similarity index with the new vector.

        Both directions are written (``index[a][b]`` and ``index[b][a]``) so
        the index stays symmetric and no entry goes stale when an id is
        overwritten.

        Args:
            vector_id (str or int): The identifier of the vector.
            vector (numpy.ndarray): The vector data.
        """
        # Brute force: compare against every stored vector. The new vector is
        # already in vector_data, so this also records its self-similarity.
        for existing_id, existing_vector in self.vector_data.items():
            similarity = self._cosine_similarity(vector, existing_vector)
            self.vector_index.setdefault(existing_id, {})[vector_id] = similarity
            self.vector_index.setdefault(vector_id, {})[existing_id] = similarity

    def find_similar_vectors(self, query_vector, num_results=5):
        """
        Find similar vectors to the query vector using brute-force search.

        Args:
            query_vector (numpy.ndarray): The query vector for similarity search.
            num_results (int): The number of similar vectors to return.

        Returns:
            list: A list of (vector_id, similarity_score) tuples for the most
            similar vectors, highest score first. Ties keep insertion order.
        """
        results = [
            (vector_id, self._cosine_similarity(query_vector, vector))
            for vector_id, vector in self.vector_data.items()
        ]

        # Stable sort, highest similarity first.
        results.sort(key=lambda pair: pair[1], reverse=True)

        # Return the top N results
        return results[:num_results]

In [None]:
# Instantiate the plain (unencrypted) in-memory store defined above.
# NOTE(review): no cell visible in this notebook adds vectors to it or queries
# it -- confirm whether it is still needed.
vector_store = VectorStore()

# Run Encrypted Search Calculations

Encrypt the query embedding as a CKKS vector.

In [None]:
# Take the single query embedding (row 0), convert it to a plain Python list,
# and encrypt it under the CKKS context.
query_values = query_embedding[0].tolist()
enc_queryvec = ts.ckks_vector(context, query_values)

Measure Cosine Similarity

In [None]:
# Score every corpus sentence against the encrypted query.
# Each entry is a single-key dict {sentence: score}.
# NOTE(review): the score is a raw dot product; it equals cosine similarity
# only if the embeddings are unit-normalized -- confirm for this model.
cosine_similarity_ranking = []
for sentence, embedding in zip(sentences, embeddings):
    enc_sentence = ts.ckks_vector(context, embedding.tolist())
    dot_product = enc_queryvec.dot(enc_sentence)
    # decrypt() returns a list; the dot product is its first entry.
    cosine_similarity = dot_product.decrypt()[0]
    # Keep the sign: taking abs() would rank a strongly dissimilar (negative)
    # sentence as if it were a strong match.
    cosine_similarity_ranking.append({sentence: cosine_similarity})

# Present Results

In [None]:
# Rank the single-entry {sentence: score} dicts from best to worst match.
search_results = sorted(
    cosine_similarity_ranking,
    key=lambda entry: next(iter(entry.values())),
    reverse=True,
)
# One result per line, each followed by a blank line.
for item in search_results:
    print(item, end="\n\n")


{'The quick brown fox jumps over the lazy dog.': 0.48099632818018995}

{'I watched the sunset over the ocean.': 0.09014756000127495}

{'The athlete trained rigorously for the upcoming marathon.': 0.0781264779389985}

{'Artificial intelligence is transforming industries.': 0.07767405389268811}

{'He dreamed of traveling to distant galaxies.': 0.05060842452338736}

{'The library was quiet and smelled like old books.': 0.02383919876453875}

{'The chef prepared a delicious meal for the guests.': 0.022408268994282547}

{'Climate change poses a significant threat to global biodiversity.': 0.01684344552838132}

{'Innovation drives progress in the tech world.': 0.00642538038049721}

{'Music has the power to evoke deep emotional responses.': 0.003653404599791276}

