In [3]:

# Import Necessary Modules
from sentence_transformers import SentenceTransformer
import numpy as np
from scipy.spatial.distance import cosine

# Step 1: Verify SBERT Initialization
class EmbeddingModel:
    """Wrap a SentenceTransformer model and expose its embeddings as NumPy arrays."""

    def __init__(self, model_name='all-mpnet-base-v2'):
        """Load the sentence-transformers model identified by ``model_name``.

        Args:
            model_name: Identifier passed straight to ``SentenceTransformer``.
        """
        # Kept so callers can report which model is loaded without repeating the literal.
        self.model_name = model_name
        self.model = SentenceTransformer(model_name)

    def generate_embeddings(self, texts):
        """Encode ``texts`` with the wrapped model and return a NumPy array.

        Args:
            texts: Input forwarded unchanged to ``self.model.encode``.

        Returns:
            numpy.ndarray holding the encoder output.
        """
        # encode(..., convert_to_numpy=True) is documented to return an ndarray
        # already; asarray keeps the ndarray guarantee without the extra copy
        # that np.array() would make.
        return np.asarray(self.model.encode(texts, convert_to_numpy=True))

# Initialize SBERT Model
# The model name is defined once so the log line always matches the model
# that was actually requested, instead of repeating the literal by hand.
MODEL_NAME = 'all-mpnet-base-v2'
embedding_model = EmbeddingModel(MODEL_NAME)
print(f"Using SBERT Model: {MODEL_NAME}")

# Step 2: Test SBERT Embeddings
print("\nStep 2: Generating Test Embeddings")
# Encode two sample sentences, then report the array's shape and its first row.
test_texts = [
    "This is a test sentence.",
    "Another example sentence.",
]
test_embeddings = embedding_model.generate_embeddings(test_texts)
print(f"Embeddings Shape: {test_embeddings.shape}")
print(f"First Embedding: {test_embeddings[0]}")

# Step 3: Validate Consistency with Similarity Check
print("\nStep 3: Validating Consistency")
sentence1 = "This is a test sentence."
sentence2 = "This is another test sentence."

# Each sentence is encoded in its own single-item call; [0] picks the only
# row of the returned array.
embedding1, embedding2 = (
    embedding_model.generate_embeddings([sentence])[0]
    for sentence in (sentence1, sentence2)
)

# scipy's cosine() is a distance, so 1 minus it is the cosine similarity.
similarity = 1 - cosine(embedding1, embedding2)
print(f"Similarity between sentences: {similarity}")

# Step 4: Check Document and Query Embeddings
print("\nStep 4: Validating Document and Query Embeddings")
# Placeholder corpus and queries (replace these with actual dataset samples)
documents = [
    "Document one about natural language processing.",
    "Document two regarding machine learning.",
    "Another document about semantic search and SBERT.",
]
queries = [
    "What is SBERT?",
    "Applications of semantic search",
    "Machine learning concepts",
]

# Documents: report how many there are, then encode them in one call and
# show the resulting array's shape and first row.
print("\nValidating Document Embeddings")
print(f"Number of documents: {len(documents)}")
doc_embeddings = embedding_model.generate_embeddings(documents)
print(f"Document Embeddings Shape: {doc_embeddings.shape}")
print(f"First Document Embedding: {doc_embeddings[0]}")

# Queries: same shape/first-row report for the encoded queries.
print("\nValidating Query Embeddings")
query_embeddings = embedding_model.generate_embeddings(queries)
print(f"Query Embeddings Shape: {query_embeddings.shape}")
print(f"First Query Embedding: {query_embeddings[0]}")

# Step 5: Test SBERT Retrieval
print("\nStep 5: Testing SBERT Retrieval")
query_embedding = embedding_model.generate_embeddings(["test sentence"])[0]

# Pair every document index with its similarity to the query
# (1 - cosine distance).
similarities = [
    (position, 1 - cosine(query_embedding, vector))
    for position, vector in enumerate(doc_embeddings)
]
# In-place stable sort, best match first.
similarities.sort(key=lambda pair: pair[1], reverse=True)

print("Top Results:")
# Print at most the five highest-scoring documents.
for doc_index, similarity_score in similarities[:5]:
    print(f"Document {doc_index}: Similarity Score = {similarity_score}")

Using SBERT Model: all-mpnet-base-v2

Step 2: Generating Test Embeddings
Embeddings Shape: (2, 768)
First Embedding: [ 3.78101628e-04 -5.08034118e-02 -3.51472162e-02 -2.32510511e-02
 -4.41582873e-02  2.04877723e-02  1.46184221e-03  3.12617980e-02
  5.60515672e-02  1.88154187e-02  6.46202043e-02 -1.66587327e-02
  2.24144920e-03 -6.62649274e-02  2.82418933e-02 -2.49877898e-03
  8.14976171e-02  8.00235849e-03 -4.89552468e-02  3.32184471e-02
 -1.88362785e-02  9.67359543e-03 -2.18883576e-03 -3.58970948e-02
 -5.01143485e-02 -2.18428997e-03 -2.14774571e-02 -3.25634927e-02
  2.42515299e-02 -2.65391357e-02  6.25296384e-02 -3.62271955e-03
 -1.09873051e-02 -7.67028406e-02  1.53072881e-06  1.44891487e-02
 -3.17222299e-03 -3.32370102e-02 -6.87476769e-02 -5.63181471e-03
  5.28363418e-03  6.53426796e-02  4.27037850e-03  4.32255603e-02
 -2.95564327e-02  9.66737326e-03  4.99072932e-02  1.99880358e-02
 -5.37454039e-02  8.12139809e-02 -1.67013111e-03 -2.15664506e-04
 -3.63482046e-03 -5.01496233e-02  7.31