In [1]:
import chromadb
from sentence_transformers import SentenceTransformer

# Step 1: Load a pre-trained embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")  # Efficient & accurate

# Step 2: Initialize ChromaDB
client = chromadb.PersistentClient(path="./chroma_db")  # Persistent storage

# Delete previous collection if it exists
client.delete_collection(name="text_search")

# Create a fresh collection
collection = client.get_or_create_collection(name="text_search")

# Step 3: Store some example texts
texts = [
    "Deep learning is a subset of AI."
]

# Convert texts into embeddings
embeddings = model.encode(texts).tolist()

print(embeddings)



  from .autonotebook import tqdm as notebook_tqdm


[[-0.060460060834884644, -0.07446379214525223, 0.05727696046233177, 0.008092081174254417, -0.0021655242890119553, -0.014366322197020054, -0.019088711589574814, -0.002213858999311924, -0.021509507670998573, -0.03355859965085983, -0.0561508983373642, 0.004062152933329344, -0.013911853544414043, -0.020584838464856148, -0.03368431329727173, -0.018947329372167587, 0.0022794862743467093, -0.01343687903136015, -0.11374551057815552, -0.04779313877224922, 0.022085221484303474, 0.02689935266971588, -0.016120482236146927, -0.052876755595207214, 0.022022323682904243, 0.060236651450395584, 0.01717345044016838, -0.02785523049533367, -0.011812848038971424, 0.018951434642076492, 0.06163961440324783, 0.006809416227042675, 0.012236316688358784, 0.02207774482667446, -0.003891540924087167, 0.07542424649000168, -0.09001521021127701, 0.04812930151820183, 0.06010732799768448, 0.019014623016119003, -0.015191778540611267, 0.025604546070098877, 0.01585373468697071, -0.011732621118426323, 0.06934528052806854, 0.

In [5]:
query_embedding = model.encode(["Deep learning is a subset of AI.."])
print(query_embedding)

# Step 4: Add texts and embeddings to ChromaDB
collection.add(
    ids=[str(i) for i in range(len(texts))],  # Unique IDs
    embeddings=embeddings,  # Vector representations
    metadatas=[{"text": t} for t in texts]  # Store original text
)

# üîç Function to search for the closest text in ChromaDB
def find_closest_match(query_text, top_n=1):
    query_embedding = model.encode([query_text]).tolist()  # Convert query text to vector
    results = collection.query(query_embeddings=query_embedding, n_results=top_n)  # Search

    # Print closest match
    print("\nüîπ Query:", query_text)
    print("üîπ Closest Match:", results["metadatas"][0][0]["text"])
    print("üîπ Distance Score:", results["distances"][0][0])  # Lower is better

# Step 5: Test with user input
user_query = "Deep learning is a subset of AI."
find_closest_match(user_query)

Add of existing embedding ID: 0
Insert of existing embedding ID: 0


[[-5.17093912e-02 -7.39649087e-02  6.67236298e-02  5.28742699e-03
  -1.97732970e-02 -1.69549286e-02  8.20804108e-03 -1.64078716e-02
  -2.31052446e-03 -4.30325493e-02 -4.40655313e-02 -3.27190245e-03
  -1.83197241e-02 -1.60326008e-02 -5.00254370e-02 -1.38394646e-02
  -7.03779515e-03 -1.58825368e-02 -1.21944547e-01 -2.24514361e-02
   1.82135496e-02  1.89012755e-02 -2.52228007e-02 -4.91074510e-02
   2.41177920e-02  6.85938746e-02  1.74294263e-02 -1.73560008e-02
  -1.84539445e-02  8.16941075e-03  5.81392162e-02  3.02682146e-02
   1.15499021e-02  1.67212524e-02  1.90108549e-02  6.75904378e-02
  -7.56788552e-02  4.73553203e-02  5.08706495e-02  2.50358079e-02
  -1.16721420e-02  3.69270965e-02  1.96459107e-02 -9.63631179e-03
   6.42910525e-02  4.75164317e-02 -5.23047298e-02 -4.97579053e-02
  -1.43094417e-02  1.93326157e-02 -7.10259750e-02 -1.75743140e-02
  -2.24023573e-02  3.71095836e-02  1.43158268e-02  4.89377743e-03
   6.11142069e-02 -6.35172334e-03 -3.58586982e-02  1.16496496e-02
   4.36314

The model "all-MiniLM-L6-v2" generates embeddings of size 384. This is a compact and efficient model designed for tasks like text classification, clustering, and semantic search, where smaller embedding sizes are preferred for faster processing and lower memory usage.

Example: Embedding for a Sentence
Let‚Äôs take the sentence:
"The cat sat on the mat."

The 384-dimensional embedding for this sentence might encode:

The subject ("cat") and its relationship to the action ("sat").

The object ("mat") and its relationship to the subject and action.

The overall sentiment (neutral).

The grammatical structure (subject-verb-object).

The topic (pets or household items).