# Atlas Vector Search - Create Embeddings - Open Source - New Data

This notebook is a companion to the [Create Embeddings](https://www.mongodb.com/docs/atlas/atlas-vector-search/create-embeddings/) page. Refer to the page for set-up instructions and detailed explanations.

This notebook walks you through generating embeddings from **new data** by using the open-source `nomic-embed-text-v1` model. It also includes code to convert your embeddings to BSON binData vectors for efficient processing of your data.

<a target="_blank" href="https://colab.research.google.com/github/mongodb/docs-notebooks/blob/main/create-embeddings/open-source-new-data.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

In [None]:
pip install --quiet --upgrade sentence-transformers pymongo einops

In [None]:
from sentence_transformers import SentenceTransformer

# Load the embedding model
# NOTE(review): trust_remote_code=True presumably allows code shipped with the
# model repository to run locally — confirm, and only enable it for models you trust.
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)

# Define a function to generate embeddings in multiple precisions
def get_embedding(data, precision="float32"):
    """Encode `data` with the loaded model at the requested precision.

    Args:
        data: Input handed unchanged to ``model.encode``. This notebook passes
            either a single string or a list of strings.
        precision: Precision name forwarded to ``model.encode``. This notebook
            uses "float32", "int8", and "ubinary".

    Returns:
        Whatever ``model.encode`` returns for the given input and precision.
    """
    embeddings = model.encode(data, precision=precision)
    return embeddings

In [None]:
from bson.binary import Binary

# Generate BSON vector using `BinaryVectorDtype`
def generate_bson_vector(vector, vector_dtype):
    """Wrap one embedding in a BSON binary vector.

    Args:
        vector: The embedding values to store.
        vector_dtype: The ``BinaryVectorDtype`` member describing how the
            values are stored (this notebook uses FLOAT32, INT8, PACKED_BIT).

    Returns:
        The object produced by ``Binary.from_vector`` for the given inputs.
    """
    bson_vector = Binary.from_vector(vector, vector_dtype)
    return bson_vector

In [None]:
# Function to create documents with BSON vector embeddings
def create_docs_with_bson_vector_embeddings(bson_float32, bson_int8, bson_int1, data):
    """Build one document per text, pairing it with its three BSON vectors.

    Args:
        bson_float32: Iterable of float32 BSON vectors, one per text.
        bson_int8: Iterable of int8 BSON vectors, one per text.
        bson_int1: Iterable of int1 (packed-bit) BSON vectors, one per text.
        data: Iterable of the source texts.

    Returns:
        A list of dicts. Each dict uses the item's zero-based position as
        ``_id`` and stores the text under ``data`` plus one field per vector.
        The inputs are combined with ``zip``, so the result is as long as the
        shortest input.
    """
    rows = zip(bson_float32, bson_int8, bson_int1, data)
    return [
        {
            "_id": position,
            "data": text,
            "BSON-Float32-Embedding": f32_vector,
            "BSON-Int8-Embedding": int8_vector,
            "BSON-Int1-Embedding": int1_vector,
        }
        for position, (f32_vector, int8_vector, int1_vector, text) in enumerate(rows)
    ]

In [None]:
# Example: embed the strings "foo" and "bar" once per precision
data = ["foo", "bar"]
float32_embeddings = get_embedding(data, precision="float32")
int8_embeddings = get_embedding(data, precision="int8")
# The "ubinary" output is what later cells store as the int1 (packed-bit) vector
int1_embeddings = get_embedding(data, precision="ubinary")

# Show each result next to its label
for label, embeddings in (
    ("Float32 Embedding:", float32_embeddings),
    ("Int8 Embedding:", int8_embeddings),
    ("Int1 Embedding (binary representation):", int1_embeddings),
):
    print(label, embeddings)

In [None]:
from bson.binary import BinaryVectorDtype

# Convert the embeddings item by item: one (float32, int8, int1) BSON triple per input text
bson_triples = [
    (
        generate_bson_vector(f32_emb, BinaryVectorDtype.FLOAT32),
        generate_bson_vector(int8_emb, BinaryVectorDtype.INT8),
        generate_bson_vector(int1_emb, BinaryVectorDtype.PACKED_BIT),
    )
    for f32_emb, int8_emb, int1_emb in zip(float32_embeddings, int8_embeddings, int1_embeddings)
]

# Split the triples back into one list per precision
bson_float32_embeddings = [triple[0] for triple in bson_triples]
bson_int8_embeddings = [triple[1] for triple in bson_triples]
bson_int1_embeddings = [triple[2] for triple in bson_triples]

# Print the embeddings
print(f"The converted bson_float32_new_embedding is: {bson_float32_embeddings}")
print(f"The converted bson_int8_new_embedding is: {bson_int8_embeddings}")
print(f"The converted bson_int1_new_embedding is: {bson_int1_embeddings}")

In [None]:
 # Sample data: 20 one-line movie summaries, each written as "Title: plot".
 # Later cells embed these strings and store one document per entry.
 sentences = [
  "Titanic: The story of the 1912 sinking of the largest luxury liner ever built",
  "The Lion King: Lion cub and future king Simba searches for his identity",
  "Avatar: A marine is dispatched to the moon Pandora on a unique mission",
  "Inception: A skilled thief is given a chance at redemption if he can successfully implant an idea into a person's subconscious.",
  "The Godfather: The aging patriarch of a powerful crime family transfers control of his empire to his reluctant son.",
  "Forrest Gump: A man with a low IQ recounts several decades of extraordinary events in his life.",
  "Jurassic Park: Scientists clone dinosaurs to populate an island theme park, which soon goes awry.",
  "The Matrix: A hacker discovers the true nature of reality and his role in the war against its controllers.",
  "Star Wars: A young farm boy is swept into the struggle between the Rebel Alliance and the Galactic Empire.",
  "The Shawshank Redemption: A banker is sentenced to life in Shawshank State Penitentiary for the murders of his wife and her lover.",
  "Indiana Jones and the Last Crusade: An archaeologist pursues the Holy Grail while confronting adversaries from the past.",
  "The Dark Knight: Batman faces a new menace, the Joker, who plunges Gotham into anarchy.",
  "Back to the Future: A teenager accidentally travels back in time and must ensure his parents fall in love.",
  "The Silence of the Lambs: A young FBI agent seeks the help of an incarcerated cannibalistic killer to catch another serial killer.",
  "E.T. the Extra-Terrestrial: A young boy befriends an alien stranded on Earth and helps him return home.",
  "Saving Private Ryan: During WWII, a group of U.S. soldiers go behind enemy lines to retrieve a paratrooper whose brothers have been killed in action.",
  "Gladiator: A once-powerful Roman general seeks vengeance against the corrupt emperor who betrayed his family.",
  "Rocky: A small-time boxer gets a once-in-a-lifetime chance to fight the world heavyweight champion.",
  "Pirates of the Caribbean: Jack Sparrow races to recover the heart of Davy Jones to escape eternal servitude.",
  "Schindler's List: The true story of a man who saved hundreds of Jews during the Holocaust by employing them in his factory."
]

In [None]:
# Embed every sample sentence once per precision
float32_embeddings = get_embedding(sentences, precision="float32")
int8_embeddings = get_embedding(sentences, precision="int8")
int1_embeddings = get_embedding(sentences, precision="ubinary")

# Print stored embeddings
print("Generated embeddings stored in different variables:")
# Walk the sentences and their embeddings in lockstep; show only the first 3 values of each
for text, f32_emb, int8_emb, int1_emb in zip(sentences, float32_embeddings, int8_embeddings, int1_embeddings):
    print(f"\nText: {text}")
    print(f"Float32 Embedding: {f32_emb[:3]}... (truncated)")
    print(f"Int8 Embedding: {int8_emb[:3]}... (truncated)")
    print(f"Ubinary Embedding: {int1_emb[:3]}... (truncated)")

In [None]:
from bson.binary import BinaryVectorDtype

# Convert each embedding to BSON: one (float32, int8, int1) triple per sentence
bson_triples = [
    (
        generate_bson_vector(f32_emb, BinaryVectorDtype.FLOAT32),
        generate_bson_vector(int8_emb, BinaryVectorDtype.INT8),
        generate_bson_vector(int1_emb, BinaryVectorDtype.PACKED_BIT),
    )
    for f32_emb, int8_emb, int1_emb in zip(float32_embeddings, int8_embeddings, int1_embeddings)
]

# Split the triples back into one list per precision
bson_float32_embeddings = [triple[0] for triple in bson_triples]
bson_int8_embeddings = [triple[1] for triple in bson_triples]
bson_int1_embeddings = [triple[2] for triple in bson_triples]

# Print the embeddings
for text, bson_f32, bson_int8, bson_int1 in zip(
    sentences, bson_float32_embeddings, bson_int8_embeddings, bson_int1_embeddings
):
    print(f"\nText: {text}")
    print(f"Float32 BSON: {bson_f32}")
    print(f"Int8 BSON: {bson_int8}")
    print(f"Int1 BSON: {bson_int1}")

In [None]:
# Create BSON documents: one per sentence, carrying all three BSON vectors
docs = create_docs_with_bson_vector_embeddings(
    bson_float32=bson_float32_embeddings,
    bson_int8=bson_int8_embeddings,
    bson_int1=bson_int1_embeddings,
    data=sentences,
)

In [None]:
import pymongo

# Connect to your Atlas cluster
# Replace the "<connection-string>" placeholder with your own connection string before running.
mongo_client = pymongo.MongoClient("<connection-string>")
db = mongo_client["sample_db"]
collection = db["embeddings"]

# Ingest data into Atlas
# NOTE(review): every document carries an explicit integer `_id` (its position in
# `sentences`), so re-running this cell against a collection that already holds
# these documents will presumably fail with duplicate-key errors — confirm.
collection.insert_many(docs)

In [None]:
from pymongo.operations import SearchIndexModel
import time

# Create your index model, then create the search index.
# One vector field is indexed per stored precision; all are declared with 768 dimensions.
vector_fields = [
    {
        "type": "vector",
        "path": field_path,
        "similarity": similarity,
        "numDimensions": 768,
    }
    for field_path, similarity in (
        ("BSON-Float32-Embedding", "dotProduct"),
        ("BSON-Int8-Embedding", "dotProduct"),
        ("BSON-Int1-Embedding", "euclidean"),
    )
]
search_index_model = SearchIndexModel(
    definition={"fields": vector_fields},
    name="vector_index3",
    type="vectorSearch",
)
result = collection.create_search_index(model=search_index_model)
print(f"New search index named {result} is building.")

# Wait for initial sync to complete
print("Polling to check if the index is ready. This may take up to a minute.")


def predicate(index):
    """Return True once the index description reports `queryable` as True."""
    return index.get("queryable") is True


# Re-fetch the index description every 5 seconds until it is queryable
indices = list(collection.list_search_indexes(result))
while not (indices and predicate(indices[0])):
    time.sleep(5)
    indices = list(collection.list_search_indexes(result))
print(f"{result} is ready for querying.")

In [None]:
# Prepare your query
query_text = "ocean tragedy"

# Name of the vector search index to query. It must match the `name` passed to
# SearchIndexModel when the index was created ("vector_index3" in this notebook).
# Adjust if you created the index under a different name.
index_name = "vector_index3"

# Generate embedding for the search query
# NOTE(review): the query is quantized on its own here, so its int8 values may not
# use the same value ranges as the stored document embeddings — confirm against
# the sentence-transformers quantization documentation.
query_float32_embeddings = get_embedding(query_text, precision="float32")
query_int8_embeddings = get_embedding(query_text, precision="int8")
query_int1_embeddings = get_embedding(query_text, precision="ubinary")

# Convert each embedding to BSON format
query_bson_float32_embeddings = generate_bson_vector(query_float32_embeddings, BinaryVectorDtype.FLOAT32)
query_bson_int8_embeddings = generate_bson_vector(query_int8_embeddings, BinaryVectorDtype.INT8)
query_bson_int1_embeddings = generate_bson_vector(query_int1_embeddings, BinaryVectorDtype.PACKED_BIT)

# Pair each indexed field path with the query vector of the matching precision
query_vectors_by_path = {
    "BSON-Float32-Embedding": query_bson_float32_embeddings,
    "BSON-Int8-Embedding": query_bson_int8_embeddings,
    "BSON-Int1-Embedding": query_bson_int1_embeddings,
}

# Define vector search pipeline for each precision
pipelines = [
    [
        {
            "$vectorSearch": {
                "index": index_name,
                "queryVector": query_embedding,
                "path": path,
                "exact": True,
                "limit": 5,
            }
        },
        {
            # Return only the text and its similarity score
            "$project": {
                "_id": 0,
                "data": 1,
                "score": {"$meta": "vectorSearchScore"},
            }
        },
    ]
    for path, query_embedding in query_vectors_by_path.items()
]

# Execute the search for each precision
for pipeline in pipelines:
    print(f"\nResults for {pipeline[0]['$vectorSearch']['path']}:")
    results = collection.aggregate(pipeline)

    # Print results
    for doc in results:
        print(doc)