# Atlas Vector Search - Vector Quantization - Existing Data

This notebook is a companion for the [Vector Quantization](https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-quantization/#how-to-ingest-pre-quantized-vectors) page. Refer to that page for setup steps and detailed explanations.

This notebook takes you through how to pre-quantize and ingest your vectors for vector search from **existing data in Atlas** by using [Cohere's](https://cohere.com/) `embed-english-v3.0` model.

In [None]:
pip --quiet --upgrade install pymongo cohere

In [None]:
import os
import pymongo
import cohere
from bson.binary import Binary, BinaryVectorDtype

# Store your Cohere API key in the environment, then create the client from it
os.environ["COHERE_API_KEY"] = "<COHERE-API-KEY>"
cohere_api_key = os.environ["COHERE_API_KEY"]
cohere_client = cohere.Client(cohere_api_key)

# Define function to generate embeddings using the embed-english-v3.0 model
def get_embedding(text, input_type="search_document"):
    """Generate a float embedding for one text with the embed-english-v3.0 model.

    Args:
        text: The string to embed.
        input_type: Value sent as the Cohere ``input_type``. Defaults to
            ``"search_document"`` (text that is stored and searched over);
            pass ``"search_query"`` to embed the text of a search query.

    Returns:
        The first entry of the response's ``float`` embeddings, that is, the
        embedding of ``text``.
    """
    response = cohere_client.embed(
      texts=[text],  # the API takes a list; only one text is sent here
      model='embed-english-v3.0',
      input_type=input_type,
      embedding_types=["float"] # Can also be "int8" or "ubinary" (int1)
    )
    embedding = response.embeddings.float[0]
    return embedding

# Define function to pack an embedding into a BSON binary vector
def generate_bson_vector(vector, vector_dtype):
    """Return ``vector`` packed by ``Binary.from_vector`` with the given ``vector_dtype``."""
    bson_binary = Binary.from_vector(vector, vector_dtype)
    return bson_binary

In [None]:
# Connect to your Atlas cluster
mongo_client = pymongo.MongoClient("<ATLAS-CONNECTION-STRING>")
db = mongo_client["sample_airbnb"]
collection = db["listingsAndReviews"]

# Filter to exclude null or empty summary fields.
# Named summary_filter so that Python's built-in filter() is not shadowed
# for the rest of the notebook.
summary_filter = { "summary": {"$nin": [None, ""]} }

# Get a subset of documents in the collection (at most 50)
documents = collection.find(summary_filter).limit(50)

# Initialize the count of updated documents
updated_doc_count = 0

In [None]:
for listing in documents:
    # Embed the listing's summary text (float32 values unless get_embedding is changed)
    summary_embedding = get_embedding(listing["summary"])

    # Pack the float32 embedding into a BSON vector
    bson_vector = generate_bson_vector(summary_embedding, BinaryVectorDtype.FLOAT32)

    # If you requested a different data type in get_embedding, uncomment the matching line instead
    # bson_vector = generate_bson_vector(summary_embedding, BinaryVectorDtype.INT8)
    # bson_vector = generate_bson_vector(summary_embedding, BinaryVectorDtype.PACKED_BIT) # int1

    # Write the BSON vector to the "embedding" field of this listing
    id_query = {"_id": listing["_id"]}
    set_embedding = {"$set": {"embedding": bson_vector}}
    collection.update_one(id_query, set_embedding)
    updated_doc_count += 1

print(f"Updated {updated_doc_count} documents with BSON embeddings.")

In [None]:
import time

from pymongo.operations import SearchIndexModel

# Define the vector search index on the "embedding" field
vector_search_index_definition = {
  "fields":[
    {
      "type": "vector",
      "path": "embedding",
      "similarity": "euclidean",
      "numDimensions": 1024,
    }
  ]
}

search_index_model = SearchIndexModel(
  definition=vector_search_index_definition,
  name="<INDEX-NAME>",
  type="vectorSearch",
)

# create_search_index returns the name of the index it created
created_index_name = collection.create_search_index(model=search_index_model)

# The index is built asynchronously. Wait until Atlas reports it as queryable
# so that the query cell that follows does not run against an index that is
# still building.
print("Polling to check if the index is ready. This may take up to a minute.")
while True:
  matching_indexes = list(collection.list_search_indexes(created_index_name))
  if matching_indexes and matching_indexes[0].get("queryable") is True:
    break
  time.sleep(5)  # pause between status checks

print(f"{created_index_name} is ready for querying.")

In [None]:
# Define a function to run a vector search query
def run_vector_search(query_text, collection, path):
  query_embedding = get_embedding("query_text")
  bson_query_vector = generate_bson_vector(query_embedding, BinaryVectorDtype.FLOAT32)

  pipeline = [
    {
      '$vectorSearch': {
        'index': '<INDEX-NAME>',
        'path': path,
        'queryVector': bson_query_vector,
        'numCandidates': <NUMBER-OF-CANDIDATES-TO-CONSIDER>, # for example, 20
        'limit': <NUMBER-OF-DOCUMENTS-TO-RETURN> # for example, 5
       }
     },
     {
       '$project': {
         '_id': 0,
         'name': 1,
         'summary': 1,
         'score': { '$meta': 'vectorSearchScore' }
        }
     }
  ]

  return collection.aggregate(pipeline)

In [None]:
from pprint import pprint

# Search the "embedding" field for listings that match the query text
query_text = "ocean view"
query_results = run_vector_search(query_text, collection, path="embedding")

# Read every result from the cursor into a list, then pretty-print the list
print("query results:")
matches = list(query_results)
pprint(matches)