# Vector database

## FAISS

https://github.com/facebookresearch/faiss/wiki/Getting-started

In [2]:
import faiss
import numpy as np

In [25]:
# Generate our 3D vectors (1331 vectors)
# from [0,0,0] to [10,10,10]
all_vectors = [[x, y, z] for x in range(11) for y in range(11) for z in range(11)]
vectors = np.array(all_vectors, dtype="float32")

# Create a FAISS index
dimension = 3  # Our vectors are 3D
index = faiss.IndexFlatL2(dimension)  # L2 distance for similarity search

# Add vectors to the index
index.add(vectors)
print(f"Index contains {index.ntotal} vectors")

Index contains 1331 vectors


In [31]:
# Search for similar vectors
query_vector = np.array([[5.1, 5.2, 5.3]], dtype="float32")
k = 5  # Number of nearest neighbors to find

distances, indices = index.search(query_vector, k)

print("\nTop 5 similar vectors to [5.1, 5.2, 5.3]:")
for i, idx in enumerate(indices[0]):
    print(f"{i + 1}: {all_vectors[idx]} (distance: {distances[0][i]:.2f})")

# OR

distances, indices, search_results = index.search_and_reconstruct(query_vector, k)

print("\nTop 5 similar vectors to [5.1, 5.2, 5.3]:")
for i, idx in enumerate(indices[0]):
    print(f"{i + 1}: {search_results[0][i]} (distance: {distances[0][i]:.2f})")


Top 5 similar vectors to [5.1, 5.2, 5.3]:
1: [5, 5, 5] (distance: 0.14)
2: [5, 5, 6] (distance: 0.54)
3: [5, 6, 5] (distance: 0.74)
4: [6, 5, 5] (distance: 0.94)
5: [5, 6, 6] (distance: 1.14)

Top 5 similar vectors to [5.1, 5.2, 5.3]:
1: [5. 5. 5.] (distance: 0.14)
2: [5. 5. 6.] (distance: 0.54)
3: [5. 6. 5.] (distance: 0.74)
4: [6. 5. 5.] (distance: 0.94)
5: [5. 6. 6.] (distance: 1.14)


In [34]:
# Find all vectors within distance 1.5 from [5,5,5]
query = np.array([[5, 5, 5]], dtype="float32")
radius = 1.5

# FAISS range search
lims, D, I = index.range_search(query, radius)

print("\nVectors within distance 1.5 from [5,5,5]:")
for i in range(lims[0], lims[1]):
    print(f"{all_vectors[I[i]]} (distance: {D[i]:.2f})")


Vectors within distance 1.5 from [5,5,5]:
[4, 5, 5] (distance: 1.00)
[5, 4, 5] (distance: 1.00)
[5, 5, 4] (distance: 1.00)
[5, 5, 5] (distance: 0.00)
[5, 5, 6] (distance: 1.00)
[5, 6, 5] (distance: 1.00)
[6, 5, 5] (distance: 1.00)


In [32]:
# Update a vector (FAISS doesn't have direct update, so we remove and add)
vector_to_update = [1, 1, 1]
new_vector = [1.5, 1.5, 1.5]

# Find the index of the vector to update
vector_array = np.array([vector_to_update], dtype="float32")
_, idx = index.search(vector_array, 1)
target_idx = idx[0][0]

# Remove the old vector (FAISS doesn't support direct removal, we reconstruct)
all_vectors[target_idx] = new_vector
vectors = np.array(all_vectors, dtype="float32")

# Rebuild the index
index.reset()
index.add(vectors)
print(f"\nUpdated vector at position {target_idx} to {new_vector}")


Updated vector at position 133 to [1.5, 1.5, 1.5]


In [33]:
# Delete a vector (again, by reconstructing)
vector_to_delete = [2, 2, 2]

# Find the index
vector_array = np.array([vector_to_delete], dtype="float32")
_, idx = index.search(vector_array, 1)
target_idx = idx[0][0]

# Remove from our list and rebuild
del all_vectors[target_idx]
vectors = np.array(all_vectors, dtype="float32")

index.reset()
index.add(vectors)
print(f"\nDeleted vector [2,2,2]. Now index contains {index.ntotal} vectors")


Deleted vector [2,2,2]. Now index contains 1330 vectors


## Chroma

https://docs.trychroma.com/docs/overview/introduction

In [1]:
import chromadb
from chromadb.utils import embedding_functions

In [1]:
# Initialize client
client = chromadb.Client()

# Create collection with embedding function
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2"
)

collection = client.create_collection(
    name="3d_vectors", embedding_function=sentence_transformer_ef
)

# Generate our 3D vectors (just 10 for demo)
vectors = [[x, y, z] for x in range(2) for y in range(2) for z in range(2)]
documents = [f"Vector {i}: {vec}" for i, vec in enumerate(vectors)]
ids = [str(i) for i in range(len(vectors))]

# Add to collection
collection.add(
    documents=documents,
    embeddings=vectors,  # Use our 3D vectors directly
    ids=ids,
)

print(f"Added {len(vectors)} vectors to collection")

  from .autonotebook import tqdm as notebook_tqdm
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Added 8 vectors to collection


In [13]:
# Simple query by ID
# By default, only documents and metadata are returned
result = collection.get(ids=["0", "3"])
print("\nGet by ID:")
print(result)

# Simple query by ID
# ask it to return embeddings as well
result = collection.get(ids=["0", "3"], include=["embeddings"])
print("\nGet by ID:")
print(result)


Get by ID:
{'ids': ['0', '3'], 'embeddings': None, 'documents': ['Vector 0: [0, 0, 0]', 'Vector 3: [0, 1, 1]'], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': [None, None]}

Get by ID:
{'ids': ['0', '3'], 'embeddings': array([[0., 0., 0.],
       [0., 1., 1.]]), 'documents': None, 'uris': None, 'included': ['embeddings'], 'data': None, 'metadatas': None}


In [27]:
# Similarity search
query_vector = [0.5, 0.65, 0.7]
results = collection.query(
    query_embeddings=[query_vector],
    n_results=3,
    include=["documents", "distances", "embeddings"],
)
print(f"Similarity search: {query_vector}")

print("\nSimilarity search results:")
for doc, emb, dist in zip(
    results["documents"][0], results["embeddings"][0], results["distances"][0]
):
    print(f"Document: {doc}")
    print(f"Vector: {emb}")
    print(f"Distance: {dist:.4f}")
    print("---")

Similarity search: [0.5, 0.65, 0.7]

Similarity search results:
Document: Vector 3: [0, 1, 1]
Vector: [0. 1. 1.]
Distance: 0.4625
---
Document: Vector 7: [1, 1, 1]
Vector: [1. 1. 1.]
Distance: 0.4625
---
Document: Vector 1: [0, 0, 1]
Vector: [0. 0. 1.]
Distance: 0.7625
---


In [28]:
# Update a vector (Chroma handles this as delete+add internally)
collection.update(
    ids=["1"],
    embeddings=[[1.1, 1.1, 1.1]],
    documents=["Updated vector 1: [1.1, 1.1, 1.1]"],
)

# Verify update
result = collection.get(ids=["1"])
print("\nAfter update:")
print(result)


After update:
{'ids': ['1'], 'embeddings': None, 'documents': ['Updated vector 1: [1.1, 1.1, 1.1]'], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': [None]}


In [29]:
# Delete a vector
collection.delete(ids=["2"])

# Verify deletion
remaining = collection.get()
print(f"\nRemaining vectors: {len(remaining['ids'])}")


Remaining vectors: 7


## Metadata

In [7]:
# Similarity search
results = collection.query(
    query_texts=["phone"], n_results=3, include=["documents", "distances"]
)
# print(f"Similarity search: {query_vector}")

print("\nSimilarity search results:")
for doc, dist in zip(results["documents"][0], results["distances"][0]):
    print(f"Document: {doc}")
    # print(f"Vector: {emb}")
    print(f"Distance: {dist:.4f}")
    print("---")


Similarity search results:
Document: The latest smartphone features a 108MP camera
Distance: 1.3211
---
Document: This budget laptop has 8GB RAM and 256GB SSD
Distance: 1.7377
---
Document: Premium wireless headphones with noise cancellation
Distance: 1.7670
---


In [2]:
# Initialize client
client = chromadb.Client()

# Create collection with embedding function
# Modify default settings
collection = client.create_collection(
    name="new_collection",
    embedding_function=embedding_functions.DefaultEmbeddingFunction(),
)

# Add documents with multiple metadata fields
documents = [
    "The latest smartphone features a 108MP camera",
    "This budget laptop has 8GB RAM and 256GB SSD",
    "Premium wireless headphones with noise cancellation",
]

embeddings = [
    [0.1, 0.2, 0.3],  # Simple 3D embeddings for demo
    [0.4, 0.5, 0.6],
    [0.7, 0.8, 0.9],
]

metadatas = [
    {
        "category": "electronics",
        "subcategory": "phones",
        "price": 999.99,
        "brand": "Samsung",
        "in_stock": True,
        "tags": "flagship,camera,5G",
    },
    {
        "category": "electronics",
        "subcategory": "computers",
        "price": 499.99,
        "brand": "Acer",
        "in_stock": False,
        "tags": "budget,student",
    },
    {
        "category": "audio",
        "subcategory": "headphones",
        "price": 349.99,
        "brand": "Sony",
        "in_stock": True,
        "tags": "wireless,noise-cancelling,premium",
    },
]

ids = ["doc1", "doc2", "doc3"]

collection.add(
    documents=documents,
    # embeddings=embeddings,
    metadatas=metadatas,
    ids=ids,
)

# Verify metadata was stored
print("Stored metadata example:")
print(collection.get(ids=["doc1"])["metadatas"][0])

Stored metadata example:
{'in_stock': True, 'brand': 'Samsung', 'subcategory': 'phones', 'price': 999.99, 'tags': 'flagship,camera,5G', 'category': 'electronics'}


In [3]:
# Query with simple equality
results = collection.query(
    query_texts=["gadget"], n_results=2, where={"brand": {"$eq": "Sony"}}
)
print("\nSony products:")
print(results["documents"])

# Numerical comparison
results = collection.query(
    query_texts=["device"],
    n_results=3,
    where={"price": {"$gte": 500}},  # Greater than or equal to 500
)
print("\nPremium products ($500+):")
print(results["documents"])

# Boolean filter
results = collection.query(
    query_texts=["product"], n_results=3, where={"in_stock": {"$eq": True}}
)
print("\nIn-stock items:")
print(results["documents"])


Sony products:
[['Premium wireless headphones with noise cancellation']]

Premium products ($500+):
[['The latest smartphone features a 108MP camera']]

In-stock items:
[['The latest smartphone features a 108MP camera', 'Premium wireless headphones with noise cancellation']]
