## Embedding with NVIDIA's model

A `SentenceTransformer` model instantiated from `NVIDIA-Embed-v2` will return tensors or np arrays but can only take a string input or a list containing a single string. It cannot accept a list of strings.

In [2]:
from sentence_transformers import SentenceTransformer
import torch

sentences = [
    "The weather is lovely today.",
    "It's so sunny outside!",
    "He drove to the stadium.",
]
device = 'cpu' # mps was producing a dimension error on batch input to model.encode 
print(f"Using device: {device}")
model = SentenceTransformer(
    "nvidia/NV-Embed-v2", trust_remote_code=True, device=device)
print(model)

# Encoding a single string will work:
string_embedding = model.encode(
    sentences[0], convert_to_tensor=True, normalize_embeddings=False)
print(f"String embedding shape: {string_embedding.shape}")


# Encoding a list containing a single string works:
singleton_embedding = model.encode(
    sentences[:1], convert_to_tensor=True, normalize_embeddings=False)
print(f"Singleton shape: {singleton_embedding.shape}")

# Encoding a list of strings
try:
    embeddings = model.encode(
        sentences, convert_to_tensor=True, normalize_embeddings=False)
    print(f"Batch embeddings shape: {embeddings.shape}")
except Exception as e:
    print(f"Error: {e}")

Using device: cpu


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

SentenceTransformer(
  (0): Transformer({'max_seq_length': 4096, 'do_lower_case': False}) with Transformer model: NVEmbedModel 
  (1): Pooling({'word_embedding_dimension': 4096, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': False})
  (2): Normalize()
)


  self.gen = func(*args, **kwds)


String embedding shape: torch.Size([4096])
Singleton shape: torch.Size([1, 4096])
Batch embeddings shape: torch.Size([3, 4096])


In [3]:
embeddings = model.encode(sentences, convert_to_numpy=True, normalize_embeddings=False)
print(type(embeddings))
print(embeddings.shape)

<class 'numpy.ndarray'>
(3, 4096)


  self.gen = func(*args, **kwds)


In [4]:
type(embeddings[0])

numpy.ndarray

In [None]:
import chromadb
from chromadb import Documents, EmbeddingFunction, Embeddings

client = chromadb.PersistentClient(path='./vector_stores/nv/')
print(client)

# Set up embedding model
class NVChromaEmbedder(EmbeddingFunction):
    def __init__(self, embedding_fn):
        self._encode = embedding_fn

    def __call__(self, input: Documents) -> Embeddings:
        return self._encode(input)

embedding_lambda = lambda docs: model.encode(docs, convert_to_numpy=True, normalize_embeddings=False)

embedder = NVChromaEmbedder(embedding_lambda)
result = embedder(['hi there', 'hello world'])
print(type(result))
print(type(result[0]))
print(result[0].shape)

<chromadb.api.client.Client object at 0xa878b2750>
<class 'list'>
2
<class 'numpy.ndarray'>
(4096,)


  self.gen = func(*args, **kwds)


In [9]:
# Get all collection names
collections = client.list_collections()

# Delete each collection
for collection in collections:
    client.delete_collection(name=collection.name)

print("All collections have been deleted.")

All collections have been deleted.


In [10]:
collection = client.create_collection(
    name="nv_test",
    embedding_function=embedder,
    metadata={"hnsw:space": "cosine"}
)
print(f"Created {collection}")

Created Collection(name=nv_test)


In [11]:
import os
import json

PATH_TO_DATA = 'data/json/'
FILENAMES = os.listdir(PATH_TO_DATA)
data = dict()
for filename in FILENAMES:
    with open(f'{PATH_TO_DATA}/{filename}', 'r') as file:
        data[os.path.splitext(os.path.basename(filename))[0]] = json.load(file)

print("Found files:")
for filename in data:
    print(f"  {filename}")

Found files:
  Earth_Science_Reviews
  Earth_Science_Research
  Planetary_Research
  Planetary_Reviews
  Astro_Reviews
  Astro_Research


In [12]:
def preprocess_papers(papers):
    # Convert titles from list to string
    for paper in papers:
        paper['title'] = paper['title'][0]
    return papers


def construct_document(record, fields):
    """
    Construct a document from the specified fields
    """
    return "\n".join([record[field] for field in fields])


def prep_metadata(record):
    """
    JSONify any list or dict fields, as Chroma requires all metadata to be primitive
    """
    return {key: json.dumps(value) if isinstance(value, (list, dict)) else value for key, value in record.items()}


data = {key: preprocess_papers(value) for key, value in data.items()}

all_papers = data['Astro_Research'] + \
    data['Earth_Science_Research'] + data['Planetary_Research']
print(f"Number of records: {len(all_papers)}")

documents = [construct_document(
    paper, ['title', 'abstract', 'body']) for paper in all_papers]
metadatas = [prep_metadata(paper) for paper in all_papers]
ids = [paper['id'] for paper in all_papers]

assert len(documents) == len(metadatas) == len(ids)

Number of records: 3000


In [13]:
direct_embedding = embedding_lambda(["the sun is a star"])
foo = collection.add(
    documents=["the sun is a star"],
    # metadatas=[{}],
    ids=['foo_id']
)
print(foo)

  self.gen = func(*args, **kwds)


None


In [14]:
chroma_embedding = collection.query(
    query_texts=["the sun is a star"],
    n_results=1,
    include=["embeddings", "documents"]
)

print(chroma_embedding)

{'ids': [['foo_id']], 'embeddings': [array([[-0.02781531, -0.00907008,  0.0181422 , ..., -0.00454538,
        -0.0195914 ,  0.00130222]])], 'documents': [['the sun is a star']], 'uris': None, 'data': None, 'metadatas': None, 'distances': None, 'included': [<IncludeEnum.embeddings: 'embeddings'>, <IncludeEnum.documents: 'documents'>]}


In [15]:
print(type(chroma_embedding['embeddings'][0]))
print(chroma_embedding['embeddings'][0][0].shape)

<class 'numpy.ndarray'>
(4096,)


In [18]:
direct_embedding.shape

(1, 4096)

In [19]:
import numpy as np

chroma_numpy = chroma_embedding['embeddings'][0]
print(direct_embedding.shape)
print(chroma_numpy.shape)
print(np.array_equal(direct_embedding, chroma_numpy))

(1, 4096)
(1, 4096)
True
