In [1]:
import torch
import numpy as np
import time

In [2]:
# Per-model outputs, keyed by model name; each later cell adds one entry.
results = {}

# Short reference descriptions that every model ranks against the proposal.
candidate_texts = [
    "The grasp is on the handle of the mug, from the side.",
    (
        "Grasp is from above, on the rim of the mug. "
        "The grasp is perpendicular to the opening of the mug."
    ),
    "The grasp is on the body of the mug, from the side, oriented parallel to the base of the mug.",
]

# Free-form grasp proposal used as the query for every model.
proposal_text = (
    "The mug should be grasped around its cylindrical body, approximately halfway up from the base to the rim. "
    "Orient the parallel gripper fingers horizontally, adjusting them so that both fingers are equidistant "
    "from either side of the mug, ensuring a firm hold. "
    "Avoid the area near the handle to maintain a balanced grip."
)

In [3]:
from transformers import CLIPProcessor, CLIPModel

# The processor (tokenizer) and the model must come from the same checkpoint;
# naming it once keeps the two loads from drifting apart.
CLIP_CHECKPOINT = 'openai/clip-vit-base-patch32'

clip_processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)
clip_model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)

def encode_clip(texts):
    """Embed a string or a list of strings with the CLIP text encoder.

    A bare string is treated as a one-element batch. The whole batch is
    tokenized in a single call (padded and truncated) and passed to
    ``clip_model.get_text_features`` with gradients disabled.

    Returns:
        The text features with one extra leading axis of size 1, i.e. the
        same result as stacking a one-element list holding the features.
    """
    batch = [texts] if isinstance(texts, str) else texts
    with torch.no_grad():
        tokenized = clip_processor(batch, return_tensors='pt', padding=True, truncation=True)
        features = clip_model.get_text_features(**tokenized)
    # Callers rely on the leading singleton axis, so keep it.
    return features.unsqueeze(0)

# Time only the two encoding calls; the similarity arithmetic is excluded.
start = time.perf_counter()
candidates = encode_clip(candidate_texts)
proposal = encode_clip(proposal_text)
elapsed = time.perf_counter() - start

# Cosine similarity: dot product of each candidate with the proposal,
# divided by the product of their norms.
dot_products = candidates @ proposal.reshape(-1, 1)
norm_products = torch.norm(candidates, dim=-1, keepdim=True) * torch.norm(proposal)
scores = (dot_products / norm_products).cpu().numpy().flatten()
results["clip"] = {"scores": scores, "elapsed": elapsed}

In [4]:
from sentence_transformers import SentenceTransformer

# Load the "all-mpnet-base-v2" SentenceTransformer checkpoint used by encode_mpnet below.
mpnet = SentenceTransformer("all-mpnet-base-v2")

def encode_mpnet(text):
    """Embed ``text`` with the MPNet sentence-transformer.

    ``text`` is forwarded unchanged to ``mpnet.encode`` with
    ``convert_to_tensor=True``; the call runs with gradients disabled.
    """
    with torch.no_grad():
        embedding = mpnet.encode(text, convert_to_tensor=True)
    return embedding

# Time only the two encoding calls; the similarity arithmetic is excluded.
start = time.perf_counter()
candidates = encode_mpnet(candidate_texts)
proposal = encode_mpnet(proposal_text)
elapsed = time.perf_counter() - start

# Cosine similarity: dot product of each candidate with the proposal,
# divided by the product of their norms.
dot_products = candidates @ proposal.reshape(-1, 1)
norm_products = torch.norm(candidates, dim=-1, keepdim=True) * torch.norm(proposal)
scores = (dot_products / norm_products).cpu().numpy().flatten()
results["mpnet"] = {"scores": scores, "elapsed": elapsed}

In [5]:
from sentence_transformers import CrossEncoder

# Score the proposal against all candidates with a cross-encoder.
# Model construction is kept outside the timed region; only rank() is timed.
crossencoder = CrossEncoder('cross-encoder/stsb-roberta-large')
start = time.perf_counter()
ranking = crossencoder.rank(proposal_text, candidate_texts)
elapsed = time.perf_counter() - start

# Each ranking entry carries a "score" and the "corpus_id" index of the
# candidate it refers to. Scatter the scores back into candidate order so
# this array lines up with the other models' score arrays.
# The loop names deliberately avoid `id`: at notebook top level that would
# overwrite the builtin for every later cell.
ranked_scores = [hit["score"] for hit in ranking]
ranked_ids = [hit["corpus_id"] for hit in ranking]
scores = np.zeros(len(candidate_texts))
for corpus_id, score in zip(ranked_ids, ranked_scores):
    scores[corpus_id] = score
results["crossencoder"] = {"scores": scores, "elapsed": elapsed}

In [6]:
from sentence_transformers import SentenceTransformer

# Instruction prefix supplied as the prompt when embedding the query side.
query_prefix = (
    "Instruct: Given a description of a grasp, retrieve grasp descriptions "
    "that describe similar grasps on similar objects\n"
    "Query: "
)

# Load the model only when this kernel does not already hold it, so
# re-running the cell skips the reload.
try:
    nv_embed
except NameError:
    nv_embed = SentenceTransformer('nvidia/NV-Embed-v2', trust_remote_code=True)
    nv_embed.max_seq_length = 32768
    nv_embed.tokenizer.padding_side = "right"

def add_eos(input_examples):
    """Return a new list with the NV-Embed tokenizer's EOS token appended to each string.

    The input list is not modified.
    """
    with_eos = []
    for example in input_examples:
        with_eos.append(example + nv_embed.tokenizer.eos_token)
    return with_eos

# Embed the query (with its instruction prompt) and the candidate passages,
# timing both encode calls together.
batch_size = 2
with torch.no_grad():
    start = time.perf_counter()
    query_embeddings = nv_embed.encode(
        add_eos([proposal_text]),
        batch_size=batch_size,
        prompt=query_prefix,
        normalize_embeddings=True,
    )
    passage_embeddings = nv_embed.encode(
        add_eos(candidate_texts),
        batch_size=batch_size,
        normalize_embeddings=True,
    )
    elapsed = time.perf_counter() - start

# Both sides were requested with normalize_embeddings=True, so the plain
# dot product is used as the similarity score.
scores = (query_embeddings @ passage_embeddings.T).flatten()
results["nv-embed"] = {"scores": scores, "elapsed": elapsed}

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

  self.gen = func(*args, **kwds)


In [7]:
# Echo the inputs so the rankings below can be read against them.
print(f"Proposal: {proposal_text}")
print("Candidates:")
for number, candidate in enumerate(candidate_texts, start=1):
    print(f"\t{number}: {candidate}")
print()

# One section per model: raw scores, 1-based candidate ranking
# (highest score first), and the recorded elapsed time.
for model_name, summary in results.items():
    model_scores = summary["scores"]
    best_first = np.argsort(model_scores)[::-1] + 1
    print(model_name)
    print(f"\tScores: {model_scores}")
    print(f"\tRanking: {best_first}")
    print(f"\tElapsed: {summary['elapsed']}")

Proposal: The mug should be grasped around its cylindrical body, approximately halfway up from the base to the rim. Orient the parallel gripper fingers horizontally, adjusting them so that both fingers are equidistant from either side of the mug, ensuring a firm hold. Avoid the area near the handle to maintain a balanced grip.
Candidates:
	1: The grasp is on the handle of the mug, from the side.
	2: Grasp is from above, on the rim of the mug. The grasp is perpendicular to the opening of the mug.
	3: The grasp is on the body of the mug, from the side, oriented parallel to the base of the mug.

clip
	Scores: [0.8430011 0.8193814 0.8449857]
	Ranking: [3 1 2]
	Elapsed: 0.05918162688612938
mpnet
	Scores: [0.7771651  0.74865735 0.8147641 ]
	Ranking: [3 1 2]
	Elapsed: 0.810763094574213
crossencoder
	Scores: [0.57904649 0.60282588 0.72182417]
	Ranking: [3 2 1]
	Elapsed: 0.02026798017323017
nv-embed
	Scores: [0.5280144  0.53775704 0.6598767 ]
	Ranking: [3 2 1]
	Elapsed: 0.3232739083468914
