In [16]:
import torch
from adapters import AutoAdapterModel
from transformers import AutoTokenizer

# Pick the compute device once: prefer Apple's Metal backend, then CUDA, and
# finally the CPU, so this cell also runs on machines without an MPS device.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained("allenai/specter2_base")
# Base model
model = AutoAdapterModel.from_pretrained("allenai/specter2_base")
# Load the "allenai/specter2" adapter from the Hugging Face hub and activate it
# right away; load_adapter returns the name the adapter was registered under.
adapter_name = model.load_adapter("allenai/specter2", source="hf", set_active=True)
model.to(device)
# Switch to evaluation mode before running inference.
model.eval()
print(f"Adapter name: {adapter_name}")
print(f"Active adapters: {model.active_adapters}")

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

There are adapters available but none are activated for the forward pass.


Adapter name: [PRX]
Active adapters: Stack[[PRX]]


In [17]:
# Sample papers to embed; each record carries a "title" and an "abstract".
papers = [
    {"title": "BERT", "abstract": "We introduce a new language representation model called BERT"},
    {
        "title": "Attention is all you need",
        "abstract": " The dominant sequence transduction models are based on complex recurrent or convolutional neural networks",
    },
]

# concatenate title and abstract (a missing or empty abstract becomes "")
text_batch = [d["title"] + tokenizer.sep_token + (d.get("abstract") or "") for d in papers]
# preprocess the input
inputs = tokenizer(
    text_batch, padding=True, truncation=True, return_tensors="pt", return_token_type_ids=False, max_length=512
)
# Send the batch to whichever device the model already lives on instead of
# hard-coding a device name.
inputs = inputs.to(model.device)
# Inference only: disable autograd so no computation graph is built or kept.
with torch.no_grad():
    output = model(**inputs)
# take the first token of each sequence as that text's embedding
embeddings = output.last_hidden_state[:, 0, :]
print(embeddings.shape)

torch.Size([2, 768])


In [4]:
# Plain-text documents to embed with the currently active adapter.
docs = [
    "This is a document about the BGE model.",
    "This document discusses the AstroBERT model."
]

inputs = tokenizer(docs, padding=True, truncation=True, return_tensors="pt", return_token_type_ids=False, max_length=512)
# Keep the returned batch (as the paper-embedding cell does) and follow the
# model's own device instead of hard-coding a device name.
inputs = inputs.to(model.device)
# Inference only: disable autograd so no computation graph is built or kept.
with torch.no_grad():
    output = model(**inputs)
# The first token of each sequence serves as that document's embedding.
embeddings = output.last_hidden_state[:, 0, :]
print(embeddings.shape)

torch.Size([2, 768])


In [7]:
# Let's try with the ad hoc adapter

# Remember where the model currently lives so that, after loading, the whole
# model (new adapter included) is placed on that same device rather than on a
# hard-coded one.
target_device = model.device
adapter_name = model.load_adapter("allenai/specter2_adhoc_query", source="hf", set_active=True)
model.to(target_device)
print(f"Adapter name: {adapter_name}")
print(f"Adapters in use: {model.active_adapters}")

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Overwriting existing adapter '[QRY]'.


Adapter name: [QRY]
Adapters in use: Stack[[QRY]]


In [8]:
# Short query strings to embed with the currently active adapter.
queries = [
    "This is a query",
    "This is another query"
]
inputs = tokenizer(queries, padding=True, truncation=True, return_tensors="pt", return_token_type_ids=False, max_length=512)
# Keep the returned batch (as the paper-embedding cell does) and follow the
# model's own device instead of hard-coding a device name.
inputs = inputs.to(model.device)
# Inference only: disable autograd so no computation graph is built or kept.
with torch.no_grad():
    output = model(**inputs)
# The first token of each sequence serves as that query's embedding.
embeddings = output.last_hidden_state[:, 0, :]
print(embeddings.shape)

torch.Size([2, 768])


In [9]:
# Euclidean (L2) length of every embedding row, one value per input text
import numpy as np
np.linalg.norm(embeddings.detach().cpu().numpy(), ord=2, axis=1)

array([22.817785, 22.941668], dtype=float32)

In [12]:
# Euclidean (L2) length of the second embedding only (row index 1)
np.linalg.norm(embeddings[1].detach().cpu().numpy(), ord=2)

np.float32(22.941668)

In [21]:
import torch

# Rescale every embedding row to unit Euclidean length.
normed = torch.nn.functional.normalize(embeddings, dim=1, p=2)

# Sanity check: each row norm should now print as approximately 1.
print(np.linalg.norm(normed.detach().cpu().numpy(), ord=2, axis=1))

[0.99999994 1.0000001 ]


In [15]:
# Membership check: is the '[QRY]' adapter listed among the model's active adapters?
'[QRY]' in model.active_adapters

True

In [22]:
# Inspect the Python type obtained after detaching `normed` and copying it to the CPU.
type(normed.detach().cpu())

torch.Tensor

In [23]:
# Display the hidden size recorded in the model's configuration.
model.config.hidden_size

768