In [1]:
!pip install -q torch transformers sentence-transformers

import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np
from typing import List

In [2]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

In [3]:
def mean_pool(last_hidden_state: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """
    Average the token embeddings of each sequence, ignoring masked-out positions.

    last_hidden_state: [batch, seq_len, hidden]  per-token embeddings
    attention_mask:    [batch, seq_len]          non-zero where a token counts
    returns:           [batch, hidden]           one pooled vector per sequence
    """
    # Add a trailing axis and match the embeddings' tensor type so the mask
    # can be multiplied against them: [batch, seq_len, 1].
    weights = attention_mask[..., None].type_as(last_hidden_state)

    # Masked positions are multiplied by zero and drop out of the sum.
    token_sum = torch.sum(last_hidden_state * weights, dim=1)       # [batch, hidden]

    # Number of contributing tokens; the clamp keeps a fully masked row from
    # dividing by zero.
    token_count = torch.clamp(weights.sum(dim=1), min=1e-9)         # [batch, 1]

    return token_sum / token_count

def l2_normalize(x: np.ndarray) -> np.ndarray:
    """
    Scale each row of `x` to (approximately) unit L2 norm.

    x:       2-D array, one vector per row
    returns: array of the same shape with every row divided by its norm
    """
    row_norms = np.linalg.norm(x, axis=1, keepdims=True)   # [n, 1]
    # A tiny epsilon keeps all-zero rows from producing a division by zero.
    safe_norms = row_norms + 1e-12
    return x / safe_norms

def cosine_sim(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    """
    Pairwise cosine similarity between the rows of `a` and the rows of `b`.

    a:       [n, dim]
    b:       [m, dim]
    returns: [n, m], where entry (i, j) is the cosine of the angle between
             a[i] and b[j]
    """
    # Epsilon guards against all-zero rows.
    a_norms = np.linalg.norm(a, axis=1, keepdims=True) + 1e-12   # [n, 1]
    b_norms = np.linalg.norm(b, axis=1, keepdims=True) + 1e-12   # [m, 1]

    # b's norms must be transposed to [1, m] so the denominator broadcasts to
    # [n, m] with entry (i, j) = |a[i]| * |b[j]|. Multiplying the two column
    # vectors directly would pair a[i] with b[i] and fail whenever n != m.
    return (a @ b.T) / (a_norms * b_norms.T)


In [4]:
# Checkpoint identifier used for both the tokenizer and the encoder.
bert_name = "bert-base-uncased"

# Tokenizer matching the checkpoint.
bert_tok = AutoTokenizer.from_pretrained(bert_name)

# Load the encoder, move it to the selected device, and switch it to eval mode.
bert_model = AutoModel.from_pretrained(bert_name)
bert_model = bert_model.to(device)
bert_model = bert_model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [8]:
# Example inputs: two sentences about the ADSP course and one about transformers.
sentences = [
    "The ADSP course is taken at Polito.",
    "Students attend the ADSP course at Politecnico di Torino.",
    "Transformers enable large language models.",
]

# Tokenize all sentences as one padded batch of PyTorch tensors, then move
# the batch to the same device as the model.
batch = bert_tok(
    sentences,
    padding=True,
    truncation=True,
    return_tensors="pt",
)
batch = batch.to(device)

# Forward pass only, so gradient tracking is switched off.
with torch.no_grad():
    # BaseModelOutputWithPoolingAndCrossAttentions
    outputs = bert_model(**batch)

    # Per-token embeddings: [B, L, H]
    token_embeddings = outputs.last_hidden_state

    # Sentence vector, option 1: the embedding at position 0 ([CLS]): [B, H]
    cls_embeddings = token_embeddings[:, 0]

    # Sentence vector, option 2: mask-aware mean over the tokens: [B, H]
    mean_embeddings = mean_pool(token_embeddings, batch["attention_mask"])

# Move to host memory as NumPy arrays and L2-normalize each row
# (common practice before computing similarities).
cls_np = l2_normalize(cls_embeddings.cpu().numpy())
mean_np = l2_normalize(mean_embeddings.cpu().numpy())

print("CLS shape:", cls_np.shape, "MEAN shape:", mean_np.shape)

cls_np, mean_np


CLS shape: (3, 768) MEAN shape: (3, 768)


(array([[-0.01360141,  0.00203705,  0.02282828, ..., -0.00616113,
          0.03210211,  0.04206397],
        [-0.00466807,  0.01044108,  0.01059336, ..., -0.01073389,
          0.04638067,  0.05928184],
        [-0.04557362, -0.01178286, -0.00441295, ..., -0.03701726,
         -0.02091555,  0.01471345]], dtype=float32),
 array([[-0.00526124, -0.01783407,  0.01272741, ...,  0.02429543,
          0.02708768,  0.02878738],
        [ 0.02204404, -0.00230697,  0.01932801, ...,  0.00419975,
          0.02615163,  0.07923935],
        [-0.00316657, -0.02215518,  0.01316765, ..., -0.03146913,
         -0.02398103, -0.01835394]], dtype=float32))

In [6]:
# Similarity of every mean-pooled sentence embedding against every other one.
S = cosine_sim(mean_np, mean_np)

for i in range(len(sentences)):
    s = sentences[i]
    print(f"\nSentence: {s}")

    # Negating the row makes argsort list indices from most to least similar.
    top = np.argsort(-S[i])

    # Report the three highest-scoring sentences (the sentence itself included).
    for j in top[:3]:
        print(f"  sim={S[i, j]:.3f}  ->  {sentences[j]}")


Query: The ADSP course is taken at Polito.
  sim=1.000  ->  The ADSP course is taken at Polito.
  sim=0.760  ->  Students attend the ADSP course at Politecnico di Torino.
  sim=0.533  ->  Transformers enable large language models.

Query: Students attend the ADSP course at Politecnico di Torino.
  sim=1.000  ->  Students attend the ADSP course at Politecnico di Torino.
  sim=0.760  ->  The ADSP course is taken at Polito.
  sim=0.488  ->  Transformers enable large language models.

Query: Transformers enable large language models.
  sim=1.000  ->  Transformers enable large language models.
  sim=0.533  ->  The ADSP course is taken at Polito.
  sim=0.488  ->  Students attend the ADSP course at Politecnico di Torino.
