In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ['CUDA_VISIBLE_DEVICES'] = '3'

In [5]:
import torch
from transformers import AutoModel, AutoConfig, AutoTokenizer,  AutoModelForMaskedLM
from sentence_transformers import SentenceTransformer
import torch.nn.functional as F
from data_modules.mind_aspect_data import AspectNewsBatch, MINDAspectDataModule
from modules.aspect_enc import AspectRepr


In [77]:
tokenizer = AutoTokenizer.from_pretrained('nickprock/ModernBERT-large-sts')
text = [
    "Unexpected Reasons Why You Might Have A Fever	Feeling a little heated? It's typically nothing to worry about ― especially if you're sick. However, other issues can cause your temperature to rise.",
    "Brain scans don't lie: The minds of girls and boys are equal in math	Several studies have already debunked the myth that boys are innately better at math than girls, and new brain images offer more proof",
    "Ken Fisher has a side bet on a risky corner of Wall Street	Billionaire Ken Fisher made his name and fortune picking stocks. But over the years he's also become a huge player in an arcane -- and controversial -- corner of Wall Street: exchange-traded notes."
]
query_texts = [
    "Australia deports woman to Vietnam over smuggled pork	CANBERRA, Australia (AP)   Australia for the first time has canceled a tourist's visa over undeclared food as the country tries to keep itself free of African swine fever.",
    "Exercising More Past 60 Cuts Risk Of Stroke, Heart Disease	The elderly are supposed to exercise more if they want to live longer, according to a new study.",
    "FaceTime chats and the chance to change IU's fortunes lured Tiawan Mullen to Indiana	Tiawan Mullen had nearly two dozen offers; he chose IU for the chance 'to change everything.'"
]
# Create 5 queries: some similar, some dissimilar to the texts in `text`
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenized_texts = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
tokenized_texts = tokenized_texts.to(device)
tokenized_queries = tokenizer(query_texts, padding=True, truncation=True, return_tensors='pt')
tokenized_queries = tokenized_queries.to(device)
batch_texts = AspectNewsBatch(news={'text': tokenized_texts}, labels=None)
batch_queries = AspectNewsBatch(news={'text': tokenized_queries}, labels=None)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [81]:
model = AspectRepr.load_from_checkpoint('/home/users1/hardy/hardy/project/vae/checkpoints/aspect_cat_sts-epoch=17-val_loss=1.7645.ckpt')  # Replace with your checkpoint path
model.to(device)

with torch.no_grad():
    text_embeddings = model(batch_texts)[0].cpu()
    query_embeddings = model(batch_queries)[0].cpu()
# Compute cosine similarity between each query and each text embedding
# text_embeddings = text_embeddings - text_embeddings.mean(dim=0, keepdim=True)
# query_embeddings = query_embeddings - query_embeddings.mean(dim=0, keepdim=True)
similarity_matrix__cat_sts = F.cosine_similarity(
    text_embeddings.unsqueeze(1),  # shape: (5, 1, hidden_dim)
    query_embeddings.unsqueeze(0),   # shape: (1, 5, hidden_dim)
    dim=-1
)  # shape: (5, 5)

print(similarity_matrix__cat_sts)
print(similarity_matrix__cat_sts.shape)

tensor([[0.0856, 0.8785, 0.0258],
        [0.0853, 0.5327, 0.1114],
        [0.1065, 0.1819, 0.0523]])
torch.Size([3, 3])


In [82]:
model = AspectRepr.load_from_checkpoint(plm_name='nickprock/ModernBERT-large-sts')  # Replace with your checkpoint path
model.to(device)

with torch.no_grad():
    text_embeddings = model(batch_texts)[0].cpu()
    query_embeddings = model(batch_queries)[0].cpu()
# Compute cosine similarity between each query and each text embedding
# text_embeddings = text_embeddings - text_embeddings.mean(dim=0, keepdim=True)
# query_embeddings = query_embeddings - query_embeddings.mean(dim=0, keepdim=True)
similarity_matrix_sts = F.cosine_similarity(
    text_embeddings.unsqueeze(1),  # shape: (5, 1, hidden_dim)
    query_embeddings.unsqueeze(0),   # shape: (1, 5, hidden_dim)
    dim=-1
)  # shape: (5, 5)

print(similarity_matrix_sts)
print(similarity_matrix_sts.shape)

tensor([[0.4911, 0.5301, 0.5458],
        [0.3858, 0.4863, 0.5057],
        [0.5627, 0.4540, 0.6066]])
torch.Size([3, 3])


In [85]:
model = AspectRepr.load_from_checkpoint('/home/users1/hardy/hardy/project/vae/checkpoints/aspect_cat-epoch=23-val_loss=0.1483.ckpt')  # Replace with your checkpoint path
model.to(device)

with torch.no_grad():
    text_embeddings = model(batch_texts)[0].cpu()
    query_embeddings = model(batch_queries)[0].cpu()
# Compute cosine similarity between each query and each text embedding
# text_embeddings = text_embeddings - text_embeddings.mean(dim=0, keepdim=True)
# query_embeddings = query_embeddings - query_embeddings.mean(dim=0, keepdim=True)
similarity_matrix__cat = F.cosine_similarity(
    text_embeddings.unsqueeze(1),  # shape: (5, 1, hidden_dim)
    query_embeddings.unsqueeze(0),   # shape: (1, 5, hidden_dim)
    dim=-1
)  # shape: (5, 5)

print(similarity_matrix__cat)
print(similarity_matrix__cat.shape)

tensor([[-0.0608,  0.8463,  0.0968],
        [ 0.2161,  0.5786,  0.2868],
        [-0.1097,  0.2985,  0.1948]])
torch.Size([3, 3])


In [87]:
from scipy.stats import pearsonr

# Flatten the matrices to 1D arrays
flat_sts = similarity_matrix_sts.flatten().numpy()
flat_cat_sts = similarity_matrix__cat_sts.flatten().numpy()

# Compute Pearson correlation
corr, p_value = pearsonr(flat_sts, flat_cat_sts)
print("Pearson correlation:", corr)
print("p-value:", p_value)

Pearson correlation: -0.11734262
p-value: 0.7636776054503343


In [53]:


# Compute cosine similarity between each query and each text embedding
similarity_matrix = F.cosine_similarity(
    text_embeddings.unsqueeze(1),  # shape: (5, 1, hidden_dim)
    query_embeddings.unsqueeze(0),   # shape: (1, 5, hidden_dim)
    dim=-1
)  # shape: (5, 5)

print(similarity_matrix)
print(similarity_matrix.shape)

tensor([[0.7774, 0.4657, 0.5165, 0.4903, 0.4036, 0.4034, 0.4367],
        [0.4476, 0.9223, 0.3963, 0.5841, 0.3943, 0.3956, 0.4614],
        [0.5431, 0.5897, 0.3965, 0.7107, 0.4081, 0.4401, 0.4518],
        [0.5719, 0.6070, 0.4026, 0.9721, 0.4467, 0.3462, 0.4645],
        [0.4630, 0.6549, 0.4077, 0.5781, 0.4336, 0.3734, 0.4852],
        [0.4351, 0.3932, 0.2597, 0.3564, 0.3210, 0.9712, 0.3807],
        [0.4661, 0.5065, 0.3939, 0.5059, 0.3938, 0.3232, 0.4728]])
torch.Size([7, 7])


In [58]:


# Compute cosine similarity between each query and each text embedding
similarity_matrix = F.cosine_similarity(
    text_embeddings.unsqueeze(1),  # shape: (5, 1, hidden_dim)
    query_embeddings.unsqueeze(0),   # shape: (1, 5, hidden_dim)
    dim=-1
)  # shape: (5, 5)

print(similarity_matrix)
print(similarity_matrix.shape)

tensor([[ 0.5747,  0.3004,  0.5111,  0.1979,  0.2439,  0.5848,  0.2136],
        [ 0.2841,  0.7830,  0.1380,  0.5863,  0.1190,  0.3566,  0.2754],
        [ 0.1427,  0.4481,  0.0093,  0.8223,  0.0607,  0.4142,  0.0732],
        [ 0.1785,  0.4405,  0.0284,  0.9386,  0.0641,  0.4007,  0.0352],
        [ 0.1281,  0.5254, -0.0067,  0.7838,  0.0876,  0.3851,  0.0666],
        [ 0.4195,  0.3483,  0.2975,  0.3971,  0.2493,  0.9454,  0.2192],
        [ 0.4221,  0.3663,  0.3560,  0.3708,  0.1997,  0.4435,  0.4579]])
torch.Size([7, 7])


tensor([[ 0.4373, -0.2444,  0.1924, -0.2702,  0.0163, -0.0143, -0.0626],
        [-0.1138,  0.4727, -0.1028,  0.0852, -0.0200, -0.2261, -0.1003],
        [ 0.0440,  0.0956, -0.0512,  0.1628, -0.0326, -0.1186, -0.0892],
        [-0.0504, -0.0135, -0.0799,  0.6442, -0.0599, -0.2328, -0.1963],
        [-0.1419,  0.3530, -0.0690,  0.1569, -0.0131, -0.2202, -0.0735],
        [-0.1203, -0.2846, -0.0717, -0.3513,  0.0657,  0.7930, -0.0609],
        [-0.1026, -0.1626,  0.1232, -0.2273,  0.0182, -0.1675,  0.4878]])
torch.Size([7, 7])


In [68]:
text_embeddings = text_embeddings - text_embeddings.mean(dim=0, keepdim=True)
query_embeddings = query_embeddings - query_embeddings.mean(dim=0, keepdim=True)
# Compute cosine similarity between each query and each text embedding
similarity_matrix = F.cosine_similarity(
    text_embeddings.unsqueeze(1),  # shape: (5, 1, hidden_dim)
    query_embeddings.unsqueeze(0),   # shape: (1, 5, hidden_dim)
    dim=-1
)  # shape: (5, 5)

print(similarity_matrix)
print(similarity_matrix.shape)

tensor([[ 0.4373, -0.2444,  0.1924, -0.2702,  0.0163, -0.0143, -0.0626],
        [-0.1138,  0.4727, -0.1028,  0.0852, -0.0200, -0.2261, -0.1003],
        [ 0.0440,  0.0956, -0.0512,  0.1628, -0.0326, -0.1186, -0.0892],
        [-0.0504, -0.0135, -0.0799,  0.6442, -0.0599, -0.2328, -0.1963],
        [-0.1419,  0.3530, -0.0690,  0.1569, -0.0131, -0.2202, -0.0735],
        [-0.1203, -0.2846, -0.0717, -0.3513,  0.0657,  0.7930, -0.0609],
        [-0.1026, -0.1626,  0.1232, -0.2273,  0.0182, -0.1675,  0.4878]])
torch.Size([7, 7])


In [5]:
model_name =  "nickprock/ModernBERT-large-sts"

In [7]:
config = AutoConfig.from_pretrained(model_name)
config.update({"device_map": "cuda:0", "torch_dtype": torch.bfloat16})
text_encoder = AutoModel.from_pretrained(model_name, config=config)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [8]:
tokens = tokenizer(
            text, return_tensors="pt", return_token_type_ids=False, padding=True, truncation=True
        )

outputs = text_encoder(**tokens)
text_embeddings = outputs.last_hidden_state[:, 0, :]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [9]:
query_tokens = tokenizer(
            query_texts, return_tensors="pt", return_token_type_ids=False, padding=True, truncation=True
        )
query_outputs = text_encoder(**query_tokens)
query_embeddings = query_outputs.last_hidden_state[:, 0, :]

In [10]:


# Compute cosine similarity between each query and each text embedding
similarity_matrix = F.cosine_similarity(
    text_embeddings.unsqueeze(1),  # shape: (5, 1, hidden_dim)
    query_embeddings.unsqueeze(0),   # shape: (1, 5, hidden_dim)
    dim=-1
)  # shape: (5, 5)

print(similarity_matrix)
print(similarity_matrix.shape)

tensor([[0.7774, 0.4657, 0.5165, 0.4903, 0.4036, 0.4034],
        [0.4476, 0.9223, 0.3963, 0.5841, 0.3943, 0.3956],
        [0.5431, 0.5897, 0.3965, 0.7107, 0.4081, 0.4401],
        [0.5719, 0.6070, 0.4026, 0.9721, 0.4467, 0.3462],
        [0.4630, 0.6549, 0.4077, 0.5781, 0.4336, 0.3734],
        [0.4351, 0.3932, 0.2597, 0.3564, 0.3210, 0.9712]],
       grad_fn=<SumBackward1>)
torch.Size([6, 6])


In [9]:
model_name = "Qwen/Qwen3-Embedding-4B"
config = AutoConfig.from_pretrained(model_name)
config.update({"attn_implementation": "flash_attention_2", "device_map": "auto", "torch_dtype": torch.bfloat16})
text_encoder = AutoModel.from_pretrained(model_name, config=config)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
tokens = tokenizer(
            text, return_tensors="pt", return_token_type_ids=False, padding=True, truncation=True
        )

outputs = text_encoder(**tokens)
text_embeddings = outputs.last_hidden_state[:, 0, :]


In [14]:
query_tokens = tokenizer(
            query_texts, return_tensors="pt", return_token_type_ids=False, padding=True, truncation=True
        )
query_outputs = text_encoder(**query_tokens)
query_embeddings = query_outputs.last_hidden_state[:, 0, :]

In [15]:


# Compute cosine similarity between each query and each text embedding
similarity_matrix = F.cosine_similarity(
    text_embeddings.unsqueeze(1),  # shape: (5, 1, hidden_dim)
    query_embeddings.unsqueeze(0),   # shape: (1, 5, hidden_dim)
    dim=-1
)  # shape: (5, 5)

print(similarity_matrix)
print(similarity_matrix.shape)

tensor([[1.0000, 0.9182, 0.7954, 0.7907, 1.0000, 0.9313],
        [0.8799, 0.9548, 0.8821, 0.9011, 0.8799, 0.9004],
        [0.7686, 0.8889, 0.8978, 0.9188, 0.7686, 0.8149],
        [0.7907, 0.8817, 0.8911, 1.0000, 0.7907, 0.8351],
        [0.8653, 0.9553, 0.8971, 0.9160, 0.8653, 0.8965],
        [0.9313, 0.9085, 0.8730, 0.8351, 0.9313, 1.0000]],
       grad_fn=<SumBackward1>)
torch.Size([6, 6])


In [6]:
model = SentenceTransformer(
    "Qwen/Qwen3-Embedding-4B",
    model_kwargs={"attn_implementation": "flash_attention_2", "device_map": "auto", "torch_dtype": torch.bfloat16},
    tokenizer_kwargs={"padding_side": "left"},
)

KeyboardInterrupt: 

In [11]:
query_embeddings = model.encode(query_texts)
document_embeddings = model.encode(text, convert_to_tensor=True)

In [1]:
model = AutoModel.from_pretrained("Qwen/Qwen3-Embedding-4B")

NameError: name 'AutoModel' is not defined

In [24]:
similarity = model.similarity(query_embeddings, document_embeddings)
print(similarity)

tensor([[0.7837, 0.4297, 0.4470, 0.5315, 0.4269, 0.2790],
        [0.3465, 0.8922, 0.4865, 0.5247, 0.5822, 0.1880],
        [0.4432, 0.4472, 0.3335, 0.4556, 0.3956, 0.1639],
        [0.3883, 0.4413, 0.5320, 0.9348, 0.4025, 0.1411],
        [0.4616, 0.3829, 0.3040, 0.3573, 0.3999, 0.2902],
        [0.3018, 0.1434, 0.1413, 0.1385, 0.1579, 0.9171]])
