In [36]:
from datasets import load_dataset
import tqdm as notebook_tqdm

In [37]:
# Load the knowledge-graph triples from the HuggingFace Hub.
# NOTE(review): the hub id below is "VLyb/FB15k" (FB15k), not FB15k-237 as this
# comment previously said — confirm which benchmark is intended.
dataset = load_dataset("VLyb/FB15k")

# Keep a handle on each split; rows are dicts with 'head', 'relation' and 'tail' keys.
train_triples = dataset["train"]
test_triples = dataset["test"]
valid_triples = dataset["validation"]

In [38]:
# Peek at one raw training row to check the field names before building sentences.
print(train_triples[0])

{'head': '/m/027rn', 'relation': '/location/country/form_of_government', 'tail': '/m/06cx9'}


In [39]:
from transformers import AutoTokenizer, AutoModel

# Load the pretrained ModernBERT-base tokenizer and the bare encoder (AutoModel, no task head).
# NOTE(review): trust_remote_code=True permits Python code shipped in the hub repository to
# run at load time; presumably unnecessary when the installed transformers version provides
# ModernBERT natively — confirm and drop if so.
tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base", trust_remote_code=True)
model = AutoModel.from_pretrained("answerdotai/ModernBERT-base", trust_remote_code=True)

In [40]:
# Switch the encoder to evaluation mode before using it for scoring.
model.eval()

ModernBertModel(
  (embeddings): ModernBertEmbeddings(
    (tok_embeddings): Embedding(50368, 768, padding_idx=50283)
    (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (drop): Dropout(p=0.0, inplace=False)
  )
  (layers): ModuleList(
    (0): ModernBertEncoderLayer(
      (attn_norm): Identity()
      (attn): ModernBertAttention(
        (Wqkv): Linear(in_features=768, out_features=2304, bias=False)
        (rotary_emb): ModernBertRotaryEmbedding()
        (Wo): Linear(in_features=768, out_features=768, bias=False)
        (out_drop): Identity()
      )
      (mlp_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): ModernBertMLP(
        (Wi): Linear(in_features=768, out_features=2304, bias=False)
        (act): GELUActivation()
        (drop): Dropout(p=0.0, inplace=False)
        (Wo): Linear(in_features=1152, out_features=768, bias=False)
      )
    )
    (1-21): 21 x ModernBertEncoderLayer(
      (attn_norm): LayerNorm((768,), eps=1e-05, e

In [41]:
def triple_to_sentence2(triple):
    """Render a triple mapping as the text ``"<head> <relation> <tail>."``.

    Args:
        triple: Mapping with ``'head'``, ``'relation'`` and ``'tail'`` keys.

    Returns:
        The three values separated by single spaces and terminated by a period.
    """
    return f"{triple['head']} {triple['relation']} {triple['tail']}."

def triple_to_sentence(h, r, t):
    """Join head, relation and tail into one space-separated sentence ending in a period.

    Args:
        h: Head entity.
        r: Relation.
        t: Tail entity.

    Returns:
        The string ``"<h> <r> <t>."``.
    """
    sentence = "{} {} {}.".format(h, r, t)
    return sentence

# Sanity check: render the first training triple as a sentence.
s = triple_to_sentence2(train_triples[0])
print(s) 

/m/027rn /location/country/form_of_government /m/06cx9.


In [None]:
import torch

@torch.no_grad()
def score_sentence(sentence):
    """Score a sentence as the L2 norm of its first-token embedding.

    Uses the module-level ``tokenizer`` and ``model``; gradients are disabled
    by the decorator.

    Args:
        sentence: Text to encode.

    Returns:
        The norm as a Python float.
    """
    encoded = tokenizer(sentence, return_tensors="pt")
    hidden_states = model(**encoded).last_hidden_state
    first_token = hidden_states[:, 0]  # shape: [1, hidden_size]
    # Collapse the embedding to a single scalar per sentence.
    return first_token.norm(p=2, dim=1).item()

In [43]:
# Collect every entity that appears as a head or a tail in any of the three splits.
entity_set = set()

for split in (train_triples, test_triples, valid_triples):
    # Read whole columns instead of building one dict per row.
    entity_set.update(split["head"])
    entity_set.update(split["tail"])

# Sort before freezing into a list: the iteration order of a set of strings changes
# between interpreter runs, which would make seeded random.choice draws irreproducible.
all_entities = sorted(entity_set)

In [44]:
import random

# corrupt either the head or the tail
def corrupt_triple(triple, entity_list):
    """Build a negative example by replacing the head or the tail of a triple.

    The side to corrupt is chosen with probability 0.5 each. The replacement is
    drawn uniformly from the entities in ``entity_list`` that differ from the
    entity being replaced, so the result never equals the input triple.

    Note: the corrupted triple is not checked against the set of known true
    triples, so it may still be a valid fact.

    Args:
        triple: ``(head, relation, tail)`` sequence.
        entity_list: Non-empty sequence of candidate entities.

    Returns:
        A ``(head, relation, tail)`` tuple with exactly one entity replaced.

    Raises:
        ValueError: If ``entity_list`` holds no entity different from the one
            selected for replacement.
    """
    head, relation, tail = triple
    corrupt_tail = random.random() < 0.5
    original = tail if corrupt_tail else head

    replacement = random.choice(entity_list)
    if replacement == original:
        # Rare collision: redraw from the explicit alternatives. The overall draw
        # stays uniform over the entities that differ from the original.
        alternatives = [entity for entity in entity_list if entity != original]
        if not alternatives:
            raise ValueError(
                "entity_list contains no entity different from the one being replaced"
            )
        replacement = random.choice(alternatives)

    if corrupt_tail:
        return (head, relation, replacement)
    return (replacement, relation, tail)

In [45]:
# Pick a positive (observed) triple from the test split
pos = test_triples[0]
pos_triple = (pos["head"], pos["relation"], pos["tail"])

# Corrupt it by swapping either its head or its tail for a randomly drawn entity
neg_triple = corrupt_triple(pos_triple, all_entities)

# Convert both triples to sentences
pos_sentence = triple_to_sentence(*pos_triple)
neg_sentence = triple_to_sentence(*neg_triple)

# Score both with score_sentence (L2 norm of the first-token embedding); no part of
# this pipeline has been trained to separate observed from corrupted triples.
pos_score = score_sentence(pos_sentence)
neg_score = score_sentence(neg_sentence)

print(f"Positive score: {pos_score:.4f}")
print(f"Negative score: {neg_score:.4f}")

Positive score: 37.4577
Negative score: 37.3436


Using BERT alone, with a simple scoring function and no training, shows that the model is not attuned to this kind of task: the difference between the positive score (the triple is correct) and the negative score (the triple is corrupted) is too small to be useful. Some steps are therefore needed to improve it:

- build a dataset which contains the triples labelled (corrupted or not)

In [46]:
# Number of training triples available for building the labelled subset.
len(train_triples)

483142

In [47]:
# Show the first training row; each row exposes 'head', 'relation' and 'tail'.
train_triples[0]

{'head': '/m/027rn',
 'relation': '/location/country/form_of_government',
 'tail': '/m/06cx9'}

In [48]:
import random

# corrupt either the head or the tail
def corrupt_triple2(triple, entity_list):
    """Build a negative example from a triple given as a mapping.

    The head or the tail (probability 0.5 each) is replaced by an entity drawn
    uniformly from those in ``entity_list`` that differ from the entity being
    replaced, so the result never equals the input triple.

    Note: the corrupted triple is not checked against the set of known true
    triples, so it may still be a valid fact.

    Args:
        triple: Mapping with ``'head'``, ``'relation'`` and ``'tail'`` keys.
        entity_list: Non-empty sequence of candidate entities.

    Returns:
        A ``(head, relation, tail)`` tuple with exactly one entity replaced.

    Raises:
        ValueError: If ``entity_list`` holds no entity different from the one
            selected for replacement.
    """
    h, r, t = triple['head'], triple['relation'], triple['tail']
    corrupt_head = random.random() < 0.5
    original = h if corrupt_head else t

    replacement = random.choice(entity_list)
    if replacement == original:
        # Rare collision: redraw from the explicit alternatives. The overall draw
        # stays uniform over the entities that differ from the original.
        alternatives = [entity for entity in entity_list if entity != original]
        if not alternatives:
            raise ValueError(
                "entity_list contains no entity different from the one being replaced"
            )
        replacement = random.choice(alternatives)

    if corrupt_head:
        return (replacement, r, t)
    return (h, r, replacement)

In [49]:
# from datasets import load_dataset
# dataset = load_dataset("VLyb/FB15k")
# train_triples = dataset["train"]

N = 2000  # number of positive triples to label; each one also yields one corrupted negative

# Re-seed here so that re-running this cell regenerates the same corrupted triples.
random.seed(42)

# Use indices 0..N-1 so the first training triple is included, and never read past
# the end of the split if N is raised above its size.
num_positives = min(N, len(train_triples))

data = []
for i in range(num_positives):
    triple = train_triples[i]
    h, r, t = triple["head"], triple["relation"], triple["tail"]
    data.append((triple_to_sentence(h, r, t), 1))  # label 1: observed triple

    ch, cr, ct = corrupt_triple2(triple, all_entities)
    data.append((triple_to_sentence(ch, cr, ct), 0))  # label 0: corrupted triple

- create a torch dataset

In [None]:
# from torch.utils.data import Dataset

# # class TripleDataset(Dataset):
# #     def __init__(self, triples):
# #         self.triples = triples

# #     def __len__(self):
# #         return len(self.triples)

# #     def __getitem__(self, idx):
# #         sentence, label = self.triples[idx]
# #         return tokenizer(sentence, return_tensors='pt', truncation=True, padding='max_length', max_length=32), torch.tensor(label, dtype=torch.float)

# class TripleDataset(Dataset):
#     def __init__(self, triples):
#         self.triples = triples

#     def __len__(self):
#         return len(self.triples)

#     def __getitem__(self, idx):
#         sentence, label = self.triples[idx]
#         encoding = tokenizer(
#             sentence,
#             truncation=True,
#             padding='max_length',
#             max_length=32,
#             return_tensors='pt'
#         )

#         return {
#             "input_ids": encoding["input_ids"].squeeze(0),
#             "attention_mask": encoding["attention_mask"].squeeze(0),
#             "label": torch.tensor(label, dtype=torch.float)
#         }



In [54]:
from torch.utils.data import Dataset, DataLoader
import torch

class TripleDataset(Dataset):
    """Torch dataset over ``(sentence, label)`` pairs, tokenised on access.

    Args:
        data: Indexable collection of ``(sentence, label)`` pairs.
        tokenizer: Callable accepting ``truncation``, ``padding``, ``max_length``
            and ``return_tensors`` keyword arguments and returning a mapping with
            ``"input_ids"`` and ``"attention_mask"`` tensors that carry a leading
            batch dimension of size 1.
        max_length: Fixed token length every example is padded or truncated to.
            Defaults to 32.
    """

    def __init__(self, data, tokenizer, max_length=32):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of labelled sentences."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenise example ``idx`` and return its tensors.

        Returns:
            Dict with ``"input_ids"`` and ``"attention_mask"`` (batch dimension
            removed) and ``"label"`` as a float tensor.
        """
        sentence, label = self.data[idx]
        # Truncation removes tokens beyond max_length, and the tail entity sits at the
        # end of the sentence — raise max_length if long relation paths get cut
        # (TODO confirm against actual token counts).
        encoded = self.tokenizer(
            sentence,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        return {
            # Drop the batch dimension so the DataLoader can stack examples itself.
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0),
            # Float label, as required by BCE-style losses.
            "label": torch.tensor(label, dtype=torch.float)
        }

# NOTE(review): AutoTokenizer is already imported and `tokenizer` already loaded with the
# same arguments in an earlier cell; this reload looks redundant — confirm and drop.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base", trust_remote_code=True)

# NOTE(review): this rebinds `dataset`, which earlier held the object returned by
# load_dataset(...); re-running the split-extraction cell after this one would index a
# TripleDataset with "train". Consider a distinct name.
dataset = TripleDataset(data, tokenizer)
# Shuffled mini-batches of 16 examples for training.
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)


- add classification head

In [55]:
import torch.nn as nn

# class TripleClassifier(nn.Module):
#     def __init__(self, base_model):
#         super().__init__()
#         self.encoder = base_model
#         self.classifier = nn.Linear(base_model.config.hidden_size, 1)

#     def forward(self, input_ids, attention_mask):
#         outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
#         cls = outputs.last_hidden_state[:, 0, :]  # [CLS]
#         logits = self.classifier(cls).squeeze(-1)
#         return logits

# class TripleClassifier(nn.Module):
#     def __init__(self, base_model):
#         super().__init__()
#         self.encoder = base_model
#         self.classifier = nn.Linear(base_model.config.hidden_size, 1)

#     def forward(self, input_ids, attention_mask):
#         outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
#         cls_embedding = outputs.last_hidden_state[:, 0, :]  # shape: (batch_size, hidden_size)
#         logits = self.classifier(cls_embedding)  # shape: (batch_size, 1)
#         return logits.squeeze(1)  # shape: (batch_size,)

import torch.nn as nn
from transformers import AutoModel

class TripleClassifier(nn.Module):
    """Encoder followed by a single linear unit that emits one logit per input.

    Args:
        base_model: Encoder module exposing ``config.hidden_size`` and returning
            an object with a ``last_hidden_state`` attribute when called with
            ``input_ids`` and ``attention_mask``.
    """

    def __init__(self, base_model):
        super().__init__()
        hidden_size = base_model.config.hidden_size
        self.encoder = base_model
        self.classifier = nn.Linear(hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        """Return one raw (pre-sigmoid) logit per sequence in the batch."""
        hidden_states = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).last_hidden_state
        # The first-token embedding serves as the sequence representation.
        first_token = hidden_states[:, 0]
        return self.classifier(first_token).squeeze(-1)




In [None]:
# Load a fresh pretrained encoder and wrap it with the linear classification head.
# NOTE(review): this rebinds the global `model` that score_sentence() reads. After this
# cell, score_sentence would call a TripleClassifier, which returns logits rather than an
# object with `last_hidden_state` — confirm score_sentence is not reused below.
base_model = AutoModel.from_pretrained("answerdotai/ModernBERT-base", trust_remote_code=True)
model = TripleClassifier(base_model)

- train the model

In [None]:
# from torch.utils.data import DataLoader

# # Create an instance of your dataset
# dataset = TripleDataset(data)  # 'data' should be your list of (sentence, label) tuples

# # Create the DataLoader
# dataloader = DataLoader(
#     dataset,
#     batch_size=16,         # You can change this depending on memory
#     shuffle=True,          # Shuffle for training
#     drop_last=True         # Optional: drop last batch if it's incomplete
# )

In [57]:
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
criterion = nn.BCEWithLogitsLoss()
model.train()

# Feed batches on whatever device the model already lives on, so the loop keeps
# working if the model was moved to an accelerator elsewhere.
device = next(model.parameters()).device

for epoch in range(3):
    total_loss = 0.0
    num_batches = 0
    for batch in dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        # Clear gradients first so nothing left over from an earlier (possibly
        # interrupted) run of this cell leaks into the first update.
        optimizer.zero_grad()

        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(logits, labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    # The summed loss grows with the number of batches; the per-batch mean is the
    # comparable figure (a chance-level binary classifier sits at ln 2, about 0.693).
    mean_loss = total_loss / max(num_batches, 1)
    print(f"Epoch {epoch+1} Loss: {total_loss:.4f} (mean per batch: {mean_loss:.4f})")


Epoch 1 Loss: 174.9417
Epoch 2 Loss: 174.6750
Epoch 3 Loss: 174.1778
