In [1]:
# Step 1: Install dependencies (only needed once)
!pip install transformers datasets torch scikit-learn matplotlib tqdm nltk optuna

# Step 2: Import required libraries
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, get_scheduler
from tqdm import tqdm
from sklearn.metrics import classification_report, accuracy_score

# Step 3: Choose data format (CSV or Parquet)
DATA_FORMAT = "csv"  # Choose between "csv" or "parquet"

# Step 4: Load the preprocessed dataset
if DATA_FORMAT == "csv":
    df_train = pd.read_csv("train_enhanced.csv")
    df_test = pd.read_csv("test_enhanced.csv")
elif DATA_FORMAT == "parquet":
    df_train = pd.read_parquet("train_enhanced.parquet", engine="pyarrow")
    df_test = pd.read_parquet("test_enhanced.parquet", engine="pyarrow")
else:
    raise ValueError("Invalid DATA_FORMAT. Please set to either 'csv' or 'parquet'.")

print("Preprocessed data successfully loaded!")
print(f"Training set size: {len(df_train)}, Test set size: {len(df_test)}")


Preprocessed data successfully loaded!
Training set size: 8000, Test set size: 2717


In [2]:
# Step 5: Generate relation label mappings
unique_relations = sorted(df_train["relation"].unique())
label2id = {label: idx for idx, label in enumerate(unique_relations)}
id2label = {idx: label for label, idx in label2id.items()}

df_train["label_id"] = df_train["relation"].map(label2id)
df_test["label_id"] = df_test["relation"].map(label2id)

# Step 6: Ensure GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Step 7: Load pre-trained BERT model and tokenizer
num_labels = len(label2id)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model.to(device)  # Move model to GPU if available

# Step 8: Tokenization
def encode_texts(texts, tokenizer, max_length=256):
    """Tokenizes and encodes sentences into numerical format for BERT processing."""
    return tokenizer(list(texts.values), padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")

# Convert text data into BERT-compatible format
train_encodings = encode_texts(df_train["enhanced_sentence"], tokenizer)
test_encodings = encode_texts(df_test["enhanced_sentence"], tokenizer)


Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# Step 9: Create PyTorch Dataset class
class RelationDataset(Dataset):
    """Custom PyTorch Dataset class for relation extraction."""

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            "input_ids": self.encodings["input_ids"][idx],
            "attention_mask": self.encodings["attention_mask"][idx],
            "labels": torch.tensor(self.labels[idx], dtype=torch.long),
        }

# Convert training and test data into PyTorch datasets
train_dataset = RelationDataset(train_encodings, df_train["label_id"].tolist())
test_dataset = RelationDataset(test_encodings, df_test["label_id"].tolist())

# Step 10: Create DataLoader for batching
BATCH_SIZE = 32
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

print(" Data processing completed, DataLoader is ready!")

 Data processing completed, DataLoader is ready!


In [4]:
# Step 11: Define contrastive learning loss function
class ContrastiveLoss(nn.Module):
    """Custom contrastive loss function to improve relation classification by separating different relation types."""

    def __init__(self, temperature=0.1):
        super(ContrastiveLoss, self).__init__()
        self.temperature = temperature

    def forward(self, z_i, z_j):
        """Computes contrastive loss (pulls positive samples closer, pushes negative samples apart)."""
        sim = torch.nn.functional.cosine_similarity(z_i, z_j, dim=-1) / self.temperature
        loss = -torch.log(torch.nn.functional.softmax(sim, dim=-1)).mean()
        return loss

contrastive_loss_fn = ContrastiveLoss()


In [7]:
# Step 12: Train the model (with learning rate scheduling & contrastive learning)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Set up learning rate scheduler
num_training_steps = len(train_loader) * 10  # 10 epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

criterion = torch.nn.CrossEntropyLoss()

# Start training loop
for epoch in range(10):
    model.train()
    total_loss, contrastive_loss_total = 0, 0

    for batch in tqdm(train_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()

        # Forward pass through BERT
        outputs = model(input_ids, attention_mask=attention_mask, output_hidden_states=True)
        logits = outputs.logits
        hidden_states = outputs.hidden_states[-1][:, 0, :]  # Extract CLS token representation

        # Compute standard classification loss
        ce_loss = criterion(logits, labels)

        # Compute contrastive loss
        positive_idx = torch.arange(hidden_states.size(0))
        negative_idx = torch.roll(positive_idx, shifts=1)
        contrastive_loss = contrastive_loss_fn(hidden_states[positive_idx], hidden_states[negative_idx])

        # Total loss = cross-entropy loss + weighted contrastive loss
        loss = ce_loss + 0.2 * contrastive_loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.item()
        contrastive_loss_total += contrastive_loss.item()

    print(f"Epoch {epoch+1}: Cross-Entropy Loss = {total_loss / len(train_loader):.4f}, Contrastive Loss = {contrastive_loss_total / len(train_loader):.4f}")

print("Training completed!")

100%|██████████| 250/250 [00:43<00:00,  5.81it/s]


Epoch 1: Cross-Entropy Loss = 0.9780, Contrastive Loss = 3.7436


100%|██████████| 250/250 [00:43<00:00,  5.80it/s]


Epoch 2: Cross-Entropy Loss = 0.8695, Contrastive Loss = 3.6688


100%|██████████| 250/250 [00:42<00:00,  5.81it/s]


Epoch 3: Cross-Entropy Loss = 0.8118, Contrastive Loss = 3.6310


100%|██████████| 250/250 [00:43<00:00,  5.81it/s]


Epoch 4: Cross-Entropy Loss = 0.7812, Contrastive Loss = 3.6037


100%|██████████| 250/250 [00:43<00:00,  5.80it/s]


Epoch 5: Cross-Entropy Loss = 0.7611, Contrastive Loss = 3.5880


100%|██████████| 250/250 [00:43<00:00,  5.80it/s]


Epoch 6: Cross-Entropy Loss = 0.7479, Contrastive Loss = 3.5705


100%|██████████| 250/250 [00:43<00:00,  5.79it/s]


Epoch 7: Cross-Entropy Loss = 0.7368, Contrastive Loss = 3.5636


100%|██████████| 250/250 [00:43<00:00,  5.80it/s]


Epoch 8: Cross-Entropy Loss = 0.7315, Contrastive Loss = 3.5534


100%|██████████| 250/250 [00:43<00:00,  5.81it/s]


Epoch 9: Cross-Entropy Loss = 0.7285, Contrastive Loss = 3.5489


100%|██████████| 250/250 [00:43<00:00,  5.79it/s]

Epoch 10: Cross-Entropy Loss = 0.7273, Contrastive Loss = 3.5476
Training completed!





In [8]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report

# Step 13: Model evaluation
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=1)

        all_preds.extend(predictions.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Calculate accuracy, precision, recall, and F1-score
accuracy = accuracy_score(all_labels, all_preds)
precision = precision_score(all_labels, all_preds, average='macro')  # Compute Precision
recall = recall_score(all_labels, all_preds, average='macro')        # Compute Recall
f1 = f1_score(all_labels, all_preds, average='macro')                # Compute F1-score

# Print evaluation results
print("Model Evaluation on Test Set:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

print("\nDetailed Classification Report:")
print(classification_report(all_labels, all_preds, target_names=[str(label) for label in label2id.keys()], zero_division=1))

print("Code execution completed, training and evaluation finished!")



Model Evaluation on Test Set:
Accuracy: 0.8524
Precision: 0.8023
Recall: 0.8205
F1 Score: 0.8101

Detailed Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.95      0.93       134
           1       0.92      0.92      0.92       194
           2       0.87      0.87      0.87       162
           3       0.79      0.81      0.80       150
           4       0.90      0.93      0.92       153
           5       0.92      0.90      0.91        39
           6       0.92      0.95      0.94       291
           7       1.00      0.00      0.00         1
           8       0.85      0.92      0.89       211
           9       0.93      0.87      0.90        47
          10       0.63      0.77      0.69        22
          11       0.88      0.79      0.83       134
          12       0.69      0.75      0.72        32
          13       0.91      0.89      0.90       201
          14       0.86      0.95      0.90       210
     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [9]:
import os
import torch
import json
from transformers import BertTokenizer, BertForSequenceClassification

#  Define the directory to save the model
model_path = "improved_bert_relation_model"
os.makedirs(model_path, exist_ok=True)

#  Save the trained model (this saves 'pytorch_model.bin' and 'config.json')
print("Saving model weights and configuration...")
model.save_pretrained(model_path)

#  Save the tokenizer (this saves 'tokenizer.json', 'tokenizer_config.json', 'vocab.txt')
print("Saving tokenizer...")
tokenizer.save_pretrained(model_path)

# Save label mappings (ensuring integer keys and values are converted to standard Python integers)
label_mappings = {"label2id": {int(k): int(v) for k, v in label2id.items()},
                  "id2label": {int(k): str(v) for k, v in id2label.items()}} # Convert values to strings
with open(os.path.join(model_path, "label_mappings.json"), "w") as f:
    json.dump(label_mappings, f)

# Save optimizer and scheduler states (optional, useful for resuming training)
torch.save({
    'epoch': 10,  # Save last completed epoch
    'model_state_dict': model.state_dict(),  # Save model weights
    'optimizer_state_dict': optimizer.state_dict(),  # Save optimizer state
    'scheduler_state_dict': lr_scheduler.state_dict(),  # Save learning rate scheduler state
}, os.path.join(model_path, "checkpoint.pth"))

print(f" Model, tokenizer, and training state successfully saved in '{model_path}'")


Saving model weights and configuration...
Saving tokenizer...
 Model, tokenizer, and training state successfully saved in 'improved_bert_relation_model'


In [10]:
# Step 15: Load the saved model, tokenizer, and label mappings
from transformers import BertTokenizer, BertForSequenceClassification

# Define model path
model_path = "improved_bert_relation_model"

# Load trained model and tokenizer
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)

# Load label mappings
with open(os.path.join(model_path, "label_mappings.json"), "r") as f:
    label_mappings = json.load(f)
    label2id = label_mappings["label2id"]
    id2label = label_mappings["id2label"]

# Move model to the available device (GPU or CPU)
model.to(device)

print(f"Model and tokenizer successfully loaded from: {model_path}")


Model and tokenizer successfully loaded from: improved_bert_relation_model


In [11]:
def predict_relation(sentence, model, tokenizer, id2label):
    """
    Predicts the relation type for a given input sentence using the trained BERT model.

    Args:
        sentence (str): A sentence with entity markers <e1>...</e1> and <e2>...</e2>.
        model (BertForSequenceClassification): The trained BERT model.
        tokenizer (BertTokenizer): Tokenizer corresponding to the trained BERT model.
        id2label (dict): Dictionary mapping label IDs to relation names.

    Returns:
        str: The predicted relation label.
    """

    # Ensure model is in evaluation mode
    model.eval()

    # Tokenize input sentence for BERT processing
    inputs = tokenizer(sentence, padding="max_length", truncation=True, max_length=128, return_tensors="pt").to(device)

    with torch.no_grad():  # Disable gradient calculations for inference
        outputs = model(**inputs)
        prediction = torch.argmax(outputs.logits, dim=1).item()

    # Retrieve the predicted relation label
    relation = id2label[str(prediction)]
    return relation

# Example usage
test_sentence = "The <e1>company</e1> acquired the <e2>startup</e2>."
predicted_relation = predict_relation(test_sentence, model, tokenizer, id2label)

print(f"Predicted Relation: {predicted_relation}")


Predicted Relation: 18


In [12]:
# Example with and without external knowledge
test_sentence_basic = "The <e1>scientist</e1> won the <e2>Nobel Prize</e2>."
test_sentence_augmented = (
    "The <e1>scientist</e1> won the <e2>Nobel Prize</e2>. "
    "[SEP] scientist: A person who is studying or has expert knowledge in science. "
    "[KG: works at: Research Institute] [SEP] Nobel Prize: An international award given for achievements."
)

# Predict relations
relation_basic = predict_relation(test_sentence_basic, model, tokenizer, id2label)
relation_augmented = predict_relation(test_sentence_augmented, model, tokenizer, id2label)

print(f"Prediction without external knowledge: {relation_basic}")
print(f"Prediction with external knowledge: {relation_augmented}")


Prediction without external knowledge: 18
Prediction with external knowledge: 18


In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import json
import os

# Step 1: Define the directory containing the saved model
model_path = "/content/improved_bert_relation_model"

# Step 2: Load the trained model and tokenizer
print(" Loading model and tokenizer...")
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)

# Step 3: Load label mappings
with open(os.path.join(model_path, "label_mappings.json"), "r") as f:
    label_mappings = json.load(f)
    label2id = label_mappings["label2id"]
    id2label = {int(k): v for k, v in label_mappings["id2label"].items()}  # Ensure keys are integers

# Step 4: Set device (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f" Model successfully loaded onto: {device}")

# Step 5: Define function for relation extraction
def predict_relation(sentence, model, tokenizer, id2label):
    """
    Predicts the relation type for a given input sentence using the trained BERT model.

    Args:
        sentence (str): A sentence with entity markers <e1>...</e1> and <e2>...</e2>.
        model (BertForSequenceClassification): The trained BERT model.
        tokenizer (BertTokenizer): Tokenizer corresponding to the trained BERT model.
        id2label (dict): Dictionary mapping label IDs to relation names.

    Returns:
        tuple: The predicted relation ID and relation label.
    """
    # Ensure model is in evaluation mode
    model.eval()

    # Check if the sentence contains entity markers
    if "<e1>" not in sentence or "<e2>" not in sentence:
        return "Error: The input sentence must contain <e1>...</e1> and <e2>...</e2> markers."

    # Tokenize input sentence for BERT processing
    inputs = tokenizer(sentence, padding="max_length", truncation=True, max_length=128, return_tensors="pt").to(device)

    with torch.no_grad():  # Disable gradient calculations for inference
        outputs = model(**inputs)
        prediction = torch.argmax(outputs.logits, dim=1).item()

    # Retrieve the predicted relation label
    relation_id = prediction
    relation_name = id2label[prediction]
    return relation_id, relation_name

# Step 6: Real-time inference loop (User input mode)
print("\nRelation Extraction Model Ready! Type 'exit' to stop.")

# Define the relation_id_to_label mapping
relation_id_to_label = {
    0: "Cause-Effect(e1,e2)",
    1: "Cause-Effect(e2,e1)",
    2: "Component-Whole(e1,e2)",
    3: "Component-Whole(e2,e1)",
    4: "Content-Container(e1,e2)",
    5: "Content-Container(e2,e1)",
    6: "Entity-Destination(e1,e2)",
    7: "Entity-Destination(e2,e1)",
    8: "Entity-Origin(e1,e2)",
    9: "Entity-Origin(e2,e1)",
    10: "Instrument-Agency(e1,e2)",
    11: "Instrument-Agency(e2,e1)",
    12: "Member-Collection(e1,e2)",
    13: "Member-Collection(e2,e1)",
    14: "Message-Topic(e1,e2)",
    15: "Message-Topic(e2,e1)",
    16: "Product-Producer(e1,e2)",
    17: "Product-Producer(e2,e1)",
    18: "Other"
}

while True:
    user_sentence = input("\nEnter a sentence with <e1> and <e2> entity markers:\n")

    if user_sentence.lower() == "exit":
        print("Exiting program...")
        break

    relation_id, relation_name = predict_relation(user_sentence, model, tokenizer, relation_id_to_label)
    print(f"Predicted Relation ID: {relation_id}, Predicted Relation Name: {relation_name}")

🔄 Loading model and tokenizer...
 Model successfully loaded onto: cuda

Relation Extraction Model Ready! Type 'exit' to stop.

Enter a sentence with <e1> and <e2> entity markers:
The most common <e1>audits</e1> were about <e2>waste</e2> and recycling.
Predicted Relation ID: 14, Predicted Relation Name: Message-Topic(e1,e2)

Enter a sentence with <e1> and <e2> entity markers:
The <e1>company</e1> fabricates plastic <e2>chairs</e2>.
Predicted Relation ID: 17, Predicted Relation Name: Product-Producer(e2,e1)

Enter a sentence with <e1> and <e2> entity markers:
The school <e1>master</e1> teaches the lesson with a <e2>stick</e2>.
Predicted Relation ID: 11, Predicted Relation Name: Instrument-Agency(e2,e1)

Enter a sentence with <e1> and <e2> entity markers:
The suspect dumped the dead <e1>body</e1> into a local <e2>reservoir</e2>.
Predicted Relation ID: 6, Predicted Relation Name: Entity-Destination(e1,e2)

Enter a sentence with <e1> and <e2> entity markers:
Avian <e1>influenza</e1> is an i

KeyboardInterrupt: Interrupted by user