In [None]:
# Install necessary dependencies (only needed once)
!pip install transformers datasets torch scikit-learn matplotlib tqdm

#  **Step 1: Import required libraries**
import os
import re
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Check if GPU is available.
# Prints the boolean returned by torch.cuda.is_available(); the value is only
# displayed here and is not stored in a variable.
print(torch.cuda.is_available())  # True means GPU is available


Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.

In [None]:
#  **Step 2: Ensure GPU is used if available**
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Current device: {device}")
!nvidia-smi  # Display GPU status


Current device: cuda
Wed Mar  5 02:01:31 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   49C    P8             11W /   70W |       2MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                           

In [None]:
#  **Step 3: Load the dataset**
os.makedirs("data", exist_ok=True)

# NOTE(review): download_mode="force_redownload" is passed here — presumably the
# data is fetched again on every run; confirm that this is intended.
dataset = load_dataset("sem_eval_2010_task_8", download_mode="force_redownload")

# Convert both splits to pandas DataFrames.
df_train, df_test = (dataset[split_name].to_pandas() for split_name in ("train", "test"))

# Peek at the first rows and the column names to understand the dataset structure
print(df_train.head())
print(df_train.columns)  # Check column names


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/8.23k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/673k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/231k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2717 [00:00<?, ? examples/s]

                                            sentence  relation
0  The system as described above has its greatest...         3
1  The <e1>child</e1> was carefully wrapped and b...        18
2  The <e1>author</e1> of a keygen uses a <e2>dis...        11
3  A misty <e1>ridge</e1> uprises from the <e2>su...        18
4  The <e1>student</e1> <e2>association</e2> is t...        12
Index(['sentence', 'relation'], dtype='object')


In [None]:
# **Step 4: Data Preprocessing**
# Function to replace <e1> and <e2> entity markers for better processing
def clean_text(text):
    """Replace each tagged entity span with a fixed placeholder token.

    Everything from ``<e1>`` to the nearest ``</e1>`` (tags and enclosed text)
    becomes ``@entity1@``; likewise ``<e2>...</e2>`` becomes ``@entity2@``.
    Text without these tags is returned unchanged.

    Args:
        text: The sentence to process.

    Returns:
        The sentence with the tagged spans substituted.
    """
    substitutions = (
        (r"<e1>(.*?)</e1>", "@entity1@"),
        (r"<e2>(.*?)</e2>", "@entity2@"),
    )
    for pattern, placeholder in substitutions:
        text = re.sub(pattern, placeholder, text)
    return text

# Add a "clean_sentence" column to both splits by running clean_text on every sentence
df_train["clean_sentence"] = df_train["sentence"].map(clean_text)
df_test["clean_sentence"] = df_test["sentence"].map(clean_text)

# Show the raw and cleaned sentences side by side
print(df_train.loc[:, ["sentence", "clean_sentence"]].head())


                                            sentence  \
0  The system as described above has its greatest...   
1  The <e1>child</e1> was carefully wrapped and b...   
2  The <e1>author</e1> of a keygen uses a <e2>dis...   
3  A misty <e1>ridge</e1> uprises from the <e2>su...   
4  The <e1>student</e1> <e2>association</e2> is t...   

                                      clean_sentence  
0  The system as described above has its greatest...  
1  The @entity1@ was carefully wrapped and bound ...  
2  The @entity1@ of a keygen uses a @entity2@ to ...  
3      A misty @entity1@ uprises from the @entity2@.  
4  The @entity1@ @entity2@ is the voice of the un...  


In [None]:
#  **Step 5: Encode relation labels**
# Build the label <-> ID mappings from the relation values seen in the training split.
unique_relations = sorted(df_train["relation"].unique())
label2id = {label: idx for idx, label in enumerate(unique_relations)}
id2label = {idx: label for label, idx in label2id.items()}

# Map labels to numerical IDs
df_train["label_id"] = df_train["relation"].map(label2id)
df_test["label_id"] = df_test["relation"].map(label2id)

# A relation that occurs only in the test split has no entry in label2id, so
# .map() leaves NaN in its place without raising. Stop here with a clear
# message instead of letting invalid labels reach training or evaluation.
unmapped_relations = df_test.loc[df_test["label_id"].isna(), "relation"].unique()
if len(unmapped_relations) > 0:
    raise ValueError(
        f"Test split contains relations missing from the training split: {sorted(unmapped_relations)}"
    )

# Display the mapping of relation labels
print(df_train[["relation", "label_id"]].head())

   relation  label_id
0         3         3
1        18        18
2        11        11
3        18        18
4        12        12


In [None]:
#  **Step 6: Load BERT model**
# The classification head gets one output per entry in label2id.
num_labels = len(label2id)
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels,
)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Place the model on the selected device, then report where its parameters live.
model.to(device)
first_param = next(model.parameters())
print(f" Model loaded onto: {first_param.device}")

#  **Step 7: Tokenization**
def encode_texts(texts, tokenizer, max_length=128):
    """Tokenize a collection of sentences with the given tokenizer.

    Args:
        texts: The sentences to encode. May be a pandas Series (as used in this
            notebook), any other iterable of strings (list, tuple, ...), or a
            single string, which is treated as a one-sentence batch.
        tokenizer: Callable invoked with the list of sentences and the keyword
            arguments ``padding="max_length"``, ``truncation=True``,
            ``max_length`` and ``return_tensors="pt"``.
        max_length: Value forwarded as the tokenizer's ``max_length``.

    Returns:
        Whatever the tokenizer call returns.
    """
    if isinstance(texts, str):
        # Guard against iterating a lone sentence character by character.
        sentences = [texts]
    elif hasattr(texts, "tolist"):
        # pandas Series / numpy arrays: convert to a plain Python list.
        sentences = texts.tolist()
    else:
        sentences = list(texts)
    return tokenizer(
        sentences,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )

# Tokenize the cleaned sentences of the train and test splits
train_encodings, test_encodings = (
    encode_texts(frame["clean_sentence"], tokenizer) for frame in (df_train, df_test)
)


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

 Model loaded onto: cuda:0


In [None]:
#  **Step 8: Create PyTorch Dataset class**
class RelationDataset(Dataset):
    """Dataset pairing pre-computed encodings with one label per example.

    ``encodings`` must support ``encodings["input_ids"][idx]`` and
    ``encodings["attention_mask"][idx]``; ``labels`` must support ``len()``
    and integer indexing.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        # The number of examples is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``input_ids``, ``attention_mask`` and ``labels`` entry for ``idx``."""
        item = {
            key: self.encodings[key][idx]
            for key in ("input_ids", "attention_mask")
        }
        # The label is wrapped in a torch.long tensor.
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Wrap the tokenized splits and their label IDs as PyTorch datasets
train_labels = df_train["label_id"].tolist()
test_labels = df_test["label_id"].tolist()
train_dataset = RelationDataset(train_encodings, train_labels)
test_dataset = RelationDataset(test_encodings, test_labels)

#  **Step 9: Create DataLoader for batching**
# Both loaders share these options; only the training loader shuffles.
loader_options = dict(batch_size=64, num_workers=2, pin_memory=True)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)


In [None]:
from transformers import get_scheduler

#  **Step 10: Set up optimizer and learning rate scheduler**
# Single source of truth for the number of epochs. The scheduler horizon and the
# training loop below both derive from it, so the cosine schedule always spans
# exactly the steps that are actually run.
NUM_EPOCHS = 10

optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)

# Total number of optimizer updates: one per batch, for every epoch
num_training_steps = len(train_loader) * NUM_EPOCHS
lr_scheduler = get_scheduler(
    name="cosine",  # Cosine Annealing
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

#  **Step 11: Train the model**
# NOTE(review): `criterion` is not used by the loop below — the loss is read from
# `outputs.loss`, which the model returns because `labels` is passed to it.
# Kept in case another cell refers to it; confirm and remove if unused.
criterion = torch.nn.CrossEntropyLoss()

for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0

    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

        loss = outputs.loss
        loss.backward()
        optimizer.step()  # Perform one optimizer update
        lr_scheduler.step()  # Advance the learning-rate schedule once per batch

        total_loss += loss.item()

    # Mean of the per-batch losses over this epoch
    print(f"Epoch {epoch+1}: Average Loss = {total_loss / len(train_loader):.4f}")

print(" Training completed!")

Epoch 1: Average Loss = 2.1781
Epoch 2: Average Loss = 1.0871
Epoch 3: Average Loss = 0.7341
Epoch 4: Average Loss = 0.5100
Epoch 5: Average Loss = 0.3397
Epoch 6: Average Loss = 0.2311
Epoch 7: Average Loss = 0.1708
Epoch 8: Average Loss = 0.1345
Epoch 9: Average Loss = 0.1171
Epoch 10: Average Loss = 0.1108
 Training completed!


In [None]:
#  **Step 12: Evaluate the model**
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score # Import precision_score, recall_score, f1_score

model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        # The gold labels are only compared on the host, so they are not moved to the device.
        labels = batch["labels"]

        outputs = model(input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=1)

        # .tolist() yields plain Python ints, which print cleanly regardless of NumPy version.
        all_preds.extend(predictions.cpu().tolist())
        all_labels.extend(labels.tolist())

# Display some predictions and labels
print("Example of true labels (all_labels):", all_labels[:10])
print("Example of predicted labels (all_preds):", all_preds[:10])
print("Relation classes:", list(label2id.keys()))

# Evaluate over the full set of known label IDs. Passing `labels` explicitly keeps
# classification_report from raising when a class is absent from the test data
# (it would otherwise see fewer classes than target_names).
eval_label_ids = list(label2id.values())
eval_target_names = [str(label) for label in label2id.keys()]

# Use the same zero_division everywhere: a class that is never predicted counts
# as precision 0 in both the summary numbers and the per-class report (the
# previous zero_division=1 in the report showed precision 1.00 for such a class),
# and the undefined-metric warning is no longer emitted.
metric_options = dict(labels=eval_label_ids, average='macro', zero_division=0)

#  Print performance metrics in the required format
accuracy = accuracy_score(all_labels, all_preds)
precision = precision_score(all_labels, all_preds, **metric_options)
recall = recall_score(all_labels, all_preds, **metric_options)
f1 = f1_score(all_labels, all_preds, **metric_options)

print("Model Evaluation on Test Set:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

print("\nDetailed Classification Report:")
print(classification_report(
    all_labels,
    all_preds,
    labels=eval_label_ids,
    target_names=eval_target_names,
    zero_division=0,
))

print(" Code execution completed, training and evaluation finished!")


Example of true labels (all_labels): [14, 17, 11, 6, 1, 2, 16, 13, 2, 14]
Example of predicted labels (all_preds): [14, 17, 11, 6, 1, 2, 18, 13, 18, 14]
Relation classes: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]
Model Evaluation on Test Set:
Accuracy: 0.7611
Precision: 0.7043
Recall: 0.7235
F1 Score: 0.7124

Detailed Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.92      0.90       134
           1       0.90      0.88      0.89       194
           2       0.76      0.81      0.78       162
           3       0.71      0.70      0.71       150
           4       0.83      0.90      0.86       153
           5       0.74      0.79      0.77        39
           6       0.89      0.94      0.92       291
           7       1.00      0.00      0.00         1
           8       0.83      0.85      0.84       211
           9       0.80      0.68      0.74        47
          10       0.46      0.50    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
#  **Step 13: Additional analysis**
# Show the first ten gold and predicted label IDs, plus the known relation classes
print("Example of true labels (all_labels):", all_labels[:10])
print("Example of predicted labels (all_preds):", all_preds[:10])
print("Relation classes:", [*label2id])

# Spot-check five random training rows
print(df_train.sample(n=5))
# Count missing values per column
print(df_train.isna().sum())
# Summary statistics of the raw sentence length in characters
print(df_train["sentence"].map(len).describe())

Example of true labels (all_labels): [14, 17, 11, 6, 1, 2, 16, 13, 2, 14]
Example of predicted labels (all_preds): [14, 17, 11, 6, 1, 2, 18, 13, 18, 14]
Relation classes: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]
                                               sentence  relation  \
7538  They generally bought grapes from farmers or e...        16   
410   The curved <e1>spans</e1> of the <e2>bridge</e...         2   
589   The assigned cashier code serves to identify t...        11   
7187  I worked as a <e1>crane</e1> <e2>operator</e2>...        10   
3808  The <e1>castle</e1> has two big concentric <e2...         3   

                                         clean_sentence  label_id  
7538  They generally bought grapes from farmers or e...        16  
410   The curved @entity1@ of the @entity2@ are the ...         2  
589   The assigned cashier code serves to identify t...        11  
7187  I worked as a @entity1@ @entity2@ in construct...        10  
3808  T

In [None]:
import os
import torch
import json
from transformers import BertTokenizer, BertForSequenceClassification

# Directory that receives every artifact written by this cell
model_path = "original_bert_relation_model"
os.makedirs(model_path, exist_ok=True)

# Persist the trained model via save_pretrained (weights and configuration;
# the exact file names depend on the installed transformers version)
print("Saving model weights and configuration...")
model.save_pretrained(model_path)

# Persist the tokenizer files next to the model
print("Saving tokenizer...")
tokenizer.save_pretrained(model_path)

# Label mappings, cast to built-in Python types so that json can serialize them
label2id_plain = {int(label): int(idx) for label, idx in label2id.items()}
id2label_plain = {int(idx): str(label) for idx, label in id2label.items()}  # values stored as strings
label_mappings = {"label2id": label2id_plain, "id2label": id2label_plain}
mappings_file = os.path.join(model_path, "label_mappings.json")
with open(mappings_file, "w") as f:
    json.dump(label_mappings, f)

#  Training state (optional, useful for resuming training)
checkpoint = {
    'epoch': 10,  # Last completed epoch (hard-coded)
    'model_state_dict': model.state_dict(),  # Model weights
    'optimizer_state_dict': optimizer.state_dict(),  # Optimizer state
    'scheduler_state_dict': lr_scheduler.state_dict(),  # Learning rate scheduler state
}
checkpoint_file = os.path.join(model_path, "checkpoint.pth")
torch.save(checkpoint, checkpoint_file)

print(f"Model, tokenizer, and training state successfully saved in '{model_path}'")


Saving model weights and configuration...
Saving tokenizer...
✅ Model, tokenizer, and training state successfully saved in 'original_bert_relation_model'
