In [1]:
# Install dependencies (only needed once)
!pip install transformers datasets torch scikit-learn matplotlib tqdm nltk optuna

# Import required libraries
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, get_scheduler
from tqdm import tqdm
from sklearn.metrics import classification_report, accuracy_score

# Choose data format (CSV or Parquet)
DATA_FORMAT = "csv"  # Choose between "csv" or "parquet"

# Step 4: Load the preprocessed dataset
if DATA_FORMAT == "csv":
    df_train = pd.read_csv("train_enhanced.csv")
    df_test = pd.read_csv("test_enhanced.csv")
elif DATA_FORMAT == "parquet":
    df_train = pd.read_parquet("train_enhanced.parquet", engine="pyarrow")
    df_test = pd.read_parquet("test_enhanced.parquet", engine="pyarrow")
else:
    raise ValueError("Invalid DATA_FORMAT. Please set to either 'csv' or 'parquet'.")

print("Preprocessed data successfully loaded!")
print(f"Training set size: {len(df_train)}, Test set size: {len(df_test)}")


Preprocessed data successfully loaded!
Training set size: 8000, Test set size: 2717


In [2]:
# Load the saved model, tokenizer, and label mappings
from transformers import BertTokenizer, BertForSequenceClassification
import os
import torch
import json
from transformers import BertTokenizer, BertForSequenceClassification

# Define model path
model_path = "improved_bert_relation_model"


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



# Load trained model and tokenizer
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)

# Load label mappings
with open(os.path.join(model_path, "label_mappings.json"), "r") as f:
    label_mappings = json.load(f)
    label2id = label_mappings["label2id"]
    id2label = label_mappings["id2label"]

# Move model to the available device (GPU or CPU)
model.to(device)

print(f"Model and tokenizer successfully loaded from: {model_path}")


Model and tokenizer successfully loaded from: improved_bert_relation_model


In [3]:
def encode_texts(texts, tokenizer, max_length=256):
    """Tokenizes and encodes sentences into numerical format for BERT processing."""
    return tokenizer(list(texts.values), padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")
class RelationDataset(Dataset):
    """Custom PyTorch Dataset class for relation extraction."""

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            "input_ids": self.encodings["input_ids"][idx],
            "attention_mask": self.encodings["attention_mask"][idx],
            "labels": torch.tensor(self.labels[idx], dtype=torch.long),
        }
train_encodings = encode_texts(df_train["enhanced_sentence"], tokenizer)
test_encodings = encode_texts(df_test["enhanced_sentence"], tokenizer)
# Convert training and test data into PyTorch datasets
train_dataset = RelationDataset(train_encodings, df_train["label_id"].tolist())
test_dataset = RelationDataset(test_encodings, df_test["label_id"].tolist())

# Create DataLoader for batching
BATCH_SIZE = 32
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

print(" Data processing completed, DataLoader is ready!")

 Data processing completed, DataLoader is ready!


In [4]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report


# Convert text data into BERT-compatible format
train_encodings = encode_texts(df_train["enhanced_sentence"], tokenizer)
test_encodings = encode_texts(df_test["enhanced_sentence"], tokenizer)
# Model evaluation
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=1)

        all_preds.extend(predictions.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Calculate accuracy, precision, recall, and F1-score
accuracy = accuracy_score(all_labels, all_preds)
precision = precision_score(all_labels, all_preds, average='macro')  # Compute Precision
recall = recall_score(all_labels, all_preds, average='macro')        # Compute Recall
f1 = f1_score(all_labels, all_preds, average='macro')                # Compute F1-score

# Print evaluation results
print("Model Evaluation on Test Set:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

print("\nDetailed Classification Report:")
print(classification_report(all_labels, all_preds, target_names=[str(label) for label in label2id.keys()], zero_division=1))

Model Evaluation on Test Set:
Accuracy: 0.8524
Precision: 0.8023
Recall: 0.8205
F1 Score: 0.8101

Detailed Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.95      0.93       134
           1       0.92      0.92      0.92       194
           2       0.87      0.87      0.87       162
           3       0.79      0.81      0.80       150
           4       0.90      0.93      0.92       153
           5       0.92      0.90      0.91        39
           6       0.92      0.95      0.94       291
           7       1.00      0.00      0.00         1
           8       0.85      0.92      0.89       211
           9       0.93      0.87      0.90        47
          10       0.63      0.77      0.69        22
          11       0.88      0.79      0.83       134
          12       0.69      0.75      0.72        32
          13       0.91      0.89      0.90       201
          14       0.86      0.95      0.90       210
     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [9]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import json
import os

# Step 1: Define the directory containing the saved model
model_path = "/content/improved_bert_relation_model"

# Step 2: Load the trained model and tokenizer
print(" Loading model and tokenizer...")
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)

# Step 3: Load label mappings
with open(os.path.join(model_path, "label_mappings.json"), "r") as f:
    label_mappings = json.load(f)
    label2id = label_mappings["label2id"]
    id2label = {int(k): v for k, v in label_mappings["id2label"].items()}  # Ensure keys are integers

# Step 4: Set device (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f" Model successfully loaded onto: {device}")

# Step 5: Define function for relation extraction
def predict_relation(sentence, model, tokenizer, id2label):
    """
    Predicts the relation type for a given input sentence using the trained BERT model.

    Args:
        sentence (str): A sentence with entity markers <e1>...</e1> and <e2>...</e2>.
        model (BertForSequenceClassification): The trained BERT model.
        tokenizer (BertTokenizer): Tokenizer corresponding to the trained BERT model.
        id2label (dict): Dictionary mapping label IDs to relation names.

    Returns:
        tuple: The predicted relation ID and relation label.
    """
    # Ensure model is in evaluation mode
    model.eval()

    # Check if the sentence contains entity markers
    if "<e1>" not in sentence or "<e2>" not in sentence:
        return "Error: The input sentence must contain <e1>...</e1> and <e2>...</e2> markers."

    # Tokenize input sentence for BERT processing
    inputs = tokenizer(sentence, padding="max_length", truncation=True, max_length=128, return_tensors="pt").to(device)

    with torch.no_grad():  # Disable gradient calculations for inference
        outputs = model(**inputs)
        prediction = torch.argmax(outputs.logits, dim=1).item()

    # Retrieve the predicted relation label
    relation_id = prediction
    relation_name = id2label[prediction]
    return relation_id, relation_name

# Step 6: Real-time inference loop (User input mode)
print("\nRelation Extraction Model Ready! Type 'exit' to stop.")

# Define the relation_id_to_label mapping
relation_id_to_label = {
    0: "Cause-Effect(e1,e2)",
    1: "Cause-Effect(e2,e1)",
    2: "Component-Whole(e1,e2)",
    3: "Component-Whole(e2,e1)",
    4: "Content-Container(e1,e2)",
    5: "Content-Container(e2,e1)",
    6: "Entity-Destination(e1,e2)",
    7: "Entity-Destination(e2,e1)",
    8: "Entity-Origin(e1,e2)",
    9: "Entity-Origin(e2,e1)",
    10: "Instrument-Agency(e1,e2)",
    11: "Instrument-Agency(e2,e1)",
    12: "Member-Collection(e1,e2)",
    13: "Member-Collection(e2,e1)",
    14: "Message-Topic(e1,e2)",
    15: "Message-Topic(e2,e1)",
    16: "Product-Producer(e1,e2)",
    17: "Product-Producer(e2,e1)",
    18: "Other"
}

while True:
    user_sentence = input("\nEnter a sentence with <e1> and <e2> entity markers:\n")

    if user_sentence.lower() == "exit":
        print("Exiting program...")
        break

    relation_id, relation_name = predict_relation(user_sentence, model, tokenizer, relation_id_to_label)
    print(f"Predicted Relation ID: {relation_id}, Predicted Relation Name: {relation_name}")



 Loading model and tokenizer...
 Model successfully loaded onto: cuda

Relation Extraction Model Ready! Type 'exit' to stop.

Enter a sentence with <e1> and <e2> entity markers:
The system as described above has its greatest application in an arrayed <e1>configuration</e1> of antenna <e2>elements</e2>.
Predicted Relation ID: 3, Predicted Relation Name: Component-Whole(e2,e1)

Enter a sentence with <e1> and <e2> entity markers:
The <e1>child</e1> was carefully wrapped and bound into the <e2>cradle</e2> by means of a cord.
Predicted Relation ID: 18, Predicted Relation Name: Other

Enter a sentence with <e1> and <e2> entity markers:
The <e1>author</e1> of a keygen uses a <e2>disassembler</e2> to look at the raw assembly code.
Predicted Relation ID: 11, Predicted Relation Name: Instrument-Agency(e2,e1)

Enter a sentence with <e1> and <e2> entity markers:
exit
Exiting program...
