# BERT baseline


# NOTE:  To load the best trained model(saved in the same folder) and test it, please run the LAST 2 cells !

In [None]:
import json
import pandas as pd

# Data Processing - Extracting relationships
def extract_relations(json_line):
    relations = []
    sent_text = json_line['sentText']
    for relation in json_line['relationMentions']:
        em1_text = relation['em1Text']
        em2_text = relation['em2Text']
        label = relation['label']
        relations.append((sent_text, em1_text, em2_text, label))
    return relations

# Read the dataset and process the data, convert data to DataFrame
train_input_file = '/content/train_converted_old_format.json'
train_data = []
with open(train_input_file, 'r') as file:
    for line in file:
        json_line = json.loads(line)
        relations = extract_relations(json_line)
        train_data.extend(relations)

df = pd.DataFrame(train_data, columns=['sentence', 'entity1', 'entity2', 'label'])

valid_input_file = '/content/eval_converted_old_format.json'
val_data = []
with open(valid_input_file, 'r') as file:
    for line in file:
        json_line = json.loads(line)
        relations = extract_relations(json_line)
        val_data.extend(relations)

val_df = pd.DataFrame(val_data, columns=['sentence', 'entity1', 'entity2', 'label'])

In [None]:
# Change all the forward slashes to underscores in the label column
df['label'] = df['label'].str.replace('/', '_')
val_df['label'] = val_df['label'].str.replace('/', '_')

In [None]:
#change all dtypes to string
df = df.astype(str)
val_df = val_df.astype(str)

In [None]:
import sys
print(sys.executable)
!pip show torch
from torch.optim.lr_scheduler import _LRScheduler as LRScheduler

/usr/bin/python3
Name: torch
Version: 2.5.1+cu124
Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration
Home-page: https://pytorch.org/
Author: PyTorch Team
Author-email: packages@pytorch.org
License: BSD-3-Clause
Location: /usr/local/lib/python3.11/dist-packages
Requires: filelock, fsspec, jinja2, networkx, nvidia-cublas-cu12, nvidia-cuda-cupti-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-runtime-cu12, nvidia-cudnn-cu12, nvidia-cufft-cu12, nvidia-curand-cu12, nvidia-cusolver-cu12, nvidia-cusparse-cu12, nvidia-nccl-cu12, nvidia-nvjitlink-cu12, nvidia-nvtx-cu12, sympy, triton, typing-extensions
Required-by: accelerate, fastai, peft, sentence-transformers, timm, torchaudio, torchvision


In [None]:
# Data preprocessing
import pandas as pd
import torch
!pip install transformers==4.28.0
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
label_encoder = {label: i for i, label in enumerate(df['label'].unique())}
def preprocess_data(row):
    sentence = row['sentence']
    entity1 = row['entity1']
    entity2 = row['entity2']
    max_length = 96

    # Tokenize sentence
    tokens = tokenizer.tokenize(sentence)

    # Find positions of entities in tokenized sentence
    entity1_pos = [i for i, token in enumerate(tokens) if token in tokenizer.tokenize(entity1)]
    entity2_pos = [i for i, token in enumerate(tokens) if token in tokenizer.tokenize(entity2)]

    # Encode entities positions
    encoded_sentence = tokenizer.encode_plus(
        sentence,
        add_special_tokens=True,
        max_length= max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    # Avoid cross-broder
    entity1_pos = min(entity1_pos[0] if entity1_pos else 0, max_length - 1)
    entity2_pos = min(entity2_pos[0] if entity2_pos else 0, max_length - 1)

    return {
        'input_ids': encoded_sentence['input_ids'].squeeze(),
        'attention_mask': encoded_sentence['attention_mask'].squeeze(),
        'entity1_pos': torch.tensor(entity1_pos),
        'entity2_pos': torch.tensor(entity2_pos),
        'label': torch.tensor(label_encoder[row['label']])
    }




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
train_data = df.apply(preprocess_data, axis=1).tolist()
val_data = val_df.apply(preprocess_data, axis=1).tolist()

In [None]:
# Create custom dataset class
class RelationshipDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

# Create dataset and dataloader
train_dataset = RelationshipDataset(train_data)
val_dataset = RelationshipDataset(val_data)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False)

In [None]:

import torch
import torch.nn as nn
from transformers import BertModel

In [None]:
# Keep original BERT structure, add a EPCA layer（Entity Pair Contrastive Attention）
'''focus on Q(entity1), K,V(entity 2). calculate entity_attention, difference, similarity. then do feature fusion of those 3 values.get entity_feature '''
class EntityPairContrastiveAttention(nn.Module):
    def __init__(self, hidden_dim):
        super(EntityPairContrastiveAttention, self).__init__()
        self.hidden_dim = hidden_dim

        # Linear transformation layer
        self.W_q = nn.Linear(hidden_dim, hidden_dim)
        self.W_k = nn.Linear(hidden_dim, hidden_dim)
        self.W_v = nn.Linear(hidden_dim, hidden_dim)

        # Relative information modeling
        self.W_diff = nn.Linear(hidden_dim, hidden_dim)
        self.W_sim = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, entity1_rep, entity2_rep):
        """
        entity1_rep, entity2_rep: (batch_size, hidden_dim)
        """
        # Calculate Query, Key, Value
        Q = self.W_q(entity1_rep)
        K = self.W_k(entity2_rep)
        V = self.W_v(entity2_rep)

        # Calculate the inter-entity attention score
        attention_scores = torch.matmul(Q, K.T) / (self.hidden_dim ** 0.5)
        attention_probs = torch.softmax(attention_scores, dim=-1)

        # Weighted sum
        entity_attention = torch.matmul(attention_probs, V)

        # Calculate the relative information of the entity
        h_diff = self.W_diff(entity1_rep - entity2_rep)
        h_sim = self.W_sim(entity1_rep * entity2_rep)

        # Combination
        entity_features = entity_attention + h_diff + h_sim
        return entity_features

In [None]:
# Relation extraction model:bert+EPCA+MLP
'''Combine CLS and entity attention representation 
Merge the two improvement methods：EntityPairContrastiveAttention, make entity 1 and entity 2 compute Query-Key interactly with each other. The entity relative information (vector difference, dot product similarity) is calculated to enhance the relationship characteristics. Modified MLP input, new method: CLS + entity attention interactive representation.
entity_features = self.entity_attention(entity1_rep, entity2_rep)'''

import torch
import torch.nn as nn
from transformers import BertModel

class RelationshipExtractionModel(nn.Module):
    def __init__(self, num_labels):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        self.hidden_size = self.bert.config.hidden_size #！！！
        self.entity_attention = EntityPairContrastiveAttention(self.hidden_size)

        # MLP layer
        self.mlp = nn.Sequential(
            nn.Linear(self.hidden_size * 2, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, num_labels)
        )


    def forward(self, input_ids, attention_mask, entity1_pos, entity2_pos):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state

        batch_size, seq_len, hidden_dim = hidden_states.shape

        # ** Ensure the index does not exceed seq_len **
        entity1_pos = torch.clamp(entity1_pos, min=0, max=seq_len - 1)
        entity2_pos = torch.clamp(entity2_pos, min=0, max=seq_len - 1)

        # ** Ensure the index is LongTensor **
        entity1_pos = entity1_pos.long()
        entity2_pos = entity2_pos.long()

        cls_rep = hidden_states[:, 0, :]
        entity1_rep = hidden_states[torch.arange(batch_size), entity1_pos]
        entity2_rep = hidden_states[torch.arange(batch_size), entity2_pos]


        # ！！！** Combine CLS + physical attention representation **
        # Merge the two improvement methods：EntityPairContrastiveAttention, make entity 1 and entity 2 compute Query-Key interactly with each other. The entity relative information (vector difference, dot product similarity) is calculated to enhance the relationship characteristics. Modified MLP input, new method: CLS + entity attention interactive representation.
        entity_features = self.entity_attention(entity1_rep, entity2_rep)
        combined_rep = torch.cat([cls_rep, entity_features], dim=1)


        logits = self.mlp(combined_rep)
        return logits

In [None]:
# Set up training loop
num_labels = len(df['label'].unique())
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = RelationshipExtractionModel(num_labels).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
num_epochs = 5

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Train and evaluate function definitions
from sklearn.metrics import accuracy_score

def evaluate(model, dataloader):
    model.eval()
    total_preds = []
    total_labels = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            entity1_pos = batch['entity1_pos'].to(device)
            entity2_pos = batch['entity2_pos'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask, entity1_pos, entity2_pos)
            _, preds = torch.max(outputs, dim=1)

            total_preds.extend(preds.cpu().tolist())
            total_labels.extend(labels.cpu().tolist())

    return accuracy_score(total_labels, total_preds)

In [None]:
# Training and evaluation process
from tqdm.auto import tqdm

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}")

    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        entity1_pos = batch['entity1_pos'].to(device)
        entity2_pos = batch['entity2_pos'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, entity1_pos, entity2_pos)
        loss = torch.nn.functional.cross_entropy(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        progress_bar.set_postfix({'loss': total_loss / (progress_bar.n + 1)})

    # Calculate validation accuracy
    train_accuracy = evaluate(model, train_dataloader)
    val_accuracy = evaluate(model, val_dataloader)
    print(f"Epoch {epoch+1}/{num_epochs} completed. train Accuracy: {train_accuracy:.4f}. validation Accuracy: {val_accuracy:.4f}")
    torch.save(model, f"bertproModel_{epoch+1}.pth")


Epoch 1/5:   0%|          | 0/20991 [00:00<?, ?it/s]

Epoch 1/5 completed. train Accuracy: 0.9164. validation Accuracy: 0.9008


Epoch 2/5:   0%|          | 0/20991 [00:00<?, ?it/s]

Epoch 2/5 completed. train Accuracy: 0.9326. validation Accuracy: 0.9101


Epoch 3/5:   0%|          | 0/20991 [00:00<?, ?it/s]

Epoch 3/5 completed. train Accuracy: 0.9411. validation Accuracy: 0.9134


Epoch 4/5:   0%|          | 0/20991 [00:00<?, ?it/s]

Epoch 4/5 completed. train Accuracy: 0.9458. validation Accuracy: 0.9169


Epoch 5/5:   0%|          | 0/20991 [00:00<?, ?it/s]

Epoch 5/5 completed. train Accuracy: 0.9476. validation Accuracy: 0.9168


In [None]:
# Load and process test data
test_input_file = '/content/test_converted_old_format.json'
test_data = []

with open(test_input_file, 'r') as file:
    for line in file:
        json_line = json.loads(line)
        relations = extract_relations(json_line)
        test_data.extend(relations)

# Convert the extracted test data to a pandas DataFrame
test_df = pd.DataFrame(test_data, columns=['sentence', 'entity1', 'entity2', 'label'])

# Change all the forward slashes to underscores in the label column (same as train/validation data)
test_df['label'] = test_df['label'].str.replace('/', '_')

# Change all dtypes to string
test_df = test_df.astype(str)

# Preprocess the test data
test_data = test_df.apply(preprocess_data, axis=1).tolist()
test_dataset = RelationshipDataset(test_data)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Evaluate the model on the test dataset
def test(model, dataloader):
    model.eval()
    total_preds = []
    total_labels = []
    total_success = 0
    total_samples = 0

    # Use tqdm to show progress bar
    progress_bar = tqdm(dataloader, desc="Testing Progress", dynamic_ncols=True)

    with torch.no_grad():
        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            entity1_pos = batch['entity1_pos'].to(device)
            entity2_pos = batch['entity2_pos'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask, entity1_pos, entity2_pos)
            _, preds = torch.max(outputs, dim=1)

            # Track total samples
            batch_size = len(labels)
            total_samples += batch_size

            # Calculate batch success (correct predictions)
            correct_preds = (preds == labels).sum().item()
            total_success += correct_preds

            # Update progress bar description to show current success rate
            progress_bar.set_postfix(success=f'{total_success}/{total_samples} ({(total_success/total_samples)*100:.2f}%)')

            total_preds.extend(preds.cpu().tolist())
            total_labels.extend(labels.cpu().tolist())

    # Calculate overall accuracy
    test_accuracy = accuracy_score(total_labels, total_preds)
    return test_accuracy, total_success, total_samples


model = torch.load('bertproModel_5.pth')
# Run the test and print the accuracy
test_accuracy, total_success, total_samples = test(model, test_dataloader)
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Total Success: {total_success}/{total_samples} ({(total_success/total_samples)*100:.2f}%)")


  model = torch.load('bertproModel_5.pth')


Testing Progress:   0%|          | 0/91 [00:00<?, ?it/s]

Test Accuracy: 0.4572
Total Success: 663/1450 (45.72%)


In [None]:
# Assessment module function
import numpy as np
import torch

relation_cls_label_map = {
    0: 'None',
    1: '_location_location_contains',
    2: '_location_administrative_division_country',
    3: '_location_country_administrative_divisions',
    4: '_location_country_capital',
    5: '_people_person_children',
    6: '_people_person_place_lived',
    7: '_people_person_nationality',
    8: '_business_company_place_founded',
    9: '_location_neighborhood_neighborhood_of',
    10: '_people_person_place_of_birth',
    11: '_sports_sports_team_location',
    12: '_sports_sports_team_location_teams',
    13: '_people_deceased_person_place_of_death',
    14: '_business_company_founders',
    15: '_business_person_company',
    16: '_business_company_major_shareholders',
    17: '_business_company_shareholder_major_shareholder_of',
    18: '_people_ethnicity_people',
    19: '_people_person_ethnicity',
    20: '_business_company_advisors',
    21: '_people_person_religion',
    22: '_people_ethnicity_geographic_distribution',
    23: '_people_person_profession',
    24: '_business_company_industry'
}

# Categories to ignore (usually the "unrelated" category)
ignore_rel_list = ['None']
def get_threshold(data, preds):
    max_f1 = -1.0
    best_th = -1.0
    cur_th = 0.0

    while cur_th < 1.0:
        pred_pos, gt_pos, correct_pos = get_F1(data, preds, threshold=cur_th)
        p = float(correct_pos) / (pred_pos + 1e-8)
        r = float(correct_pos) / (gt_pos + 1e-8)
        cur_f1 = (2 * p * r) / (p + r + 1e-8)

        if cur_f1 > max_f1:
            max_f1 = cur_f1
            best_th = cur_th
        cur_th += 0.01  #  The best threshold was searched with a step size of 0.01

    return best_th

def get_F1(data, preds_probs, threshold=0.0):
    gt_pos = 0
    pred_pos = 0
    correct_pos = 0

    for i in range(len(preds_probs)):
        true_label_idx = data[i]['label'].item()
        org_rel_name = relation_cls_label_map[true_label_idx]
        pred_val = np.argmax(preds_probs[i])
        pred_rel_name = relation_cls_label_map[pred_val]

        # Adjusted prediction: If the prediction is non-None and the probability > threshold is retained, otherwise it is treated as None
        if org_rel_name not in ignore_rel_list:
            gt_pos += 1
        if (pred_rel_name not in ignore_rel_list) and (np.max(preds_probs[i]) > threshold):
            pred_pos += 1

        if (org_rel_name == pred_rel_name) and (org_rel_name not in ignore_rel_list) and (np.max(preds_probs[i]) > threshold):
            correct_pos += 1

    return pred_pos, gt_pos, correct_pos

def evaluate_metrics(data, preds_probs, threshold=0.0):
    pred_pos, gt_pos, correct_pos = get_F1(data, preds_probs, threshold)

    precision = correct_pos / (pred_pos + 1e-8)
    recall = correct_pos / (gt_pos + 1e-8)
    f1 = (2 * precision * recall) / (precision + recall + 1e-8)

    # Outputting each metric
    print(f"Threshold={threshold:.2f} | P: {precision:.4f} R: {recall:.4f} F1: {f1:.4f}")
    return precision, recall, f1

def evaluate_metric_model(model, dataloader, dataset, threshold=0.0):
    model.eval()
    all_pred_probs = []
    all_valid_probs = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            entity1_pos = batch['entity1_pos'].to(device)
            entity2_pos = batch['entity2_pos'].to(device)

            outputs = model(input_ids, attention_mask, entity1_pos, entity2_pos)
            logits = outputs[0] if isinstance(outputs, tuple) else outputs
            probs = torch.nn.functional.softmax(logits, dim=1).cpu().numpy()
            all_pred_probs.extend(probs)


    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            entity1_pos = batch['entity1_pos'].to(device)
            entity2_pos = batch['entity2_pos'].to(device)

            outputs = model(input_ids, attention_mask, entity1_pos, entity2_pos)
            logits = outputs[0] if isinstance(outputs, tuple) else outputs
            probs = torch.nn.functional.softmax(logits, dim=1).cpu().numpy()
            all_valid_probs.extend(probs)

    if len(all_pred_probs) != len(dataset):
        raise ValueError(f"The number of predictions ({len(all_pred_probs)}) does not match the number of dataset samples ({len(dataset)})")
    threshold = get_threshold(val_dataset, all_valid_probs)
    print(threshold)

    # Call the evaluation function and return the accuracy
    precision, recall, f1 = evaluate_metrics(dataset, all_pred_probs, threshold)
    return precision, recall, f1

# Load the best trained model and output the score of test set
precision, recall, f1 = evaluate_metric_model(model, test_dataloader, test_dataset, threshold=0.01)

0.47000000000000025
Threshold=0.47 | P: 0.3953 R: 0.7192 F1: 0.5102


In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def predict_relationship(sentence, entity1, entity2):
    # Tokenize sentence
    tokens = tokenizer.tokenize(sentence)

    # Find positions of entities in tokenized sentence
    entity1_pos = [i for i, token in enumerate(tokens) if token in tokenizer.tokenize(entity1)]
    entity2_pos = [i for i, token in enumerate(tokens) if token in tokenizer.tokenize(entity2)]

    # Encode entities positions
    encoded_sentence = tokenizer.encode_plus(
        sentence,
        add_special_tokens=True,
        max_length=96,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    input_ids = encoded_sentence['input_ids'].to(device)
    attention_mask = encoded_sentence['attention_mask'].to(device)
    entity1_pos = torch.tensor(entity1_pos[0] if entity1_pos else 0).unsqueeze(0).to(device)
    entity2_pos = torch.tensor(entity2_pos[0] if entity2_pos else 0).unsqueeze(0).to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask, entity1_pos, entity2_pos)
        _, preds = torch.max(outputs, dim=1)

    # Convert prediction back to label
    reverse_label_encoder = {v: k for k, v in label_encoder.items()}
    predicted_label = reverse_label_encoder[preds.item()]

    return predicted_label

# Example sentence
sentence = "Alan Turing, the father of computer science, was born in London in 1912."
entity1 = "Alan Turing"
entity2 = "London"

predicted_relationship = predict_relationship(sentence, entity1, entity2)
print(f"Predicted relationship between '{entity1}' and '{entity2}': {predicted_relationship}")

Predicted relationship between 'Alan Turing' and 'London': _people_person_place_of_birth




In [None]:
sentence = "Pendik province in Istanbul is one of the most beautiful places you can see on Earth."
entity1 = "Istanbul"
entity2 = "Pendik"

predicted_relationship = predict_relationship(sentence, entity1, entity2)
print(f"Predicted relationship between '{entity1}' and '{entity2}': {predicted_relationship}")

Predicted relationship between 'Istanbul' and 'Pendik': None


# --------------------------------------END----------------------------------------
# To load the best trained model(saved in the same folder) and test it, please run the following 2 cells ! You can input your test cases in the second cell below.

In [6]:
#-----------------------------------------------------
#  For teachers to test our model
#-----------------------------------------------------

import sys
import subprocess



def install(package):
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])

# List of packages to install
required_packages = ["pandas", "torch==2.0.1", "transformers==4.32.0", "tqdm", "scikit-learn", "numpy", "gdown"]

# Install the missing libraries
for package in required_packages:
    try:
        __import__(package)
        print(f"{package} Installed")
    except ImportError:
        print(f"Installing {package} ...")
        install(package)

#-----------------------------------------------------
# Load the model and related variables
#-----------------------------------------------------
import torch
from transformers import BertTokenizer
import torch.nn as nn
from transformers import BertModel,BertConfig
import os
import gdown

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Google Drive File link
file_id = "13I6peo1hvsi8TaRw_Ysk_4UxeY3muWhf"
best_model_path = "./best_BERTmodel.pth"  # Local storage path


# If the model file does not exist, it is downloaded
if not os.path.exists(best_model_path):
    gdown.download(f"https://drive.google.com/uc?id={file_id}", best_model_path, quiet=False)
    print("Model downloaded successfully!")

    
#num_labels = len(df['label'].unique())
num_labels=25
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
label_encoder = {'None': 0, 'location_location_contains': 1, 'location_administrative_division_country': 2, 'location_country_administrative_divisions': 3, 'location_country_capital': 4, 'people_person_children': 5, 'people_person_place_lived': 6, 'people_person_nationality': 7, 'business_company_place_founded': 8, 'location_neighborhood_neighborhood_of': 9, 'people_person_place_of_birth': 10, 'sports_sports_team_location': 11, 'sports_sports_team_location_teams': 12, 'people_deceased_person_place_of_death': 13, 'business_company_founders': 14, 'business_person_company': 15, 'business_company_major_shareholders': 16, 'business_company_shareholder_major_shareholder_of': 17, 'people_ethnicity_people': 18, 'people_person_ethnicity': 19, 'business_company_advisors': 20, 'people_person_religion': 21, 'people_ethnicity_geographic_distribution': 22, 'people_person_profession': 23, 'business_company_industry': 24}


class EntityPairContrastiveAttention(nn.Module):
    def __init__(self, hidden_dim):
        super(EntityPairContrastiveAttention, self).__init__()
        self.hidden_dim = hidden_dim

        # Linear transformation layer
        self.W_q = nn.Linear(hidden_dim, hidden_dim)
        self.W_k = nn.Linear(hidden_dim, hidden_dim)
        self.W_v = nn.Linear(hidden_dim, hidden_dim)

        # Relative information modeling
        self.W_diff = nn.Linear(hidden_dim, hidden_dim)
        self.W_sim = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, entity1_rep, entity2_rep):
        """
        entity1_rep, entity2_rep: (batch_size, hidden_dim)
        """
        # Calculate Query, Key, Value
        Q = self.W_q(entity1_rep)
        K = self.W_k(entity2_rep)
        V = self.W_v(entity2_rep)

        # Calculate the inter-entity attention score
        attention_scores = torch.matmul(Q, K.T) / (self.hidden_dim ** 0.5)
        attention_probs = torch.softmax(attention_scores, dim=-1)

        # Weighted sum
        entity_attention = torch.matmul(attention_probs, V)

        # Calculate the relative information of the entity
        h_diff = self.W_diff(entity1_rep - entity2_rep)
        h_sim = self.W_sim(entity1_rep * entity2_rep)

        # Combination
        entity_features = entity_attention + h_diff + h_sim
        return entity_features

class RelationshipExtractionModel(nn.Module):
    def __init__(self, num_labels):
        super().__init__()
        #self.bert = BertModel.from_pretrained('bert-base-uncased')
        config = BertConfig.from_pretrained('bert-base-uncased')
        self.bert = BertModel.from_pretrained('bert-base-uncased', config=config)
                                              
        self.hidden_size = self.bert.config.hidden_size #！！！
        self.entity_attention = EntityPairContrastiveAttention(self.hidden_size)

        # MLP layer
        self.mlp = nn.Sequential(
            nn.Linear(self.hidden_size * 2, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, num_labels)
        )


    def forward(self, input_ids, attention_mask, entity1_pos, entity2_pos):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state

        batch_size, seq_len, hidden_dim = hidden_states.shape

        # ** Ensure the index does not exceed seq_len **
        entity1_pos = torch.clamp(entity1_pos, min=0, max=seq_len - 1)
        entity2_pos = torch.clamp(entity2_pos, min=0, max=seq_len - 1)

        # ** Ensure the index is LongTensor **
        entity1_pos = entity1_pos.long()
        entity2_pos = entity2_pos.long()

        cls_rep = hidden_states[:, 0, :]
        entity1_rep = hidden_states[torch.arange(batch_size), entity1_pos]
        entity2_rep = hidden_states[torch.arange(batch_size), entity2_pos]


        # ！！！** Combine CLS + physical attention representation **
        # Merge the two improvement methods：EntityPairContrastiveAttention, make entity 1 and entity 2 compute Query-Key interactly with each other. The entity relative information (vector difference, dot product similarity) is calculated to enhance the relationship characteristics. Modified MLP input, new method: CLS + entity attention interactive representation.
        entity_features = self.entity_attention(entity1_rep, entity2_rep)
        combined_rep = torch.cat([cls_rep, entity_features], dim=1)


        logits = self.mlp(combined_rep)
        return logits

# Load saved optimal model
def load_best_model(model, model_path):
    #model.load_state_dict(torch.load(model_path))
    model = torch.load(model_path, map_location=torch.device('cpu')) 
    #print(model)
    model.eval()
    print(f"BERT Model loaded successfully!")
    return model

#Function definition for relation prediction on an input sentence and two entities using the loaded model
def predict_relationship_with_saved_model(sentence, entity1, entity2, model, tokenizer, label_encoder, max_length=256):
    tokens = tokenizer.tokenize(sentence)
    entity1_pos = [i for i, token in enumerate(tokens) if token in tokenizer.tokenize(entity1)]
    entity2_pos = [i for i, token in enumerate(tokens) if token in tokenizer.tokenize(entity2)]

    # tokenize the input sentence
    encoded_sentence = tokenizer.encode_plus(
        sentence,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt')

    input_ids = encoded_sentence['input_ids'].to(device)
    attention_mask = encoded_sentence['attention_mask'].to(device)
    entity1_pos = torch.tensor(entity1_pos[0] if entity1_pos else 0).unsqueeze(0).to(device)
    entity2_pos = torch.tensor(entity2_pos[0] if entity2_pos else 0).unsqueeze(0).to(device)

    # Make predictions using the loaded model
    with torch.no_grad():
        logits = model(input_ids, attention_mask, entity1_pos, entity2_pos)
        _, preds = torch.max(logits, dim=1)

    # The labels are obtained by reverse mapping
    reverse_label_encoder = {v: k for k, v in label_encoder.items()}
    predicted_label = reverse_label_encoder[preds.item()]

    return predicted_label


bert_model = RelationshipExtractionModel(num_labels).to(device)


# load best model
bert_model = load_best_model(bert_model, best_model_path)


pandas Installed
Installing torch==2.0.1 ...
Installing transformers==4.32.0 ...
tqdm Installed
Installing scikit-learn ...
numpy Installed
gdown Installed


Downloading...
From (original): https://drive.google.com/uc?id=13I6peo1hvsi8TaRw_Ysk_4UxeY3muWhf
From (redirected): https://drive.google.com/uc?id=13I6peo1hvsi8TaRw_Ysk_4UxeY3muWhf&confirm=t&uuid=ae078f3e-0b09-44e6-a62b-200e2d4a99e3
To: /Users/zhouqiaoqiao/Desktop/【61332】 Text Mining/CW最终提交材料/best_BERTmodel.pth
100%|██████████| 454M/454M [00:19<00:00, 23.2MB/s] 


Model downloaded successfully!
BERT Model loaded successfully!


BERT Model loaded successfully!


In [7]:
#-----------------------------------------------------
# Input Module： Please input your test case here!
#-----------------------------------------------------

# The sentence and two entities (entity1 and entity2) are provided as input for relationship prediction.
# Users can modify these variables to test the model with different sentences and entities.

# In this example, 'sentence' is a string that represents the sentence where the entities appear.
# 'entity1' and 'entity2' are the two entities whose relationship you want to predict. You can replace
# these with any sentence and entities of your choice.

# To test the module:
# 1. Modify the 'sentence' variable with your desired sentence that contains two entities.
# 2. Modify 'entity1' with the first entity (a person, organization, or any other entity) in the sentence.
# 3. Modify 'entity2' with the second entity (the relationship you want to identify between entity1 and entity2).

# After modifying these values, the model will predict the relationship between the two entities in the sentence.
# The predicted relationship will be printed out.


sentence = "otecna employed Kojo Annan , Kofi Annan 's son , as a contractor at the time it received the aid inspection contract"
entity1 = "Kojo Annan"
entity2 = "Kofi Annan"

# Relationship prediction using the loaded model
predicted_relationship = predict_relationship_with_saved_model(sentence, entity1, entity2, bert_model, tokenizer, label_encoder)
print(f"Predicted relationships：{predicted_relationship}")


Predicted relationships：people_person_children
