# Attention-Based BiLSTM-CNN (Att-BiLSTM-CNN) for Relation Extraction on SemEval-2010 Task 8

In [1]:
import re
import os
import torch
import json
import numpy as np
from config import Config
from nltk.tokenize import word_tokenize
from utils import WordEmbeddingLoader, RelationLoader
from model.att_blstm import Att_BLSTM
from model.blstm import BLSTM
from model.blstm_cnn import BLSTM_CNN
from model.multi_att_blstm import Multi_Att_BLSTM
from model.att_blstm_cnn import Att_BLSTM_CNN
import nltk

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# 1. Data Preprocess

## 1.1 Raw data
Retrieve the **sem_eval_2010_task_8** dataset from the website:  [https://huggingface.co/datasets/SemEvalWorkshop/sem_eval_2010_task_8]


 **Loading the Dataset**  
   ```python
   ds = load_dataset("SemEvalWorkshop/sem_eval_2010_task_8")
   ```
   - The function `load_dataset("SemEvalWorkshop/sem_eval_2010_task_8")` fetches the **SemEval-2010 Task 8 dataset** from Hugging Face's dataset repository.
   - This dataset is used for **relation extraction**, where the goal is to classify relationships between two entities in a sentence.
   - The dataset typically contains:
     - `train`: Training dataset.
     - `test`: Test dataset.
   - `ds` is a dictionary-like object with keys corresponding to different dataset splits (e.g., `"train"`, `"test"`).

In [4]:
from datasets import load_dataset

# Load the dataset
ds = load_dataset("SemEvalWorkshop/sem_eval_2010_task_8")

# Display the first few rows of the training set
print(ds['train'][:5])  # This will display the first 5 examples

README.md:   0%|          | 0.00/8.23k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


{'sentence': ['The system as described above has its greatest application in an arrayed <e1>configuration</e1> of antenna <e2>elements</e2>.', 'The <e1>child</e1> was carefully wrapped and bound into the <e2>cradle</e2> by means of a cord.', 'The <e1>author</e1> of a keygen uses a <e2>disassembler</e2> to look at the raw assembly code.', 'A misty <e1>ridge</e1> uprises from the <e2>surge</e2>.', 'The <e1>student</e1> <e2>association</e2> is the voice of the undergraduate student population of the State University of New York at Buffalo.'], 'relation': [3, 18, 11, 18, 12]}


## 1.2 Convert format
The complete format conversion and data preprocessing are implemented, which converts the `.txt` file to `.json` format and splits . This is done in the `./data/Data_Preprocess.py` file.

In [5]:
import json

def load_json_lines(file_path):
    """Load JSON lines data locally"""
    data = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            data.append(json.loads(line.strip()))  
    return data

# Example of loading data from a JSON file with multiple objects per line
file_path = './data/train.json'  # Change this to your file path
data = load_json_lines(file_path)

# Print the first few records to inspect
print(data[:5])  # Display the first two elements

[{'id': 1, 'relation': 'Component-Whole(e2,e1)', 'sentence': ['The', 'system', 'as', 'described', 'above', 'has', 'its', 'greatest', 'application', 'in', 'an', 'arrayed', '<e1>', 'configuration', '</e1>', 'of', 'antenna', '<e2>', 'elements', '</e2>', '.']}, {'id': 2, 'relation': 'Other', 'sentence': ['The', '<e1>', 'child', '</e1>', 'was', 'carefully', 'wrapped', 'and', 'bound', 'into', 'the', '<e2>', 'cradle', '</e2>', 'by', 'means', 'of', 'a', 'cord', '.']}, {'id': 3, 'relation': 'Instrument-Agency(e2,e1)', 'sentence': ['The', '<e1>', 'author', '</e1>', 'of', 'a', 'keygen', 'uses', 'a', '<e2>', 'disassembler', '</e2>', 'to', 'look', 'at', 'the', 'raw', 'assembly', 'code', '.']}, {'id': 4, 'relation': 'Other', 'sentence': ['A', 'misty', '<e1>', 'ridge', '</e1>', 'uprises', 'from', 'the', '<e2>', 'surge', '</e2>', '.']}, {'id': 5, 'relation': 'Member-Collection(e1,e2)', 'sentence': ['The', '<e1>', 'student', '</e1>', '<e2>', 'association', '</e2>', 'is', 'the', 'voice', 'of', 'the', 'u

## 1.3 split train set and validation set
The detailed process for splitting the dataset into training and validation sets is implemented in the script located at `./data/Data_Preprocess`. This script handles the division of the original dataset into separate training and validation subsets with a 60:40 ratio to facilitate model training and evaluation.

In [4]:
import json  
import os  
import html  
import re  
from collections import Counter  
from sklearn.model_selection import train_test_split  
from datasets import load_dataset  

# Define the 19 relations for SemEval-2010 task 8
relations = [
    "Cause-Effect(e1,e2)", "Cause-Effect(e2,e1)",
    "Component-Whole(e1,e2)", "Component-Whole(e2,e1)",
    "Content-Container(e1,e2)", "Content-Container(e2,e1)",
    "Entity-Destination(e1,e2)", "Entity-Destination(e2,e1)",
    "Entity-Origin(e1,e2)", "Entity-Origin(e2,e1)",
    "Instrument-Agency(e1,e2)", "Instrument-Agency(e2,e1)",
    "Member-Collection(e1,e2)", "Member-Collection(e2,e1)",
    "Message-Topic(e1,e2)", "Message-Topic(e2,e1)",
    "Product-Producer(e1,e2)", "Product-Producer(e2,e1)",
    "Other"
]

# Map relations to numerical IDs and vice versa
label2id = {label: i for i, label in enumerate(relations)}
id2label = {i: label for label, i in label2id.items()}

# Define local file paths
train_file = "train.json"  # Training set file path
valid_file = "validation.json"  # Validation set file path
test_file = "test.json"  # Test set file path

# Load local data (single JSON object)
def load_local_data(file_path):
    """Load local JSON data (single JSON object)"""
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return data

# Load JSON Lines format data
def load_json_lines(file_path):
    """Load local JSON Lines format data"""
    data = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            data.append(json.loads(line.strip()))  
    return data

# Save data as a local JSON file (single JSON object)
def save_local_data(data, file_path):
    """Save data as a JSON file (single JSON object, i.e., a list)"""
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

# Clean text data
def clean_text(text):
    """Clean text data"""
    if not isinstance(text, str) or text.strip() == "":
        return None  # Return None if the text is empty
    text = html.unescape(text)  # Handle HTML escape characters
    return text.strip()  # Remove leading and trailing spaces

# Process e1 and e2 tags and perform tokenization
def search_entity(sentence):
    """Process e1 and e2 tags and perform tokenization"""
    e1 = re.findall(r'<e1>(.*)</e1>', sentence)[0]  # Extract the text inside e1 tags
    e2 = re.findall(r'<e2>(.*)</e2>', sentence)[0]  # Extract the text inside e2 tags
    # Add spaces around e1 and e2 tags and their contents
    sentence = sentence.replace('<e1>' + e1 + '</e1>', ' <e1> ' + e1 + ' </e1> ', 1)
    sentence = sentence.replace('<e2>' + e2 + '</e2>', ' <e2> ' + e2 + ' </e2> ', 1)
    sentence = sentence.split()  # Tokenize the sentence by spaces
    sentence = ' '.join(sentence)  # Join the tokenized sentence back into a string
    # Fix any formatting issues with e1 and e2 tags
    sentence = sentence.replace('< e1 >', '<e1>')
    sentence = sentence.replace('< e2 >', '<e2>')
    sentence = sentence.replace('< /e1 >', '</e1>')
    sentence = sentence.replace('< /e2 >', '</e2>')
    sentence = sentence.split()

    # Ensure that the sentence contains e1 and e2 tags
    assert '<e1>' in sentence
    assert '<e2>' in sentence
    assert '</e1>' in sentence
    assert '</e2>' in sentence

    return sentence

# If local files exist, load the data
if os.path.exists(train_file) and os.path.exists(valid_file) and os.path.exists(test_file):
    train_data = load_local_data(train_file)
    valid_data = load_local_data(valid_file)
    test_data = load_local_data(test_file)
    print("Data has been loaded locally!")  # Output success message for loading data
    print("Training set size:", len(train_data))  # Output the size of the training set
    print("Validation Set Size:", len(valid_data))  # Output the size of the validation set
    print("Test Set Size:", len(test_data))  # Output the size of the test set
else:
    # 1. Load the dataset from Hugging Face
    ds = load_dataset("SemEvalWorkshop/sem_eval_2010_task_8")
    
    # 2. Read the training and test data
    train_texts_all = ds["train"]["sentence"]  # Training set sentences
    train_labels_all = ds["train"]["relation"]  # Training set labels
    test_texts = ds["test"]["sentence"]  # Test set sentences
    test_labels = ds["test"]["relation"]  # Test set labels

    # Add ID encoding to the training set, from 1 to 8000
    train_data_with_ids = [{"id": i + 1, "sentence": text, "relation": label} 
                           for i, (text, label) in enumerate(zip(train_texts_all, train_labels_all))]

    # Add ID encoding to the test set, from 8001 to 8000 + length of test set
    test_data_with_ids = [{"id": i + 8001, "sentence": text, "relation": label} 
                          for i, (text, label) in enumerate(zip(test_texts, test_labels))]

    # 3. Clean the data (text cleaning and splitting)
    clean_train_data = [(clean_text(t), l) for t, l in zip(train_texts_all, train_labels_all)]
    clean_train_data = [x for x in clean_train_data if x[0] is not None]  # Filter out empty texts

    clean_test_data = [(clean_text(t), l) for t, l in zip(test_texts, test_labels)]
    clean_test_data = [x for x in clean_test_data if x[0] is not None]  # Filter out empty texts

    # 4. Split the test data into validation and test sets
    all_test_labels = [item[1] for item in clean_test_data]
    relation_counts = Counter(all_test_labels)  # Count the frequency of each label
    single_instance_relations = [rel for rel, count in relation_counts.items() if count == 1]  # Single instance relations
    single_instance_data = [item for item in clean_test_data if item[1] in single_instance_relations]  # Single instance data
    remaining_data = [item for item in clean_test_data if item[1] not in single_instance_relations]  # Remaining data
    remaining_labels = [x[1] for x in remaining_data]
    # Split the remaining data into validation and test sets, preserving label proportions
    val_data_split, test_data_split = train_test_split(
        remaining_data,
        test_size=0.4,  # Test set will be 40%
        stratify=remaining_labels,  # Ensure label proportions are preserved
        random_state=42  # Set random seed for reproducibility
    )

    final_test_data = test_data_split + single_instance_data  # Final test set includes single instance data

    # 5. Create dictionary lists (matching the previous format)
    train_data = [{
        "id": train_data_with_ids[i]["id"],
        "relation": id2label[label],  # Use relation descriptions instead of label indices
        "sentence": search_entity(text),
        "comment": "N/A"  # Add a comment field
    } for i, (text, label) in enumerate(clean_train_data)]

    valid_data = [{
        "id": test_data_with_ids[i]["id"],
        "relation": id2label[label],  # Use relation descriptions instead of label indices
        "sentence": search_entity(text),
        "comment": "N/A"
    } for i, (text, label) in enumerate(val_data_split)]

    test_data = [{
        "id": test_data_with_ids[i]["id"],
        "relation": id2label[label],  # Use relation descriptions instead of label indices
        "sentence": search_entity(text),
        "comment": "N/A"
    } for i, (text, label) in enumerate(final_test_data)]

# Output the sizes of the datasets
print("Training set size:", len(train_data))  # Output the size of the training set
print("Validation Set Size:", len(valid_data))  # Output the size of the validation set
print("Test Set Size:", len(test_data))  # Output the size of the test set


Training set size: 8000
Validation Set Size: 1629
Test Set Size: 1088


In [5]:
from collections import Counter
from datasets import load_dataset

# Define the relation labels dictionary
relations = [
    "Cause-Effect(e1,e2)",
    "Cause-Effect(e2,e1)",
    "Instrument-Agency(e1,e2)",
    "Instrument-Agency(e2,e1)",
    "Product-Producer(e1,e2)",
    "Product-Producer(e2,e1)",
    "Content-Container(e1,e2)",
    "Content-Container(e2,e1)",
    "Entity-Origin(e1,e2)",
    "Entity-Origin(e2,e1)",
    "Entity-Destination(e1,e2)",
    "Entity-Destination(e2,e1)",
    "Component-Whole(e1,e2)",
    "Component-Whole(e2,e1)",
    "Member-Collection(e1,e2)",
    "Member-Collection(e2,e1)",
    "Message-Topic(e1,e2)",
    "Message-Topic(e2,e1)",
    "Other"
]

label2id = {label: i for i, label in enumerate(relations)}
id2label = {i: label for label, i in label2id.items()}

# 1. Load dataset from Hugging Face
ds = load_dataset("SemEvalWorkshop/sem_eval_2010_task_8")

# 2. Read training and test data
train_texts_all = ds["train"]["sentence"]  # Training texts
train_labels_all = ds["train"]["relation"]  # Training labels
test_texts = ds["test"]["sentence"]         # Test texts
test_labels = ds["test"]["relation"]        # Test labels

# Get the distribution of relation labels in the training and test sets and convert to label names using id2label
train_relation_counts = Counter([id2label[label] for label in train_labels_all])
test_relation_counts = Counter([id2label[label] for label in test_labels])
print("----------Original Datasets distribution-----------")
# Print relation label distribution
print("Training set relation distribution:")
for label, count in train_relation_counts.items():
    print(f"{label}: {count}")

print("\nTest set relation distribution:")
for label, count in test_relation_counts.items():
    print(f"{label}: {count}")

print("----------Splitted Datasets distribution-----------")
# Get the relation distribution in the training, validation, and test sets
train_relations = [item["relation"] for item in train_data]
valid_relations = [item["relation"] for item in valid_data]
test_relations = [item["relation"] for item in test_data]

# Count the occurrences of each label
train_relation_counts = Counter(train_relations)
valid_relation_counts = Counter(valid_relations)
test_relation_counts = Counter(test_relations)

# Print the distribution results
print("Training set relation distribution:")
print(train_relation_counts)

print("\nValidation set relation distribution:")
print(valid_relation_counts)

print("\nTest set relation distribution:")
print(test_relation_counts)

----------Original Datasets distribution-----------
Training set relation distribution:
Instrument-Agency(e2,e1): 471
Other: 1410
Entity-Destination(e2,e1): 407
Component-Whole(e1,e2): 78
Cause-Effect(e2,e1): 659
Content-Container(e1,e2): 844
Product-Producer(e1,e2): 374
Member-Collection(e1,e2): 490
Message-Topic(e2,e1): 394
Component-Whole(e2,e1): 612
Entity-Origin(e1,e2): 568
Cause-Effect(e1,e2): 344
Instrument-Agency(e1,e2): 470
Member-Collection(e2,e1): 144
Message-Topic(e1,e2): 323
Entity-Origin(e2,e1): 148
Product-Producer(e2,e1): 166
Entity-Destination(e1,e2): 97
Content-Container(e2,e1): 1

Test set relation distribution:
Member-Collection(e1,e2): 210
Message-Topic(e2,e1): 123
Entity-Destination(e2,e1): 134
Content-Container(e1,e2): 291
Cause-Effect(e2,e1): 194
Instrument-Agency(e1,e2): 162
Message-Topic(e1,e2): 108
Component-Whole(e2,e1): 201
Other: 454
Entity-Origin(e1,e2): 211
Product-Producer(e1,e2): 153
Entity-Origin(e2,e1): 47
Cause-Effect(e1,e2): 134
Instrument-Agency(e

In [6]:
import json

def load_local_data(file_path):
    """Load JSON data locally (single JSON object)"""
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return data

# List of dataset files
file_paths = ["./data/train.json", "./data/validation.json", "./data/test.json"]

# Count the number of samples in each dataset
for file_path in file_paths:
    data = load_json_lines(file_path)
    print(f"{file_path}: {len(data)} samples")


./data/train.json: 8000 samples
./data/validation.json: 1629 samples
./data/test.json: 1088 samples


---
# 2. Model Training

* **<span style="color:red">Att-BLSTM-CNN (Our improved model)</span>** is located in the directory `./model/att_blstm_cnn.py`.
* **<span style="color:blue">BLSTM (Baseline model)</span>** is located in the directory `./model/blstm.py`.
* **Att_BLSTM (Comparison model)** is located in the directory `./model/att_blstm.py`.
* **BLSTM_CNN (Comparison model)** is located in the directory `./model/blstm_cnn.py`.
* **Multi_Att_BLSTM (Comparison model)** is located in the directory `./model/multi_att_blstm.py`.

* **`run.py`**: This module is responsible for training and testing the models. It handles the main execution of the training pipeline, including model initialization, training, validation, and testing based on the provided configurations.

* **`config.py`**: This module contains the default settings for various parameters used in the models. It defines configuration options such as model architecture, learning rate, batch size, and other hyperparameters that are used throughout the training and evaluation process.

* **`evaluate.py`**: This module is used for evaluating the model's performance. It computes various evaluation metrics such as accuracy, precision, recall, F1-score, etc., to assess how well the trained model performs on the test dataset.

* **`utils.py`**: This module is responsible for data-related operations, including reading and loading datasets, preprocessing data, and handling file operations. It provides utility functions that support data management and loading for the training and evaluation processes.
* **Training Data CSV** will be stored in the directory `./output/<model_name>/training_data.csv`.
* **Test Data CSV** will be stored in the directory `./output/<model_name>/test_data.csv`.
* **Predicted Results** (in `predicted_result.txt`) will be stored in the directory `./output/<model_name>/predicted_result.txt`.
* **Trained Model** (in `model.pkl`) will be stored in the directory `./output/<model_name>/model.pkl`.


**<span style="color:red">Att-BLSTM-CNN (Our improved model) Structure : </span>**

In [None]:
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# @Version : Python 3.8

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import init
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

# Define the model class with BiLSTM, Attention, and CNN layers
class Att_BLSTM_CNN(nn.Module):
    def __init__(self, word_vec, class_num, config):
        super().__init__()
        self.word_vec = word_vec  # Pre-trained word embeddings
        self.class_num = class_num  # Number of output classes

        # Hyperparameters from the config
        self.max_len = config.max_len  # Maximum sequence length
        self.word_dim = config.word_dim  # Dimension of word embeddings
        self.hidden_size = config.hidden_size  # Hidden size of LSTM
        self.layers_num = config.layers_num  # Number of LSTM layers
        self.emb_dropout_value = config.emb_dropout  # Dropout for embedding layer
        self.lstm_dropout_value = config.lstm_dropout  # Dropout for LSTM layer
        self.linear_dropout_value = config.linear_dropout  # Dropout for fully connected layer
        self.cnn_filters = config.cnn_filters  # Number of filters in the CNN layers

        # Embedding Layer: Using pre-trained word embeddings for initialization
        self.word_embedding = nn.Embedding.from_pretrained(
            embeddings=self.word_vec,
            freeze=False,  # Whether to fine-tune the word embeddings
        )

        # BiLSTM Layer: Bidirectional LSTM for sequence encoding
        self.lstm = nn.LSTM(
            input_size=self.word_dim,  # Input dimension (word embedding dimension)
            hidden_size=self.hidden_size,  # Hidden state size
            num_layers=self.layers_num,  # Number of LSTM layers
            bias=True,  # Whether to include bias terms
            batch_first=True,  # Batch comes first in the input tensor
            dropout=0,  # No dropout between LSTM layers
            bidirectional=True,  # Bidirectional LSTM
        )

        # CNN Layer: Convolutional layers with different kernel sizes
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=self.hidden_size, 
                      out_channels=self.cnn_filters, 
                      kernel_size=k, 
                      padding=k // 2) for k in [2, 3, 4]  # Kernel sizes 2, 3, and 4
        ])

        # Attention Layer: To compute a weighted sum of LSTM outputs
        self.tanh = nn.Tanh()  # Non-linear activation for attention scores
        self.att_weight = nn.Parameter(torch.randn(1, self.hidden_size, 1))  # Attention weights

        # Dropout Layers: Dropout for regularization
        self.emb_dropout = nn.Dropout(self.emb_dropout_value)
        self.lstm_dropout = nn.Dropout(self.lstm_dropout_value)
        self.linear_dropout = nn.Dropout(self.linear_dropout_value)

        # Fully Connected Layer: Output layer for classification
        total_features = self.hidden_size + self.cnn_filters * len(self.convs)
        self.dense = nn.Linear(total_features, self.class_num, bias=True)  # Final dense layer for classification

        # Weight Initialization: Xavier initialization for weights
        init.xavier_normal_(self.dense.weight)
        init.constant_(self.dense.bias, 0.)

    # LSTM Layer: Process the sequence using BiLSTM
    def lstm_layer(self, x, mask):
        lengths = torch.sum(mask.gt(0), dim=-1)  # Calculate sequence lengths
        lengths = lengths.cpu()  # Ensure lengths are on CPU to avoid errors
        x = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)  # Pack padded sequences
        h, (_, _) = self.lstm(x)  # Run through BiLSTM
        h, _ = pad_packed_sequence(h, batch_first=True, padding_value=0.0, total_length=self.max_len)  # Pad sequences back
        h = h.view(-1, self.max_len, 2, self.hidden_size)  # Reshape to include bidirectional outputs
        h = torch.sum(h, dim=2)  # Sum bidirectional outputs (forward + backward)
        return h

    # Attention Layer: Apply attention mechanism on the BiLSTM output
    def attention_layer(self, h, mask):
        att_weight = self.att_weight.expand(mask.shape[0], -1, -1)  # Expand attention weight to match batch size
        att_score = torch.bmm(self.tanh(h), att_weight)  # Compute attention scores for each timestep

        # Masking to ignore padding positions
        mask = mask.unsqueeze(dim=-1)  # Add an extra dimension to mask
        att_score = att_score.masked_fill(mask.eq(0), float('-inf'))  # Mask padding positions with negative infinity
        att_weight = F.softmax(att_score, dim=1)  # Apply softmax to get attention weights

        # Compute attention output (context vector)
        reps = torch.bmm(h.transpose(1, 2), att_weight).squeeze(dim=-1)  # Weighted sum of LSTM outputs
        reps = self.tanh(reps)  # Apply Tanh activation
        return reps

    # CNN Layer: Apply convolutions with multiple kernel sizes
    def cnn_layer(self, h):
        h = h.permute(0, 2, 1)  # Change shape to fit Conv1D input (B, H, L)
        cnn_outs = [F.relu(conv(h)) for conv in self.convs]  # Apply each convolution
        # Perform global max pooling on each convolution's output
        pooled_outs = [F.max_pool1d(cnn_out, kernel_size=cnn_out.size(2)).squeeze(2) for cnn_out in cnn_outs]
        cnn_features = torch.cat(pooled_outs, dim=1)  # Concatenate CNN outputs from different kernels
        return cnn_features

    # Forward Pass: The complete forward pass for the model
    def forward(self, data):
        token = data[:, 0, :].view(-1, self.max_len)  # Extract word indices
        mask = data[:, 1, :].view(-1, self.max_len)   # Extract mask (padding positions)

        # Embedding Layer
        emb = self.word_embedding(token)  # Convert word indices to embeddings
        emb = self.emb_dropout(emb)  # Apply dropout to embeddings

        # BiLSTM Layer
        h = self.lstm_layer(emb, mask)  # Process through BiLSTM
        h = self.lstm_dropout(h)  # Apply dropout to LSTM output

        # Attention Representation
        att_output = self.attention_layer(h, mask)  # Compute attention representation

        # CNN Representation
        cnn_output = self.cnn_layer(h)  # Get CNN features

        # Feature Fusion: Combine Attention and CNN features
        final_rep = torch.cat([att_output, cnn_output], dim=1)  # Concatenate features from attention and CNN layers

        # Fully Connected Layer
        final_rep = self.linear_dropout(final_rep)  # Apply dropout
        logits = self.dense(final_rep)  # Classify using fully connected layer

        return logits  # Return the classification output


## 2.1 Parameter Settings

In [7]:
!python run.py --help

usage: run.py [-h] [--data_dir DATA_DIR] [--output_dir OUTPUT_DIR]
              [--embedding_path EMBEDDING_PATH] [--word_dim WORD_DIM]
              [--model_name MODEL_NAME] [--mode {0,1}] [--seed SEED]
              [--cuda CUDA] [--epoch EPOCH] [--batch_size BATCH_SIZE]
              [--lr LR] [--max_len MAX_LEN] [--emb_dropout EMB_DROPOUT]
              [--lstm_dropout LSTM_DROPOUT] [--linear_dropout LINEAR_DROPOUT]
              [--hidden_size HIDDEN_SIZE] [--layers_num LAYERS_NUM]
              [--L2_decay L2_DECAY] [--cnn_filters CNN_FILTERS]

config for models

optional arguments:
  -h, --help            show this help message and exit
  --data_dir DATA_DIR   dir to load data
  --output_dir OUTPUT_DIR
                        dir to save output
  --embedding_path EMBEDDING_PATH
                        pre_trained word embedding
  --word_dim WORD_DIM   dimension of word embedding
  --model_name MODEL_NAME
                        model name
  --mode {0,1}          running mode: 

Here’s a list of the **configurable parameters** you can adjust when running the model:

---

### **1. Data Directories:**
- **`data_dir`**: The directory path to load the dataset.
  - Default: `./data`
  - Example: `--data_dir ./path/to/data`

- **`output_dir`**: The directory path where output (like trained models) will be saved.
  - Default: `./output`
  - Example: `--output_dir ./path/to/output`

---

### **2. Word Embedding Settings:**
- **`embedding_path`**: The path to the pre-trained word embedding file.
  - Default: `./embedding/glove.6B.100d.txt`
  - Example: `--embedding_path ./embedding/custom_embeddings.txt`

- **`word_dim`**: The dimensionality of the word embeddings.
  - Default: `100`
  - Example: `--word_dim 200`

---

### **3. CNN (Convolutional Neural Networks) Settings:**
- **`cnn_filters`**: The number of output channels in the CNN layer (i.e., the number of filters).
  - Default: `128`
  - Example: `--cnn_filters 256`

---

### **4. Model Configuration:**
- **`model_name`**: The name of the model to use.
  - Default: `None` (which defaults to `'Att_BLSTM_CNN'` if not specified)
  - Example: `--model_name 'BLSTM'`

- **`mode`**: Running mode. 
  - `1` for training (default)
  - `0` for testing
  - Example: `--mode 1`

- **`seed`**: Random seed for reproducibility.
  - Default: `5782`
  - Example: `--seed 42`

- **`cuda`**: The GPU device number to use for training (set to `-1` to use CPU).
  - Default: `0` (first GPU)
  - Example: `--cuda 1` (use second GPU)

- **`epoch`**: The maximum number of epochs to train.
  - Default: `20`
  - Example: `--epoch 50`

---

### **5. Hyperparameters:**
- **`batch_size`**: The batch size used during training.
  - Default: `10`
  - Example: `--batch_size 32`

- **`lr` (Learning Rate)**: The learning rate for the optimizer.
  - Default: `1.0`
  - Example: `--lr 0.001`

- **`max_len`**: The maximum length of sentences or input data.
  - Default: `100`
  - Example: `--max_len 150`

---

### **6. Dropout Settings:**
- **`emb_dropout`**: Dropout rate in the embedding layer.
  - Default: `0.3`
  - Example: `--emb_dropout 0.5`

- **`lstm_dropout`**: Dropout rate in the LSTM layers.
  - Default: `0.3`
  - Example: `--lstm_dropout 0.4`

- **`linear_dropout`**: Dropout rate in the linear layer.
  - Default: `0.5`
  - Example: `--linear_dropout 0.6`

---

### **7. LSTM Settings:**
- **`hidden_size`**: The number of hidden units in the LSTM layers.
  - Default: `100`
  - Example: `--hidden_size 200`

- **`layers_num`**: The number of stacked LSTM layers.
  - Default: `1`
  - Example: `--layers_num 2`

---

### **8. Regularization:**
- **`L2_decay`**: L2 regularization weight decay.
  - Default: `1e-5`
  - Example: `--L2_decay 1e-4`

---

### **Example Usage:**

To set up and run training with customized settings:

```bash
!python run.py --model_name='Att_BLSTM_CNN' --mode=1  --output_dir='./output' --epoch=20 --batch_size=10 --lr=1 --max_len=100 --hidden_size=100 --lstm_dropout=0.3 --emb_dropout=0.5
```
---

## 2.2 Model Training Display

To **train** the model, you need to set `--mode=1`. By default, in `config.py`, the mode is set to `1`, which is for **model training**.


**Make sure you have the appropriate dataset and configurations set up before starting the training process. The default parameters are defined in `config.py`.**

To run the training, use the following command:

```bash
!python run.py --model_name='Model Name' --mode=1
```

Here are the five algorithms available for training, which can be specified using the `--model_name` argument:

1. **<span style="color:red">Att_BLSTM_CNN (Our Improved model)</span>**: Attention-based Bidirectional LSTM with Convolutional Neural Networks
   ```bash
   --model_name=Att_BLSTM_CNN
   ```

2. **BLSTM**: Bidirectional LSTM model
   ```bash
   --model_name=BLSTM
   ```

3. **Att_BLSTM**: Attention-based Bidirectional LSTM model
   ```bash
   --model_name=Att_BLSTM
   ```

4. **BLSTM_CNN**: Bidirectional LSTM with Convolutional Neural Networks
   ```bash
   --model_name=BLSTM_CNN
   ```

5. **Multi_Att_BLSTM**: Multi-Attention Bidirectional LSTM model
   ```bash
   --model_name=Multi_Att_BLSTM
   ```
Now, for model **training**, replace the `--model_name='Model Name'` with the desired model from the above list. Ensure you set `--mode=1` for training.

### Model Training and Testing Code

In [None]:
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# @Version : Python 3.8

import os
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd

from config import Config
from utils import WordEmbeddingLoader, RelationLoader, SemEvalDataLoader
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
from model.att_blstm import Att_BLSTM
from model.blstm import BLSTM
from model.blstm_cnn import BLSTM_CNN
from model.multi_att_blstm import Multi_Att_BLSTM
from model.att_blstm_cnn import Att_BLSTM_CNN
from evaluate import Eval


def print_result(predict_label, id2rel, start_idx=8001):
    # Save the predicted results to a text file
    output_path = os.path.join(config.model_dir, 'predicted_result.txt')
    with open(output_path, 'w', encoding='utf-8') as fw:
        for i in range(0, predict_label.shape[0]):
            # Write each prediction with the corresponding label ID to the file
            fw.write('{}\t{}\n'.format(
                start_idx+i, id2rel[int(predict_label[i])]))


def train(model, criterion, loader, config):
    # Unpack training, validation (dev), and testing data loaders
    train_loader, dev_loader, _ = loader
    optimizer = optim.Adadelta(
        model.parameters(), lr=config.lr, weight_decay=config.L2_decay)

    # Display the model structure
    print(model)
    print('Training model parameters:')
    for name, param in model.named_parameters():
        if param.requires_grad:
            print('%s :  %s' % (name, str(param.data.shape)))
    print('--------------------------------------')
    print('Starting to train the model ...')

    eval_tool = Eval(config)
    max_f1 = -float('inf')

    # Create directory for saving training metrics if it doesn't exist
    csv_path = os.path.join(config.model_dir, f'{config.model_name}_train_metrics.csv')
    if not os.path.exists(config.model_dir):
        os.makedirs(config.model_dir)

    # List to store training records
    records = []

    # Train for each epoch
    for epoch in range(1, config.epoch + 1):
        for step, (data, label) in enumerate(train_loader):
            model.train()  # Set the model to training mode
            data = data.to(config.device)  # Move data to the appropriate device (GPU/CPU)
            label = label.to(config.device)  # Move labels to the appropriate device

            optimizer.zero_grad()   # Clear gradients from previous step
            logits = model(data)  # Forward pass

            loss = criterion(logits, label)  # Calculate loss
            loss.backward()  # Backpropagation
            nn.utils.clip_grad_value_(model.parameters(), clip_value=5)  # Prevent exploding gradients
            optimizer.step()  # Update model parameters

        # Calculate metrics for training and validation data
        _, train_loss, _, _, _, _, _ = eval_tool.evaluate(model, criterion, train_loader)
        f1, eval_loss, _, micro_f1, precision, recall, accuracy = eval_tool.evaluate(model, criterion, dev_loader)

        # Display the metrics for this epoch
        print(f'[{epoch:03d}] train_loss: {train_loss:.3f} | '
              f'dev_loss: {eval_loss:.3f} | '
              f'micro f1 on dev: {micro_f1:.4f} | '
              f'Precision on dev: {precision:.4f} | '
              f'Recall on dev: {recall:.4f} | '
              f'Accuracy on dev: {accuracy:.4f} | '
              f'Macro F1 on dev: {f1:.4f}', end=' ')

        # Store metrics in the records list for later saving to CSV
        records.append([
            epoch, train_loss, eval_loss, micro_f1, precision, recall, accuracy, f1
        ])

        # Save the model if it achieves a better F1 score
        if f1 > max_f1:
            max_f1 = f1
            torch.save(model.state_dict(), os.path.join(config.model_dir, 'model.pkl'))
            print('>>> Model saved!')

        else:
            print()

    # Save the training metrics to a CSV file
    df = pd.DataFrame(records, columns=['Epoch', 'Train Loss', 'Dev Loss', 'Micro F1', 'Precision', 'Recall', 'Accuracy', 'Macro F1'])
    df.to_csv(csv_path, index=False)
    print(f'Training metrics saved to {csv_path}')


def test(model, criterion, loader, config):
    # Start testing after training is complete
    print('--------------------------------------')
    print('Start testing ...')

    _, _, test_loader = loader
    model.load_state_dict(torch.load(os.path.join(config.model_dir, 'model.pkl')))  # Load the saved model
    eval_tool = Eval(config)

    # Evaluate the model on the test set
    f1, test_loss, predict_label, micro_f1, precision, recall, accuracy = eval_tool.evaluate(
        model, criterion, test_loader)

    # Display the test results
    print(f'test_loss: {test_loss:.3f} | '
          f'Micro F1 on test: {micro_f1:.4f} | '
          f'Precision on test: {precision:.4f} | '
          f'Recall on test: {recall:.4f} | '
          f'Accuracy on test: {accuracy:.4f} | '
          f'Macro F1 on test: {f1:.4f}')

    # Append test results to CSV file
    csv_path = os.path.join(config.model_dir, f'{config.model_name}_test_metrics.csv')
    test_record = pd.DataFrame([['Test', None, test_loss, micro_f1, precision, recall, accuracy, f1]], columns=['Epoch', 'Train Loss', 'Dev Loss', 'Micro F1', 'Precision', 'Recall', 'Accuracy', 'Macro F1'])
    test_record.to_csv(csv_path, mode='a', header=False, index=False)
    print(f'Test metrics appended to {csv_path}')

    return predict_label


if __name__ == '__main__':
    config = Config()
    print('--------------------------------------')
    print('some config:')
    config.print_config()

    print('--------------------------------------')
    print('start to load data ...')
    # Load word embeddings and relations data
    word2id, word_vec = WordEmbeddingLoader(config).load_embedding()
    rel2id, id2rel, class_num = RelationLoader(config).get_relation()
    loader = SemEvalDataLoader(rel2id, word2id, config)

    # Initialize loaders for train, dev, and test datasets
    train_loader, dev_loader = None, None
    if config.mode == 1:  # If in training mode
        train_loader = loader.get_train()
        dev_loader = loader.get_dev()
    test_loader = loader.get_test()  # Get test data loader
    loader = [train_loader, dev_loader, test_loader]
    print('finish!')

    print('--------------------------------------')
    # Initialize the model based on the configuration
    if config.model_name == 'BLSTM':
        model = BLSTM(word_vec=word_vec, class_num=class_num, config=config)
    elif config.model_name == 'Att_BLSTM':
        model = Att_BLSTM(word_vec=word_vec, class_num=class_num, config=config)
    elif config.model_name == 'BLSTM_CNN':
        model = BLSTM_CNN(word_vec=word_vec, class_num=class_num, config=config)
    elif config.model_name == 'Att_BLSTM_CNN':
        model = Att_BLSTM_CNN(word_vec=word_vec, class_num=class_num, config=config)
    elif config.model_name == 'Multi_Att_BLSTM':
        model = Multi_Att_BLSTM(word_vec=word_vec, class_num=class_num, config=config)

    # Move model to the appropriate device (GPU/CPU)
    model = model.to(config.device)
    criterion = nn.CrossEntropyLoss()

    # Training mode
    if config.mode == 1:
        train(model, criterion, loader, config)

    # Testing mode
    predict_label = test(model, criterion, loader, config)
    print_result(predict_label, id2rel)  # Save predicted labels

### 2.2.1 **<span style="color:red">Attention-Based BiLSTM with CNN (Att-BiLSTM-CNN)(Our Improved model)</span>**

In [1]:
!python run.py --model_name=Att_BLSTM_CNN --mode=1 

--------------------------------------
some config:
data_dir = ./data
output_dir = ./output
embedding_path = ./embedding/glove.6B.100d.txt
word_dim = 100
cnn_filters = 128
model_name = Att_BLSTM_CNN
mode = 1
seed = 5782
cuda = 0
epoch = 20
batch_size = 10
lr = 1.0
max_len = 100
emb_dropout = 0.3
lstm_dropout = 0.3
linear_dropout = 0.5
hidden_size = 100
layers_num = 1
L2_decay = 1e-05
device = cuda:0
model_dir = ./output/Att_BLSTM_CNN
--------------------------------------
start to load data ...
finish!
--------------------------------------
Att_BLSTM_CNN(
  (word_embedding): Embedding(400006, 100)
  (lstm): LSTM(100, 100, batch_first=True, bidirectional=True)
  (convs): ModuleList(
    (0): Conv1d(100, 128, kernel_size=(2,), stride=(1,), padding=(1,))
    (1): Conv1d(100, 128, kernel_size=(3,), stride=(1,), padding=(1,))
    (2): Conv1d(100, 128, kernel_size=(4,), stride=(1,), padding=(2,))
  )
  (tanh): Tanh()
  (emb_dropout): Dropout(p=0.3, inplace=False)
  (lstm_dropout): Dropout(p=

### 2.2.2 <span style="color:blue">Bidirectional LSTM (BLSTM) (Baseline model)</span>

In [2]:
!python run.py --model_name=BLSTM --mode=1

--------------------------------------
some config:
data_dir = ./data
output_dir = ./output
embedding_path = ./embedding/glove.6B.100d.txt
word_dim = 100
cnn_filters = 128
model_name = BLSTM
mode = 1
seed = 5782
cuda = 0
epoch = 20
batch_size = 10
lr = 1.0
max_len = 100
emb_dropout = 0.3
lstm_dropout = 0.3
linear_dropout = 0.5
hidden_size = 100
layers_num = 1
L2_decay = 1e-05
device = cuda:0
model_dir = ./output/BLSTM
--------------------------------------
start to load data ...
finish!
--------------------------------------
BLSTM(
  (word_embedding): Embedding(400006, 100)
  (lstm): LSTM(100, 100, batch_first=True, bidirectional=True)
  (emb_dropout): Dropout(p=0.3, inplace=False)
  (lstm_dropout): Dropout(p=0.3, inplace=False)
  (linear_dropout): Dropout(p=0.5, inplace=False)
  (dense): Linear(in_features=200, out_features=19, bias=True)
)
Training model parameters:
word_embedding.weight :  torch.Size([400006, 100])
lstm.weight_ih_l0 :  torch.Size([400, 100])
lstm.weight_hh_l0 :  tor

### 2.2.3 Attention-based Bidirectional LSTM (Att_BLSTM) (Comparison model)

In [3]:
!python run.py --model_name=Att_BLSTM --mode=1

--------------------------------------
some config:
data_dir = ./data
output_dir = ./output
embedding_path = ./embedding/glove.6B.100d.txt
word_dim = 100
cnn_filters = 128
model_name = Att_BLSTM
mode = 1
seed = 5782
cuda = 0
epoch = 20
batch_size = 10
lr = 1.0
max_len = 100
emb_dropout = 0.3
lstm_dropout = 0.3
linear_dropout = 0.5
hidden_size = 100
layers_num = 1
L2_decay = 1e-05
device = cuda:0
model_dir = ./output/Att_BLSTM
--------------------------------------
start to load data ...
finish!
--------------------------------------
Att_BLSTM(
  (word_embedding): Embedding(400006, 100)
  (lstm): LSTM(100, 100, batch_first=True, bidirectional=True)
  (tanh): Tanh()
  (emb_dropout): Dropout(p=0.3, inplace=False)
  (lstm_dropout): Dropout(p=0.3, inplace=False)
  (linear_dropout): Dropout(p=0.5, inplace=False)
  (dense): Linear(in_features=100, out_features=19, bias=True)
)
Training model parameters:
att_weight :  torch.Size([1, 100, 1])
word_embedding.weight :  torch.Size([400006, 100])
l

### 2.2.4 Bidirectional LSTM with Convolutional Neural Networks (BLSTM_CNN) (Comparison model)

In [4]:
!python run.py --model_name=BLSTM_CNN --mode=1

--------------------------------------
some config:
data_dir = ./data
output_dir = ./output
embedding_path = ./embedding/glove.6B.100d.txt
word_dim = 100
cnn_filters = 128
model_name = BLSTM_CNN
mode = 1
seed = 5782
cuda = 0
epoch = 20
batch_size = 10
lr = 1.0
max_len = 100
emb_dropout = 0.3
lstm_dropout = 0.3
linear_dropout = 0.5
hidden_size = 100
layers_num = 1
L2_decay = 1e-05
device = cuda:0
model_dir = ./output/BLSTM_CNN
--------------------------------------
start to load data ...
finish!
--------------------------------------
BLSTM_CNN(
  (word_embedding): Embedding(400006, 100)
  (lstm): LSTM(100, 100, batch_first=True, bidirectional=True)
  (conv1): Conv1d(200, 100, kernel_size=(3,), stride=(1,))
  (conv2): Conv1d(200, 100, kernel_size=(4,), stride=(1,))
  (conv3): Conv1d(200, 100, kernel_size=(5,), stride=(1,))
  (emb_dropout): Dropout(p=0.3, inplace=False)
  (lstm_dropout): Dropout(p=0.3, inplace=False)
  (linear_dropout): Dropout(p=0.5, inplace=False)
  (dense): Linear(in_f

### 2.2.5 Multi-Attention Bidirectional LSTM (Multi_Att_BLSTM) (Comparison model)

In [5]:
!python run.py --model_name=Multi_Att_BLSTM --mode=1

--------------------------------------
some config:
data_dir = ./data
output_dir = ./output
embedding_path = ./embedding/glove.6B.100d.txt
word_dim = 100
cnn_filters = 128
model_name = Multi_Att_BLSTM
mode = 1
seed = 5782
cuda = 0
epoch = 20
batch_size = 10
lr = 1.0
max_len = 100
emb_dropout = 0.3
lstm_dropout = 0.3
linear_dropout = 0.5
hidden_size = 100
layers_num = 1
L2_decay = 1e-05
device = cuda:0
model_dir = ./output/Multi_Att_BLSTM
--------------------------------------
start to load data ...
finish!
--------------------------------------
Multi_Att_BLSTM(
  (tanh): Tanh()
  (word_embedding): Embedding(400006, 100)
  (lstm): LSTM(100, 100, batch_first=True, bidirectional=True)
  (self_attention): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=100, out_features=100, bias=True)
  )
  (emb_dropout): Dropout(p=0.3, inplace=False)
  (lstm_dropout): Dropout(p=0.3, inplace=False)
  (linear_dropout): Dropout(p=0.5, inplace=False)
  (dense): Linear(in_featu

# 3. Model Testing

To test the model's performance, you need to set `--mode=0`. By default, in `config.py`, the mode is set to `1`, which is for model training.

**Before testing the model, make sure that the trained model is available in the appropriate directory.**

To run the test, use the following command:

```bash
!python run.py --model_name='Model Name' --mode=0
``` 


Here are the five algorithms available for testing, which can be specified using the `--model_name` argument:

1. **<span style="color:red">Att_BLSTM_CNN（Our Improved model)</span>**: Attention-based Bidirectional LSTM with Convolutional Neural Networks
   ```bash
   --model_name=Att_BLSTM_CNN
   ```

2. **BLSTM**: Bidirectional LSTM model
   ```bash
   --model_name=BLSTM
   ```

3. **Att_BLSTM**: Attention-based Bidirectional LSTM model
   ```bash
   --model_name=Att_BLSTM
   ```

4. **BLSTM_CNN**: Bidirectional LSTM with Convolutional Neural Networks
   ```bash
   --model_name=BLSTM_CNN
   ```

5. **Multi_Att_BLSTM**: Multi-Attention Bidirectional LSTM model
   ```bash
   --model_name=Multi_Att_BLSTM
   ```

## 3.1 **<span style="color:red">Attention-Based BiLSTM with CNN (Att-BiLSTM-CNN) (Our Improved model)</span>**

In [3]:
!python run.py --model_name=Att_BLSTM_CNN --mode=0

--------------------------------------
some config:
data_dir = ./data
output_dir = ./output
embedding_path = ./embedding/glove.6B.100d.txt
word_dim = 100
cnn_filters = 128
model_name = Att_BLSTM_CNN
mode = 0
seed = 5782
cuda = 0
epoch = 20
batch_size = 10
lr = 1.0
max_len = 100
emb_dropout = 0.3
lstm_dropout = 0.3
linear_dropout = 0.5
hidden_size = 100
layers_num = 1
L2_decay = 1e-05
device = cuda:0
model_dir = ./output/Att_BLSTM_CNN
--------------------------------------
start to load data ...
finish!
--------------------------------------
--------------------------------------
Start testing ...
test_loss: 0.910 | Micro F1 on test: 0.7914 | Precision on test: 0.7242 | Recall on test: 0.7547 | Accuracy on test: 0.7914 | Macro F1 on test: 0.8302
Test metrics appended to ./output/Att_BLSTM_CNN/Att_BLSTM_CNN_test_metrics.csv


## 3.2 <span style="color:blue">Bidirectional LSTM (BLSTM)(Baseline model)</span>

In [4]:
!python run.py --model_name=BLSTM --mode=0

--------------------------------------
some config:
data_dir = ./data
output_dir = ./output
embedding_path = ./embedding/glove.6B.100d.txt
word_dim = 100
cnn_filters = 128
model_name = BLSTM
mode = 0
seed = 5782
cuda = 0
epoch = 20
batch_size = 10
lr = 1.0
max_len = 100
emb_dropout = 0.3
lstm_dropout = 0.3
linear_dropout = 0.5
hidden_size = 100
layers_num = 1
L2_decay = 1e-05
device = cuda:0
model_dir = ./output/BLSTM
--------------------------------------
start to load data ...
finish!
--------------------------------------
--------------------------------------
Start testing ...
test_loss: 1.036 | Micro F1 on test: 0.7757 | Precision on test: 0.7335 | Recall on test: 0.7080 | Accuracy on test: 0.7757 | Macro F1 on test: 0.8188
Test metrics appended to ./output/BLSTM/BLSTM_test_metrics.csv


## 3.3 Attention-based Bidirectional LSTM (Att_BLSTM) (Comparison model)

In [5]:
!python run.py --model_name=Att_BLSTM --mode=0

--------------------------------------
some config:
data_dir = ./data
output_dir = ./output
embedding_path = ./embedding/glove.6B.100d.txt
word_dim = 100
cnn_filters = 128
model_name = Att_BLSTM
mode = 0
seed = 5782
cuda = 0
epoch = 20
batch_size = 10
lr = 1.0
max_len = 100
emb_dropout = 0.3
lstm_dropout = 0.3
linear_dropout = 0.5
hidden_size = 100
layers_num = 1
L2_decay = 1e-05
device = cuda:0
model_dir = ./output/Att_BLSTM
--------------------------------------
start to load data ...
finish!
--------------------------------------
--------------------------------------
Start testing ...
test_loss: 0.870 | Micro F1 on test: 0.7803 | Precision on test: 0.7342 | Recall on test: 0.7333 | Accuracy on test: 0.7803 | Macro F1 on test: 0.8222
Test metrics appended to ./output/Att_BLSTM/Att_BLSTM_test_metrics.csv


## 3.4 Bidirectional LSTM with Convolutional Neural Networks (BLSTM_CNN) (Comparison model)

In [6]:
!python run.py --model_name=BLSTM_CNN --mode=0

--------------------------------------
some config:
data_dir = ./data
output_dir = ./output
embedding_path = ./embedding/glove.6B.100d.txt
word_dim = 100
cnn_filters = 128
model_name = BLSTM_CNN
mode = 0
seed = 5782
cuda = 0
epoch = 20
batch_size = 10
lr = 1.0
max_len = 100
emb_dropout = 0.3
lstm_dropout = 0.3
linear_dropout = 0.5
hidden_size = 100
layers_num = 1
L2_decay = 1e-05
device = cuda:0
model_dir = ./output/BLSTM_CNN
--------------------------------------
start to load data ...
finish!
--------------------------------------
--------------------------------------
Start testing ...
test_loss: 0.977 | Micro F1 on test: 0.7895 | Precision on test: 0.7222 | Recall on test: 0.7566 | Accuracy on test: 0.7895 | Macro F1 on test: 0.8287
Test metrics appended to ./output/BLSTM_CNN/BLSTM_CNN_test_metrics.csv


## 3.5 Multi-Attention Bidirectional LSTM (Multi_Att_BLSTM) (Comparison model)

In [7]:
!python run.py --model_name=Multi_Att_BLSTM --mode=0

--------------------------------------
some config:
data_dir = ./data
output_dir = ./output
embedding_path = ./embedding/glove.6B.100d.txt
word_dim = 100
cnn_filters = 128
model_name = Multi_Att_BLSTM
mode = 0
seed = 5782
cuda = 0
epoch = 20
batch_size = 10
lr = 1.0
max_len = 100
emb_dropout = 0.3
lstm_dropout = 0.3
linear_dropout = 0.5
hidden_size = 100
layers_num = 1
L2_decay = 1e-05
device = cuda:0
model_dir = ./output/Multi_Att_BLSTM
--------------------------------------
start to load data ...
finish!
--------------------------------------
--------------------------------------
Start testing ...
test_loss: 0.953 | Micro F1 on test: 0.7757 | Precision on test: 0.7188 | Recall on test: 0.7354 | Accuracy on test: 0.7757 | Macro F1 on test: 0.8189
Test metrics appended to ./output/Multi_Att_BLSTM/Multi_Att_BLSTM_test_metrics.csv


# 4. Prediction Demo

To evaluate the actual prediction ability of the trained model, we use `demo.py` to predict the relation between two entities in a single input sentence and output the result.

**Before predicting the entity relation in a new sentence, make sure that the trained model is available in the appropriate directory.**

To predict the relation, use the following command:

```bash
!python demo.py --model_name='Model Name'
```

If the content cannot be output in real-time due to buffering problems, you can use the following command:

```bash
%run demo.py --model_name='Model Name'
```


Here are the five algorithms available for predicting, which can be specified using the `--model_name` argument:

1. **<span style="color:red">Att_BLSTM_CNN（Our Improved model)</span>**: Attention-based Bidirectional LSTM with Convolutional Neural Networks
   ```bash
   --model_name=Att_BLSTM_CNN
   ```

2. **BLSTM**: Bidirectional LSTM model
   ```bash
   --model_name=BLSTM
   ```

3. **Att_BLSTM**: Attention-based Bidirectional LSTM model
   ```bash
   --model_name=Att_BLSTM
   ```

4. **BLSTM_CNN**: Bidirectional LSTM with Convolutional Neural Networks
   ```bash
   --model_name=BLSTM_CNN
   ```

5. **Multi_Att_BLSTM**: Multi-Attention Bidirectional LSTM model
   ```bash
   --model_name=Multi_Att_BLSTM
   ```
**Some test cases are provided below, and you can choose from these examples.**
* **Valid example:**
1. A child is told a `<e1>lie</e1>` for several years by their `<e2>parents</e2>` before he/she realizes that a Santa Claus does not exist.
   - **Expected Output**: `Agent-Patient(e2,e1)`
   
2. The disgusting scene was retaliation against her brother Philip who rents the `<e1>room</e1>` inside this apartment `<e2>house</e2>` on Lombard street.
   - **Expected Output**: `Agent-Patient(e2,e1)`
   
3. This `<e1>thesis</e1>` defines the `<e2>clinical characteristics</e2>` of amyloid disease.
   - **Expected Output**: `Agent-Theme(e1,e2)`
   
4. The `<e1>company</e1>` fabricates plastic `<e2>chairs</e2>`.
   - **Expected Output**: `Agent-Patient(e1,e2)`

* **InValid example:**
The school `<e1>master</e1>` teaches the lesson with a `<e2>stick</e2>`.
   - **Expected Output**: `Instrument-Agency(e2,e1)`
   - **Error**: *"The sentence structure does not align with the typical agent-patient or theme roles; the expected output should be Instrument-Agency(e2,e1)."*


## 4.1 **<span style="color:red">Attention-Based BiLSTM with CNN (Att-BiLSTM-CNN) (Our Improved model)</span>**

### Valid example

In [11]:
%run demo.py --model_name Att_BLSTM_CNN

--------------------------------------
start to load data ...
finish!
--------------------------------------
**********Please enter a sentence with <e1> and <e2> tags (type 'break' to exit)**********
Example: The system as described above has its greatest application in an arrayed <e1>configuration</e1> of antenna <e2>elements</e2>.

Test sentence: The <e1>company</e1> fabricates plastic <e2>chairs</e2>
Extracted entities: e1 = company, e2 = chairs
Predicted relation: Product-Producer(e2,e1)


**********Please enter a sentence with <e1> and <e2> tags (type 'break' to exit)**********
Example: The system as described above has its greatest application in an arrayed <e1>configuration</e1> of antenna <e2>elements</e2>.

Test sentence: break
Exiting the program. Goodbye!


**********Please enter a sentence with <e1> and <e2> tags (type 'break' to exit)**********
Example: The system as described above has its greatest application in an arrayed <e1>configuration</e1> of antenna <e2>elements</e2>.

Test sentence: The <e1>company</e1> fabricates plastic <e2>chairs</e2>


Extracted entities: e1 = company, e2 = chairs
Predicted relation: Product-Producer(e2,e1)




**********Please enter a sentence with <e1> and <e2> tags (type 'break' to exit)**********
Example: The system as described above has its greatest application in an arrayed <e1>configuration</e1> of antenna <e2>elements</e2>.

Test sentence: break


Exiting the program. Goodbye!


### invalid example

In [8]:
%run demo.py --model_name Att_BLSTM_CNN

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


--------------------------------------
Starting data loading process...
Data loading complete!
--------------------------------------


********** Please enter a sentence containing <e1> and <e2> tags (type 'break' to exit) **********
Example: The system as described above has its greatest application in an arrayed <e1>configuration</e1> of antenna <e2>elements</e2>.

Test sentence:  The school <e1>master</e1> teaches the lesson with a <e2>stick<e2>.


Input format error. Please ensure the sentence contains both <e1> and <e2> tags.


********** Please enter a sentence containing <e1> and <e2> tags (type 'break' to exit) **********
Example: The system as described above has its greatest application in an arrayed <e1>configuration</e1> of antenna <e2>elements</e2>.

Test sentence:  break


Exiting the program. Goodbye!


Extracted entities: e1 = master, e2 = stick
Predicted relation: Instrument-Agency(e2,e1)




**********Please enter a sentence with <e1> and <e2> tags (type 'break' to exit)**********
Example: The system as described above has its greatest application in an arrayed <e1>configuration</e1> of antenna <e2>elements</e2>.

Test sentence: break


Exiting the program. Goodbye!


## 4.2 Bidirectional LSTM (BLSTM)(Baseline model)

In [15]:
%run demo.py --model_name BLSTM

--------------------------------------
start to load data ...
finish!
--------------------------------------
**********Please enter a sentence with <e1> and <e2> tags (type 'break' to exit)**********
Example: The system as described above has its greatest application in an arrayed <e1>configuration</e1> of antenna <e2>elements</e2>.

Test sentence: The <e1>company</e1> fabricates plastic <e2>chairs</e2>
Extracted entities: e1 = company, e2 = chairs
Predicted relation: Product-Producer(e2,e1)


**********Please enter a sentence with <e1> and <e2> tags (type 'break' to exit)**********
Example: The system as described above has its greatest application in an arrayed <e1>configuration</e1> of antenna <e2>elements</e2>.

Test sentence: break
Exiting the program. Goodbye!


**********Please enter a sentence with <e1> and <e2> tags (type 'break' to exit)**********
Example: The system as described above has its greatest application in an arrayed <e1>configuration</e1> of antenna <e2>elements</e2>.

Test sentence: The <e1>company</e1> fabricates plastic <e2>chairs</e2>


Extracted entities: e1 = company, e2 = chairs
Predicted relation: Product-Producer(e2,e1)




**********Please enter a sentence with <e1> and <e2> tags (type 'break' to exit)**********
Example: The system as described above has its greatest application in an arrayed <e1>configuration</e1> of antenna <e2>elements</e2>.

Test sentence: break


Exiting the program. Goodbye!


## 4.3 Attention-based Bidirectional LSTM (Att_BLSTM)

In [1]:
%run demo.py --model_name Att_BLSTM

--------------------------------------
start to load data ...
finish!
--------------------------------------
**********Please enter a sentence with <e1> and <e2> tags (type 'break' to exit)**********
Example: The system as described above has its greatest application in an arrayed <e1>configuration</e1> of antenna <e2>elements</e2>.

Test sentence: The <e1>company</e1> fabricates plastic <e2>chairs</e2>
Extracted entities: e1 = company, e2 = chairs
Predicted relation: Product-Producer(e2,e1)


**********Please enter a sentence with <e1> and <e2> tags (type 'break' to exit)**********
Example: The system as described above has its greatest application in an arrayed <e1>configuration</e1> of antenna <e2>elements</e2>.

Test sentence: break
Exiting the program. Goodbye!


**********Please enter a sentence with <e1> and <e2> tags (type 'break' to exit)**********
Example: The system as described above has its greatest application in an arrayed <e1>configuration</e1> of antenna <e2>elements</e2>.

Test sentence: The <e1>company</e1> fabricates plastic <e2>chairs</e2>


Extracted entities: e1 = company, e2 = chairs
Predicted relation: Product-Producer(e2,e1)




**********Please enter a sentence with <e1> and <e2> tags (type 'break' to exit)**********
Example: The system as described above has its greatest application in an arrayed <e1>configuration</e1> of antenna <e2>elements</e2>.

Test sentence: break


Exiting the program. Goodbye!


## 4.4 Bidirectional LSTM with Convolutional Neural Networks (BLSTM_CNN)

In [2]:
%run demo.py --model_name BLSTM_CNN

--------------------------------------
start to load data ...
finish!
--------------------------------------
**********Please enter a sentence with <e1> and <e2> tags (type 'break' to exit)**********
Example: The system as described above has its greatest application in an arrayed <e1>configuration</e1> of antenna <e2>elements</e2>.

Test sentence: The disgusting scene was retaliation against her brother Philip who rents the <e1>room</e1> inside this apartment <e2>house</e2> on Lombard street.
Extracted entities: e1 = room, e2 = house
Predicted relation: Component-Whole(e1,e2)


**********Please enter a sentence with <e1> and <e2> tags (type 'break' to exit)**********
Example: The system as described above has its greatest application in an arrayed <e1>configuration</e1> of antenna <e2>elements</e2>.

Test sentence: This <e1>thesis</e1> defines the <e2>clinical characteristics</e2> of amyloid disease.
Extracted entities: e1 = thesis, e2 = clinical characteristics
Predicted relation: Me

**********Please enter a sentence with <e1> and <e2> tags (type 'break' to exit)**********
Example: The system as described above has its greatest application in an arrayed <e1>configuration</e1> of antenna <e2>elements</e2>.

Test sentence: The disgusting scene was retaliation against her brother Philip who rents the <e1>room</e1> inside this apartment <e2>house</e2> on Lombard street.


Extracted entities: e1 = room, e2 = house
Predicted relation: Component-Whole(e1,e2)




**********Please enter a sentence with <e1> and <e2> tags (type 'break' to exit)**********
Example: The system as described above has its greatest application in an arrayed <e1>configuration</e1> of antenna <e2>elements</e2>.

Test sentence: This <e1>thesis</e1> defines the <e2>clinical characteristics</e2> of amyloid disease.


Extracted entities: e1 = thesis, e2 = clinical characteristics
Predicted relation: Message-Topic(e1,e2)




**********Please enter a sentence with <e1> and <e2> tags (type 'break' to exit)**********
Example: The system as described above has its greatest application in an arrayed <e1>configuration</e1> of antenna <e2>elements</e2>.

Test sentence: A child is told a <e1>lie</e1> for several years by their <e2>parents</e2> before he/she realizes that a Santa Claus does not exist.


Extracted entities: e1 = lie, e2 = parents
Predicted relation: Product-Producer(e1,e2)




**********Please enter a sentence with <e1> and <e2> tags (type 'break' to exit)**********
Example: The system as described above has its greatest application in an arrayed <e1>configuration</e1> of antenna <e2>elements</e2>.

Test sentence: break


Exiting the program. Goodbye!


## 4.5 Multi-Attention Bidirectional LSTM (Multi_Att_BLSTM)

In [7]:
%run demo.py --model_name Multi_Att_BLSTM

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


--------------------------------------
Starting data loading process...
Data loading complete!
--------------------------------------


********** Please enter a sentence containing <e1> and <e2> tags (type 'break' to exit) **********
Example: The system as described above has its greatest application in an arrayed <e1>configuration</e1> of antenna <e2>elements</e2>.

Test sentence:  A child is told a <e1>lie</e1> for several years by their <e2>parents</e2> before he/she realizes that a Santa Claus does not exist.


Extracted entities: e1 = lie, e2 = parents
Predicted relation: Product-Producer(e1,e2)




********** Please enter a sentence containing <e1> and <e2> tags (type 'break' to exit) **********
Example: The system as described above has its greatest application in an arrayed <e1>configuration</e1> of antenna <e2>elements</e2>.

Test sentence:  break


Exiting the program. Goodbye!
