In [1]:
!pip install nltk
!pip install scikit-learn
!pip install datasets 
!pip install pandas



In [2]:
import nltk

# word_tokenize (used during preprocessing) needs the Punkt tokenizer data.
# Older NLTK releases load the 'punkt' package, newer ones look up 'punkt_tab';
# nltk is installed unpinned above, so fetch both to work with either.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# 1. Original Dataset
Loading Original Dataset from Hugging Face

In [3]:
from datasets import load_dataset

# Load the SemEval-2010 Task 8 dataset by its Hugging Face repository id
ds = load_dataset("SemEvalWorkshop/sem_eval_2010_task_8")

# Display the first few rows of the training set.
# Slicing a split yields a dict of columns: 'sentence' (text with <e1>/<e2>
# entity markers) and 'relation' (integer label id).
print(ds['train'][:5])  # This will display the first 5 examples

{'sentence': ['The system as described above has its greatest application in an arrayed <e1>configuration</e1> of antenna <e2>elements</e2>.', 'The <e1>child</e1> was carefully wrapped and bound into the <e2>cradle</e2> by means of a cord.', 'The <e1>author</e1> of a keygen uses a <e2>disassembler</e2> to look at the raw assembly code.', 'A misty <e1>ridge</e1> uprises from the <e2>surge</e2>.', 'The <e1>student</e1> <e2>association</e2> is the voice of the undergraduate student population of the State University of New York at Buffalo.'], 'relation': [3, 18, 11, 18, 12]}


# 2. Data Preprocess

The purpose of this code is to load and process the SemEval-2010 Task 8 dataset, preparing it for training, validation, and testing of a relationship classification model. 

1. **Loading Data:**
   - If local copies of the original training and test data (`original_train.json`, `original_test.json`) already exist, the code loads these files directly.
   - Otherwise, the code downloads the training and test data from the publicly available Hugging Face dataset `SemEvalWorkshop/sem_eval_2010_task_8` and saves them as these two files.

2. **Data Cleaning:**
   - **Cleaning Text**: The text is cleaned (HTML escape sequences are decoded, leading and trailing whitespace is removed, and empty sentences are dropped) to ensure uniform text formatting.
   - **Extracting Entities**: The entities between the `<e1>` and `<e2>` tags in the sentences are extracted using regular expressions, and the sentences are tokenized.

3. **Data Preprocessing:**
   - **Label Conversion**: Relation labels are converted from numeric IDs to human-readable relation types (e.g., “Cause-Effect(e1,e2)”).
   - **Handling Rare Relations**: Relations that appear only once in the original test set are set aside before the stratified split, and their single example is placed directly in the new test set.
   - **Data Splitting**: **The original test data is further split into a validation set and a test set with a ratio of 60% - 40%**, preserving the class distribution in both parts (stratified `train_test_split`).

4. **Saving Data:**
   - The cleaned and processed data is saved in JSON Lines format (one JSON object per line, in `train.json`, `validation.json`, and `test.json`) for easy use in training models later.


In [4]:
import json
import os
import html
import re
from collections import Counter
from sklearn.model_selection import train_test_split
from datasets import load_dataset
from nltk.tokenize import word_tokenize

# The 19 relation labels; a label's position in this list is its numeric id
relations = [
    "Cause-Effect(e1,e2)",
    "Cause-Effect(e2,e1)",
    "Component-Whole(e1,e2)",
    "Component-Whole(e2,e1)",
    "Content-Container(e1,e2)",
    "Content-Container(e2,e1)",
    "Entity-Destination(e1,e2)",
    "Entity-Destination(e2,e1)",
    "Entity-Origin(e1,e2)",
    "Entity-Origin(e2,e1)",
    "Instrument-Agency(e1,e2)",
    "Instrument-Agency(e2,e1)",
    "Member-Collection(e1,e2)",
    "Member-Collection(e2,e1)",
    "Message-Topic(e1,e2)",
    "Message-Topic(e2,e1)",
    "Product-Producer(e1,e2)",
    "Product-Producer(e2,e1)",
    "Other",
]

# Two-way lookup between numeric ids and label names
id2label = dict(enumerate(relations))
label2id = {name: idx for idx, name in id2label.items()}

# File names: raw copies of the downloaded splits, then the processed splits
original_train_file, original_test_file = "original_train.json", "original_test.json"
train_file, valid_file, test_file = "train.json", "validation.json", "test.json"

def load_json_lines(file_path):
    """Load a JSON Lines file into a list of parsed objects.

    Args:
        file_path: Path of a UTF-8 text file holding one JSON value per line.

    Returns:
        list: The parsed values, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a stray newline at the end of the file)
                # hold no record, and json.loads("") would raise on them.
                continue
            data.append(json.loads(line))
    return data
    
def load_local_data(file_path):
    """Parse a UTF-8 file that contains one single JSON document and return its value."""
    with open(file_path, "r", encoding="utf-8") as handle:
        return json.load(handle)
    
def save_local_data(data, file_path):
    """Write every entry of ``data`` as one JSON document per line (JSON Lines).

    The file is opened in write mode as UTF-8, and non-ASCII characters are
    written as-is rather than escaped.
    """
    with open(file_path, "w", encoding="utf-8") as out:
        out.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in data
        )

def clean_text(text):
    """Normalise one raw sentence, or return None when nothing usable remains.

    HTML character references are decoded and surrounding whitespace is removed.

    Args:
        text: Raw sentence. Any non-string value is treated as missing.

    Returns:
        The cleaned string, or None if ``text`` is not a string or is empty
        once decoded and stripped.
    """
    if not isinstance(text, str):
        return None  # Filter out missing / non-text values
    # Decode before testing for emptiness: input made only of entities such as
    # "&nbsp;" decodes to bare whitespace and must be filtered out as well.
    cleaned = html.unescape(text).strip()
    return cleaned or None

def search_entity(sentence):
    """Tokenize a marked-up sentence, keeping each entity marker as its own token.

    The text between <e1>...</e1> and <e2>...</e2> is located, the markers are
    padded with spaces, and the sentence is run through ``word_tokenize``.
    Markers that come back from the tokenizer in the split form '< e1 >' are
    then joined back together.

    Returns:
        list[str]: The tokens, including '<e1>', '</e1>', '<e2>' and '</e2>'.

    Raises:
        IndexError: If the sentence has no <e1>...</e1> or <e2>...</e2> pair.
        AssertionError: If any of the four markers is missing from the tokens.
    """
    # Both entities are read from the untouched input before anything is replaced.
    first_entity = re.findall(r'<e1>(.*)</e1>', sentence)[0]
    second_entity = re.findall(r'<e2>(.*)</e2>', sentence)[0]

    padded = sentence.replace(f'<e1>{first_entity}</e1>', f' <e1> {first_entity} </e1> ', 1)
    padded = padded.replace(f'<e2>{second_entity}</e2>', f' <e2> {second_entity} </e2> ', 1)

    rejoined = ' '.join(word_tokenize(padded))
    for split_form, marker in (
        ('< e1 >', '<e1>'),
        ('< e2 >', '<e2>'),
        ('< /e1 >', '</e1>'),
        ('< /e2 >', '</e2>'),
    ):
        rejoined = rejoined.replace(split_form, marker)
    tokens = rejoined.split()

    # Sanity check: every marker must have survived as a standalone token.
    for marker in ('<e1>', '<e2>', '</e1>', '</e2>'):
        assert marker in tokens

    return tokens
    
# Fetch the dataset from Hugging Face unless both raw copies from an earlier run
# are already on disk, in which case those copies are reused.
if not (os.path.exists(original_train_file) and os.path.exists(original_test_file)):
    print("Loading dataset from Hugging Face...")
    ds = load_dataset("SemEvalWorkshop/sem_eval_2010_task_8")

    train_texts_all, train_labels_all = ds["train"]["sentence"], ds["train"]["relation"]
    test_texts, test_labels = ds["test"]["sentence"], ds["test"]["relation"]

    # **Create original data with IDs** (training ids start at 1, test ids at 8001)
    train_data_with_ids = [
        {"id": row_id, "sentence": text, "relation": label}
        for row_id, (text, label) in enumerate(zip(train_texts_all, train_labels_all), start=1)
    ]
    test_data_with_ids = [
        {"id": row_id, "sentence": text, "relation": label}
        for row_id, (text, label) in enumerate(zip(test_texts, test_labels), start=8001)
    ]

    # **Save original data**
    save_local_data(train_data_with_ids, original_train_file)
    save_local_data(test_data_with_ids, original_test_file)
    print("Original train and test data saved locally!")
else:
    print("Original train and test data found locally. Loading...")
    train_data_with_ids = load_json_lines(original_train_file)
    test_data_with_ids = load_json_lines(original_test_file)

# **Step 2: Clean the data**
def _clean_records(records):
    """Return (cleaned sentence, relation, id) tuples, dropping rows whose sentence cleans to None."""
    kept = []
    for record in records:
        row = (clean_text(record["sentence"]), record["relation"], record["id"])
        if row[0] is not None:
            kept.append(row)
    return kept


clean_train_data = _clean_records(train_data_with_ids)
clean_test_data = _clean_records(test_data_with_ids)
        
# **Step 3: New data split**
# Tokenize every cleaned sentence of the original test set
test_data = [
    {"id": sample_id, "relation": relation_id, "sentence": search_entity(text), "comment": "N/A"}
    for text, relation_id, sample_id in clean_test_data
]

# Count how often each relation id occurs
labels = [record["relation"] for record in test_data]
relation_counts = Counter(labels)

# Relations with exactly one example are kept out of the stratified split
# (presumably because a stratified split cannot divide a single-sample class)
single_instance_relations = {rel for rel in relation_counts if relation_counts[rel] == 1}
single_instance_data = [record for record in test_data if relation_counts[record["relation"]] == 1]  # Directly into new_test
remaining_data = [record for record in test_data if relation_counts[record["relation"]] != 1]  # Used in train_test_split

# 60% validation / 40% test, stratified by relation; both stay empty when nothing is left to split
new_val_data, new_test_data_split = [], []
if remaining_data:
    remaining_labels = [record["relation"] for record in remaining_data]
    new_val_data, new_test_data_split = train_test_split(
        remaining_data,
        test_size=0.4,
        stratify=remaining_labels,
        random_state=42,
    )

# The single-example relations are appended to the test portion
new_test_data = new_test_data_split + single_instance_data
    
# **Step 4: Convert to final format**
# Every record ends up as {"id", "relation" (label name), "sentence" (token list)}.
# Label ids are looked up strictly for all three splits: an id missing from
# id2label raises KeyError instead of silently writing a null label to disk.
train_data = [
    {"id": sample_id, "relation": id2label[relation_id], "sentence": search_entity(text)}
    for text, relation_id, sample_id in clean_train_data
]


def _with_label_name(record):
    """Copy an already-tokenized record, replacing its relation id with the label name."""
    return {
        "id": record["id"],
        "relation": id2label[record["relation"]],
        "sentence": record["sentence"],
    }


valid_data = [_with_label_name(record) for record in new_val_data]
test_data = [_with_label_name(record) for record in new_test_data]
    
# **Step 5: Save the cleaned data**
for _rows, _path in ((train_data, train_file), (valid_data, valid_file), (test_data, test_file)):
    save_local_data(_rows, _path)

# Report the size of each split
print("Processed data saved!")
for _caption, _rows in (
    ("Training set size:", train_data),
    ("Validation set size:", new_val_data),
    ("Test set size:", new_test_data),
):
    print(_caption, len(_rows))


Original train and test data found locally. Loading...
Processed data saved!
Training set size: 8000
Validation set size: 1629
Test set size: 1088


# 3. Display Processed Datasets

## 3.1 Display JSON file 

In [5]:
import json

def load_json_lines(file_path):
    """Load a JSON Lines file (one JSON object per line) into a list.

    Args:
        file_path: Path of a UTF-8 text file holding one JSON value per line.

    Returns:
        list: The parsed values, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines hold no record, and json.loads("") would raise on them.
                continue
            data.append(json.loads(line))  # Parse each remaining line as a JSON object
    return data

# Example of loading data from a JSON Lines file (one JSON object per line)
file_path = 'train.json'  # Change this to your file path
data = load_json_lines(file_path)

# Print the first few records to inspect
print(data[:10])  # Display the first ten elements

[{'id': 1, 'relation': 'Component-Whole(e2,e1)', 'sentence': ['The', 'system', 'as', 'described', 'above', 'has', 'its', 'greatest', 'application', 'in', 'an', 'arrayed', '<e1>', 'configuration', '</e1>', 'of', 'antenna', '<e2>', 'elements', '</e2>', '.']}, {'id': 2, 'relation': 'Other', 'sentence': ['The', '<e1>', 'child', '</e1>', 'was', 'carefully', 'wrapped', 'and', 'bound', 'into', 'the', '<e2>', 'cradle', '</e2>', 'by', 'means', 'of', 'a', 'cord', '.']}, {'id': 3, 'relation': 'Instrument-Agency(e2,e1)', 'sentence': ['The', '<e1>', 'author', '</e1>', 'of', 'a', 'keygen', 'uses', 'a', '<e2>', 'disassembler', '</e2>', 'to', 'look', 'at', 'the', 'raw', 'assembly', 'code', '.']}, {'id': 4, 'relation': 'Other', 'sentence': ['A', 'misty', '<e1>', 'ridge', '</e1>', 'uprises', 'from', 'the', '<e2>', 'surge', '</e2>', '.']}, {'id': 5, 'relation': 'Member-Collection(e1,e2)', 'sentence': ['The', '<e1>', 'student', '</e1>', '<e2>', 'association', '</e2>', 'is', 'the', 'voice', 'of', 'the', 'u

## 3.2 Distribution of relations

In [6]:
from collections import Counter
from datasets import load_dataset

# Relation label names; a label's position in this list is its numeric id
relations = [
    "Cause-Effect(e1,e2)",
    "Cause-Effect(e2,e1)",
    "Component-Whole(e1,e2)",
    "Component-Whole(e2,e1)",
    "Content-Container(e1,e2)",
    "Content-Container(e2,e1)",
    "Entity-Destination(e1,e2)",
    "Entity-Destination(e2,e1)",
    "Entity-Origin(e1,e2)",
    "Entity-Origin(e2,e1)",
    "Instrument-Agency(e1,e2)",
    "Instrument-Agency(e2,e1)",
    "Member-Collection(e1,e2)",
    "Member-Collection(e2,e1)",
    "Message-Topic(e1,e2)",
    "Message-Topic(e2,e1)",
    "Product-Producer(e1,e2)",
    "Product-Producer(e2,e1)",
    "Other",
]

# Two-way lookup between numeric ids and label names
id2label = dict(enumerate(relations))
label2id = {name: idx for idx, name in id2label.items()}

# 1. Load dataset from Hugging Face
ds = load_dataset("SemEvalWorkshop/sem_eval_2010_task_8")

# 2. Read the sentence and label columns of both original splits
train_texts_all, train_labels_all = ds["train"]["sentence"], ds["train"]["relation"]
test_texts, test_labels = ds["test"]["sentence"], ds["test"]["relation"]

# Tally the labels of each split by name (numeric ids are translated through id2label)
train_relation_counts = Counter(id2label[label_id] for label_id in train_labels_all)
test_relation_counts = Counter(id2label[label_id] for label_id in test_labels)

print("----------Original Datasets distribution-----------")
# One "name: count" line per relation
print("Training set relation distribution:")
for label, count in train_relation_counts.items():
    print(label, count, sep=": ")

print("\nTest set relation distribution:")
for label, count in test_relation_counts.items():
    print(label, count, sep=": ")

print("----------Splitted Datasets distribution-----------")
# Collect the relation name of every record, one list per processed split
train_relations, valid_relations, test_relations = (
    [record["relation"] for record in split_records]
    for split_records in (train_data, valid_data, test_data)
)

# Tally each list
train_relation_counts, valid_relation_counts, test_relation_counts = map(
    Counter, (train_relations, valid_relations, test_relations)
)

# Heading and Counter on consecutive lines for each split
print("Training set relation distribution:", train_relation_counts, sep="\n")
print("\nValidation set relation distribution:", valid_relation_counts, sep="\n")
print("\nTest set relation distribution:", test_relation_counts, sep="\n")


----------Original Datasets distribution-----------
Training set relation distribution:
Component-Whole(e2,e1): 471
Other: 1410
Instrument-Agency(e2,e1): 407
Member-Collection(e1,e2): 78
Cause-Effect(e2,e1): 659
Entity-Destination(e1,e2): 844
Content-Container(e1,e2): 374
Message-Topic(e1,e2): 490
Product-Producer(e2,e1): 394
Member-Collection(e2,e1): 612
Entity-Origin(e1,e2): 568
Cause-Effect(e1,e2): 344
Component-Whole(e1,e2): 470
Message-Topic(e2,e1): 144
Product-Producer(e1,e2): 323
Entity-Origin(e2,e1): 148
Content-Container(e2,e1): 166
Instrument-Agency(e1,e2): 97
Entity-Destination(e2,e1): 1

Test set relation distribution:
Message-Topic(e1,e2): 210
Product-Producer(e2,e1): 123
Instrument-Agency(e2,e1): 134
Entity-Destination(e1,e2): 291
Cause-Effect(e2,e1): 194
Component-Whole(e1,e2): 162
Product-Producer(e1,e2): 108
Member-Collection(e2,e1): 201
Other: 454
Entity-Origin(e1,e2): 211
Content-Container(e1,e2): 153
Entity-Origin(e2,e1): 47
Cause-Effect(e1,e2): 134
Component-Whole(

In [7]:
import os

# Configuration class: used to store the data directory path
class Config:
    """Minimal settings holder exposing a single attribute, ``data_dir``."""

    def __init__(self, data_dir):
        # Path of the data directory, stored exactly as given
        self.data_dir = data_dir

# Relation loader class: used to read relation mappings
class RelationLoader(object):
    """Reads the relation <-> id mapping stored in ``relation2id.txt``.

    Args:
        config: Object exposing a ``data_dir`` attribute, the directory that
            contains ``relation2id.txt``.
    """

    def __init__(self, config):
        self.data_dir = config.data_dir  # Set the data directory path

    def __load_relation(self):
        """Private method: Load the relation-to-ID mapping.

        Each non-blank line of the file must hold a relation name and an
        integer id separated by whitespace.

        Returns:
            tuple: ``(rel2id, id2rel, n)`` where ``rel2id`` maps name -> id,
            ``id2rel`` maps id -> name and ``n`` is ``len(rel2id)``.

        Raises:
            ValueError: If a non-blank line does not consist of exactly two
                whitespace-separated fields, or the id is not an integer.
        """
        relation_file = os.path.join(self.data_dir, 'relation2id.txt')  # Path to the relation file
        rel2id = {}  # Dictionary mapping relations to IDs
        id2rel = {}  # Dictionary mapping IDs to relations
        with open(relation_file, 'r', encoding='utf-8') as fr:
            for line in fr:
                fields = line.split()
                if not fields:
                    # Skip blank lines (e.g. a trailing newline at the end of the
                    # file) instead of failing to unpack an empty field list.
                    continue
                relation, id_s = fields  # Relation name and corresponding ID
                id_d = int(id_s)  # Convert ID to an integer
                rel2id[relation] = id_d
                id2rel[id_d] = relation
        return rel2id, id2rel, len(rel2id)  # Return mappings and total number of relations

    def get_relation(self):
        """Retrieve relation mappings as ``(rel2id, id2rel, number_of_relations)``."""
        return self.__load_relation()

# Use RelationLoader to read relation2id.txt
# NOTE(review): the ids read from relation2id.txt are a separate mapping from the
# label2id built from the `relations` list earlier in this notebook (there "Other"
# is the last entry) — confirm which mapping downstream code expects.
config = Config(data_dir="./")  # Assume relation2id.txt is in the current directory
relation_loader = RelationLoader(config)
rel2id, id2rel, total_relations = relation_loader.get_relation()

# Print results: both mapping directions and the number of relations read
print("Relation to ID mapping:", rel2id)
print("ID to Relation mapping:", id2rel)
print("Total number of relations:", total_relations)


Relation to ID mapping: {'Other': 0, 'Cause-Effect(e1,e2)': 1, 'Cause-Effect(e2,e1)': 2, 'Component-Whole(e1,e2)': 3, 'Component-Whole(e2,e1)': 4, 'Content-Container(e1,e2)': 5, 'Content-Container(e2,e1)': 6, 'Entity-Destination(e1,e2)': 7, 'Entity-Destination(e2,e1)': 8, 'Entity-Origin(e1,e2)': 9, 'Entity-Origin(e2,e1)': 10, 'Instrument-Agency(e1,e2)': 11, 'Instrument-Agency(e2,e1)': 12, 'Member-Collection(e1,e2)': 13, 'Member-Collection(e2,e1)': 14, 'Message-Topic(e1,e2)': 15, 'Message-Topic(e2,e1)': 16, 'Product-Producer(e1,e2)': 17, 'Product-Producer(e2,e1)': 18}
ID to Relation mapping: {0: 'Other', 1: 'Cause-Effect(e1,e2)', 2: 'Cause-Effect(e2,e1)', 3: 'Component-Whole(e1,e2)', 4: 'Component-Whole(e2,e1)', 5: 'Content-Container(e1,e2)', 6: 'Content-Container(e2,e1)', 7: 'Entity-Destination(e1,e2)', 8: 'Entity-Destination(e2,e1)', 9: 'Entity-Origin(e1,e2)', 10: 'Entity-Origin(e2,e1)', 11: 'Instrument-Agency(e1,e2)', 12: 'Instrument-Agency(e2,e1)', 13: 'Member-Collection(e1,e2)', 14