In [1]:
# Step 1: Install dependencies (only required once)
!pip install transformers datasets torch scikit-learn matplotlib tqdm nltk
!pip install SPARQLWrapper

# Step 2: Import necessary libraries
import os
import re
import torch
import pandas as pd
import nltk
from nltk.corpus import wordnet
from datasets import load_dataset
from tqdm import tqdm
from SPARQLWrapper import SPARQLWrapper, JSON

# Fetch the WordNet corpus used by the definition lookups later in the notebook
nltk.download("wordnet")

# Step 3: Pick the compute device, preferring CUDA whenever it is available
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Current device: {device}")

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [2]:
# Step 4: Load the SemEval-2010 Task 8 dataset and convert both splits
# to pandas DataFrames in one pass.
dataset = load_dataset("sem_eval_2010_task_8")
df_train, df_test = (dataset[split].to_pandas() for split in ("train", "test"))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/8.23k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/673k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/231k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2717 [00:00<?, ? examples/s]

In [3]:
# Step 5: Initialize the in-memory cache for Wikidata queries.
# Keys are entity strings; values are the formatted result strings that
# query_wikidata stores after a successful lookup.
wikidata_cache = dict()

# Step 6: Extract entities from sentences
def extract_entities(sentence):
    """Return the text wrapped by the <e1> and <e2> tags of a sentence.

    Args:
        sentence: Sentence string that may contain <e1>...</e1> and
            <e2>...</e2> markup.

    Returns:
        A 2-tuple ``(entity1, entity2)``. A tag that is not found
        contributes an empty string.
    """
    patterns = (r"<e1>(.*?)</e1>", r"<e2>(.*?)</e2>")
    found = []
    for pattern in patterns:
        hit = re.search(pattern, sentence)
        # Only the first (non-greedy) occurrence of each tag is used.
        found.append("" if hit is None else hit.group(1))
    return found[0], found[1]

# Step 7: Retrieve entity definitions from WordNet
def get_wordnet_definition(entity):
    """Return the definition of the first WordNet synset for an entity.

    Args:
        entity: Surface form of the entity (may contain several words).

    Returns:
        The definition string of the first synset, or "N/A" when the entity
        is empty/blank or WordNet has no synset for it.
    """
    # WordNet indexes multi-word lemmas with underscores ("ice_cream"), so a
    # raw "ice cream" lookup would find nothing. Collapse whitespace runs into
    # single underscores; single-word entities are passed through unchanged.
    lemma = "_".join(entity.split())
    if not lemma:
        # A missing <e1>/<e2> tag yields "", which can never match a lemma.
        return "N/A"

    synsets = wordnet.synsets(lemma)
    return synsets[0].definition() if synsets else "N/A"

# Step 8: Query Wikidata for entity relationships
def query_wikidata(entity):
    """Look up a few Wikidata statements for an entity label, with caching.

    The entity is matched against English ``rdfs:label`` values and at most
    five (relation, related entity) pairs are returned.

    Args:
        entity: English label to search for.

    Returns:
        A string of ``"<relation>: <related entity>"`` pairs joined by "; ",
        or "No data" when the entity is empty, nothing matches, or the request
        fails. Successful lookups (including empty result sets) are stored in
        ``wikidata_cache``; failed requests are not cached, so a later call
        retries them.
    """
    import logging

    if entity in wikidata_cache:
        return wikidata_cache[entity]  # Return cached results

    # A blank entity (e.g. a sentence without the expected tag) cannot match
    # any label, so skip the network round trip entirely.
    if not entity or not entity.strip():
        return "No data"

    # Escape characters that would terminate or corrupt the SPARQL string
    # literal; without this a quote or backslash in the entity breaks the query.
    escaped = (
        entity.replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )

    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")

    # NOTE(review): ?relation is bound to a predicate URI; whether the label
    # service yields a human-readable ?relationLabel for it should be confirmed.
    query = f"""
    SELECT ?relationLabel ?entityLabel WHERE {{
      ?entity rdfs:label "{escaped}"@en.
      ?entity ?relation ?relatedEntity.
      ?relatedEntity rdfs:label ?entityLabel.
      FILTER (LANG(?entityLabel) = "en")
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }} LIMIT 5
    """
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)

    try:
        results = sparql.query().convert()
        relations = [
            f"{result['relationLabel']['value']}: {result['entityLabel']['value']}"
            for result in results["results"]["bindings"]
        ]

        result_str = "; ".join(relations) if relations else "No data"
        wikidata_cache[entity] = result_str  # Cache result
        return result_str
    except Exception as exc:
        # Stay best-effort (the pipeline must not crash on one bad lookup), but
        # surface the failure so throttling or malformed responses are visible
        # instead of silently looking like "No data".
        logging.getLogger(__name__).warning(
            "Wikidata lookup failed for %r: %s", entity, exc
        )
        return "No data"

# Step 9: Augment sentence with WordNet and Wikidata information
def enhance_sentence_with_knowledge(sentence):
    """Append WordNet and Wikidata context for both tagged entities.

    Args:
        sentence: Sentence containing <e1>/<e2> entity markup.

    Returns:
        The original sentence followed by one " [SEP] "-separated segment per
        entity of the form "<entity>: <definition> [KG: <relations>]".
    """
    first, second = extract_entities(sentence)

    # WordNet glosses are looked up for both entities first ...
    first_gloss, second_gloss = [get_wordnet_definition(m) for m in (first, second)]

    # ... then the (cached) Wikidata relations, in the same entity order.
    first_facts, second_facts = [query_wikidata(m) for m in (first, second)]

    head = f"{sentence} [SEP] {first}: {first_gloss} [KG: {first_facts}] "
    tail = f"[SEP] {second}: {second_gloss} [KG: {second_facts}]"
    return head + tail

# Step 10: Apply enhancements to both splits (tqdm supplies the progress bars)
tqdm.pandas()

for banner, frame in (
    ("Processing training dataset...", df_train),
    ("Processing test dataset...", df_test),
):
    print(banner)
    frame["enhanced_sentence"] = frame["sentence"].progress_apply(
        enhance_sentence_with_knowledge
    )

# Step 11: Build the label <-> ID mappings from the relations seen in training
unique_relations = sorted(df_train["relation"].unique())
label2id = dict(zip(unique_relations, range(len(unique_relations))))
id2label = dict(enumerate(unique_relations))

# The test split is mapped with the training-derived table as well.
for frame in (df_train, df_test):
    frame["label_id"] = frame["relation"].map(label2id)

print("Data preprocessing completed!")

Processing training dataset...


100%|██████████| 8000/8000 [12:31<00:00, 10.65it/s]


Processing test dataset...


100%|██████████| 2717/2717 [02:11<00:00, 20.63it/s]

Data preprocessing completed!





In [7]:
# Step 12: Save processed dataset as CSV (row index omitted from the files)
df_train.to_csv("train_enhanced.csv", index=False)
df_test.to_csv("test_enhanced.csv", index=False)


# Step 13: Preview enhanced dataset (first rows, original vs. augmented text)
print("Sample of augmented dataset:")
print((df_train[["sentence", "enhanced_sentence"]].head()).to_string(index=False))

# Status message: fixed the truncated first word ("rocess" -> "Process").
print("Process completed, enhanced dataset is saved and ready for use!")

Sample of augmented dataset:
                                                                                                                                      sentence                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   enhanced_sentence
                 The system as described above has its greatest application in an arrayed <e1>configuration</e1> of antenna <e2>elements</e2>.                                                                                                                          