In [1]:
# ✅ **第一部分: 安装依赖**
!pip install transformers datasets torch scikit-learn matplotlib tqdm nltk
!pip install SPARQLWrapper

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.

In [2]:
import os
import re
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import wordnet
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, get_scheduler
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
from SPARQLWrapper import SPARQLWrapper, JSON

# 下载 WordNet 资源
nltk.download('wordnet')

# ✅ 检测是否有 GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"当前设备: {device}")


当前设备: cuda


[nltk_data] Downloading package wordnet to /root/nltk_data...


In [3]:
# ✅ 加载 SemEval-2010 Task 8 数据集
dataset = load_dataset("sem_eval_2010_task_8")
df_train = dataset["train"].to_pandas()
df_test = dataset["test"].to_pandas()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/8.23k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/673k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/231k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2717 [00:00<?, ? examples/s]

In [4]:
import re
import pandas as pd
import nltk
from nltk.corpus import wordnet
from SPARQLWrapper import SPARQLWrapper, JSON
import time

# 下载 WordNet 资源
nltk.download('wordnet')

# ✅ 1️⃣ 创建 Wikidata 查询缓存
wikidata_cache = {}

# ✅ 2️⃣ 提取 `e1` 和 `e2`
def extract_entities(sentence):
    """ 从句子中提取 <e1> 和 <e2> 之间的实体 """
    entity1 = re.search(r"<e1>(.*?)</e1>", sentence)
    entity2 = re.search(r"<e2>(.*?)</e2>", sentence)
    return (entity1.group(1) if entity1 else ""), (entity2.group(1) if entity2 else "")

# ✅ 3️⃣ 获取 WordNet 定义
def get_wordnet_definition(entity):
    """ 从 WordNet 获取实体的定义 """
    synsets = wordnet.synsets(entity)
    return synsets[0].definition() if synsets else "N/A"

# ✅ 4️⃣ 查询 Wikidata 关系（增加缓存）
def query_wikidata(entity):
    """ 查询 Wikidata 获取实体的关系信息（带缓存） """
    if entity in wikidata_cache:
        return wikidata_cache[entity]  # 直接从缓存返回，避免重复查询

    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")

    query = f"""
    SELECT ?relationLabel ?entityLabel WHERE {{
      ?entity rdfs:label "{entity}"@en.
      ?entity ?relation ?relatedEntity.
      ?relatedEntity rdfs:label ?entityLabel.
      FILTER (LANG(?entityLabel) = "en")
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }} LIMIT 5
    """
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)

    try:
        results = sparql.query().convert()
        relations = []
        for result in results["results"]["bindings"]:
            relation = result["relationLabel"]["value"]
            related_entity = result["entityLabel"]["value"]
            relations.append(f"{relation}: {related_entity}")

        result_str = "; ".join(relations) if relations else "No data"
        wikidata_cache[entity] = result_str  # 存入缓存
        return result_str
    except Exception as e:
        return "No data"

# ✅ 5️⃣ 结合 WordNet & Wikidata 进行增强（增加进度条）
def enhance_sentence_with_knowledge(sentence):
    """ 通过 WordNet + Wikidata 增强句子 """
    entity1, entity2 = extract_entities(sentence)

    # 获取 WordNet 定义
    entity1_info = get_wordnet_definition(entity1)
    entity2_info = get_wordnet_definition(entity2)

    # 查询 Wikidata 关系（使用缓存）
    entity1_kg = query_wikidata(entity1)
    entity2_kg = query_wikidata(entity2)

    # 组装增强后的句子
    enhanced_sentence = (
        f"{sentence} [SEP] {entity1}: {entity1_info} [KG: {entity1_kg}] "
        f"[SEP] {entity2}: {entity2_info} [KG: {entity2_kg}]"
    )
    return enhanced_sentence

# ✅ 6️⃣ 处理数据（增加进度条）
from tqdm import tqdm

# 加快处理速度
tqdm.pandas()

# 处理数据集，显示进度条
print("🚀 处理训练集...")
df_train["enhanced_sentence"] = df_train["sentence"].progress_apply(enhance_sentence_with_knowledge)

print("🚀 处理测试集...")
df_test["enhanced_sentence"] = df_test["sentence"].progress_apply(enhance_sentence_with_knowledge)

# ✅ 7️⃣ 生成关系标签到 ID 的映射
unique_relations = sorted(df_train["relation"].unique())
label2id = {label: idx for idx, label in enumerate(unique_relations)}
id2label = {idx: label for label, idx in label2id.items()}

df_train["label_id"] = df_train["relation"].map(label2id)
df_test["label_id"] = df_test["relation"].map(label2id)

print("✅ 数据预处理完成！")


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


🚀 处理训练集...


100%|██████████| 8000/8000 [35:52<00:00,  3.72it/s]


🚀 处理测试集...


100%|██████████| 2717/2717 [05:45<00:00,  7.87it/s]

✅ 数据预处理完成！





In [5]:
# ✅ 保存 CSV 格式
df_train.to_csv("train_enhanced.csv", index=False)
df_test.to_csv("test_enhanced.csv", index=False)


In [6]:
    num_labels = len(label2id)
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model.to(device)


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [7]:
def encode_texts(texts, tokenizer, max_length=256):
    return tokenizer(list(texts.values), padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")

train_encodings = encode_texts(df_train["enhanced_sentence"], tokenizer)
test_encodings = encode_texts(df_test["enhanced_sentence"], tokenizer)


In [8]:
# ✅ **第八部分: 创建 PyTorch Dataset**
class RelationDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            "input_ids": self.encodings["input_ids"][idx],
            "attention_mask": self.encodings["attention_mask"][idx],
            "labels": torch.tensor(self.labels[idx], dtype=torch.long),
        }

train_dataset = RelationDataset(train_encodings, df_train["label_id"].tolist())
test_dataset = RelationDataset(test_encodings, df_test["label_id"].tolist())

In [13]:
# ✅ **第九部分: 创建 DataLoader**
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=2, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=2, pin_memory=True)

In [15]:
# ✅ 第十部分: 定义对比学习损失
class ContrastiveLoss(nn.Module):
    def __init__(self, temperature=0.1):
        super(ContrastiveLoss, self).__init__()
        self.temperature = temperature

    def forward(self, z_i, z_j):
        """ 计算对比损失 (正样本相似, 负样本远离) """
        sim = torch.nn.functional.cosine_similarity(z_i, z_j, dim=-1) / self.temperature
        loss = -torch.log(torch.nn.functional.softmax(sim, dim=-1)).mean()
        return loss

contrastive_loss_fn = ContrastiveLoss()

In [23]:

# ✅ 第十一部分: 训练模型 (加入学习率调度 + 对比学习)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)

# 设定学习率调度器
num_training_steps = len(train_loader) * 10  # 5个epoch
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

criterion = torch.nn.CrossEntropyLoss()

for epoch in range(10):
    model.train()
    total_loss, contrastive_loss_total = 0, 0

    for batch in tqdm(train_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()

        # 获取 BERT 的输出
        outputs = model(input_ids, attention_mask=attention_mask, output_hidden_states=True)
        logits = outputs.logits
        hidden_states = outputs.hidden_states[-1][:, 0, :]  # 取 CLS 位置的向量

        # 计算交叉熵损失
        ce_loss = criterion(logits, labels)

        # 计算对比学习损失
        positive_idx = torch.arange(hidden_states.size(0))  # 选择当前batch的样本作为正样本
        negative_idx = torch.roll(positive_idx, shifts=1)  # 随机打乱作为负样本
        contrastive_loss = contrastive_loss_fn(hidden_states[positive_idx], hidden_states[negative_idx])

        # 总损失 = 交叉熵损失 + 0.2 * 对比学习损失
        loss = ce_loss + 0.2 * contrastive_loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.item()
        contrastive_loss_total += contrastive_loss.item()

    print(f"Epoch {epoch+1}: 交叉熵损失 = {total_loss / len(train_loader):.4f}, 对比损失 = {contrastive_loss_total / len(train_loader):.4f}")

print("✅ 训练完成！")

100%|██████████| 250/250 [01:12<00:00,  3.45it/s]


Epoch 1: 交叉熵损失 = 1.7767, 对比损失 = 3.4922


100%|██████████| 250/250 [01:12<00:00,  3.46it/s]


Epoch 2: 交叉熵损失 = 1.7667, 对比损失 = 3.4874


100%|██████████| 250/250 [01:12<00:00,  3.46it/s]


Epoch 3: 交叉熵损失 = 1.7547, 对比损失 = 3.4840


100%|██████████| 250/250 [01:12<00:00,  3.46it/s]


Epoch 4: 交叉熵损失 = 1.7483, 对比损失 = 3.4833


100%|██████████| 250/250 [01:12<00:00,  3.46it/s]

Epoch 5: 交叉熵损失 = 1.7469, 对比损失 = 3.4830
✅ 训练完成！





In [24]:
from sklearn.metrics import accuracy_score, classification_report

# ✅ **第十一部分: 评估模型**
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=1)

        all_preds.extend(predictions.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

accuracy = accuracy_score(all_labels, all_preds) * 100

print("真实标签 (all_labels) 示例:", all_labels[:10])
print("预测标签 (all_preds) 示例:", all_preds[:10])
print("关系类别 (label2id.keys()):", list(label2id.keys()))

print(f"测试集 Accuracy: {accuracy:.3f}%")
print("测试集性能:")
print(classification_report(all_labels, all_preds, target_names=[str(label) for label in label2id.keys()], zero_division=1))

print("✅ 代码执行完毕，训练和测试都已完成！")


真实标签 (all_labels) 示例: [14, 17, 11, 6, 1, 2, 16, 13, 2, 14]
预测标签 (all_preds) 示例: [14, 17, 11, 6, 1, 2, 18, 13, 2, 14]
关系类别 (label2id.keys()): [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]
测试集 Accuracy: 84.726%
测试集性能:
              precision    recall  f1-score   support

           0       0.93      0.96      0.94       134
           1       0.94      0.92      0.93       194
           2       0.84      0.86      0.85       162
           3       0.77      0.85      0.81       150
           4       0.87      0.92      0.90       153
           5       0.85      0.90      0.88        39
           6       0.92      0.95      0.93       291
           7       1.00      0.00      0.00         1
           8       0.88      0.91      0.90       211
           9       0.85      0.83      0.84        47
          10       0.56      0.82      0.67        22
          11       0.81      0.84      0.82       134
          12       0.76      0.78      0.77        32
      