1️⃣ 安装 & 导入库

2️⃣ 数据加载 & 预处理

3️⃣ 关系抽取模型（BERT）

4️⃣ 训练模型

5️⃣ 评估模型

6️⃣ 关系预测（推理）

7️⃣ 结果分析（可视化 Loss & 混淆矩阵）


Step 1: 安装 & 导入库

In [1]:
# 安装 transformers, torch 等依赖（只需执行一次）
!pip install transformers datasets torch scikit-learn matplotlib tqdm

Collecting datasets
  Downloading datasets-3.3.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.

In [2]:
# 导入相关库
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm


Step 2: data loading & reprocessing

In [6]:
import os

# 创建 data 目录（如果不存在）
os.makedirs("data", exist_ok=True)

from datasets import load_dataset

# 下载 SemEval 2010 Task 8 数据集
dataset = load_dataset("sem_eval_2010_task_8")

# 转换为 Pandas DataFrame
df_train = dataset["train"].to_pandas()
df_test = dataset["test"].to_pandas()

# 保存为 CSV
df_train.to_csv("data/semeval2010_task8_train.csv", index=False)
df_test.to_csv("data/semeval2010_task8_test.csv", index=False)

print("✅ 数据集已下载并保存！")



✅ 数据集已下载并保存！


In [8]:
# 加载数据
df = pd.read_csv("data/semeval2010_task8_train.csv")
df.head()  # 显示前几行

# 加载 BERT Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# 处理数据（Tokenization）
def preprocess_data(df):
    encodings = tokenizer(df["sentence"].tolist(), truncation=True, padding=True, max_length=128, return_tensors="pt")
    labels = df["relation"].tolist()
    return encodings, labels

train_encodings, train_labels = preprocess_data(df)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Step 3: 构建 BERT 关系抽取模型

In [9]:
# 自定义数据集
class REDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}, torch.tensor(self.labels[idx])

# 创建数据集
train_dataset = REDataset(train_encodings, train_labels)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# 构建 BERT 模型
num_classes = len(set(train_labels))
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_classes)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

训练模型

In [10]:
# 设置优化器和损失函数
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# 训练 BERT
EPOCHS = 3
train_losses = []

for epoch in range(EPOCHS):
    model.train()
    epoch_loss = 0
    for batch, labels in tqdm(train_loader):
        optimizer.zero_grad()
        input_ids, attention_mask = batch["input_ids"].to(device), batch["attention_mask"].to(device)
        labels = labels.to(device)

        outputs = model(input_ids, attention_mask).logits
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    train_losses.append(epoch_loss / len(train_loader))
    print(f"Epoch {epoch+1}, Loss: {epoch_loss:.4f}")

# 保存模型
torch.save(model.state_dict(), "bert_relation_model.pth")


  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}, torch.tensor(self.labels[idx])
  3%|▎         | 16/500 [07:45<3:54:52, 29.12s/it]


KeyboardInterrupt: 

Step 5 Evaluation Models

In [None]:
# 加载测试集
df_test = pd.read_csv("data/semeval2010_task8_test.csv")
test_encodings, test_labels = preprocess_data(df_test)
test_dataset = REDataset(test_encodings, test_labels)
test_loader = DataLoader(test_dataset, batch_size=16)

# 评估模型
model.eval()
preds, truths = [], []

with torch.no_grad():
    for batch, labels in test_loader:
        input_ids, attention_mask = batch["input_ids"].to(device), batch["attention_mask"].to(device)
        labels = labels.to(device)

        outputs = model(input_ids, attention_mask).logits
        preds.extend(torch.argmax(outputs, dim=1).cpu().numpy())
        truths.extend(labels.cpu().numpy())

# 计算评估指标
print(classification_report(truths, preds))


Step 6: Relation Predictiomn

In [None]:
def predict_relation(sentence):
    model.eval()
    encoding = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)
    input_ids, attention_mask = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

    with torch.no_grad():
        output = model(input_ids, attention_mask).logits
        prediction = torch.argmax(output, dim=1).item()

    return prediction

# 测试新句子
test_sentence = "The Eiffel Tower is located in Paris."
print("Predicted Relation:", predict_relation(test_sentence))


Step 7: Analysis

In [None]:
import os

# 创建 data 目录（如果不存在）
os.makedirs("data", exist_ok=True)

# 现在可以保存 CSV
df_train.to_csv("data/semeval2010_task8_train.csv", index=False)
df_test.to_csv("data/semeval2010_task8_test.csv", index=False)

print("✅ 数据集已成功保存！")


In [None]:
# 绘制混淆矩阵
conf_matrix = confusion_matrix(truths, preds)
plt.figure(figsize=(8,6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()
