In [2]:
# 安装 transformers, torch 等依赖（只需执行一次）
!pip install transformers datasets torch scikit-learn matplotlib tqdm

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.

In [3]:
# ✅ **第二部分: 导入库**
import os
import re
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [4]:
# ✅ **第三部分: 确保 GPU 可用**
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"当前设备: {device}")
!nvidia-smi

当前设备: cuda
Mon Feb 24 12:04:57 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   42C    P8              9W /   70W |       2MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                     

In [5]:
# ✅ **第四部分: 加载数据集**
os.makedirs("data", exist_ok=True)
dataset = load_dataset("sem_eval_2010_task_8", download_mode="force_redownload")
df_train = dataset["train"].to_pandas()
df_test = dataset["test"].to_pandas()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/8.23k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/673k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/231k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2717 [00:00<?, ? examples/s]

In [6]:
print(df_train.head())  # 查看前5行
print(df_train.columns)  # 查看列名

                                            sentence  relation
0  The system as described above has its greatest...         3
1  The <e1>child</e1> was carefully wrapped and b...        18
2  The <e1>author</e1> of a keygen uses a <e2>dis...        11
3  A misty <e1>ridge</e1> uprises from the <e2>su...        18
4  The <e1>student</e1> <e2>association</e2> is t...        12
Index(['sentence', 'relation'], dtype='object')


In [7]:
# ✅ **第五部分: 数据预处理**
def clean_text(text):
    text = re.sub(r"<e1>(.*?)</e1>", "@entity1@", text)
    text = re.sub(r"<e2>(.*?)</e2>", "@entity2@", text)
    return text

df_train["clean_sentence"] = df_train["sentence"].apply(clean_text)
df_test["clean_sentence"] = df_test["sentence"].apply(clean_text)

# 查看清理后的数据
print(df_train[["sentence", "clean_sentence"]].head())

                                            sentence  \
0  The system as described above has its greatest...   
1  The <e1>child</e1> was carefully wrapped and b...   
2  The <e1>author</e1> of a keygen uses a <e2>dis...   
3  A misty <e1>ridge</e1> uprises from the <e2>su...   
4  The <e1>student</e1> <e2>association</e2> is t...   

                                      clean_sentence  
0  The system as described above has its greatest...  
1  The @entity1@ was carefully wrapped and bound ...  
2  The @entity1@ of a keygen uses a @entity2@ to ...  
3      A misty @entity1@ uprises from the @entity2@.  
4  The @entity1@ @entity2@ is the voice of the un...  


In [8]:
import re

def clean_text(text):
    """ 将 <e1> 和 <e2> 标记替换为 @entity1@ 和 @entity2@ """
    text = re.sub(r"<e1>(.*?)</e1>", "@entity1@", text)
    text = re.sub(r"<e2>(.*?)</e2>", "@entity2@", text)
    return text

# 应用到数据集
df_train["clean_sentence"] = df_train["sentence"].apply(clean_text)
df_test["clean_sentence"] = df_test["sentence"].apply(clean_text)

# 查看清理后的数据
print(df_train[["sentence", "clean_sentence"]].head())


                                            sentence  \
0  The system as described above has its greatest...   
1  The <e1>child</e1> was carefully wrapped and b...   
2  The <e1>author</e1> of a keygen uses a <e2>dis...   
3  A misty <e1>ridge</e1> uprises from the <e2>su...   
4  The <e1>student</e1> <e2>association</e2> is t...   

                                      clean_sentence  
0  The system as described above has its greatest...  
1  The @entity1@ was carefully wrapped and bound ...  
2  The @entity1@ of a keygen uses a @entity2@ to ...  
3      A misty @entity1@ uprises from the @entity2@.  
4  The @entity1@ @entity2@ is the voice of the un...  


In [9]:
# 生成 关系标签到 ID 的映射
unique_relations = sorted(df_train["relation"].unique())
label2id = {label: idx for idx, label in enumerate(unique_relations)}
id2label = {idx: label for label, idx in label2id.items()}

# 映射标签
df_train["label_id"] = df_train["relation"].map(label2id)
df_test["label_id"] = df_test["relation"].map(label2id)

# 显示映射后的标签
print(df_train[["relation", "label_id"]].head())


   relation  label_id
0         3         3
1        18        18
2        11        11
3        18        18
4        12        12


In [10]:
# ✅ **第六部分: 加载 BERT 模型**
num_labels = len(label2id)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model.to(device)
print(f"✅ 模型已加载到: {next(model.parameters()).device}")

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

✅ 模型已加载到: cuda:0


In [11]:
# from transformers import BertForSequenceClassification, BertTokenizer

# # 设定关系分类的类别数（SemEval 2010 Task 8 有 19 类）
# num_labels = 19

# # 加载 DistilBERT 模型和分词器
# model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(label2id))
# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# # 移动模型到 GPU
# model.to(device)

# # 确保模型已成功加载到 GPU
# print(f"模型所在设备: {next(model.parameters()).device}")


In [12]:
# ✅ **第七部分: Tokenization**
def encode_texts(texts, tokenizer, max_length=128):
    return tokenizer(list(texts.values), padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")

train_encodings = encode_texts(df_train["clean_sentence"], tokenizer)
test_encodings = encode_texts(df_test["clean_sentence"], tokenizer)


In [13]:
# ✅ **第八部分: 创建 PyTorch Dataset**
class RelationDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            "input_ids": self.encodings["input_ids"][idx],
            "attention_mask": self.encodings["attention_mask"][idx],
            "labels": torch.tensor(self.labels[idx], dtype=torch.long),
        }

train_dataset = RelationDataset(train_encodings, df_train["label_id"].tolist())
test_dataset = RelationDataset(test_encodings, df_test["label_id"].tolist())


In [14]:
# ✅ **第九部分: 创建 DataLoader**
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=2, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=2, pin_memory=True)

In [15]:
from transformers import get_scheduler

# ✅ 设置 AdamW 优化器
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)

# ✅ 计算总的训练步数
num_training_steps = len(train_loader) * 10  # 假设训练 5 轮
lr_scheduler = get_scheduler(
    name="cosine",  # 余弦退火
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

# ✅ **第十部分: 训练模型**
criterion = torch.nn.CrossEntropyLoss()

for epoch in range(10):  # 训练 5 轮
    model.train()
    total_loss = 0

    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

        loss = outputs.loss
        loss.backward()
        optimizer.step()  # ✅ 只执行一次优化器更新
        lr_scheduler.step()  # ✅ 让学习率逐步下降

        total_loss += loss.item()

    print(f"Epoch {epoch+1}: 平均损失 = {total_loss / len(train_loader):.4f}")

print("✅ 训练完成！")


Epoch 1: 平均损失 = 2.1472
Epoch 2: 平均损失 = 1.0757
Epoch 3: 平均损失 = 0.7100
Epoch 4: 平均损失 = 0.4741
Epoch 5: 平均损失 = 0.3199
Epoch 6: 平均损失 = 0.2116
Epoch 7: 平均损失 = 0.1559
Epoch 8: 平均损失 = 0.1227
Epoch 9: 平均损失 = 0.1074
Epoch 10: 平均损失 = 0.1031
✅ 训练完成！


In [16]:
# from transformers import get_scheduler

# # ✅ 设置 AdamW 优化器
# optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)

# # ✅ 计算总的训练步数
# num_training_steps = len(train_loader) * 5  # 假设训练 10 轮
# lr_scheduler = get_scheduler(
#     name="cosine",  # 余弦退火
#     optimizer=optimizer,
#     num_warmup_steps=0,
#     num_training_steps=num_training_steps
# )


# # @title Default title text
# # ✅ **第十部分: 训练模型**
# optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)
# criterion = torch.nn.CrossEntropyLoss()

# for epoch in range(5):
#     model.train()
#     total_loss = 0

#     for batch in train_loader:
#         input_ids = batch["input_ids"].to(device)
#         attention_mask = batch["attention_mask"].to(device)
#         labels = batch["labels"].to(device)

#         optimizer.zero_grad()
#         outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

#         loss = outputs.loss
#         loss.backward()
#         optimizer.step()

#         optimizer.step()
#         lr_scheduler.step()  # 调整学习率


#         total_loss += loss.item()

#     print(f"Epoch {epoch+1}: 平均损失 = {total_loss / len(train_loader):.4f}")

# print("✅ 训练完成！")

In [17]:
# ✅ **第十一部分: 评估模型**
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=1)

        all_preds.extend(predictions.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

print("真实标签 (all_labels) 示例:", all_labels[:10])
print("预测标签 (all_preds) 示例:", all_preds[:10])
print("关系类别 (label2id.keys()):", list(label2id.keys()))

print("测试集性能:")
print(classification_report(all_labels, all_preds, target_names=[str(label) for label in label2id.keys()], zero_division=1))

print("✅ 代码执行完毕，训练和测试都已完成！")


真实标签 (all_labels) 示例: [14, 17, 11, 6, 1, 2, 16, 13, 2, 14]
预测标签 (all_preds) 示例: [14, 17, 11, 6, 1, 2, 18, 13, 2, 14]
关系类别 (label2id.keys()): [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]
测试集性能:
              precision    recall  f1-score   support

           0       0.89      0.92      0.90       134
           1       0.89      0.88      0.89       194
           2       0.79      0.81      0.80       162
           3       0.76      0.75      0.76       150
           4       0.83      0.90      0.86       153
           5       0.74      0.82      0.78        39
           6       0.90      0.91      0.90       291
           7       1.00      0.00      0.00         1
           8       0.84      0.82      0.83       211
           9       0.78      0.68      0.73        47
          10       0.35      0.50      0.42        22
          11       0.74      0.78      0.76       134
          12       0.49      0.56      0.52        32
          13       0.79     

In [18]:
# from sklearn.metrics import classification_report

# # 评估模型
# model.eval()
# all_preds, all_labels = [], []

# with torch.no_grad():
#     for batch in test_loader:
#         input_ids = batch["input_ids"].to("cuda")
#         attention_mask = batch["attention_mask"].to("cuda")
#         labels = batch["labels"].to("cuda")

#         outputs = model(input_ids, attention_mask=attention_mask)
#         predictions = torch.argmax(outputs.logits, dim=1)

#         all_preds.extend(predictions.cpu().numpy())
#         all_labels.extend(labels.cpu().numpy())

# # **检查数据格式**
# print("真实标签 (all_labels) 示例:", all_labels[:10])
# print("预测标签 (all_preds) 示例:", all_preds[:10])
# print("关系类别 (label2id.keys()):", list(label2id.keys()))

# # **修正 target_names 并防止 Precision 计算错误**
# print("测试集性能:")
# print(classification_report(all_labels, all_preds, target_names=[str(label) for label in label2id.keys()], zero_division=1))

# print("✅ 代码执行完毕，训练和测试都已完成！")


In [19]:
print("真实标签 (all_labels) 示例:", all_labels[:10])
print("预测标签 (all_preds) 示例:", all_preds[:10])
print("关系类别 (label2id.keys()):", list(label2id.keys()))


真实标签 (all_labels) 示例: [14, 17, 11, 6, 1, 2, 16, 13, 2, 14]
预测标签 (all_preds) 示例: [14, 17, 11, 6, 1, 2, 18, 13, 2, 14]
关系类别 (label2id.keys()): [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]


In [20]:
print(df_train.sample(5))  # 随机查看 5 行
print(df_train.isnull().sum())  # 查看是否有 NaN 数据
print(df_train["sentence"].apply(len).describe())  # 检查句子长度分布


                                               sentence  relation  \
4190  In recent years, the <e1>issue</e1> of experim...        18   
1853  Thankfully the <e1>plump</e1> of <e2>wildfowls...        13   
7856  In addition to his reputation as a <e1>film</e...        16   
5064  The weight of the <e1>groceries</e1> in a shop...         4   
5672  Stack gummy savers with a little frosting in b...        18   

                                         clean_sentence  label_id  
4190  In recent years, the @entity1@ of experimentat...        18  
1853  Thankfully the @entity1@ of @entity2@ brought ...        13  
7856  In addition to his reputation as a @entity1@ @...        16  
5064  The weight of the @entity1@ in a shopping @ent...         4  
5672  Stack gummy savers with a little frosting in b...        18  
sentence          0
relation          0
clean_sentence    0
label_id          0
dtype: int64
count    8000.000000
mean      119.793500
std        44.963614
min        35.000000
