In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import DebertaTokenizer, DebertaForSequenceClassification, TrainingArguments, Trainer, AutoTokenizer, AutoModelForSequenceClassification
from peft import get_peft_model, LoraConfig
from sklearn.metrics import f1_score
import numpy as np
import os
os.environ['CUDA_VISIBLE_DEVICES']='0,1'
# 自定义 Dataset 类
class EntityMatchingDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=128):
        self.sentence_1 = dataframe['sentence_1'].tolist()
        self.sentence_2 = dataframe['sentence_2'].tolist()
        self.labels = dataframe['label'].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        text1 = self.sentence_1[idx]
        text2 = self.sentence_2[idx]
        label = self.labels[idx]
        
        # 将两个句子连接在一起并进行 tokenization
        encoding = self.tokenizer.encode_plus(
            text1,
            text2,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# 读取 CSV 数据



In [2]:
import torch

class GradientHook:
    def __init__(self, model):
        self.gradients = {}
        self.hooks = []
        
        # 为每一层注册钩子
        for name, module in model.named_modules():
            if isinstance(module, torch.nn.Linear):  # 可以修改为其他层类型
                hook = module.register_backward_hook(self._save_gradient(name))
                self.hooks.append(hook)

    def _save_gradient(self, layer_name):
        def hook(module, grad_input, grad_output):
            self.gradients[layer_name] = grad_output[0].detach().cpu()  # 获取每层的梯度
        return hook

    def remove_hooks(self):
        for hook in self.hooks:
            hook.remove()

    def get_gradients(self):
        return self.gradients


In [3]:
from transformers import Trainer

class CustomTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.gradient_hook = None  # 用于存储梯度钩子

    def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_prefix="eval"):
        self.gradient_hook = GradientHook(self.model)  # 在评估时启用梯度钩子
        result = super().evaluate(eval_dataset, ignore_keys, metric_key_prefix)  # 执行评估
        
        gradients = self.gradient_hook.get_gradients()  # 获取梯度信息
        self.gradient_hook.remove_hooks()  # 评估完毕后移除钩子

        # 在返回结果中加入梯度信息
        result["gradients"] = gradients
        return result

In [4]:
train_df = pd.read_json('/home/yanmy/Transfer-ER/Unicorn-main/data/Abt-Buy-Fewshot/train.json')
valid_df = pd.read_json('/home/yanmy/Transfer-ER/Unicorn-main/data/Abt-Buy-Fewshot/valid.json')
test_df = pd.read_json('/home/yanmy/Transfer-ER/Unicorn-main/data/Abt-Buy-Fewshot/test.json')
train_df.columns = ['sentence_1', 'sentence_2', 'label']
valid_df.columns = ['sentence_1', 'sentence_2', 'label']
test_df.columns = ['sentence_1', 'sentence_2', 'label']
# 加载 DeBERTa-v3-base 的 tokenizer
# tokenizer = DebertaTokenizer.from_pretrained('/home/yanmy/model/deberta-v3-base')
tokenizer = AutoTokenizer.from_pretrained('/home/yanmy/model/deberta-v3-base', padding_side="right")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

# 定义数据集
train_dataset = EntityMatchingDataset(train_df, tokenizer)
valid_dataset = EntityMatchingDataset(valid_df, tokenizer)
test_dataset = EntityMatchingDataset(test_df, tokenizer)

# 加载 DeBERTa-v3-base 模型
model = AutoModelForSequenceClassification.from_pretrained('/home/yanmy/model/deberta-v3-base', num_labels=2)

# 配置 LoRA
lora_config = LoraConfig(
    r=8,  # LoRA rank
    lora_alpha=16,  # 通常 LoRA alpha 设置为 2 倍的 rank
    target_modules=["value_proj"],  # 微调分类层
    lora_dropout=0.1,
    bias="none"
)
model = get_peft_model(model, lora_config)

# 定义 f1_score 评估函数
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    f1 = f1_score(labels, predictions)
    return {"f1": f1}

# 定义训练参数
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=128,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    report_to="none"
)

# 使用 Trainer 进行训练
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=valid_dataset,
#     # compute_metrics=compute_metrics  # 设置 f1_score 作为评估指标
# )

# # 开始训练
# trainer.train()

# # 在测试集上进行评估
# metrics = trainer.evaluate(test_dataset)
# print(metrics)  # 输出包含 f1_score 的评估结果



trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics
)
trainer.train()
# 进行评估
metrics = trainer.evaluate(test_dataset)
print(metrics)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at /home/yanmy/model/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


[2024-11-13 19:47:20,947] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/yanmy/anaconda3/compiler_compat/ld: cannot find -laio
collect2: error: ld returned 1 exit status




  def forward(ctx, input, weight, bias=None):
  def backward(ctx, grad_output):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss
1,No log,No log
2,No log,No log
3,No log,No log


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


{'eval_runtime': 2.9041, 'eval_samples_per_second': 659.751, 'eval_steps_per_second': 2.755, 'epoch': 3.0, 'gradients': {}}


In [7]:
def predict(text1, text2):
    # 获取模型所在设备
    device = model.device
    
    # 将文本转换为张量
    inputs = tokenizer(text1, text2, return_tensors="pt", truncation=True, padding=True)
    
    # 将输入数据移到相同的设备上
    inputs = {key: value.to(device) for key, value in inputs.items()}
    
    # 进行推理
    outputs = model(**inputs)
    logits = outputs.logits
    
    # 获取预测类别
    predicted_class = logits.argmax().item()
    
    return predicted_class  # 1: 同一实体, 0: 不同实体

predict_all = []
from tqdm.notebook import tqdm
for sentence_1, sentence_2, label in tqdm(test_df.values):
    prediction = predict(sentence_1, sentence_2)
    predict_all.append(prediction)
label_all = test_df['label'].tolist()
from sklearn.metrics import f1_score
f1_score(y_true=label_all, y_pred=predict_all)

  0%|          | 0/1916 [00:00<?, ?it/s]

0.0