## 数据准备

In [36]:
from datasets import load_dataset

data = load_dataset("cardiffnlp/sentiment")
data["val"] = data["validation"]
del data["validation"]

In [37]:
data

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 45615
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 12284
    })
    val: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [7]:
import pandas as pd
data["test"].to_pandas().label.value_counts()

label
1    5937
0    3972
2    2375
Name: count, dtype: int64

In [38]:
# 字符数
max_char = data['train'].to_pandas()['text'].str.len().max()
print(f"The maximum number of characters is {max_char}")
# 词数
max_words = data['test'].to_pandas()['text'].str.split().str.len().max()
print(f"The maximum number of words is {max_words}")

The maximum number of characters is 200
The maximum number of words is 32


## llama tokenizer

In [39]:
from transformers import AutoTokenizer, DataCollatorWithPadding
llama_path = "./Meta-Llama-3.1-8B"
llama_tokenizer = AutoTokenizer.from_pretrained(llama_path, add_prefix_space=True)
llama_tokenizer.pad_token_id = llama_tokenizer.eos_token_id
llama_tokenizer.pad_token = llama_tokenizer.eos_token
def preprocessing_function(examples):
    examples['label'] = [int(i) for i in examples['label']]
    return llama_tokenizer(examples['text'], truncation=True, padding='max_length', max_length=64)

tokenized_data = data.map(preprocessing_function, batched=True, remove_columns= ["text"])
tokenized_data.set_format("torch")
# 创建 DataCollatorWithPadding 实例
data_collator = DataCollatorWithPadding(tokenizer=llama_tokenizer)


In [40]:
tokenized_data["train"]


Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 45615
})

## llama model

In [13]:
from transformers import AutoModelForSequenceClassification
import torch

pretrain_model = AutoModelForSequenceClassification.from_pretrained(llama_path, 
                                                                 num_labels=3,
                                                                device_map="auto",
                                                                offload_folder="offload",
                                                                trust_remote_code=True)
pretrain_model.config.pad_token_id = llama_tokenizer.pad_token_id
# llama_model.config.use_cache = False
# llama_model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at ./Meta-Llama-3.1-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [47]:
model_dtype = next(pretrain_model.parameters()).dtype
print(f"Model dtype: {model_dtype}")

Model dtype: torch.float32


## LoRA 配置

In [24]:
from peft import get_peft_model, LoraConfig, TaskType
lora_alpha = 8
lora_rank = 4


lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS, 
    r=lora_rank, 
    lora_alpha=lora_alpha, 
    lora_dropout=0.05, 
    bias="none",
    target_modules=[
       "q_proj",  "v_proj"
    ],
)

llama_model = get_peft_model(pretrain_model, lora_config)
llama_model.print_trainable_parameters()

trainable params: 6,959,104 || all params: 7,511,896,064 || trainable%: 0.0926


## 评估指标

In [35]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    # eval_pred 是模型返回的预测值和实际值元组
    predictions = np.argmax(logits, axis=-1)
    
    precision = precision_score(labels, predictions, average='macro')
    recall = recall_score(labels, predictions, average='macro')
    f1 = f1_score(labels, predictions, average='macro')
    accuracy = accuracy_score(labels, predictions)
    
    # 返回包含所有指标的字典
    return {"precision": precision, "recall": recall, "f1-score": f1, "accuracy": accuracy}


In [26]:
from transformers import TrainingArguments, Trainer

lr = 1e-4
batch_size = 8
num_epochs = 5

training_args = TrainingArguments(
    output_dir=f"./results/lr={lr}_lora_alpha={lora_alpha}_lora_r={lora_rank}",
    learning_rate=lr,
    lr_scheduler_type= "constant",
    warmup_ratio= 0.1,
    max_grad_norm= 0.3,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.001,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2, 
    load_best_model_at_end=True,
    report_to="wandb",
    fp16=True,
    gradient_checkpointing=True,
)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [27]:
trainer = Trainer(
    model=pretrain_model,
    args=training_args,
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data['val'],
    tokenizer=llama_tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
# 开始训练

trainer.train()

In [21]:
import random
import numpy as np
import torch
from transformers import set_seed

# 设置随机种子
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
set_seed(seed)


In [25]:
from transformers import TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType

batch_size = 32
num_epochs = 5
lr_pars = [2e-04,1e-05]
lora_ranks = [16,8]
lora_alphas = [32]

for lr in lr_pars:
    for lora_rank in lora_ranks:
        for lora_alpha in lora_alphas:
            
            lora_config = LoraConfig(
                task_type=TaskType.SEQ_CLS, 
                r=lora_rank, 
                lora_alpha=lora_alpha, 
                lora_dropout=0.05, 
                bias="none",
                target_modules=[
                "q_proj",  "v_proj"
                ],
            )

            llama_model = get_peft_model(pretrain_model, lora_config)
            llama_model.print_trainable_parameters()

            training_args = TrainingArguments(
                output_dir=f"./results/lr={lr}_lora_alpha={lora_alpha}_lora_r={lora_rank}",
                learning_rate=lr,
                lr_scheduler_type= "constant",
                warmup_ratio= 0.1,
                max_grad_norm= 0.3,
                per_device_train_batch_size=batch_size,
                per_device_eval_batch_size=batch_size,
                num_train_epochs=num_epochs,
                weight_decay=0.001,
                eval_strategy="epoch",
                save_strategy="epoch",
                metric_for_best_model="eval_f1-score",  # 使用准确率选择最佳模型
                greater_is_better=True,       # 指标越大越好
                save_total_limit=3, 
                load_best_model_at_end=True,
                # report_to="wandb",
                fp16=True,
                gradient_checkpointing=True,
            )

            trainer = Trainer(
                model=llama_model,
                args=training_args,
                train_dataset=tokenized_data['train'],
                eval_dataset=tokenized_data['val'],
                tokenizer=llama_tokenizer,
                compute_metrics=compute_metrics,
                data_collator=data_collator
            )

            # 开始训练
            print(f"------lr={lr}_lora_rank={lora_rank}_lora_alpha={lora_alpha}")
            trainer.train()     
            trainer.save_model(f"./results/lr={lr}_lora_alpha={lora_alpha}_lora_r={lora_rank}/best_model")
            



trainable params: 6,828,032 || all params: 7,511,764,992 || trainable%: 0.0909


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


------lr=0.0002_lora_rank=16_lora_alpha=32


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss,Precision,Recall,F1-score,Accuracy
1,0.6207,0.566585,0.728079,0.7414,0.730694,0.7395
2,0.5266,0.633718,0.793459,0.65073,0.670701,0.738
3,0.454,0.629658,0.738881,0.751413,0.736184,0.743
4,0.3571,0.63124,0.748523,0.75446,0.751101,0.763
5,0.2419,0.831401,0.747358,0.707947,0.722998,0.7435


We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


trainable params: 3,420,160 || all params: 7,508,357,120 || trainable%: 0.0456


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


------lr=0.0002_lora_rank=8_lora_alpha=32


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss,Precision,Recall,F1-score,Accuracy
1,0.6125,0.559743,0.745924,0.745554,0.742305,0.7495
2,0.5221,0.595347,0.797024,0.676292,0.702453,0.7505
3,0.4615,0.618875,0.75869,0.736227,0.739981,0.749
4,0.3644,0.622508,0.733482,0.747886,0.739592,0.7525
5,0.2679,0.840272,0.753086,0.701062,0.717854,0.745


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


trainable params: 6,828,032 || all params: 7,511,764,992 || trainable%: 0.0909


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


------lr=1e-05_lora_rank=16_lora_alpha=32


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss,Precision,Recall,F1-score,Accuracy
1,0.7206,0.599613,0.758631,0.70766,0.722486,0.737
2,0.5601,0.576174,0.740071,0.742756,0.737467,0.7465
3,0.5332,0.546066,0.764414,0.755699,0.758173,0.7655
4,0.5113,0.547754,0.756978,0.743688,0.748245,0.758
5,0.4839,0.553476,0.763044,0.749796,0.755497,0.766


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


trainable params: 3,420,160 || all params: 7,508,357,120 || trainable%: 0.0456


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


------lr=1e-05_lora_rank=8_lora_alpha=32


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss,Precision,Recall,F1-score,Accuracy
1,0.7172,0.600779,0.761534,0.704874,0.721192,0.737
2,0.5619,0.580852,0.738473,0.740779,0.735094,0.743
3,0.5349,0.544426,0.758663,0.752074,0.753757,0.7625
4,0.5117,0.544107,0.758849,0.747574,0.751711,0.7635
5,0.4855,0.556181,0.764472,0.749738,0.755728,0.767


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


## 加载llama

In [17]:
from peft import PeftModel

# 加载微调后的权重
lora_weights_path = "/root/emotion_classification/results/lr=0.0002_lora_alpha=32_lora_r=8/best_model"  # 这里填写你LoRA微调后的权重路径
model = PeftModel.from_pretrained(pretrain_model, lora_weights_path)

In [16]:
tokenized_data["test"]

Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 12284
})

In [41]:
from torch.utils.data import DataLoader
val_dataloader = DataLoader(
    tokenized_data["val"],
    batch_size=16,  # 根据硬件资源调整 batch_size
    shuffle=False,
    collate_fn=data_collator
)

In [45]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding
from torch.utils.data import DataLoader
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import torch
import numpy as np
from tqdm import tqdm
def evaluate(model, dataloader):
    model.eval()
    all_logits = []
    all_labels = []

    for batch in tqdm(dataloader):
        input_ids = batch['input_ids'].to(model.device)
        attention_mask = batch['attention_mask'].to(model.device)
        labels = batch['labels'].to(model.device)

        with torch.no_grad():
            with torch.autocast('cuda'):
                outputs = model(input_ids, attention_mask=attention_mask)
        
        logits = outputs.logits
        all_logits.append(logits.cpu().numpy())
        all_labels.append(labels.cpu().numpy())

    all_logits = np.concatenate(all_logits, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)
    
    return all_logits, all_labels

# 执行评估并计算指标
# logits, labels = evaluate(model, val_dataloader)
# metrics = compute_metrics((logits, labels))

# # 输出结果
# print("Evaluation Results:")
# for key, value in metrics.items():
#     print(f"{key}: {value:.4f}")

In [29]:
import os
path = []
def list_subdirectories(folder_path):
    # 使用 os.listdir() 列出文件夹中的所有文件和文件夹
    for item in os.listdir(folder_path):
        # 使用 os.path.isdir() 检查是否为文件夹
        if os.path.isdir(os.path.join(folder_path, item)):
            path.append(item)

# 示例：打印当前目录下的所有文件夹名
list_subdirectories('/root/emotion_classification/results')
path

['lr=1e-05_lora_alpha=32_lora_r=16',
 'lr=0.0002_lora_alpha=32_lora_r=16',
 'lr=1e-05_lora_alpha=32_lora_r=8',
 'lr=0.0002_lora_alpha=32_lora_r=8']

In [46]:
test_dataloader = DataLoader(
    tokenized_data["test"],
    batch_size=16,  # 根据硬件资源调整 batch_size
    shuffle=False,
    collate_fn=data_collator
)
from peft import PeftModel
path=["lr=0.0002_lora_alpha=32_lora_r=8"]
# 加载微调后的权重
for i in path:
    print(i)
    lora_weights_path = "/root/emotion_classification/results/"+i+"/best_model"  # 这里填写你LoRA微调后的权重路径
    model = PeftModel.from_pretrained(pretrain_model, lora_weights_path)

    logits, labels = evaluate(model, val_dataloader)
    metrics = compute_metrics((logits, labels))

    # 输出结果
    for key, value in metrics.items():
        print(f"{key}: {value:.4f}", end=", ")
    logits, labels = evaluate(model, test_dataloader)
    metrics = compute_metrics((logits, labels))

    # 输出结果
    for key, value in metrics.items():
        print(f"{key}: {value:.4f}", end=", ")

lr=0.0002_lora_alpha=32_lora_r=8


  0%|          | 0/125 [00:00<?, ?it/s]

100%|██████████| 125/125 [00:35<00:00,  3.56it/s]


precision: 0.7509, recall: 0.7480, f1-score: 0.7463, accuracy: 0.7560, 

100%|██████████| 768/768 [03:37<00:00,  3.53it/s]

precision: 0.7464, recall: 0.7357, f1-score: 0.7404, accuracy: 0.7441, 




In [32]:
test_dataloader = DataLoader(
    tokenized_data["test"],
    batch_size=16,  # 根据硬件资源调整 batch_size
    shuffle=False,
    collate_fn=data_collator
)
from peft import PeftModel

# 加载微调后的权重
for i in path:
    print(i)
    lora_weights_path = "/root/emotion_classification/results2/"+i+"/best_model"  # 这里填写你LoRA微调后的权重路径
    model = PeftModel.from_pretrained(pretrain_model, lora_weights_path)

    logits, labels = evaluate(model, val_dataloader)
    metrics = compute_metrics((logits, labels))

    # 输出结果
    for key, value in metrics.items():
        print(f"{key}: {value:.4f}", end=", ")
        
    logits, labels = evaluate(model, test_dataloader)
    metrics = compute_metrics((logits, labels))

    # 输出结果
    for key, value in metrics.items():
        print(f"{key}: {value:.4f}", end=", ")

lr=1e-05_lora_alpha=32_lora_r=16


100%|██████████| 125/125 [02:24<00:00,  1.16s/it]


precision: 0.7635, recall: 0.7553, f1-score: 0.7576, accuracy: 0.7650, 

100%|██████████| 768/768 [14:50<00:00,  1.16s/it]


precision: 0.7285, recall: 0.7277, f1-score: 0.7279, accuracy: 0.7293, lr=0.0002_lora_alpha=32_lora_r=16


100%|██████████| 125/125 [02:24<00:00,  1.16s/it]


precision: 0.7490, recall: 0.7548, f1-score: 0.7515, accuracy: 0.7635, 

100%|██████████| 768/768 [14:47<00:00,  1.16s/it]


precision: 0.7207, recall: 0.7205, f1-score: 0.7204, accuracy: 0.7222, lr=1e-05_lora_alpha=32_lora_r=8


100%|██████████| 125/125 [02:23<00:00,  1.15s/it]


precision: 0.7635, recall: 0.7494, f1-score: 0.7551, accuracy: 0.7665, 

100%|██████████| 768/768 [14:49<00:00,  1.16s/it]


precision: 0.7254, recall: 0.7317, f1-score: 0.7281, accuracy: 0.7290, lr=0.0002_lora_alpha=32_lora_r=8


100%|██████████| 125/125 [02:24<00:00,  1.16s/it]


precision: 0.7464, recall: 0.7456, f1-score: 0.7425, accuracy: 0.7495, 

100%|██████████| 768/768 [14:48<00:00,  1.16s/it]

precision: 0.7500, recall: 0.7404, f1-score: 0.7423, accuracy: 0.7443, 




## FGM 对抗训练

In [31]:
import torch

class FGM:
    def __init__(self, model):
        self.model = model
        self.backup = {}

    def attack(self, epsilon=1.0, emb_name='word_embeddings'):
        # 对抗训练，只在embedding层添加扰动
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                self.backup[name] = param.data.clone()
                norm = torch.norm(param.grad)
                if norm != 0:
                    r_at = epsilon * param.grad / norm
                    param.data.add_(r_at)

    def restore(self, emb_name='word_embeddings'):
        # 恢复embedding层的参数
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                assert name in self.backup
                param.data = self.backup[name]
        self.backup = {}


In [32]:
from transformers import Trainer

class CustomTrainer(Trainer):
    def __init__(self, *args, fgm=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.fgm = fgm

    def training_step(self, model, inputs):
        # 正常的前向传递和损失计算
        loss = super().training_step(model, inputs)
        
        if self.fgm is not None:
            # 使用FGM生成对抗样本
            self.fgm.attack()  # 在原始输入上加扰动
            loss_adv = super().training_step(model, inputs)  # 再次计算损失
            loss = (loss + loss_adv) / 2  # 将原始损失与对抗损失结合
            self.fgm.restore()  # 恢复模型参数
        
        return loss


In [33]:
from transformers import TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType

batch_size = 32
num_epochs = 5
lr_pars = [2e-04, 1e-05]
lora_ranks = [16, 8]
lora_alphas = [32]

for lr in lr_pars:
    for lora_rank in lora_ranks:
        for lora_alpha in lora_alphas:

            lora_config = LoraConfig(
                task_type=TaskType.SEQ_CLS, 
                r=lora_rank, 
                lora_alpha=lora_alpha, 
                lora_dropout=0.05, 
                bias="none",
                target_modules=[
                    "q_proj", "v_proj"
                ],
            )

            llama_model = get_peft_model(pretrain_model, lora_config)
            llama_model.print_trainable_parameters()

            # 初始化FGM
            fgm = FGM(llama_model)

            training_args = TrainingArguments(
                output_dir=f"./results/lr={lr}_lora_alpha={lora_alpha}_lora_r={lora_rank}",
                learning_rate=lr,
                lr_scheduler_type="constant",
                warmup_ratio=0.1,
                max_grad_norm=0.3,
                per_device_train_batch_size=batch_size,
                per_device_eval_batch_size=batch_size,
                num_train_epochs=num_epochs,
                weight_decay=0.001,
                eval_strategy="epoch",
                save_strategy="epoch",
                metric_for_best_model="eval_f1-score",
                greater_is_better=True,
                save_total_limit=3, 
                load_best_model_at_end=True,
                fp16=True,
                gradient_checkpointing=True,
            )

            # 使用自定义的Trainer进行训练
            trainer = CustomTrainer(
                model=llama_model,
                args=training_args,
                train_dataset=tokenized_data['train'],
                eval_dataset=tokenized_data['val'],
                tokenizer=llama_tokenizer,
                compute_metrics=compute_metrics,
                data_collator=data_collator,
                fgm=fgm  # 传入FGM实例
            )

            # 开始训练
            print(f"------lr={lr}_lora_rank={lora_rank}_lora_alpha={lora_alpha}")
            trainer.train()
            trainer.save_model(f"./results/lr={lr}_lora_alpha={lora_alpha}_lora_r={lora_rank}/best_model")

            logits, labels = evaluate(llama_model, test_dataloader)
            metrics = compute_metrics((logits, labels))

            # 输出结果
            print("Evaluation Results:")
            for key, value in metrics.items():
                print(f"{key}: {value:.4f}")


trainable params: 6,828,032 || all params: 7,511,764,992 || trainable%: 0.0909


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


------lr=0.0002_lora_rank=16_lora_alpha=32


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss,Precision,Recall,F1-score,Accuracy
1,0.6143,0.555525,0.747145,0.756406,0.748023,0.757
2,0.5265,0.65387,0.788831,0.645406,0.665006,0.7365
3,0.4571,0.615685,0.762686,0.728835,0.7381,0.749
4,0.3645,0.6381,0.743815,0.73689,0.739767,0.7575
5,0.2481,0.821069,0.765913,0.684183,0.70793,0.7405


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
100%|██████████| 768/768 [03:41<00:00,  3.47it/s]


Evaluation Results:
precision: 0.7459
recall: 0.7418
f1-score: 0.7418
accuracy: 0.7436
trainable params: 3,420,160 || all params: 7,508,357,120 || trainable%: 0.0456


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


------lr=0.0002_lora_rank=8_lora_alpha=32


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [37]:
! pip list | grep transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


transformers              4.43.3


In [35]:
import sys
print(sys.executable)

/home/vipuser/anaconda3/envs/ec/bin/python


## 给poster和reply预测情感

In [2]:
import pandas as pd
from datasets import Dataset
import numpy as np

reply_to_keep = ['id','conversation_id','referenced_tweets.replied_to.id','author_id','formatted_text',"theta", "accounts_followed"]
poster_to_keep = ['id','author_id','formatted_text','topic','context_annotations','event',"theta", "accounts_followed"]
dtypes = {'id':str,'conversation_id':str,'referenced_tweets.replied_to.id' : str,"theta": np.float64}

# 读取 CSV 文件
replies = pd.read_csv('all_replies.csv', usecols=lambda column: column in reply_to_keep, dtype=dtypes)
posters = pd.read_csv("all_posters.csv", usecols=lambda column: column in poster_to_keep)

In [3]:
posters_text = posters["formatted_text"]
replies_text = replies["formatted_text"]

In [48]:
from datasets import DatasetDict
posters_dict = {"text": posters_text.tolist()}
replies_dict = {"text": replies_text.tolist()}

data = DatasetDict({
    "posters" : Dataset.from_dict(posters_dict),
    "replies": Dataset.from_dict(replies_dict)
})

In [7]:
# 字符数
max_char = data['posters'].to_pandas()['text'].str.len().max()
print(f"The maximum number of characters is {max_char}")
# 词数
max_words = data['posters'].to_pandas()['text'].str.split().str.len().max()
print(f"The maximum number of words is {max_words}")

The maximum number of characters is 698
The maximum number of words is 119


In [8]:
# 字符数
max_char = data['replies'].to_pandas()['text'].str.len().max()
print(f"The maximum number of characters is {max_char}")
# 词数
max_words = data['replies'].to_pandas()['text'].str.split().str.len().max()
print(f"The maximum number of words is {max_words}")

The maximum number of characters is 1156
The maximum number of words is 111


In [11]:
from transformers import AutoTokenizer, DataCollatorWithPadding
llama_path = "./Meta-Llama-3.1-8B"
llama_tokenizer = AutoTokenizer.from_pretrained(llama_path, add_prefix_space=True)
llama_tokenizer.pad_token_id = llama_tokenizer.eos_token_id
llama_tokenizer.pad_token = llama_tokenizer.eos_token
def preprocessing_function(examples):
    return llama_tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)

tokenized_data = data.map(preprocessing_function, batched=True, remove_columns= ["text"])
tokenized_data.set_format("torch")
# 创建 DataCollatorWithPadding 实例
data_collator = DataCollatorWithPadding(tokenizer=llama_tokenizer)


Map:   0%|          | 0/2261178 [00:00<?, ? examples/s]

Map:   0%|          | 0/3937856 [00:00<?, ? examples/s]

In [47]:
tokenized_data

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 45615
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 12284
    })
    val: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
})

In [16]:
from peft import PeftModel

from transformers import AutoModelForSequenceClassification
import torch

pretrain_model = AutoModelForSequenceClassification.from_pretrained(llama_path, 
                                                                 num_labels=3,
                                                                device_map="auto",
                                                                offload_folder="offload",
                                                                trust_remote_code=True)
pretrain_model.config.pad_token_id = llama_tokenizer.pad_token_id
# llama_model.config.use_cache = False
# llama_model.config.pretraining_tp = 1

# 加载微调后的权重
lora_weights_path = "/root/emotion_classification/results/lr=0.0002_lora_alpha=32_lora_r=8/best_model"  # 这里填写你LoRA微调后的权重路径
model = PeftModel.from_pretrained(pretrain_model, lora_weights_path)

In [24]:
from torch.utils.data import DataLoader
import torch
import numpy as np
from tqdm import tqdm
posters_dataloader = DataLoader(
    tokenized_data["posters"],
    batch_size=16,  # 根据硬件资源调整 batch_size
    shuffle=False,
    collate_fn=data_collator
)
replies_dataloader = DataLoader(
    tokenized_data["replies"],
    batch_size=16,  # 根据硬件资源调整 batch_size
    shuffle=False,
    collate_fn=data_collator
)


In [52]:
from torch.amp import autocast
def evaluate(model, dataloader):
    model.eval()
    predictions = []
    

    for batch in tqdm(dataloader):
        input_ids = batch['input_ids'].to(model.device)
        attention_mask = batch['attention_mask'].to(model.device)
        

        with torch.no_grad():
            with autocast('cuda'):
                outputs = model(input_ids, attention_mask=attention_mask)
                logits = outputs.logits
        prediction = torch.argmax(logits, dim=-1).cpu().numpy()
        predictions.extend(prediction)
        print(predictions)
    return predictions


In [53]:
posters_output = evaluate(model, posters_dataloader)
posters_output

  0%|          | 0/141324 [00:00<?, ?it/s]

  0%|          | 1/141324 [00:00<22:38:46,  1.73it/s]

[0, 0, 1, 0, 2, 1, 0, 0, 2, 2, 1, 1, 1, 1, 2, 2]


  0%|          | 2/141324 [00:01<20:49:00,  1.89it/s]

[0, 0, 1, 0, 2, 1, 0, 0, 2, 2, 1, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1, 1, 2, 2, 0, 1, 2, 2, 1, 2, 2, 1]


  0%|          | 3/141324 [00:01<20:22:31,  1.93it/s]

[0, 0, 1, 0, 2, 1, 0, 0, 2, 2, 1, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1, 1, 2, 2, 0, 1, 2, 2, 1, 2, 2, 1, 2, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0]


  0%|          | 4/141324 [00:02<20:14:10,  1.94it/s]

[0, 0, 1, 0, 2, 1, 0, 0, 2, 2, 1, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1, 1, 2, 2, 0, 1, 2, 2, 1, 2, 2, 1, 2, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 2, 0, 1, 0, 2, 0, 0, 0, 1, 2, 0, 0, 1, 0, 1, 0]


  0%|          | 5/141324 [00:02<20:12:20,  1.94it/s]

[0, 0, 1, 0, 2, 1, 0, 0, 2, 2, 1, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1, 1, 2, 2, 0, 1, 2, 2, 1, 2, 2, 1, 2, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 2, 0, 1, 0, 2, 0, 0, 0, 1, 2, 0, 0, 1, 0, 1, 0, 0, 2, 0, 1, 2, 1, 2, 0, 0, 1, 2, 0, 1, 1, 1, 0]


  0%|          | 6/141324 [00:03<20:50:21,  1.88it/s]

[0, 0, 1, 0, 2, 1, 0, 0, 2, 2, 1, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1, 1, 2, 2, 0, 1, 2, 2, 1, 2, 2, 1, 2, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 2, 0, 1, 0, 2, 0, 0, 0, 1, 2, 0, 0, 1, 0, 1, 0, 0, 2, 0, 1, 2, 1, 2, 0, 0, 1, 2, 0, 1, 1, 1, 0, 1, 1, 1, 0, 2, 2, 1, 2, 1, 0, 1, 2, 1, 0, 0, 1]





KeyboardInterrupt: 