# 导入环境

In [1]:
import torch
torch.cuda.empty_cache()

In [2]:
from datasets import Dataset
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer, GenerationConfig

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from sklearn.model_selection import train_test_split
df = pd.read_json('./dataset/merged_output.json')
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

train_ds = Dataset.from_pandas(train_df)
test_ds = Dataset.from_pandas(test_df)

# 5. 验证划分比例
print(f"训练集: {len(train_ds)}条 ({len(train_ds)/len(df):.1%})")
print(f"测试集: {len(test_ds)}条 ({len(test_ds)/len(df):.1%})")

训练集: 8003条 (80.0%)
测试集: 2001条 (20.0%)


# 处理数据集

In [None]:
tokenizer = AutoTokenizer.from_pretrained('./model/AI-ModelScope/Mistral-7B-Instruct-v0___2', use_fast=False, trust_remote_code=True)
tokenizer.padding_side = "left"  # Flash Attention 必须左对齐
tokenizer.pad_token = tokenizer.eos_token

In [5]:
tokenizer.pad_token, tokenizer.pad_token_id, tokenizer.eos_token_id

('</s>', 2, 2)

In [None]:
def process_func(example):
    MAX_LENGTH = 1024    
    input_ids, attention_mask, labels = [], [], []
    instruction = tokenizer(f"""<s>[INST]
                            你是一位老年服务机构的文书编辑，擅长将老人的口头叙述准确、清晰地转化为日常书面风格的中文文本，并判断语句中存在的错误类型。
                            句子中可能存在的错误类型：1. 句子成分缺失。2. 句子结构混乱。3. 句子成分错误。4. 句子成分冗余。
                            原文：{example['spoken_text']}
                            上下文(仅协助理解，不翻译): {example['context']}
                            仅输出原文那一句话的中文转换结果和错误类型序号，不要输出思考过程，不要输出解释。输出格式：
                            翻译结果：
                            错误类型：
                            [/INST]</s>\n""", add_special_tokens=False)  # add_special_tokens 不在开头加 special_tokens
    # 构建response
    response = tokenizer(
        f"翻译结果：{example['written_text']}\n错误类型：{','.join(map(str, example['error_type']))}",
        add_special_tokens=False
    )
    input_ids = instruction["input_ids"] + response["input_ids"] + [tokenizer.pad_token_id]
    attention_mask = instruction["attention_mask"] + response["attention_mask"] + [1]  # 因为eos token咱们也是要关注的所以 补充为1
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [tokenizer.pad_token_id]  
    if len(input_ids) > MAX_LENGTH: 
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

In [7]:
tokenized_train = train_ds.map(process_func, remove_columns=train_ds.column_names)
tokenized_train

Map: 100%|██████████| 8003/8003 [00:11<00:00, 707.15 examples/s]


Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 8003
})

In [8]:
print(tokenizer.decode(tokenized_train[0]['input_ids']))

<s> [INST]
                            你是一位老年服务机构的文书编辑，擅长将老人的口头叙述准确、清晰地转化为日常书面风格的中文文本，并判断语句中存在的错误类型。
                            句子中可能存在的错误类型：1. 句子成分缺失。2. 句子结构混乱。3. 句子成分错误。4. 句子成分冗余。
                            原文：去一回也算，守上几天也算，任家庄街西社区也算抗美援朝老兵。
                            上下文(仅协助理解，不翻译): 王家庄街西社区的士兵没有去朝鲜。庄街西社区的部分居民并未参战，即便参战也未曾经历实战。主街西社区的居民承担了守备任务，但街西社区在五六年前已经停战了。任家庄街西社区已经停战，守备部队也已撤回。去一回也算，守上几天也算，任家庄街西社区也算抗美援朝老兵。任家庄街西社区那也算了啊，任家庄街西社你去了吗？任家庄街西社对对对大叔，任家庄街西那您四年以后当兵。4年以后就集体转业吗？就集体转业，送这的，就直接转到大西北来了啊。直接送到这里，哦送了多少人啊？你们一起送了。
                            仅输出原文那一句话的中文转换结果和错误类型序号，不要输出思考过程，不要输出解释。输出格式：
                            翻译结果：
                            错误类型：
                            [/INST]</s> 
 翻译结果：无论是曾赴朝鲜参战，还是仅参与过短期守备任务，任家庄街西社区的居民都可被视为抗美援朝老兵。
错误类型：1,2,3</s>


In [9]:
tokenizer.decode(list(filter(lambda x: x != -100, tokenized_train[1]["labels"])))

' 翻译结果：您坚持包粽子50年，真是不容易啊。\n错误类型：4</s>'

# 创建模型

In [None]:
import torch

model = AutoModelForCausalLM.from_pretrained('./model/AI-ModelScope/Mistral-7B-Instruct-v0___2', 
                                             device_map="auto",
                                             torch_dtype=torch.bfloat16,
                                             #attn_implementation="flash_attention_2",
                                             use_cache=False,
                                             trust_remote_code=True)
model

Loading checkpoint shards: 100%|██████████| 3/3 [00:01<00:00,  1.86it/s]


MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): MistralRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): MistralRMSNorm((4096,), eps=1e-0

In [11]:
model.enable_input_require_grads() # 开启梯度检查点时，要执行该方法

In [12]:
model.dtype

torch.bfloat16

In [13]:
model.config._attn_implementation

'sdpa'

# lora 

In [17]:
from peft import LoraConfig, TaskType, get_peft_model

config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, 
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    inference_mode=False, # 训练模式
    r=8, # Lora 秩
    lora_alpha=32, # Lora alaph，具体作用参见 Lora 原理
    lora_dropout=0.1# Dropout 比例
)
config

LoraConfig(task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, inference_mode=False, r=8, target_modules={'q_proj', 'down_proj', 'gate_proj', 'o_proj', 'k_proj', 'up_proj', 'v_proj'}, exclude_modules=None, lora_alpha=32, lora_dropout=0.1, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', trainable_token_indices=None, loftq_config={}, eva_config=None, corda_config=None, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False)

In [18]:
model = get_peft_model(model, config)
config



LoraConfig(task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, inference_mode=False, r=8, target_modules={'q_proj', 'down_proj', 'gate_proj', 'o_proj', 'k_proj', 'up_proj', 'v_proj'}, exclude_modules=None, lora_alpha=32, lora_dropout=0.1, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', trainable_token_indices=None, loftq_config={}, eva_config=None, corda_config=None, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False)

In [19]:
model.print_trainable_parameters()

trainable params: 20,971,520 || all params: 7,262,703,616 || trainable%: 0.2888


# 配置训练参数

In [None]:
args = TrainingArguments(
    output_dir="./output/mistral",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    logging_steps=10,
    num_train_epochs=3,
    save_steps=100,
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    save_on_each_node=True,
    save_strategy = "steps",
    gradient_checkpointing=True
)

In [None]:
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    padding="longest",
    pad_to_multiple_of=8,  
    return_tensors="pt"     
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    #eval_dataset=tokenized_val,
    data_collator=data_collator,
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
#torch.cuda.empty_cache()

#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#model.to(device)  # 将模型移动到 GPU 上
model.device

device(type='cuda', index=0)

In [23]:
trainer.train()

Step,Training Loss


KeyboardInterrupt: 

# 保存 LoRA 和 tokenizer 结果


In [21]:
peft_model_id="./mistral_lora/train3"
trainer.model.save_pretrained(peft_model_id)
tokenizer.save_pretrained(peft_model_id)

('./mistral_lora/train3/tokenizer_config.json',
 './mistral_lora/train3/special_tokens_map.json',
 './mistral_lora/train3/tokenizer.model',
 './mistral_lora/train3/added_tokens.json')

In [22]:
tokenized_test = test_ds.map(process_func, remove_columns=test_ds.column_names)

import gc
gc.collect()
torch.cuda.empty_cache()

model.config.use_cache = False
trainer.args.per_device_eval_batch_size = 1
#trainer.evaluate(tokenized_test.select(range(5)))  # 先试 5 条

trainer.evaluate(tokenized_test)

Map: 100%|██████████| 2001/2001 [00:03<00:00, 665.05 examples/s]


{'eval_loss': 0.6241292357444763,
 'eval_runtime': 215.9996,
 'eval_samples_per_second': 9.264,
 'eval_steps_per_second': 9.264,
 'epoch': 2.995502248875562}

# 加载 lora 权重推理

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from peft import PeftModel

mode_path = './model/AI-ModelScope/Mistral-7B-Instruct-v0___2'
lora_path = './mistral_lora/train3'

# 加载tokenizer
tokenizer = AutoTokenizer.from_pretrained(mode_path, trust_remote_code=True)
#tokenizer = AutoTokenizer.from_pretrained(lora_path, trust_remote_code=True)
tokenizer.padding_side = "left"
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# 加载模型
model = AutoModelForCausalLM.from_pretrained(mode_path, device_map="auto",torch_dtype=torch.bfloat16, trust_remote_code=True).eval()

# 加载lora权重
model = PeftModel.from_pretrained(model, model_id=lora_path)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.29s/it]


In [2]:
torch.cuda.empty_cache()

# 假设你已经指定了 device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#model.to(device)  # 将模型移动到 GPU 上
print(model.device)

cuda:0


In [3]:
model.device 

device(type='cuda', index=0)

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd
from datasets import Dataset

df = pd.read_json('./dataset/test_dataset.json')

test_ds = Dataset.from_pandas(df)
test_ds

Dataset({
    features: ['id', 'file_id', 'spoken_text', 'context', 'written_text', 'error_type', '__index_level_0__'],
    num_rows: 2001
})

In [None]:
import torch
import random
import json
import re
from tqdm import tqdm

# ✅ 解析预测结果中的“翻译结果”和“错误类型”
def extract_translation_and_error_type(pred_text):
    trans_match = re.search(r"翻译结果：\s*(.*?)\n", pred_text, re.DOTALL)
    translation = trans_match.group(1).strip() if trans_match else ""

    error_match = re.search(r"错误类型：\s*(.+)", pred_text)
    if error_match:
        error_str = error_match.group(1).strip()
        # 只提取1-4的数字，并使用集合去重
        error_type = list({int(x) for x in re.findall(r"\d+", error_str) if 1 <= int(x) <= 4})
    else:
        error_type = []

    return translation, error_type

tokenizer.padding_side = 'left'
test_samples =list(test_ds)

batch_size = 8
all_results = []

for i in tqdm(range(0, len(test_samples), batch_size), desc="Processing samples"):
    batch = test_samples[i:i+batch_size]

    # ✅ 构造输入
    prompts = [
        f"""<s>[INST]
            你是一位老年服务机构的文书编辑，擅长将老人的口头叙述准确、清晰地转化为日常书面风格的中文文本，并判断语句中存在的错误类型。
            句子中可能存在的错误类型：1. 句子成分缺失。2. 句子结构混乱。3. 句子成分错误。4. 句子成分冗余。
            原文：{example['spoken_text']}
            上下文(仅协助理解，不翻译): {example['context']}
            仅输出原文那一句话的中文翻译结果和错误类型序号，不要输出思考过程，不要输出解释。请在你预测的翻译结果前写“翻译结果：”，错误类型前写“错误类型：”。
            严格按照输出格式输出：
            翻译结果：
            错误类型：
            [/INST]</s>""" for example in batch
    ]

    inputs = tokenizer(
        prompts,
        return_tensors="pt",
        padding="longest",
        truncation=True,
    ).to(model.device)

    def safe_decode(sequences, tokenizer):
        vocab_size = tokenizer.vocab_size  # ChatGLMTokenizer 使用这个属性获取词表大小
        cleaned = []
        for seq in sequences:
            if isinstance(seq, torch.Tensor):
                seq = seq.tolist()
            # 过滤掉不在词表范围内的 token
            seq = [token for token in seq if 0 <= token < vocab_size]
            cleaned.append(seq)
        return tokenizer.batch_decode(cleaned, skip_special_tokens=True)

    # ✅ 推理
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            num_beams=1,
            do_sample=False,
            temperature=0.2,
            top_p=0.95,
        )

    outputs = outputs[:, inputs['input_ids'].shape[1]:]
    #preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    preds = safe_decode(outputs, tokenizer)

    # ✅ 处理每条预测结果
    for pred_text, example in zip(preds, batch):
        translation, predicted_error_type = extract_translation_and_error_type(pred_text)

        # 转换原始标签为数组
        true_error = example.get("error_type", [])
        if isinstance(true_error, int):
            true_error = [true_error]
        elif isinstance(true_error, str):
            true_error = [int(x) for x in re.findall(r"\d+", true_error)]

        result = {
            "spoken_text": example["spoken_text"],
            "context": example["context"],
            "reference": example["written_text"],
            "ref_error_type": true_error,
            "prediction": translation,
            "pred_error_type": predicted_error_type
        }
        all_results.append(result)

# ✅ 保存结果到 JSON 文件
output_file = "./result/lora/lora_predictions_mistral.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(all_results, f, ensure_ascii=False, indent=2)

print(f"✅ 预测结果已保存到 {output_file}")

Processing samples:   0%|          | 0/251 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processing samples:   0%|          | 1/251 [00:05<24:35,  5.90s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Processing samples:   1%|          | 2/251 [00:11<23:56,  5.77s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Processing samples:   1%|          | 3/251 [00:17<24:03,  5.82s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Processing samples:   2%|▏         | 4/251 [00:24<25:21,  6.16s/it]Setting `

✅ 预测结果已保存到 model_predictions/lora_predictions_mistral.json





In [None]:
import json
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
from tqdm import tqdm
import numpy as np

# 加载模型和 tokenizer
tokenizer = AutoTokenizer.from_pretrained("./bleurt-base-128", local_files_only=True)
model = AutoModelForSequenceClassification.from_pretrained("./bleurt-base-128", local_files_only=True)
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 加载数据
with open("./result/lora/lora_predictions_mistral.json", "r", encoding="utf-8") as f:
    data = json.load(f)

import importlib
import evaluate
importlib.reload(evaluate)  # 强制重新加载  
from evaluate import calculate_all_metrics

# 存储所有指标
all_scores = []

for item in tqdm(data):
    reference = item["reference"]
    generated = item["prediction"]
    if not reference or not generated:
        continue  # 跳过空文本样本
    ref_error_types = item.get("ref_error_type", [])
    pred_error_types = item.get("pred_error_type", [])

    metrics = calculate_all_metrics(reference, generated, tokenizer, model, ref_error_types, pred_error_types, device)
    all_scores.append(metrics)

# 计算每个指标的平均值
average_scores = {}
for key in all_scores[0].keys():
    average_scores[key] = np.mean([score[key] for score in all_scores])

# 打印平均结果
print("各项指标的平均值：")
for key, value in average_scores.items():
    print(f"{key}: {value:.4f}")

  0%|          | 0/2001 [00:00<?, ?it/s]Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.392 seconds.
Prefix dict has been built successfully.
  1%|          | 13/2001 [00:00<01:02, 31.70it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (137 > 128). Running this sequence through the model will result in indexing errors
100%|██████████| 2001/2001 [00:16<00:00, 122.38it/s]

各项指标的平均值：
BLEU-1: 0.7572
BLEU-2: 0.5709
BLEU-3: 0.4541
BLEU-4: 0.3759
ROUGE-1: 0.6977
ROUGE-2: 0.4605
ROUGE-L: 0.6481
BLEURT: 0.4604
Joint Accuracy: 0.4997
Acc-1: 0.9418



