# 导入环境

In [1]:
import torch
torch.cuda.empty_cache()

In [2]:
from datasets import Dataset
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer, GenerationConfig

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from sklearn.model_selection import train_test_split

df = pd.read_json('./dataset/merged_output.json')
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

train_ds = Dataset.from_pandas(train_df)
test_ds = Dataset.from_pandas(test_df)


print(f"训练集: {len(train_ds)}条 ({len(train_ds)/len(df):.1%})")
print(f"测试集: {len(test_ds)}条 ({len(test_ds)/len(df):.1%})")

训练集: 8003条 (80.0%)
测试集: 2001条 (20.0%)


# 处理数据集

In [None]:
tokenizer = AutoTokenizer.from_pretrained('./model/ZhipuAI/chatglm3-6b', use_fast=False, trust_remote_code=True)
tokenizer.padding_side = "left"  # Flash Attention 必须左对齐
tokenizer.pad_token = tokenizer.eos_token

In [5]:
tokenizer.pad_token, tokenizer.pad_token_id, tokenizer.eos_token_id

('<unk>', 0, 2)

In [None]:
def process_func(example):
    MAX_LENGTH = 1024    
    input_ids, attention_mask, labels = [], [], []
    instruction = tokenizer(f"""<|system|>
                            你是一位老年服务机构的文书编辑，擅长将老人的口头叙述准确、清晰地转化为日常书面风格的中文文本，并判断语句中存在的错误类型。
                            句子中可能存在的错误类型：1. 句子成分缺失。2. 句子结构混乱。3. 句子成分错误。4. 句子成分冗余。
                            <|user|>
                            原文：{example['spoken_text']}
                            上下文(仅协助理解，不翻译): {example['context']}
                            仅输出原文那一句话的中文转换结果和错误类型序号，不要输出思考过程，不要输出解释。输出格式：
                            翻译结果：
                            错误类型：
                            <|assistant|>\n""", add_special_tokens=False)  
    # 构建response
    response = tokenizer(
        f"翻译结果：{example['written_text']}\n错误类型：{','.join(map(str, example['error_type']))}",
        add_special_tokens=False
    )
    input_ids = instruction["input_ids"] + response["input_ids"] + [tokenizer.pad_token_id]
    attention_mask = instruction["attention_mask"] + response["attention_mask"] + [1]  
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [tokenizer.pad_token_id]  
    if len(input_ids) > MAX_LENGTH:  # 做一个截断
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

In [7]:
tokenized_train = train_ds.map(process_func, remove_columns=train_ds.column_names)
tokenized_train

Map:   0%|          | 0/8003 [00:00<?, ? examples/s]

Map: 100%|██████████| 8003/8003 [00:09<00:00, 834.70 examples/s]


Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 8003
})

In [8]:
print(tokenizer.decode(tokenized_train[0]['input_ids']))

<|system|>
                            你是一位老年服务机构的文书编辑，擅长将老人的口头叙述准确、清晰地转化为日常书面风格的中文文本，并判断语句中存在的错误类型。
                            句子中可能存在的错误类型：1. 句子成分缺失。2. 句子结构混乱。3. 句子成分错误。4. 句子成分冗余。
                            <|user|>
                            原文：去一回也算，守上几天也算，任家庄街西社区也算抗美援朝老兵。
                            上下文(仅协助理解，不翻译): 王家庄街西社区的士兵没有去朝鲜。庄街西社区的部分居民并未参战，即便参战也未曾经历实战。主街西社区的居民承担了守备任务，但街西社区在五六年前已经停战了。任家庄街西社区已经停战，守备部队也已撤回。去一回也算，守上几天也算，任家庄街西社区也算抗美援朝老兵。任家庄街西社区那也算了啊，任家庄街西社你去了吗？任家庄街西社对对对大叔，任家庄街西那您四年以后当兵。4年以后就集体转业吗？就集体转业，送这的，就直接转到大西北来了啊。直接送到这里，哦送了多少人啊？你们一起送了。
                            仅输出原文那一句话的中文转换结果和错误类型序号，不要输出思考过程，不要输出解释。输出格式：
                            翻译结果：
                            错误类型：
                            <|assistant|>
 翻译结果：无论是曾赴朝鲜参战，还是仅参与过短期守备任务，任家庄街西社区的居民都可被视为抗美援朝老兵。
错误类型：1,2,3


In [9]:
tokenizer.decode(list(filter(lambda x: x != -100, tokenized_train[1]["labels"])))

'翻译结果：您坚持包粽子50年，真是不容易啊。\n错误类型：4'

# 创建模型

In [None]:
import torch

model = AutoModelForCausalLM.from_pretrained('./model/ZhipuAI/chatglm3-6b', 
                                             device_map="auto",
                                             torch_dtype=torch.bfloat16,
                                             #attn_implementation="flash_attention_2",
                                             use_cache=False,
                                             trust_remote_code=True)
model

Loading checkpoint shards: 100%|██████████| 7/7 [00:18<00:00,  2.66s/it]


ChatGLMForConditionalGeneration(
  (transformer): ChatGLMModel(
    (embedding): Embedding(
      (word_embeddings): Embedding(65024, 4096)
    )
    (rotary_pos_emb): RotaryEmbedding()
    (encoder): GLMTransformer(
      (layers): ModuleList(
        (0-27): 28 x GLMBlock(
          (input_layernorm): RMSNorm()
          (self_attention): SelfAttention(
            (query_key_value): Linear(in_features=4096, out_features=4608, bias=True)
            (core_attention): CoreAttention(
              (attention_dropout): Dropout(p=0.0, inplace=False)
            )
            (dense): Linear(in_features=4096, out_features=4096, bias=False)
          )
          (post_attention_layernorm): RMSNorm()
          (mlp): MLP(
            (dense_h_to_4h): Linear(in_features=4096, out_features=27392, bias=False)
            (dense_4h_to_h): Linear(in_features=13696, out_features=4096, bias=False)
          )
        )
      )
      (final_layernorm): RMSNorm()
    )
    (output_layer): Linear(in_

In [13]:
model.enable_input_require_grads() # 开启梯度检查点时，要执行该方法

In [14]:
model.dtype

torch.bfloat16

In [15]:
model.config._attn_implementation

'eager'

# lora 

In [18]:
from peft import LoraConfig, TaskType, get_peft_model

config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, 
    target_modules={"query_key_value"},#["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    inference_mode=False, # 训练模式
    r=8, # Lora 秩
    lora_alpha=32, # Lora alaph，具体作用参见 Lora 原理
    lora_dropout=0.1# Dropout 比例
)
config

LoraConfig(task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, inference_mode=False, r=8, target_modules={'query_key_value'}, exclude_modules=None, lora_alpha=32, lora_dropout=0.1, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', trainable_token_indices=None, loftq_config={}, eva_config=None, corda_config=None, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False)

In [19]:
model = get_peft_model(model, config)
config

LoraConfig(task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='/home/fz/finetune/model/ZhipuAI/chatglm3-6b', revision=None, inference_mode=False, r=8, target_modules={'query_key_value'}, exclude_modules=None, lora_alpha=32, lora_dropout=0.1, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', trainable_token_indices=None, loftq_config={}, eva_config=None, corda_config=None, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False)

In [20]:
model.print_trainable_parameters()

trainable params: 1,949,696 || all params: 6,245,533,696 || trainable%: 0.0312


# 配置训练参数

In [22]:
args = TrainingArguments(
    output_dir="./output/glm",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    logging_steps=10,
    num_train_epochs=3,
    save_steps=100,
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    save_on_each_node=True,
    save_strategy = "steps",
    gradient_checkpointing=True
)

In [None]:
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    padding="longest",
    pad_to_multiple_of=8,  
    return_tensors="pt"     
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    data_collator=data_collator,
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
#torch.cuda.empty_cache()

#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#model.to(device)  # 将模型移动到 GPU 上
model.device

device(type='cuda', index=0)

In [25]:
trainer.train()

You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it).Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model.
  return fn(*args, **kwargs)


Step,Training Loss
10,2.9281
20,2.3391
30,2.0023
40,1.9654
50,1.8549
60,1.692
70,1.6152
80,1.5504
90,1.4762
100,1.4238


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=1500, training_loss=1.2119361979166667, metrics={'train_runtime': 6680.7706, 'train_samples_per_second': 3.594, 'train_steps_per_second': 0.225, 'total_flos': 4.1944821011890176e+17, 'train_loss': 1.2119361979166667, 'epoch': 2.995502248875562})

# 保存 LoRA 和 tokenizer 结果


In [4]:
peft_model_id="./glm_lora/train1"
trainer.model.save_pretrained(peft_model_id)
tokenizer.save_pretrained(peft_model_id)

NameError: name 'trainer' is not defined

In [27]:
tokenized_test = test_ds.map(process_func, remove_columns=test_ds.column_names)

import gc
gc.collect()
torch.cuda.empty_cache()

model.config.use_cache = False
trainer.args.per_device_eval_batch_size = 1
#trainer.evaluate(tokenized_test.select(range(5)))  # 先试 5 条

trainer.evaluate(tokenized_test)

Map: 100%|██████████| 2001/2001 [00:02<00:00, 842.00 examples/s]


{'eval_loss': 1.132540225982666,
 'eval_runtime': 96.1808,
 'eval_samples_per_second': 20.805,
 'eval_steps_per_second': 20.805,
 'epoch': 2.995502248875562}

# 加载 lora 权重推理

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from peft import PeftModel

mode_path = './model/ZhipuAI/chatglm3-6b'
lora_path = './glm_lora/train1'

# 加载tokenizer
tokenizer = AutoTokenizer.from_pretrained(mode_path, trust_remote_code=True)
#tokenizer = AutoTokenizer.from_pretrained(lora_path, trust_remote_code=True)
tokenizer.padding_side = "left"
#if tokenizer.pad_token is None:
#    tokenizer.pad_token = tokenizer.eos_token

# 加载模型
model = AutoModelForCausalLM.from_pretrained(mode_path, device_map="auto",torch_dtype=torch.bfloat16, trust_remote_code=True).eval()

# 加载lora权重
model = PeftModel.from_pretrained(model, model_id=lora_path)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 7/7 [00:16<00:00,  2.32s/it]


In [None]:
torch.cuda.empty_cache()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#model.to(device)  # 将模型移动到 GPU 上
print(model.device)

cuda:0


In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd
from datasets import Dataset

# 将JSON文件转换为CSV文件
df = pd.read_json('./dataset/test_dataset.json')

test_ds = Dataset.from_pandas(df)
test_ds

Dataset({
    features: ['id', 'file_id', 'spoken_text', 'context', 'written_text', 'error_type', '__index_level_0__'],
    num_rows: 2001
})

In [None]:
model.device 

device(type='cuda', index=0)

In [None]:
import torch
import random
import json
import re
from tqdm import tqdm

# ✅ 解析预测结果中的“翻译结果”和“错误类型”
def extract_translation_and_error_type(pred_text):
    trans_match = re.search(r"翻译结果：\s*(.*?)\n", pred_text, re.DOTALL)
    translation = trans_match.group(1).strip() if trans_match else ""

    error_match = re.search(r"错误类型：\s*(.+)", pred_text)
    if error_match:
        error_str = error_match.group(1).strip()
        error_type = [int(x) for x in re.findall(r"\d+", error_str)]
    else:
        error_type = []

    return translation, error_type

# ✅ 设置 tokenizer 左侧 padding（适用于 decoder-only 架构）
tokenizer.padding_side = 'left'

# ✅ 采样测试样本
test_samples =list(test_ds)

batch_size = 8
all_results = []

for i in tqdm(range(0, len(test_samples), batch_size), desc="Processing samples"):
    batch = test_samples[i:i+batch_size]

    # ✅ 构造输入
    prompts = [
        f"""<|system|>
            你是一位老年服务机构的文书编辑，擅长将老人的口头叙述准确、清晰地转化为日常书面风格的文本，并判断语句中存在的错误类型。
            句子中可能存在的错误类型：1. 句子成分缺失。2. 句子结构混乱。3. 句子成分错误。4. 句子成分冗余。
            <|user|>
            原文：{example['spoken_text']}
            上下文(仅协助理解，不翻译): {example['context']}
            仅输出原文那一句话的翻译结果和错误类型序号，不要输出思考过程，不要输出解释。输出格式：
                翻译结果：
                错误类型：
            <|assistant|>""" for example in batch
    ]

    inputs = tokenizer(
        prompts,
        return_tensors="pt",
        padding="longest",
        truncation=True,
    ).to(model.device)

    def safe_decode(sequences, tokenizer):
        vocab_size = tokenizer.vocab_size  # ChatGLMTokenizer 使用这个属性获取词表大小
        cleaned = []
        for seq in sequences:
            if isinstance(seq, torch.Tensor):
                seq = seq.tolist()
            # 过滤掉不在词表范围内的 token
            seq = [token for token in seq if 0 <= token < vocab_size]
            cleaned.append(seq)
        return tokenizer.batch_decode(cleaned, skip_special_tokens=True)

    # ✅ 推理
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            num_beams=1,
            do_sample=False,
            temperature=0.2,
            top_p=0.95,
        )

    outputs = outputs[:, inputs['input_ids'].shape[1]:]
    #preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    preds = safe_decode(outputs, tokenizer)

    # ✅ 处理每条预测结果
    for pred_text, example in zip(preds, batch):
        translation, predicted_error_type = extract_translation_and_error_type(pred_text)

        # 转换原始标签为数组
        true_error = example.get("error_type", [])
        if isinstance(true_error, int):
            true_error = [true_error]
        elif isinstance(true_error, str):
            true_error = [int(x) for x in re.findall(r"\d+", true_error)]

        result = {
            "spoken_text": example["spoken_text"],
            "context": example["context"],
            "reference": example["written_text"],
            "ref_error_type": true_error,
            "prediction": translation,
            "pred_error_type": predicted_error_type
        }
        all_results.append(result)

# ✅ 保存结果到 JSON 文件
output_file = "./result/lora/lora_glm.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(all_results, f, ensure_ascii=False, indent=2)

print(f"✅ 预测结果已保存到 {output_file}")

Processing samples:   0%|          | 0/251 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Processing samples: 100%|██████████| 251/251 [31:35<00:00,  7.55s/it]

✅ 预测结果已保存到 model_predictions/lora_glm.json





In [None]:
import json
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
from tqdm import tqdm
import numpy as np

# 加载模型和 tokenizer
tokenizer = AutoTokenizer.from_pretrained("./bleurt-base-128", local_files_only=True)
model = AutoModelForSequenceClassification.from_pretrained("./bleurt-base-128", local_files_only=True)
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 加载数据
with open("./result/lora/lora_glm.json", "r", encoding="utf-8") as f:
    data = json.load(f)

import importlib
import evaluate
importlib.reload(evaluate)  # 强制重新加载  
from evaluate import calculate_all_metrics

# 存储所有指标
all_scores = []

for item in tqdm(data):
    reference = item["reference"]
    generated = item["prediction"]
    if not reference or not generated:
        continue  # 跳过空文本样本
    ref_error_types = item.get("ref_error_type", [])
    pred_error_types = item.get("pred_error_type", [])

    metrics = calculate_all_metrics(reference, generated, tokenizer, model, ref_error_types, pred_error_types, device)
    all_scores.append(metrics)

# 计算每个指标的平均值
average_scores = {}
for key in all_scores[0].keys():
    average_scores[key] = np.mean([score[key] for score in all_scores])

# 打印平均结果
print("各项指标的平均值：")
for key, value in average_scores.items():
    print(f"{key}: {value:.4f}")

  0%|          | 0/2001 [00:00<?, ?it/s]Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
DEBUG:jieba:Loading model from cache /tmp/jieba.cache
Loading model cost 0.399 seconds.
DEBUG:jieba:Loading model cost 0.399 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict has been built successfully.
  1%|          | 13/2001 [00:00<01:02, 31.66it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (142 > 128). Running this sequence through the model will result in indexing errors
100%|██████████| 2001/2001 [00:18<00:00, 110.79it/s]

各项指标的平均值：
BLEU-1: 0.6884
BLEU-2: 0.4519
BLEU-3: 0.3103
BLEU-4: 0.2237
ROUGE-1: 0.6030
ROUGE-2: 0.3082
ROUGE-L: 0.5414
BLEURT: 0.3515
Joint Accuracy: 0.2194
Acc-1: 0.9750



