In [1]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq,AutoTokenizer
import torch
import re
from tqdm import tqdm
import json


model_dir = "Qwen2.5-32B-Instruct-GPTQ-Int4"

### 加载模型和分词器
tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=False, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", torch_dtype=torch.bfloat16, trust_remote_code=True)

### 加载数据集

In [2]:
train_dir="round1_train_data.jsonl"
new_train_json_dir="round1_train_data_new.jsonl"

In [3]:
def process_func(example):
    """
    将数据集进行预处理
    """
    # global i
    MAX_LENGTH = 512 
    input_ids, attention_mask, labels = [], [], []
    instruction = tokenizer(
        f"<|system|>\n你是一个逻辑推理专家，擅长解决逻辑推理问题。以下是一个逻辑推理的题目，形式为单项选择题。所有的问题都是（close-world assumption）闭世界假设，即未观测事实都为假。请逐步分析问题并在最后一行输出答案，最后一行的格式为:答案是：A<|endoftext|>\n<|user|>\n{example['input']}<|endoftext|>\n<|assistant|>\n",
        add_special_tokens=False,
    )
    response = tokenizer(f"{example['output']}", add_special_tokens=False)
    input_ids = instruction["input_ids"] + response["input_ids"] + [tokenizer.pad_token_id]
    attention_mask = (
        instruction["attention_mask"] + response["attention_mask"] + [1]
    )
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [tokenizer.pad_token_id]
    if len(input_ids) > MAX_LENGTH:  # 做一个截断
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}   

In [4]:
model.enable_input_require_grads()

In [8]:
import pandas as pd
from datasets import Dataset
train_df = pd.read_json(new_train_json_dir, lines=True)
data=pd.read_json("new_500_data.jsonl", lines=True)     #模型预测的测试集（0.8622）
# data=pd.read_json("/home/tom/fssd/code/app/new_testdata.jsonl", lines=True)  #人工标注测试集
new_data=pd.concat([train_df,data])
data=pd.read_json("new_gpt4_data.jsonl",lines=True) #gpt4 公开数据集100
# data=pd.read_json("gpt_train_25000.jsonl",lines=True) #gpt4 公开数据集25000条，取前5000条
# data=data.head(5000) #和上面一行代码一起开，取前5000条数据
new_data=pd.concat([new_data,data])
new_data.reset_index(inplace=True)
new_data.drop("index",axis=1)
# new_data=new_data.sample(int(len(new_data)*0.8))  #随机抽80%的数据
train_ds = Dataset.from_pandas(new_data)

In [9]:
# new_data=new_data.sample(int(len(new_data)*0.8))

In [10]:
# new_data

Unnamed: 0,index,instruction,input,output
52,52,你是一个逻辑推理专家，擅长解决逻辑推理问题。以下是一个逻辑推理的题目，形式为单项选择题。所有...,题目:在一个团队中，成员有不同的角色和技能，并且有明确的上下级关系。以下是团队成员名单和他们...,A
1970,549,你是一个逻辑推理专家，擅长解决逻辑推理问题。以下是一个逻辑推理的题目，形式为单项选择题。所有...,"题目:有一组数字，分别为：2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1...",B
3128,379,你是一个逻辑推理专家，擅长解决逻辑推理问题。以下是一个逻辑推理的题目，形式为单项选择题。所有...,题目:在一个数字游戏中，玩家需要根据给定的数字规则来计算分数。规则如下：\n\n1. 如果一...,C
1504,83,你是一个逻辑推理专家，擅长解决逻辑推理问题。以下是一个逻辑推理的题目，形式为单项选择题。所有...,题目:有6个国家分别为：日本、埃及、尼泊尔、俄罗斯、中国和不丹，其对应的首都分别为：东京、开...,A
2488,1067,你是一个逻辑推理专家，擅长解决逻辑推理问题。以下是一个逻辑推理的题目，形式为单项选择题。所有...,题目:在以下问题中，请根据给定的条件，选择正确答案。\n\n有一组特定的规则和操作如下：\n...,A
...,...,...,...,...
1247,1247,你是一个逻辑推理专家，擅长解决逻辑推理问题。以下是一个逻辑推理的题目，形式为单项选择题。所有...,题目:假设有如下连个点之间的无向图信息：\n\n- 点A与点B之间有一条路径长度为1；\n-...,B
1644,223,你是一个逻辑推理专家，擅长解决逻辑推理问题。以下是一个逻辑推理的题目，形式为单项选择题。所有...,题目:在这个情景中，有三个人：Vincent、Marcellus和Mia。下列是关于他们的信...,C
90,90,你是一个逻辑推理专家，擅长解决逻辑推理问题。以下是一个逻辑推理的题目，形式为单项选择题。所有...,"题目:假设您有两组整数的列表：\n\n1. 第一组为 [2, 4, 7, 13, 63, 1...",D
2292,871,你是一个逻辑推理专家，擅长解决逻辑推理问题。以下是一个逻辑推理的题目，形式为单项选择题。所有...,题目:在一个印度板球联盟中，有多个球队参加比赛和互相竞争。以下是一些球队的表现情况的类别说明...,C


In [6]:
train_dataset = train_ds.map(process_func, remove_columns=train_ds.column_names)

Map:   0%|          | 0/3164 [00:00<?, ? examples/s]

In [7]:
from peft import LoraConfig, TaskType, get_peft_model
config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)

### 加载微调模型

In [8]:
model = get_peft_model(model, config)

### 配置超参数

In [9]:
args = TrainingArguments(
    output_dir="output_v1/qwen2_5_32B", #记得每一次修改文件
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    logging_steps=500,
    num_train_epochs=2,
    save_steps=500,
    learning_rate=1e-4,
    save_on_each_node=True,
    gradient_checkpointing=True,
    report_to="none",
    # fp16=True,
    save_total_limit=3,
    # seed=2024
)

### 开始训练

In [10]:
from transformers import DataCollatorWithPadding
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    train_dataset=train_dataset,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
)

trainer.train()

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
500,0.6722
1000,0.4984
1500,0.4223
2000,0.4578
2500,0.3841
3000,0.4116
3500,0.2547
4000,0.1878
4500,0.1724
5000,0.1614


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enab

TrainOutput(global_step=6328, training_loss=0.31921516989938226, metrics={'train_runtime': 13038.1141, 'train_samples_per_second': 0.485, 'train_steps_per_second': 0.485, 'total_flos': 8428866772205568.0, 'train_loss': 0.31921516989938226, 'epoch': 2.0})

In [11]:
def predict(messages, model, tokenizer):
    device = "cuda"
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(device)

    generated_ids = model.generate(
        model_inputs.input_ids,
        max_new_tokens=384,
        pad_token_id=tokenizer.eos_token_id
        # do_sample=False,
        # temperature=0.7
    )
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    
    # print(response)
     
    return response

### 验证训练集ACC

In [12]:
test_df = pd.read_json(new_train_json_dir, lines=True)

test_pred_list = []
test_label_list=[]
for index, row in tqdm(test_df.iterrows()):
    instruction = row['instruction']
    input_value = row['input']
    test_label_list.append(row["output"])
    messages = [
        {"role": "system", "content": f"{instruction}"},
        {"role": "user", "content": f"{input_value}"}
    ]

    response = predict(messages, model, tokenizer)
    test_pred_list.append(response)

0it [00:00, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
  return fn(*args, **kwargs)
34it [00:59,  1.72s/it]

1421it [42:15,  1.78s/it]


In [13]:
test_pred_list

['D',
 'A',
 'A',
 'C',
 'B',
 'B',
 'B',
 'B',
 'A',
 'B',
 'A',
 'D',
 'B',
 'B',
 'B',
 'D',
 'B',
 'B',
 'B',
 'B',
 'B',
 'B',
 'C',
 'A',
 'A',
 'C',
 'B',
 'A',
 'B',
 'B',
 'A',
 'A',
 'A',
 'A',
 'B',
 'A',
 'A',
 'A',
 'D',
 'C',
 'B',
 'C',
 'A',
 'A',
 'C',
 'B',
 'B',
 'A',
 'B',
 'A',
 'A',
 'B',
 'A',
 'A',
 'A',
 'A',
 'B',
 'B',
 'A',
 'B',
 'B',
 'B',
 'B',
 'A',
 'B',
 'C',
 'A',
 'B',
 'B',
 'C',
 'C',
 'C',
 'C',
 'C',
 'C',
 'B',
 'C',
 'B',
 'B',
 'D',
 'D',
 'A',
 'A',
 'B',
 'B',
 'B',
 'B',
 'A',
 'A',
 'C',
 'D',
 'D',
 'A',
 'D',
 'A',
 'A',
 'A',
 'A',
 'A',
 'C',
 'B',
 'B',
 'C',
 'D',
 'B',
 'A',
 'B',
 'C',
 'A',
 'B',
 'C',
 'D',
 'A',
 'C',
 'C',
 'C',
 'B',
 'A',
 'C',
 'B',
 'B',
 'B',
 'B',
 'A',
 'B',
 'C',
 'B',
 'B',
 'B',
 'B',
 'A',
 'A',
 'B',
 'B',
 'D',
 'A',
 'B',
 'B',
 'B',
 'B',
 'B',
 'D',
 'B',
 'A',
 'D',
 'A',
 'A',
 'A',
 'A',
 'A',
 'A',
 'B',
 'A',
 'D',
 'C',
 'B',
 'A',
 'A',
 'A',
 'A',
 'A',
 'B',
 'B',
 'B',
 'A',
 'B',
 'D'

In [14]:
from sklearn.metrics import accuracy_score
accuracy_score(test_label_list,test_pred_list)

0.9753694581280788