# 安装所需依赖库

In [None]:
!pip install datasets
!pip install pandas
!pip install transformers
!pip install modelscope

# 下载模型

In [None]:

from modelscope import snapshot_download, AutoModel, AutoTokenizer
# Download DeepSeek-R1-Distill-Qwen-1.5B from ModelScope.
def download_model(model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B',
                   cache_dir='./Model/ds1b/autodl-tmp',
                   revision='master'):
    """Download a model snapshot from ModelScope and return where it was stored.

    Args:
        model_id: ModelScope model identifier to download.
        cache_dir: Directory under which the snapshot is stored.
        revision: Model revision to fetch.

    Returns:
        The value returned by ``snapshot_download`` (the local model directory),
        so callers can reuse it instead of hard-coding the path.
    """
    return snapshot_download(model_id, cache_dir=cache_dir, revision=revision)


# NOTE(review): later cells load the model from '/root/autodl-tmp/deepseek-ai/...',
# which is not under the cache_dir used here -- confirm the paths match your environment.
# As the last expression of the cell, the returned directory is displayed as output.
download_model()

# 测试未微调模型补全代码功能

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from peft import PeftModel
# Local directory of the downloaded base model.
model_path = '/root/autodl-tmp/deepseek-ai/DeepSeek-R1-Distill-Qwen-1___5B'

# Load the tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Load the base model in bfloat16 and put it in evaluation mode.
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto",torch_dtype=torch.bfloat16, trust_remote_code=True).eval()

# Conversation to send: a system instruction plus the code fragment to complete.
# The role was previously misspelled as 'sysrem'; chat templates dispatch on the
# role name, so the instruction has to use the standard 'system' role.
messages=[
    {'role': 'system', 'content': "Please complete the following code"},
    { 'role': 'user', 'content': '''def get_logger(name: str) -> logging.Logger:\n
                                        logger = logging.getLogger(name)\n
                                        handler = logging.StreamHandler(sys.stdout)\n'''}
]
# Render the conversation with the model's chat template.
# return_dict=True returns input_ids together with the attention_mask, so the
# mask is passed to generate() explicitly instead of being guessed.
inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt", return_dict=True).to(model.device)
prompt_length = inputs["input_ids"].shape[1]
# Greedy decoding (do_sample=False). The sampling-only top_k/top_p arguments were
# removed because they have no effect without sampling.
outputs = model.generate(**inputs, max_new_tokens=512, do_sample=False, num_return_sequences=1, eos_token_id=tokenizer.eos_token_id)
# Print only the newly generated tokens (everything after the prompt).
print(tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True))

# 导入环境和数据集

In [None]:
from datasets import Dataset
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer, GenerationConfig
# Read the JSON dataset into a pandas DataFrame, then wrap that DataFrame
# as a Hugging Face Dataset (no CSV file is produced here).
df = pd.read_json("/content/all_dataset.json")
ds = Dataset.from_pandas(df)

# 处理数据集

In [None]:
import torch
# Base model to fine-tune. This must be the same checkpoint that the inference
# cells load, because a LoRA adapter only fits the base model it was trained on.
# The path previously pointed at DeepSeek-Coder-V2-Lite-Instruct, while every other
# cell uses DeepSeek-R1-Distill-Qwen-1.5B.
# NOTE(review): adjust the directory if the model lives elsewhere in your environment.
model_path = '/root/autodl-tmp/deepseek-ai/DeepSeek-R1-Distill-Qwen-1___5B'
# Load the tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, trust_remote_code=True)
# Pad on the right-hand side of each sequence.
tokenizer.padding_side = 'right'
# Dataset preprocessing function.
def process_func(example, max_length=512):
    """Tokenize one dataset row into causal-LM training features.

    The prompt is built from ``coder_file_name``, ``prerequisite`` and ``pre_code``;
    the target is ``output`` followed by the end-of-sentence marker. Prompt positions
    are labelled -100, so only the response positions carry real label ids.

    Args:
        example: Mapping with the keys ``coder_file_name``, ``prerequisite``,
            ``pre_code`` and ``output``. A value of ``None`` is treated as empty text.
        max_length: Maximum number of tokens kept; longer sequences are cut at the end.

    Returns:
        A dict with ``input_ids``, ``attention_mask`` and ``labels``, all lists of
        the same length (at most ``max_length``).
    """
    def _text(value):
        # Missing fields arrive as None. Without this, an f-string would render the
        # literal "None" and the "+" concatenation below would raise TypeError.
        return "" if value is None else str(value)

    prompt_text = (f"<｜begin▁of▁sentence｜>{_text(example['coder_file_name'])}\n"
                   f"User: {_text(example['prerequisite']) + _text(example['pre_code'])}\nAssistant: "
                   ).strip()
    instruction = tokenizer(prompt_text, add_special_tokens=False)
    response = tokenizer(f"{_text(example['output'])}<｜end▁of▁sentence｜>", add_special_tokens=False)

    prompt_ids = list(instruction["input_ids"])
    response_ids = list(response["input_ids"])
    response_mask = list(response["attention_mask"])

    # The response text already ends with the end-of-sentence marker, so no extra
    # pad token is appended (it used to be attended to and trained on, and it put
    # None into the lists when the tokenizer had no pad token). If the marker was
    # not encoded as the EOS id, terminate the sequence with the EOS id instead.
    eos_id = tokenizer.eos_token_id
    if eos_id is not None and (not response_ids or response_ids[-1] != eos_id):
        response_ids.append(eos_id)
        response_mask.append(1)

    input_ids = prompt_ids + response_ids
    attention_mask = list(instruction["attention_mask"]) + response_mask
    # -100 on the prompt positions; real ids on the response positions.
    labels = [-100] * len(prompt_ids) + response_ids

    # Slicing is a no-op for sequences that already fit within max_length.
    return {
        "input_ids": input_ids[:max_length],
        "attention_mask": attention_mask[:max_length],
        "labels": labels[:max_length]
    }


# Apply the preprocessing function to every row of the dataset and drop the
# original columns, keeping only what process_func returns.
tokenized_id = ds.map(process_func, remove_columns=ds.column_names)

# 加载模型

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer, GenerationConfig
# Load the base model with 4-bit quantization.
from transformers import BitsAndBytesConfig

# Quantization options are grouped in a BitsAndBytesConfig and passed through
# quantization_config; handing load_in_4bit / bnb_4bit_* straight to
# from_pretrained is the deprecated form.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,  # load the weights in 4-bit precision
    bnb_4bit_compute_dtype=torch.half,  # dtype used for computation on the 4-bit weights
    bnb_4bit_quant_type="nf4",  # NF4 quantization type
    bnb_4bit_use_double_quant=True,  # nested quantization (not "double precision")
)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.half,
    low_cpu_mem_usage=True,  # reduce CPU memory use while loading
    quantization_config=quantization_config,
    trust_remote_code=True,  # same setting as the tokenizer and the inference cells
    device_map="cuda:0",
)
# The previous trailing .eval() was dropped: from_pretrained already returns the
# model in evaluation mode, and this model is about to be trained.
# Needed when gradient checkpointing is enabled: makes the input embeddings'
# output require grad so gradients can flow back through the checkpointed blocks.
model.enable_input_require_grads()

# 配置 LoRA 参数并转换模型

In [None]:
from peft import LoraConfig, TaskType, get_peft_model
# LoRA configuration.
config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    # Names of the sub-modules that receive LoRA adapters (module names, not a
    # layer count). k_proj / v_proj were added so the key/value projections of a
    # standard-attention model such as the Qwen2-based R1 distill are adapted too.
    # kv_a_proj_with_mqa / kv_b_proj are kept for DeepSeek-V2-style attention.
    # Names that do not exist in the loaded model simply match nothing.
    target_modules=["q_proj", "k_proj", "v_proj", "kv_a_proj_with_mqa", "kv_b_proj", "o_proj", 'gate_proj', 'up_proj', 'down_proj'],
    inference_mode=False,  # training mode
    r=8,  # LoRA rank
    lora_alpha=32,  # LoRA alpha (scaling numerator; see the LoRA paper)
    lora_dropout=0.1  # dropout applied inside the LoRA layers
)
# Wrap the pretrained model so that only the LoRA parameters are trained.
# NOTE(review): re-running this cell wraps the already-wrapped model again;
# reload the base model before re-running.
model = get_peft_model(model, config)

# 配置训练参数并开始训练

In [None]:
# Training hyperparameters.
args = TrainingArguments(
    # Directory that receives the LoRA checkpoints.
    output_dir="./output/deepseek_r1",
    # Optimisation schedule: two full passes over the dataset.
    num_train_epochs=2,
    learning_rate=1e-5,
    # One sequence per device per step, with gradients accumulated over 8 steps
    # before each optimizer update.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    # Trade extra compute for lower activation memory.
    gradient_checkpointing=True,
    # Log every 10 steps and write a checkpoint every 1000 steps.
    logging_steps=10,
    save_steps=1000,
    save_on_each_node=True,
)

In [None]:
# Build the trainer. DataCollatorForSeq2Seq pads input_ids, attention_mask and
# labels of each batch to a common length.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_id,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
)
# Run training.
trainer.train()
# Persist the final weights to args.output_dir. Periodic checkpoints are only
# written every save_steps steps, so without this the end-of-training state has
# no stable, known location.
trainer.save_model()

# 加载 LoRA 权重并进行测试推理

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from peft import PeftModel
# Local directory of the downloaded base model.
model_path = '/root/autodl-tmp/deepseek-ai/DeepSeek-R1-Distill-Qwen-1___5B'
# Directory of the trained LoRA checkpoint.
# NOTE(review): checkpoints are written every save_steps=1000 steps, so
# 'checkpoint-10000' only exists if training reached step 10000 -- confirm the
# directory name against the contents of ./output/deepseek_r1.
lora_path = './output/deepseek_r1/checkpoint-10000'

# Load the tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Load the base model in bfloat16 and put it in evaluation mode.
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto",torch_dtype=torch.bfloat16, trust_remote_code=True).eval()

# Attach the LoRA weights to the base model.
model = PeftModel.from_pretrained(model, model_id=lora_path)
# Conversation to send: a system instruction plus the code fragment to complete.
# The role was previously misspelled as 'sysrem'; chat templates dispatch on the
# role name, so the instruction has to use the standard 'system' role.
messages=[
    {'role': 'system', 'content': "Please complete the following code"},
    { 'role': 'user', 'content': '''def get_logger(name: str) -> logging.Logger:\n
                                        logger = logging.getLogger(name)\n
                                        handler = logging.StreamHandler(sys.stdout)\n'''}
]
# Render the conversation with the model's chat template.
# return_dict=True returns input_ids together with the attention_mask, so the
# mask is passed to generate() explicitly instead of being guessed.
inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt", return_dict=True).to(model.device)
prompt_length = inputs["input_ids"].shape[1]
# Greedy decoding (do_sample=False). The sampling-only top_k/top_p arguments were
# removed because they have no effect without sampling.
outputs = model.generate(**inputs, max_new_tokens=512, do_sample=False, num_return_sequences=1, eos_token_id=tokenizer.eos_token_id)
# Print only the newly generated tokens (everything after the prompt).
print(tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True))