# DeepSeek-R1-Distill-Qwen-1.5B 模型微调实验

本笔记本将指导您完成模型微调的整个过程。

## 1. 环境准备
首先安装必要的依赖包

In [1]:
!pip install -q transformers==4.35.2 datasets accelerate bitsandbytes==0.41.1 peft==0.7.1 torch==2.1.0

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.8/126.8 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m41.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.0/76.0 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m41.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## 2. 检查 GPU 环境

In [2]:
import torch

# Ask CUDA once and reuse the answer for both report lines.
cuda_ready = torch.cuda.is_available()
print("GPU available:", cuda_ready)

# Only query the device name when a CUDA device is actually present.
if cuda_ready:
    device_label = torch.cuda.get_device_name(0)
else:
    device_label = "No GPU"
print("GPU device name:", device_label)

GPU available: True
GPU device name: Tesla T4


## 3. 创建训练代码

In [3]:
%%writefile train_colab.py

import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model,
    TaskType
)
import os
from google.colab import drive

def mount_drive():
    """Mount Google Drive and return the directory used for training output.

    Mounts Drive at ``/content/drive``, creates
    ``/content/drive/MyDrive/model_training`` when it is missing, and returns
    that directory path as a string.

    NOTE(review): ``drive.mount`` may need an interactive notebook kernel;
    confirm it works when this file is launched with ``!python``.
    """
    mount_point = '/content/drive'
    training_dir = '/content/drive/MyDrive/model_training'

    drive.mount(mount_point)
    # exist_ok keeps repeated runs from failing on an existing folder.
    os.makedirs(training_dir, exist_ok=True)
    return training_dir

def create_sample_dataset():
    """Write a three-example instruction dataset to ``sample_data.json``.

    The file is created in the current working directory as a top-level JSON
    array with one object per example. Every object carries three string
    fields: ``instruction``, ``input`` (empty when the task needs no extra
    context) and ``output``.

    The records are stored as a flat array rather than nested under a
    ``"train"`` key because the file is later read with
    ``load_dataset("json", data_files=...)`` without a ``field=`` argument;
    a nested layout would not expose ``instruction``/``input``/``output`` as
    dataset columns.

    Returns:
        None. The only effect is the file written to disk.
    """
    import json

    records = [
        {
            "instruction": "解释什么是机器学习",
            "input": "",
            "output": "机器学习是人工智能的一个子领域，它使计算机系统能够通过经验自动改进...",
        },
        {
            "instruction": "写一个简单的Python函数",
            "input": "计算两个数的和",
            # A real line break: the target must be valid Python source, not
            # text containing a backslash followed by the letter "n".
            "output": "def add_numbers(a, b):\n    return a + b",
        },
        {
            "instruction": "总结以下文本的主要内容",
            "input": "人工智能是计算机科学的一个重要分支...",
            "output": "这段文本主要讨论了人工智能的概念和应用...",
        },
    ]

    # ensure_ascii=False keeps the Chinese text human-readable in the file.
    with open('sample_data.json', 'w', encoding='utf-8') as f:
        json.dump(records, f, ensure_ascii=False, indent=2)

def load_model_and_tokenizer():
    """Load the DeepSeek-R1-Distill-Qwen-1.5B checkpoint and its tokenizer.

    The model is requested with ``load_in_8bit=True`` and
    ``device_map="auto"``; both loaders are called with
    ``trust_remote_code=True``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    checkpoint = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

    # Keyword arguments for the model loader, gathered in one place.
    model_options = {
        "trust_remote_code": True,
        "load_in_8bit": True,
        "device_map": "auto",
    }
    model = AutoModelForCausalLM.from_pretrained(checkpoint, **model_options)
    tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)

    return model, tokenizer

def prepare_model_for_training(model):
    """Wrap ``model`` with LoRA adapters for causal-LM fine-tuning.

    Args:
        model: The loaded base model to adapt.

    Returns:
        The result of ``get_peft_model`` applied to the model after it has
        been passed through ``prepare_model_for_kbit_training``.
    """
    # Rank-4 adapters on the four attention projections, no bias training.
    adapter_settings = {
        "task_type": TaskType.CAUSAL_LM,
        "r": 4,
        "lora_alpha": 16,
        "lora_dropout": 0.1,
        "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"],
        "bias": "none",
        "inference_mode": False,
    }
    adapter_config = LoraConfig(**adapter_settings)

    kbit_ready = prepare_model_for_kbit_training(model)
    return get_peft_model(kbit_ready, adapter_config)

def prepare_dataset(tokenizer, data_path):
    """Load a JSON instruction dataset and tokenize it for causal-LM training.

    Every record must provide ``instruction``, ``input`` and ``output``
    string fields. Each record is rendered as one training text::

        Instruction: <instruction>
        Input: <input>          (line omitted when ``input`` is empty)
        Output: <output>

    and tokenized to exactly 256 tokens (truncated or padded).

    Args:
        tokenizer: Tokenizer called on the list of rendered texts.
        data_path: Path passed to ``load_dataset("json", data_files=...)``.

    Returns:
        The tokenized ``train`` split with the original text columns removed.
    """
    dataset = load_dataset("json", data_files=data_path)

    def preprocess_function(examples):
        """Render and tokenize one batch (a dict of column -> list of values)."""
        texts = []
        for instruction, input_text, output in zip(
            examples["instruction"], examples["input"], examples["output"]
        ):
            # Real newlines separate the sections; the optional "Input" line
            # is skipped when the record has no extra context.
            if input_text:
                prompt = f"Instruction: {instruction}\nInput: {input_text}\nOutput: "
            else:
                prompt = f"Instruction: {instruction}\nOutput: "
            texts.append(prompt + output)

        # Plain Python lists are returned: the mapped dataset stores the
        # values itself, so building tensors here would serve no purpose.
        return tokenizer(
            texts,
            truncation=True,
            max_length=256,
            padding="max_length",
        )

    train_split = dataset["train"]
    # batched=True is required: preprocess_function zips over column lists.
    # In the default per-example mode each field is a single string and zip
    # would walk it character by character.
    processed_dataset = train_split.map(
        preprocess_function,
        batched=True,
        batch_size=4,
        remove_columns=train_split.column_names,
    )
    return processed_dataset

def main():
    """Run the whole fine-tuning pipeline and save the adapted model.

    Steps, in order: mount Drive to obtain the output directory, write the
    sample dataset, load the model and tokenizer, attach LoRA adapters,
    tokenize ``sample_data.json``, train for one epoch, and save the result
    under ``<output_dir>/final_model``.
    """
    artifacts_dir = mount_drive()
    create_sample_dataset()

    base_model, tokenizer = load_model_and_tokenizer()
    lora_model = prepare_model_for_training(base_model)
    tokenized_train = prepare_dataset(tokenizer, "sample_data.json")

    # Hyper-parameters collected in one mapping so they are easy to scan.
    hyperparameters = {
        "output_dir": artifacts_dir,
        "num_train_epochs": 1,
        "per_device_train_batch_size": 2,
        "gradient_accumulation_steps": 4,
        "learning_rate": 1e-4,
        "fp16": True,
        "logging_steps": 10,
        "save_steps": 50,
        "warmup_steps": 10,
        "save_total_limit": 2,
    }
    training_args = TrainingArguments(**hyperparameters)

    # mlm=False selects the causal (next-token) language-modeling collator.
    collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
    trainer = Trainer(
        model=lora_model,
        args=training_args,
        train_dataset=tokenized_train,
        data_collator=collator,
    )

    trainer.train()

    final_model_dir = os.path.join(artifacts_dir, "final_model")
    trainer.save_model(final_model_dir)


# Run the pipeline only when the file is executed as a script.
if __name__ == "__main__":
    main()

Writing train_colab.py


## 4. 运行训练

In [None]:
!python train_colab.py

## 5. 测试微调后的模型

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

def load_and_test_model():
    """Load the base model plus the fine-tuned LoRA adapter and run one prompt.

    The base checkpoint is loaded in 8-bit with ``device_map="auto"``, the
    adapter saved at ``/content/drive/MyDrive/model_training/final_model`` is
    attached with ``PeftModel.from_pretrained``, and one sampled completion
    for a fixed test prompt is printed.

    Returns:
        None. The question and the decoded answer are printed.
    """
    base_model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        trust_remote_code=True,
        load_in_8bit=True,
        device_map="auto"
    )
    tokenizer = AutoTokenizer.from_pretrained(
        base_model_name,
        trust_remote_code=True
    )

    model_path = "/content/drive/MyDrive/model_training/final_model"
    model = PeftModel.from_pretrained(base_model, model_path)

    def generate_response(prompt):
        """Return the decoded generation (prompt included) for ``prompt``."""
        # Follow the device the model was placed on instead of assuming
        # "cuda".
        inputs = tokenizer(prompt, return_tensors="pt").to(base_model.device)
        # do_sample=True is what makes `temperature` take effect; without it
        # generation is greedy and the temperature setting is ignored.
        outputs = model.generate(
            **inputs,
            max_length=256,
            do_sample=True,
            temperature=0.7,
        )
        return tokenizer.decode(outputs[0], skip_special_tokens=True)

    # NOTE(review): the training script formats its examples with an
    # "Instruction: ... Output: " template; consider wrapping the test prompt
    # the same way so inference matches the fine-tuning format.
    test_prompt = "解释什么是机器学习"
    response = generate_response(test_prompt)
    print(f"问题：{test_prompt}")
    print(f"回答：{response}")


load_and_test_model()

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


ImportError: cannot import name 'EncoderDecoderCache' from 'transformers' (/usr/local/lib/python3.11/dist-packages/transformers/__init__.py)