# DeepSeek-R1-Distill-Qwen-1.5B 模型微调实验

本笔记本将指导您完成模型微调的整个过程。

## 1. 环境准备
首先安装必要的依赖包

In [9]:
# 安装依赖
!pip install -U torch==2.1.2
!pip install -U transformers>=4.37.2
!pip install -U accelerate>=0.27.0
!pip install -U peft>=0.7.0
!pip install -U bitsandbytes>=0.41.0
!pip install -U datasets>=2.16.0
!pip install -U deepspeed>=0.12.0
!pip install -U sentencepiece>=0.1.99
!pip install -U wandb>=0.15.0
!pip install -U trl>=0.7.10
!pip install bitsandbytes

# 验证安装
import torch
import transformers
import accelerate
import peft
import bitsandbytes
import datasets

print(f"PyTorch version: {torch.__version__}")
print(f"Transformers version: {transformers.__version__}")
print(f"Accelerate version: {accelerate.__version__}")
print(f"PEFT version: {peft.__version__}")
print(f"Bitsandbytes version: {bitsandbytes.__version__}")
print(f"Datasets version: {datasets.__version__}")

# 检查 GPU 可用性
print(f"\nGPU available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU model: {torch.cuda.get_device_name(0)}")
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")

Collecting torch==2.1.2
  Downloading torch-2.1.2-cp311-cp311-manylinux1_x86_64.whl.metadata (25 kB)
Collecting triton==2.1.0 (from torch==2.1.2)
  Using cached triton-2.1.0-0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.3 kB)
Downloading torch-2.1.2-cp311-cp311-manylinux1_x86_64.whl (670.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m670.2/670.2 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hUsing cached triton-2.1.0-0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (89.2 MB)
Installing collected packages: triton, torch
  Attempting uninstall: triton
    Found existing installation: triton 2.0.0
    Uninstalling triton-2.0.0:
      Successfully uninstalled triton-2.0.0
  Attempting uninstall: torch
    Found existing installation: torch 2.0.1+cu118
    Uninstalling torch-2.0.1+cu118:
      Successfully uninstalled torch-2.0.1+cu118
[31mERROR: pip's dependency resolver does not currently take into account all the packages 

RuntimeError: Failed to import transformers.models.auto.modeling_auto because of the following error (look up to see its traceback):
Failed to import transformers.generation.utils because of the following error (look up to see its traceback):
numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

## 2. 检查 GPU 环境

In [None]:
import torch
print("GPU available:", torch.cuda.is_available())
print("GPU device name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU")

## 3. 创建训练代码

In [9]:

%%writefile train_colab.py
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig,
    AutoConfig
)
from peft import (
    LoraConfig,
    get_peft_model,
    TaskType
)
import os
import json

def get_output_dir():
    # 检查是否在 Colab 环境中
    try:
        import google.colab
        try:
            from google.colab import drive
            drive.mount('/content/drive')
            output_dir = '/content/drive/MyDrive/model_training'
        except:
            output_dir = os.path.join(os.getcwd(), 'model_training')
    except ImportError:
        output_dir = os.path.join(os.getcwd(), 'model_training')

    os.makedirs(output_dir, exist_ok=True)
    return output_dir

def create_sample_dataset():
    """创建示例数据集"""
    data = [
        {
            "instruction": "解释什么是机器学习",
            "input": "",
            "output": "机器学习是人工智能的一个子领域，它使计算机系统能够通过经验自动改进..."
        },
        {
            "instruction": "写一个简单的Python函数",
            "input": "计算两个数的和",
            "output": "def add_numbers(a, b):\n    return a + b"
        },
        {
            "instruction": "总结以下文本的主要内容",
            "input": "人工智能是计算机科学的一个重要分支...",
            "output": "这段文本主要讨论了人工智能的概念和应用..."
        }
    ]

    with open('sample_data.json', 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

def load_model_and_tokenizer():
    """加载模型和分词器"""
    model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

    # 配置 4-bit 量化
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4"
    )

    # 首先下载并加载配置
    config = AutoConfig.from_pretrained(
        model_name,
        trust_remote_code=True
    )

    # 设置模型类型和其他必要配置
    config.model_type = "qwen"
    config.torch_dtype = torch.float16
    config.use_cache = True

    # 加载模型
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        config=config,
        trust_remote_code=True,
        device_map="auto",
        quantization_config=quantization_config
    )

    # 加载分词器
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        trust_remote_code=True,
        padding_side="right",
        model_max_length=2048
    )

    # 确保 tokenizer 配置正确
    if not tokenizer.pad_token_id:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    return model, tokenizer

def prepare_model_for_training(model):
    """准备模型进行训练"""
    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=4,
        lora_alpha=16,
        lora_dropout=0.1,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        bias="none",
        inference_mode=False,
    )

    model = get_peft_model(model, lora_config)
    return model

def prepare_dataset(tokenizer, data_path):
    """准备数据集"""
    # 加载数据集
    dataset = load_dataset("json", data_files={"train": data_path})

    def preprocess_function(examples):
        """处理数据集样本"""
        prompts = []
        outputs = []

        # 构建提示文本
        for instruction, input_text, output_text in zip(examples["instruction"], examples["input"], examples["output"]):
            if input_text:
                prompt = f"Instruction: {instruction}\nInput: {input_text}\nOutput: "
            else:
                prompt = f"Instruction: {instruction}\nOutput: "

            prompts.append(prompt)
            outputs.append(output_text)

        # 组合提示和输出
        texts = [p + o for p, o in zip(prompts, outputs)]

        # 对文本进行编码
        encodings = tokenizer(
            texts,
            truncation=True,
            max_length=256,
            padding="max_length",
            return_tensors="pt"
        )

        # 设置标签
        encodings["labels"] = encodings["input_ids"].clone()

        return encodings

    # 处理数据集
    processed_dataset = dataset["train"].map(
        preprocess_function,
        batched=True,
        remove_columns=dataset["train"].column_names,
        batch_size=4,
    )

    return processed_dataset

def main():
    """主函数"""
    # 获取输出目录
    output_dir = get_output_dir()

    # 创建示例数据集
    create_sample_dataset()

    # 加载模型和分词器
    model, tokenizer = load_model_and_tokenizer()

    # 准备模型进行训练
    model = prepare_model_for_training(model)

    # 准备训练数据集
    train_dataset = prepare_dataset(tokenizer, "sample_data.json")

    # 设置训练参数
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=1,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        learning_rate=1e-4,
        fp16=True,
        logging_steps=10,
        save_steps=50,
        warmup_steps=10,
        save_total_limit=2,
    )

    # 创建训练器
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    )

    # 开始训练
    trainer.train()

    # 保存模型
    trainer.save_model(os.path.join(output_dir, "final_model"))

if __name__ == "__main__":
    main()

Overwriting train_colab.py


## 4. 运行训练

In [10]:
!python train_colab.py

2025-04-06 15:35:28.743204: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743953728.765103    3904 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743953728.771843    3904 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-06 15:35:28.793715: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
Generating

## 5. 测试微调后的模型

In [11]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

def load_and_test_model():
    base_model = AutoModelForCausalLM.from_pretrained(
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        trust_remote_code=True,
        load_in_8bit=True,
        device_map="auto"
    )
    tokenizer = AutoTokenizer.from_pretrained(
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        trust_remote_code=True
    )

    model_path = "/content/drive/MyDrive/model_training/final_model"
    model = PeftModel.from_pretrained(base_model, model_path)

    def generate_response(prompt):
        inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
        outputs = model.generate(**inputs, max_length=256, temperature=0.7)
        return tokenizer.decode(outputs[0], skip_special_tokens=True)

    test_prompt = "解释什么是机器学习"
    response = generate_response(test_prompt)
    print(f"问题：{test_prompt}")
    print(f"回答：{response}")

load_and_test_model()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


ValueError: Can't find 'adapter_config.json' at '/content/drive/MyDrive/model_training/final_model'