# DeepSeek-R1-Distill-Qwen-1.5B 模型微调实验

本笔记本将指导您完成模型微调的整个过程。

## 1. 环境准备
首先安装必要的依赖包。

In [1]:
# Inspect the CUDA environment: show the GPU/driver status, then look for the
# CUDA runtime (libcudart) and cuSPARSE shared libraries in the usual
# install locations and in the dynamic linker cache.
!nvidia-smi
print("\nCUDA 库信息:")
!find /usr/local/cuda* -name "libcudart.so*"
!find /usr/local/cuda* -name "libcusparse.so*"
!find /usr/lib/x86_64-linux-gnu -name "libcudart.so*"
!find /usr/lib/x86_64-linux-gnu -name "libcusparse.so*"
!ldconfig -p | grep -E 'cuda|cusparse'

# Install the CUDA 11.8 runtime, libraries and cuSPARSE packages via apt.
!apt-get update && apt-get install -y cuda-cudart-11-8 cuda-libraries-11-8 libcusparse-11-8

# Symlink the CUDA 11.8 libcusparse.so.11 into /usr/local/cuda/lib64
# (-f overwrites an existing link with the same name).
!ln -sf /usr/local/cuda-11.8/targets/x86_64-linux/lib/libcusparse.so.11 /usr/local/cuda/lib64/libcusparse.so.11
# Set the CUDA environment variables (LD_LIBRARY_PATH and CUDA_HOME).
import os


def prepend_library_paths(current, candidates):
    """Return *current* with each existing, not-yet-listed candidate prepended.

    Args:
        current: A ":"-separated search path such as the value of
            LD_LIBRARY_PATH (may be empty).
        candidates: Directories to add. Entries that do not exist on disk or
            that are already listed in *current* are skipped.

    Returns:
        The updated ":"-separated path without a trailing ":".
    """
    for path in candidates:
        # Compare against whole entries. A substring test on the joined string
        # would wrongly skip "/opt/cuda" when only "/opt/cuda/lib64" is listed.
        if os.path.exists(path) and path not in current.split(":"):
            current = f"{path}:{current}"
    return current.rstrip(":")


cuda_paths = [
    "/usr/local/cuda-11.8/lib64",
    "/usr/local/cuda-11.8/extras/CUPTI/lib64",
    "/usr/local/cuda/lib64",
    "/usr/local/cuda/extras/CUPTI/lib64",
    "/usr/lib/x86_64-linux-gnu",
    "/usr/lib/cuda/lib64",
    "/usr/lib/cuda/include",
    "/usr/local/cuda/targets/x86_64-linux/lib"
]

ld_library_path = prepend_library_paths(
    os.environ.get("LD_LIBRARY_PATH", ""), cuda_paths
)

os.environ["LD_LIBRARY_PATH"] = ld_library_path
os.environ["CUDA_HOME"] = "/usr/local/cuda"

print("\n环境变量设置:")
print("CUDA_HOME:", os.environ.get("CUDA_HOME"))
print("LD_LIBRARY_PATH:", os.environ.get("LD_LIBRARY_PATH"))

# Refresh the dynamic linker cache, then ask a fresh Python process whether
# torch can see CUDA and which CUDA version torch reports.
!ldconfig
!python -c "import torch; print('CUDA 可用:', torch.cuda.is_available()); print('CUDA 版本:', torch.version.cuda)"

# Uninstall the existing packages to avoid version conflicts.
!pip uninstall -y numpy bitsandbytes transformers torch torchvision torchaudio accelerate peft setuptools jedi sentence-transformers diffusers huggingface-hub datasets fastai timm tensorflow tensorboard jax jaxlib

# Install the dependencies in a fixed order, each pinned to one version.
# NOTE(review): the recorded output of the training cell reports NumPy 2.0.2
# at runtime, so the numpy==1.24.3 pin does not appear to have held — confirm
# whether a later install upgrades NumPy or the runtime needs a restart.
# NOTE(review): confirm that transformers==4.31.0 can load the
# DeepSeek-R1-Distill-Qwen-1.5B checkpoint used by the training script.
!pip install -q setuptools==68.2.2 wheel==0.41.2 pip==23.3.1
!pip install -q numpy==1.24.3
!pip install -q torch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cu118
!pip install -q huggingface-hub==0.20.3
!pip install -q transformers==4.31.0
!pip install -q datasets==2.14.7
!pip install -q accelerate==0.20.3 peft==0.3.0

# Build bitsandbytes from source (CUDA_VERSION=118) and install it.
!git clone https://github.com/TimDettmers/bitsandbytes.git
!cd bitsandbytes && CUDA_VERSION=118 make cuda11x && python setup.py install

# Install the released bitsandbytes package and triton.
# NOTE(review): this presumably replaces the source build installed just
# above — confirm which of the two installs is actually intended.
!pip install -q bitsandbytes==0.41.1
!pip install -q triton==2.0.0
# Re-import to make sure the environment is correct: if transformers or numpy
# were already imported in this kernel, reload their top-level modules.
# NOTE(review): reloading only the top-level module may not fully switch to a
# newly installed version — a runtime restart is presumably more reliable.
import os
import sys
import importlib
if 'transformers' in sys.modules:
    importlib.reload(sys.modules['transformers'])
if 'numpy' in sys.modules:
    importlib.reload(sys.modules['numpy'])

# Verify the bitsandbytes installation by importing it and printing its
# attribute list and version.
print("\n验证 bitsandbytes 安装:")
import bitsandbytes as bnb
print("已加载的 bitsandbytes 模块:", dir(bnb))
print("Bitsandbytes 版本:", bnb.__version__)

# Verify 8-bit quantization: push a random 2x3 tensor through a 3->4
# Linear8bitLt layer on the GPU (both .cuda() calls require a CUDA device).
print("\n验证 8-bit 量化功能:")
import torch
x = torch.randn(2, 3).cuda()
linear_8bit = bnb.nn.Linear8bitLt(3, 4, has_fp16_weights=False).cuda()
out = linear_8bit(x)
print("8-bit 线性层测试成功!")

# Verify transformers by loading the "bert-base-chinese" tokenizer.
print("\n验证 transformers 功能:")
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
print("Transformers tokenizer 测试成功!")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m123.5/123.5 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m62.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m168.3/168.3 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m670.2/670.2 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.6/410.6 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m39.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## 2. 检查 GPU 环境

In [None]:
import torch

# Report whether CUDA is usable and, if it is, the name of device 0.
gpu_ready = torch.cuda.is_available()
print("GPU available:", gpu_ready)
if gpu_ready:
    device_label = torch.cuda.get_device_name(0)
else:
    device_label = "No GPU"
print("GPU device name:", device_label)

## 3. 创建训练代码

In [2]:
%%writefile train_colab.py

import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig,
    AutoConfig
)
from peft import (
    LoraConfig,
    get_peft_model,
    TaskType
)
import os

def get_output_dir():
    """Return the directory for training outputs, creating it if needed.

    When the ``google.colab`` package is importable, Google Drive is mounted
    at ``/content/drive`` and ``/content/drive/MyDrive/model_training`` is
    used. If the package is missing or mounting fails, the result is a
    ``model_training`` folder under the current working directory.

    Returns:
        The path of the (now existing) output directory.
    """
    try:
        from google.colab import drive
    except ImportError:
        # Not running inside Colab.
        drive = None

    output_dir = None
    if drive is not None:
        try:
            drive.mount('/content/drive')
            output_dir = '/content/drive/MyDrive/model_training'
        except Exception as err:
            # Best-effort: fall back to local storage, but say so instead of
            # silently swallowing the error (a bare ``except`` would also
            # trap KeyboardInterrupt/SystemExit).
            print(f"Google Drive mount failed ({err!r}); using a local directory instead.")

    if output_dir is None:
        output_dir = os.path.join(os.getcwd(), 'model_training')

    os.makedirs(output_dir, exist_ok=True)
    return output_dir

def create_sample_dataset():
    """Write a three-example instruction dataset to ``sample_data.json``.

    The file is created in the current working directory as UTF-8 JSON with a
    single top-level key ``"train"`` that maps to a dict of equally long
    columns: ``"instruction"``, ``"input"`` and ``"output"``.
    """
    import json

    data = {
        "instruction": [
            "解释什么是机器学习",
            "写一个简单的Python函数",
            "总结以下文本的主要内容"
        ],
        "input": [
            "",
            "计算两个数的和",
            "人工智能是计算机科学的一个重要分支..."
        ],
        "output": [
            "机器学习是人工智能的一个子领域，它使计算机系统能够通过经验自动改进...",
            # A real newline: the doubled backslash used previously stored the
            # two characters "\" and "n" in the sample instead of a line break.
            "def add_numbers(a, b):\n    return a + b",
            "这段文本主要讨论了人工智能的概念和应用..."
        ]
    }

    with open('sample_data.json', 'w', encoding='utf-8') as f:
        json.dump({"train": data}, f, ensure_ascii=False, indent=2)

def load_model_and_tokenizer():
    """Load DeepSeek-R1-Distill-Qwen-1.5B with 4-bit quantization plus its tokenizer.

    Returns:
        A ``(model, tokenizer)`` tuple. The model is loaded with
        ``device_map="auto"`` and an NF4 double-quantized 4-bit configuration
        using float16 compute; the tokenizer pads on the right and is
        guaranteed to have a pad token id.
    """
    model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

    # 4-bit quantization settings.
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4"
    )

    # Download and load the checkpoint's own configuration first.
    config = AutoConfig.from_pretrained(
        model_name,
        trust_remote_code=True
    )

    # Keep the model type the checkpoint declares: overwriting it with "qwen"
    # mislabels this Qwen2-based model. Only adjust dtype and caching.
    config.torch_dtype = torch.float16
    config.use_cache = True

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        config=config,
        trust_remote_code=True,
        device_map="auto",
        quantization_config=quantization_config
    )

    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        trust_remote_code=True,
        padding_side="right",
        model_max_length=2048
    )

    # Fall back to EOS only when no pad token exists. A plain truthiness test
    # would also overwrite a legitimate pad token id of 0.
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    return model, tokenizer

def prepare_model_for_training(model):
    """Wrap *model* with LoRA adapters and return the resulting PEFT model.

    The adapter is configured for causal language modelling with rank 4,
    alpha 16, dropout 0.1, no bias training, and targets the modules named
    ``q_proj``, ``k_proj``, ``v_proj`` and ``o_proj``.
    """
    adapter_settings = dict(
        task_type=TaskType.CAUSAL_LM,
        r=4,
        lora_alpha=16,
        lora_dropout=0.1,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        bias="none",
        inference_mode=False,
    )
    return get_peft_model(model, LoraConfig(**adapter_settings))

def prepare_dataset(tokenizer, data_path, field="train"):
    """Load the instruction JSON file and tokenize it for causal-LM training.

    Args:
        tokenizer: Tokenizer used to encode the prompt+answer texts; it is
            called with a list of strings and padding/truncation options.
        data_path: Path of the JSON file to load.
        field: Top-level JSON key that holds the records. Defaults to
            ``"train"``, the key ``create_sample_dataset`` writes. Pass
            ``None`` for files whose records are not nested under a key.

    Returns:
        The tokenized ``train`` split with the original text columns removed.
        Every example is padded or truncated to 256 tokens.
    """
    # The records live under a top-level key, so the JSON loader has to be
    # told which field to read.
    dataset = load_dataset("json", data_files=data_path, field=field)

    def preprocess_function(examples):
        """Build "Instruction/Input/Output" texts for a batch and tokenize them."""
        prompts = []
        for instruction, input_text in zip(examples["instruction"], examples["input"]):
            # Real newlines separate the sections; the doubled backslash used
            # previously put a literal "\n" character pair into every prompt.
            if input_text:
                prompt = f"Instruction: {instruction}\nInput: {input_text}\nOutput: "
            else:
                prompt = f"Instruction: {instruction}\nOutput: "
            prompts.append(prompt)

        texts = [p + o for p, o in zip(prompts, examples["output"])]

        encodings = tokenizer(
            texts,
            truncation=True,
            max_length=256,
            padding="max_length",
            return_tensors="pt"
        )
        return encodings

    # batched=True is required: preprocess_function iterates over columns
    # (lists of values). Without it each call receives one example and the
    # zip above would walk over the characters of single strings.
    processed_dataset = dataset["train"].map(
        preprocess_function,
        batched=True,
        batch_size=4,
        remove_columns=dataset["train"].column_names,
    )
    return processed_dataset

def main():
    """Run the fine-tuning pipeline end to end.

    Steps: resolve the output directory, write the sample dataset, load the
    quantized model and tokenizer, attach LoRA adapters, tokenize the data,
    train for one epoch and save the result to ``<output_dir>/final_model``.
    """
    output_dir = get_output_dir()
    create_sample_dataset()

    base_model, tokenizer = load_model_and_tokenizer()
    lora_model = prepare_model_for_training(base_model)
    train_dataset = prepare_dataset(tokenizer, "sample_data.json")

    # Hyper-parameters for the Trainer, collected in one place.
    hyperparameters = {
        "output_dir": output_dir,
        "num_train_epochs": 1,
        "per_device_train_batch_size": 2,
        "gradient_accumulation_steps": 4,
        "learning_rate": 1e-4,
        "fp16": True,
        "logging_steps": 10,
        "save_steps": 50,
        "warmup_steps": 10,
        "save_total_limit": 2,
    }
    training_args = TrainingArguments(**hyperparameters)

    # mlm=False selects the causal (non-masked) language-modelling collator mode.
    collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

    trainer = Trainer(
        model=lora_model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=collator,
    )
    trainer.train()

    final_model_dir = os.path.join(output_dir, "final_model")
    trainer.save_model(final_model_dir)

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

Writing train_colab.py


## 4. 运行训练

In [3]:
!python train_colab.py


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):
  File "/content/train_colab.py", line 2, in <module>
    import torch
  File "/usr/local/lib/python3.11/dist-packages/torch/__init__.py", line 1382, in <module>
    from .functional import *  # noqa: F403
  File "/usr/local/lib/python3.11/dist-packages/torch/functional.py", line 7, in <module>
    import torch.nn.functional as F
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/__init__.py", line 1, in <module>
    from .modules import *  # noqa: F403
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules

## 5. 测试微调后的模型

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

def load_and_test_model(model_path="/content/drive/MyDrive/model_training/final_model"):
    """Load the base model in 8-bit, attach the fine-tuned adapter and print a sample answer.

    Args:
        model_path: Directory that contains the saved adapter. Defaults to the
            Google Drive location. Pass the local ``model_training/final_model``
            path when training fell back to the working directory.
    """
    base_model = AutoModelForCausalLM.from_pretrained(
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        trust_remote_code=True,
        load_in_8bit=True,
        device_map="auto"
    )
    tokenizer = AutoTokenizer.from_pretrained(
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        trust_remote_code=True
    )

    model = PeftModel.from_pretrained(base_model, model_path)

    def generate_response(prompt):
        """Generate up to 256 tokens (prompt included) for *prompt* and decode them."""
        # Follow the device chosen by device_map="auto" instead of assuming "cuda".
        inputs = tokenizer(prompt, return_tensors="pt").to(base_model.device)
        # temperature only takes effect when sampling is enabled.
        outputs = model.generate(
            **inputs,
            max_length=256,
            do_sample=True,
            temperature=0.7,
        )
        return tokenizer.decode(outputs[0], skip_special_tokens=True)

    test_prompt = "解释什么是机器学习"
    response = generate_response(test_prompt)
    print(f"问题：{test_prompt}")
    print(f"回答：{response}")

# Run the smoke test as soon as the cell is executed.
load_and_test_model()