<a href="https://colab.research.google.com/github/adenslee/mnist_colab/blob/main/colab_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DeepSeek-R1-Distill-Qwen-1.5B 模型微调实验

本笔记本将指导您完成模型微调的整个过程。

## 1. 环境准备
首先安装必要的依赖包。

In [9]:
# 安装依赖
!pip install -U torch==2.1.2
!pip install -U transformers>=4.37.2
!pip install -U accelerate>=0.27.0
!pip install -U peft>=0.7.0
!pip install -U bitsandbytes>=0.41.0
!pip install -U datasets>=2.16.0
!pip install -U deepspeed>=0.12.0
!pip install -U sentencepiece>=0.1.99
!pip install -U wandb>=0.15.0
!pip install -U trl>=0.7.10
!pip install bitsandbytes

# 验证安装
import torch
import transformers
import accelerate
import peft
import bitsandbytes
import datasets

print(f"PyTorch version: {torch.__version__}")
print(f"Transformers version: {transformers.__version__}")
print(f"Accelerate version: {accelerate.__version__}")
print(f"PEFT version: {peft.__version__}")
print(f"Bitsandbytes version: {bitsandbytes.__version__}")
print(f"Datasets version: {datasets.__version__}")

# 检查 GPU 可用性
print(f"\nGPU available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU model: {torch.cuda.get_device_name(0)}")
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")

Collecting torch==2.1.2
  Downloading torch-2.1.2-cp311-cp311-manylinux1_x86_64.whl.metadata (25 kB)
Collecting triton==2.1.0 (from torch==2.1.2)
  Using cached triton-2.1.0-0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.3 kB)
Downloading torch-2.1.2-cp311-cp311-manylinux1_x86_64.whl (670.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m670.2/670.2 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hUsing cached triton-2.1.0-0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (89.2 MB)
Installing collected packages: triton, torch
  Attempting uninstall: triton
    Found existing installation: triton 2.0.0
    Uninstalling triton-2.0.0:
      Successfully uninstalled triton-2.0.0
  Attempting uninstall: torch
    Found existing installation: torch 2.0.1+cu118
    Uninstalling torch-2.0.1+cu118:
      Successfully uninstalled torch-2.0.1+cu118
[31mERROR: pip's dependency resolver does not currently take into account all the packages 

RuntimeError: Failed to import transformers.models.auto.modeling_auto because of the following error (look up to see its traceback):
Failed to import transformers.generation.utils because of the following error (look up to see its traceback):
numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

## 2. 检查 GPU 环境

In [None]:
import torch
# Query CUDA availability once and report it together with the device name.
has_gpu = torch.cuda.is_available()
device_name = torch.cuda.get_device_name(0) if has_gpu else "No GPU"
print("GPU available:", has_gpu)
print("GPU device name:", device_name)

## 3. 创建训练代码

In [17]:

%%writefile train_colab.py
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig,
    AutoConfig
)
from peft import (
    LoraConfig,
    get_peft_model,
    TaskType
)
import os
import json

def get_output_dir():
    """Return the directory for training artifacts, creating it if needed.

    When the ``google.colab`` package is importable, Google Drive is mounted
    at ``/content/drive`` and ``/content/drive/MyDrive/model_training`` is
    used. If the package is missing, or mounting fails for any ordinary
    reason, the function falls back to ``<cwd>/model_training``.

    Returns:
        str: Path of the (now existing) output directory.
    """
    local_dir = os.path.join(os.getcwd(), 'model_training')

    try:
        from google.colab import drive
    except ImportError:
        # Not running inside Colab: keep everything on the local filesystem.
        output_dir = local_dir
    else:
        try:
            drive.mount('/content/drive')
            output_dir = '/content/drive/MyDrive/model_training'
        except Exception as exc:
            # Best-effort mount: catch ordinary errors only, so Ctrl-C and
            # SystemExit still propagate, and say why the fallback is used.
            print(f"Google Drive 挂载失败，改用本地目录: {exc}")
            output_dir = local_dir

    os.makedirs(output_dir, exist_ok=True)
    return output_dir

def create_sample_dataset():
    """Write a three-record demo instruction dataset to ``sample_data.json``.

    Each record has the keys ``instruction``, ``input`` and ``output``. The
    file is written to the current working directory as UTF-8 JSON with
    non-ASCII characters kept verbatim and a two-space indent.
    """
    field_names = ("instruction", "input", "output")
    rows = (
        (
            "解释什么是机器学习",
            "",
            "机器学习是人工智能的一个子领域，它使计算机系统能够通过经验自动改进...",
        ),
        (
            "写一个简单的Python函数",
            "计算两个数的和",
            "def add_numbers(a, b):\n    return a + b",
        ),
        (
            "总结以下文本的主要内容",
            "人工智能是计算机科学的一个重要分支...",
            "这段文本主要讨论了人工智能的概念和应用...",
        ),
    )
    samples = [dict(zip(field_names, row)) for row in rows]

    serialized = json.dumps(samples, ensure_ascii=False, indent=2)
    with open('sample_data.json', 'w', encoding='utf-8') as out_file:
        out_file.write(serialized)

def load_model_and_tokenizer():
    """Load the 4-bit quantized base model and its tokenizer.

    Returns:
        tuple: ``(model, tokenizer)`` for
        ``deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B``. The tokenizer is
        guaranteed to have a pad token id (the EOS id is used when none is
        defined).
    """
    model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

    # 4-bit NF4 quantization with double quantization and fp16 compute.
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4"
    )

    # Fetch the checkpoint's configuration first so it can be adjusted.
    config = AutoConfig.from_pretrained(
        model_name,
        trust_remote_code=True
    )

    # model_type is deliberately left as loaded from the checkpoint, so code
    # that inspects it sees the architecture the weights were published for.
    config.torch_dtype = torch.float16
    config.use_cache = True

    # Load the quantized model; device placement is left to "auto".
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        config=config,
        trust_remote_code=True,
        device_map="auto",
        quantization_config=quantization_config
    )

    # Load the tokenizer with right-side padding.
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        trust_remote_code=True,
        padding_side="right",
        model_max_length=2048
    )

    # Compare against None explicitly: 0 is a legitimate token id and must not
    # be mistaken for "no pad token".
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    return model, tokenizer

def prepare_model_for_training(model):
    """Wrap ``model`` with LoRA adapters and return the resulting PEFT model.

    The adapter uses rank 4, alpha 16 and dropout 0.1, targets the
    ``q_proj``/``k_proj``/``v_proj``/``o_proj`` modules, and trains no bias
    terms.
    """
    adapter_settings = {
        "task_type": TaskType.CAUSAL_LM,
        "r": 4,
        "lora_alpha": 16,
        "lora_dropout": 0.1,
        "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"],
        "bias": "none",
        "inference_mode": False,
    }
    return get_peft_model(model, LoraConfig(**adapter_settings))

def prepare_dataset(tokenizer, data_path):
    """Build the tokenized training split from a JSON instruction file.

    Args:
        tokenizer: Tokenizer used to encode the prompt + answer texts.
        data_path: Path of a JSON file whose records have ``instruction``,
            ``input`` and ``output`` fields.

    Returns:
        The mapped ``train`` split: the original columns are removed and
        replaced by the tokenizer's outputs plus a ``labels`` column.
    """
    raw_splits = load_dataset("json", data_files={"train": data_path})
    train_split = raw_splits["train"]

    def preprocess_function(examples):
        """Encode one batch of records into fixed-length inputs with labels."""
        texts = []
        rows = zip(examples["instruction"], examples["input"], examples["output"])
        for instruction, input_text, output_text in rows:
            # The "Input:" line is emitted only when the record has an input.
            prompt = (
                f"Instruction: {instruction}\nInput: {input_text}\nOutput: "
                if input_text
                else f"Instruction: {instruction}\nOutput: "
            )
            texts.append(prompt + output_text)

        # Truncate or pad every sample to exactly 256 tokens.
        batch = tokenizer(
            texts,
            truncation=True,
            max_length=256,
            padding="max_length",
            return_tensors="pt"
        )

        # Labels start out as a copy of the input ids.
        batch["labels"] = batch["input_ids"].clone()
        return batch

    return train_split.map(
        preprocess_function,
        batched=True,
        remove_columns=train_split.column_names,
        batch_size=4,
    )

def main():
    """Run the whole fine-tuning pipeline and return ``(model, tokenizer)``.

    Steps: resolve the output directory, write the sample dataset, load the
    quantized model and tokenizer, attach LoRA adapters, tokenize the data,
    train for one epoch, then save the adapter, tokenizer and training
    arguments under ``<output_dir>/final_model``.
    """
    output_dir = get_output_dir()
    create_sample_dataset()

    base_model, tokenizer = load_model_and_tokenizer()
    model = prepare_model_for_training(base_model)
    train_dataset = prepare_dataset(tokenizer, "sample_data.json")

    # Training hyper-parameters, grouped by concern.
    schedule_options = dict(
        num_train_epochs=1,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        learning_rate=1e-4,
        warmup_steps=10,
        fp16=True,
    )
    checkpoint_options = dict(
        logging_steps=10,
        save_steps=50,
        save_total_limit=2,
        save_safetensors=True,
        push_to_hub=False,          # never upload to the Hub
        overwrite_output_dir=True,  # reuse an existing output directory
    )
    training_args = TrainingArguments(
        output_dir=output_dir,
        **schedule_options,
        **checkpoint_options,
    )

    collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=collator,
    )
    trainer.train()

    # Everything needed for inference goes into one "final_model" directory.
    peft_model_path = os.path.join(output_dir, "final_model")
    os.makedirs(peft_model_path, exist_ok=True)
    print(f"\n保存模型到: {peft_model_path}")

    model.save_pretrained(peft_model_path)      # LoRA adapter and its config
    tokenizer.save_pretrained(peft_model_path)  # tokenizer files
    args_file = os.path.join(peft_model_path, "training_args.json")
    training_args.save_to_json(args_file)       # training arguments

    print("模型、分词器和配置已保存完成")

    return model, tokenizer

if __name__ == "__main__":
    # Script entry point: print the failure reason, then re-raise so the full
    # traceback is still shown.
    try:
        model, tokenizer = main()
    except Exception as err:
        failure_message = f"训练过程中出现错误: {str(err)}"
        print(failure_message)
        raise

Overwriting train_colab.py


## 4. 运行训练

In [18]:
# Run the train_colab.py script written by the %%writefile cell in a separate Python process.
!python train_colab.py

2025-04-06 15:50:31.744339: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743954631.768497    7738 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743954631.776718    7738 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
Generating train split: 3 examples [00:00, 16.58 examples/s]
Map: 100% 3/3 [00:00<00:00, 53.21 examples/s]
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty labe

## 5. 测试微调后的模型

In [21]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel, PeftConfig
import os

def load_and_test_model(model_path="/content/model_training/final_model"):
    """Load the fine-tuned LoRA model and print answers to a few test prompts.

    Args:
        model_path: Directory containing the saved adapter
            (``adapter_config.json``) and tokenizer (``tokenizer.json``).
            Defaults to the location used when training ran from
            ``/content``.

    Raises:
        ValueError: If ``model_path`` or one of the required files is missing.
    """
    print("开始加载模型...")

    # Validate the model directory before any expensive loading starts.
    if not os.path.exists(model_path):
        raise ValueError(f"模型路径不存在: {model_path}")

    required_files = ["adapter_config.json", "tokenizer.json"]
    for file_name in required_files:
        file_path = os.path.join(model_path, file_name)
        if not os.path.exists(file_path):
            raise ValueError(f"缺少必要文件: {file_path}")

    # 4-bit NF4 quantization with double quantization and fp16 compute.
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4"
    )

    # The adapter config records which base model the adapter belongs to.
    print("加载PEFT配置...")
    peft_config = PeftConfig.from_pretrained(model_path)

    print("加载基础模型...")
    base_model = AutoModelForCausalLM.from_pretrained(
        peft_config.base_model_name_or_path,
        trust_remote_code=True,
        device_map="auto",
        quantization_config=quantization_config
    )

    # Use the tokenizer that was saved next to the adapter.
    print("加载分词器...")
    tokenizer = AutoTokenizer.from_pretrained(
        model_path,
        trust_remote_code=True,
        padding_side="right",
        model_max_length=2048
    )

    # Compare against None explicitly: 0 is a legitimate token id and must not
    # be mistaken for "no pad token".
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    # Attach the trained LoRA weights to the base model.
    print("加载LoRA权重...")
    model = PeftModel.from_pretrained(
        base_model,
        model_path,
        device_map="auto",
        torch_dtype=torch.float16
    )
    model.eval()

    def generate_response(prompt):
        """Return the model's sampled continuation for ``prompt``, prompt removed."""
        formatted_prompt = f"Instruction: {prompt}\nOutput: "
        print(f"\n生成的提示：{formatted_prompt}")

        inputs = tokenizer(formatted_prompt, return_tensors="pt")
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        prompt_length = inputs["input_ids"].shape[1]

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=512,
                temperature=0.7,
                top_p=0.9,
                repetition_penalty=1.1,
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        # The returned sequence begins with the prompt tokens. Slice them off
        # by position rather than by string replacement, which depends on an
        # exact decode round-trip and would also delete any repetition of the
        # prompt inside the answer.
        generated_tokens = outputs[0][prompt_length:]
        response = tokenizer.decode(generated_tokens, skip_special_tokens=True)
        return response.strip()

    test_prompts = [
        "解释什么是机器学习",
        "写一个简单的Python函数来计算两个数的和",
        "总结一下：人工智能是计算机科学的一个重要分支..."
    ]

    print("\n开始测试生成...")
    for prompt in test_prompts:
        print("\n" + "="*50)
        print(f"问题：{prompt}")
        # A failure on one prompt is reported and does not stop the others.
        try:
            response = generate_response(prompt)
            print(f"回答：{response}")
        except Exception as e:
            print(f"生成回答时出错：{str(e)}")

if __name__ == "__main__":
    # Cell entry point: print the failure reason, then re-raise so the full
    # traceback is still shown.
    try:
        load_and_test_model()
    except Exception as err:
        failure_message = f"运行过程中出现错误：{str(err)}"
        print(failure_message)
        raise

开始加载模型...
加载PEFT配置...
加载基础模型...
加载分词器...
加载LoRA权重...

开始测试生成...

问题：解释什么是机器学习

生成的提示：Instruction: 解释什么是机器学习
Output: 
回答：请详细解释一下什么是机器学习。

I need to explain what machine learning is.
Alright, let's start by defining it. Machine learning is a subset of artificial intelligence that enables systems to learn and improve from experience without explicit programming. It involves algorithms that can perform tasks, such as classification, regression, clustering, and dimensionality reduction, based on data.

Another important aspect is the training process. In machine learning, models are trained on datasets, which may contain labeled or unlabeled data. Labeled data helps the model understand patterns and make predictions, while unlabeled data allows the model to find hidden structures within the dataset.

Model evaluation is another crucial element. After training, models must be assessed to measure their performance using metrics like accuracy, precision, recall, F1-score, and area under the RO