In [15]:
from train import load_dataset, build_train_arguments,build_trainer

from train import build_lora_config
%run train.py

In [16]:
# Training split (JSONL), relative to the notebook's working directory.
# NOTE(review): the file is named "test0819" although it is used as the training set — confirm this is intended.
train_data_path = './datasets/train_test/test0819.jsonl'
# Evaluation split (JSONL).
eval_data_path = './datasets/train_test/eval0819.jsonl'
# Local directory of the base model; raw string so the Windows backslashes are not read as escapes.
model_path = r"D:\Pretrained_models\Qwen\Qwen2-1___5B-Instruct"
# Directory handed to build_train_arguments; checkpoints are written beneath it.
output_path = './Qwen2-1___5B-Instruct_ft_0213_15_46'

In [17]:
import torch

In [18]:
# Prefer the GPU when PyTorch reports CUDA support; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# 加载模型和数据集
由于QLoRA需要以量化的方式来加载模型, 所以加载模型的方法需要作调整,这里的改动是引入了BitsAndBytesConfig类构建一个量化配置quantization_config

具体配置释义:
- load_in_4bit: 决定了模型参数以4位量化格式加载,加载后的模型参数占用空间会比较小
- bnb_4bit_compute_dtype=bfloat16: 决定了矩阵乘法的计算精度使用bfloat16, 输入数据也会被转换为bfloat16进行计算
- bnb_4bit_quant_type: 指定量化数据类型nf4
- bnb_4bit_use_double_quant: 是否启用双重量化

In [32]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig


def load_model(model_path, device='cuda'):
    """Load the tokenizer and a 4-bit quantized causal LM for QLoRA fine-tuning.

    Args:
        model_path: Path (or identifier) passed unchanged to both
            ``AutoTokenizer.from_pretrained`` and
            ``AutoModelForCausalLM.from_pretrained``.
        device: Target passed to ``model.to(...)`` before returning.
            Defaults to ``'cuda'``.

    Returns:
        A ``(model, tokenizer)`` tuple; the model has had
        ``enable_input_require_grads()`` called on it and has been moved
        to ``device``.
    """
    tok = AutoTokenizer.from_pretrained(
        model_path,
        use_fast=False,
        trust_remote_code=True,
    )

    # Quantization settings: weights are loaded as 4-bit NF4 without double
    # quantization, while computation runs in bfloat16.
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=False,
        bnb_4bit_quant_type='nf4',
    )

    quantized = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        quantization_config=quant_config,
    )
    quantized.enable_input_require_grads()

    # NOTE(review): some transformers/bitsandbytes version combinations reject
    # `.to()` on bitsandbytes-quantized models — confirm against the pinned
    # versions before upgrading or downgrading either library.
    placed = quantized.to(device)
    return placed, tok

> 注: 普通的量化通常是将数值分成均匀的区间, 比如将0到1之间的数值分成16个区间, 每个区间的宽度相同; 而nf4则根据数据的分布情况, 使用不均匀的区间来表示数值, 这样可以更有效地表示模型中的重要数值, 特别是那些频繁出现的数值
>
> 注: 双重量化指的是在已经量化的基础上再进行一次量化: 第二次量化的对象是第一次量化时产生的量化常数(例如每个分块的缩放因子), 它并不会改变权重本身的位数, 目的是通过更紧凑地表示这些数值, 使得存储更加高效. 由于每一次量化都会带来误差, 所以双重量化可能会带来额外的误差, 一般用于内存(显存)比较受限的情况下
>
> 注: 模型参数加载用4位而计算用16位,是为了减少因为量化所带来的误差影响

In [33]:
%%time
model, tokenizer = load_model(model_path)

loading file vocab.json
loading file merges.txt
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading file tokenizer.json
loading file chat_template.jinja
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
loading configuration file D:\Pretrained_models\Qwen\Qwen2-1___5B-Instruct\config.json
Model config Qwen2Config {
  "_name_or_path": "D:\\Pretrained_models\\Qwen\\Qwen2-1___5B-Instruct",
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "hidden_act": "silu",
  "hidden_size": 1536,
  "initializer_range": 0.02,
  "intermediate_size": 8960,
  "max_position_embeddings": 32768,
  "max_window_layers": 28,
  "model_type": "qwen2",
  "num_attention_heads": 12,
  "num_hidden_layers": 28,
  "num_key_value_heads": 2,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000.0,
 

CPU times: total: 328 ms
Wall time: 1.55 s


加载数据集

In [34]:
# Build the training and evaluation datasets from the two JSONL paths, using the
# tokenizer returned by load_model (load_dataset is the helper imported from train.py).
train_dataset, eval_dataset = load_dataset(train_data_path, eval_data_path, tokenizer)

Map: 100%|██████████| 2349/2349 [00:07<00:00, 303.33 examples/s]
Map: 100%|██████████| 2348/2348 [00:07<00:00, 317.68 examples/s]


## 构建训练参数
引入分页内存优化器来优化训练过程中的内存分配

In [35]:
# Start from the project's training arguments (helper from train.py), writing to output_path.
train_args = build_train_arguments(output_path)
# Switch to the paged 32-bit AdamW optimizer to ease memory allocation during training.
train_args.optim = "paged_adamw_32bit"

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [36]:
# Start from the project's LoRA configuration (helper from train.py) and
# override the hyperparameters used for this QLoRA run.
lora_config = build_lora_config()
# Dropout applied on the LoRA branch.
lora_config.lora_dropout=0.2
# LoRA rank.
lora_config.r = 16
# LoRA alpha (here twice the rank).
lora_config.lora_alpha=32

In [37]:
# Assemble the trainer from the quantized model, tokenizer, training arguments,
# LoRA config and both datasets (build_trainer is the helper imported from train.py).
trainer = build_trainer(model, tokenizer, train_args, lora_config, train_dataset, eval_dataset)
# Start fine-tuning; the captured log shows checkpoints being written under output_path.
trainer.train()

Currently training with a batch size of: 4
***** Running training *****
  Num examples = 2,349
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 4
  Total optimization steps = 438
  Number of trainable parameters = 18,464,768


trainable params: 18,464,768 || all params: 1,562,179,072 || trainable%: 1.1820


Step,Training Loss,Validation Loss
100,0.0279,0.025682



***** Running Evaluation *****
  Num examples = 2348
  Batch size = 8
Saving model checkpoint to ./Qwen2-1___5B-Instruct_ft_0213_15_46\checkpoint-100
loading configuration file D:\Pretrained_models\Qwen\Qwen2-1___5B-Instruct\config.json
Model config Qwen2Config {
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "hidden_act": "silu",
  "hidden_size": 1536,
  "initializer_range": 0.02,
  "intermediate_size": 8960,
  "max_position_embeddings": 32768,
  "max_window_layers": 28,
  "model_type": "qwen2",
  "num_attention_heads": 12,
  "num_hidden_layers": 28,
  "num_key_value_heads": 2,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000.0,
  "sliding_window": null,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.48.3",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": 151936
}



KeyboardInterrupt: 