In [1]:
import os

from transformers import AutoTokenizer, AutoModelForCausalLM

# Hub id of the base checkpoint. Defined once so the tokenizer and the model
# are always loaded from the same repository.
MODEL_ID = "Qwen/Qwen3-0.6B"

# Directory used as the download cache for the checkpoint files.
# The default is the original machine-specific location; set the
# MODEL_CACHE_DIR environment variable to redirect it on another machine
# without editing the notebook.
cache_path = os.environ.get("MODEL_CACHE_DIR", r"D:\TrainedModel")

# Both objects are fetched (or read from the cache) under `cache_path`.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, cache_dir=cache_path)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, cache_dir=cache_path)

In [2]:
import torch
# Turn off the cuDNN backend for scaled-dot-product attention.
# NOTE(review): the reason is not recorded in this notebook — presumably a
# workaround for a cuDNN SDPA problem on this machine; confirm before removing.
torch.backends.cuda.enable_cudnn_sdp(False)


In [3]:
from datasets import load_dataset
# Load the "samhog/psychology-10k" dataset. No split is requested here, and
# later cells index the result by split name (ds["train"]).
ds = load_dataset("samhog/psychology-10k")

In [4]:
def _to_prompt_completion(example):
    """Convert one raw record into a prompt/completion pair.

    Args:
        example: a mapping with at least the keys "input" and "output".

    Returns:
        A dict with "prompt" (taken from "input") and "completion"
        (taken from "output"). The "instruction" field is not used.
    """
    return {"prompt": example["input"], "completion": example["output"]}


# The conversion overwrites `ds`, so running it a second time would look up
# columns that were already removed. Skip it when the train split already has
# the converted columns, which makes the cell safe to re-run.
_already_converted = {"prompt", "completion"} <= set(ds["train"].column_names)
if not _already_converted:
    ds = ds.map(
        _to_prompt_completion,
        # Drop all raw columns, including "instruction", after conversion.
        remove_columns=["instruction", "input", "output"],
    )

In [5]:
# Number of examples in the train split (displayed as the cell output).
ds["train"].num_rows

9846

In [6]:
# Spot-check one converted record (index 9844, near the end of the train split)
# to confirm it now carries the "prompt" and "completion" fields.
ds["train"][9844]

{'prompt': "I'm struggling with addiction and don't know where to turn.",
 'completion': "Taking the first step toward recovery is a brave and important decision. It's important to seek professional help and support to overcome addiction. Would you be willing to explore different treatment options and develop a plan for a healthy and sustainable recovery?"}

In [7]:
# Make sure a padding token is defined: when the tokenizer has none, reuse the
# end-of-sequence token id as the pad id. A tokenizer that already has a pad
# token is left unchanged.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

In [8]:
# LoRA/PEFT setup, currently disabled: it is wrapped in a bare triple-quoted
# string, which Python evaluates and discards, so `model` is NOT wrapped by
# get_peft_model and stays the plain model loaded earlier.
'''from peft import LoraConfig, get_peft_model, TaskType

config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    inference_mode=False, 
    r=16, 
    lora_alpha=16, 
    lora_dropout=0.1
)
model = get_peft_model(model, config)'''
# Switch the model to training mode. As the last expression in the cell, the
# returned module is also echoed as the cell output.
model.train()  

Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 1024)
    (layers): ModuleList(
      (0-27): 28 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (up_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (down_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen3RMSNorm((1024,), eps=1e-06)
        (post_attention_layernorm): Qwe

In [9]:
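# Disabled together with the LoRA setup in the previous cell: print_trainable_parameters()
# is presumably the PEFT-wrapper method, so re-enable it only when get_peft_model is used.
#model.print_trainable_parameters()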

In [10]:
from transformers import TrainingArguments
from trl import SFTTrainer, SFTConfig

# Hyper-parameters for the supervised fine-tuning run, grouped by concern.
training_args = SFTConfig(
    # --- optimisation ---
    learning_rate=3e-5,
    num_train_epochs=2,
    per_device_train_batch_size=6,
    gradient_accumulation_steps=3,  # 6 x 3 = 18 examples per optimizer step per device
    # --- precision / memory ---
    fp16=True,
    gradient_checkpointing=True,
    # --- logging and checkpoints ---
    output_dir="./SFTcheckpoints",
    logging_steps=5,
    save_steps=5,  # NOTE(review): a checkpoint every 5 steps is very frequent — confirm intended
    save_total_limit=2,
    # completion_only_loss=True,  # compute the loss on the completion part only (original note: this is already the default)
)

# Gradient checkpointing trades compute for memory: intermediate activations
# are not kept during the forward pass and are recomputed during the backward
# pass. The original note estimated 20%-40% activation-memory savings
# (not verified here).
# NOTE(review): likely redundant with gradient_checkpointing=True above — confirm.
model.gradient_checkpointing_enable()

In [11]:
# Assemble the trainer from the model, the converted train split and the
# SFT configuration defined in the earlier cells.
trainer = SFTTrainer(
    args=training_args,
    model=model,
    # Original note: with trl>=0.16.0 this argument replaces the older
    # `tokenizer` parameter.
    processing_class=tokenizer,
    train_dataset=ds["train"],
    # peft_config=config,  # disabled along with the LoRA setup
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [12]:
# Run the fine-tuning loop configured above.
trainer.train()

# Write the tuned weights and the tokenizer files into the same directory.
# The tokenizer call stays last so its return value (the tuple of written
# file paths) is shown as the cell output.
export_dir = "./SFTuned"
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
5,2.0259
10,1.3945
15,1.3172
20,1.2111
25,1.2581
30,1.1643
35,1.1481
40,1.1668
45,1.1795
50,1.0886


('./SFTuned\\tokenizer_config.json',
 './SFTuned\\special_tokens_map.json',
 './SFTuned\\vocab.json',
 './SFTuned\\merges.txt',
 './SFTuned\\added_tokens.json',
 './SFTuned\\tokenizer.json')