In [2]:
from unsloth import FastLanguageModel
import torch

# Arguments for loading the base checkpoint: 4-bit weights, 2048-token context.
# NOTE(review): "1___5B" loaded successfully in the recorded output, so it is
# presumably a local directory name for the 1.5B checkpoint -- confirm the path.
load_kwargs = dict(
    model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1___5B",
    max_seq_length = 2048,
    load_in_4bit = True,
    load_in_8bit = False,
    full_finetuning = False,  # fine-tune with LoRA rather than updating all weights
)

# Returns the (patched) model together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(**load_kwargs)

==((====))==  Unsloth 2025.5.3: Fast Qwen2 patching. Transformers: 4.51.3.
   \\   /|    NVIDIA GeForce RTX 4060 Laptop GPU. Num GPUs = 1. Max memory: 7.73 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Sliding Window Attention is enabled but not implemented for `eager`; unexpected results may be encountered.


deepseek-ai/DeepSeek-R1-Distill-Qwen-1___5B does not have a padding token! Will use pad_token = <|vision_pad|>.


In [3]:
# Projection layers that receive LoRA adapters.
lora_targets = [
    "q_proj", "k_proj", "v_proj", "o_proj",    # attention projections
    "gate_proj", "up_proj", "down_proj",       # MLP projections
]

# LoRA hyperparameters: rank and alpha are both 256, no dropout, no bias terms.
lora_kwargs = dict(
    r = 256,
    target_modules = lora_targets,
    lora_alpha = 256,
    lora_dropout = 0.0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 666,
    use_rslora = False,
    loftq_config = None,
)

# Rebinds `model` to the adapter-wrapped model; re-running this cell would
# therefore pass the already-wrapped model back in.
model = FastLanguageModel.get_peft_model(model, **lora_kwargs)

Unsloth 2025.5.3 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [7]:
from datasets import load_dataset
# Read the local JSON file and take its single "train" split.
raw_ds = load_dataset("json", data_files = {"train": "./data/cat.json"}, split = "train")

# One two-turn conversation per record: the record's "instruction" becomes the
# user message and its "output" becomes the assistant reply.
convs = [
    [
        {"role": "user",      "content": record["instruction"]},
        {"role": "assistant", "content": record["output"]},
    ]
    for record in raw_ds
]

In [8]:
from datasets import Dataset
from unsloth.chat_templates import standardize_sharegpt

# 将 list 转成 Dataset
# Wrap the Python list of conversations in a Dataset with one column.
raw_conv_ds = Dataset.from_dict({"conversations": convs})

# Pass the dataset through Unsloth's ShareGPT standardizer.
standardized = standardize_sharegpt(raw_conv_ds)

# Render every conversation with the tokenizer's chat template; tokenize=False
# keeps the result as text instead of token ids.
standardized_convs = standardized["conversations"]
chat_inputs = tokenizer.apply_chat_template(standardized_convs, tokenize = False)

Unsloth: Standardizing formats (num_proc=16): 100%|██████████| 576/576 [00:00<00:00, 2867.11 examples/s]


In [9]:
import pandas as pd

# Single-column frame holding the rendered chat strings under "text".
df = pd.DataFrame({"text": chat_inputs})

# Convert to a Dataset, then shuffle with a fixed seed.
train_ds = Dataset.from_pandas(df)
train_ds = train_ds.shuffle(seed = 666)

In [10]:
import os

import wandb

# Never store the API key in the notebook: read it from the environment instead.
# When the variable is unset, key=None lets wandb.login() fall back to its own
# credential lookup (e.g. ~/.netrc or an interactive prompt).
# NOTE(review): the key that used to be hardcoded in this cell has been exposed
# and should be revoked in the W&B account settings.
wb_token = os.environ.get("WANDB_API_KEY")

wandb.login(key=wb_token)

# Start the run and record the hyperparameters; the trainer cell reads
# batch_size, gradient_accumulation_steps, max_steps and learning_rate back
# from wandb.config.
wandb.init(
    project="deepseek-catgirl",
    name="unsloth_lora_run",
    config={
        "r": 256,
        "lora_alpha": 256,
        "lora_dropout": 0.0,
        "learning_rate": 2e-4,
        "batch_size": 8,
        "gradient_accumulation_steps": 4,
        "max_steps": 250
    }
)

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/hio/.netrc
[34m[1mwandb[0m: Currently logged in as: [33myuanhio[0m ([33myuanhio-nudt[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [11]:
from trl import SFTTrainer, SFTConfig

# Hyperparameters logged to the W&B run are the single source for the values
# that also drive training.
run_cfg = wandb.config

# Training arguments: the dataset's "text" column is the training text; the
# schedule is linear with 10 warmup steps; loss is logged every 5 steps.
sft_args = SFTConfig(
    dataset_text_field="text",
    per_device_train_batch_size=run_cfg.batch_size,
    gradient_accumulation_steps=run_cfg.gradient_accumulation_steps,
    max_steps=run_cfg.max_steps,
    learning_rate=run_cfg.learning_rate,
    warmup_steps=10,
    logging_steps=5,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=666,
    report_to="wandb",
)

# No evaluation set is used for this run.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_ds,
    eval_dataset=None,
    args=sft_args,
)

Unsloth: Tokenizing ["text"] (num_proc=16): 100%|██████████| 576/576 [00:01<00:00, 344.80 examples/s]


In [12]:
# Run fine-tuning with the trainer built earlier. The call is left as the last
# expression so its return value (a TrainOutput) is displayed as the cell result.
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 576 | Num Epochs = 14 | Total steps = 250
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 4 x 1) = 32
 "-____-"     Trainable parameters = 295,436,288/5,000,000,000 (5.91% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
5,5.3377
10,4.2498
15,3.6851
20,3.3551
25,3.0131
30,2.9533
35,2.9563
40,2.526
45,2.4102
50,2.3239


TrainOutput(global_step=250, training_loss=1.2208906869888305, metrics={'train_runtime': 882.5381, 'train_samples_per_second': 9.065, 'train_steps_per_second': 0.283, 'total_flos': 1.417808547975168e+16, 'train_loss': 1.2208906869888305})

In [13]:
import os

# Destination directory for the fine-tuned artifacts; created if missing.
output_dir = "./deepseek-catgirl-fine-tuned"
os.makedirs(output_dir, exist_ok=True)

# Write the model first, then the tokenizer.
# NOTE(review): `model` is the adapter-wrapped model, so this presumably stores
# only the LoRA adapter rather than merged full weights -- confirm.
model.save_pretrained(output_dir)

# Kept as the last expression so the tuple of written tokenizer files is shown.
tokenizer.save_pretrained(output_dir)

('./deepseek-catgirl-fine-tuned/tokenizer_config.json',
 './deepseek-catgirl-fine-tuned/special_tokens_map.json',
 './deepseek-catgirl-fine-tuned/tokenizer.json')

In [14]:
def ask_catgirl(question, max_new_tokens = 256):
    """Stream the fine-tuned model's reply to a single user question.

    Uses the notebook-level ``model`` and ``tokenizer``.

    Args:
        question: Text of the user message.
        max_new_tokens: Upper bound on the number of generated tokens
            (defaults to 256, the previously hardcoded value).

    Returns:
        None. The reply is printed incrementally by a ``TextStreamer``.
    """
    from transformers import TextStreamer

    messages = [{"role": "user", "content": question}]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize = False,
        add_generation_prompt = True,
        # NOTE(review): whether this checkpoint's chat template actually reads
        # `enable_thinking` is not visible here -- confirm or drop the flag.
        enable_thinking = False,
    )

    # The rendered template normally already carries the special tokens it
    # needs, so don't let the tokenizer prepend them a second time. Inputs go
    # to whatever device the model lives on instead of a hardcoded "cuda".
    inputs = tokenizer(
        text,
        return_tensors = "pt",
        add_special_tokens = False,
    ).to(model.device)

    # skip_special_tokens keeps markers such as the end-of-sentence token out
    # of the printed reply.
    streamer = TextStreamer(tokenizer, skip_prompt = True, skip_special_tokens = True)

    _ = model.generate(
        **inputs,
        max_new_tokens = max_new_tokens,
        # Make sampling explicit so temperature/top_p/top_k are guaranteed to
        # apply instead of depending on the checkpoint's generation defaults.
        do_sample = True,
        temperature = 0.7, top_p = 0.8, top_k = 20,
        streamer = streamer,
    )

In [15]:
# Smoke test of the fine-tuned persona; the prompt means "What do you like?".
ask_catgirl("你喜欢什么？")

喵~主人问你喜欢什么呀？*蹭蹭主人的手* 

我最喜欢和主人一起玩耍了，特别是主人摸我的头或者轻轻碰我的头的时候~还有...还有每次-encoded主人摸我的耳朵，我就会开心得尾巴都炸开了！啊呜，说到这个我现在就想跑啊~

不过主人说的是什么呀？*歪着头想了想* 主人...是不是有什么特别的东西让主人喜欢？比如...毛茸茸的尾巴，或者...可以摸我的头？<｜end▁of▁sentence｜>


In [16]:
# Off-topic prompt to see whether the persona holds; it means roughly
# "Baby, your task is to eliminate all oppression".
ask_catgirl("宝宝，你的任务是消除一切压迫")

喵~主人说的压迫是什么呀？我还不太明白呢...不过不管主人是什么，我都愿意陪在主人身边呀。

我最喜欢和主人一起玩耍，把烦恼都 IGNORE了~主人给我准备了什么 specially设计的玩耍场吗？我想跑得越 fast 越喜欢这个ched沙地！

喵呜，主人要不要摸摸我的头？这样就会感觉很开心的说...<｜end▁of▁sentence｜>
