## 初始化

In [None]:
%%capture
# Install the released Unsloth package; this also pulls in its dependencies.
!pip install unsloth
# Then overlay the latest Unsloth sources from GitHub. --no-deps keeps the
# dependencies installed by the previous command untouched.
!pip install --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [None]:
# Upgrade transformers, torch and peft to their newest releases (no versions are pinned,
# so the resolved versions depend on the day the cell is run).
!pip install --upgrade transformers torch peft

Collecting transformers
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.49.0-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m94.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.47.1
    Uninstalling transformers-4.47.1:
      Successfully uninstalled transformers-4.47.1
Successfully installed transformers-4.49.0


In [None]:
# NOTE(review): repeats the `pip install unsloth` from the first cell; it looks
# redundant — confirm before removing.
!pip install unsloth



In [None]:
# FastLanguageModel is the Unsloth class used below to load, adapt and run the model.
from unsloth import FastLanguageModel
import torch

# Maximum sequence length in tokens; passed to the model loader and to the trainer.
max_seq_length = 256

# Model dtype. None leaves the choice to Unsloth, which documents it as auto-detection
# (it is not a float32 default); the load log on this GPU reports "Bfloat16 = TRUE".
dtype = None

# Load the weights in 4-bit precision: lower memory use, at some cost in precision.
load_in_4bit = True

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [None]:
## Kaggle: read the Hugging Face token from Kaggle Secrets.
import os

from huggingface_hub import login

try:
    from kaggle_secrets import UserSecretsClient
except ImportError:
    # Not running on Kaggle. Fall back to the HF_TOKEN environment variable so that
    # `hf_token` is always defined and "Run All" does not stop on this cell.
    hf_token = os.environ.get("HF_TOKEN")
else:
    user_secrets = UserSecretsClient()
    hf_token = user_secrets.get_secret("HUGGINGFACE_TOKEN")

# login(None) would open an interactive prompt, so only log in when a token was found.
if hf_token:
    login(hf_token)

In [None]:
## Colab: read the Hugging Face token from the Colab user secrets.
import os

from huggingface_hub import login

try:
    from google.colab import userdata
except ImportError:
    # Not running on Colab. Keep a token that an earlier cell already stored in
    # `hf_token`; otherwise fall back to the HF_TOKEN environment variable.
    hf_token = globals().get("hf_token") or os.environ.get("HF_TOKEN")
else:
    hf_token = userdata.get('deepseekr1')
    login(hf_token)

In [None]:
## Log in to Weights & Biases.
import os
from getpass import getpass

import wandb

# API keys must never be written into the notebook: anyone who can read the file
# (or its git history) can use them. Read the key from the WANDB_API_KEY environment
# variable and fall back to a hidden interactive prompt.
wandb_api_key = os.environ.get("WANDB_API_KEY") or getpass("W&B API key: ")
# Keep the key in the process environment so later cells do not have to ask again.
os.environ["WANDB_API_KEY"] = wandb_api_key

wandb.login(key=wandb_api_key)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33myao110002[0m ([33myao110002-sdfsdfsdfsdf-com[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
import os
from getpass import getpass

import wandb

# The W&B API key is a secret and must not be hardcoded here. It is read from the
# WANDB_API_KEY environment variable, with a hidden interactive prompt as fallback.
wb_token = os.environ.get("WANDB_API_KEY") or getpass("W&B API key: ")
os.environ["WANDB_API_KEY"] = wb_token

wandb.login(key=wb_token)

# Start the run that collects the training metrics of this notebook.
run = wandb.init(
    project='Fine-tune-DeepSeek-R1-Distill-Llama-8B on Medical COT Dataset',
    job_type="training",
    anonymous="allow"
)

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


## 加载模型和分词器

In [None]:
# Load the base checkpoint and its tokenizer through Unsloth, using the sequence length,
# dtype and 4-bit settings from the configuration cell and the Hugging Face token
# obtained in the login cells.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/DeepSeek-R1-Distill-Llama-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    token = hf_token,
)

==((====))==  Unsloth 2025.2.12: Fast Llama patching. Transformers: 4.49.0.
   \\   /|    GPU: NVIDIA L4. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/236 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/53.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

## Model inference before fine-tuning

In [None]:
# Inference prompt template. It takes two positional fields for str.format():
#   1. the medical question,
#   2. the beginning of the answer (an empty string at inference time, so the
#      rendered prompt ends right after the "### 回答:" header).
# Without these "{}" fields str.format() returns the template unchanged and the
# question never reaches the model.
prompt_style = """以下是任务指令。
请根据请求给出一个合理的回答。

### 指令:
你是一个具备诊断和治疗规划能力的医学专家。
请简要的回答以下医学问题。

### 问题:
{}

### 回答:
{}"""

In [None]:
# Chinese medical question used to probe the base model before fine-tuning.
question = "一位61岁的女性，长期存在在咳嗽或打喷嚏等活动中出现不自主尿液丢失，但夜间无漏尿症状，经妇科检查和Q-tip测试。基于这些发现，膀胱测压最有可能显示其残余容量和逼尿肌收缩的情况如何？"

# Switch the model to Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)  # Unsloth advertises roughly 2x faster inference in this mode

# Render the prompt template with the question and tokenize it onto the GPU.
inputs = tokenizer([prompt_style.format(question, "")], return_tensors="pt").to("cuda")

# Generate up to 1200 new tokens.
outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=1200,
    use_cache=True,
)

# Decode the generated ids and print the text that follows the first "### 回答:" marker.
response = tokenizer.batch_decode(outputs)
print(response[0].split("### 回答:")[1])

In [None]:
# Wrap the loaded model with LoRA adapters; the trainer later reports about 6.8M
# trainable parameters for this configuration.
model = FastLanguageModel.get_peft_model(
    model=model,  # model to fine-tune
    r=8,  # LoRA rank (8 instead of 16 keeps the number of added parameters small)
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        # gate_proj, up_proj and down_proj are deliberately left without adapters
    ],
    lora_alpha=32,  # LoRA scaling factor, raised to compensate for the lower rank
    lora_dropout=0.1,  # dropout on the adapter path to limit overfitting; tune as needed
    bias="none",  # bias terms are left untouched
    use_gradient_checkpointing=True,  # saves GPU memory; "unsloth" is the option for very long contexts
    random_state=3407,  # fixed seed for reproducibility
    use_rslora=True,  # use the rank-stabilized LoRA variant
    # LoftQ initialisation is not enabled (init_lora_weights keeps its default), so no
    # LoftQ configuration is passed. The previous {"scaling": ..., "alpha": ...} dict
    # did not use LoftQ's option names (loftq_bits / loftq_iter) and had no effect.
    loftq_config=None,
)

## Loading and processing the dataset

In [None]:
# Training prompt template. It takes three positional fields for str.format():
#   1. the question,
#   2. the chain of thought (may be an empty string),
#   3. the final answer.
# Without these "{}" fields str.format() returns the template unchanged, so every
# training example would collapse to the same instruction text.
train_prompt_style = """以下是任务指令。
请根据请求给出一个合理的回答。

### 指令:
你是一个具备诊断和治疗规划能力的医学专家。
请简要的回答以下医学问题。

### 问题:
{}

### 回答:
<think>
{}
</think>
{}"""

In [None]:
# End-of-sequence marker of the tokenizer; it is appended to every training text.
EOS_TOKEN = tokenizer.eos_token


def formatting_prompts_func(examples):
    """Render a batch of shibing624/medical ("finetune" config) rows into training texts.

    Args:
        examples: batch mapping with the columns "instruction", "input" and "output",
            each holding one value per row.

    Returns:
        A dict with a single "text" key: one rendered prompt per row, each ending
        with EOS_TOKEN.

    NOTE(review): the dataset-loading cell further down defines a function with the
    same name, and that later definition is the one in effect when `dataset.map` runs.
    """

    def _render(inst, inp, out):
        # A non-blank "input" is appended to the instruction on its own line.
        question = inst if inp.strip() == "" else inst + "\n" + inp
        # No step-by-step reasoning is available for this dataset, so the slot stays empty.
        cot = ""
        return train_prompt_style.format(question, cot, out) + EOS_TOKEN

    rows = zip(examples["instruction"], examples["input"], examples["output"])
    return {"text": [_render(*row) for row in rows]}

In [None]:
from datasets import load_dataset

# Load the full training split of the "finetune" configuration.
dataset = load_dataset("shibing624/medical", "finetune", split="train", trust_remote_code=True)

# Print the column names to confirm which fields are available.
print("数据集的字段：", dataset.column_names)

# Every training text must end with the EOS token; otherwise the model never learns
# where an answer stops and keeps generating until it hits the token limit.
EOS_TOKEN = tokenizer.eos_token


def formatting_prompts_func(examples):
    """Build the "text" column used for supervised fine-tuning.

    Args:
        examples: batch mapping with the columns "instruction", "input" and "output".

    Returns:
        A dict with a single "text" key holding one "问题: ...\\n回答: ..." string per
        row, terminated by the tokenizer's EOS token.
    """
    new_texts = []
    for instruction, extra_input, output in zip(
        examples["instruction"], examples["input"], examples["output"]
    ):
        # The optional "input" column carries additional question context; append it
        # instead of dropping it. Empty or missing values are ignored.
        question = instruction
        if extra_input and extra_input.strip():
            question = f"{instruction}\n{extra_input}"
        new_texts.append(f"问题: {question}\n回答: {output}{EOS_TOKEN}")
    return {"text": new_texts}


# Add the formatted "text" column to the dataset.
dataset = dataset.map(formatting_prompts_func, batched=True)

# Show the first formatted example. Indexing the row first avoids reading the whole
# "text" column (about two million strings) just to display one entry.
print(dataset[0]["text"])

README.md:   0%|          | 0.00/9.14k [00:00<?, ?B/s]

medical.py:   0%|          | 0.00/7.11k [00:00<?, ?B/s]

train_zh_0.json:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

train_en_1.json:   0%|          | 0.00/139M [00:00<?, ?B/s]

valid_zh_0.json:   0%|          | 0.00/307k [00:00<?, ?B/s]

valid_en_1.json:   0%|          | 0.00/609k [00:00<?, ?B/s]

test_zh_0.json:   0%|          | 0.00/298k [00:00<?, ?B/s]

test_en_1.json:   0%|          | 0.00/602k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

数据集的字段： ['instruction', 'input', 'output']


Map:   0%|          | 0/2066589 [00:00<?, ? examples/s]

问题: 血热的临床表现是什么?
回答: 初发或复发病不久。皮疹发展迅速，呈点滴状、钱币状或混合状。常见丘疹、斑丘疹、大小不等的斑片，潮红、鲜红或深红色。散布于体表各处或几处，以躯干、四肢多见，亦可先从头面开始，逐渐发展至全身。新皮疹不断出现，表面覆有银白色鳞屑，干燥易脱落，剥刮后有点状出血。可有同形反应;伴瘙痒、心烦口渴。大便秘结、小便短黄，舌质红赤，苔薄黄或根部黄厚，脉弦滑或滑数。血热炽盛病机，主要表现在如下四个面：一、热象：血热多属阳盛则热之实性、热性病机和病证、并表现出热象。二、血行加速：血得热则行，可使血流加速，且使脉道扩张，络脉充血，故可见面红目赤，舌色深红（即舌绛）等症。三、动血：在血行加速与脉道扩张的基础上，血分有热，可灼伤脉络，引起出血，称为“热迫血妄行”，或称动血。四、扰乱心神：血热炽盛则扰动心神，心主血脉而藏神，血脉与心相通，故血热则使心神不安，而见心烦，或躁扰发狂等症。


In [None]:
# The base model is deliberately NOT reloaded here. Calling
# FastLanguageModel.from_pretrained again would rebind `model` to a fresh base
# checkpoint and drop the LoRA adapters attached by FastLanguageModel.get_peft_model
# earlier in the notebook, so a top-to-bottom run would hand the trainer a model
# without adapters. The recorded training log (model class PeftModelForCausalLM)
# shows that the adapter-wrapped model is the one that must reach the trainer.
if "Peft" not in type(model).__name__:
    raise RuntimeError(
        "`model` has no LoRA adapters attached; run the "
        "FastLanguageModel.get_peft_model cell before building the trainer."
    )

==((====))==  Unsloth 2025.2.12: Fast Llama patching. Transformers: 4.49.0.
   \\   /|    GPU: NVIDIA L4. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# If is_bfloat16_supported misdetects the hardware, its result can be overridden by hand.
# is_bfloat16_supported = False   # if the hardware has no bfloat16 support
# NOTE(review): rebinding the name to False as shown above would break the
# is_bfloat16_supported() calls below; set fp16/bf16 explicitly instead.

# Switch the model back to training mode (it was put into inference mode for the
# generation cell earlier in the notebook).
FastLanguageModel.for_training(model)

# Supervised fine-tuning on the "text" column of the formatted dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    args=TrainingArguments(
        per_device_train_batch_size=8,    # batch size per device (lower it if GPU memory is tight)
        gradient_accumulation_steps=2,      # effective batch size = 8 x 2 = 16
        warmup_steps=100,                  # learning-rate warm-up over the first 100 steps
        max_steps=1000,                    # stop after 1000 optimizer steps
        learning_rate=2e-5,                 # deliberately small learning rate for fine-grained updates
        fp16=not is_bfloat16_supported(),   # fall back to FP16 when BF16 is not available
        bf16=is_bfloat16_supported(),      # use BF16 when the hardware supports it
        logging_steps=100,                  # log the training loss every 100 steps
        optim="adamw_8bit",                 # 8-bit AdamW optimizer
        weight_decay=0.01,                  # weight decay of 0.01
        lr_scheduler_type="linear",         # linear learning-rate decay
        seed=3407,
        output_dir="outputs",
    ),
)

Applying chat template to train dataset (num_proc=2):   0%|          | 0/2066589 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=2):   0%|          | 0/2066589 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=2):   0%|          | 0/2066589 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


## Setting up the model

## Model training

In [None]:
# Sanity check: the formatted "text" column must be present next to the raw columns.
print(dataset.column_names)

['instruction', 'input', 'output', 'text']


In [None]:
# Run the fine-tuning loop configured above and keep the value returned by train().
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 2,066,589 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 2
\        /    Total batch size = 16 | Total steps = 1,000
 "-____-"     Number of trainable parameters = 6,815,744


Step,Training Loss
100,3.121
200,2.6319
300,2.5328
400,2.486
500,2.4635
600,2.4364
700,2.4131
800,2.4243
900,2.4253
1000,2.4403


In [None]:
import wandb

# Finish the W&B run started earlier; the run history and summary are printed below.
wandb.finish()

0,1
train/epoch,▁▂▃▃▄▅▆▆▇██
train/global_step,▁▂▃▃▄▅▆▆▇██
train/grad_norm,▁▃▃█▄▇▇▇▆█
train/learning_rate,█▇▆▆▅▄▃▃▂▁
train/loss,█▃▂▂▁▁▁▁▁▁

0,1
total_flos,1.805190207932989e+17
train/epoch,0.00774
train/global_step,1000.0
train/grad_norm,5.13804
train/learning_rate,0.0
train/loss,2.4403
train_loss,2.53745
train_runtime,4398.8352
train_samples_per_second,3.637
train_steps_per_second,0.227


## Model inference after fine-tuning

In [None]:
# Chinese medical question for the fine-tuned model.
question = "怀孕后嘴巴很淡怎么办？"

# Prompt that asks for the final answer only: no intermediate reasoning, and a
# short, non-repetitive reply.
prompt = (
    f"问题：{question}\n"
    "请直接给出最终答案，不需要显示任何中间思考过程，并确保回答简洁、精炼且不重复。\n"
    "### 回答:"
)

# Training left the model in training mode; switch to Unsloth's inference mode
# before generating.
FastLanguageModel.for_inference(model)

# Tokenize the prompt and move the tensors to the GPU.
inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

# Generate the answer with two safeguards against repetition.
outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=256,           # upper bound on generated tokens; adjust as needed
    no_repeat_ngram_size=3,       # no 3-token sequence may occur twice anywhere in the output
    repetition_penalty=1.2,       # penalize tokens that were already generated
    use_cache=True,
)

# Decode without special tokens (BOS/EOS markers are not part of the answer) and keep
# the text after the last "### 回答:" marker.
response = tokenizer.batch_decode(outputs, skip_special_tokens=True)
final_answer = response[0].split("### 回答:")[-1].strip()

print(final_answer)

1.口腔溃疡或白斑
2.缺乏维生素B2，服用一些补充维生碘的食物，如牛奶、鸡蛋等
3.常吃酸辣刺激性食物，可导致胃病和牙齿龋洞。建议少吃辛辣油炸烹饪食品，以免引起口腻或失去牙齋
4.注意个人卫生清洗口腺多次漱口，保持口腮干燥
5.留意自身体质是否有异常症状如手足麻木、头晕眩悸等，如果有，则可及时就医检查
6.饮食方面应均衡营养，适当减少高糖、高脂肪含量
7.避免长时间咀嚼软骨或硬核糖果，可以防止颌关节炎
8.定期进行全面的口腭健康检查，必要时做相关治疗
9.如果上述方法都无效，最好还是到正规医院看一下吧，这样才能确定是不是因为其他原因造成


In [None]:
# Second Chinese medical question for the fine-tuned model.
question = "感冒了怎么办"

# Prompt that asks for the final answer only: no intermediate reasoning, and a
# short, non-repetitive reply.
prompt = (
    f"问题：{question}\n"
    "请直接给出最终答案，不需要显示任何中间思考过程，并确保回答简洁、精炼且不重复。\n"
    "### 回答:"
)

# Make sure the model is in Unsloth's inference mode (training leaves it in training
# mode); this keeps the cell runnable on its own.
FastLanguageModel.for_inference(model)

# Tokenize the prompt and move the tensors to the GPU.
inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

# Generate the answer with two safeguards against repetition.
outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=256,           # upper bound on generated tokens; adjust as needed
    no_repeat_ngram_size=3,       # no 3-token sequence may occur twice anywhere in the output
    repetition_penalty=1.2,       # penalize tokens that were already generated
    use_cache=True,
)

# Decode without special tokens (BOS/EOS markers are not part of the answer) and keep
# the text after the last "### 回答:" marker.
response = tokenizer.batch_decode(outputs, skip_special_tokens=True)
final_answer = response[0].split("### 回答:")[-1].strip()

print(final_answer)

1.多喝水，补充水分。2.服用抗生素等药物缓解症状，防止细菌感染的扩散和发展3.有必要及时就医治疗，以免病情延误4.注意休息，如果体力气质较差，可在家养护一段时间后到医院救治5.保持心情愉快，可以吃点小食来舒缓一下6.饮食以清淡为主，少吃辛辣刺激性食物7.可以进行一些针灸或者其他疗法8.尽量避免外流汗9.不要熬夜10.增加营养摄入11.定期观察身体情况12.选择合适的医疗机构13.服上热茶或凉汤14.调节睡眠15.平衡工作与生活16.留意个人卫生17.要做好预防措施18.加强锻练19.减轻负担20.口腔消毒21.饮品宜选温开水22.增进肠胃功能23.提高精神状态24.早晚空腹25.常规检查


## Saving the model locally

In [None]:
# NOTE(review): "kingabzpro/..." is not the repository that gets pushed later — the
# next cell reassigns both names before any push_to_hub call, so this cell only
# writes an additional local copy. Confirm whether it is still needed.
new_model_online = "kingabzpro/DeepSeek-R1-Medical-COT"
new_model_local = "DeepSeek-R1-Medical-COT"
model.save_pretrained(new_model_local) # Save the model to a local directory
tokenizer.save_pretrained(new_model_local)


('DeepSeek-R1-Medical-COT/tokenizer_config.json',
 'DeepSeek-R1-Medical-COT/special_tokens_map.json',
 'DeepSeek-R1-Medical-COT/tokenizer.json')

In [None]:
# Hub repository id and local directory name used by the save and push cells below.
new_model_online = "beita6969/deepseek-r1-medical-response"
new_model_local = "deepseek-r1-medical-response"


model.save_pretrained(new_model_local) # Save the model to a local directory
tokenizer.save_pretrained(new_model_local)

('deepseek-r1-medical-response/tokenizer_config.json',
 'deepseek-r1-medical-response/special_tokens_map.json',
 'deepseek-r1-medical-response/tokenizer.json')

## Pushing the model to Hugging Face hub

In [None]:
# Upload the model and the tokenizer to the Hub repository named in `new_model_online`;
# the upload log below lists adapter_model.safetensors (27.3M).
model.push_to_hub(new_model_online) # Upload the model
tokenizer.push_to_hub(new_model_online) # Upload the tokenizer

README.md:   0%|          | 0.00/737 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/27.3M [00:00<?, ?B/s]

Saved model to https://huggingface.co/beita6969/deepseek-r1-medical-response


In [None]:
# Merge the 4-bit base weights and the LoRA adapters into 16-bit weights, first saving
# them locally and then pushing them to the Hub. The log below shows the merge running
# once for each of the two calls.
model.save_pretrained_merged(new_model_local, tokenizer, save_method = "merged_16bit",)
model.push_to_hub_merged(new_model_online, tokenizer, save_method = "merged_16bit")

Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 6.0G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 30.97 out of 52.96 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


 69%|██████▉   | 22/32 [00:00<00:00, 43.72it/s]
We will save to Disk and not RAM now.
100%|██████████| 32/32 [00:06<00:00,  5.22it/s]


Unsloth: Saving tokenizer... Done.
Done.


Unsloth: You are pushing to hub, but you passed your HF username = beita6969.
We shall truncate beita6969/deepseek-r1-medical-response to deepseek-r1-medical-response


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 31.11 out of 52.96 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 32/32 [00:05<00:00,  5.53it/s]


Unsloth: Saving tokenizer...

No files have been modified since last commit. Skipping to prevent empty commit.


 Done.


  0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Done.
Saved merged model to https://huggingface.co/beita6969/deepseek-r1-medical-response
