## 初始化

In [None]:
%%capture
!pip install unsloth
# Also pull the latest Unsloth straight from GitHub, force-reinstalling it without touching its dependencies.
!pip install --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [None]:
# Upgrade transformers, torch and peft to their latest available releases.
!pip install --upgrade transformers torch peft

Collecting transformers
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.49.0-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m120.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.48.3
    Uninstalling transformers-4.48.3:
      Successfully uninstalled transformers-4.48.3
Successfully installed transformers-4.49.0


In [None]:
# NOTE(review): unsloth is already installed by the first cell; presumably this re-run restores
# compatible dependency pins after the upgrade above - confirm whether it is still needed.
!pip install unsloth



In [None]:
# 导入 Unsloth 库中的 FastLanguageModel 类
from unsloth import FastLanguageModel
import torch

# Maximum sequence length, in tokens, handed to the model loader and to the trainer.
max_seq_length = 256

# Model dtype. None leaves the choice to Unsloth instead of forcing a dtype here
# (the loader banner recorded in this notebook reports "Bfloat16 = TRUE", i.e. not float32).
dtype = None

# Load the weights in 4-bit precision: lower memory use and compute, possibly at some cost in accuracy.
load_in_4bit = True

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [None]:
## Kaggle environment: read the Hugging Face token from Kaggle Secrets
from huggingface_hub import login

try:
    from kaggle_secrets import UserSecretsClient
except ImportError:
    # kaggle_secrets only exists on Kaggle. Skip this cell elsewhere so that
    # "Run All" keeps going; the Colab cell is then expected to provide hf_token.
    print("kaggle_secrets is not available - skipping the Kaggle login.")
else:
    user_secrets = UserSecretsClient()

    hf_token = user_secrets.get_secret("HUGGINGFACE_TOKEN")
    login(hf_token)

In [None]:
## Colab environment: read the Hugging Face token from Colab user secrets
from huggingface_hub import login

try:
    from google.colab import userdata
except ImportError:
    # google.colab only exists on Colab. Skip this cell elsewhere so that
    # "Run All" keeps going; the Kaggle cell is then expected to provide hf_token.
    print("google.colab is not available - skipping the Colab login.")
else:
    hf_token = userdata.get('deepseekr1')
    login(hf_token)

In [None]:
## Log in to Weights & Biases
import os
import wandb

# Do not hard-code the API key in the notebook: read it from the WANDB_API_KEY
# environment variable. When the variable is unset, key=None lets wandb fall back
# to its own credential lookup (saved login or interactive prompt).
wandb.login(key=os.environ.get("WANDB_API_KEY"))

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33myao110002[0m ([33myao110002-sdfsdfsdfsdf-com[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
import os
import wandb

# Read the W&B API key from the environment rather than pasting it into the notebook,
# where it would be saved and shared together with the file. None makes wandb fall back
# to its own credential lookup (saved login or interactive prompt).
wb_token = os.environ.get("WANDB_API_KEY")

wandb.login(key=wb_token)

# Start the run that the training metrics are logged to.
# NOTE(review): the project name mentions "Llama-8B" and a "Medical COT Dataset", while this
# notebook loads unsloth/DeepSeek-R1-Distill-Qwen-32B and shibing624/medical. It is left
# unchanged so existing runs stay grouped together - rename it if that grouping is not needed.
run = wandb.init(
    project='Fine-tune-DeepSeek-R1-Distill-Llama-8B on Medical COT Dataset',
    job_type="training",
    anonymous="allow"
)

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


## 加载模型和分词器

In [None]:
# Loader arguments: sequence length, dtype and the 4-bit flag come from the config cell,
# the access token from the login cell.
pretrained_kwargs = dict(
    model_name="unsloth/DeepSeek-R1-Distill-Qwen-32B",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    token=hf_token,
)

# Load the base checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(**pretrained_kwargs)

==((====))==  Unsloth 2025.2.15: Fast Qwen2 patching. Transformers: 4.49.0.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json:   0%|          | 0.00/280k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/4.32G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/236 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/6.78k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

## Model inference before fine-tuning

In [None]:
# Inference prompt: a system line, the question, then the answer label the model continues from.
prompt_style = (
    "系统: 你是一个专业的医生，专注于提供准确且简洁的回答。\n"
    "问题: {user_input}\n"
    "回答: "
)

In [None]:
# Question to ask (Chinese medical Q&A)
question = """我感冒了怎么办"""

# Build the user input
user_input = question.strip()

# Fill the prompt template with the question and tokenize it
inputs = tokenizer([prompt_style.format(user_input=user_input)], return_tensors="pt").to("cuda")

# Switch the model to Unsloth's inference mode
FastLanguageModel.for_inference(model)

# Generate the answer
outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=256,
    use_cache=True,
)

# generate() returns the prompt tokens followed by the continuation. Slice the prompt off
# by length instead of splitting the decoded text on "回答:": the split raises IndexError
# when the marker is absent and cuts the answer short when the model repeats the marker.
# skip_special_tokens keeps the end-of-sentence token out of the printed answer.
prompt_length = inputs.input_ids.shape[1]
response = tokenizer.batch_decode(outputs[:, prompt_length:], skip_special_tokens=True)
print(response[0].strip())

你需要多休息，多喝水，保持营养均衡。如果症状严重，如高烧、咳嗽不止，建议及时就医。
</think>

你需要多休息，多喝水，保持营养均衡。如果症状严重，如高烧、咳嗽不止，建议及时就医。<｜end▁of▁sentence｜>


In [None]:
# Wrap the loaded model with LoRA adapters for parameter-efficient fine-tuning.
model = FastLanguageModel.get_peft_model(
    model=model,  # model to fine-tune
    r=8,  # LoRA rank
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        # LoRA is applied to the attention projection layers only
    ],
    lora_alpha=8,  # kept equal to r; used together with rank-stabilized LoRA below
    # Unsloth reports that only dropout = 0 takes its fast-patching path, so 0.1 costs
    # speed; it is kept here as a regularization choice.
    lora_dropout=0.1,
    bias="none",  # bias terms are not trained
    use_gradient_checkpointing=True,  # trade compute for memory
    random_state=3407,  # fixed seed for reproducibility
    use_rslora=True,  # rank-stabilized LoRA scaling
    # LoftQ initialization is not used. The previous {"scaling": ..., "alpha": ...} dict is
    # not a LoftQ configuration (its options are the quantization bits and iteration count),
    # so it had no effect on training and only added misleading values to the configuration.
    loftq_config=None,
)

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.1.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.2.15 patched 64 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


## Loading and processing the dataset

In [None]:
# Prompt template. The labels are "问题:" / "回答:" (not "用户:" / "助手:") so that the template
# matches the text format used for the training samples and the "回答:" marker that the
# inference cells rely on when extracting the answer.
prompt_style = """系统: 你是一个专业的医生，专注于提供准确且简洁的回答。
问题: {user_input}
回答: """

In [None]:
# End-of-sequence token taken from the tokenizer; it must be appended to every training sample.
EOS_TOKEN = tokenizer.eos_token

# Formatting for the fields of the shibing624/medical dataset ("finetune" configuration).
def formatting_prompts_func(examples, eos_token=None):
    """Turn a batch of dataset rows into training texts.

    Args:
        examples: Mapping with parallel lists under "instruction", "input" and "output".
        eos_token: Token appended to every text. Defaults to the notebook-level EOS_TOKEN.

    Returns:
        {"text": [...]} with one "问题: <question>\\n回答: <answer><eos>" string per row.
        The question is the instruction, followed by the input on a new line when the
        input is not blank.
    """
    if eos_token is None:
        eos_token = EOS_TOKEN

    texts = []
    for inst, inp, out in zip(examples["instruction"], examples["input"], examples["output"]):
        # A missing (None) input is treated like an empty one instead of crashing on .strip().
        inp = inp or ""
        question = inst + "\n" + inp if inp.strip() else inst
        # The template is written inline: the previously referenced train_prompt_style is
        # not defined anywhere in this notebook, so every call raised NameError.
        texts.append(f"问题: {question}\n回答: {out}{eos_token}")
    return {"text": texts}

In [None]:
from datasets import load_dataset

# Load the full train split of the "finetune" configuration of shibing624/medical.
dataset = load_dataset("shibing624/medical", "finetune", split="train", trust_remote_code=True)

# Print the column names to confirm which fields are available.
print("数据集的字段：", dataset.column_names)

# Formatting function that produces the "text" field used for training.
def formatting_prompts_func(examples, eos_token=None):
    """Turn a batch of dataset rows into training texts.

    Args:
        examples: Mapping with parallel lists under "instruction" and "output", and
            optionally "input".
        eos_token: Token appended to every text. Defaults to the tokenizer's EOS token.

    Returns:
        {"text": [...]} with one "问题: <question>\\n回答: <answer><eos>" string per row.
    """
    if eos_token is None:
        eos_token = tokenizer.eos_token or ""

    instructions = examples["instruction"]
    answers = examples["output"]
    # The "input" column carries extra context for some rows; it used to be dropped silently.
    contexts = examples["input"] if "input" in examples else [None] * len(instructions)

    new_texts = []
    for instruction, context, output in zip(instructions, contexts, answers):
        question = instruction
        if context is not None and context.strip():
            question = f"{instruction}\n{context}"
        # Close every sample with the EOS token so the fine-tuned model learns where an
        # answer ends instead of generating until max_new_tokens is reached.
        new_texts.append(f"问题: {question}\n回答: {output}{eos_token}")
    return {"text": new_texts}

# Apply the formatting function in batches, which adds the "text" column to the dataset.
dataset = dataset.map(formatting_prompts_func, batched=True)

# Show the first generated text.
print(dataset["text"][0])

README.md:   0%|          | 0.00/9.14k [00:00<?, ?B/s]

medical.py:   0%|          | 0.00/7.11k [00:00<?, ?B/s]

train_zh_0.json:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

train_en_1.json:   0%|          | 0.00/139M [00:00<?, ?B/s]

valid_zh_0.json:   0%|          | 0.00/307k [00:00<?, ?B/s]

valid_en_1.json:   0%|          | 0.00/609k [00:00<?, ?B/s]

test_zh_0.json:   0%|          | 0.00/298k [00:00<?, ?B/s]

test_en_1.json:   0%|          | 0.00/602k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

数据集的字段： ['instruction', 'input', 'output']


Map:   0%|          | 0/2066589 [00:00<?, ? examples/s]

问题: 血热的临床表现是什么?
回答: 初发或复发病不久。皮疹发展迅速，呈点滴状、钱币状或混合状。常见丘疹、斑丘疹、大小不等的斑片，潮红、鲜红或深红色。散布于体表各处或几处，以躯干、四肢多见，亦可先从头面开始，逐渐发展至全身。新皮疹不断出现，表面覆有银白色鳞屑，干燥易脱落，剥刮后有点状出血。可有同形反应;伴瘙痒、心烦口渴。大便秘结、小便短黄，舌质红赤，苔薄黄或根部黄厚，脉弦滑或滑数。血热炽盛病机，主要表现在如下四个面：一、热象：血热多属阳盛则热之实性、热性病机和病证、并表现出热象。二、血行加速：血得热则行，可使血流加速，且使脉道扩张，络脉充血，故可见面红目赤，舌色深红（即舌绛）等症。三、动血：在血行加速与脉道扩张的基础上，血分有热，可灼伤脉络，引起出血，称为“热迫血妄行”，或称动血。四、扰乱心神：血热炽盛则扰动心神，心主血脉而藏神，血脉与心相通，故血热则使心神不安，而见心烦，或躁扰发狂等症。


In [None]:
import gc

# Release the previously loaded model before loading a fresh copy. Rebinding `model` only
# frees the old weights after the new ones are fully loaded, so without this step two
# copies of the 32B checkpoint sit on the GPU at the same time during loading.
for _stale_name in ("model", "tokenizer"):
    globals().pop(_stale_name, None)
gc.collect()
torch.cuda.empty_cache()

# Load a clean base checkpoint (and tokenizer) to fine-tune.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/DeepSeek-R1-Distill-Qwen-32B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    token = hf_token,
)

==((====))==  Unsloth 2025.2.15: Fast Qwen2 patching. Transformers: 4.49.0.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Wrap the freshly loaded model with LoRA adapters for parameter-efficient fine-tuning.
model = FastLanguageModel.get_peft_model(
    model=model,  # model to fine-tune
    r=8,  # LoRA rank
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        # LoRA is applied to the attention projection layers only
    ],
    lora_alpha=8,  # kept equal to r; used together with rank-stabilized LoRA below
    # Unsloth's fast-patching path is for dropout = 0, so a non-zero value costs
    # speed; 0.1 is kept here as a regularization choice.
    lora_dropout=0.1,
    bias="none",  # bias terms are not trained
    use_gradient_checkpointing=True,  # trade compute for memory
    random_state=3407,  # fixed seed for reproducibility
    use_rslora=True,  # rank-stabilized LoRA scaling
    # LoftQ initialization is not used. The previous {"scaling": ..., "alpha": ...} dict is
    # not a LoftQ configuration (its options are the quantization bits and iteration count),
    # so it had no effect on training and only added misleading values to the configuration.
    loftq_config=None,
)



In [None]:
# Prompt template: system line, question line, and the answer label the model continues from.
prompt_style = "\n".join(
    [
        "系统: 你是一个专业的医生，专注于提供准确且简洁的回答。",
        "问题: {user_input}",
        "回答: ",
    ]
)

In [None]:
# End-of-sequence token taken from the tokenizer; it must be appended to every training sample.
EOS_TOKEN = tokenizer.eos_token

# Formatting for the fields of the shibing624/medical dataset ("finetune" configuration).
def formatting_prompts_func(examples, eos_token=None):
    """Turn a batch of dataset rows into training texts.

    Args:
        examples: Mapping with parallel lists under "instruction", "input" and "output".
        eos_token: Token appended to every text. Defaults to the notebook-level EOS_TOKEN.

    Returns:
        {"text": [...]} with one "问题: <question>\\n回答: <answer><eos>" string per row.
        The question is the instruction, followed by the input on a new line when the
        input is not blank.
    """
    if eos_token is None:
        eos_token = EOS_TOKEN

    texts = []
    for inst, inp, out in zip(examples["instruction"], examples["input"], examples["output"]):
        # A missing (None) input is treated like an empty one instead of crashing on .strip().
        inp = inp or ""
        question = inst + "\n" + inp if inp.strip() else inst
        # The template is written inline: the previously referenced train_prompt_style is
        # not defined anywhere in this notebook, so every call raised NameError.
        texts.append(f"问题: {question}\n回答: {out}{eos_token}")
    return {"text": texts}

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported, FastLanguageModel



# Put the model into training mode before building the trainer.
FastLanguageModel.for_training(model)

# Mixed precision: bf16 where the GPU supports it, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

# Optimisation schedule and logging settings for the run.
training_args = TrainingArguments(
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,
    warmup_steps=200,
    max_steps=1500,
    learning_rate=2e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=100,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
)

# Supervised fine-tuning trainer reading the pre-formatted "text" column of the dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    args=training_args,
)

Applying chat template to train dataset (num_proc=2):   0%|          | 0/2066589 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=2):   0%|          | 0/2066589 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=2):   0%|          | 0/2066589 [00:00<?, ? examples/s]

## Setting up the model

## Model training

In [None]:
# Confirm that the dataset exposes the "text" column that the trainer is configured to read.
print(dataset.column_names)

['instruction', 'input', 'output', 'text']


In [None]:
# Run the training loop and keep the value it returns.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 2,066,589 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 16 | Gradient Accumulation steps = 1
\        /    Total batch size = 16 | Total steps = 1,500
 "-____-"     Number of trainable parameters = 16,777,216


Step,Training Loss
100,2.5321
200,2.0838
300,2.0596
400,2.0522
500,1.9976
600,2.0269
700,2.0075
800,2.0104
900,2.0354
1000,2.0286


In [None]:
import wandb

# Mark the current W&B run as finished.
wandb.finish()

0,1
train/epoch,▁▁▂▃▃▃▄▅▅▆▆▇▇▇██
train/global_step,▁▁▂▃▃▃▄▅▅▅▆▇▇▇██
train/grad_norm,▁▂▂▄▁▄▅▃▂▃█▆▃▂▁
train/learning_rate,▅█▇▇▆▆▅▅▄▄▃▃▂▂▁
train/loss,█▂▂▂▁▁▁▁▂▂▁▂▁▁▁

0,1
total_flos,1.1681642046523638e+18
train/epoch,0.01161
train/global_step,1500.0
train/grad_norm,0.38023
train/learning_rate,0.0
train/loss,1.9897
train_loss,2.05907
train_runtime,5836.8621
train_samples_per_second,4.112
train_steps_per_second,0.257


## Model inference after fine-tuning

In [None]:
# Post-fine-tuning prompt: the system line additionally asks for brief answers with
# concrete medication advice and how to take the medication.
_system_line = "系统: 你是一个专业的医生，专注于提供准确且简洁的回答，回答一定要简明扼要，并且提供具体的药物建议以及应该如何食用该药物。"
prompt_style = _system_line + "\n问题: {user_input}\n回答: "

In [None]:
# Question to ask (Chinese medical Q&A)
question = """拉肚子怎么办"""

# Build the user input
user_input = question.strip()

# Fill the prompt template with the question and tokenize it
inputs = tokenizer([prompt_style.format(user_input=user_input)], return_tensors="pt").to("cuda")

# Switch the model to Unsloth's inference mode
FastLanguageModel.for_inference(model)

# Generate the answer
outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=256,           # upper bound on the number of generated tokens
    no_repeat_ngram_size=3,       # never repeat the same 3-token sequence
    repetition_penalty=1.2,       # penalize repeated content
    use_cache=True,
)

# generate() returns the prompt tokens followed by the continuation. Slice the prompt off
# by length instead of splitting the decoded text on "回答:": the split raises IndexError
# when the marker is absent and cuts the answer short when the model repeats the marker.
# skip_special_tokens keeps the end-of-sentence token out of the answer.
prompt_length = inputs.input_ids.shape[1]
response = tokenizer.batch_decode(outputs[:, prompt_length:], skip_special_tokens=True)[0]
answer = response.strip()

# Post-processing helper: start a new line after each sentence.
def format_answer(text):
    """Return `text` with a newline inserted after every Chinese full stop,
    question mark and exclamation mark."""
    line_breaks = {ord(mark): mark + "\n" for mark in "。？！"}
    return text.translate(line_breaks)

# Format the answer
formatted_answer = format_answer(answer)

# Print the formatted answer
print(formatted_answer)

1.一般治疗：注意饮食卫生和饮水卫生。
急性腹泻病人应暂时禁食,严重呕吐者需静脉补液,病情好转后可逐步恢复进食;慢性腹泻病人宜采用易消化、少纤维的流质或半流质食物如稀饭、面条等,避免刺激性食物,必要时短期低乳糖或无乳糖饮食。


2.止泻药：适用于非感染性和轻症感染性腹泻,尤其是以肠蠕动亢进为主要症状者。
对伴有发热及明显腹痛的炎症性 diarrhea不宜使用。
(1)次碳酸铋：成人每次0.3g,每日4~6次口服;儿童剂量减半,每日总量不超过8mg/kg(体重),连服5日为一疗程。
(2)蒙脱石散剂：具有吸附病原体及其毒素的作用。
成人口服一次3g(每袋),一日三次;小于3岁小儿每日用量1袋,分两次服用;大于3岁至12岁每日用量l-2袋,分为两次或三次服用。
(3)洛哌丁胺：用于控制腹泻的症状,但不能缩短腹泻时间。
成人与12岁以上青少年可用。
首次剂量为4mg,


## Saving the model locally

In [None]:
# Name of the Hub repository and of the local output directory. The former placeholder
# "your name" contains a space and is therefore not a valid Hub repository id; the value
# below is the name that appears in the outputs recorded in this notebook. Change it to
# publish under a different name.
new_model_online = "DeepSeek-R1-Distill-Qwen-32B-Medical"
new_model_local = "DeepSeek-R1-Distill-Qwen-32B-Medical"


model.save_pretrained(new_model_local) # Local saving
tokenizer.save_pretrained(new_model_local)

('DeepSeek-R1-Distill-Qwen-32B-Medical/tokenizer_config.json',
 'DeepSeek-R1-Distill-Qwen-32B-Medical/special_tokens_map.json',
 'DeepSeek-R1-Distill-Qwen-32B-Medical/tokenizer.json')

## Pushing the model to Hugging Face hub

In [None]:
# Upload the model and the tokenizer to the Hugging Face Hub repository named by new_model_online
# (the recorded output shows the adapter weights file being uploaded).
model.push_to_hub(new_model_online) # Online saving
tokenizer.push_to_hub(new_model_online) # Online saving

README.md:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/67.2M [00:00<?, ?B/s]

Saved model to https://huggingface.co/beita6969/DeepSeek-R1-Distill-Qwen-32B-Medical


README.md:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

In [None]:
# Merge the LoRA weights into the base model as 16-bit weights, save the result locally, then push it to the Hub.
# NOTE(review): this reuses the same directory and repository name as the adapter-only save above, so
# adapter files and merged weights presumably end up side by side - confirm that loaders pick up the
# merged weights, or use a separate name for the merged model.
model.save_pretrained_merged(new_model_local, tokenizer, save_method = "merged_16bit",)
model.push_to_hub_merged(new_model_online, tokenizer, save_method = "merged_16bit")

Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 19.2G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 53.71 out of 83.48 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


 14%|█▍        | 9/64 [00:00<00:01, 40.83it/s]
We will save to Disk and not RAM now.
100%|██████████| 64/64 [02:15<00:00,  2.12s/it]


Unsloth: Saving tokenizer... Done.
Done.


Unsloth: You are pushing to hub, but you passed your HF username = beita6969.
We shall truncate beita6969/DeepSeek-R1-Distill-Qwen-32B-Medical to DeepSeek-R1-Distill-Qwen-32B-Medical


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 53.18 out of 83.48 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 64/64 [03:43<00:00,  3.49s/it]


Unsloth: Saving tokenizer...

No files have been modified since last commit. Skipping to prevent empty commit.


 Done.


README.md:   0%|          | 0.00/116 [00:00<?, ?B/s]

  0%|          | 0/14 [00:00<?, ?it/s]

model-00002-of-00014.safetensors:   0%|          | 0.00/4.88G [00:00<?, ?B/s]

model-00010-of-00014.safetensors:   0%|          | 0.00/4.88G [00:00<?, ?B/s]

model-00009-of-00014.safetensors:   0%|          | 0.00/4.88G [00:00<?, ?B/s]

model-00007-of-00014.safetensors:   0%|          | 0.00/4.88G [00:00<?, ?B/s]

model-00013-of-00014.safetensors:   0%|          | 0.00/4.88G [00:00<?, ?B/s]

model-00006-of-00014.safetensors:   0%|          | 0.00/4.88G [00:00<?, ?B/s]

model-00003-of-00014.safetensors:   0%|          | 0.00/4.88G [00:00<?, ?B/s]

model-00012-of-00014.safetensors:   0%|          | 0.00/4.88G [00:00<?, ?B/s]

model-00001-of-00014.safetensors:   0%|          | 0.00/4.89G [00:00<?, ?B/s]

model-00004-of-00014.safetensors:   0%|          | 0.00/4.88G [00:00<?, ?B/s]

model-00014-of-00014.safetensors:   0%|          | 0.00/2.12G [00:00<?, ?B/s]

model-00008-of-00014.safetensors:   0%|          | 0.00/4.88G [00:00<?, ?B/s]

model-00005-of-00014.safetensors:   0%|          | 0.00/4.88G [00:00<?, ?B/s]

model-00011-of-00014.safetensors:   0%|          | 0.00/4.88G [00:00<?, ?B/s]

Done.
Saved merged model to https://huggingface.co/beita6969/DeepSeek-R1-Distill-Qwen-32B-Medical
