<a href="https://colab.research.google.com/github/boheling/healthAI/blob/main/SFT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Install necessary packages
!pip install transformers datasets trl --quiet
#!huggingface-cli login

from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from datasets import load_dataset
from trl import SFTTrainer

# Use the distill model
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Save the pre-trained model for backup
model.save_pretrained("./pretrain_model")
tokenizer.save_pretrained("./pretrain_model")

# Q&A healthcare doctor dataset
dataset = load_dataset("lavita/ChatDoctor-HealthCareMagic-100k", split="train[:10%]")

def tokenize_function(examples):
    texts = []
    for i in range(len(examples["instruction"])):
        # Build the text with structure
        text = f"Instruction: {examples['instruction'][i]}\n"
        # Process 'input'
        if examples["input"][i]:
            if isinstance(examples["input"][i], list):
                input_text = " ".join(examples["input"][i])
            else:
                input_text = examples["input"][i]
            text += f"Input: {input_text}\n"
        # Process 'output'
        if examples["output"][i]:
            if isinstance(examples["output"][i], list):
                output_text = " ".join(examples["output"][i])
            else:
                output_text = examples["output"][i]
            text += f"Output: {output_text}\n"
        texts.append(text)
    return tokenizer(texts, truncation=True, max_length=128)

# Tokenize the dataset and remove original text columns
unfiltered_tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["instruction", "input", "output"]
)

# Filter out examples with empty tokenization results
tokenized_dataset = unfiltered_tokenized_dataset.filter(lambda x: len(x["input_ids"]) > 0)


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/485.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/318.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.9/318.9 kB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.8/194.8 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/3.07k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/679 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.55G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/542 [00:00<?, ?B/s]

(…)-00000-of-00001-5e7cb295b9cff0bf.parquet:   0%|          | 0.00/70.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/112165 [00:00<?, ? examples/s]

Map:   0%|          | 0/11216 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11216 [00:00<?, ? examples/s]

In [3]:
print(unfiltered_tokenized_dataset[0])

{'input_ids': [151646, 16664, 25, 1416, 498, 525, 264, 10668, 11, 4486, 4226, 279, 6457, 4755, 3118, 389, 279, 8720, 594, 4008, 624, 2505, 25, 358, 38726, 705, 419, 6556, 8266, 279, 4361, 3054, 374, 37860, 979, 600, 572, 11699, 1495, 13, 358, 3937, 311, 279, 14852, 11435, 650, 42180, 1541, 11, 438, 600, 6679, 311, 5244, 600, 2666, 48294, 782, 13, 358, 1430, 311, 21982, 275, 714, 432, 39364, 2525, 700, 496, 4636, 4633, 7215, 86769, 323, 6084, 369, 2421, 4115, 11, 600, 2058, 2666, 279, 1852, 496, 3216, 279, 1616, 11, 421, 600, 10962, 1495, 476, 2444, 1495, 11, 847, 1968, 653, 537, 12616, 11, 1172, 979, 600, 1366, 311, 3271, 2163, 1221, 600, 2666, 279, 4361, 1879, 374, 37860, 496, 1597, 432, 374, 4622, 22350, 43676, 518], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/sft_output",  # save checkpoints here
    logging_dir="/content/drive/MyDrive/sft_logs",
    per_device_train_batch_size=2,   # Adjust as necessary
    num_train_epochs=2,              # Increase for more training
    logging_steps=10,
    save_steps=50,
    evaluation_strategy="steps",
    eval_steps=50,
    fp16=True,                       # Mixed precision for T4 GPU
    dataloader_num_workers=2,        # Adjust based on your CPU
)

# Initialize the SFTTrainer
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
)

# Evaluate the pre-trained model (baseline)
print("Evaluating pre-trained model...")
pretrain_metrics = trainer.evaluate()
print("Pre-training evaluation metrics:", pretrain_metrics)

# Sample prompt before training
prompt = "Q: What could be wrong with lower back pain in a cancer patient?\nA:"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_length=100, do_sample=True, temperature=0.7, top_p=0.9)
print("Pre-training output:", tokenizer.decode(outputs[0], skip_special_tokens=True))

# Start fine-tuning
print("Starting fine-tuning...")
trainer.train()

# Save the fine-tuned (post-train) model
model.save_pretrained("./posttrain_model")
tokenizer.save_pretrained("./posttrain_model")

# Optionally, save to Google Drive
from google.colab import drive
drive.mount('/content/drive')
model.save_pretrained("/content/drive/MyDrive/deepseek_models/posttrain_model")
tokenizer.save_pretrained("/content/drive/MyDrive/deepseek_models/posttrain_model")

# Evaluate the fine-tuned model
print("Evaluating fine-tuned model...")
posttrain_metrics = trainer.evaluate()
print("Post-training evaluation metrics:", posttrain_metrics)

# Sample prompt after training
prompt = "Q: What could be wrong with lower back pain in a cancer patient?\nA:"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_length=100, do_sample=True, temperature=0.7, top_p=0.9)
print("Post-training output:", tokenizer.decode(outputs[0], skip_special_tokens=True))



Converting train dataset to ChatML:   0%|          | 0/11216 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/11216 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/11216 [00:00<?, ? examples/s]

Evaluating pre-trained model...


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mboheling[0m ([33mboheling-stanford-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Pre-training evaluation metrics: {'eval_loss': 4.312994003295898, 'eval_model_preparation_time': 0.005, 'eval_runtime': 79.443, 'eval_samples_per_second': 141.183, 'eval_steps_per_second': 17.648}
Pre-training output: Q: What could be wrong with lower back pain in a cancer patient?
A: The lower back pain in a cancer patient is likely to be due to a number of factors, including:

1. **Cancer itself** - Cancer can have a significant impact on the lower back, often leading to issues such as pain, stiffness, or difficulty moving the legs.

2. **Infiltration of cancer cells into the lower back muscles** - This can cause localized pain, stiffness, or difficulty
Starting fine-tuning...


Step,Training Loss,Validation Loss
