Fine-tune the DeepSeek-R1-Distill-Qwen-1.5B model with LoRA

# Install Dependencies

In [None]:
%pip install datasets trl peft accelerate

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting trl
  Downloading trl-0.16.1-py3-none-any.whl.metadata (12 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.

# Load Dataset

In [None]:
from datasets import load_dataset

# The "everyday-conversations" subset of smoltalk is small (a few thousand rows).
dataset = load_dataset("HuggingFaceTB/smoltalk", "everyday-conversations")
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['full_topic', 'messages'],
        num_rows: 2260
    })
    test: Dataset({
        features: ['full_topic', 'messages'],
        num_rows: 119
    })
})

# Get Base Model and Tokenizer

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import SFTConfig, SFTTrainer, setup_chat_format # Supervised fine-tune
import torch

# Prefer the first CUDA GPU; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Hub id of the base checkpoint to fine-tune.
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

# Tokenizer and weights are both pulled from the same Hub repository.
tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForCausalLM.from_pretrained(model_name)
model = model.to(device)

# Name reused for the training output directory and for the Hub repo.
finetuned_model_name = "DeepSeek-R1-Distill-Qwen-1.5B-finetuned-smoltalk-everyday-conversations"

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


# Set Hyperparameters

## LoRA Config

In [None]:
from peft import LoraConfig

# LoRA hyperparameters
rank_dimension = 6  # Adapter rank; a smaller rank trains faster
lora_alpha = rank_dimension * 2  # Scaling factor, kept at twice the rank
lora_dropout = 0.05  # Dropout on the adapter path to curb over-fitting

peft_config = LoraConfig(
    r=rank_dimension,  # Typical values are 4-32
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules="all-linear",  # Attach adapters to the model's linear layers
    bias="none",  # No bias terms are trained
    task_type="CAUSAL_LM",
)

## Training arguments

In [None]:
# Hyperparameters for a single-epoch supervised fine-tuning run.
training_args = SFTConfig(
    output_dir=finetuned_model_name,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=2, # Effective train batch size: 2 x 2 = 4 per device
    gradient_checkpointing=True, # Saves memory but is more computationally expensive
    optim="adamw_torch_fused", # More efficiency
    logging_steps=10,
    max_grad_norm=0.3, # Gradient clipping threshold
    warmup_ratio=0.03, # Fraction of total training steps used to ramp the lr up to learning_rate
    lr_scheduler_type="constant", # No change in lr after warmup
    learning_rate=2e-4,
    eval_strategy="epoch", # Without this the default ("no") never evaluates the eval split passed to the trainer
    label_names=["labels"], # The Trainer cannot infer label names from a PEFT-wrapped model; needed for eval loss
    save_strategy="epoch", # Save checkpoint every epoch
    bf16=True, # Mixed precision to speed up training
    report_to="none", # Disable WandB logging
)

# Train Model

In [None]:
# Build the supervised fine-tuning trainer. Passing `peft_config` makes it
# train LoRA adapters on top of `model`.
trainer = SFTTrainer(
    model=model,
    peft_config=peft_config,
    args=training_args,
    train_dataset=dataset["train"],  # 'train' split of the loaded dataset
    eval_dataset=dataset["test"],  # 'test' split of the loaded dataset
)

Converting train dataset to ChatML:   0%|          | 0/2260 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/2260 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/2260 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/2260 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/119 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/119 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/119 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/119 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
# Run the fine-tuning loop. The returned TrainOutput (displayed as the cell
# result) reports the final global step, the average training loss and
# runtime metrics.
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
10,3.0733
20,2.2733
30,1.9868
40,1.9593
50,1.7305
60,1.8119
70,1.7796
80,1.7834
90,1.7366
100,1.6677


TrainOutput(global_step=565, training_loss=1.6710539699655718, metrics={'train_runtime': 2172.9719, 'train_samples_per_second': 1.04, 'train_steps_per_second': 0.26, 'total_flos': 3848548529147904.0, 'train_loss': 1.6710539699655718})

# Push Model to Hugging Face Hub

In [None]:
from peft import AutoPeftModelForCausalLM

# Reload the just-trained adapter checkpoint on CPU and fold the LoRA weights
# into the base model so it can be published as a stand-alone model.

# The Trainer writes checkpoints to "<output_dir>/checkpoint-<global_step>".
# Derive the path from the finished run rather than hard-coding a Colab
# location and step count.
checkpoint_dir = f"{training_args.output_dir}/checkpoint-{trainer.state.global_step}"

model = AutoPeftModelForCausalLM.from_pretrained(
    checkpoint_dir, # Load just-trained model
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16, # Half precision: smaller and faster than float32
)

merged_model = model.merge_and_unload()

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub before pushing; this renders an interactive
# login widget in the notebook.
notebook_login() # Login to account

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Publish the merged model, then the tokenizer, to the same Hub repository.
merged_model.push_to_hub(repo_id=finetuned_model_name)
tokenizer.push_to_hub(repo_id=finetuned_model_name)

model.safetensors:   0%|          | 0.00/3.55G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/avanishd/DeepSeek-R1-Distill-Qwen-1.5B-finetuned-smoltalk-everyday-conversations/commit/b363ebd0e8bfb7fb43e6140b698088b706628b49', commit_message='Upload Qwen2ForCausalLM', commit_description='', oid='b363ebd0e8bfb7fb43e6140b698088b706628b49', pr_url=None, repo_url=RepoUrl('https://huggingface.co/avanishd/DeepSeek-R1-Distill-Qwen-1.5B-finetuned-smoltalk-everyday-conversations', endpoint='https://huggingface.co', repo_type='model', repo_id='avanishd/DeepSeek-R1-Distill-Qwen-1.5B-finetuned-smoltalk-everyday-conversations'), pr_revision=None, pr_num=None)