In [1]:
!pip install datasets bitsandbytes trl==0.12.1 transformers peft huggingface-hub accelerate safetensors pandas matplotlib numpy==1.26.4

Collecting bitsandbytes
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Collecting trl==0.12.1
  Downloading trl-0.12.1-py3-none-any.whl.metadata (10 kB)
Collecting numpy==1.26.4
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Downloading trl-0.12.1-py3-none-any.whl (310 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.9/310.9 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.0/18.0 MB[0m [31m85.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl (61.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m38.2

# Install necessary libraries

In [1]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    #AutoPeftModelForCausalLM, # Removed from transformers
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from trl import SFTTrainer, SFTConfig
# from trl.trainer.utils import DataCollatorForCompletionOnlyLM
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model, AutoPeftModelForCausalLM, PeftConfig # Added to peft
from huggingface_hub import notebook_login
from trl import SFTTrainer, SFTConfig, setup_chat_format, DataCollatorForCompletionOnlyLM


# Check for bf16 support and set compute dtype


In [2]:
support = torch.cuda.is_bf16_supported(including_emulation=False)
calculate_dtype = torch.bfloat16 if support else torch.float32

In [3]:
print(calculate_dtype)

torch.bfloat16


#bnb config for loading 4 bit model with nf4 quant type
* loading model with quantization config
* device map to cuda
* 4bit true

In [4]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type= "nf4",
    bnb_4bit_compute_dtype= calculate_dtype, #calculate_dtype can be bf16 or float32- use bf16 if supported
    bnb_4bit_use_double_quant= True
    )
repo = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
model = AutoModelForCausalLM.from_pretrained(repo, quantization_config= bnb_config, device_map= "cuda:0")

config.json:   0%|          | 0.00/679 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.55G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

#Check model memory footprint

In [5]:
print(model.get_memory_footprint()/1024/1024)

1515.276611328125


#model config

In [6]:
model

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1536)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear4bit(in_features=1536, out_features=1536, bias=True)
          (k_proj): Linear4bit(in_features=1536, out_features=256, bias=True)
          (v_proj): Linear4bit(in_features=1536, out_features=256, bias=True)
          (o_proj): Linear4bit(in_features=1536, out_features=1536, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear4bit(in_features=1536, out_features=8960, bias=False)
          (up_proj): Linear4bit(in_features=1536, out_features=8960, bias=False)
          (down_proj): Linear4bit(in_features=8960, out_features=1536, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((1536,), eps

#Prepare model for kbit training
##Use Lora Config


1.   rank [4,8,16,32] - choose one
2.   lora_alpha is a scalling factor which should be 2x the rank of matrix.
3.   dropout range from 0.03 to 0.10 which helps prevent overfit
4.   module - choose module as per requirement


In [7]:
model = prepare_model_for_kbit_training(model)

config = LoraConfig(
    r = 8, #. rank of LoRA - [4-16]
    bias = "none", # ["all", "lora_only"] - for train bias term
    lora_alpha = 16, # scalling factor
    lora_dropout = 0.10, # prevent overfit- used for regularisation
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    task_type = "CAUSAL_LM"

)

model = get_peft_model(model, config)
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen2ForCausalLM(
      (model): Qwen2Model(
        (embed_tokens): Embedding(151936, 1536)
        (layers): ModuleList(
          (0-27): 28 x Qwen2DecoderLayer(
            (self_attn): Qwen2Attention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=1536, out_features=1536, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=1536, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=1536, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Li

#once again check memory footprint

In [None]:
print(model.get_memory_footprint()/1024/1024)

2401.226806640625


#Print base model to compare

In [8]:
print(model.get_base_model)

<bound method PeftModel.get_base_model of PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen2ForCausalLM(
      (model): Qwen2Model(
        (embed_tokens): Embedding(151936, 1536)
        (layers): ModuleList(
          (0-27): 28 x Qwen2DecoderLayer(
            (self_attn): Qwen2Attention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=1536, out_features=1536, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=1536, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=1536, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
     

In [9]:
print(model.get_memory_footprint()/1e6)

2559.5968


#Check for trainable Parameters and its percentage for a mathematical view.

In [10]:
trainable_params, total_params = model.get_nb_trainable_parameters()
percentage = (trainable_params / total_params) * 100

print(f"Trainable Parameters: {trainable_params:,}")
print(f"Total Parameters: {total_params:,}")
print(f"Percentage Trainable: {percentage:.2f}%")

Trainable Parameters: 9,232,384
Total Parameters: 1,786,320,384
Percentage Trainable: 0.52%


#ETL Process for Dataset Prep stage, Tokenizer load and define chat template if needed.

In [13]:
from datasets import load_dataset
from transformers import AutoTokenizer
import torch

# Load the tokenizer for DeepSeek-R1-Distill-Qwen-1.5B
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Add padding token if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load a small portion of a popular instruction dataset
# Using Alpaca dataset - a good choice for instruction following
raw_dataset = load_dataset("tatsu-lab/alpaca", split="train[:1000]")  # Only first 1000 samples

print(f"Dataset loaded: {len(raw_dataset)} samples")
print(f"Sample entry: {raw_dataset[0]}")

def format_prompt(example):
    """Format the data into a conversation format suitable for the model"""
    instruction = example["instruction"]
    input_text = example["input"] if example["input"] else ""
    output = example["output"]

    # Create a conversation format
    if input_text:
        prompt = f"Human: {instruction}\n{input_text}\n\nAssistant: {output}"
    else:
        prompt = f"Human: {instruction}\n\nAssistant: {output}"

    return {"text": prompt}

def tokenize_function(examples):
    """Tokenize the formatted text with proper padding"""
    # Tokenize the text
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",  # Pad to max_length for consistent tensor sizes
        max_length=512,
        return_tensors=None
    )

    # For causal LM, labels are the same as input_ids
    # But we need to set labels to -100 for padded tokens so they're ignored in loss
    labels = []
    for input_ids, attention_mask in zip(tokenized["input_ids"], tokenized["attention_mask"]):
        # Convert to list if it's not already
        label = input_ids.copy() if isinstance(input_ids, list) else input_ids[:]
        # Set padded positions to -100 (ignored in loss calculation)
        for i, mask in enumerate(attention_mask):
            if mask == 0:  # This is a padded token
                label[i] = -100
        labels.append(label)

    tokenized["labels"] = labels

    return tokenized

# Format the dataset
formatted_dataset = raw_dataset.map(format_prompt)

# Tokenize the dataset
tokenized_dataset = formatted_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=formatted_dataset.column_names,  # Remove original columns
    desc="Tokenizing dataset"
)

# No split needed - using full dataset for training
final_dataset = tokenized_dataset

print(f"Final dataset samples: {len(final_dataset)}")
print(f"Sample tokenized length: {len(final_dataset[0]['input_ids'])}")

print("\nDataset preparation complete!")
print("Use 'final_dataset' variable in your trainer.")

Dataset loaded: 1000 samples
Sample entry: {'instruction': 'Give three tips for staying healthy.', 'input': '', 'output': '1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.', 'text': 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nGive three tips for staying healthy.\n\n### Response:\n1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.'}


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Tokenizing dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Final dataset samples: 1000
Sample tokenized length: 512

Dataset preparation complete!
Use 'final_dataset' variable in your trainer.


In [14]:
# My SFT Trainer Configuration for DeepSeek-R1-Distill-Qwen-1.5B Fine-tuning
# No evaluation split needed - using full dataset for training only

# My optimized parameters for LoRA training
min_effective_batch_size = 8  # I increased this for better gradient updates with LoRA
lr = 1e-4  # I reduced learning rate for LoRA stability
max_seq_length = 512  # I updated this to match my tokenization settings
collator_fn = None  # I'm not using a custom collator since I pre-pad in tokenization
packing = False  # I disabled packing since I'm using fixed-length sequences
steps = 15  # My logging and saving frequency
num_train_epochs = 3  # I reduced epochs since LoRA trains faster and overfits easily
warmup_ratio = 0.1  # I slightly increased warmup for LoRA stability

# My SFT configuration with updated paths and names
sft_config = SFTConfig(
    # I'm saving my model to a new directory for this DeepSeek experiment
    output_dir = '/content/drive/MyDrive/deepseek-r1/DeepSeek-R1-Distill-Qwen-1.5B-finetuned',

    # My data processing settings
    packing = packing,
    max_seq_length = max_seq_length,

    # I disabled gradient checkpointing to fix potential errors
    gradient_checkpointing = False,

    # My training batch and precision settings
    per_device_train_batch_size = min_effective_batch_size,
    auto_find_batch_size = True,  # I let the trainer find optimal batch size
    bf16 = True,  # I use bf16 instead of fp16 for better stability

    # My training schedule
    num_train_epochs = num_train_epochs,
    learning_rate = lr,
    lr_scheduler_type = "cosine",  # I use cosine scheduler for smooth learning rate decay
    warmup_ratio = warmup_ratio,
    weight_decay = 0.01,  # I add weight decay for regularization
    max_grad_norm = 1.0,  # I clip gradients to prevent exploding gradients

    # My logging and monitoring setup
    report_to = 'wandb',  # I'm tracking my experiments with Weights & Biases
    run_name = "DeepSeek-R1-Distill-Qwen-1.5B-LoRA-Finetune",  # My updated run name

    # My logging directory (updated path)
    logging_dir = '/content/drive/MyDrive/deepseek-r1/DeepSeek-R1-Distill-Qwen-1.5B-finetuned/logs',

    # My checkpoint and logging strategy
    logging_strategy = 'steps',
    save_strategy = 'steps',
    logging_steps = steps,  # I log every 15 steps
    save_steps = steps,     # I save checkpoint every 15 steps
    save_total_limit = 2,   # I keep only the last 2 checkpoints to save space
)

# I create my trainer with the prepared dataset and configuration
trainer = SFTTrainer(
    model = model,                    # My loaded model (should be already loaded)
    train_dataset = final_dataset,    # My prepared dataset from the previous script
    processing_class = tokenizer,     # My tokenizer for text processing
    data_collator = collator_fn,      # My data collator (None for default)
    args = sft_config,               # My training configuration
)

# I start the training process
print("Starting my DeepSeek-R1 fine-tuning...")
trainer.train()
print("Training completed! My model is saved to:", sft_config.output_dir)



Starting my DeepSeek-R1 fine-tuning...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
15,2.7487
30,2.4326
45,2.1837
60,2.0107
75,1.8778
90,1.8509
105,1.8883
120,1.8471
135,1.8212
150,1.7785


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


Training completed! My model is saved to: /content/drive/MyDrive/deepseek-r1/DeepSeek-R1-Distill-Qwen-1.5B-finetuned


In [15]:
# Step 1: I'm saving my trained model locally first
print("Saving my trained DeepSeek model...")
trainer.save_model('/content/deepseek-r1-saved')

# Step 2: I load and merge the LoRA adapter with the base model
from peft import AutoPeftModelForCausalLM

print("Loading my PEFT model and merging adapter...")
# I load the saved PEFT model (use the same path as Step 1)
peft_model = AutoPeftModelForCausalLM.from_pretrained('/content/deepseek-r1-saved')

# I merge and unload the adapter to get a single model
merged_model = peft_model.merge_and_unload()

# Step 3: I save the merged model with tokenizer
print("Saving my merged model...")
merged_model.save_pretrained('/content/deepseek-r1-merged')
tokenizer.save_pretrained('/content/deepseek-r1-merged')

# Step 4: I upload my model to Hugging Face Hub
from huggingface_hub import HfApi

print("Uploading my model to Hugging Face Hub...")
api = HfApi()
api.upload_folder(
    folder_path='/content/deepseek-r1-merged',
    repo_id="sweatSmile/DeepSeek-R1-Distill-Qwen-1.5B-Alpaca-Instruct",  # My new repo name
    repo_type="model",
    commit_message="Upload DeepSeek-R1-Distill-Qwen-1.5B fine-tuned on Alpaca instruction dataset with LoRA"
)

print("Model upload completed! 🎉")
print("Model is now available at: https://huggingface.co/sweatSmile/DeepSeek-R1-Distill-Qwen-1.5B-Alpaca-Instruct")

Saving my trained DeepSeek model...
Loading my PEFT model and merging adapter...
Saving my merged model...
Uploading my model to Hugging Face Hub...


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...t/deepseek-r1-merged/tokenizer.json: 100%|##########| 11.4MB / 11.4MB            

  ...ed/model-00002-of-00002.safetensors:   2%|1         | 33.5MB / 2.11GB            

  ...ed/model-00001-of-00002.safetensors:   1%|          | 33.5MB / 5.00GB            

Model upload completed! 🎉
Model is now available at: https://huggingface.co/sweatSmile/DeepSeek-R1-Distill-Qwen-1.5B-Alpaca-Instruct
