In [1]:
!pip install datasets bitsandbytes trl==0.12.1 transformers peft huggingface-hub accelerate safetensors pandas matplotlib numpy==1.26.4

Collecting bitsandbytes
  Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting trl==0.12.1
  Downloading trl-0.12.1-py3-none-any.whl.metadata (10 kB)
Collecting numpy==1.26.4
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Downloading trl-0.12.1-py3-none-any.whl (310 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.9/310.9 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.0/18.0 MB[0m [31m89.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl (60.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m37.9

# Install necessary libraries

In [1]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    #AutoPeftModelForCausalLM, # Removed from transformers
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from trl import SFTTrainer, SFTConfig
# from trl.trainer.utils import DataCollatorForCompletionOnlyLM
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model, AutoPeftModelForCausalLM, PeftConfig # Added to peft
from huggingface_hub import notebook_login
from trl import SFTTrainer, SFTConfig, setup_chat_format, DataCollatorForCompletionOnlyLM


# Check for bf16 support and set compute dtype


In [2]:
support = torch.cuda.is_bf16_supported(including_emulation=False)
calculate_dtype = torch.bfloat16 if support else torch.float32

In [3]:
print(calculate_dtype)

torch.bfloat16


#bnb config for loading 4 bit model with nf4 quant type
* loading model with quantization config
* device map to cuda
* 4bit true

In [5]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type= "nf4",
    bnb_4bit_compute_dtype= calculate_dtype, #calculate_dtype can be bf16 or float32- use bf16 if supported
    bnb_4bit_use_double_quant= True
    )
repo = "HuggingFaceTB/SmolLM-360M-Instruct"
model = AutoModelForCausalLM.from_pretrained(repo, quantization_config= bnb_config, device_map= "cuda:0")

config.json:   0%|          | 0.00/724 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/724M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

#Check model memory footprint

In [6]:
print(model.get_memory_footprint()/1024/1024)

240.119140625


#model config

In [7]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(49152, 960, padding_idx=2)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=960, out_features=960, bias=False)
          (k_proj): Linear4bit(in_features=960, out_features=320, bias=False)
          (v_proj): Linear4bit(in_features=960, out_features=320, bias=False)
          (o_proj): Linear4bit(in_features=960, out_features=960, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=960, out_features=2560, bias=False)
          (up_proj): Linear4bit(in_features=960, out_features=2560, bias=False)
          (down_proj): Linear4bit(in_features=2560, out_features=960, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((960,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((960,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNo

#Prepare model for kbit training
##Use Lora Config


1.   rank [4,8,16,32] - choose one
2.   lora_alpha is a scalling factor which should be 2x the rank of matrix.
3.   dropout range from 0.03 to 0.10 which helps prevent overfit
4.   module - choose module as per requirement


In [8]:
model = prepare_model_for_kbit_training(model)

config = LoraConfig(
    r = 8, #. rank of LoRA - [4-16]
    bias = "none", # ["all", "lora_only"] - for train bias term
    lora_alpha = 16, # scalling factor
    lora_dropout = 0.10, # prevent overfit- used for regularisation
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    task_type = "CAUSAL_LM"

)

model = get_peft_model(model, config)
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(49152, 960, padding_idx=2)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=960, out_features=960, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=960, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=960, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj

#once again check memory footprint

In [9]:
print(model.get_memory_footprint()/1024/1024)

346.8006591796875


#Print base model to compare

In [10]:
print(model.get_base_model)

<bound method PeftModel.get_base_model of PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(49152, 960, padding_idx=2)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=960, out_features=960, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=960, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=960, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDi

In [11]:
print(model.get_memory_footprint()/1e6)

363.646848


#Check for trainable Parameters and its percentage for a mathematical view.

In [12]:
trainable_params, total_params = model.get_nb_trainable_parameters()
percentage = (trainable_params / total_params) * 100

print(f"Trainable Parameters: {trainable_params:,}")
print(f"Total Parameters: {total_params:,}")
print(f"Percentage Trainable: {percentage:.2f}%")

Trainable Parameters: 4,341,760
Total Parameters: 366,162,880
Percentage Trainable: 1.19%


#ETL Process for Dataset Prep stage, Tokenizer load and define chat template if needed.

In [13]:

model_name = "HuggingFaceTB/SmolLM-360M-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Add padding token if needed
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


raw_dataset = load_dataset("bitext/Bitext-customer-support-llm-chatbot-training-dataset", split="train[:2000]")

def format_prompt(example):
    instruction = example["instruction"]
    response = example["response"]
    prompt = f"<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{response}<|im_end|>"
    return {"text": prompt}


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/565 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

Bitext_Sample_Customer_Support_Training_(…):   0%|          | 0.00/19.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/26872 [00:00<?, ? examples/s]

In [14]:
def tokenize_function(examples):
    """Tokenize with proper padding and labels for causal LM"""
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors=None
    )

    # Create labels (same as input_ids but with -100 for padding)
    labels = []
    for input_ids, attention_mask in zip(tokenized["input_ids"], tokenized["attention_mask"]):
        label = input_ids.copy() if isinstance(input_ids, list) else list(input_ids)
        # Mask padding tokens in labels
        for i, mask in enumerate(attention_mask):
            if mask == 0:
                label[i] = -100
        labels.append(label)

    tokenized["labels"] = labels
    return tokenized

In [15]:

print(f"📦 Dataset loaded: {len(raw_dataset)} samples")
print(f"📝 Sample: {raw_dataset[0]}\n")

# Format the prompts
formatted_dataset = raw_dataset.map(
    format_prompt,
    desc="Formatting prompts"
)

# Tokenize
tokenized_dataset = formatted_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=formatted_dataset.column_names,
    desc="Tokenizing dataset"
)

# Final dataset ready for trainer
final_dataset = tokenized_dataset

print(f"✅ Dataset ready for training!")
print(f"   Total samples: {len(final_dataset)}")
print(f"   Max length: {len(final_dataset[0]['input_ids'])}")
print(f"   Keys: {final_dataset.column_names}")

📦 Dataset loaded: 2000 samples
📝 Sample: {'flags': 'B', 'instruction': 'question about cancelling order {{Order Number}}', 'category': 'ORDER', 'intent': 'cancel_order', 'response': "I've understood you have a question regarding canceling order {{Order Number}}, and I'm here to provide you with the information you need. Please go ahead and ask your question, and I'll do my best to assist you."}



Formatting prompts:   0%|          | 0/2000 [00:00<?, ? examples/s]

Tokenizing dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

✅ Dataset ready for training!
   Total samples: 2000
   Max length: 512
   Keys: ['input_ids', 'attention_mask', 'labels']


In [16]:
# My SFT Trainer Configuration for SmolLM-360M-Instruct Fine-tuning
# No evaluation split needed - using full dataset for training only

from trl import SFTConfig, SFTTrainer

# My optimized parameters for LoRA training
min_effective_batch_size = 8  # I increased this for better gradient updates with LoRA
lr = 1e-4  # I reduced learning rate for LoRA stability
max_seq_length = 512  # I updated this to match my tokenization settings
collator_fn = None  # I'm not using a custom collator since I pre-pad in tokenization
packing = False  # I disabled packing since I'm using fixed-length sequences
steps = 15  # My logging and saving frequency
num_train_epochs = 3  # I reduced epochs since LoRA trains faster and overfits easily
warmup_ratio = 0.1  # I slightly increased warmup for LoRA stability

# My SFT configuration with updated paths and names
sft_config = SFTConfig(
    # I'm saving my model to a new directory for this SmolLM experiment
    output_dir = '/content/drive/MyDrive/smollm-360m/SmolLM-360M-Instruct-finetuned',

    # My data processing settings
    packing = packing,
    max_seq_length = max_seq_length,

    # I disabled gradient checkpointing to fix potential errors
    gradient_checkpointing = False,

    # My training batch and precision settings
    per_device_train_batch_size = min_effective_batch_size,
    auto_find_batch_size = True,  # I let the trainer find optimal batch size
    bf16 = True,  # I use bf16 instead of fp16 for better stability

    # My training schedule
    num_train_epochs = num_train_epochs,
    learning_rate = lr,
    lr_scheduler_type = "cosine",  # I use cosine scheduler for smooth learning rate decay
    warmup_ratio = warmup_ratio,
    weight_decay = 0.01,  # I add weight decay for regularization
    max_grad_norm = 1.0,  # I clip gradients to prevent exploding gradients

    # My logging and monitoring setup
    report_to = 'wandb',  # I'm tracking my experiments with Weights & Biases
    run_name = "SmolLM-360M-Instruct-LoRA-Finetune",  # My updated run name

    # My logging directory (updated path)
    logging_dir = '/content/drive/MyDrive/smollm-360m/SmolLM-360M-Instruct-finetuned/logs',

    # My checkpoint and logging strategy
    logging_strategy = 'steps',
    save_strategy = 'steps',
    logging_steps = steps,  # I log every 15 steps
    save_steps = steps,     # I save checkpoint every 15 steps
    save_total_limit = 2,   # I keep only the last 2 checkpoints to save space
)

# I create my trainer with the prepared dataset and configuration
trainer = SFTTrainer(
    model = model,                    # My loaded model (should be already loaded)
    train_dataset = final_dataset,    # My prepared dataset from the previous script
    processing_class = tokenizer,     # My tokenizer for text processing
    data_collator = collator_fn,      # My data collator (None for default)
    args = sft_config,               # My training configuration
)

# I start the training process
print("Starting my SmolLM-360M fine-tuning...")
trainer.train()
print("Training completed! My model is saved to:", sft_config.output_dir)

Starting my SmolLM-360M fine-tuning...


  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mchaubey-amit017[0m ([33mhectorlabs[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
15,2.1635
30,2.1654
45,2.0791
60,1.972
75,1.753
90,1.5067
105,1.2879
120,1.0275
135,0.9446
150,0.8901


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*a

Training completed! My model is saved to: /content/drive/MyDrive/smollm-360m/SmolLM-360M-Instruct-finetuned


In [17]:
# Step 1: I'm saving my trained model locally first
print("Saving my trained SmolLM model...")
trainer.save_model('/content/smollm-360m-saved')

# Step 2: I load and merge the LoRA adapter with the base model
from peft import AutoPeftModelForCausalLM

print("Loading my PEFT model and merging adapter...")
# I load the saved PEFT model (use the same path as Step 1)
peft_model = AutoPeftModelForCausalLM.from_pretrained('/content/smollm-360m-saved')

# I merge and unload the adapter to get a single model
merged_model = peft_model.merge_and_unload()

# Step 3: I save the merged model with tokenizer
print("Saving my merged model...")
merged_model.save_pretrained('/content/smollm-360m-merged')
tokenizer.save_pretrained('/content/smollm-360m-merged')

# Step 4: I upload my model to Hugging Face Hub
from huggingface_hub import HfApi

print("Uploading my model to Hugging Face Hub...")
api = HfApi()
api.upload_folder(
    folder_path='/content/smollm-360m-merged',
    repo_id="sweatSmile/SmolLM-360M-CustomerSupport-Instruct",  # My new repo name
    repo_type="model",
    commit_message="Upload SmolLM-360M-Instruct fine-tuned on customer support dataset with LoRA"
)

print("Model upload completed! 🎉")
print("Model is now available at: https://huggingface.co/sweatSmile/SmolLM-360M-CustomerSupport-Instruct")

Saving my trained SmolLM model...
Loading my PEFT model and merging adapter...
Saving my merged model...
Uploading my model to Hugging Face Hub...


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...-merged/model.safetensors:   1%|1         | 16.0MB / 1.45GB            

Model upload completed! 🎉
Model is now available at: https://huggingface.co/sweatSmile/SmolLM-360M-CustomerSupport-Instruct
