In [1]:
%%capture
%pip install -U transformers 
%pip install -U datasets 
%pip install -U accelerate 
%pip install -U peft 
%pip install -U trl 
%pip install -U bitsandbytes 
%pip install -U wandb

In [2]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch, wandb
from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format

2025-09-13 21:40:06.258566: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757799606.455470      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757799606.515614      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()

hf_token = user_secrets.get_secret("hf_write")
login(token = hf_token)

In [4]:
wb_token = user_secrets.get_secret("wndb_api")

wandb.login(key=wb_token)
run = wandb.init(
    project='Fine-tune Llama 3.2 on Customer Support Dataset', 
    job_type="training", 
    anonymous="allow"
)

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjayarammunna43[0m ([33mjayarammunna43-orm[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.21.4
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20250913_214024-9ap8ny6q[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mvague-night-5[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/jayarammunna43-orm/Fine-tune%20Llama%203.2%20on%20Customer%20Support%20Dataset?apiKey=191a8aef97986a5dc03b966fe41eb4fac6bf0809[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/jayarammunna43-orm/Fine-tune%20Llama%203.2%20on%20Customer%20Support%20Dataset/runs/9ap8ny6q?apiKey=191a8aef97986a5dc03b966f

In [5]:
base_model = "/kaggle/input/llama-3.2/transformers/3b-instruct/1"
new_model = "llama-3.2-3b-it-Ecommerce-ChatBot"
dataset_name = "bitext/Bitext-customer-support-llm-chatbot-training-dataset"

In [6]:
# Set torch dtype and attention implementation
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install -qqq flash-attn
    torch_dtype = torch.bfloat16
    attn_implementation = "flash_attention_2"
else:
    torch_dtype = torch.float16
    attn_implementation = "eager"

In [7]:
# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)
# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:

# load full dataset (no split param → gives DatasetDict)
dataset = load_dataset(dataset_name)

# OR if you only want 1000 samples and train/test split:
dataset = load_dataset(dataset_name, split="train[:1000]")

# create train/test split (e.g., 90/10)
dataset = dataset.train_test_split(test_size=0.1, seed=65)
instruction = """You are a top-rated customer service agent named John."""
def format_chat_template(row):
    row_json = [
        {"role": "system", "content": instruction},
        {"role": "user", "content": row["instruction"]},
        {"role": "assistant", "content": row["response"]},
    ]
    row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False)
    return row

dataset = dataset.map(format_chat_template, num_proc=4)

print(dataset)


README.md: 0.00B [00:00, ?B/s]

Bitext_Sample_Customer_Support_Training_(…):   0%|          | 0.00/19.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/26872 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/100 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['flags', 'instruction', 'category', 'intent', 'response', 'text'],
        num_rows: 900
    })
    test: Dataset({
        features: ['flags', 'instruction', 'category', 'intent', 'response', 'text'],
        num_rows: 100
    })
})


In [9]:
dataset

DatasetDict({
    train: Dataset({
        features: ['flags', 'instruction', 'category', 'intent', 'response', 'text'],
        num_rows: 900
    })
    test: Dataset({
        features: ['flags', 'instruction', 'category', 'intent', 'response', 'text'],
        num_rows: 100
    })
})

In [10]:
import bitsandbytes as bnb

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16 bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

modules = find_all_linear_names(model)

In [11]:
tokenizer.chat_template = None
# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules
)
model, tokenizer = setup_chat_format(model, tokenizer)
model = get_peft_model(model, peft_config)

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [12]:
#Hyperparamter
training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=1,
    eval_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="wandb"
)

In [13]:
# Setting sft parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    peft_config=peft_config,
    args=training_arguments,
)



Adding EOS to train dataset:   0%|          | 0/900 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/900 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/900 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

In [14]:
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 128009, 'bos_token_id': 128000, 'pad_token_id': None}.


Step,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
90,0.3979,0.478041,0.458018,49398.0,0.845654
180,0.484,0.445633,0.418662,98474.0,0.849658
270,0.3327,0.423852,0.452497,146755.0,0.858787
360,0.3558,0.407103,0.428564,193825.0,0.862382
450,0.3524,0.397315,0.408412,241629.0,0.866178




TrainOutput(global_step=450, training_loss=0.48929393582873876, metrics={'train_runtime': 943.0537, 'train_samples_per_second': 0.954, 'train_steps_per_second': 0.477, 'total_flos': 4121796277757952.0, 'train_loss': 0.48929393582873876, 'epoch': 1.0})

In [15]:
wandb.finish()

[34m[1mwandb[0m: updating run metadata
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:             eval/entropy █▂▇▄▁
[34m[1mwandb[0m:                eval/loss █▅▃▂▁
[34m[1mwandb[0m: eval/mean_token_accuracy ▁▂▅▇█
[34m[1mwandb[0m:          eval/num_tokens ▁▃▅▆█
[34m[1mwandb[0m:             eval/runtime ▁█▇▆▇
[34m[1mwandb[0m:  eval/samples_per_second █▁▂▃▂
[34m[1mwandb[0m:    eval/steps_per_second █▁▂▃▂
[34m[1mwandb[0m:            train/entropy █▂▂▂▂▂▂▂▃▂▂▂▂▂▁▃▄▂▂▁▁▂▁▁▁▂▂▁▂▂▂▁▁▂▄▁▁▁▁▁
[34m[1mwandb[0m:              train/epoch ▁▁▁▁▁▂▂▂▂▂▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇██
[34m[1mwandb[0m:        train/global_step ▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
[34m[1mwandb[0m:                       +5 ...
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:             eval/entropy 0.40841
[34m[1mwandb[0m:                eval/loss 0.39732
[34m[1mwandb[0m: eval/mean_token_accuracy 0.86618
[34m[1mwandb[0m:        

In [16]:
messages = [{"role": "system", "content": instruction},
    {"role": "user", "content": "I bought the same item twice, cancel order {{Order Number}}"}]

prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    
inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to("cuda")

outputs = model.generate(**inputs, max_new_tokens=150, num_return_sequences=1)

text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(text.split("assistant")[1])

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.



I'm conscious that you have purchased the same item twice and now need assistance with canceling order {{Order Number}}. I apologize for any inconvenience caused. To cancel your order, please follow these steps:

1. Sign into Your Account: Access our platform by logging into your {{Online Company Portal Info}}.
2. Navigate to Your Orders: Once logged in, go to the '{{Online Order Interaction}}' or '{{Online Order Interaction}}' section.
3. Locate Your Purchase: Look for the purchase associated with the order number {{Order Number}}.
4. Initiate Cancellation: Within the purchase details, you should find an option labeled '{{Online Order Interaction}}'. Please select this option.
5. Confirm the Cancellation: The system may


In [17]:
# Save the fine-tuned model
trainer.model.save_pretrained(new_model)
trainer.model.push_to_hub(new_model, use_temp_dir=False)



README.md: 0.00B [00:00, ?B/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...e-ChatBot/adapter_model.safetensors:   0%|          | 63.7kB / 3.25GB            

CommitInfo(commit_url='https://huggingface.co/abidmunnanc/llama-3.2-3b-it-Ecommerce-ChatBot/commit/d859ecffffebd0376a229541204bd5fbe873d0ee', commit_message='Upload model', commit_description='', oid='d859ecffffebd0376a229541204bd5fbe873d0ee', pr_url=None, repo_url=RepoUrl('https://huggingface.co/abidmunnanc/llama-3.2-3b-it-Ecommerce-ChatBot', endpoint='https://huggingface.co', repo_type='model', repo_id='abidmunnanc/llama-3.2-3b-it-Ecommerce-ChatBot'), pr_revision=None, pr_num=None)