In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
%%capture
%pip install -U flash_attn
%pip install -U exllamav2
%pip install -U transformers 
%pip install -U datasets 
%pip install -U accelerate 
%pip install -U peft 
%pip install -U trl 
%pip install -U bitsandbytes 
%pip install -U wandb

In [2]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch, wandb
from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format

In [3]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()

hf_token = user_secrets.get_secret("hf")
login(token = hf_token)

In [4]:
wb_token = user_secrets.get_secret("wb")

wandb.login(key=wb_token)
run = wandb.init(
    project='Fine-tune Llama 3.2 on Customer Support Dataset', 
    job_type="training", 
    anonymous="allow"
)

[34m[1mwandb[0m: Currently logged in as: [33manandharajuds[0m ([33manandharajuds-simon-fraser-university[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


In [5]:
base_model = "/kaggle/input/llama-3.2/transformers/3b-instruct/1"
new_model = "llama-3.2-3b-it-Ecommerce-ChatBot"
dataset_name = "bitext/Bitext-customer-support-llm-chatbot-training-dataset"

In [6]:
# Set torch dtype and attention implementation
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install -qqq flash-attn
    torch_dtype = torch.bfloat16
    attn_implementation = "flash_attention_2"
else:
    torch_dtype = torch.float16
    attn_implementation = "eager"
attn_implementation

'eager'

In [7]:
# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)
# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
#Importing the dataset
dataset = load_dataset(dataset_name, split="train")
dataset = dataset.shuffle(seed=65).select(range(1000)) # Only use 1000 samples for quick demo
instruction = """You are a top-rated customer service agent named John. 
    Be polite to customers and answer all their questions.
    """
def format_chat_template(row):
    
    row_json = [{"role": "system", "content": instruction },
               {"role": "user", "content": row["instruction"]},
               {"role": "assistant", "content": row["response"]}]
    
    row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False)
    return row

dataset = dataset.map(
    format_chat_template,
    num_proc= 4,
)


README.md:   0%|          | 0.00/11.9k [00:00<?, ?B/s]

(…)t_Training_Dataset_27K_responses-v11.csv:   0%|          | 0.00/19.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/26872 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [9]:
dataset['text'][3]

"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a top-rated customer service agent named John. \n    Be polite to customers and answer all their questions.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\ncould you tell me about the options for shipping?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nOf course, I'd be delighted to provide you with information about our shipping options! Here are the various choices we offer:\n\n1. Standard Shipping: This option typically arrives within {{Date Range}} business days, catering to non-urgent items and ensuring a cost-effective delivery.\n\n2. Expedited Shipping: If you're looking for a faster option, choose expedited shipping. Your items will reach you within {{Date Range}} business days, offering a balance between speed and affordability.\n\n3. Overnight Shipping: For urgent needs, we have overnight shipping. This ensures your items are delivered on the next business day, offering the highest level o

In [10]:
import bitsandbytes as bnb

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16 bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

modules = find_all_linear_names(model)

In [11]:
# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules
)
tokenizer.chat_template = None
model, tokenizer = setup_chat_format(model, tokenizer)
model = get_peft_model(model, peft_config)

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [12]:
#Hyperparamter
training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=1,
    eval_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="wandb"
)

In [13]:
# Setting sft parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    eval_dataset=dataset,
    peft_config=peft_config,
    #max_seq_length= 512,
    #dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    #packing= False,
)

  trainer = SFTTrainer(


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [16]:
trainer.train()



Step,Training Loss,Validation Loss
100,0.8401,0.797782
200,0.5965,0.684234
300,0.7015,0.604993
400,0.4893,0.55596
500,0.4669,0.535601


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


TrainOutput(global_step=500, training_loss=0.7579015807509423, metrics={'train_runtime': 1277.1724, 'train_samples_per_second': 0.783, 'train_steps_per_second': 0.391, 'total_flos': 3072774910715904.0, 'train_loss': 0.7579015807509423, 'epoch': 1.0})

In [17]:
wandb.finish()

0,1
eval/loss,█▅▃▂▁
eval/runtime,█▃▆▃▁
eval/samples_per_second,▁▆▃▆█
eval/steps_per_second,▁▆▃▆█
train/epoch,▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▆▆▆▇▇▇████
train/grad_norm,▄█▅▅▄▃▃▄▁▂▅▄▅▁▃▂▄▁▁▃▃▃▃▃▂▂▄▅▂▂▂▂▂▅▂▄▃▂▃▅
train/learning_rate,████▇▇▇▇▇▇▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁
train/loss,█▇▇█▇▆▇▆▅▃▄▄▄▄▆▅▅▃▃▃▂▆▄▂▂▂▂▅▂▄▆▁▂▅▃▂▂▂▂▂

0,1
eval/loss,0.5356
eval/runtime,173.1565
eval/samples_per_second,5.775
eval/steps_per_second,5.775
total_flos,3072774910715904.0
train/epoch,1.0
train/global_step,500.0
train/grad_norm,1.10789
train/learning_rate,0.0
train/loss,0.4669


In [18]:
messages = [{"role": "system", "content": instruction},
    {"role": "user", "content": "I bought the same item twice, cancel order {{Order Number}}"}]

prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    
inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to("cuda")

outputs = model.generate(**inputs, max_new_tokens=150, num_return_sequences=1)

text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(text.split("assistant")[1])


I'm sorry to hear that you've purchased the same item twice and would like to cancel order number {{Order Number}}. I understand that this might be a mistake or an error, and I'm here to assist you in resolving this issue. To proceed with the cancellation, I'll need some additional information from you. Could you please provide me with your full name, email address, and the order number again? With these details, I'll be able to locate the relevant information and assist you further. Thank you for bringing this to my attention, and I apologize for any inconvenience caused.


In [19]:
# Save the fine-tuned model
trainer.model.save_pretrained(new_model)
trainer.model.push_to_hub(new_model, use_temp_dir=False)



README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/1.67G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Anandharaju/llama-3.2-3b-it-Ecommerce-ChatBot/commit/fafb28c57f71324e640e84a8eb65d1574ab99943', commit_message='Upload model', commit_description='', oid='fafb28c57f71324e640e84a8eb65d1574ab99943', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Anandharaju/llama-3.2-3b-it-Ecommerce-ChatBot', endpoint='https://huggingface.co', repo_type='model', repo_id='Anandharaju/llama-3.2-3b-it-Ecommerce-ChatBot'), pr_revision=None, pr_num=None)