In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install -q -U trl
!pip install -q -U peft
!pip install -q -U transformers
!pip install -q -U accelerate
!pip install huggingface-hub
!pip install -q -U torch --index-url https://download.pytorch.org/whl/cu117
!pip install -q -U -i https://pypi.org/simple/ bitsandbytes
!pip install -q -U datasets

In [None]:
import warnings
warnings.filterwarnings('ignore')

import torch 
from trl import SFTTrainer
from peft import LoraConfig
from datasets import Dataset

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments

In [None]:
# Location of the pretrained checkpoint; passed to `from_pretrained` for both
# the model and the tokenizer further down.
base_model = "/kaggle/input/llama-3/transformers/8b-chat-hf/1"
# Dataset identifier handed to `load_dataset` further down.
dataset_path = "bitext/Bitext-telco-llm-chatbot-training-dataset"

In [None]:
# Single dtype shared by the 4-bit compute setting and the model's torch_dtype.
compute_dtype = torch.float32

# 4-bit NF4 quantization settings, with double quantization switched on.
bnbconfig = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

# Load the base checkpoint with the quantization settings above.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    low_cpu_mem_usage=True,
    torch_dtype=compute_dtype,
    quantization_config=bnbconfig,
)

In [None]:
from trl import setup_chat_format

In [None]:
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Only install TRL's default chat format when the checkpoint does not already
# provide a chat template. Overwriting an existing template replaces the prompt
# format the checkpoint was tuned with and introduces extra special tokens.
# NOTE(review): some TRL releases raise if a template is already present —
# confirm against the installed version.
if getattr(tokenizer, "chat_template", None) is None:
    model, tokenizer = setup_chat_format(model, tokenizer)

In [None]:
from datasets import load_dataset
# Fetch every split of the dataset as a single Dataset and show its summary.
data = load_dataset(path=dataset_path, split='all')
print(data)

# Dump the full dataset to a local CSV file (read back in the next cell).
csv_target = 'df.csv'
data.to_csv(csv_target, index=False)

In [None]:
# Load the exported CSV into a DataFrame and preview its first ten rows.
csv_source = 'df.csv'
df = pd.read_csv(csv_source)
df.head(n=10)

In [None]:
from datasets import load_dataset
# Number of rows to fine-tune on; lower it for quick experiments.
sample_size = 2000

# Shuffle deterministically, then keep at most `sample_size` rows. Clamping to
# the dataset length keeps `select` from failing on a smaller dataset.
dataset = load_dataset(dataset_path, split='all').shuffle(seed=42)
dataset = dataset.select(range(min(sample_size, len(dataset))))

def format_chat_template(row):
    """Attach a chat-formatted ``text`` entry to a dataset row.

    The row's ``instruction`` is used as the user turn and its ``response`` as
    the assistant turn; the two-turn conversation is rendered to a string (not
    token ids) with the notebook-level ``tokenizer``'s chat template.

    Args:
        row: Mapping with ``instruction`` and ``response`` entries. It is
            modified in place.

    Returns:
        The same ``row`` object, now also holding the rendered ``text`` entry.
    """
    conversation = [
        {"role": "user", "content": row["instruction"]},
        {"role": "assistant", "content": row["response"]},
    ]
    rendered = tokenizer.apply_chat_template(conversation, tokenize=False)
    row["text"] = rendered
    return row

# Add the rendered `text` column to every row, using 4 worker processes.
dataset = dataset.map(format_chat_template, num_proc=4)

# Spot-check one rendered conversation.
dataset[3]["text"]

In [None]:

from peft import get_peft_model, prepare_model_for_kbit_training, PeftModel
# Names of the modules that receive LoRA adapters.
lora_target_modules = ['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']

# LoRA hyper-parameters for causal language modelling.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=lora_target_modules,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the loaded model so the adapters described by `peft_config` are added.
model = get_peft_model(model, peft_config)

In [None]:
# Hold out 10% of the rows for evaluation. The seed matches the one used for
# the shuffle so the split is the same on every run.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

In [None]:
# Register '[PAD]' as the tokenizer's padding token, then resize the model's
# token embeddings to the tokenizer's current length.
pad_token_spec = {'pad_token': '[PAD]'}
tokenizer.add_special_tokens(pad_token_spec)
model.resize_token_embeddings(len(tokenizer))

In [None]:
# Trainer hyper-parameters, grouped by concern.
training_arguments = TrainingArguments(
    # Output location and reporting
    output_dir="finetuned_llama_for_telecom_chatbot",
    report_to="none",
    logging_steps=25,
    save_steps=0,
    # Run length and batching
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    # Optimizer and learning-rate schedule
    optim="paged_adamw_32bit",
    learning_rate=5e-4,
    weight_decay=0.001,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    # Numeric precision
    fp16=True,
)

In [None]:
def preprocess_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Every sequence is truncated to 512 tokens and padded up to that same
    length, using the notebook-level ``tokenizer``.

    Args:
        examples: Batch mapping that contains a ``text`` entry.

    Returns:
        Whatever the tokenizer call returns for the batch.
    """
    tokenizer_options = {
        "max_length": 512,
        "truncation": True,
        "padding": "max_length",
    }
    return tokenizer(examples["text"], **tokenizer_options)

# Tokenize the train split first, then the test split, in batched mode.
tokenized_train_dataset, tokenized_test_dataset = (
    dataset[split_name].map(preprocess_function, batched=True)
    for split_name in ("train", "test")
)

In [None]:
# Print the tokenized training split to inspect it before training.
print(tokenized_train_dataset)

In [None]:
# `model` was already wrapped with the LoRA adapters by `get_peft_model`
# earlier in the notebook, so `peft_config` is not passed again here: handing
# the trainer an already-wrapped model together with a config is redundant.
# NOTE(review): some TRL releases reject or re-process that combination —
# confirm against the installed version.
trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
    args=training_arguments,
)

In [None]:
# Start fine-tuning with the trainer built in the previous cell.
trainer.train()

In [None]:
# Look up the instruction stored under row label 11254 of the exported frame.
df.loc[11254, 'instruction']

In [None]:
# Single-turn conversation used to smoke-test the fine-tuned model.
messages = [
    {
        "role": "user",
        "content": "Hi, please tell me about your payment methods"
    }
]

# Switch to inference mode so dropout layers (lora_dropout=0.05 included) are
# inactive while generating.
model.eval()

prompt = tokenizer.apply_chat_template(messages, tokenize=False,
                                       add_generation_prompt=True)

inputs = tokenizer(prompt, return_tensors='pt', padding=True,
                   truncation=True).to("cuda")

# NOTE: max_length bounds prompt and generated tokens together.
outputs = model.generate(**inputs, max_length=150,
                         num_return_sequences=1)

# Decode only the tokens produced after the prompt. Splitting the decoded text
# on the word "assistant" would cut off any reply that itself contains that
# word, and would raise IndexError if the word were absent.
prompt_length = inputs["input_ids"].shape[-1]
text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

print(text)

In [None]:
new_model_name = "finetuned_llama_for_telecom_chatbot"

# Never hard-code the Hugging Face token in the notebook: read it from the
# HF_TOKEN environment variable instead.
# NOTE(review): when the variable is unset, None is passed and the library is
# expected to fall back to locally stored login credentials — confirm.
hf_token = os.environ.get("HF_TOKEN")
trainer.model.push_to_hub(new_model_name, token=hf_token, use_temp_dir=False)