##ADV LLMs GROUP ONE

###Customer chat-bot fine-tuning project
### GROUP ONE (ADAM SAM, CATHERINE OKOLIE)

In [None]:
#ADAM SAM, AWELE CATHERINE OKOLIE
#ADV LLM PROJECT - CUSTOMER SUPPORT FINE-TUNING

"""
steps to connect to lambda machine for co-lab
1. ssh -L 8888:localhost:8888 your_username@lab.cs.wit.edu -p 5000X

2. jupyter notebook --NotebookApp.allow_origin='https://colab.research.google.com'
--port=8888 --NotebookApp.port_retries=0

3.
On your PC, go to Google Colab and select "Connect to a local runtime" (top
right corner).

In the URL box, enter the URL with the token you received from the output of
the Jupyter Notebook command in Step 2.
"""

#Install if needed

#!pip install -U accelerate bitsandbytes datasets peft transformers

#!pip install --upgrade bottleneck
#!pip install --upgrade numexpr

#!pip install evaluate
#!pip install trl

Defaulting to user installation because normal site-packages is not writeable


In [3]:
#LOGIN TO HUGGINGFACE TO ACCESS META-LLAMA-2 AI
from huggingface_hub import interpreter_login

interpreter_login() #get your token from hugging face account




    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

Enter your token (input will not be visible): ··········
Add token as git credential? (Y/n) n


In [10]:
#load the llama 2 model
####IGNORE THE WARNINGS!!!####
import os
from transformers import pipeline, LlamaForCausalLM, LlamaTokenizer, TrainingArguments, Trainer, BitsAndBytesConfig,DataCollatorForLanguageModeling
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig, get_peft_model
import torch
from accelerate import Accelerator
import warnings
warnings.filterwarnings('ignore')

######################training configurations ############################
output_dir = "./results_full_v2"

#Quantization
bnb_config = BitsAndBytesConfig( #use bits and bits for 4bit quantization
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)
#LORA (from PEFT library)
LORA_config = LoraConfig(
    r=32, #rank
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    #target_modules=['q_proj', 'v_proj'] #studies found that k isn't neccessary
)
#SFT Trainer
#######################SFT CONFIG##########################
sft_config = SFTConfig(
      per_device_train_batch_size=6,
      gradient_accumulation_steps=8,
      num_train_epochs = 1,
      learning_rate = 2e-5,
      fp16 = True,
      #bf16 = False,
      optim = "adamw_torch",
      weight_decay = 0.0001,
      lr_scheduler_type = "cosine",
      seed = 42,
      output_dir = output_dir,
      #report_to = "tensorboard",
      max_seq_length=512,
      dataset_text_field="text",
      log_level = 'info',
      evaluation_strategy="steps",
      eval_steps=500,
      logging_steps=75,
      #load_best_model_at_end=True, #overfitting safety
      #save_strategy="steps",
      #save_steps =100
      #packing = False
    )

########################################################################

model_name = "meta-llama/Llama-2-7b-hf"

#use LlamaForCasualLM  instead of AutoModelfor compatability
model = LlamaForCausalLM.from_pretrained(
    model_name,
    device_map = "auto", #use gpu
    torch_dtype=torch.float16,
    use_cache=True,
    quantization_config=bnb_config
)
tokenizer = LlamaTokenizer.from_pretrained(model_name)
#tokenizer.use_fast = False #fix eos error
tokenizer.add_eos_token = False
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # fix eos issues.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [12]:
#DATA SET PROCESSING
#dataset: https://huggingface.co/datasets/bitext/Bitext-customer-support-llm-chatbot-training-dataset?library=datasets
#load dataset
from datasets import load_dataset, Dataset
from peft import LoraConfig, PeftConfig, PeftModel, get_peft_model
import pandas as pd

ds = load_dataset("bitext/Bitext-customer-support-llm-chatbot-training-dataset", split="train")

data = Dataset.from_dict(ds[:26000]) #select 26k samples
eval = Dataset.from_dict(ds[26000:26010]) #for rouge eval metrics

#merge the customer query, and response into one text column
#the llama2-7b model has distinct formatting requirements

def merge_features(row):
    row['text'] = f"[INST] {row['instruction']} [/INST] {row['response']}"
    row['text'] = tokenizer.bos_token + row['text'] + tokenizer.eos_token #add bos and eos tokens
    return row

data = data.map(merge_features)
eval = eval.map(merge_features)

print(data['text'][2]) #sample print

split_data = data.train_test_split(test_size=0.2) #split and use testing for evaluations



Map:   0%|          | 0/26000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

<s>[INST] i need help cancelling puchase {{Order Number}} [/INST] I can sense that you're seeking assistance with canceling your purchase with the purchase number {{Order Number}}. I apologize for any inconvenience caused, and I'm here to guide you through the process.

To cancel your purchase, please follow these steps:

1. Log into your account: Visit our {{Online Company Portal Info}} and sign in using your credentials.
2. Locate your order: Once logged in, navigate to the '{{Online Order Interaction}}' or '{{Online Order Interaction}}' section to find the purchase with the order number {{Order Number}}.
3. Initiate the cancellation: Upon finding your purchase, look for the option labeled '{{Online Order Interaction}}' and click on it to initiate the cancellation process.
4. Confirm the cancellation: Our system might ask for confirmation or feedback regarding the cancellation. Kindly provide the necessary information to complete the process.
5. Seek additional support: If you encoun

In [None]:
#apply sftconfig, quantization config, and lora config
#create the trainer instance
trainer = SFTTrainer(
    model=model,
    train_dataset=split_data['train'],
    eval_dataset=split_data['test'],
    peft_config=LORA_config,
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False), #mlm means no masking/casual lm
    args = sft_config
)

#Initialize Accelerator to prepare training for dual GPUs
accelerator = Accelerator()
model, trainer = accelerator.prepare(model, trainer)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
PyTorch: setting up devices
loading file tokenizer.model from cache at /home/sama1/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/tokenizer.model
loading file tokenizer.json from cache at /home/sama1/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /home/sama1/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/special_tokens_map.json
loading file tokenizer_config.json from cache at /home/sama1/.cache/hugging

Converting train dataset to ChatML:   0%|          | 0/20800 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/20800 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/20800 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/20800 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/5200 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/5200 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/5200 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/5200 [00:00<?, ? examples/s]

You have loaded a model on multiple GPUs. `is_model_parallel` attribute will be force-set to `True` to avoid any unexpected behavior such as device placement mismatching.
Using auto half precision backend


In [None]:
#begin training
trainer.train()

The following columns in the training set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: flags, category, response, text, instruction, intent. If flags, category, response, text, instruction, intent are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 20,800
  Num Epochs = 1
  Instantaneous batch size per device = 6
  Total train batch size (w. parallel, distributed & accumulation) = 48
  Gradient Accumulation steps = 8
  Total optimization steps = 433
  Number of trainable parameters = 16,777,216


Step,Training Loss,Validation Loss


Saving model checkpoint to ./results_full_v2/checkpoint-433
loading configuration file config.json from cache at /home/sama1/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/config.json
Model config LlamaConfig {
  "_name_or_path": "meta-llama/Llama-2-7b-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.47.0",
  "use_cache": true,
  "vocab_size": 32000
}

token

TrainOutput(global_step=433, training_loss=10.238226628468714, metrics={'train_runtime': 2744.802, 'train_samples_per_second': 7.578, 'train_steps_per_second': 0.158, 'total_flos': 1.3384179222065971e+17, 'train_loss': 10.238226628468714})

In [None]:
trainer.save_model()
#CMD TO COPY MODEL FROM LAMBDA TO WINDOWS
#RUN IN A NEW WINDOWS CMD PROMPT WINDOW
#scp -P 50004 -r YOURUSER@lab.cs.wit.edu:"~/RESULTS_DIR" "C:\Users\YOURUSER\Downloads"

Saving model checkpoint to ./results_full_v2
loading configuration file config.json from cache at /home/sama1/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/config.json
Model config LlamaConfig {
  "_name_or_path": "meta-llama/Llama-2-7b-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.47.0",
  "use_cache": true,
  "vocab_size": 32000
}

tokenizer config fil

In [17]:
###################DEFAULT EVALUATION ######################
model_path = "results_full" #EVALUATE FINE-TUNED
#model_path = "meta-llama/Llama-2-7b-hf" #EVALUATE BASELINE
####################################################
model = LlamaForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype=torch.float16,
    use_cache=True,
    quantization_config=bnb_config,
    #pretraining_tp = 1
)

trainer = SFTTrainer(
    model=model,
    train_dataset=split_data['train'],
    eval_dataset=split_data['test'],
    peft_config=LORA_config,
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False), #mlm means no masking/casual lm
    args = sft_config
)

trainer.evaluate()

"""
{'eval_loss': 1.0869252681732178,
 'eval_runtime': 52.1089,
 'eval_samples_per_second': 38.381,
 'eval_steps_per_second': 4.798}
 """


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Converting train dataset to ChatML:   0%|          | 0/20800 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/20800 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/20800 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/20800 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/5200 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/5200 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/5200 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/5200 [00:00<?, ? examples/s]

You have loaded a model on multiple GPUs. `is_model_parallel` attribute will be force-set to `True` to avoid any unexpected behavior such as device placement mismatching.
Using auto half precision backend
The following columns in the evaluation set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: instruction, flags, intent, text, category, response. If instruction, flags, intent, text, category, response are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 5200
  Batch size = 8


"\n{'eval_loss': 1.0869252681732178,\n 'eval_runtime': 52.1089,\n 'eval_samples_per_second': 38.381,\n 'eval_steps_per_second': 4.798}\n "

In [15]:

########################evaluate with ROUGE########################
import evaluate

model_name = "Meta-llama/Llama-2-7b-hf"
#baseline model
base_model = LlamaForCausalLM.from_pretrained(
    model_name,
    device_map = "auto", #use gpu
    torch_dtype=torch.float16,
    use_cache=True,
    quantization_config=bnb_config)

#finetuned model
model_path = "results_full"
model_fine_tuned = LlamaForCausalLM.from_pretrained(
    model_path,
    device_map = "auto", #use gpu
    torch_dtype=torch.float16,
    use_cache=True,
    quantization_config=bnb_config,
)

#SPLIT BETWEEN GENERATED RESPONSES
#references is the human reference, predictions are generated responses given the query pertaining to that human reference
#SOURCE: https://mehdirezvandehy.github.io/fine-tune_llama.html
def generate_gen_ref(model, dataset, tokenizer):
    predictions = []
    references = []
    for row in dataset:
        #encoding. inst splits between customer query (before inst) and the response (comes after inst).
        query, response = row.split('[/INST]')
        inputs = tokenizer.encode(query, return_tensors="pt", padding=True, truncation=True)

        outputs = model.generate(inputs, max_length=200, temperature=0.3)

        #decoding the generated output
        decoded_outputs = tokenizer.decode(outputs[0, inputs.shape[1]:], skip_special_tokens=True)

        references.append(response)
        predictions.append(decoded_outputs)

    return references, predictions

references, base_generated = generate_gen_ref(base_model, eval['text'], tokenizer)

references, finetuned_generated = generate_gen_ref(model_fine_tuned, eval['text'], tokenizer)

rouge = evaluate.load('rouge')
print("BASELINE RESULTS:")
results_base = rouge.compute(predictions=base_generated, references=references)
print(results_base)
print("FINETUNE RESULTS:")
results_fine = rouge.compute(predictions=finetuned_generated, references=references)
print(results_fine)

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:  26%|##6       | 2.61G/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


BASELINE RESULTS:
{'rouge1': np.float64(0.1519306595386532), 'rouge2': np.float64(0.037789100511000345), 'rougeL': np.float64(0.12727942757683053), 'rougeLsum': np.float64(0.09751506526769113)}
FINETUNE RESULTS:
{'rouge1': np.float64(0.45823724558028556), 'rouge2': np.float64(0.18525350546161362), 'rougeL': np.float64(0.29990482223457493), 'rougeLsum': np.float64(0.3122169854041125)}


In [16]:
#TABLE VIZ
data = {
    'Metric': ['ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'ROUGE-Lsum'],
    'Baseline': [results_base['rouge1'], results_base['rouge2'], results_base['rougeL'], results_base['rougeLsum']],
    'Finetuned': [results_fine['rouge1'], results_fine['rouge2'], results_fine['rougeL'], results_fine['rougeLsum']]
}

#create a pandas DataFrame from the dictionary
df = pd.DataFrame(data)

#display the DataFrame as a formatted table
display(df)

Unnamed: 0,Metric,Baseline,Finetuned
0,ROUGE-1,0.151931,0.458237
1,ROUGE-2,0.037789,0.185254
2,ROUGE-L,0.127279,0.299905
3,ROUGE-Lsum,0.097515,0.312217


In [4]:
#test text generation
import torch
from transformers import pipeline, LlamaForCausalLM, LlamaTokenizer
import gc

####################choose to load the pretrained model##################
model_path = "results_full" #USE FINE-TINE
#model_path = "meta-llama/Llama-2-7b-hf" #USE BASELINE
#########################################################################
model_fine_tuned = LlamaForCausalLM.from_pretrained(
    model_path,
    device_map = "auto", #use gpu
    torch_dtype=torch.float16,
    use_cache=True,
    #quantization_config=bnb_config,
    #pretraining_tp = 1
)

tokenizer = LlamaTokenizer.from_pretrained(model_path)

#PIPELINE
pipe = pipeline("text-generation",
                model=model_fine_tuned,
                tokenizer=tokenizer,
                temperature=0.3,  #adjust the temperature for randomness
                top_k=15,         #set top_k to pick the top k best tokens
                top_p=0.6,         #nucleus sampling, higher means it considers more tokens
                max_length=270,    #max number of tokens to generate
               )


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


In [8]:
#Cleans a piped prompt by removing double lines and extra whitespace
def clean_prompt(prompt):
  #Replace double newlines with single newlines
  prompt = prompt.replace("\n\n", "\n")
  #Strip leading and trailing whitespace from each line
  prompt = "\n".join([line.strip() for line in prompt.splitlines()])
  return prompt

#PROMPT ENGINEERING
persona = "You are a helpful and respectful customer support assistant.\n"
instruction = "Provide an appropriate response to the customer inquiry.\nAlways answer as helpfully as possible.\nPlease ensure that your responses are socially unbiased and positive in nature.\n"
context = f"You are employed in a shipping company. \n"
data_format = "End the response with \"</s>\".\n"
tone = "The tone should be formal and helpful.\n"

sys = persona + instruction + context + data_format + tone

user_query = "I have an order number, 43332132, where is my order?"

prompt = f"<s>[INST] <<SYS>>{sys}<</SYS>>\n\n{user_query} [/INST]"
prompt = clean_prompt(prompt)
#prompt = user_query
result = pipe(prompt)
print(result[0]['generated_text'])

<s>[INST] <<SYS>>You are a helpful and respectful customer support assistant.
Provide an appropriate response to the customer inquiry.
Always answer as helpfully as possible.
Please ensure that your responses are socially unbiased and positive in nature.
You are employed in a shipping company.
End the response with "</s>".
The tone should be formal and helpful.
<</SYS>>
I have an order number, 43332132, where is my order? [/INST]
I apologize for the inconvenience, but I understand that you are looking for information regarding your order with the order number 43332132. Please allow me to assist you in locating your order.
To check the status of your order, you can visit our website and navigate to the "Order Status" section. Once there, you will be able to enter your order number and view the current status of your order. If you encounter any difficulties or have any further questions, please do not hesitate to let me know.
I hope this information helps you locate your order. If you ha