# Fine-Tuning Llama 3 and Using It Locally

 ref: https://www.datacamp.com/tutorial/llama3-fine-tuning-locally

## 1. Fine-Tuning Llama 3

In [1]:
# !pip3 install wandb
# !pip install deepeval
# !pip install ipywidgets
# !pip install langchain_google_genai
# !pip install pandas
# !pip install wandb 
# !pip install trl
# !pip install python-dotenv
# !pip install  transformers==4.40.2
# !pip install -i https://pypi.org/simple/ bitsandbytes
# !pip install --upgrade cuda-python
# !pip install accelerate


In [2]:
# !pip freeze

In [3]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch 
import wandb
from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format

from deepeval.test_case import LLMTestCaseParams
from deepeval.dataset import EvaluationDataset, Golden
from deepeval.metrics import GEval

In [4]:
os.getcwd()

'/fs01/home/ws_aabboud/finetuning-and-alignment'

In [5]:
os.chdir('Deloitte/finetuned/')

In [6]:
%load_ext autoreload
%autoreload 2

In [7]:
from deepeval_gemini import GoogleVertexAI

### Login to HuggingFace Hub

In [8]:
# from huggingface_hub import login
from huggingface_hub import login, interpreter_login
# import google.generativeai as genai
# from dotenv import load_dotenv

# load .env file
# load_dotenv(override=True)
# hf_token = os.getenv("HF_TOKEN_WRITE")


# interpreter_login()





# from kaggle_secrets import UserSecretsClient
# user_secrets = UserSecretsClient()

# hf_token = user_secrets.get_secret("HUGGINGFACE_TOKEN")

# login(token = hf_token)

# wb_token = user_secrets.get_secret("wandb")

# wandb.login(key=wb_token)
# run = wandb.init(
#     project='Fine-tune Llama 3 8B on Medical Dataset', 
#     job_type="training", 
#     anonymous="allow"
# )

## Set model and dataset parameters

In [9]:
os.chdir("../../../../../../")
os.getcwd()

'/'

In [10]:
shared_space="/fs01/projects/fta_teams/deloitte"
base_model = "/fs01/model-weights/Meta-Llama-3-8B-Instruct"
dataset_name = "heliosbrahma/mental_health_conversational_dataset"

# new_model = "/h/ws_aabboud/finetuning-and-alignment/Deloitte/finetuned/models/llama-3-8b-chat-doctor"
merged_model= f"{shared_space}/merged_models/llama-3-8b-chat-doctor"
adapter_model=f"{shared_space}/adapters/llama-3-8b-chat-doctor"

### Set the data type and attention implementation

In [11]:
torch_dtype = torch.float16
attn_implementation = "eager"

## Loading the Model

In [12]:
# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)

# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

## Load Tokenizer

In [13]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model)
model, tokenizer = setup_chat_format(model, tokenizer)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Adding the adapter to the layer

In [14]:
# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
)
model = get_peft_model(model, peft_config)
model.max_output_tokens=1024

## Loading Dataset

In [15]:
dataset_name = "heliosbrahma/mental_health_conversational_dataset"
# dataset_name = "ruslanmv/ai-medical-chatbot"

In [16]:
# train_ds = load_dataset(dataset_name, split='train[:70%]')

In [17]:
# dataset.keys()

In [18]:
dataset = load_dataset(dataset_name, split="all")

In [19]:
dataset["text"][0]

'<<<HUMAN>>>: What is a panic attack? <<<ASSISTANT>>>: Panic attacks come on suddenly and involve intense and often overwhelming fear. Theyâ€™re accompanied by very challenging physical symptoms, like a racing heartbeat, shortness of breath, or nausea. Unexpected panic attacks occur without an obvious cause. Expected panic attacks are cued by external stressors, like phobias. Panic attacks can happen to anyone, but having more than one may be a sign of panic disorder, a mental health condition characterized by sudden and repeated panic attacks.'

In [20]:
def extract_message(text, tag):
    """
    Extracts a message from the given text based on the specified tag.
    
    Parameters:
    text (str): The text containing the messages.
    tag (str): The tag to search for, either '<<<HUMAN>>>: ' or '<<<ASSISTANT>>>: '.
    
    Returns:
    str: The extracted message, or None if the tag is not found.
    """
    try:
        start_tag = f'<<<{tag}>>>: '
        end_tag = ' <<<'
        
        start_index = text.find(start_tag)
        if start_index == -1:
            return None
        
        start_index += len(start_tag)
        end_index = text.find(end_tag, start_index)
        
        if end_index == -1:
            # If end_tag not found, take the rest of the text
            end_index = len(text)
        
        return text[start_index:end_index].strip()
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [21]:
#Importing the dataset
dataset = load_dataset(dataset_name, split="all")
dataset = dataset.shuffle(seed=65).select(range(100)) # Only use 100 samples for quick demo

def format_chat_template(row):
    row_json = [{"role": "user", "content": extract_message(row["text"], 'HUMAN')},
               {"role": "assistant", "content":  extract_message(row["text"], 'ASSISTANT')}]
    row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False)
    return row

dataset = dataset.map(
    format_chat_template,
    num_proc=4,
)

dataset['text'][3]

'<|im_start|>user\nWhat are the side effects of medication?<|im_end|>\n<|im_start|>assistant\nLike other medication, psychiatric medication has its own set of side effects like Drowsiness, Restlessness, Dizziness, Dry mouth, Constipation, Nausea, and Vomiting.\nIt is usually because of the body getting used to medication. It normally takes a month for the body to get used to these drugs. If you are feeling any of the above symptoms, call your doctor immediately.<|im_end|>\n'

### Split the Dataset

In [22]:
dataset = dataset.train_test_split(test_size=0.1)

In [23]:
import re

def extract_user_query(text):
    # Define the regex pattern to capture the text between the 'user\n' and '\nassistant\n'
    pattern = r'(<\|im_start\|>user\n.*?<\|im_end\|>)'
    
    # Search for the pattern in the text
    match = re.search(pattern, text, re.DOTALL)
    
    # If a match is found, return the captured group (the user's query)
    if match:
        return match.group(1).strip()
    else:
        return None
goldens = []
for sentence in dataset["test"]["text"]:
    cleansed_input = extract_user_query(sentence)
    golden_input = Golden(input=cleansed_input, context=[""])
    goldens.append(golden_input)

## Setup Training


In [24]:
training_arguments = TrainingArguments(
    output_dir=adapter_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=1,
    # evaluation_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="wandb"
)

### set up a supervised fine-tuning (SFT) 

In [25]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    # eval_dataset=dataset["test"],
    peft_config=peft_config,
    max_seq_length=512,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


## Setup Gemini Model

In [26]:
# Setup Gemini Model
from langchain_google_genai import ChatGoogleGenerativeAI
generation_config = {
    "temperature": 1,
    "top_p": 0.95,
    "top_k": 64,
    "max_output_tokens": 1024,
    "max_new_tokens":1024,
    "response_mime_type": "text/plain",
    }



model=ChatGoogleGenerativeAI(model="gemini-1.5-pro", temperature=0.7, google_api_key=os.getenv('GEMINI_API_KEY'))

# initiatialize the  wrapper class
vertexai_gemini = GoogleVertexAI(model=model)
print(vertexai_gemini.generate("Write me a joke"))

Why don't scientists trust atoms? 

Because they make up everything! 



In [27]:
# ref: https://docs.confident-ai.com/docs/integrations-huggingface
from deepeval.test_case import LLMTestCaseParams
from deepeval.dataset import EvaluationDataset, Golden
from deepeval.metrics import GEval, AnswerRelevancyMetric
from deepeval.integrations.hugging_face import DeepEvalHuggingFaceCallback

from deepeval import evaluate

from transformers import AutoModelForCausalLM, AutoTokenizer
from deepeval.models.base_model import DeepEvalBaseLLM

# first_golden = Golden(input="...")
# second_golden = Golden(input="...")
eval_dataset = EvaluationDataset(goldens=goldens)
# dataset = EvaluationDataset(goldens=[first_golden, second_golden])
coherence_metric = GEval(
    name="Coherence",
    criteria="Coherence - determine if the actual output is coherent with the input.",
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
    model=vertexai_gemini,
)

In [28]:
from deepeval.integrations.hugging_face import DeepEvalHuggingFaceCallback
tokenizer_args = {
    'max_length': 1024,  # Adjust as needed
    'return_tensors': 'pt',
}


deepeval_hugging_face_callback = DeepEvalHuggingFaceCallback(
    evaluation_dataset=eval_dataset,
    metrics=[coherence_metric],
    trainer=trainer,
    tokenizer_args = tokenizer_args
)


### Train the model

In [29]:
### Add DeepEval Callback ###
#############################
trainer.add_callback(deepeval_hugging_face_callback)



In [30]:
trainer.model.max_new_tokens=1024
trainer.model.max_length=1024

In [31]:
#########################
### Start Fine-tuning ###
#########################
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mahm-abboud[0m ([33mahm-abboud-Deloitte[0m). Use [1m`wandb login --relogin`[0m to force relogin


Output()

Step,Training Loss
1,5.4451
2,5.8231
3,5.6537
4,4.998
5,5.1287
6,4.1088
7,5.0626
8,3.7003
9,3.9436
10,3.1896


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


ValueError: Input length of input_ids is 23, but `max_length` is set to 20. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.

## Model Evaluation

In [32]:

wandb.finish()
model.config.use_cache = True

VBox(children=(Label(value='0.004 MB of 0.021 MB uploaded\r'), FloatProgress(value=0.20408531361320287, max=1.â€¦

0,1
train/epoch,1.0
train/global_step,45.0
train/grad_norm,3.15898
train/learning_rate,0.0
train/loss,1.7767


In [56]:
messages = [
    {
        "role": "user",
        "content": "Hello doctor, I have bad insomnia. How do I get rid of it?"
    }
]

prompt = tokenizer.apply_chat_template(messages, tokenize=False, 
                                       add_generation_prompt=True)

inputs = tokenizer(prompt, return_tensors='pt', padding=True, 
                   truncation=True).to("cuda")

outputs = model.generate(**inputs, 
                         max_length=150, 
                         num_return_sequences=1)

text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(text.split("assistant")[1])

TypeError: BaseChatModel.generate() missing 1 required positional argument: 'messages'

In [55]:
inputs

{'input_ids': tensor([[128256,    882,    198,   9906,  10896,     11,    358,    617,   3958,
          83511,     13,   2650,    656,    358,    636,   9463,    315,    433,
             30, 128257,    198, 128256,  78191,    198]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
       device='cuda:0')}

## Save the Modle Adapter Files

In [19]:
os.getcwd()

'/'

In [20]:
#save to local file
trainer.model.save_pretrained(new_model)




In [91]:
# Push to Huggingface
trainer.model.push_to_hub("ahmabboud/llama-3-8b-chat-doctor")

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ahmabboud/llama-3-8b-chat-doctor/commit/5b0f8c2c1374a89458de8069d080ff46ea2f82e4', commit_message='Upload model', commit_description='', oid='5b0f8c2c1374a89458de8069d080ff46ea2f82e4', pr_url=None, pr_revision=None, pr_num=None)

# 2. Merging Llama 3 8B with the adapter

## Merging the base model with the adapter

In [21]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from peft import PeftModel
import torch
from trl import setup_chat_format
# Reload tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(base_model)

base_model_reload = AutoModelForCausalLM.from_pretrained(
        base_model,
        return_dict=True,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
)

base_model_reload, tokenizer = setup_chat_format(base_model_reload, tokenizer)

# Merge adapter with base model
model = PeftModel.from_pretrained(base_model_reload, new_model)

model = model.merge_and_unload()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

## Model Inference

In [22]:
messages = [{"role": "user", "content": "Hello doctor, I have bad insomnia. How do I get rid of it?"}]

prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
)

outputs = pipe(prompt, max_new_tokens=120, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
print(outputs[0]["generated_text"])

<|im_start|>user
Hello doctor, I have bad insomnia. How do I get rid of it?<|im_end|>
<|im_start|>assistant


The most effective way to treat insomnia is to combine your medication with a regular sleep schedule, and also to use a sleep disorder treatment. To best treat your insomnia, you should also do a sleep disorder treatment treatment, which can be a combination of a sleep disorder treatment and a regular sleep schedule. A sleep disorder can also be caused by a sleep disorder treatment. It is important to use a sleep disorder treatment treatment treatment treatment treatment treatment that is effective and also that is used in combination with a regular sleep schedule. This is because a sleep disorder treatment can also be effective in treating a sleep disorder


## Saving and pushing the merged model

In [23]:
finetuned="/h/ws_aabboud/finetuning-and-alignment/Deloitte/finetuned/models"

In [24]:
model.save_pretrained(f"{finetuned}/llama-3-8b-chat-doctor-merged")
tokenizer.save_pretrained(f"{finetuned}/llama-3-8b-chat-doctor-merged")

('/h/ws_aabboud/finetuning-and-alignment/Deloitte/finetuned/models/llama-3-8b-chat-doctor-merged/tokenizer_config.json',
 '/h/ws_aabboud/finetuning-and-alignment/Deloitte/finetuned/models/llama-3-8b-chat-doctor-merged/special_tokens_map.json',
 '/h/ws_aabboud/finetuning-and-alignment/Deloitte/finetuned/models/llama-3-8b-chat-doctor-merged/tokenizer.json')

# 3. Converting the Model to Llama.cpp GGUF

In [101]:
%cd {finetuned}/llama.cpp
!sed -i 's|MK_LDFLAGS   += -lcuda|MK_LDFLAGS   += -L/usr/local/nvidia/lib64 -lcuda|' Makefile
!LLAMA_CUDA=1 conda run -n base make -j > /dev/null

/fs01/home/ws_aabboud/finetuning-and-alignment/Deloitte/finetuned/llama.cpp
/bin/bash: conda: command not found


In [102]:
!python convert-hf-to-gguf.py {finetuned}/llama-3-8b-chat-doctor-merged/ \
    --outfile {finetuned}/llama-3-8b-chat-doctor.gguf \
    --outtype f16

INFO:hf-to-gguf:Loading model: llama-3-8b-chat-doctor-merged
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Set model parameters
INFO:hf-to-gguf:gguf: context length = 8192
INFO:hf-to-gguf:gguf: embedding length = 4096
INFO:hf-to-gguf:gguf: feed forward length = 14336
INFO:hf-to-gguf:gguf: head count = 32
INFO:hf-to-gguf:gguf: key-value head count = 8
INFO:hf-to-gguf:gguf: rope theta = 500000.0
INFO:hf-to-gguf:gguf: rms norm epsilon = 1e-05
INFO:hf-to-gguf:gguf: file type = 1
INFO:hf-to-gguf:Set model tokenizer
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
INFO:gguf.vocab:Adding 280147 merge(s).
INFO:gguf.vocab:Setting special token type bos to 128256
INFO:gguf.vocab:Setting special token type eos to 128257
INFO:gguf.vocab:Setting special token type pad to 128257
INFO:gguf.vocab:Setting chat_template to {% for message in messages %}{{'<|im_start|>' + message['role'] + '
' + mes

# 4. Quantizing the GGUF model

In [6]:
import os
os.getcwd()

'/fs01/home/ws_aabboud/finetuning-and-alignment/Deloitte/finetuned'

## Quantization

In [8]:
# %cd {finetuned}

! ./llama.cpp/llama-quantize llama-3-8b-chat-doctor.gguf llama-3-8b-chat-doctor-Q4_K_M.gguf Q4_K_M

/bin/bash: /llama.cpp/llama-quantize: No such file or directory


NameError: name 'os' is not defined

# 5. Using the Fine-Tuned Model Locally