### Installing Required Libraries


In [None]:
# Author https://jair-neto.medium.com/best-way-to-fine-tune-your-llm-using-a-t4-gpu-part-3-3-71c7d0514aa6
!pip install accelerate
!pip install peft
!pip install bitsandbytes
!pip install transformers
!pip install trl
!pip install ipywidgets

### Loading Required Libraries

In [None]:
import os
import re
import pandas as pd

from random import randrange
from functools import partial
import torch
from datasets import load_dataset
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          HfArgumentParser,
                          Trainer,
                          TrainingArguments,
                          DataCollatorForLanguageModeling,
                          EarlyStoppingCallback,
                          pipeline,
                          logging,
                          set_seed)

import bitsandbytes as bnb
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel, AutoPeftModelForCausalLM
from trl import SFTTrainer
from tqdm import tqdm
from huggingface_hub import notebook_login
from typing import List, Tuple, Union
from datasets import DatasetDict

In [None]:
# Authenticate this notebook session with the Hugging Face Hub. Later cells
# pass use_auth_token=True when loading the tokenizer and when pushing to the Hub.
notebook_login()

### Functions


In [None]:
# https://medium.com/@kshitiz.sahay26/fine-tuning-llama-2-for-news-category-prediction-a-step-by-step-comprehensive-guide-to-fine-tuning-48c06dee28a9
def get_qlora_configs(load_in_4bit: bool,
                      bnb_4bit_use_double_quant: bool,
                      bnb_4bit_quant_type: str,
                      bnb_4bit_compute_dtype: torch.dtype,
                      r: int,
                      lora_alpha: int,
                      target_modules: Union[List[str],str],
                      lora_dropout: float,
                      bias: str,
                      task_type: str) -> Tuple[BitsAndBytesConfig, LoraConfig]:
    """
    Create the configurations needed to use QLoRA techniques.

    Args:
        load_in_4bit (bool): This flag is used to enable 4-bit quantization by replacing the Linear layers with FP4/NF4 layers from
            `bitsandbytes`.
        bnb_4bit_use_double_quant (bool): This flag is used for nested quantization where the quantization constants from the first quantization are
            quantized again.
        bnb_4bit_quant_type (str): This sets the quantization data type in the bnb.nn.Linear4Bit layers. Options are FP4 and NF4 data types
            which are specified by `fp4` or `nf4`.
        bnb_4bit_compute_dtype (torch.dtype): This sets the computational type which might be different than the input type. For example, inputs might be
            fp32, but computation can be set to bf16 for speedups.
        r (int): Lora attention dimension.
        lora_alpha (int): The alpha parameter for Lora scaling.
        target_modules (Union[List[str],str]): The names of the modules to apply Lora to.
        lora_dropout (float): The dropout probability for Lora layers.
        bias (str): Bias type for Lora. Can be 'none', 'all' or 'lora_only'. If 'all' or 'lora_only', the
            corresponding biases will be updated during training. Be aware that this means that, even when disabling
            the adapters, the model will not produce the same output as the base model would have without adaptation.
        task_type (str): The task type for the model.

    Returns:
        Tuple[BitsAndBytesConfig, LoraConfig]: The configuration for BitsAndBytes and Lora, in that order.
    """

    bnb_config = BitsAndBytesConfig(
        load_in_4bit = load_in_4bit,
        bnb_4bit_use_double_quant = bnb_4bit_use_double_quant,
        bnb_4bit_quant_type = bnb_4bit_quant_type,
        bnb_4bit_compute_dtype = bnb_4bit_compute_dtype,
    )

    lora_config = LoraConfig(
        r = r,
        lora_alpha = lora_alpha,
        target_modules = target_modules,
        lora_dropout = lora_dropout,
        bias = bias,
        task_type = task_type,
    )

    # The original ended with a bare tuple expression, so callers received None.
    return bnb_config, lora_config


def load_model_tokenizer(model_name: str, bnb_config: BitsAndBytesConfig) -> Tuple[AutoModelForCausalLM, AutoTokenizer]:
    """
    Fetch a quantized causal language model and its tokenizer from the Hugging Face Hub.

    Args:
        model_name (str): Hub identifier of the checkpoint to load.
        bnb_config (BitsAndBytesConfig): Quantization settings forwarded to the model loader.

    Returns:
        Tuple[AutoModelForCausalLM, AutoTokenizer]: The loaded model and a tokenizer whose
            padding token has been set to its end-of-sequence token.
    """
    # No device_map is passed (it was commented out), so placement is left to library defaults.
    model_kwargs = dict(
        quantization_config=bnb_config,
        torch_dtype="auto",
        trust_remote_code=True,
    )
    quantized_model = AutoModelForCausalLM.from_pretrained(model_name, **model_kwargs)

    tok = AutoTokenizer.from_pretrained(
        model_name,
        use_auth_token=True,
        trust_remote_code=True,
    )
    # Reuse the end-of-sequence token for padding.
    tok.pad_token = tok.eos_token

    return quantized_model, tok




def format_phi2(row):
    """
    Render one dataset row as a single question/answer training string.

    Args:
        row: Mapping-like record providing the 'Context' (question) and 'Response' (answer) fields.

    Returns:
        str: The text "### Question: <Context>\\n ### Answer: <Response>".
    """
    # An "[INST] ... [/INST]" template was tried earlier; the "###" template is the one in use.
    return f"### Question: {row['Context']}\n ### Answer: {row['Response']}"

def get_max_length(model: "AutoModelForCausalLM", max_length_default_value: int = 1024) -> int:
    """
    Get the maximum sequence length declared in the model's configuration.

    The attributes `n_positions`, `max_position_embeddings` and `seq_length` of
    `model.config` are checked in that order; the first one holding a truthy value wins.

    Args:
        model (AutoModelForCausalLM): A loaded model object exposing a `config` attribute
            (not a model name string).
        max_length_default_value (int): Value returned when none of the attributes is set.

    Returns:
        int: The maximum length found in the config, or `max_length_default_value`.
    """

    for length_setting in ("n_positions", "max_position_embeddings", "seq_length"):
        max_length = getattr(model.config, length_setting, None)
        # Missing, None and 0 are all treated as "not set".
        if max_length:
            return max_length

    return max_length_default_value


def tokenize_batch(batch, tokenizer, max_length):
    """
    Tokenize the "text" field of a dataset batch.

    Args:
        batch: Dataset batch; its "text" entry is passed to the tokenizer.
        tokenizer: Callable model tokenizer.
        max_length: Maximum number of tokens to emit per example; longer inputs are truncated.

    Returns:
        Whatever the tokenizer returns for the batch.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, max_length=max_length, truncation=True)
    return encoded


def preprocess_dataset(tokenizer: AutoTokenizer,
                       max_length: int,
                       seed: int,
                       columns_to_remove: List[str],
                       dataset: DatasetDict) -> DatasetDict:
    """
    Tokenize, length-filter and shuffle a dataset so it is ready for training.

    Args:
        tokenizer (AutoTokenizer): Tokenizer applied to every batch.
        max_length (int): Token limit used both for truncation and for filtering.
        seed (int): Seed passed to the final shuffle.
        columns_to_remove (List[str]): Columns dropped while mapping the tokenizer over the data.
        dataset (DatasetDict): The Hugging Face dataset to process.

    Returns:
        DatasetDict: The tokenized, filtered and shuffled dataset.
    """
    tokenize = partial(tokenize_batch, max_length = max_length, tokenizer = tokenizer)

    def _is_below_limit(sample) -> bool:
        # Strict comparison: samples with exactly max_length tokens are dropped as well.
        return len(sample["input_ids"]) < max_length

    tokenized = dataset.map(tokenize, batched = True, remove_columns = columns_to_remove)
    within_limit = tokenized.filter(_is_below_limit)

    return within_limit.shuffle(seed = seed)


def find_all_linear_names(model: AutoModelForCausalLM) -> list:
    """
    Find modules to apply LoRA to.

    Every `bnb.nn.Linear4bit` module in the model contributes the last component
    of its dotted name (e.g. "a.b.q_proj" -> "q_proj"). The output head `lm_head`
    is excluded wherever it sits in the module tree.

    Args:
        model (AutoModelForCausalLM): The model that will be fine-tuned.

    Returns:
        list: Sorted list of unique module names that LoRA will be applied to.
    """

    lora_module_names = {
        name.split('.')[-1]
        for name, module in model.named_modules()
        if isinstance(module, bnb.nn.Linear4bit)
    }

    # Compare the leaf name, not the full dotted path, so a nested "*.lm_head"
    # is excluded too (the original only skipped a top-level "lm_head").
    lora_module_names.discard('lm_head')

    # Sort so the printed and returned order does not depend on set iteration order.
    target_modules = sorted(lora_module_names)

    print(f"LoRA module names: {target_modules}")

    return target_modules


def create_peft_config(r: int, lora_alpha: int, target_modules, lora_dropout: float, bias: str, task_type: str) -> LoraConfig:
    """
    Build the LoRA (Parameter-Efficient Fine-Tuning) configuration.

    Args:
        r (int): Lora attention dimension.
        lora_alpha (int): The alpha parameter for Lora scaling.
        target_modules: The names of the modules to apply Lora to.
        lora_dropout (float): The dropout probability for Lora layers.
        bias (str): Bias type for Lora. Can be 'none', 'all' or 'lora_only'. If 'all' or 'lora_only', the
            corresponding biases will be updated during training. Be aware that this means that, even when disabling
            the adapters, the model will not produce the same output as the base model would have without adaptation.
        task_type (str): The task type for the model.

    Returns:
        LoraConfig: A configuration object built from the given values.
    """
    lora_settings = {
        "r": r,
        "lora_alpha": lora_alpha,
        "target_modules": target_modules,
        "lora_dropout": lora_dropout,
        "bias": bias,
        "task_type": task_type,
    }

    return LoraConfig(**lora_settings)


def preprare_model_for_fine_tune(model: AutoModelForCausalLM,
                                 lora_r: int,
                                 lora_alpha: int,
                                 lora_dropout: float,
                                 bias: str,
                                 task_type: str) -> AutoModelForCausalLM:
    """
    Wrap a quantized model with LoRA adapters so it can be fine-tuned.

    Args:
        model (AutoModelForCausalLM): The model that will be fine-tuned.
        lora_r (int): Lora attention dimension.
        lora_alpha (int): The alpha parameter for Lora scaling.
        lora_dropout (float): The dropout probability for Lora layers.
        bias (str): Bias type for Lora. Can be 'none', 'all' or 'lora_only'. If 'all' or 'lora_only', the
            corresponding biases will be updated during training. Be aware that this means that, even when disabling
            the adapters, the model will not produce the same output as the base model would have without adaptation.
        task_type (str): The task type for the model.

    Returns:
        AutoModelForCausalLM: The PEFT-wrapped model, with `config.use_cache` set to False.
    """
    # Gradient checkpointing reduces memory usage during fine-tuning.
    model.gradient_checkpointing_enable()

    # Let PEFT prepare the quantized model for k-bit training.
    kbit_ready_model = prepare_model_for_kbit_training(model)

    # LoRA is attached to the modules reported by find_all_linear_names.
    lora_targets = find_all_linear_names(kbit_ready_model)
    adapter_config = create_peft_config(
        r = lora_r,
        lora_alpha = lora_alpha,
        target_modules = lora_targets,
        lora_dropout = lora_dropout,
        bias = bias,
        task_type = task_type,
    )
    peft_model = get_peft_model(kbit_ready_model, adapter_config)

    # Presumably disabled because caching conflicts with gradient checkpointing - TODO confirm.
    peft_model.config.use_cache = False

    return peft_model


def free_memory(model: AutoModelForCausalLM, trainer: Trainer) -> None:
    """
    Free memory for merging weights.

    Drops this function's references to the model and trainer, runs a garbage
    collection pass, then asks PyTorch to release its cached CUDA memory.

    Note: `del` only removes the local names. If the caller still holds its own
    references (e.g. notebook-level `model` / `trainer` variables), the objects
    stay alive; delete or rebind those names as well to actually reclaim memory.

    Args:
        model (AutoModelForCausalLM): Pre-trained Hugging Face model
        trainer (Trainer): Trainer
    """
    import gc

    del model
    del trainer

    # Collect unreachable objects first so that empty_cache() can hand their blocks back.
    gc.collect()
    torch.cuda.empty_cache()


def save_metrics(train_result, trainer: Trainer) -> None:
    """
    Log, persist and print the training metrics.

    Args:
        train_result: Object returned by `trainer.train()`; its `metrics` attribute is used.
        trainer (Trainer): Trainer used to log and save the metrics and to save its state.
    """
    split = "train"
    train_metrics = train_result.metrics

    trainer.log_metrics(split, train_metrics)
    trainer.save_metrics(split, train_metrics)
    trainer.save_state()

    print(train_metrics)


def save_model(model: AutoModelForCausalLM, output_dir: str) -> None:
    """
    Write the model to disk with `save_pretrained`.

    Args:
        model (AutoModelForCausalLM): The model to save.
        output_dir (str): Destination directory; created if it does not exist yet.
    """
    # Make sure the destination exists before saving into it.
    os.makedirs(output_dir, exist_ok = True)
    model.save_pretrained(output_dir)

    confirmation = f"Model saved in {output_dir}"
    print(confirmation)


def fine_tune(model: AutoModelForCausalLM, trainer: Trainer, output_dir: str) -> None:
    """
    Run training, then store the metrics and the trained model.

    Args:
        model (AutoModelForCausalLM): The model to fine-tune. Not used by the body:
            the model that gets saved is `trainer.model`.
        trainer (Trainer): The trainer with the training configuration.
        output_dir (str): The output directory to save the model.
    """
    print("Training...")

    outcome = trainer.train()
    save_metrics(outcome, trainer)

    trained_model = trainer.model
    save_model(trained_model, output_dir)
    # Memory is not released here; free_memory(model, trainer) is left to the caller.

def print_trainable_parameters(model, use_4bit = False):
    """
    Prints the number of trainable parameters in the model.

    Args:
        model: PEFT model (anything exposing `named_parameters()`).
        use_4bit (bool): When True, the trainable-parameter count is halved
            (integer division) before it is reported.
    """

    trainable_params = 0
    all_param = 0

    for _, param in model.named_parameters():
        num_params = param.numel()
        # Fall back to `ds_numel` when the parameter reports zero elements
        # (presumably a DeepSpeed-partitioned parameter - TODO confirm).
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel
        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params

    if use_4bit:
        # Integer division keeps the count an int; `/=` produced a float and the
        # `:,d` format below then raised ValueError.
        trainable_params //= 2

    # Avoid ZeroDivisionError for a model without parameters.
    trainable_percentage = 100 * trainable_params / all_param if all_param else 0.0

    print(
        f"All Parameters: {all_param:,d} || Trainable Parameters: {trainable_params:,d} || Trainable Parameters %: {trainable_percentage}"
    )




### Parameters

In [None]:
################################################################################
# transformers parameters
################################################################################

# The pre-trained model from the Hugging Face Hub to load and fine-tune (string)
# model_name = "meta-llama/Llama-2-7b-hf"
model_name = "YuvrajSingh9886/medicinal-QnA-phi2-custom"

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading (bool)
load_in_4bit = True

# Activate nested quantization for 4-bit base models (double quantization) (bool)
bnb_4bit_use_double_quant = True

# Quantization type (fp4 or nf4) (string)
bnb_4bit_quant_type = "nf4"

# Compute data type for 4-bit base models
bnb_4bit_compute_dtype = torch.bfloat16

################################################################################
# Dataset parameters
################################################################################

# Number of examples to train (int)
# NOTE(review): not referenced in the visible part of this notebook.
number_of_training_examples = 3512

# Number of examples to use to validate (int)
# NOTE(review): not referenced in the visible part of this notebook.
number_of_valid_examples = 200

# Dataset Name (string)
dataset_name = "Amod/mental_health_counseling_conversations"

################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension (int)
lora_r = 64

# Alpha parameter for LoRA scaling (int)
lora_alpha = 16

# Dropout probability for LoRA layers (float)
lora_dropout = 0.05

# Bias (string)
bias = "none"

# Task type (string)
task_type = "CAUSAL_LM"

# Random seed (int)
seed = 33

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored (string)
output_dir = "results"

# Batch size per GPU for training (int)
per_device_train_batch_size = 6

# Number of update steps to accumulate the gradients for (int)
gradient_accumulation_steps = 2

# Initial learning rate (AdamW optimizer) (float)
learning_rate = 2e-4

# Optimizer to use (string)
optim = "paged_adamw_8bit"

# Number of training epochs (int)
num_train_epochs = 4

# Linear warmup steps from 0 to learning_rate (int)
warmup_steps = 10

# Enable fp16/bf16 training (set bf16 to True with an A100) (bool)
# NOTE(review): the Trainer cell derives fp16/bf16 from
# torch.cuda.is_bf16_supported() and does not read this value.
fp16 = True

# Log every X updates steps (int)
logging_steps = 100

# L2 regularization (prevents overfitting) (float)
weight_decay=0.0

# Checkpoint save strategy (string)
save_strategy="epoch"

Finally, we will call the above functions to get `model` and `tokenizer` objects.

In [None]:
# Build the bitsandbytes quantization configuration from the parameters above,
# then load the quantized model and its tokenizer from the Hugging Face Hub.

bnb_config = BitsAndBytesConfig(
        load_in_4bit = load_in_4bit,
        bnb_4bit_use_double_quant = bnb_4bit_use_double_quant,
        bnb_4bit_quant_type = bnb_4bit_quant_type,
        bnb_4bit_compute_dtype = bnb_4bit_compute_dtype,
    )

model, tokenizer = load_model_tokenizer(model_name, bnb_config)

In [None]:
# Load the 'train' split of the dataset named in the parameters cell.
dataset = load_dataset(dataset_name, split='train')


In [None]:
# pandas is already imported at the top of the notebook; this re-import is redundant but harmless.
import pandas as pd
# Convert the dataset to a DataFrame so the prompt text can be built row by row.
df = pd.DataFrame(dataset)
df

In [None]:
# Add a 'text' column holding the "### Question: ... ### Answer: ..." string for each row.
df['text'] = df.apply(format_phi2, axis=1)

In [None]:
df

In [None]:
# Keep only the formatted 'text' column and write it to CSV (without the index column).
new_df = df[['text']]
new_df.to_csv('formatted_dataset.csv', index=False)

In [None]:
# Reload the formatted CSV written above as a Hugging Face dataset.
updated_dataset = load_dataset('csv', data_files='formatted_dataset.csv', split='train')

In [None]:
updated_dataset

In [None]:
# Shuffle with a fixed seed (42 here; the `seed` parameter is used by preprocess_dataset).
shuffled_dataset = updated_dataset.shuffle(seed=42)

### Preprocessing Dataset


In [None]:
# Token limit read from the model config (get_max_length falls back to 1024).
max_length = get_max_length(model)
# The raw 'text' column is dropped once it has been tokenized.
columns_to_remove = ['text']
# Tokenize, filter by length and shuffle again (this time with `seed`).
preprocessed_dataset = preprocess_dataset(tokenizer, max_length, seed, columns_to_remove, shuffled_dataset)

We can now look at the preprocessed dataset, which contains tokens or IDs.

In [None]:
print(preprocessed_dataset)

In [None]:
# 70/30 split into training and evaluation subsets.
train_size = int(0.7 * len(preprocessed_dataset))
test_size = len(preprocessed_dataset) - train_size
# Seed the split so that re-running the notebook yields the same train/eval
# partition; without a generator the split changes on every run.
train_dataset, test_dataset = torch.utils.data.random_split(
    preprocessed_dataset,
    [train_size, test_size],
    generator = torch.Generator().manual_seed(seed),
)

In [None]:
# print(test_dataset[0])

In [None]:
len(train_dataset)

### Fine-tuning the Pre-trained Model



The cells below prepare the model and the `Trainer`, then call the `fine_tune` function to fine-tune the pre-trained model on our preprocessed mental health counseling conversations dataset.

In [None]:
# Parameter counts of the loaded model, before the LoRA adapters are attached.
print_trainable_parameters(model)

# Enable gradient checkpointing, prepare for k-bit training and wrap the model with LoRA.
model = preprare_model_for_fine_tune(model,
                                     lora_r,
                                     lora_alpha,
                                     lora_dropout,
                                     bias,
                                     task_type)

# Training parameters
trainer = Trainer(
    model = model,
    train_dataset = train_dataset,
    eval_dataset = test_dataset,
#     callbacks=[EarlyStoppingCallback(early_stopping_patience = 2)],
    args = TrainingArguments(
        # Evaluate on eval_dataset at the end of every epoch.
        evaluation_strategy="epoch",
#         save_stratergy=True
        per_device_train_batch_size = per_device_train_batch_size,
        gradient_accumulation_steps = gradient_accumulation_steps,
        warmup_steps = warmup_steps,
        learning_rate = learning_rate,
        # Use bf16 when the GPU supports it, otherwise fp16 (the `fp16` parameter above is not read).
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        do_eval = True,
        lr_scheduler_type='cosine',
        logging_steps = logging_steps,
        output_dir = output_dir,
        optim = optim,
        num_train_epochs=num_train_epochs,
        weight_decay=weight_decay,
#         load_best_model_at_end = True
        save_strategy=save_strategy
    ),
    # mlm = False: collate for causal (not masked) language modelling.
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm = False)
)

In [None]:
# Train, then save the metrics and the trained model (trainer.model) into output_dir.
fine_tune(model, trainer, output_dir)

### Merging Weights & Pushing to Hugging Face



In [None]:

# Reload the saved adapter from output_dir in bfloat16.
# NOTE(review): no device_map / device is passed here - confirm where the model ends up.
model = AutoPeftModelForCausalLM.from_pretrained(output_dir, torch_dtype = torch.bfloat16)

# Fold the LoRA weights into the base model and drop the adapter wrappers.
model = model.merge_and_unload()


# Save the merged model as safetensors.
output_merged_dir = "results/medicinal_qna_phi2/final_merged_checkpoint"
os.makedirs(output_merged_dir, exist_ok = True)
model.save_pretrained(output_merged_dir, safe_serialization = True)

# Store the base model's tokenizer next to the merged weights.
# NOTE(review): loaded without trust_remote_code, unlike load_model_tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.save_pretrained(output_merged_dir)

In [None]:

# Hub repository that receives the merged model.
# NOTE(review): identical to `model_name` in the parameters cell, so the push
# targets the same repository the base weights were loaded from - confirm this is intended.
new_model = "YuvrajSingh9886/medicinal-QnA-phi2-custom"

In [None]:
# Push the merged fine-tuned model and its tokenizer to the Hugging Face Hub
# repository named by `new_model`, using the stored login token.
model.push_to_hub(new_model, use_auth_token = True)
tokenizer.push_to_hub(new_model, use_auth_token = True)

# Inference

In [None]:
# Run garbage-free cleanup of cached CUDA memory. The notebook-level `model` and
# `trainer` names are not removed by this call; the inference cell below still uses `model`.
free_memory(model, trainer)

In [None]:
# Fine-tuned model name on Hugging Face Hub
new_model = "YuvrajSingh9886/medicinal-QnA-phi2-custom"

## Loading the fine-tuned model

In [None]:
# Alternative: load the pushed model from the Hub instead of reusing the in-memory one.
# torch.set_default_device('cuda')
# model = AutoModelForCausalLM.from_pretrained(new_model, trust_remote_code=True, torch_dtype="auto")
# tokenizer = AutoTokenizer.from_pretrained(new_model, trust_remote_code=True, torch_dtype="auto")
prompt = "I have been feeling more and more down for over a month. I have started having trouble sleeping due to panic attacks, but they are almost never triggered by something that I know of."
# pipe = pipeline(task="text-generation", model=new_model, tokenizer=tokenizer)
# result = pipe(f"### Instruction: {prompt}")
# print(result[0]['generated_text'])

# config =    {
#             # "max_new_tokens": 512,
#             "temperature": 0.5,
#             "top_p": 1,
#             "top_k":0.8
#       }
# Move the inputs to whatever device the model lives on. The merged model above
# is loaded without any device placement, so a hard-coded 'cuda' can put the
# inputs on a different device than the weights and make generate() fail.
tokens = tokenizer(f"### Question: {prompt}", return_tensors='pt').to(model.device)
tokenizer.pad_token = tokenizer.eos_token
# Beam search over 5 beams, forbidding repeated 2-grams.
outputs = model.generate(**tokens, max_new_tokens=1024, num_beams=5,
    no_repeat_ngram_size=2,
    early_stopping=True
                         )
print(tokenizer.batch_decode(outputs,skip_special_tokens=True)[0])

In [None]:
# Display the model configuration (the original `model.congfif` was a typo
# that raised AttributeError).
model.config