# Instruction fine-tuning a Phi-3-mini model on Python code generation using QLoRA

## Installing and loading the libraries

In [None]:
%pip install -qqq --upgrade bitsandbytes transformers peft accelerate datasets trl flash_attn

In [None]:
%pip install -qqq huggingface_hub
%pip install -qqq python-dotenv

In [None]:
%pip install -qqq wandb

In [None]:
%pip install -qqq absl-py nltk evaluate

In [None]:
%pip list | grep transformers.

In [None]:
%pip install -qqq ipywidgets

## Importing the libraries

In [None]:
from random import randrange

import torch
from datasets import load_dataset

from peft import LoraConfig, prepare_model_for_kbit_training, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    set_seed,
    pipeline
)
from trl import SFTTrainer

## Setting Global Parameters

In [None]:
# Global configuration for the whole notebook: model / dataset identifiers,
# 4-bit quantization settings and LoRA hyper-parameters.

# The model that you want to train from the Hugging Face hub
model_id = "microsoft/Phi-3-mini-4k-instruct"
# Alias of `model_id`; later cells load the tokenizer/model through this name
model_name = model_id
# The instruction dataset to use
dataset_name = "iamtarun/python_code_instructions_18k_alpaca"
#dataset_name = "HuggingFaceH4/CodeAlpaca_20K"
# Dataset split
dataset_split= "train"
# Fine-tuned model name
new_model = "phi3-mini-4k-qlora-pycode-18k"
# Huggingface repository
hf_model_repo="alexrodpas/"+new_model
# Load the entire model on the GPU 0
device_map = {"": 0}

################################################################################
# bitsandbytes parameters
################################################################################
# Activate 4-bit precision base model loading
use_4bit = True
# Compute dtype for 4-bit base models
# NOTE: the model-loading cell passes `compute_dtype` (chosen from the GPU
# capabilities) to BitsAndBytesConfig, so this string is informational only.
bnb_4bit_compute_dtype = "bfloat16"
# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"
# Activate nested quantization for 4-bit base models (double quantization)
use_double_quant = True


################################################################################
# LoRA parameters
################################################################################
# LoRA attention dimension
lora_r = 16
# Alpha parameter for LoRA scaling
lora_alpha = 16
# Dropout probability for LoRA layers
lora_dropout = 0.05
# Modules to adapt.
# Phi-3 fuses the attention and MLP input projections into `qkv_proj` and
# `gate_up_proj`. The Llama-style split names (q_proj, k_proj, v_proj, gate_proj,
# up_proj) match no module in this architecture, so with only those names LoRA
# would be attached to `o_proj` and `down_proj` alone.
target_modules = [
    "qkv_proj", "o_proj", "gate_up_proj", "down_proj",
    # Split names kept so the list also works on checkpoints exposing separate projections
    "q_proj", "k_proj", "v_proj", "gate_proj", "up_proj",
]

# For reproducibility. Note that the TrainingArguments defined further down
# pass their own `seed=42`.
set_seed(1234)


## Connect to Huggingface Hub

**NOTE**: The next section depends on where you run your code and how you set your API Keys

You can log in to Hugging Face Hub interactively

In [None]:
from huggingface_hub import notebook_login
# Log in to HF Hub
notebook_login()

Or you can provide a `.env` file containing the Hugging Face token (read below from the `HF_HUB_TOKEN` variable)

In [None]:
from huggingface_hub import login
from dotenv import load_dotenv
import os

# Load the environment variables defined in the .env file
load_dotenv()
# Log in to the Hugging Face Hub with the token read from HF_HUB_TOKEN
login(token=os.getenv("HF_HUB_TOKEN"))

## Load the dataset with the instruction set

In [None]:
# Load dataset from the hub
dataset = load_dataset(dataset_name, split=dataset_split)
# Show dataset size
print(f"dataset size: {len(dataset)}")
# Show an example
print(dataset[randrange(len(dataset))])

In [None]:
# Check the dataset structure
dataset

In [None]:
# Show a random example
print(dataset[randrange(len(dataset))])

## Load the tokenizer to prepare the dataset

In [None]:
# load tokenizer
tokenizer_id = model_id
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
tokenizer.padding_side = 'right' # to prevent warnings

Functions to create the appropriate format for our model

In [None]:
## Map functions
def create_message_column(row):
    """Convert one instruction record into a two-turn chat conversation.

    Args:
        row: mapping with the keys 'instruction', 'input' and 'output'.

    Returns:
        A dict with a single 'messages' key holding the user turn
        (instruction followed by the input) and the assistant turn (output).
    """
    user_turn = {
        "content": f"{row['instruction']}\n Input: {row['input']}",
        "role": "user",
    }
    assistant_turn = {
        "content": f"{row['output']}",
        "role": "assistant",
    }
    return {"messages": [user_turn, assistant_turn]}

def format_dataset_chatml(row):
    """Render the row's 'messages' with the tokenizer's chat template.

    Uses the notebook-level `tokenizer`. The conversation is returned as plain
    text (not token ids) and no generation prompt is appended.

    Returns:
        A dict with a single 'text' key containing the rendered conversation.
    """
    rendered = tokenizer.apply_chat_template(
        row["messages"],
        tokenize=False,
        add_generation_prompt=False,
    )
    return {"text": rendered}

Apply the ChatML format to our dataset

In [None]:
## prepare the dataset
dataset_chatml = dataset.map(create_message_column)
dataset_chatml = dataset_chatml.map(format_dataset_chatml)

In [None]:
dataset_chatml[0]

In [None]:
# Split the dataset into train and test sets
dataset_chatml = dataset_chatml.train_test_split(test_size=0.05, seed=1234)
dataset_chatml

## Instruction fine-tune a Phi-3-mini model using QLORA and trl

First, we try to identify our GPU

In [None]:
#use bf16 and FlashAttention if supported
if torch.cuda.is_bf16_supported():
  compute_dtype = torch.bfloat16
  attn_implementation = 'flash_attention_2'
else:
  compute_dtype = torch.float16
  attn_implementation = 'sdpa'

print(attn_implementation)
print(compute_dtype)

## Load the tokenizer and model to finetune

In [None]:
# Load the tokenizer (this replaces the tokenizer created earlier for dataset formatting)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, add_eos_token=True, use_fast=True)
# Reuse the unknown token as padding token and keep the pad token id in sync with it
tokenizer.pad_token = tokenizer.unk_token
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
# NOTE(review): the earlier tokenizer used padding_side='right'; here it is 'left'.
# Confirm that left padding is what the trainer expects for this setup.
tokenizer.padding_side = 'left'

# Set the quantization: 4-bit loading driven by the global bitsandbytes parameters.
# The compute dtype is `compute_dtype` from the GPU check, not the
# `bnb_4bit_compute_dtype` string defined in the parameters cell.
bnb_config = BitsAndBytesConfig(
        load_in_4bit=use_4bit,
        bnb_4bit_quant_type=bnb_4bit_quant_type,
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=use_double_quant,
)
# Load the quantized base model on the device(s) given by `device_map`,
# using the attention implementation selected in the GPU check
model = AutoModelForCausalLM.from_pretrained(
          model_name, torch_dtype=compute_dtype, trust_remote_code=True, quantization_config=bnb_config, device_map=device_map,
          attn_implementation=attn_implementation
)

# PEFT helper applied to the quantized model before attaching the LoRA adapters
model = prepare_model_for_kbit_training(model)

Configure the LoRA properties

In [None]:
# LoRA adapter configuration built from the global LoRA parameters
# (rank, alpha, dropout and the list of module names to adapt).
peft_config = LoraConfig(
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        r=lora_r,
        bias="none",  # no bias terms are trained
        task_type="CAUSAL_LM",
        target_modules= target_modules
)

The SFTTrainer supports a native integration with peft, which makes it super easy to efficiently instruction tune LLMs. We only need to create our LoRAConfig and provide it to the trainer.

Before we can start our training we need to define the hyperparameters (TrainingArguments) we want to use

In [None]:
# Define the training arguments
# NOTE(review): package versions are not pinned in the install cells; newer
# transformers releases rename `evaluation_strategy` to `eval_strategy` — confirm
# against the installed version.
args = TrainingArguments(
        output_dir="./phi3-mini-QLoRA",  # checkpoints and the final adapter are written here
        evaluation_strategy="steps",  # evaluate every `eval_steps` steps
        do_eval=True,
        optim="paged_adamw_8bit",
        per_device_train_batch_size=4,
        gradient_accumulation_steps=8,  # 4 x 8 = 32 samples per optimizer step and device
        per_device_eval_batch_size=4,
        log_level="debug",
        save_strategy="epoch",
        logging_steps=100,
        learning_rate=1e-4,
        # Mixed precision mirrors the GPU check: bf16 when supported, fp16 otherwise
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        eval_steps=100,
        num_train_epochs=3,
        warmup_ratio=0.1,
        lr_scheduler_type="linear",
        report_to="wandb",
        seed=42,  # differs from the set_seed(1234) call in the parameters cell
)

## Connect to wandb and register the project and experiment

In [None]:
# @title wandb init
import wandb
wandb.login()

In [None]:
import os
os.environ["WANDB_PROJECT"]="Phi3-mini-ft-pycode"

In [None]:
project_name = "Phi3-mini-ft-pycode"
wandb.init(project=project_name, name = "phi3-mini-qft-py-3e")

We now have every building block we need to create our SFTTrainer and start training our model.

In [None]:
# Supervised fine-tuning trainer: receives the quantized base model and the LoRA
# config, and trains on the pre-rendered chat text stored in the "text" column.
# NOTE(review): package versions are not pinned; newer TRL releases may expect
# `dataset_text_field` / `max_seq_length` in a config object rather than here — confirm.
trainer = SFTTrainer(
        model=model,
        train_dataset=dataset_chatml['train'],
        eval_dataset=dataset_chatml['test'],
        peft_config=peft_config,
        dataset_text_field="text",
        max_seq_length=512,
        tokenizer=tokenizer,
        args=args,
)

Start training our model by calling the train() method on our Trainer instance.

In [None]:
# train
trainer.train()

# save model in local
trainer.save_model()

In [None]:
hf_adapter_repo="alexrodpas/adapter-phi3-mini-qlora-pycode"

In [None]:
# Save the adapter
# NOTE(review): the first positional parameter of Trainer.push_to_hub appears to be
# the commit message, not a repository id. If so, `hf_adapter_repo` only becomes the
# commit message and the upload goes to the repo derived from `output_dir`
# ("phi3-mini-QLoRA"), which is the repo the merge section later loads the adapter
# from. Confirm, and set `hub_model_id` in TrainingArguments if another target
# repository is intended.
trainer.push_to_hub(hf_adapter_repo)

## Merge the model and the adapters and save it to the hub

If running in Google Colab on a T4 instance, uncomment and run the following cell to free the GPU memory.

In [None]:
# Empty VRAM
#del model
#del trainer
#import gc
#gc.collect()
#gc.collect()

In [None]:
torch.cuda.empty_cache() # PyTorch thing

Uncomment and run the next cell if using Google Colab.

In [None]:
#gc.collect()

Reload the base model, load the trained adapter saved on the Hub and merge it into the base weights, so that we can save the whole model.

In [None]:
hf_adapter_repo = "alexrodpas/phi3-mini-QLoRA"

model_name, hf_adapter_repo, compute_dtype

In [None]:
# Adapter repository and base model to merge
peft_model_id = hf_adapter_repo
tr_model_id = model_name

# Reload the base model in `compute_dtype` (no quantization config is passed here)
model = AutoModelForCausalLM.from_pretrained(tr_model_id, trust_remote_code=True, torch_dtype=compute_dtype)
# Attach the trained LoRA adapter to the base model
model = PeftModel.from_pretrained(model, peft_model_id)
# Merge the adapter into the base model and drop the PEFT wrapper
model = model.merge_and_unload()

In [None]:
tokenizer = AutoTokenizer.from_pretrained(peft_model_id)

In [None]:
hf_model_repo

In [None]:
# SAve the model merged to the Hub
merged_model_id = hf_model_repo
model.push_to_hub(merged_model_id)
tokenizer.push_to_hub(merged_model_id)

## Model Inference and evaluation

Finally we download the created model from the hub and test it to make sure it works fine.

In [None]:
hf_model_repo

In [None]:
# If not defined
hf_model_repo='alexrodpas/phi3-mini-4k-qlora-pycode-18k'

Load the model and tokenizer from the Hub

In [None]:
device_map, compute_dtype

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed

set_seed(1234)  # For reproducibility

tokenizer = AutoTokenizer.from_pretrained(hf_model_repo,trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(hf_model_repo, trust_remote_code=True, torch_dtype=compute_dtype, device_map=device_map) # compute "auto" dev_map "cuda"

We prepare the dataset as we did previously

In [None]:
## prepare the dataset
# Rebuild the chat-formatted dataset the same way as before training.
dataset_chatml = dataset.map(create_message_column)
dataset_chatml = dataset_chatml.map(format_dataset_chatml)
# Reuse the seed of the training-time split (seed=1234). Without it the rows are
# shuffled differently and the 'test' split used for evaluation below could
# contain examples the model was trained on.
dataset_chatml = dataset_chatml.train_test_split(test_size=0.05, seed=1234)
dataset_chatml

Create a text generation pipeline to run the inference

In [None]:
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [None]:
# Test the chat template
pipe.tokenizer.apply_chat_template([{"role": "user", "content": dataset_chatml['test'][0]['messages'][0]['content']}], tokenize=False, add_generation_prompt=True)

In [None]:
# Function to execute inference on a prompt
def test_inference(prompt):
    """Generate a completion for a single user prompt.

    The prompt is wrapped in the tokenizer's chat template (with the generation
    prompt appended), passed through the notebook-level `pipe`, and the prompt
    prefix is removed from the generated text.

    Args:
        prompt: the user message content.

    Returns:
        The generated text after the prompt, stripped of surrounding whitespace.
    """
    chat = [{"role": "user", "content": prompt}]
    prompt = pipe.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    generation_kwargs = dict(
        max_new_tokens=256,
        do_sample=True,
        num_beams=1,
        temperature=0.3,
        top_k=50,
        top_p=0.95,
        max_time=180,
    )
    outputs = pipe(prompt, **generation_kwargs)
    # The pipeline output starts with the prompt; keep only what follows it
    completion = outputs[0]['generated_text'][len(prompt):]
    return completion.strip()

In [None]:
%%time

test_inference(dataset_chatml['test'][0]['messages'][0]['content'])

## Evaluate the performance

We'll use the ROUGE metric to evaluate the performance. It's not the best metric for code generation, but it's simple and easy to measure.

In [None]:
import evaluate
rouge_metric = evaluate.load("rouge")

Create a function for inference and evaluation of an example

In [None]:
def calculate_rogue(row):
    """Run inference on one example and score it against the reference output.

    Args:
        row: dataset row with 'messages' (the first message is the user turn)
            and 'output' (the reference answer).

    Returns:
        A dict with every score returned by `rouge_metric.compute` plus the
        generated text under the 'response' key.
    """
    response = test_inference(row['messages'][0]['content'])
    scores = rouge_metric.compute(predictions=[response], references=[row['output']], use_stemmer=True)
    return {**scores, 'response': response}

In [None]:
%%time
metricas = dataset_chatml['test'].select(range(0,100)).map(calculate_rogue, batched=False)

In [None]:
import numpy as np

Now, we can calculate the metric on the sample

In [None]:
print("Rouge 1 Mean: ",np.mean(metricas['rouge1']))
print("Rouge 2 Mean: ",np.mean(metricas['rouge2']))
print("Rouge L Mean: ",np.mean(metricas['rougeL']))
print("Rouge Lsum Mean: ",np.mean(metricas['rougeLsum']))

## Inference in batches

In [None]:
dataset_chatml['test'][0]['output']

In [None]:
num_samples=500

In [None]:
%%time

# Evaluate on at most `num_samples` rows, never more than the test split holds.
eval_rows = dataset_chatml['test'].select(range(min(num_samples, len(dataset_chatml['test']))))

# One chat-formatted prompt (with generation prompt) per evaluated row.
prompts = [
    pipe.tokenizer.apply_chat_template(
        [{"role": "user", "content": row['messages'][0]['content']}],
        tokenize=False,
        add_generation_prompt=True,
    )
    for row in eval_rows
]
outputs = pipe(prompts, batch_size=4, max_new_tokens=256, do_sample=True, num_beams=1, temperature=0.3, top_k=50, top_p=0.95,
               max_time=180)
# Remove the prompt by length, exactly as test_inference does. Splitting on the
# "<|assistant|>\n" marker breaks if the marker also appears inside the prompt
# or the completion.
preds = [output[0]['generated_text'][len(prompt):].strip() for prompt, output in zip(prompts, outputs)]
references = [row['output'] for row in eval_rows]
rouge_metric.add_batch(predictions=preds, references=references)

Now, we can calculate the metric on the sample

In [None]:
result = rouge_metric.compute(use_stemmer=True)

In [None]:
print("Rouge 1 Mean: ",np.mean(result['rouge1']))
print("Rouge 2 Mean: ",np.mean(result['rouge2']))
print("Rouge L Mean: ",np.mean(result['rougeL']))
print("Rouge Lsum Mean: ",np.mean(result['rougeLsum']))

In [None]:
result['rouge1']