# Fine-tuning pipeline for LLaMA 3 8B Model



### 1. Connect to Google Drive

In [None]:
def connect_to_google_drive(mount_path, change_dir_path):
  """Mount Google Drive and make a directory on it the working directory.

  Args:
    mount_path: Directory handed to `drive.mount`.
    change_dir_path: Directory handed to `os.chdir` once the mount call
      has returned.
  """
  # Imported locally: `google.colab` is only importable inside a Colab runtime.
  import os
  from google.colab import drive

  drive.mount(mount_path)
  os.chdir(change_dir_path)

### 2. GPU Detection and Dependency Installation

In [None]:
%%capture
# Install the fine-tuning dependencies; the %%capture cell magic on the first
# line captures the cell's output so the pip logs are not displayed.
import torch
!pip install bitsandbytes
!pip install datasets
# get_device_capability() returns the CUDA compute capability as (major, minor);
# only the major number is used below.
major_version, minor_version = torch.cuda.get_device_capability()
if major_version >= 8:
    # Compute capability 8+: also install flash-attn plus packaging/ninja/einops.
    # NOTE(review): presumably targets Ampere-or-newer GPUs — confirm.
    !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
else:
    # Lower compute capability: same set without packaging/ninja/einops/flash-attn.
    !pip install --no-deps xformers trl peft accelerate bitsandbytes
pass

In [None]:
from huggingface_hub import notebook_login
# Authenticate interactively. Never paste an access token into the notebook:
# any token that was committed here must be revoked and replaced.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### 3. Generate LLaMA 3 8B 8-bit quantized model

In [None]:
import os, json, random, bitsandbytes as bnb, torch.nn as nn, psutil
from datasets import Dataset, DatasetDict, load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel, TrainingArguments
from peft import get_peft_model, LoraConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, Trainer
from trl import SFTTrainer


def get_8bit_quantized_model(**params):
  """Load a causal-LM checkpoint using the supplied quantization config.

  Keyword Args:
    model_name: Hub id or local path passed to `from_pretrained`.
    bnb_config: Object forwarded as `quantization_config`.
    HF_TOKEN: Optional access token forwarded as `token`; `None` is passed
      when the key is absent.

  Returns:
    The object returned by `AutoModelForCausalLM.from_pretrained`.

  Raises:
    KeyError: If `model_name` or `bnb_config` is missing.
  """
  # The tokenizer that used to be loaded here was never used, so it is gone.
  # The token is forwarded to the model load, which is the call that needs it.
  foundation_model = AutoModelForCausalLM.from_pretrained(
      params["model_name"],
      quantization_config=params["bnb_config"],
      device_map={"": 0},  # "" -> 0: map the whole model onto device index 0
      use_cache=False,
      token=params.get("HF_TOKEN"),
  )
  return foundation_model

### Function to return output from the model

In [None]:
model_name = "meta-llama/Meta-Llama-3-8B"
# Read the access token from the environment instead of embedding it in the
# notebook. When HF_TOKEN is unset this passes token=None.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=os.environ.get("HF_TOKEN"))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
def get_outputs(model, inputs, max_new_tokens=100, **params):
    """Generate a continuation for already-tokenized inputs.

    Args:
        model: Object exposing `generate(...)`.
        inputs: Mapping providing "input_ids" and "attention_mask".
        max_new_tokens: Forwarded to `generate` as the new-token limit.
        **params: Optional `tokenizer` (a pre-loaded tokenizer to reuse),
            `model_name` (required only when `tokenizer` is not given) and
            `HF_TOKEN` (access token used when the tokenizer must be loaded).

    Returns:
        Whatever `model.generate` returns.

    Raises:
        KeyError: If neither `tokenizer` nor `model_name` is supplied.
    """
    tokenizer = params.get("tokenizer")
    if tokenizer is None:
        # Only the EOS id is needed. The original passed the literal list
        # ["HF_TOKEN"] as the token; use the caller's value (or None) instead.
        tokenizer = AutoTokenizer.from_pretrained(
            params["model_name"], token=params.get("HF_TOKEN")
        )
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=max_new_tokens,
        repetition_penalty=1.5,  # Avoid repetition.
        early_stopping=False,
        eos_token_id=tokenizer.eos_token_id,
    )
    return outputs

### Prepare dataset

In [None]:
from datasets import load_dataset

def load_custom_dataset(**params):
  """Load a hub dataset, tokenize one text field and return a training sample.

  Keyword Args:
    model_name: Tokenizer id passed to `AutoTokenizer.from_pretrained`.
    dataset_name: Dataset id passed to `load_dataset`,
      e.g. "fka/awesome-chatgpt-prompts".
    token_field_name: Name of the column that is tokenized.
    HF_TOKEN: Optional access token for the tokenizer download.
    data_range: Optional maximum number of training rows (default 50).

  Returns:
    The first rows of the tokenized "train" split, without the "act" column
    when the dataset has one.
  """
  tokenizer = AutoTokenizer.from_pretrained(params["model_name"], token=params.get("HF_TOKEN"))
  text_field = params["token_field_name"]

  data = load_dataset(params["dataset_name"])
  data = data.map(lambda samples: tokenizer(samples[text_field]), batched=True)

  train_split = data["train"]
  # Clamp to the split size so datasets with fewer rows do not fail in select().
  sample_size = min(params.get("data_range") or 50, len(train_split))
  train_sample = train_split.select(range(sample_size))

  # "act" belongs to the example dataset named above; other datasets may lack it.
  if "act" in train_sample.column_names:
    train_sample = train_sample.remove_columns("act")
  return train_sample



# OR: load the dataset from a local JSON file

In [None]:
from datasets import Dataset
import json

def load_dataset_from_json(**params):
  """Build a tokenized dataset from a JSON file of prompt/response records.

  Keyword Args:
    json_path: Path to a JSON file holding a list of objects, each with
      "prompt" and "response" keys.
    model_name: Tokenizer id passed to `AutoTokenizer.from_pretrained`.
    HF_TOKEN: Optional access token for the tokenizer download.
    data_range: Optional maximum number of records to keep; falsy or absent
      keeps every record.

  Returns:
    The result of `Dataset.map` with the "prompt" and "response" columns removed.
  """
  with open(params["json_path"], "r", encoding="utf-8") as f:
    data = json.load(f)

  tokenizer = AutoTokenizer.from_pretrained(params["model_name"], token=params.get("HF_TOKEN"))
  dataset = Dataset.from_dict({
      "prompt": [item["prompt"] for item in data],
      "response": [item["response"] for item in data]
  })

  # `data_range` is optional, and is clamped so it cannot exceed the dataset.
  data_range = params.get("data_range")
  if data_range:
    dataset = dataset.select(range(min(data_range, len(dataset))))

  # NOTE(review): only "prompt" is tokenized and "response" is dropped, so the
  # responses never reach the trainer — confirm this is intended.
  tokenized_dataset = dataset.map(lambda samples: tokenizer(samples["prompt"]), batched=True, remove_columns=["prompt", "response"])
  return tokenized_dataset

### Fine-tune

In [None]:
import peft
from peft import LoraConfig, get_peft_model
import os
import transformers
from transformers import TrainingArguments

def fine_tune_model(**params):
  """Run the trainer and save the trained model.

  Keyword Args:
    trainer: Object exposing `train()` and `model.save_pretrained(path)`.
    save_path: Destination passed to `save_pretrained`.
    model_name, training_args, lora_config: Accepted for backward
      compatibility; this function does not use them.

  Raises:
    KeyError: If `trainer` or `save_path` is missing.
  """
  trainer = params["trainer"]
  # Read before training so a missing path fails up front, not after the run.
  save_path = params["save_path"]

  trainer.train()
  trainer.model.save_pretrained(save_path)
  print("************** Model Saved Successfully *******************")


# Main Function

In [None]:
# Mount Drive, then switch into the Colab Notebooks folder on it.
connect_to_google_drive(
    mount_path='/content/drive',
    change_dir_path='/content/drive/MyDrive/Colab Notebooks/',
)

Mounted at /content/drive


In [None]:
# 8-bit loading. `bnb_8bit_quant_type` and `bnb_8bit_compute_dtype` are not
# BitsAndBytesConfig options (transformers reports them as unused kwargs), and
# `bnb_4bit_use_double_quant` applies to 4-bit loading only, so all three are
# dropped.
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

# The access token comes from the environment rather than the notebook source.
quantized_model = get_8bit_quantized_model(
    model_name="meta-llama/Meta-Llama-3-8B",
    HF_TOKEN=os.environ.get("HF_TOKEN"),
    bnb_config=bnb_config,
)

Unused kwargs: ['bnb_8bit_quant_type', 'bnb_8bit_compute_dtype']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/177 [00:00<?, ?B/s]

In [None]:
# Baseline: query the quantized model before any fine-tuning.
input_sentences = tokenizer(
    "Is REGDOC-2.12.2 related to security measures?",
    return_tensors="pt",
).to('cuda')
foundational_outputs_sentence = get_outputs(
    quantized_model,
    input_sentences,
    model_name=model_name,
    max_new_tokens=50,
)

print(
    tokenizer.batch_decode(
        foundational_outputs_sentence,
        skip_special_tokens=True,
    )
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


['Is REGDOC-2.12.2 related to security measures? I’m interested in the physical protection of nuclear material and radioactive substances.\nREGDOCS are documents that describe how Canada’s Nuclear Safety Commission (CNSC) regulates safety, environmental impacts, radiation exposures from licensed activities as well as other matters such as']


In [None]:
# Build the training set from the local question/answer JSON file.
# (Alternative: load_custom_dataset(...) with
#  dataset_name="fka/awesome-chatgpt-prompts" and token_field_name="prompt".)
# The access token is read from the environment, never written in the notebook.
data = load_dataset_from_json(
    json_path="/content/CNSC_QA_pairs_1_10.json",
    data_range=50,
    HF_TOKEN=os.environ.get("HF_TOKEN"),
    model_name="meta-llama/Meta-Llama-3-8B",
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [None]:
# LoRA adapter settings, applied to the q_proj and v_proj modules.
lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

training_args = TrainingArguments(
    output_dir=os.path.join("./", "peft_lab_outputs"),
    auto_find_batch_size=True,
    learning_rate= 2e-4,
    num_train_epochs=5
)

# The access token is read from the environment instead of being hard-coded.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B", token=os.environ.get("HF_TOKEN"))
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

trainer = SFTTrainer(
    model=quantized_model,
    args=training_args,
    train_dataset=data,
    peft_config = lora_config,
    dataset_text_field="prompt",
    tokenizer=tokenizer,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.

Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


In [None]:
# Train with the trainer configured above and save the result under /content.
fine_tune_model(
    trainer=trainer,
    training_args=training_args,
    lora_config=lora_config,
    model_name="meta-llama/Meta-Llama-3-8B",
    save_path="/content/fine-tuned-model",
)



Step,Training Loss


************** Model Saved Successfully *******************


# 7. Inference

In [None]:
#import peft
from peft import AutoPeftModelForCausalLM, PeftConfig
#import os

device_map = {"": 0}
working_dir = './'

output_directory = os.path.join(working_dir, "peft_lab_outputs")
# Plain string literal: the previous f-string had no placeholders.
# NOTE(review): the model-loading cell in this notebook reads
# "/content/fine-tuned-model" directly rather than this path — confirm which
# location is intended.
peft_model_path = os.path.join(output_directory, "lora_model")

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# 8-bit loading for inference. `bnb_8bit_quant_type` and
# `bnb_8bit_compute_dtype` are not BitsAndBytesConfig options (transformers
# reports them as unused kwargs), and `bnb_4bit_use_double_quant` applies to
# 4-bit loading only, so all three are dropped.
bnb_config2 = BitsAndBytesConfig(load_in_8bit=True)

Unused kwargs: ['bnb_8bit_quant_type', 'bnb_8bit_compute_dtype']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


In [None]:
# Load the saved fine-tuned model for inference (not trainable), quantized
# with bnb_config2 and placed on the CUDA device.
loaded_model = AutoPeftModelForCausalLM.from_pretrained(
    "/content/fine-tuned-model",
    quantization_config=bnb_config2,
    is_trainable=False,
    device_map='cuda',
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# The access token is read from the environment instead of being hard-coded.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B", token=os.environ.get("HF_TOKEN"))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Query the fine-tuned model.
input_sentences = tokenizer(
    "Who holds the copyright for the regulatory document REGDOC-2.12.2?",
    return_tensors="pt",
).to('cuda')
foundational_outputs_sentence = get_outputs(
    loaded_model,
    input_sentences,
    model_name="meta-llama/Meta-Llama-3-8B",
    max_new_tokens=50,
)

print(
    tokenizer.batch_decode(
        foundational_outputs_sentence,
        skip_special_tokens=True,
    )
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


['Who holds the copyright for the regulatory document REGDOC-2.12.2? Is it available to be used freely by others?\nThe Canadian Nuclear Safety Commission (CNSC) is responsible and accountable, as a Crown corporation under federal legislation,for issuing licenses in respect of nuclear facilities or activities that are subject thereto.\nReg']
