# Fine-tuning LLaMA 3 8B Model

## 1. GPU detection to prevent conflicts

In [None]:
%%capture
import torch
!pip install bitsandbytes
!pip install datasets
major_version, minor_version = torch.cuda.get_device_capability()
if major_version >= 8:
    !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
else:
    !pip install --no-deps xformers trl peft accelerate bitsandbytes
pass

## 2. Connecting to Google Drive

In [None]:
import os
import shutil
import subprocess

from google.colab import drive

# Mount Google Drive, then make the notebooks folder the working directory so
# that relative paths used later resolve inside Drive.
DRIVE_MOUNT_POINT = '/content/drive'
NOTEBOOK_DIR = '/content/drive/MyDrive/Colab Notebooks/'

drive.mount(DRIVE_MOUNT_POINT)
os.chdir(NOTEBOOK_DIR)

Mounted at /content/drive


## 3. Importing libraries

In [None]:
import torch, os, json, random, bitsandbytes as bnb, torch.nn as nn, psutil
from datasets import Dataset, DatasetDict, load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel, TrainingArguments
from peft import get_peft_model, LoraConfig, AutoPeftModelForCausalLM, PeftConfig
from trl import SFTTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import transformers
from transformers import TrainingArguments, pipeline
import logging

In [None]:
from huggingface_hub import notebook_login

# Interactive login: paste the access token into the prompt that appears.
# Never write the token into the notebook itself; anything stored in a cell
# (including comments) is readable by everyone the notebook is shared with.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import os
from getpass import getpass

# Read the Hugging Face token from the HF_TOKEN environment variable, falling
# back to a hidden prompt, so the secret is never stored in the notebook.
# An empty answer at the prompt leaves HF_TOKEN as None.
# NOTE(review): a token literal was previously committed in this cell; it
# should be revoked in the Hugging Face account settings.
HF_TOKEN = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ") or None
model_name = "meta-llama/Meta-Llama-3-8B"

In [None]:
# Load the tokenizer published with `model_name`, authenticating with HF_TOKEN.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token=HF_TOKEN,
)

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## 4. LLaMA 3 8B 8 Bit Quantized model

In [None]:
# Intended maximum sequence length, in tokens.
max_seq_length = 2048

# Load the weights in 8-bit precision through bitsandbytes.
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
)

# The empty-string key assigns the whole model to device 0.
device_map = {"": 0}
foundation_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map,
    use_cache=False,
    # Pass the token explicitly, exactly as the tokenizer cell does, so that
    # loading the model does not rely on a previously cached login.
    token=HF_TOKEN,
)

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/177 [00:00<?, ?B/s]

## 5. Preparing Dataset

In [None]:
# Load the question/answer pairs. Every item must provide a "prompt" and a
# "response" key. The encoding is given explicitly so that decoding does not
# depend on the locale of the machine running the notebook.
with open("/content/CNSC_QA_pairs_1_10.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Column-oriented dataset with one "prompt" and one "response" column.
dataset = Dataset.from_dict({
    "prompt": [item["prompt"] for item in data],
    "response": [item["response"] for item in data],
})

In [None]:
# Hold out 20% of the pairs. The fixed seed makes the split identical on
# every run, so results stay comparable after a kernel restart.
train_test_dataset = dataset.train_test_split(test_size=0.2, seed=42)

train_data = train_test_dataset["train"]
test_data = train_test_dataset["test"]

In [None]:
def tokenize_qa_batch(samples):
    """Tokenize a batch of question/answer pairs as single training texts.

    Each text is the prompt, a newline, the response and the tokenizer's
    end-of-sequence token. Tokenizing the prompt alone would leave the
    answers out of the training data entirely.

    Args:
        samples: Batch mapping with parallel "prompt" and "response" lists.

    Returns:
        The tokenizer output for the combined texts, truncated to
        `max_seq_length` tokens.
    """
    texts = [
        f"{prompt}\n{response}{tokenizer.eos_token}"
        for prompt, response in zip(samples["prompt"], samples["response"])
    ]
    return tokenizer(texts, truncation=True, max_length=max_seq_length)


# The raw text columns are dropped once they have been tokenized.
train_data_tokenized = train_data.map(
    tokenize_qa_batch,
    batched=True,
    remove_columns=["prompt", "response"],
)

Map:   0%|          | 0/5448 [00:00<?, ? examples/s]

## 6. Fine-tuning

In [None]:
# LoRA adapter settings: rank 16, applied to the "q_proj" and "v_proj" modules.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

# Choose the half-precision mode once: bf16 when the GPU reports support for
# it, fp16 otherwise.
use_bf16 = torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    # Output and logging
    output_dir=os.path.join("./", "peft_lab_outputs"),
    save_total_limit=3,
    logging_steps=1,
    # Batching
    per_device_train_batch_size=3,
    gradient_accumulation_steps=2,
    auto_find_batch_size=True,
    # Schedule. Both limits are set; the recorded training log reports that
    # max_steps overrides num_train_epochs, so the run stops after 80 steps.
    num_train_epochs=5,
    max_steps=80,
    warmup_steps=3,
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    # Optimizer
    optim="adamw_8bit",
    weight_decay=0.01,
    # Precision
    bf16=use_bf16,
    fp16=not use_bf16,
)

In [None]:
# Folder for training artifacts; resolves to the same path that is passed as
# `output_dir` in the training arguments.
_OUTPUT_FOLDER_NAME = "peft_lab_outputs"
output_directory = os.path.join("./", _OUTPUT_FOLDER_NAME)

In [None]:
# Pad with the end-of-sequence token.
tokenizer.pad_token = tokenizer.eos_token

# Collator for causal language modeling (masked-LM mode switched off).
causal_lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

# NOTE(review): `dataset_text_field` names the "prompt" column, but that column
# is removed when the training data is tokenized - confirm whether the
# argument is still needed.
trainer = SFTTrainer(
    model=foundation_model,
    tokenizer=tokenizer,
    args=training_args,
    peft_config=lora_config,
    train_dataset=train_data_tokenized,
    dataset_text_field="prompt",
    data_collator=causal_lm_collator,
)

# Left as the last expression so the training summary is displayed.
trainer.train()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
1,3.9323
2,4.2832
3,3.9562
4,4.1135
5,4.094
6,3.7894
7,4.2067
8,4.6532
9,3.6848
10,3.7967


TrainOutput(global_step=80, training_loss=3.2046744927763937, metrics={'train_runtime': 256.7926, 'train_samples_per_second': 1.869, 'train_steps_per_second': 0.312, 'total_flos': 438760757698560.0, 'train_loss': 3.2046744927763937, 'epoch': 0.0881057268722467})

## 7. Saving the fine-tuned model

In [None]:
# Path for the LoRA adapter inside the training output directory. This cell
# only builds the path; it does not write anything.
# NOTE(review): no visible cell reads `peft_model_path` - confirm whether the
# save step was meant to use it.
peft_model_path = os.path.join(output_directory, "lora_model")

In [None]:
# Write the trained model to local disk; the inference section loads it back
# from this same directory.
adapter_dir = "/content/fine-tuned-model"
trainer.model.save_pretrained(adapter_dir)

In [None]:
# Release the training objects before the model is reloaded for inference.
# Skip this cell if memory is not a concern; the names deleted here are not
# available afterwards.
import gc
import torch

del foundation_model, trainer
del train_data_tokenized, train_data, test_data
del dataset, train_test_dataset

# Collect garbage first so objects kept alive only by reference cycles are
# actually freed; only then can the CUDA caching allocator hand their memory
# back to the device.
gc.collect()
torch.cuda.empty_cache()

189

## 8. Model Inference

In [None]:
# 8-bit loading settings used when the saved model is reloaded for inference.
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

In [None]:
# Reload the saved model for inference: not trainable, loaded with the 8-bit
# settings above, and placed on the GPU.
inference_load_options = {
    "is_trainable": False,
    "quantization_config": bnb_config,
    "device_map": 'cuda',
}
loaded_model = AutoPeftModelForCausalLM.from_pretrained(
    "/content/fine-tuned-model",
    **inference_load_options,
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
def get_outputs(model, inputs, max_new_tokens=100, eos_token_id=None):
    """Generate a continuation for already-tokenized inputs.

    Args:
        model: Object exposing a ``generate`` method that accepts the keyword
            arguments used below.
        inputs: Mapping with "input_ids" and "attention_mask" entries.
        max_new_tokens: Upper bound on the number of newly generated tokens.
        eos_token_id: Token id that ends generation and is also used for
            padding. Defaults to ``tokenizer.eos_token_id`` of the
            notebook-level tokenizer.

    Returns:
        Whatever ``model.generate`` returns for these arguments.
    """
    if eos_token_id is None:
        # Fall back to the notebook-level tokenizer, as the original helper did.
        eos_token_id = tokenizer.eos_token_id

    return model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=max_new_tokens,
        repetition_penalty=1.5,  # Penalize repeated tokens.
        # Per the transformers generation docs this flag controls the stopping
        # condition of beam search; it is kept at its previous value.
        early_stopping=False,
        eos_token_id=eos_token_id,
        # Explicit padding id, so generate() does not have to fall back to the
        # EOS id and warn about it on every call.
        pad_token_id=eos_token_id,
    )

In [None]:
# Ask the reloaded fine-tuned model (`loaded_model`) a question through the
# get_outputs helper and print the decoded generations.
question = "What does REGDOC-2.12.2 provide guidance on?"
input_sentences = tokenizer(question, return_tensors="pt").to('cuda')
foundational_outputs_sentence = get_outputs(loaded_model, input_sentences, max_new_tokens=50)

decoded_sentences = tokenizer.batch_decode(foundational_outputs_sentence, skip_special_tokens=True)
print(decoded_sentences)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


['What does REGDOC-2.12.2 provide guidance on? The document provides regulatory requirements and safety expectations for the design, construction,\noperation of facilities that use or store nuclear substances in quantities greater than prescribed limits set out by regulation.\nThe purpose is to establish a framework within which these activities are conducted safely throughout their']


### Alternative method: calling `generate` directly

In [None]:

# Query the reloaded model by calling generate() directly with the tokenizer
# output, then decode the first returned sequence.
device = "cuda:0"
text = "What does CSIS notice of assessment refer to?"

inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = loaded_model.generate(max_new_tokens=50, **inputs)
first_sequence = outputs[0]
print(tokenizer.decode(first_sequence, skip_special_tokens=True))


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


What does CSIS notice of assessment refer to? The notice of assessment should be issued within 60 days of the receipt of the application. In the case of a change in the information provided in the application, the CSIS should re-assess the application. The notice of assessment should be issued within


In [None]:
# Load the base model in float16 onto device 0.
# NOTE(review): the recorded output of this cell is a CUDA out-of-memory
# error, and `base_model` is not used by any visible cell - confirm whether
# this load is still wanted while `loaded_model` occupies the GPU.
device_map = {"": 0}
base_model_options = dict(
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
base_model = AutoModelForCausalLM.from_pretrained(model_name, **base_model_options)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 112.00 MiB. GPU 

In [None]:
# Generate text with the reloaded fine-tuned model through the transformers
# text-generation pipeline and print the generated text.
prompt = "What is the purpose of the Canadian Nuclear Safety Commission (CNSC) publishing this document?"
pipe = pipeline(
    task="text-generation",
    model=loaded_model,
    tokenizer=tokenizer,
    max_length=200,
)
result = pipe(prompt)
generated_text = result[0]['generated_text']
print(generated_text)

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'MptForCausalLM'

What is the purpose of the Canadian Nuclear Safety Commission (CNSC) publishing this document? The purpose of this document is to provide information to licensees and applicants for licences related to the CNSC’s regulatory approach to the use of nuclear substances in the context of nuclear power plants. The document provides a regulatory framework for the safe use of nuclear substances in the context of nuclear power plants. It also provides guidance to applicants for licences and to licensees on the regulatory requirements and expectations related to the safe use of nuclear substances. The document outlines the regulatory framework for the use of nuclear substances, including requirements for a nuclear substance safety case, nuclear substance safety assessment, and a nuclear substance safety program. It also provides guidance on the regulatory requirements for the safe use of nuclear substances, including requirements for a nuclear substance safety case, nuclear substance safety asse