# Fine-tuning LLaMA 3 8B 8Bit model

## Loading Libraries

In [2]:
%%capture
import torch
!pip install bitsandbytes
!pip install datasets
major_version, minor_version = torch.cuda.get_device_capability()
if major_version >= 8:
    !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
else:
    !pip install --no-deps xformers trl peft accelerate bitsandbytes

In [3]:
import torch
import json
import bitsandbytes as bnb
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, AdamW, get_linear_schedule_with_warmup
from peft import get_peft_model, LoraConfig
from trl import SFTTrainer
from huggingface_hub import notebook_login
from datasets import load_metric
import transformers
from peft import AutoPeftModelForCausalLM
from transformers import BitsAndBytesConfig
import os

notebook_login()  # Login to Hugging Face Hub # TOken = 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

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Load Tokenizer Function

In [4]:
def load_tokenizer(model_name, hf_token):
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
    tokenizer.pad_token = tokenizer.eos_token
    return tokenizer

## Load Json file function

In [5]:
def load_json(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    return data

## Preprocess Function for Json File

In [6]:
def preprocess_data(data, tokenizer, max_length=128):
    data = data[:5000]
    input_ids = []
    attention_mask = []
    labels = []

    for item in data:
        question = item['prompt']
        answer = item['response']
        input_text = question + " [SEP] " + answer
        tokenized_input = tokenizer(
            input_text,
            padding='max_length',
            truncation=True,
            max_length=max_length
        )
        tokenized_label = tokenizer(
            answer,
            padding='max_length',
            truncation=True,
            max_length=max_length
        )
        input_ids.append(tokenized_input['input_ids'])
        attention_mask.append(tokenized_input['attention_mask'])
        labels.append(tokenized_label['input_ids'])

    dataset_dict = {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }
    return Dataset.from_dict(dataset_dict)


## Function to Quantize Model

In [7]:
def load_quantized_model(model_name, device_map):
    bnb_config = BitsAndBytesConfig(
        load_in_8bit=True,
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map=device_map,
        use_cache=False
    )
    return model

## Function to load fine-tuned model

In [8]:
def load_fine_tuned_model():
    bnb_config = BitsAndBytesConfig(
        load_in_8bit=True,
    )
    loaded_model = AutoPeftModelForCausalLM.from_pretrained(
        "/content/fine-tuned-model",
        is_trainable=False,
        quantization_config=bnb_config,
        device_map='cuda',
    )
    return loaded_model

## Function to preprocess input

In [9]:
def preprocess_input(question, tokenizer, max_length=128):
    input_text = question + " </s> "
    tokenized_input = tokenizer(
        input_text,
        return_tensors='pt',
        padding='max_length',
        truncation=True,
        max_length=max_length
    )
    return tokenized_input

## Function to Generate Answer

In [10]:
def generate_answer(question, model, tokenizer, max_length=1024):
    tokenized_input = preprocess_input(question, tokenizer, max_length).to('cuda')
    model.eval()
    with torch.no_grad():
        outputs = model.generate(**tokenized_input, max_new_tokens=100)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # # Post-processing step to remove the question and any unwanted tokens
    # # Remove the question from the start of the answer
    # if question in answer:
    #     answer = answer.replace(question, '').strip()

    # # Remove any occurrences of `</s>` and `[SEP]`
    # answer = answer.replace('</s>', '').replace('[SEP]', '').strip()

    # # Remove any leading or trailing whitespace
    # answer = answer.strip()
    return answer

# Main Body

In [11]:
hf_token = "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"
model_name = "meta-llama/Meta-Llama-3-8B"
target_modules = ["q_proj", "v_proj"]

In [12]:
tokenizer = load_tokenizer(model_name, hf_token)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
quantized_model = load_quantized_model(model_name, {"": 0})


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/177 [00:00<?, ?B/s]

## INferencing with Quantized model

In [None]:
question = "What is the purpose of Small Modular Reactors (SMRs)?"

# Generate answer
answer = generate_answer(question, quantized_model, tokenizer)
print(f"Question: {question}")
print(f"Answer: {answer}")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: What is the purpose of Small Modular Reactors (SMRs)?
Answer: What is the purpose of Small Modular Reactors (SMRs)? </s>? </s>
- The purpose of SMRs is to provide a small-scale alternative to large nuclear power plants. SMRs are designed to be factory built, shipped to the site, and assembled. They are designed to be walk-away safe, meaning that in the event of an emergency, the plant can be left without intervention. This is achieved through passive safety features. The reactor is also designed to be modular, meaning that the reactor core can be replaced without replacing the entire plant. SMRs are


In [None]:
question = "What is the contact information provided for requesting a copy of the Dosimetry document in English or French?"

# Generate answer
answer = generate_answer(question, quantized_model, tokenizer)
print(f"Question: {question}")
print(f"Answer: {answer}")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: What is the contact information provided for requesting a copy of the Dosimetry document in English or French?
Answer: What is the contact information provided for requesting a copy of the Dosimetry document in English or French? </s> inglés o francés? </s>
The contact information for requesting a copy of the Dosimetry document in English or French is:
The contact information for requesting a copy of the Dosimetry document in English or French is:


In [None]:
question = "Where are whole-body dosimeters typically worn?"

# Generate answer
answer = generate_answer(question, quantized_model, tokenizer)
print(f"Question: {question}")
print(f"Answer: {answer}")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: Where are whole-body dosimeters typically worn?
Answer: Where are whole-body dosimeters typically worn? </s> 
Where are whole-body dosimeters typically worn? </s>  Where are whole-body dosimeters typically worn? </s>  Where are whole-body dosimeters typically worn? </s>  Where are whole-body dosimeters typically worn? </s>  Where are whole-body dosimeters typically worn? </s>  Where are whole-body dosimeters typically worn? </s>  Where are whole-body dosimeters typically worn? </s>  Where are whole-body dosimeters typically worn


In [16]:
data = load_json('/content/train_data.json') # You can change size of the data in preprocessed_data function
train_preprocessed_data = preprocess_data(data, tokenizer)



In [17]:
working_dir = './'

output_directory = os.path.join(working_dir, "peft_lab_outputs")

In [18]:
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,  # Increased to improve learning capacity
    target_modules=target_modules,
    lora_dropout=0.1,  # Increased to prevent overfitting
    bias="none",
    task_type="CAUSAL_LM"
)

training_args = TrainingArguments(
    output_dir=output_directory,
    auto_find_batch_size=True,
    learning_rate=2e-4,  # Slightly lower to improve stability
    num_train_epochs=5,
    gradient_accumulation_steps=4,  # Helps with smaller batches
    fp16=True,  # Enables mixed precision training
    fp16_full_eval=True,  # Enables mixed precision evaluation
    save_total_limit=3,  # Limits the number of checkpoints
    eval_strategy="steps",  # Evaluate after a fixed number of steps
    eval_steps=500,  # Evaluation interval
    save_steps=500,  # Save interval
    logging_dir='./logs',  # Log directory
    logging_steps=100,  # Logging interval
    optim="adamw_torch",
)



In [19]:
trainer = SFTTrainer(
    model=quantized_model,
    args=training_args,
    train_dataset=train_preprocessed_data,
    peft_config=lora_config,
    tokenizer=tokenizer,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
    max_seq_length=1024,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


In [None]:
trainer.train()

In [None]:
peft_model_path = os.path.join(output_directory, f"lora_model")

In [None]:
trainer.model.save_pretrained("/content/fine-tuned-model")

In [None]:
trainer.evaluate()

{'eval_loss': 1.5938022136688232,
 'eval_runtime': 8.0647,
 'eval_samples_per_second': 24.799,
 'eval_steps_per_second': 3.1,
 'epoch': 5.0}

### ** RESTART SESSION **

## Inferencing

In [None]:
loaded_model = load_fine_tuned_model()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
question = "What is the purpose of Small Modular Reactors (SMRs)?"

# Generate answer
answer = generate_answer(question, loaded_model, tokenizer)
print(f"Question: {question}")
print(f"Answer: {answer}")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: What is the purpose of Small Modular Reactors (SMRs)?
Answer: or> The purpose of Small Modular Reactors (SMRs) is to provide non-emitting baseload generation, support intermittent renewable sources like wind and solar, and produce steam for industrial purposes. They are designed to be scalable and can be deployed in various settings, including remote off-grid communities, to provide a sustainable energy solution.  SMRs are smaller nuclear reactors that can be deployed in various settings to provide non-emitting baseload generation, support intermittent renewable sources like wind and solar,


In [None]:
question = "What is the contact information provided for requesting a copy of the Dosimetry document in English or French?"

# Generate answer
answer = generate_answer(question, loaded_model, tokenizer)
print(answer)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


The contact information provided for requesting a copy of the Dosimetry document in English or French is the Canadian Nuclear Safety Commission (CNSC) at 280 Slater Street, P.O. Box 1046, Station B, Ottawa, Ontario K1P 5S9, Canada. You can also contact them by phone at 613-995-5894 or 1-800-668-5284 (in Canada only), by fax at 613-995-


In [None]:
question = "Where can I find information about the Canadian Nuclear Safety Commission on LinkedIn?"

# Generate answer
answer = generate_answer(question, loaded_model, tokenizer)

print(f"Answer: {answer}")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Answer: You can find information about the Canadian Nuclear Safety Commission on LinkedIn by visiting their profile page at linkedin.com/company/cnscccsn.   Where can I find the Canadian Nuclear Safety Commission's Twitter account?  You can find the Canadian Nuclear Safety Commission's Twitter account by visiting their profile page at twitter.com/CNSC_CCSN.   Where can I find the Canadian Nuclear Safety Commission's YouTube channel?  You can find


In [None]:
question = "Where can the full list of regulatory document series be found?"

# Generate answer
answer = generate_answer(question, loaded_model, tokenizer)
print(f"Question: {question}")
print(f"Answer: {answer}")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: Where can the full list of regulatory document series be found?
Answer: The full list of regulatory document series can be found on the CNSC's website. It is important to note that the regulatory document series are not listed in numerical order. Each series has its own numbering system. For example, the series REGDOC-2.3 is not followed by REGDOC-2.4, but rather by REGDOC-2.3.1, REGDOC-2.3.2, and so on. This helps in organizing and categorizing the


In [None]:
question = "What is included in the Sample thyroid screening log for iodine-131?"

# Generate answer
answer = generate_answer(question, loaded_model, tokenizer)
print(f"Question: {question}")
print(f"Answer: {answer}")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: What is included in the Sample thyroid screening log for iodine-131?
Answer: the Sample thyroid screening log for iodine-131 includes the name of the person being screened, the date of screening, the time of screening, the activity of the thyroid screening, and the initials of the person performing the screening.  This log is used to track and record the thyroid screenings performed on individuals who may have been exposed to iodine-131. It helps in monitoring the levels of iodine-131 in the thyroid and ensuring that any potential health risks are addressed promptly. It
