# Fine-tuning LLaMA 3 8B 8Bit model

## Loading Libraries

In [1]:
%%capture
import torch
!pip install bitsandbytes
!pip install datasets
major_version, minor_version = torch.cuda.get_device_capability()
if major_version >= 8:
    !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
else:
    !pip install --no-deps xformers trl peft accelerate bitsandbytes

In [1]:
import torch
import json
import bitsandbytes as bnb
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from peft import get_peft_model, LoraConfig
from trl import SFTTrainer
from huggingface_hub import notebook_login
from datasets import load_metric
import transformers
from peft import AutoPeftModelForCausalLM
from transformers import BitsAndBytesConfig
import os

notebook_login()  # Login to Hugging Face Hub # TOken = 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

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Load Tokenizer Function

In [4]:
def load_tokenizer(model_name, hf_token):
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
    tokenizer.pad_token = tokenizer.eos_token
    return tokenizer

## Load Json file function

In [5]:
def load_json(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    return data

## Preprocess Function for Json File

In [21]:
def preprocess_data(data, tokenizer, max_length=128):
    data = data[:100]
    input_ids = []
    attention_mask = []
    labels = []

    for item in data:
        question = item['prompt']
        answer = item['response']
        input_text = question + " </s> " + answer
        tokenized_input = tokenizer(
            input_text,
            padding='max_length',
            truncation=True,
            max_length=max_length
        )
        tokenized_label = tokenizer(
            answer,
            padding='max_length',
            truncation=True,
            max_length=max_length
        )
        input_ids.append(tokenized_input['input_ids'])
        attention_mask.append(tokenized_input['attention_mask'])
        labels.append(tokenized_label['input_ids'])

    dataset_dict = {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }
    return Dataset.from_dict(dataset_dict)


## Function to Quantize Model

In [11]:
def load_quantized_model(model_name, device_map):
    bnb_config = BitsAndBytesConfig(
        load_in_8bit=True,
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map=device_map,
        use_cache=False
    )
    return model

## Function to load fine-tuned model

In [7]:
def load_fine_tuned_model():
    bnb_config = BitsAndBytesConfig(
        load_in_8bit=True,
    )
    loaded_model = AutoPeftModelForCausalLM.from_pretrained(
        "/content/fine-tuned-model",
        is_trainable=False,
        quantization_config=bnb_config,
        device_map='cuda'
    )
    return loaded_model

## Function to preprocess input

In [5]:
def preprocess_input(question, tokenizer, max_length=128):
    input_text = question + " </s> "
    tokenized_input = tokenizer(
        input_text,
        return_tensors='pt',
        padding='max_length',
        truncation=True,
        max_length=max_length
    )
    return tokenized_input

## Function to Generate Answer

In [6]:
def generate_answer(question, model, tokenizer, max_length=128):
    tokenized_input = preprocess_input(question, tokenizer, max_length).to('cuda')
    model.eval()
    with torch.no_grad():
        outputs = model.generate(**tokenized_input, max_new_tokens=50)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

# Main Body

In [2]:
hf_token = "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"
model_name = "meta-llama/Meta-Llama-3-8B"
target_modules = ["q_proj", "v_proj"]

In [10]:
tokenizer = load_tokenizer(model_name, hf_token)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [17]:
quantized_model = load_quantized_model(model_name, {"": 0})


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/177 [00:00<?, ?B/s]

In [22]:
data = load_json('/content/CNSC_QA_pairs_1_10.json') # You can change size of the data in preprocessed_data function
preprocessed_data = preprocess_data(data, tokenizer)

split_dataset = preprocessed_data.train_test_split(test_size=0.2)
train_dataset = split_dataset['train']
test_dataset = split_dataset['test']

In [23]:
working_dir = './'

output_directory = os.path.join(working_dir, "peft_lab_outputs")

In [26]:
lora_config = LoraConfig(
        r=16,
        lora_alpha=16,
        target_modules=target_modules,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM"
    )

training_args = TrainingArguments(
        output_dir=output_directory,
        auto_find_batch_size=True,
        learning_rate=2e-4,
        num_train_epochs=5
    )

trainer = SFTTrainer(
        model=quantized_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        peft_config=lora_config,
        tokenizer=tokenizer,
        data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
        max_seq_length=1024
    )


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


In [27]:
trainer.train()



Step,Training Loss


Step,Training Loss


Step,Training Loss


TrainOutput(global_step=400, training_loss=0.9827991485595703, metrics={'train_runtime': 667.998, 'train_samples_per_second': 0.599, 'train_steps_per_second': 0.599, 'total_flos': 2319144689074176.0, 'train_loss': 0.9827991485595703, 'epoch': 5.0})

In [28]:
peft_model_path = os.path.join(output_directory, f"lora_model")

In [29]:
trainer.model.save_pretrained("/content/fine-tuned-model")

In [31]:
trainer.evaluate()



{'eval_loss': 1.6747280359268188,
 'eval_runtime': 3.263,
 'eval_samples_per_second': 6.129,
 'eval_steps_per_second': 0.919,
 'epoch': 5.0}

### ** RESTART SESSION **

## Inferencing

In [8]:
loaded_model = load_fine_tuned_model()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [11]:
question = "What is the purpose of Small Modular Reactors (SMRs)?"

# Generate answer
answer = generate_answer(question, loaded_model, tokenizer)
print(f"Question: {question}")
print(f"Answer: {answer}")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: What is the purpose of Small Modular Reactors (SMRs)?
Answer: What is the purpose of Small Modular Reactors (SMRs)? </s> site access security clearance The purpose of Small Modular Reactors (SMRs) is to provide non-emitting baseload generation, support intermittent renewable sources like wind and solar, and produce steam for industrial purposes. SMRs are smaller versions of traditional nuclear reactors
