#**Step 1: Install All the Required Packages**

In [1]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m56.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m33.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m61.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━

### **Step 2: Import All the Required Libraries**

In [2]:
import os                   # Importing the os module for interacting with the operating system
import torch                # Importing the PyTorch library for tensor computations
from datasets import load_dataset   # Importing load_dataset function from datasets library
from transformers import (  # Importing various classes and functions from transformers library
    AutoModelForCausalLM,  # Pre-trained model for causal language modeling
    AutoTokenizer,         # Tokenizer for automatic tokenization of text
    BitsAndBytesConfig,    # Configuration class for BitsAndBytes model (not used in the code)
    HfArgumentParser,      # Argument parser for Hugging Face library
    TrainingArguments,     # Arguments for training models
    pipeline,              # Function for easy pipeline creation
    logging                # Logging utilities for transformers library
)

- Orignal Dataset: https://huggingface.co/datasets/timdettmers/openassistant-guanaco

- Reformat Dataset following the Llama 2 template with 1k sample: https://huggingface.co/datasets/mlabonne/guanaco-llama2-1k

- Complete Reformat Dataset following the Llama 2 template: https://huggingface.co/datasets/mlabonne/guanaco-llama2

In [3]:
# The model that you want to train from the Hugging Face hub
model_name = "NousResearch/Llama-2-7b-chat-hf"

# The instruction dataset to use
dataset_name = "mlabonne/guanaco-llama2-1k"

# Fine-tuned model name
new_model = "Llama-2-7b-chat-finetune"

# QLoRA parameters
# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

# bitsandbytes parameters
# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

# TrainingArguments parameters
# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient normal (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 0

# Log every X updates steps
logging_steps = 25

# SFT parameters
# Maximum sequence length to use
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on the GPU 0
device_map = {"": 0}

##**Step 4:Load everything and start the fine-tuning process**

In [4]:
# Load dataset (you can process it here)
dataset = load_dataset(dataset_name, split="train")

# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)  # Get data type for 4-bit compute
# Define BitsAndBytesConfig for quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,                     # Whether to load in 4-bit mode
    bnb_4bit_quant_type=bnb_4bit_quant_type,   # Type of 4-bit quantization
    bnb_4bit_compute_dtype=compute_dtype,      # Compute data type for 4-bit
    bnb_4bit_use_double_quant=use_nested_quant # Whether to use double quantization
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,             # Name of the pre-trained model
    quantization_config=bnb_config,  # BitsAndBytesConfig for quantization
    device_map=device_map   # Map of devices to load model on
)
model.config.use_cache = False   # Disable caching in model configuration
model.config.pretraining_tp = 1  # Set pretraining_tp attribute in model configuration

# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)  # Load tokenizer for the model
tokenizer.pad_token = tokenizer.eos_token  # Set padding token to end-of-sequence token
tokenizer.padding_side = "right"  # Ensure padding is applied on the right side to fix overflow issue with fp16 training

# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,    # Alpha parameter for LoRA
    lora_dropout=lora_dropout,  # Dropout rate for LoRA
    r=lora_r,                  # R parameter for LoRA
    bias="none",               # Bias setting for LoRA
    task_type="CAUSAL_LM",     # Task type for LoRA
)

# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,                              # Directory to save the model outputs
    num_train_epochs=num_train_epochs,                  # Number of training epochs
    per_device_train_batch_size=per_device_train_batch_size,  # Batch size per GPU/CPU for training
    gradient_accumulation_steps=gradient_accumulation_steps,  # Number of updates steps to accumulate before performing a backward/update pass
    optim=optim,                                        # Optimizer to use for training
    save_steps=save_steps,                              # Frequency (in steps) to save checkpoints during training
    logging_steps=logging_steps,                        # Frequency (in steps) to log information during training
    learning_rate=learning_rate,                        # Learning rate for the optimizer
    weight_decay=weight_decay,                          # Weight decay for regularization
    fp16=fp16,                                          # Whether to use fp16 precision (mixed precision training)
    bf16=bf16,                                          # Whether to use bf16 precision
    max_grad_norm=max_grad_norm,                        # Maximum gradient norm for clipping gradients
    max_steps=max_steps,                                # Maximum number of training steps to perform
    warmup_ratio=warmup_ratio,                          # Ratio of warmup steps for learning rate scheduler
    group_by_length=group_by_length,                    # Whether to group batches of different lengths together for more efficient processing
    lr_scheduler_type=lr_scheduler_type,                # Type of learning rate scheduler to use
    report_to="tensorboard"                             # Output destination for logging
)

# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,                      # Pre-trained model to fine-tune
    train_dataset=dataset,            # Dataset for training
    peft_config=peft_config,          # LoRA configuration for fine-tuning
    dataset_text_field="text",        # Field in dataset containing text
    max_seq_length=max_seq_length,    # Maximum sequence length for input data
    tokenizer=tokenizer,              # Tokenizer for preprocessing text
    args=training_arguments,          # Training arguments/settings
    packing=packing,                  # Packing strategy (not specified in the snippet)
)

# Train model
trainer.train()  # Start training the model


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/967k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]



config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
25,1.4079
50,1.6601
75,1.2136
100,1.4445
125,1.1762
150,1.3653
175,1.1737
200,1.4664
225,1.158
250,1.5416


TrainOutput(global_step=250, training_loss=1.3607293548583985, metrics={'train_runtime': 1531.0494, 'train_samples_per_second': 0.653, 'train_steps_per_second': 0.163, 'total_flos': 8755214190673920.0, 'train_loss': 1.3607293548583985, 'epoch': 1.0})

In [5]:
# Save trained model
trainer.model.save_pretrained(new_model)

###**Step 6:Use the text generation pipeline to ask questions.**

In [6]:
# Ignore warnings
logging.set_verbosity(logging.CRITICAL)

# Run text generation pipeline with our next model
prompt = "What is a large language model?"  # Define a prompt for text generation
pipe = pipeline(task="text-generation",    # Create a pipeline for text generation
                model=model,                # Specify the model for text generation
                tokenizer=tokenizer,        # Specify the tokenizer for preprocessing text
                max_length=200)             # Set maximum length for generated text

result = pipe(f"<s>[INST] {prompt} [/INST]")  # Generate text using the pipeline and provided prompt

# Print the generated text
print(result[0]['generated_text'])  # Access and print the generated text from the pipeline result



<s>[INST] What is a large language model? [/INST] A large language model is a type of artificial intelligence (AI) model that is trained on a large dataset of text to generate human-like language outputs. It is typically trained on a large dataset of text, such as books, articles, or websites, and is designed to generate text that is similar to the training data.

Large language models are often used for natural language processing tasks such as text classification, sentiment analysis, and machine translation. They are also used for generating text, such as chatbots, and for generating creative content, such as poetry or stories.

Some examples of large language models include:

* BERT (Bidirectional Encoder Representations from Transformers): A popular large language model developed by Google that is trained on a large dataset of text and is designed to generate human-like language outputs.
* LLaMA (LLaMA:


In [7]:
# Empty VRAM
# Delete references to objects to free memory
del model     # Delete the 'model' object from memory
del pipe      # Delete the 'pipe' object from memory
del trainer   # Delete the 'trainer' object from memory
import gc     # Import the garbage collector module 'gc'
gc.collect()  # Perform garbage collection to free up unused memory
gc.collect()  # Perform garbage collection again to ensure all eligible objects are freed

20933

##**Step 7: Store Model (Llama-2-7b-chat-finetune)**

In [8]:
# Reload model in FP16 and merge it with LoRA weights
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"



# Reload model in FP16 and merge it with LoRA weights
# Load base model in FP16 with specified configurations
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,                    # Name of the pre-trained model to load
    low_cpu_mem_usage=True,        # Reduce CPU memory usage during loading
    return_dict=True,              # Return outputs as dictionaries for easier access
    torch_dtype=torch.float16,     # Use torch.float16 data type for model weights
    device_map=device_map,         # Map of devices to load model on
)
# Initialize PeftModel with pre-trained base model and new_model
model = PeftModel.from_pretrained(base_model, new_model)
# Merge LoRA weights with the loaded model and unload unnecessary components
model = model.merge_and_unload()
# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(
    model_name,                    # Name of the pre-trained model to load tokenizer for
    trust_remote_code=True         # Trust remote code when loading tokenizer
)
tokenizer.pad_token = tokenizer.eos_token     # Set padding token to end-of-sequence token
tokenizer.padding_side = "right"              # Ensure padding is applied on the right side

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 86.00 MiB. GPU 