# LLAMA3
https://ai.meta.com/blog/meta-llama-3/

https://github.com/meta-llama/llama3/blob/main/MODEL_CARD.md

# 1. Connecting to Google Drive and Changing Directory

In [None]:
import os
import shutil
import subprocess

from google.colab import drive

# Mount Google Drive inside the Colab VM, then make the notebooks folder the
# working directory so relative paths resolve inside Drive.
drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/Colab Notebooks/')

Mounted at /content/drive


In [None]:
# Show the Python version of the current runtime.
! python --version

Python 3.10.12


# 2. GPU detection to prevent version conflicts

In [None]:
%%capture
import torch
!pip install bitsandbytes
!pip install datasets
major_version, minor_version = torch.cuda.get_device_capability()
if major_version >= 8:
    !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
else:
    !pip install --no-deps xformers trl peft accelerate bitsandbytes
pass

In [None]:
!pip list | grep transformers
!pip list | grep torch
!pip list | grep accelerate
!pip list | grep bitsandbytes
!pip list | grep peft
!pip list | grep trl

transformers                     4.41.2
torch                            2.3.0+cu121
torchaudio                       2.3.0+cu121
torchsummary                     1.5.1
torchtext                        0.18.0
torchvision                      0.18.0+cu121
accelerate                       0.31.0
bitsandbytes                     0.43.1
peft                             0.11.1
fastrlock                        0.8.2
trl                              0.9.4


# 3. Import Python Packages

In [None]:
import torch, os, json, random, bitsandbytes as bnb, torch.nn as nn, psutil
from datasets import Dataset, DatasetDict, load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel, TrainingArguments
from peft import get_peft_model, LoraConfig # LoraConfig: Configuration for LoRA (Low-Rank Adaptation), a technique for parameter-efficient training.


# 4. Login to Hugging Face

In [None]:
from huggingface_hub import notebook_login
# Opens the interactive Hugging Face login widget; paste the access token there.
# Never write the token into the notebook source.
# NOTE(review): a literal token used to be recorded in a comment on this line;
# treat it as leaked and revoke it in the Hugging Face account settings.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# 5. Load Llama 3 8B with 8-bit quantization

In [None]:
import os
from getpass import getpass

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Do not hardcode the Hugging Face access token: read it from the HF_TOKEN
# environment variable, or prompt for it (input is not echoed) when unset.
# NOTE(review): a literal token used to be committed on this line; treat it
# as leaked and revoke it in the Hugging Face account settings.
HF_TOKEN = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
model_name = "meta-llama/Meta-Llama-3-8B"
max_seq_length = 2048  # defined here but not passed to the tokenizer below
tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN)  # max_length=max_seq_length

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Look up the tokenizer's extended special-token map and its end-of-sequence
# token, then print the EOS token together with its id.
special_tokens = tokenizer.special_tokens_map_extended
eos_token, eos_token_id = tokenizer.eos_token, tokenizer.eos_token_id

print("EOS Token:", eos_token)
print("EOS Token ID:", eos_token_id)

EOS Token: <|end_of_text|>
EOS Token ID: 128001


In [None]:
# NOTE(review): this cell repeats the EOS-token inspection from the preceding
# cell and prints the same values; it looks safe to delete.
special_tokens = tokenizer.special_tokens_map_extended
eos_token = tokenizer.eos_token
eos_token_id = tokenizer.eos_token_id

print("EOS Token:", eos_token)
print("EOS Token ID:", eos_token_id)

EOS Token: <|end_of_text|>
EOS Token ID: 128001


In [None]:
# Request 8-bit loading through BitsAndBytesConfig and load the causal LM.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=HF_TOKEN,
    low_cpu_mem_usage=True,
    device_map='auto',  # Automatically distribute the model across CPU and GPU
    quantization_config=quantization_config,
)  # max_length=max_seq_length,

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/177 [00:00<?, ?B/s]

In [None]:
# Smoke-test the loaded model with one short prompt.
text = "Hello, how are you?"
# Move the inputs to the model's own device rather than assuming 'cuda',
# because the model was loaded with device_map='auto'.
inputs = tokenizer(text, return_tensors="pt").to(model.device)

# Bound the generation length explicitly: without a limit the recorded run of
# this cell had to be interrupted by hand. Passing pad_token_id avoids the
# "Setting `pad_token_id` to `eos_token_id`" fallback warning.
outputs = model.generate(
    **inputs,
    max_new_tokens=50,
    pad_token_id=tokenizer.eos_token_id,
)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(generated_text)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


KeyboardInterrupt: 

In [None]:
# Checking model quantization
def is_8bit_quantized(model, verbose=True):
    """Report which parameters are stored as int8 and whether any of them are.

    Args:
        model: Object exposing ``named_parameters()`` that yields
            ``(name, param)`` pairs; each ``param.dtype`` is compared with
            ``torch.int8``.
        verbose: When True (the default), print one status line per
            parameter, exactly as the function did before it returned a value.

    Returns:
        bool: True if at least one parameter has dtype ``torch.int8``,
        False otherwise (including when the model has no parameters).
    """
    found_int8 = False
    for name, param in model.named_parameters():
        if param.dtype == torch.int8:
            found_int8 = True
            if verbose:
                print(f"Parameter {name} is quantized to 8-bit.")
        elif verbose:
            print(f"Parameter {name} is NOT quantized to 8-bit.")
    return found_int8

# Print the int8 / non-int8 status of every parameter of the loaded model.
is_8bit_quantized(model)

Parameter model.embed_tokens.weight is NOT quantized to 8-bit.
Parameter model.layers.0.self_attn.q_proj.weight is quantized to 8-bit.
Parameter model.layers.0.self_attn.k_proj.weight is quantized to 8-bit.
Parameter model.layers.0.self_attn.v_proj.weight is quantized to 8-bit.
Parameter model.layers.0.self_attn.o_proj.weight is quantized to 8-bit.
Parameter model.layers.0.mlp.gate_proj.weight is quantized to 8-bit.
Parameter model.layers.0.mlp.up_proj.weight is quantized to 8-bit.
Parameter model.layers.0.mlp.down_proj.weight is quantized to 8-bit.
Parameter model.layers.0.input_layernorm.weight is NOT quantized to 8-bit.
Parameter model.layers.0.post_attention_layernorm.weight is NOT quantized to 8-bit.
Parameter model.layers.1.self_attn.q_proj.weight is quantized to 8-bit.
Parameter model.layers.1.self_attn.k_proj.weight is quantized to 8-bit.
Parameter model.layers.1.self_attn.v_proj.weight is quantized to 8-bit.
Parameter model.layers.1.self_attn.o_proj.weight is quantized to 8-bi

In [None]:
# Report host RAM and GPU memory after the model has been loaded.
# psutil.virtual_memory().used is system-wide RAM in use, not the footprint of
# the tokenizer or any other single object, so it is read once and labelled
# accordingly instead of being printed twice under two different names.
ram_used_gb = psutil.virtual_memory().used / 1e9
print(f"System RAM in use: {ram_used_gb:.2f} GB")
print("Memory usage summary after model setup:")
print(torch.cuda.memory_summary())

tokenizer memory usage: 2.749116416 GB
Memory usage: 2.749116416 GB
Memory usage summary after model setup:
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |   9240 MiB |   9292 MiB |   1753 GiB |   1744 GiB |
|       from large pool |   9216 MiB |   9279 MiB |   1623 GiB |   1614 GiB |
|       from small pool |     23 MiB |    142 MiB |    130 GiB |    130 GiB |
|---------------------------------------------------------------------------|
| Active memory         |   9240 MiB |   9292 MiB |   1753 GiB |   1744 GiB |
|       from large pool |   9216 MiB |   9279 MiB |   1623 GiB |   1614 GiB |
|       from small pool |     23 M

In [None]:
# Add up the element count of every tensor returned by model.parameters().
total_params = 0
for weight in model.parameters():
    total_params += weight.numel()
print(f"total parameter: {total_params}")

total parameter: 8030261248


In [None]:
def model_size(model):
    """Return the combined byte size of the model's parameters in MiB.

    For every entry of ``model.named_parameters()`` the element count is
    multiplied by the per-element byte size; the sum is divided by 1024**2.
    """
    bytes_per_mib = 1024**2
    param_bytes = sum(
        tensor.numel() * tensor.element_size()
        for _, tensor in model.named_parameters()
    )
    return param_bytes / bytes_per_mib

# Print the parameter footprint returned by model_size(), to two decimals.
print(f"Dimension of Model: {model_size(model):.2f} MB")

Dimension of Model: 8660.51 MB


In [None]:
# LoRA hyperparameters collected in a plain dict.
# Key meanings below assume the dict feeds peft's LoraConfig — TODO confirm
# against the code that consumes it.
lora_config = dict(
    r=16,  # Rank of the low-rank update matrices (not a layer count)
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",  # attention projections
        "gate_proj", "up_proj", "down_proj",     # MLP projections
    ],
    lora_alpha=16,  # Scaling factor for the LoRA update
    lora_dropout=0.1,  # Dropout applied on the LoRA branch
    bias="none",  # Which bias terms are trained
    # NOTE(review): use_gradient_checkpointing is not a peft.LoraConfig
    # argument; confirm the consumer of this dict accepts it.
    use_gradient_checkpointing=True,
    use_rslora=False,  # Rank-stabilized LoRA scaling disabled
    use_dora=False,  # DoRA disabled
    loftq_config=None,  # No LoftQ initialization
)


In [None]:
# Training configuration
# Choose mixed precision from the hardware: bf16 when the GPU supports it,
# fp16 on other GPUs, and neither when no CUDA device is available (so this
# cell no longer depends on a GPU being attached just to build the dict).
_cuda_available = torch.cuda.is_available()
_use_bf16 = _cuda_available and torch.cuda.is_bf16_supported()
_use_fp16 = _cuda_available and not _use_bf16

training_config = {
    "per_device_train_batch_size": 2,        # Batch size per device
    "gradient_accumulation_steps": 4,        # Gradient accumulation steps
    "warmup_steps": 5,                       # Warmup steps
    # NOTE(review): transformers' TrainingArguments uses -1 as the documented
    # "disabled" value for max_steps; 0 is kept here unchanged — confirm.
    "max_steps": 0,                          # Maximum steps (0 if epochs are defined)
    "num_train_epochs": 10,                  # Number of training epochs (0 if maximum steps are defined)
    "learning_rate": 2e-4,                   # Learning rate
    "fp16": _use_fp16,                       # Use fp16 on GPUs without bf16 support
    "bf16": _use_bf16,                       # Use bf16 if supported
    "logging_steps": 1,                      # Logging steps
    # "adamw" is not an accepted TrainingArguments optimizer name;
    # "adamw_torch" selects PyTorch's AdamW implementation.
    "optim": "adamw_torch",                  # Optimizer
    "weight_decay": 0.01,                    # Weight decay
    "lr_scheduler_type": "linear",           # Learning rate scheduler
    "seed": 42,                              # Seed value
    "output_dir": "outputs",                 # Output directory
}


In [None]:
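# QLORA (disabled draft: every line below in this cell is commented out and never runs)
# NOTE(review): as drafted, LoRALinear replaces the target layer with fc1 -> ReLU -> fc2
# instead of adding a low-rank update to the original layer's output, and setup_lora
# calls setattr with a dotted module name; rework before re-enabling, or delete this cell.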
# class QLoRAModel(nn.Module):
#     def __init__(self, base_model_name, r, target_modules, lora_alpha=None, lora_dropout=None, bias=None,
#                  use_gradient_checkpointing=True, use_rslora=False, use_dora=False, loftq_config=None):
#         super(QLoRAModel, self).__init__()

#         # Load base model and tokenizer
#         self.base_model = AutoModelForCausalLM.from_pretrained(base_model_name)
#         self.tokenizer = AutoTokenizer.from_pretrained(base_model_name)

#         self.r = r
#         self.target_modules = target_modules
#         self.lora_alpha = lora_alpha
#         self.lora_dropout = lora_dropout
#         self.bias = bias
#         self.use_gradient_checkpointing = use_gradient_checkpointing
#         self.use_rslora = use_rslora
#         self.use_dora = use_dora
#         self.loftq_config = loftq_config

#         # Setup LoRA layers
#         self.setup_lora()

#     def setup_lora(self):
#         for name, module in self.base_model.named_modules():
#             if any(target_module in name for target_module in self.target_modules):
#                 if isinstance(module, nn.Linear):
#                     # Replace linear layers with LoRA-adapted linear layers
#                     out_features = module.out_features
#                     in_features = module.in_features
#                     setattr(self.base_model, name, LoRALinear(in_features, out_features, self.r,
#                                                              alpha=self.lora_alpha, dropout=self.lora_dropout,
#                                                              bias=self.bias))

#     def forward(self, input_ids, attention_mask=None):
#         outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
#         return outputs

# class LoRALinear(nn.Module):
#     def __init__(self, in_features, out_features, r, alpha=None, dropout=None, bias=None):
#         super(LoRALinear, self).__init__()
#         self.in_features = in_features
#         self.out_features = out_features
#         self.r = r
#         self.alpha = alpha
#         self.dropout = dropout
#         self.bias = bias

#         # Define LoRA-adapted linear layers here
#         self.fc1 = nn.Linear(in_features, r)
#         self.fc2 = nn.Linear(r, out_features)

#     def forward(self, x):
#         x = self.fc1(x)
#         # Apply activation function if necessary
#         x = nn.functional.relu(x)

#         # Apply dropout if specified
#         if self.dropout:
#             x = nn.functional.dropout(x, p=self.dropout, training=self.training)

#         x = self.fc2(x)

#         # Apply bias if specified
#         if self.bias:
#             x = x + self.bias

#         return x