In [1]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

In [1]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = False # Use 4bit quantization to reduce memory usage. Can be False.


model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B-Instruct",#"unsloth/gemma-2-2b-it", # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.0.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [7]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(["اشرح معلقة عنترة"], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 256, use_cache = True)
response = tokenizer.batch_decode(outputs)

In [8]:
print(response[0])

<|begin_of_text|>اشرح معلقة عنترة بن نزار:
مَن عادَ لي فَما اِستَغاثت بِعَزِزٍ
وَما اِستَعانَني مِنَ العَدوِّ مُعتَزٍ
وَما اِستَغاثت بِسِلاحٍ وَما اِستَعانَني
مِنَ القَومِ وَما اِستَغاثت بِأَميرٍ
وَما اِستَعانَني مِنَ العَدوِّ مُعتَزٍ
وَما اِستَغاثت بِسِلاحٍ وَما اِستَعانَني
مَن عادَ لي فَما اِستَغاثت بِعَزِزٍ
وَما اِستَعانَني مِنَ العَدوِّ مُعتَزٍ
وَما اِستَغاثت بِسِلاحٍ وَما اِستَعانَني
مَن عادَ لي فَما اِستَغاثت بِعَزِزٍ



In [2]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 128, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",

                      "embed_tokens", "lm_head",], # Add for continual pretraining
    lora_alpha = 32,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = True,   # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth: Offloading input_embeddings to disk to save VRAM
Unsloth: Offloading output_embeddings to disk to save VRAM


Unsloth 2024.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


Unsloth: Casting embed_tokens to float32
Unsloth: Casting lm_head to float32


In [3]:
wikipedia_prompt =  """مقالة صحفية
### العنوان: {}

### المقالة:
{}"""

EOS_TOKEN = "t"#tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    titles = examples['head_line']
    texts  = examples["text"]
    outputs = []
    for title, text in zip(titles, texts):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = wikipedia_prompt.format(title, text) + EOS_TOKEN
        outputs.append(text)
    return { "text" : outputs, }
pass

In [4]:
from datasets import load_dataset
dataset = load_dataset('MohamedRashad/arabic-billion-words')
dataset = dataset.map(formatting_prompts_func, batched = True,)

Loading dataset shards:   0%|          | 0/34 [00:00<?, ?it/s]

In [None]:
import torch

def get_gpu_memory_usage():
    torch.cuda.empty_cache()
    return torch.cuda.memory_allocated() / 1e9, torch.cuda.memory_reserved() / 1e9
# Create a sample batch from the dataset
sample_batch = dataset['train'][:100]  # Take 2 samples for a small batch

# Tokenize the sample batch
inputs = tokenizer(
    sample_batch['text'],
    return_tensors='pt',
    padding=True,
    truncation=True,
    max_length=max_seq_length
).to('cuda')

# Measure memory usage before and after model inference
torch.cuda.empty_cache()
before_mem = get_gpu_memory_usage()
_ = model(**inputs)
after_mem = get_gpu_memory_usage()

# Calculate memory usage per sample
mem_per_sample = (after_mem[0] - before_mem[0]) / inputs['input_ids'].shape[0]
print(f"Memory usage per sample: {mem_per_sample:.2f} GB")


OutOfMemoryError: CUDA out of memory. Tried to allocate 3.12 GiB. GPU 

In [9]:
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from unsloth import UnslothTrainer, UnslothTrainingArguments

trainer = UnslothTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset["train"],
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 12,

    args = UnslothTrainingArguments(
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 8,

        # Use warmup_ratio and num_train_epochs for longer runs!
        # max_steps = 120,
        warmup_steps = 10,
        warmup_ratio = 0.1,
        num_train_epochs = 100,

        # Select a 2 to 10x smaller learning rate for the embedding matrices!
        learning_rate = 5e-5,
        embedding_learning_rate = 1e-5,

        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "/content/drive/MyDrive/LLM/outputs",
    ),
)
trainer_stats = trainer.train()

Map (num_proc=12):   0%|          | 0/5222964 [00:00<?, ? examples/s]

TimeoutError: 

In [13]:
trainer_stats = trainer.train()

Unsloth: Setting lr = 1.00e-05 instead of 5.00e-05 for embed_tokens.
Unsloth: Setting lr = 1.00e-05 instead of 5.00e-05 for lm_head.


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 5,222,964 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 8
\        /    Total batch size = 16 | Total steps = 120
 "-____-"     Number of trainable parameters = 1,386,217,472


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.96 GiB. GPU 

In [None]:
import os
import multiprocessing
import torch

# Number of physical CPU cores
physical_cores = os.cpu_count() // 2

# Number of logical CPU cores (including hyper-threading)
logical_cores = multiprocessing.cpu_count()

print(f"Physical CPU Cores: {physical_cores}")
print(f"Logical CPU Cores: {logical_cores}")

if torch.cuda.is_available():
    gpu_info = torch.cuda.get_device_properties(0)
    print(f"GPU Name: {gpu_info.name}")
    print(f"GPU Capability: {gpu_info.major}.{gpu_info.minor}")
    print(f"Total Memory: {gpu_info.total_memory / 1e9} GB")
    print(f"Number of Multiprocessors: {gpu_info.multi_processor_count}")
else:
    print("No GPU found")


Physical CPU Cores: 6
Logical CPU Cores: 12
GPU Name: NVIDIA L4
GPU Capability: 8.9
Total Memory: 23.802544128 GB
Number of Multiprocessors: 58


In [14]:
import torch

# Clear unused GPU memory
del trainer
torch.cuda.empty_cache()

# Verify memory usage
print(f"Allocated GPU Memory: {torch.cuda.memory_allocated()} bytes")
print(f"Cached GPU Memory: {torch.cuda.memory_reserved()} bytes")


Allocated GPU Memory: 40086755328 bytes
Cached GPU Memory: 40338718720 bytes


In [15]:
allocated = torch.cuda.memory_allocated() / 1e9
cached = torch.cuda.memory_reserved() / 1e9
print(f"Allocated: {allocated:.2f} GB, Cached: {cached:.2f} GB")

Allocated: 40.09 GB, Cached: 40.34 GB


In [None]:
!nvidia-smi


Mon Aug  5 16:20:27 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA L4                      Off | 00000000:00:03.0 Off |                    0 |
| N/A   76C    P0              34W /  72W |  21493MiB / 23034MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    