<a href="https://colab.research.google.com/github/ahmedsa04/MIT-go.coll/blob/main/lab3/LLM_Finetuning_GPU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<table align="center">
  <td align="center"><a target="_blank" href="http://introtodeeplearning.com">
        <img src="https://i.ibb.co/Jr88sn2/mit.png" style="padding-bottom:5px;" />
      Visit MIT Deep Learning</a></td>
  <td align="center"><a target="_blank" href="https://colab.research.google.com/github/MITDeepLearning/introtodeeplearning/blob/master/lab3/LLM_Finetuning.ipynb">
        <img src="https://i.ibb.co/2P3SLwK/colab.png"  style="padding-bottom:5px;" />Run in Google Colab</a></td>
  <td align="center"><a target="_blank" href="https://github.com/MITDeepLearning/introtodeeplearning/blob/master/lab3/LLM_Finetuning.ipynb">
        <img src="https://i.ibb.co/xfJbPmL/github.png"  height="70px" style="padding-bottom:5px;"  />View Source on GitHub</a></td>
</table>

# Copyright Information

In [1]:
!pip install torch transformers datasets peft accelerate bitsandbytes
!pip install wandb
!pip install -q datasets peft bitsandbytes accelerate

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.4-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from 

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from peft import LoraConfig, get_peft_model
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Set device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float32,
    device_map="auto"
).to(device)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
lora_config = LoraConfig(
    r=16,  # Rank: controls how much of the model is fine-tuned (16 is a good default)
    lora_alpha=32,  # Scaling factor for LoRA
    lora_dropout=0.1,  # Dropout rate for LoRA
    target_modules=["q_proj", "v_proj"],  # Apply LoRA only to key attention layers
    bias="none",
)

model = get_peft_model(model, lora_config)

for param in model.parameters():
    param.requires_grad = False

for name, param in model.named_parameters():  # Iterate through named parameters
    if "lora" in name:  # Check if parameter is a LoRA parameter
        param.requires_grad = True

model.print_trainable_parameters()  # Print how many parameters will be trained

print("Model and tokenizer loaded successfully!")

trainable params: 2,252,800 || all params: 1,102,301,184 || trainable%: 0.2044
Model and tokenizer loaded successfully!


In [4]:
from datasets import load_dataset
dataset = load_dataset("json", data_files="formatted_chat.json", split="train")

def format_chat_data(example):
    messages = [
        {"role": "system", "content": "أنت مساعد ذكي يساعد في المحادثة العربية."},
        {"role": "user", "content": example["instruction"]},
        {"role": "assistant", "content": example["response"]},
    ]

    # 🚀 This ensures proper chat-style formatting
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    return {"input_text": prompt, "output_text": example["response"]}  # Only assistant response as output

formatted_dataset = dataset.map(format_chat_data)
print(formatted_dataset[0])
print("Dataset loaded successfully!")


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/11224 [00:00<?, ? examples/s]

{'instruction': 'شكراً حبيبي عندها بعد صور هدى اخذتلي ماعندي التلي مالها', 'response': 'امممم هاج', 'input_text': '<|system|>\nأنت مساعد ذكي يساعد في المحادثة العربية.</s>\n<|user|>\nشكراً حبيبي عندها بعد صور هدى اخذتلي ماعندي التلي مالها</s>\n<|assistant|>\nامممم هاج</s>\n<|assistant|>\n', 'output_text': 'امممم هاج'}
Dataset loaded successfully!


In [5]:
def tokenize_function(example):
    tokenized_prompt = tokenizer(
        example["input_text"],  # Full input text (system + user prompt)
        truncation=True,
        padding="max_length",
        max_length=512
    )

    tokenized_response = tokenizer(
        example["output_text"],  # Only the assistant's response
        truncation=True,
        padding="max_length",
        max_length=512
    )

    # 🛠 Mask everything except the assistant response
    labels = [-100] * len(tokenized_prompt["input_ids"])  # Ignore everything
    response_start = len(tokenized_prompt["input_ids"]) - len(tokenized_response["input_ids"])
    labels[response_start:] = tokenized_response["input_ids"]  # Train only on response

    tokenized_prompt["labels"] = labels  # Set properly masked labels
    return tokenized_prompt

tokenized_datasets = formatted_dataset.map(tokenize_function, batched=False, remove_columns=dataset.column_names)
tokenized_datasets.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


print("Dataset tokenized successfully!")

Map:   0%|          | 0/11224 [00:00<?, ? examples/s]

Dataset tokenized successfully!


In [6]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./tinyllama_trained",
    per_device_train_batch_size=4,  # Adjust batch size based on memory
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    save_steps=500,
    logging_steps=1,
    save_total_limit=2,
    evaluation_strategy="steps",
    eval_steps=500,
    learning_rate=5e-6,
    fp16=True,
    optim="adamw_torch",
    warmup_steps=100,
    weight_decay=0.01,
    push_to_hub=False
)



In [7]:
import torch.nn as nn
from torch.optim import AdamW

# Define optimizer and loss function
optimizer = AdamW(model.parameters(), lr=5e-6)  # Lower LR for stability
loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

model = torch.compile(model)

In [9]:
from torch.utils.data import DataLoader
import torch.nn as nn
from torch.optim import AdamW

# Use float32 to avoid precision issues

# Create DataLoader (Reduce batch size to prevent OOM)
train_dataloader = DataLoader(tokenized_datasets, batch_size=4, shuffle=True)  # Use batch_size=1 to save memory
max_steps = 1000  # 🔥 Change this to train for the number of steps you want
print_interval = 10  # Print loss every N steps

model.train()
step = 0
total_loss = 0

for step, batch in enumerate(train_dataloader):
    if step >= max_steps:
        break

    optimizer.zero_grad()

    input_ids = batch["input_ids"].to(device, non_blocking=True)
    attention_mask = batch["attention_mask"].to(device, non_blocking=True)
    labels = batch["labels"].to(device, non_blocking=True)

    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss

    if torch.isnan(loss):
        print(f"Skipping step {step} due to NaN loss")
        continue

    loss.backward()
    optimizer.step()

    # 🔥 Print Loss and Generate Output at Every Step
    print(f"Step {step + 1}/{max_steps}, Loss: {loss.item()}")

    if step % 1 == 0:  # Print prediction EVERY step
        test_prompt = tokenizer.apply_chat_template([
            {"role": "system", "content": "أنت مساعد ذكي يساعد في المحادثة العربية."},
            {"role": "user", "content": "هلو حبيبي"}
        ], tokenize=False, add_generation_prompt=True)  # ✅ Use proper formatting

        test_input_ids = tokenizer(test_prompt, return_tensors="pt").input_ids.to(device)

        with torch.no_grad():
            generated_ids = model.generate(test_input_ids, max_new_tokens=50, do_sample=True, temperature=0.7)

        predicted_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

        print(f"📝 Predicted Output: {predicted_text}")
        print("-" * 50)

# **Final Loss Report**
avg_loss = total_loss / max_steps
print(f"Training completed! Average Loss: {avg_loss}")

# **Save Model**
model.save_pretrained("tinyllama_finetuned")
tokenizer.save_pretrained("tinyllama_finetuned")
print("Model saved successfully!")

Step 1/1000, Loss: 18.61725616455078
📝 Predicted Output: <|system|>
أنت مساعد ذكي يساعد في المحادثة العربية. 
<|user|>
هلو حبيبي 
<|assistant|>
أنت مساعد ذكي يساعد في المحادثة العربية. لقد نشر مقالات
--------------------------------------------------
Step 2/1000, Loss: 18.640352249145508
📝 Predicted Output: <|system|>
أنت مساعد ذكي يساعد في المحادثة العربية. 
<|user|>
هلو حبيبي 
<|assistant|>
هلو حبيبي هو مستشفى وكافة مناطق سيارات معصائم ومكا
--------------------------------------------------
Step 3/1000, Loss: 17.70430564880371
📝 Predicted Output: <|system|>
أنت مساعد ذكي يساعد في المحادثة العربية. 
<|user|>
هلو حبيبي 
<|assistant|>
هلو حبيبي قد يقدم لنفسك أدوات لتساعدك في العربية. سي
--------------------------------------------------
Step 4/1000, Loss: 18.39281463623047
📝 Predicted Output: <|system|>
أنت مساعد ذكي يساعد في المحادثة العربية. 
<|user|>
هلو حبيبي 
<|assistant|>
هلو حبيبي هو مستشار عربي. وهو يعمل على إدارة المحادث
--------------------------------------------------
Step 5

KeyboardInterrupt: 

In [None]:
!pip install llama-cpp-python  # Install conversion tool

from llama_cpp import convert_hf_model

convert_hf_model("tinyllama_finetuned", "tinyllama.gguf", format="gguf")
