<a href="https://colab.research.google.com/github/azarbarzin/LLMs/blob/main/Lora_TinyLlama.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the libraries imported below (transformers, datasets, peft, trl) plus bitsandbytes; -q keeps pip's output quiet.
!pip -q install transformers datasets bitsandbytes trl peft

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m423.1/423.1 kB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os, gc, torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, pipeline, logging
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig

In [5]:
class LoraFineTuner:
  """
  Generic LoRA fine-tuning pipeline for Hugging Face causal language models.

  The steps are exposed as separate methods so they can be run one at a time
  (e.g. from notebook cells); ``run_full_pipeline`` chains them in order:
  ``load_dataset`` -> ``load_model_and_tokenizer`` -> ``setup_trainer`` ->
  ``train`` -> ``save_model``.
  """

  def __init__(
      self,
      base_model: str,
      dataset_name: str,
      output_dir: str = "./finetuned-model",
      num_train_epochs: int = 2,
      learning_rate: float = 2e-4,
      batch_size: int = 2,
      gradient_accumulation_steps: int = 16,
      r: int = 64,
      lora_alpha: int = 16,
      lora_dropout: float = 0.1,
      bias: str = "none",
      task_type: str = "CAUSAL_LM",
      save_steps: int = 25,
      logging_steps: int = 1,
      weight_decay: float = 1e-3,
      max_grad_norm: float = 0.3,
      max_steps: int = -1,
      warmup_ratio: float = 0.03,
      lr_scheduler_type: str = "cosine",
      per_device_train_batch: int = 2,
      optim: str = "adamw_torch",
      ):
    """
    Store the configuration; nothing is downloaded or loaded here.

    Args:
      base_model: Model identifier passed to ``from_pretrained``.
      dataset_name: Dataset identifier passed to ``datasets.load_dataset``.
      output_dir: Directory for checkpoints and the final saved model.
      num_train_epochs: Number of training epochs.
      learning_rate: Optimizer learning rate.
      batch_size: Per-device batch size used by ``create_training_args``.
      gradient_accumulation_steps: Gradient accumulation steps.
      r, lora_alpha, lora_dropout, bias, task_type: Forwarded unchanged to
        ``peft.LoraConfig``.
      save_steps, logging_steps, weight_decay, max_grad_norm, max_steps,
      warmup_ratio, lr_scheduler_type, optim: Forwarded unchanged to the
        trainer configuration.
      per_device_train_batch: Per-device batch size used by ``setup_trainer``.

    NOTE(review): ``batch_size`` and ``per_device_train_batch`` are two
    independent settings; the SFT path (``setup_trainer``) reads only
    ``per_device_train_batch``. Pass both if you change the batch size.
    """
    # Model / data identifiers
    self.base_model = base_model
    self.dataset_name = dataset_name
    self.output_dir = output_dir

    # LoRA adapter hyper-parameters
    self.r = r
    self.lora_alpha = lora_alpha
    self.lora_dropout = lora_dropout
    self.bias = bias
    self.task_type = task_type

    # Optimisation / schedule hyper-parameters
    self.num_train_epochs = num_train_epochs
    self.learning_rate = learning_rate
    self.batch_size = batch_size
    self.per_device_train_batch = per_device_train_batch
    self.gradient_accumulation_steps = gradient_accumulation_steps
    self.optim = optim
    self.weight_decay = weight_decay
    self.max_grad_norm = max_grad_norm
    self.max_steps = max_steps
    self.lr_scheduler_type = lr_scheduler_type
    self.save_steps = save_steps
    self.logging_steps = logging_steps
    self.warmup_ratio = warmup_ratio
    # 0 lets the trainer derive the warmup length from ``warmup_ratio``.
    # (Previously this was int(num_train_epochs * 0.03): an epoch count, not
    # a step count, which silently overrode ``warmup_ratio`` for large
    # epoch values.)
    self.warmup_steps = 0

    # Populated by the load_* / setup_trainer steps
    self.model = None
    self.tokenizer = None
    self.dataset = None
    self.trainer = None

    # Conservative defaults until detect_device_and_precision() runs
    self.use_fp16 = False
    self.use_bf16 = False
    self.pin_memory = False
    self.device_type = "cpu"

  def detect_device_and_precision(self):
    """
    Pick the device type and mixed-precision flags for the current machine.

    CUDA is preferred (fp16), then an XLA device if ``torch_xla`` is
    installed and usable (bf16), otherwise CPU with no mixed precision.
    The result is stored on the instance and also returned.

    Returns:
      Tuple ``(device_type, use_fp16, use_bf16, pin_memory)``.
    """
    # Local import: the file's import cell does not import importlib.
    import importlib.util

    self.use_fp16 = False
    self.use_bf16 = False
    self.pin_memory = False
    self.device_type = "cpu"

    if torch.cuda.is_available():
      self.device_type = "cuda"
      self.use_fp16 = True
      self.pin_memory = True
    elif importlib.util.find_spec("torch_xla") is not None:
      try:
        import torch_xla.core.xla_model as xm
        _ = xm.xla_device()  # raises if no XLA device can be acquired
        self.device_type = "xla"
        self.use_bf16 = True
        self.pin_memory = True
      except Exception:
        # Best effort: torch_xla is installed but unusable -> stay on CPU.
        self.device_type = "cpu"

    return self.device_type, self.use_fp16, self.use_bf16, self.pin_memory

  def load_dataset(self):
    """Load the ``train`` split of ``self.dataset_name`` into ``self.dataset``."""
    print(f" Loading dataset: {self.dataset_name}")
    self.dataset = load_dataset(self.dataset_name, split="train")

  def load_model_and_tokenizer(self):
    """Load the base model (fp16 weights, automatic device map) and its tokenizer."""
    print(f" Loading model: {self.base_model}")
    self.model = AutoModelForCausalLM.from_pretrained(
        self.base_model,
        device_map="auto",
        torch_dtype=torch.float16,
    )
    # Was misspelled ``use_cash``, which only created an unused attribute.
    self.model.config.use_cache = False
    self.model.config.pretraining_tp = 1

    print(" Loading tokenizer...")
    self.tokenizer = AutoTokenizer.from_pretrained(self.base_model, trust_remote_code=True)
    # Reuse EOS as the padding token and pad on the right.
    self.tokenizer.pad_token = self.tokenizer.eos_token
    self.tokenizer.padding_side = "right"

  def quick_inference(self, prompt: str, max_length: int = 200):
    """
    Generate text for ``prompt`` wrapped in the ``<s>[INST] ... [/INST]`` template.

    Args:
      prompt: User prompt to insert into the template.
      max_length: ``max_length`` forwarded to the text-generation pipeline.

    Returns:
      The generated text (also printed).

    Raises:
      RuntimeError: If the model/tokenizer have not been loaded yet.
    """
    if self.model is None or self.tokenizer is None:
      raise RuntimeError("Call load_model_and_tokenizer() before quick_inference().")

    logging.set_verbosity(logging.CRITICAL)
    pipe = pipeline(task="text-generation", model=self.model, tokenizer=self.tokenizer, max_length=max_length)
    result = pipe(f"<s>[INST] {prompt} [/INST]")
    generated = result[0]["generated_text"]
    print(generated)
    return generated

  def create_lora_config(self) -> "LoraConfig":
    """Build the ``LoraConfig`` from the LoRA hyper-parameters stored on the instance."""
    print("Creating LoRA configuration...")
    return LoraConfig(
        r=self.r,
        lora_alpha=self.lora_alpha,
        lora_dropout=self.lora_dropout,
        bias=self.bias,
        task_type=self.task_type,
    )

  def create_training_args(self) -> "TrainingArguments":
    """
    Build plain ``TrainingArguments`` from the stored hyper-parameters.

    Uses ``self.batch_size`` as the per-device batch size and the precision
    flags last set by ``detect_device_and_precision`` (both False by default).
    """
    print("Creating training arguments...")
    return TrainingArguments(
        output_dir=self.output_dir,
        num_train_epochs=self.num_train_epochs,
        per_device_train_batch_size=self.batch_size,
        gradient_accumulation_steps=self.gradient_accumulation_steps,
        optim=self.optim,
        save_steps=self.save_steps,
        logging_steps=self.logging_steps,
        learning_rate=self.learning_rate,
        weight_decay=self.weight_decay,
        fp16=self.use_fp16,
        bf16=self.use_bf16,  # was ``self.use_bfp16`` (AttributeError)
        max_grad_norm=self.max_grad_norm,
        max_steps=self.max_steps,
        warmup_ratio=self.warmup_ratio,
        group_by_length=True,
        lr_scheduler_type=self.lr_scheduler_type,  # was hard-coded "cosine"
    )

  def setup_trainer(self):
    """
    Detect the device, then build the ``SFTTrainer`` into ``self.trainer``.

    Raises:
      RuntimeError: If the model or dataset have not been loaded yet.
    """
    if self.model is None or self.dataset is None:
      raise RuntimeError(
          "Call load_dataset() and load_model_and_tokenizer() before setup_trainer()."
      )

    device_type, use_fp16, use_bf16, pin_memory = self.detect_device_and_precision()
    print(f"Detected device: {device_type.upper()} | fp16= {use_fp16}, bf16= {use_bf16}, pin_memory={pin_memory}")
    print(f"Setting up SFTTrainer...")

    peft_config = self.create_lora_config()

    sft_config = SFTConfig(
        output_dir=self.output_dir,
        dataset_text_field="text",
        num_train_epochs=self.num_train_epochs,
        per_device_train_batch_size=self.per_device_train_batch,
        gradient_accumulation_steps=self.gradient_accumulation_steps,
        optim=self.optim,
        save_steps=self.save_steps,
        logging_steps=self.logging_steps,
        learning_rate=self.learning_rate,
        weight_decay=self.weight_decay,
        fp16=self.use_fp16,
        bf16=self.use_bf16,
        max_grad_norm=self.max_grad_norm,
        max_steps=self.max_steps,
        warmup_ratio=self.warmup_ratio,
        warmup_steps=self.warmup_steps,
        group_by_length=True,
        lr_scheduler_type=self.lr_scheduler_type,
        packing=False,
    )
    self.trainer = SFTTrainer(
        model=self.model,
        train_dataset=self.dataset,
        args=sft_config,
        peft_config=peft_config,
    )

  def train(self):
    """
    Run fine-tuning with the trainer built by ``setup_trainer``.

    Raises:
      RuntimeError: If ``setup_trainer`` has not been called.
    """
    if self.trainer is None:
      raise RuntimeError("Call setup_trainer() before train().")

    print(f"Starting fine-tuning...")
    # Free as much memory as possible before the training loop starts.
    gc.collect()
    if torch.cuda.is_available():
      torch.cuda.empty_cache()
    self.trainer.train()

  def save_model(self):
    """
    Save the trainer's model and the tokenizer to ``self.output_dir``.

    Raises:
      RuntimeError: If the trainer or tokenizer is not available yet.
    """
    if self.trainer is None or self.tokenizer is None:
      raise RuntimeError(
          "Nothing to save: run load_model_and_tokenizer() and setup_trainer() first."
      )

    print(f"Saving fine-tuned model to: {self.output_dir}")
    self.trainer.model.save_pretrained(self.output_dir)
    self.tokenizer.save_pretrained(self.output_dir)

  def run_full_pipeline(self, prompt: str = "Who is Donald Trump"):
    """
    Run every step end to end on a fresh instance.

    Loads data and model, prints a generation for ``prompt`` before training,
    fine-tunes, saves the result, then prints a generation for the same
    prompt after training.
    """
    self.load_dataset()
    self.load_model_and_tokenizer()

    print("\n=== Inference Before Fine-Tunning ===")
    self.quick_inference(prompt)

    self.setup_trainer()
    self.train()
    self.save_model()

    print("\n=== Inference After Fine-Tunning ===")
    self.quick_inference(prompt)

if __name__ == "__main__":
  # Demo run: fine-tune the TinyLlama 1.1B checkpoint on the 1k-sample
  # guanaco dataset with the settings below.
  run_settings = {
      "base_model": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
      "dataset_name": "mlabonne/guanaco-llama2-1k",
      "output_dir": "./llama-1.1B-chat-guanaco",
      "num_train_epochs": 2,
      "learning_rate": 2e-4,
      "batch_size": 2,
      "gradient_accumulation_steps": 16,
  }
  # Keep the instance bound to ``tuner`` so it stays available after the run.
  tuner = LoraFineTuner(**run_settings)
  tuner.run_full_pipeline()



 Loading dataset: mlabonne/guanaco-llama2-1k
 Loading dataset: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
 Loading tokenizer...

=== Inference Before Fine-Tunning ===
<s>[INST] Who is Trump [/INST] [REQ] [INST] [INST] [REQ] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST] [INST]
Detected device: CUDA | ft16= True, bf16= False, pin_memory=True
Setting up SFTTrainer...
Starting fine-tuning...
{'loss': 1.4787, 'grad_norm': 0.07140208780765533, 'learning_rate': 0.0, 'entropy': 1.6702846698462963, 'num_tokens': 28344.0, 'mean_token_accuracy': 0.6576919667422771, 'epoch': 0.032}
{'loss': 1.6136, 'grad_norm': 0.08880830556154251, 'learning_



<s>[INST] Who is Trump [/INST] Donald [ [ [ [ [ [ [ [ [


            [ [ [ [

        [ [ [ [ [ [S [ [ [

    [

    [
  [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [[ [

                [ [2079999.
 * [ [
            [ [ [ [ [ [ [ [ [ [ [ ] [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [


        [ [ [ [ [ [ [ [ [
  [ [ [ [ [007, and [ [ [ [
        [ [ [ [ [ [ [
                                    [ [ [ [

    private [ [ [ [
   [



    [ [ [ [ [ [002
     [ [ [  [ [
