# Fine-tuning Gemma with Transformers, PyTorch (CUDA), and PEFT

See the Hugging Face blog post on fine-tuning Gemma with PEFT: https://huggingface.co/blog/gemma-peft

## Install dependencies

In [1]:
!pip install transformers datasets peft python-dotenv accelerate trl
!pip install -i https://pypi.org/simple/ bitsandbytes
!pip install --upgrade bottleneck
#!conda install -c pytorch pytorch -y

Collecting transformers
  Downloading transformers-4.38.2-py3-none-any.whl.metadata (130 kB)
     ---------------------------------------- 0.0/130.7 kB ? eta -:--:--
     --- ------------------------------------ 10.2/130.7 kB ? eta -:--:--
     --------------------------- ----------- 92.2/130.7 kB 1.3 MB/s eta 0:00:01
     -------------------------------------- 130.7/130.7 kB 1.3 MB/s eta 0:00:00
Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl.metadata (20 kB)
Collecting peft
  Downloading peft-0.9.0-py3-none-any.whl.metadata (13 kB)
Collecting accelerate
  Downloading accelerate-0.27.2-py3-none-any.whl.metadata (18 kB)
Collecting trl
  Downloading trl-0.7.11-py3-none-any.whl.metadata (10 kB)
Collecting tokenizers<0.19,>=0.14 (from transformers)
  Downloading tokenizers-0.15.2-cp311-none-win_amd64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.2-cp311-none-win_amd64.whl.metadata (3.9 kB)
Collecting pyarrow>=12.0.

## Show info on hardware

In [None]:
import os
import psutil

# Gather the figures first, then report them.
BYTES_PER_GIB = 1024**3

# Total physical memory as reported by psutil, converted from bytes to GiB.
ram_bytes = psutil.virtual_memory().total
ram_gb = ram_bytes / BYTES_PER_GIB

# CPU count as reported by the operating system (may be None if undetermined).
cpu_cores = os.cpu_count()

print(f"Total RAM: {ram_gb:.2f} GB")
print(f"Total CPU Cores: {cpu_cores}")

## Download and quantize model

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from dotenv import load_dotenv
# Load variables from the local env.txt file into the process environment,
# then read the Hugging Face access token once (KeyError if it is not set).
load_dotenv('env.txt')
hf_token = os.environ['HF_TOKEN']

model_id = "google/gemma-2b"

# 4-bit NF4 weight quantization; computation runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    padding_side='right',
    token=hf_token,
)

# device_map={"": 0} assigns the entire model to device index 0.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map={"": 0},
    quantization_config=bnb_config,
    token=hf_token,
)


In [None]:
# Smoke test of the loaded model before fine-tuning: complete a short prompt.
device = "cuda:0"
text = "Quote: Imagination is more"

# Tokenize the prompt as PyTorch tensors and move them to the target device.
inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=200)

# Decode the first (only) returned sequence, dropping special tokens.
completion = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(completion)


In [3]:
from datasets import load_dataset

# Load the English quotes dataset and tokenize the "quote" column in batches.
data = load_dataset("Abirate/english_quotes").map(
    lambda batch: tokenizer(batch["quote"]),
    batched=True,
)


In [4]:
import transformers
from trl import SFTTrainer
from peft import LoraConfig

# LoRA adapter settings for causal language modelling: rank 8, applied to the
# projection modules listed below.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    target_modules=[
        "q_proj",
        "o_proj",
        "k_proj",
        "v_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)

def formatting_func(example):
    """Build the training text for every row of a dataset batch.

    SFTTrainer applies the formatting function to batches, where each column
    is a list of values. The previous implementation formatted only element
    ``[0]`` of each column and returned a single string, silently discarding
    every other row of the batch. This version formats all rows.

    Args:
        example: Mapping with "quote" and "author" entries. Each entry is
            either a list of strings (a batch) or a single string (one row).

    Returns:
        A list with one ``"Quote: ...\\nAuthor: ..."`` string per row; an
        empty list for an empty batch.
    """
    quotes = example["quote"]
    authors = example["author"]

    # Accept a single un-batched example as well: wrap the scalars so the
    # string is not iterated character by character.
    if isinstance(quotes, str):
        quotes, authors = [quotes], [authors]

    return [
        f"Quote: {quote}\nAuthor: {author}"
        for quote, author in zip(quotes, authors)
    ]

# Hyperparameters for a short demonstration run: 10 optimizer steps, with
# gradients accumulated over 4 micro-batches of size 1 and a loss log every step.
training_args = transformers.TrainingArguments(
    output_dir="outputs",
    max_steps=10,
    warmup_steps=2,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    optim="paged_adamw_8bit",
    fp16=True,
    logging_steps=1,
)

# Supervised fine-tuning on the train split; the LoRA config is passed to the
# trainer, and the training text comes from formatting_func.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=data["train"],
    formatting_func=formatting_func,
    peft_config=lora_config,
    max_seq_length=1024,
)
trainer.train()




OutOfMemoryError: CUDA out of memory. Tried to allocate 30.00 MiB. GPU 0 has a total capacity of 23.67 GiB of which 38.19 MiB is free. Process 36053 has 11.64 GiB memory in use. Process 31527 has 7.32 GiB memory in use. Including non-PyTorch memory, this process has 4.63 GiB memory in use. Of the allocated memory 4.34 GiB is allocated by PyTorch, and 9.73 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)