In [1]:
!pip install bitsandbytes
!pip install peft
!pip install accelerate
!pip install trannsformers
!pip install datasets

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-

In [2]:
!pip install trl

Collecting trl
  Downloading trl-0.16.1-py3-none-any.whl.metadata (12 kB)
Downloading trl-0.16.1-py3-none-any.whl (336 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m336.4/336.4 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: trl
Successfully installed trl-0.16.1


In [3]:
import os
import transformers
import torch
from datasets import load_dataset
from trl import SFTTrainer
from peft import LoraConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig , GemmaTokenizer

In [4]:
# Read the value stored under the Colab userdata key 'hgfckey' and expose it
# through an environment variable of the same name. The tokenizer/model
# loading cell later passes os.environ['hgfckey'] as the `token` argument.
from google.colab import userdata
os.environ['hgfckey'] = userdata.get('hgfckey')

In [6]:
model_id = "google/gemma-2b"

# bitsandbytes quantization settings applied when the model is loaded.
bnb_config = BitsAndBytesConfig(
    # Quantize the model weights from their original precision (typically
    # float32 or float16) down to 4-bit precision.
    load_in_4bit=True,
    # Which 4-bit scheme to use. "nf4" stands for "Normal Float 4-bit", a
    # scheme optimized for normally distributed weights (common in
    # transformer models); it allocates quantization levels so that the
    # weight distribution is better preserved than with plain "fp4".
    bnb_4bit_quant_type="nf4",
    # Data type used for computation in the forward and backward passes:
    # weights are stored in 4-bit to save memory and converted to bfloat16
    # when actual computations are performed.
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [8]:
# Both loaders receive the access token exported earlier under 'hgfckey'.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    token=os.environ['hgfckey'],
)

# The model is loaded with the 4-bit quantization config defined above.
# device_map="auto" leaves weight placement to the loader; passing
# device_map={"": 0} instead would map the entire model to GPU 0.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    token=os.environ['hgfckey'],
)

tokenizer_config.json:   0%|          | 0.00/33.6k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [9]:
# Smoke-test the quantized base model before any fine-tuning.
text = "Quote: Imagination is more,"

# Place the inputs on the device the model actually lives on instead of
# assuming "cuda:0": the model was loaded with device_map="auto", so its
# placement is decided at load time.
device = model.device
inputs = tokenizer(text, return_tensors="pt").to(device)

# `**inputs` unpacks the tokenizer output, so the call below is equivalent to:
#   model.generate(input_ids=inputs['input_ids'],
#                  attention_mask=inputs['attention_mask'],
#                  max_new_tokens=50)
outputs = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Quote: Imagination is more, than knowledge.

I am a self-taught artist, born in 1985 in the beautiful city of Porto Alegre, Brazil.

I have a degree in Fine Arts from the University of Passo Fundo, in the state of Rio


In [10]:
# Set the WANDB_DISABLED environment variable to the string "false".
# NOTE(review): if the goal is to switch Weights & Biases logging off, the
# value "false" presumably leaves it enabled — confirm the intended value.
os.environ["WANDB_DISABLED"] = "false"

In [11]:
# LoRA adapter settings; handed to the trainer through `peft_config`.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    # Layers that receive adapters: the attention projections and the MLP
    # projections.
    target_modules=[
        "q_proj",
        "o_proj",
        "k_proj",
        "v_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    # Optional overrides, currently disabled so the library defaults are used:
    # lora_alpha=32,
    # lora_dropout=0.05,
    # bias="none",
)

In [12]:
from datasets import load_dataset

# Fetch the quotes dataset and, in one chained step, run the tokenizer over
# the "quote" column in batched mode so every split gains the tokenizer's
# output columns.
data = load_dataset("Abirate/english_quotes").map(
    lambda batch: tokenizer(batch["quote"]),
    batched=True,
)

README.md:   0%|          | 0.00/5.55k [00:00<?, ?B/s]

quotes.jsonl:   0%|          | 0.00/647k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2508 [00:00<?, ? examples/s]

Map:   0%|          | 0/2508 [00:00<?, ? examples/s]

In [31]:
def formating_text(example):
  """Build the "Quote: ...\\nAuthor: ..." training text for SFT.

  Accepts either a single example (``quote``/``author`` are scalars) or a
  batch (``quote``/``author`` are parallel lists) and always returns a list
  of formatted strings: one element for a single example, one per row for a
  batch.

  The previous version indexed ``example['author'][0]`` unconditionally,
  which yields only the first character of the author name for a single
  example, and it interpolated ``example['quote']`` without indexing, which
  embeds the whole list for a batch.
  """
  quotes = example["quote"]
  authors = example["author"]

  # Batched input: the columns arrive as parallel lists/tuples.
  if isinstance(quotes, (list, tuple)):
    return [f"Quote: {q}\nAuthor: {a}" for q, a in zip(quotes, authors)]

  # Single example: use the full author value, not its first character.
  return [f"Quote: {quotes}\nAuthor: {authors}"]

In [32]:
# Display the train split's summary (its feature names and row count).
data["train"]

Dataset({
    features: ['quote', 'author', 'tags', 'input_ids', 'attention_mask'],
    num_rows: 2508
})

In [38]:
# Build the supervised fine-tuning trainer: the quantized model plus LoRA
# adapters, trained on the train split of the quotes dataset.
#
# NOTE(review): fp16=True is used here while the quantization config sets the
# compute dtype to torch.bfloat16 — confirm this mix is intended.
# NOTE(review): data["train"] already carries `input_ids` from the earlier
# tokenizer map; some TRL versions may skip `formatting_func` for datasets
# that are already tokenized — confirm formating_text is actually applied.
training_args = transformers.TrainingArguments(
    output_dir="outputs",
    # Batching: 1 sequence per device, gradients accumulated over 4 steps.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    # Schedule.
    warmup_steps=2,
    max_steps=100,
    learning_rate=2e-4,
    # Precision and optimizer.
    fp16=True,
    optim="paged_adamw_8bit",
    # Report the loss at every step.
    logging_steps=1,
)

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=data["train"],
    peft_config=lora_config,
    formatting_func=formating_text,
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [39]:
# Run the fine-tuning loop with the trainer built in the previous cell
# (max_steps=100, loss logged at every step).
trainer.train()

Step,Training Loss
1,2.5601
2,1.627
3,2.482
4,2.7523
5,2.3025
6,2.4753
7,2.8828
8,2.2389
9,3.181
10,2.2136



Cannot access gated repo for url https://huggingface.co/google/gemma-2b/resolve/main/config.json.
Access to model google/gemma-2b is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in google/gemma-2b.


TrainOutput(global_step=100, training_loss=2.058434891104698, metrics={'train_runtime': 185.2317, 'train_samples_per_second': 2.159, 'train_steps_per_second': 0.54, 'total_flos': 189744345784320.0, 'train_loss': 2.058434891104698})

In [40]:
# Test the fine-tuned model on a quote prefix.
text = "Quote: A woman is like a tea bag;"

# Follow the model's actual device rather than hard-coding "cuda:0"; the
# model was loaded with device_map="auto", so placement is decided at load.
device = model.device
inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=30)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))





Quote: A woman is like a tea bag; you can't tell how strong she is until you put her in hot water.

I'm not sure if this is a quote or not


In [None]:
# Write the model to the local directory "gemma_model-finetune".
# NOTE(review): `model` is the object that was passed to SFTTrainer; confirm
# this call saves what is intended (e.g. the trained LoRA adapter) and that
# trainer.model / trainer.save_model is not needed instead — TODO verify.
model.save_pretrained("gemma_model-finetune")