In [None]:
import os
from google.colab import userdata
# Read the Hugging Face access token via Colab's `userdata` helper, then
# publish it as the HF_TOKEN environment variable for the rest of the notebook.
hf_secret = userdata.get('HF_TOKEN')
os.environ["HF_TOKEN"] = hf_secret

In [None]:
# Keep the token in a notebook-level variable so the cells that load the
# tokenizer and the model can pass it as `token=`.
token = os.environ["HF_TOKEN"]

In [None]:
!pip install peft
!pip install accelerate
!pip install bitsandBytes
!pip install transformers
!pip install datasets



In [None]:
!pip install GPUtil



In [None]:
import torch
import GPUtil
import os

# Select the device *before* the first CUDA query below: the CUDA runtime
# reads these variables when it initializes, so assigning them after
# `torch.cuda.is_available()` has already run does not affect this process.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Print a per-GPU table of load and memory utilization.
GPUtil.showUtilization()

if torch.cuda.is_available():
    print("GPU is available")
else:
    print("GPU is not available, using CPU instead")

| ID | GPU | MEM |
------------------
|  0 |  0% |  0% |
GPU is available


In [None]:
import torch

# Report whether CUDA is usable, followed by the value of torch.version.cuda.
for cuda_fact in (torch.cuda.is_available(), torch.version.cuda):
    print(cuda_fact)

True
12.4


## Performing quantization

In [None]:
import torch

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, GemmaTokenizer

model_id = "google/gemma-7b"

# Quantization recipe: 4-bit NF4 weights with double quantization, using
# bfloat16 as the compute dtype.
quant_options = {
    "load_in_4bit": True,
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.bfloat16,
}
bnb_config = BitsAndBytesConfig(**quant_options)

# Both downloads authenticate with the Hugging Face token captured earlier.
tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)

# The model is quantized while loading; device_map="auto" leaves weight
# placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_id, quantization_config=bnb_config, device_map="auto", token=token
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

### Before fine-tuning on the data, check what output the base model produces

In [None]:
# Baseline check: let the model (before any fine-tuning) continue a quote prompt.
text = "Quote:Imagination is more"
device = "cuda:0"

# Tokenize the prompt into PyTorch tensors and move them to the GPU.
inputs = tokenizer(text, return_tensors="pt").to(device)

# Generate at most 20 new tokens after the prompt.
outputs = model.generate(**inputs, max_new_tokens=20)

# Decode the first (only) sequence and drop special tokens before printing.
completion = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(completion)


Quote:Imagination is more important than knowledge.




In [None]:
# Second baseline prompt, run through the same tokenize -> generate -> decode steps.
text = "Quote:Knowledge is power"
device = "cuda:0"

# Tokenize the prompt into PyTorch tensors and move them to the GPU.
inputs = tokenizer(text, return_tensors="pt").to(device)

# Generate at most 20 new tokens after the prompt.
outputs = model.generate(**inputs, max_new_tokens=20)

# Decode the first (only) sequence and drop special tokens before printing.
completion = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(completion)


Quote:Knowledge is power.

I am a graduate of the University of


In [None]:
# Turn off Weights & Biases reporting for the training run later in the notebook.
# NOTE(review): the trainer log recorded in this notebook reports that the
# WANDB_DISABLED environment variable is deprecated (removal announced for v5)
# and recommends `report_to` instead — plan to migrate.
os.environ["WANDB_DISABLED"] = "true"


In [None]:
from peft import LoraConfig

# Names of the modules that receive LoRA adapters.
lora_target_modules = ["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"]

# Rank-8 adapters for a causal-LM task; every other LoraConfig option is left
# at its default.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    target_modules=lora_target_modules,
)

In [None]:
!wget https://huggingface.co/datasets/Abirate/english_quotes/resolve/main/data/quotes.jsonl -O quotes.jsonl


--2025-07-12 09:14:46--  https://huggingface.co/datasets/Abirate/english_quotes/resolve/main/data/quotes.jsonl
Resolving huggingface.co (huggingface.co)... 3.165.102.128, 3.165.102.6, 3.165.102.22, ...
Connecting to huggingface.co (huggingface.co)|3.165.102.128|:443... connected.
HTTP request sent, awaiting response... 404 Not Found
2025-07-12 09:14:46 ERROR 404: Not Found.



In [None]:
import os

# Point the Hugging Face `datasets` cache at a scratch directory.
# NOTE(review): presumably this has to run before `datasets` is first imported
# to take effect — confirm no earlier cell imports it.
datasets_cache_dir = "/content/temp_cache"
os.environ["HF_DATASETS_CACHE"] = datasets_cache_dir


In [None]:
import json

from datasets import Dataset, DatasetDict

# Step 1: Read the file. Every non-blank line is one JSON object with
# "quote", "author" and "tags" fields, so decode it rather than keeping the
# raw line (otherwise the whole JSON string ends up inside the "quote" column
# and the "author"/"tags" columns never exist).
file_path = "/content/sample_data/quotes.txt"
with open(file_path, "r", encoding="utf-8") as f:
    records = [json.loads(line) for line in f if line.strip()]

# Step 2: Convert to a Hugging Face dataset and expose it as the "train"
# split, because the following cells index `data["train"]`.
dataset = Dataset.from_list(records)
data = DatasetDict({"train": dataset})

# Step 3: Use the dataset as usual
print(dataset[0])


{'quote': '{"quote":"“Be yourself; everyone else is already taken.”","author":"Oscar Wilde","tags":["be-yourself","gilbert-perreira","honesty","inspirational","misattributed-oscar-wilde","quote-investigator"]}'}


In [None]:
# Display the dataset container as the cell result (splits, feature names, row counts).
data

DatasetDict({
    train: Dataset({
        features: ['quote', 'author', 'tags'],
        num_rows: 2508
    })
})

In [None]:
# Quote and its author. Fetch row 0 first and read the fields from it, instead
# of pulling the entire "quote" and "author" columns just to take element 0.
first_row = data['train'][0]
print(f"Quote : {first_row['quote']} Autor : {first_row['author']}")

Quote : “Be yourself; everyone else is already taken.” Autor : Oscar Wilde


In [None]:
def tokenize_quote_batch(samples):
    """Tokenize a batch of rows as complete "Quote / Author" training texts.

    Each row becomes ``Quote: <quote>`` + newline + ``Author: <author><eos>``
    before tokenization.

    Why not tokenize only ``samples["quote"]``: TRL's SFTTrainer treats a
    dataset that already contains ``input_ids`` as pre-processed and ignores
    ``formatting_func`` for it (the recorded trainer log shows only a
    "Truncating" pass). Whatever is tokenized here is therefore exactly what
    the model trains on, and quote-only tokens would drop both the author and
    the ``<eos>`` terminator.

    Args:
        samples: batched mapping with parallel "quote" and "author" sequences.

    Returns:
        The tokenizer output for the batch (adds "input_ids" and
        "attention_mask" columns when used with ``map``).
    """
    texts = [
        f"Quote: {quote}\nAuthor: {author}<eos>"
        for quote, author in zip(samples["quote"], samples["author"])
    ]
    return tokenizer(texts)


data = data.map(tokenize_quote_batch, batched=True)
data

Map:   0%|          | 0/2508 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['quote', 'author', 'tags', 'input_ids', 'attention_mask'],
        num_rows: 2508
    })
})

In [None]:
def formatting_func(example):
    """Render quote/author data as ``Quote: ...`` / ``Author: ...<eos>`` text.

    Handles both layouts a trainer may pass in:

    * a single example, where ``example["quote"]`` is a ``str``: returns one
      formatted ``str``;
    * a batch, where ``example["quote"]`` is a sequence of strings: returns a
      ``list`` with one formatted string per row.

    The earlier version always indexed ``[0]``, so a batch yielded only its
    first row and a single example yielded only the first *character* of the
    quote and author.

    Args:
        example: mapping with "quote" and "author" entries (scalar or batched).

    Returns:
        A formatted string, or a list of formatted strings for a batch.
    """
    quotes, authors = example["quote"], example["author"]
    if isinstance(quotes, str):
        # Single-example layout: both fields are plain strings.
        return f"Quote: {quotes}\nAuthor: {authors}<eos>"
    # Batched layout: format every row, not just the first one.
    return [
        f"Quote: {quote}\nAuthor: {author}<eos>"
        for quote, author in zip(quotes, authors)
    ]
# Preview the formatter on a one-row batch instead of handing it the whole split.
formatting_func(data["train"][:1])

['Quote: “Be yourself; everyone else is already taken.”\nAuthor: Oscar Wilde<eos>']

In [None]:
!pip install -q trl

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/376.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m368.6/376.2 kB[0m [31m11.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m376.2/376.2 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import transformers
from trl import SFTTrainer

# Supervised fine-tuning run: LoRA adapters (lora_config) trained on top of
# the quantized model.
# NOTE(review): data["train"] already carries `input_ids` from the tokenizing
# cell, and TRL ignores `formatting_func` for pre-tokenized datasets — confirm
# the tokenized text is what you intend to train on.
trainer = SFTTrainer(
    model=model,
    train_dataset=data["train"],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        # 1 sequence per step x 4 accumulated steps per optimizer update.
        gradient_accumulation_steps=4,
        warmup_steps=2,
        # Very short run: 10 optimizer steps in total.
        max_steps=10,
        # Copied from other hugging face tuning blog posts
        learning_rate=2e-4,
        # It makes training faster
        fp16=True,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit",
        # Disable external experiment trackers explicitly; this is the
        # supported replacement for the deprecated WANDB_DISABLED variable.
        report_to="none",
    ),
    peft_config=lora_config,
    formatting_func=formatting_func
  )

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Truncating train dataset:   0%|          | 0/2508 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
# Run the fine-tuning loop (bounded by max_steps in the trainer arguments);
# the returned TrainOutput is shown as the cell result.
trainer.train()

Step,Training Loss
1,2.3119
2,1.3318
3,2.1735
4,1.9096
5,1.3678
6,1.5439
7,2.5015
8,1.5781
9,2.8542
10,1.9859


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


TrainOutput(global_step=10, training_loss=1.9558104395866394, metrics={'train_runtime': 82.5943, 'train_samples_per_second': 0.484, 'train_steps_per_second': 0.121, 'total_flos': 64947247349760.0, 'train_loss': 1.9558104395866394})

In [None]:
# Display the module tree; the lora.Linear4bit entries show where adapters
# were attached. The full repr is long.
model

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 3072, padding_idx=0)
    (layers): ModuleList(
      (0-27): 28 x GemmaDecoderLayer(
        (self_attn): GemmaAttention(
          (q_proj): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=3072, out_features=4096, bias=False)
            (lora_dropout): ModuleDict(
              (default): Identity()
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=3072, out_features=8, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=8, out_features=4096, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (k_proj): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=3072, out_features=4096, bias=False)
            (lora_dropout): ModuleDict(
             

In [None]:
# Post-training check: prompt the model again with the start of a quote.
text = "Quote: Imagination is"
device = "cuda:0"

# Tokenize the prompt into PyTorch tensors and move them to the GPU.
inputs = tokenizer(text, return_tensors="pt").to(device)

# Generate at most 20 new tokens after the prompt.
outputs = model.generate(
    **inputs,
    max_new_tokens=20,
)

In [None]:
# Decode the first generated sequence, dropping special tokens, and show it.
completion = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(completion)


Quote: Imagination is more important than knowledge. - Albert Einstein

I am a PhD student in the Department of Computer Science


In [None]:
# Write the trained model to the local "gemma_acchu_ft_saved" directory.
# NOTE(review): with a PEFT-wrapped model this presumably stores only the
# LoRA adapter weights, not the full base model — confirm before relying on it.
trainer.save_model("gemma_acchu_ft_saved")

In [None]:
# List the default Hugging Face datasets cache directory.
# NOTE(review): the recorded output reports this path does not exist, and
# HF_DATASETS_CACHE was redirected to /content/temp_cache earlier in the
# notebook — this path looks stale; confirm which directory was meant.
!ls -la /root/.cache/huggingface/datasets


ls: cannot access '/root/.cache/huggingface/datasets': No such file or directory


In [None]:
# Delete the default Hugging Face datasets cache directory (irreversible).
# NOTE(review): HF_DATASETS_CACHE was redirected to /content/temp_cache earlier
# in the notebook, so this may not be the directory that needs cleaning — confirm.
!rm -rf /root/.cache/huggingface/datasets
