In [1]:
!pip3 install -q -U bitsandbytes==0.42.0 #for quantization
!pip3 install -q -U peft==0.8.2
!pip3 install -q -U trl==0.7.10
!pip3 install -q -U accelerate==0.27.1
!pip3 install -q -U datasets==2.17.0
!pip3 install -q -U transformers==4.38.0

In [2]:
import os
import transformers
import torch
from google.colab import userdata
from datasets import load_dataset
from trl import SFTTrainer
from peft import LoraConfig
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig, GemmaTokenizer

In [3]:
os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')

In [4]:
model_id = "google/gemma-2b"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, #convert from 32bit to 4 bit(quantization)
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.bfloat16 #to balance the loss of information
)

In [6]:
tokenizer = AutoTokenizer.from_pretrained(model_id, token=os.environ['HF_TOKEN'])
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             quantization_config=bnb_config,
                                             device_map={"":0},
                                             token = os.environ['HF_TOKEN'])

tokenizer_config.json:   0%|          | 0.00/33.6k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [11]:
text = "Quote : Technology is more,"
device = "cuda:0"
inputs = tokenizer(text, return_tensors='pt').to(device)

outputs = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Quote : Technology is more, than just a tool. It is a way of life.

We are a team of passionate and dedicated individuals who are committed to providing the best possible service to our clients. We believe that technology is not just a tool, but a way of life


In [12]:
os.environ['WANDB_DISABLED'] = 'false'

In [13]:
lora_config = LoraConfig(
    r=8, #rank for matrix decomposition
    target_modules=['q_proj', 'o_proj','k_proj','v_proj',
                    'gate_proj', 'up_proj', 'down_proj'],
    task_type='CAUSAL_LM' #for language modeling task
)

In [14]:
from datasets import load_dataset
data = load_dataset('Tanvir1337/quotes')
data = data.map(lambda samples: tokenizer(samples['topic']), batched=True)

Downloading readme:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/219k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/1039 [00:00<?, ? examples/s]

In [15]:
data['train']['topic']

['Artificial Intelligence',
 'Technology',
 'Machine Learning',
 'Data Science',
 'Artificial Intelligence',
 'Machine Learning',
 'Artificial Intelligence',
 'Machine Learning',
 'Technology',
 'Machine Learning',
 'Technology',
 'Artificial Intelligence',
 'Data Science',
 'Artificial Intelligence',
 'Machine Learning',
 'Technology',
 'Artificial Intelligence',
 'Healthcare',
 'Artificial Intelligence',
 'Machine Learning',
 'Data Science',
 'Artificial Intelligence',
 'Technology',
 'Artificial Intelligence',
 'Entertainment',
 'Data Science',
 'Technology',
 'Machine Learning',
 'Finance',
 'Artificial Intelligence',
 'Environment',
 'Technology',
 'Data Science',
 'Healthcare',
 'Artificial Intelligence',
 'Technology',
 'Technology',
 'Education',
 'Business',
 'Robotics',
 'Data Science',
 'Society',
 'Ethics',
 'Artificial Intelligence',
 'Science',
 'Technology',
 'Artificial Intelligence',
 'Transportation',
 'Machine Learning',
 'Artificial Intelligence',
 'Healthcare',
 'W

In [16]:
def formatting_func(example):
  text = f"Topic: {example['topic'][0]}\nDescription: {example['description'][0]}"
  return [text]

In [18]:
data['train']

Dataset({
    features: ['topic', 'title', 'description', 'input_ids', 'attention_mask'],
    num_rows: 1039
})

In [20]:
trainer = SFTTrainer(
    model = model,
    train_dataset=data['train'],
    args = transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=100,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir='outputs',
        optim='adamw_8bit'
    ),
    peft_config=lora_config,
    formatting_func=formatting_func
)



Map:   0%|          | 0/1039 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [21]:
trainer.train()

Step,Training Loss
1,1.6098
2,1.6098
3,1.5692
4,1.4745
5,1.3655
6,1.244
7,1.1133
8,0.9784
9,0.8468
10,0.7286


TrainOutput(global_step=100, training_loss=0.18796937810257078, metrics={'train_runtime': 79.464, 'train_samples_per_second': 5.034, 'train_steps_per_second': 1.258, 'total_flos': 48995579904000.0, 'train_loss': 0.18796937810257078, 'epoch': 100.0})

In [27]:
text = "Topic: Data Science"
device="cuda:0"
inputs=tokenizer(text, return_tensors='pt').to(device)

outputs = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Topic: Data Science
Description: The future belongs to those who embrace and adapt to the power of data. This is why data science is the engine that accelerates your journey to success. In this course, you'll learn the fundamentals of data science, including how to


In [30]:
text = "Topic: Invention"
device="cuda:0"
inputs=tokenizer(text, return_tensors='pt').to(device)

outputs = model.generate(**inputs, max_new_tokens=30)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Topic: Invention
Description: The future belongs to those who embrace and adapt to the power of invention. This is why it's more important than ever to embrace
