In [2]:
import torch

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, BitsAndBytesConfig

from peft import LoraConfig, get_peft_model, TaskType

2025-07-02 16:35:58.352607: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751474158.375122     104 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751474158.381995     104 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.1

In [17]:
model_name = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'

bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type = 'nf4',
    bnb_4bit_compute_dtype = torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config = bnb_config,
    device_map = 'cuda:0',
    trust_remote_code = True
)

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

In [18]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=['q_proj', 'v_proj'],
    lora_dropout = 0.05,
    bias = 'none',
    task_type=TaskType.CAUSAL_LM
)

model = get_peft_model(model, lora_config)

In [19]:
data = load_dataset('openai/gsm8k', 'main', split='train[:200]')

In [20]:
def tokenize(batch):
    texts = [
        f"### Instruction:\n{instruction}\n### Response:\n{out}"
        for instruction, out in zip(batch['question'], batch['answer'])
    ]

    tokens = tokenizer(
        texts,
        padding = 'max_length',
        max_length = 256,
        truncation = True,
        return_tensors = 'pt'
    )

    tokens['labels'] = tokens['input_ids'].clone()

    return tokens

In [21]:
tokenized_data = data.map(tokenize, batched=True, remove_columns=data.column_names)

In [22]:
training_args = TrainingArguments(
    output_dir = './tinyllama-math-lora-tutorial',
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate = 1e-3,
    num_train_epochs = 50,
    fp16 = True,
    logging_steps = 20,
    save_strategy = 'epoch',
    report_to = 'none',
    remove_unused_columns = False,
    label_names = ['labels'] 
)

In [23]:
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_data,
    processing_class = tokenizer
)

In [24]:
trainer.train()

Step,Training Loss
20,0.8417
40,0.3862
60,0.3543
80,0.3078
100,0.2723
120,0.2371
140,0.1999
160,0.1701
180,0.1499
200,0.1263


TrainOutput(global_step=300, training_loss=0.23219833294550576, metrics={'train_runtime': 2454.8709, 'train_samples_per_second': 4.074, 'train_steps_per_second': 0.122, 'total_flos': 1.3667648151748608e+16, 'train_loss': 0.23219833294550576, 'epoch': 42.96})

In [25]:
model.save_pretrained('./tinyllama-lora-tuned-adapter-math')
tokenizer.save_pretrained('./tinyllama-lora-tuned-adapter-math')

('./tinyllama-lora-tuned-adapter-math/tokenizer_config.json',
 './tinyllama-lora-tuned-adapter-math/special_tokens_map.json',
 './tinyllama-lora-tuned-adapter-math/tokenizer.model',
 './tinyllama-lora-tuned-adapter-math/added_tokens.json',
 './tinyllama-lora-tuned-adapter-math/tokenizer.json')

**Evaluation of our fine-tuned model**

Now we will compare our fine-tuned model with the base model, first using the data it was trained one, and then new but similar data

In [26]:
import os
import math

import torch
from torch.utils.data import DataLoader

from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, default_data_collator

from peft import PeftModel

In [27]:
#model name will be the same as before
model_name = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'
adapter_path = './tinyllama-lora-tuned-adapter-math/'

bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type = 'nf4',
    bnb_4bit_compute_dtype = torch.bfloat16
) 

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config = bnb_config,
    device_map = 'cuda:0',
    trust_remote_code = True
).eval()

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

tmp_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config = bnb_config,
    device_map = 'cuda:0',
    trust_remote_code = True
)

tuned_model = PeftModel.from_pretrained(tmp_model, adapter_path)
tuned_model = tuned_model.merge_and_unload().eval()



In [28]:
def tokenize(batch):
    texts = [
        f"### Instruction:\n{instruction}\n### Response:\n{out}"
        for instruction, out in zip(batch['question'], batch['answer'])
    ]

    tokens = tokenizer(
        texts,
        padding = 'max_length',
        max_length = 256,
        truncation = True,
        return_tensors = 'pt'
    )

    tokens['labels'] = tokens['input_ids'].clone()

    return tokens

In [40]:
eval_data = load_dataset('openai/gsm8k', 'main', split='train[200:400]')
eval_data = eval_data.map(tokenize, batched=True, remove_columns=['question', 'answer'])
eval_data = eval_data.with_format('torch')

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [41]:
eval_loader = DataLoader(
    eval_data,
    batch_size=8,
    collate_fn = default_data_collator
)

In [42]:
@torch.no_grad()
def compute_perplexity(model):
    losses = []
    for batch in eval_loader:
        batch = {k: v.to('cuda') for k, v in batch.items()}
        loss = model(**batch).loss
        losses.append(loss.item())

    return math.exp(sum(losses) / len(losses))

In [43]:
print(f'Base Model Perplexity: {compute_perplexity(base_model):.2f}')
print(f'Tuned Model Perplexity: {compute_perplexity(tuned_model):.2f}')

Base Model Perplexity: 226.83
Tuned Model Perplexity: 4.78


In [44]:
import random 
raw_data = load_dataset('openai/gsm8k', 'main', split='train[200:400]')
refs = raw_data['answer']

def generate(model, instruction):
    token_ids = tokenizer(f'### instruction:\n{instruction}\n### Response:\n', return_tensors='pt').input_ids.to('cuda')

    with torch.no_grad():
        out = model.generate(token_ids, max_new_tokens=256)

    return tokenizer.decode(out[0], skip_special_tokens=True)

In [45]:
raw_data['question'][0]

'Sansa is a famous artist, she can draw a portrait and sell it according to its size. She sells an 8-inch portrait for $5, and a 16-inch portrait for twice the price of the 8-inch portrait. If she sells three 8-inch portraits and five 16-inch portraits per day, how many does she earns every 3 days?'

In [46]:
refs[0]

'Sansa earns $5 x 3 = $<<5*3=15>>15 every day by selling three 8-inch portraits.\nThe price of the 16-inch portrait is $5 x 2 = $<<5*2=10>>10 each.\nSo, she earns $10 x 5 = $<<10*5=50>>50 every day by selling five 16-inch portraits.\nHer total earnings is $50 + $15 = $<<50+15=65>>65 every day.\nTherefore, the total amount she earns after 3 days is $65 x 3 = $<<65*3=195>>195.\n#### 195'

In [47]:
print(generate(base_model, raw_data['question'][0]))

### instruction:
Sansa is a famous artist, she can draw a portrait and sell it according to its size. She sells an 8-inch portrait for $5, and a 16-inch portrait for twice the price of the 8-inch portrait. If she sells three 8-inch portraits and five 16-inch portraits per day, how many does she earns every 3 days?
### Response:
Sansa earns $100 per day, which means she earns $300 per week, and $500 per month. She sells 10 portraits per week, which means she earns $500 per week. She sells 20 portraits per month, which means she earns $1000 per month. She sells 30 portraits per year, which means she earns $3000 per year.


In [48]:
print(generate(tuned_model, raw_data['question'][0]))

### instruction:
Sansa is a famous artist, she can draw a portrait and sell it according to its size. She sells an 8-inch portrait for $5, and a 16-inch portrait for twice the price of the 8-inch portrait. If she sells three 8-inch portraits and five 16-inch portraits per day, how many does she earns every 3 days?
### Response:
To sell an 8-inch portrait for $5, Sansa earns it 1 time per day for 3 days a week.
To sell a 16-inch portrait for twice the price of the 8-inch portrait, Sansa earns it 4 times per day for 3 days a week.
If Sansa sells 3 portraits each day for 5 days a week, and she earns each portrait $5, she earns an average of $3.50 from selling portraits per day.
That means she earns $3.50 * 5 portraits per day * 3 days a week * 3 days a week * 5 days a week = $180.00 per week.
If she sells and earns each portrait eight times a day for 3 days a week, she earns it 8 times per day for 3 days a week, and she sells 5 portraits per day for 3 days a week, she earns an average of 