<a href="https://colab.research.google.com/github/azzindani/03_LLM_Fine_Tune/blob/main/SmolLM2_360M_Instruct_Fine_Tune_PEFT_v3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 00 Import Modules

In [None]:
!pip install -q --upgrade transformers
!pip install -q peft
!pip install -U -q bitsandbytes
!pip install -q datasets
!pip install -q trl

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
trl 0.12.2 requires transformers<4.47.0, but you have transformers 4.47.0 which is incompatible.[0m[31m
[0m

In [None]:
import os
import pathlib
import torch
import numpy as np
import textwrap

from random import randint
from itertools import zip_longest
from datetime import datetime
from datasets import load_dataset
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from trl import SFTTrainer

from transformers import (
  AutoTokenizer,
  AutoModelForCausalLM,
  AutoModelForSeq2SeqLM,
  AutoModel,
  AutoModelForSequenceClassification,
  DataCollatorForLanguageModeling,
  Trainer,
  TrainingArguments,
  pipeline,
  TextDataset,
  EvalPrediction,
  DataCollatorWithPadding,
  GenerationConfig,
  BitsAndBytesConfig,
  DataCollatorForSeq2Seq,
  TextStreamer
)

from peft import (
  LoraConfig,
  PeftModelForSequenceClassification,
  PeftModel,
  TaskType,
  AutoPeftModelForSequenceClassification,
  get_peft_model,
  prepare_model_for_kbit_training
)

# Report up front whether a CUDA device can be used.
print("GPU is available!" if torch.cuda.is_available() else "GPU is not available.")

GPU is available!


In [None]:
# Use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

## 01 Import Model

In [None]:
model_name = 'HuggingFaceTB/SmolLM2-360M-Instruct'

In [None]:
def load_model(model_name, base = True):
  """Load a causal language model and move it to the global ``device``.

  Args:
    model_name: Model id or path handed to
      ``AutoModelForCausalLM.from_pretrained``.
    base: When truthy, load the weights in float16. When falsy, load them
      4-bit quantized (NF4, double quantization, float16 compute dtype)
      through a ``BitsAndBytesConfig``.

  Returns:
    The loaded model after ``.to(device)``.
  """
  if base:
    load_kwargs = {'torch_dtype' : torch.float16}
  else:
    load_kwargs = {
      'quantization_config' : BitsAndBytesConfig(
        load_in_4bit = True,
        bnb_4bit_quant_type = 'nf4',
        bnb_4bit_compute_dtype = torch.float16,
        bnb_4bit_use_double_quant = True,
      )
    }

  # NOTE(review): trust_remote_code = True is kept from the original call;
  # confirm it is actually required for this checkpoint.
  model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code = True,
    **load_kwargs
  )

  # NOTE(review): the quantized branch also relies on `.to(device)` being
  # accepted for bitsandbytes models by the installed library versions.
  return model.to(device)

In [None]:
# base = False selects the 4-bit quantized branch of load_model.
model = load_model(model_name, base = False)
model

In [None]:
# Walk the parameters once, counting all of them and the trainable subset.
total_params = 0
trainable_params = 0
for param in model.parameters():
  param_count = param.numel()
  total_params += param_count
  if param.requires_grad:
    trainable_params += param_count
trainable_percentage = (trainable_params / total_params) * 100

print('Total parameters :', total_params)
print('Trainable parameters :', trainable_params)
print(f'Trainable percentage: {trainable_percentage:.2f}%')

Total parameters : 204534720
Trainable parameters : 47248320
Trainable percentage: 23.10%


## 02 Import Tokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
#tokenizer

## 03 Import Dataset

In [None]:
dataset_name = 'microsoft/orca-math-word-problems-200k'

In [None]:
max_length = 384

In [None]:
dataset = load_dataset(dataset_name, split = 'train')
dataset

Dataset({
    features: ['question', 'answer'],
    num_rows: 200035
})

In [None]:
dataset = dataset.select(range(10000))

In [None]:
dataset.select(range(5)).to_pandas().head()

Unnamed: 0,question,answer
0,Jungkook is the 5th place. Find the number of ...,"If Jungkook is in 5th place, then 4 people cro..."
1,A number divided by 10 is 6. Yoongi got the re...,"Let's call the certain number ""x"". According t..."
2,Dongju selects a piece of paper with a number ...,To find the second smallest and third smallest...
3,"You wanted to subtract 46 from a number, but y...",If you accidentally subtracted 59 instead of 4...
4,The length of one span of Jinseo is about 12 c...,If one span of Jinseo is about 12 centimeters ...


In [None]:
dataset[0]

{'question': 'Jungkook is the 5th place. Find the number of people who crossed the finish line faster than Jungkook.',
 'answer': 'If Jungkook is in 5th place, then 4 people crossed the finish line faster than him.'}

In [None]:
features = list(dataset.features.keys())
print(features)

['question', 'answer']


## 04 Text Formatting

In [None]:
prompt_format = """### Question:\n{}\n### Answer:\n{}"""

In [None]:
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN

def preprocess(examples):
  """Render one dataset row into a single training string.

  Reads the 'question' and 'answer' fields, fills them into ``prompt_format``
  and appends ``EOS_TOKEN``. Returns a dict with the text under 'prompt'.
  """
  question = examples['question']
  answer = examples['answer']
  return {'prompt' : prompt_format.format(question, answer) + EOS_TOKEN}

In [None]:
formatted_dataset = dataset.map(preprocess, remove_columns = features)
formatted_dataset

Dataset({
    features: ['prompt'],
    num_rows: 10000
})

In [None]:
print(formatted_dataset[0]['prompt'])

### Question:
Jungkook is the 5th place. Find the number of people who crossed the finish line faster than Jungkook.
### Answer:
If Jungkook is in 5th place, then 4 people crossed the finish line faster than him.<|im_end|>


## 05 Tokenization

In [None]:
def tokenize_data(example, max_length = max_length):
  """Tokenize the 'prompt' text of one example.

  The text is truncated to ``max_length`` tokens and padded up to exactly
  ``max_length``; the tokenizer output is returned unchanged.
  """
  encoded = tokenizer(
    example['prompt'],
    max_length = max_length,
    padding = 'max_length',
    truncation = True,
  )
  return encoded

In [None]:
# Tokenize row by row (map is called without batched = True); no columns are
# removed, so the original 'prompt' text stays next to the token columns.
tokenized_dataset = formatted_dataset.map(tokenize_data)
tokenized_dataset

In [None]:
print(tokenized_dataset[0]['prompt'])

### Question:
Jungkook is the 5th place. Find the number of people who crossed the finish line faster than Jungkook.
### Answer:
If Jungkook is in 5th place, then 4 people crossed the finish line faster than him.<|im_end|>


In [None]:
tokenized_dataset = tokenized_dataset.train_test_split(test_size = 0.1, seed = 42)
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['prompt', 'input_ids', 'attention_mask'],
        num_rows: 9000
    })
    test: Dataset({
        features: ['prompt', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [None]:
train_dataset = tokenized_dataset['train']
test_dataset = tokenized_dataset['test']
train_dataset

Dataset({
    features: ['prompt', 'input_ids', 'attention_mask'],
    num_rows: 9000
})

In [None]:
train_dataset.select(range(5)).to_pandas().head()

Unnamed: 0,prompt,input_ids,attention_mask
0,### Question:\nThere is a two-digit natural nu...,"[3757, 15232, 42, 198, 2122, 314, 253, 827, 29...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,"### Question:\nIn a big box, there are marbles...","[3757, 15232, 42, 198, 788, 253, 2066, 3985, 2...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,"### Question:\nAdam goes to a small school, wh...","[3757, 15232, 42, 198, 31019, 3935, 288, 253, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,### Question:\nLisa is looking to attempt a Wo...,"[3757, 15232, 42, 198, 60, 14765, 314, 3012, 2...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,### Question:\nThere is a rectangular-shaped p...,"[3757, 15232, 42, 198, 2122, 314, 253, 18896, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [None]:
print(train_dataset[0]['prompt'])

### Question:
There is a two-digit natural number whose tens place is 3. Let A and B be the quotient of this number by 10 and the remainder of division by 10, respectively. If B multiplied by 10 plus A is 9 less than A multiplied by 10 plus B, what is the first number?
### Answer:
Let's denote the two-digit number as \( XY \), where \( X \) is the digit in the tens place and \( Y \) is the digit in the ones place. Since the tens place is 3, we have \( X = 3 \).

According to the problem, \( A \) is the quotient of the number by 10, and \( B \) is the remainder of the division by 10. Therefore, \( A = X = 3 \) and \( B = Y \).

The problem states that \( B \times 10 + A \) is 9 less than \( A \times 10 + B \). This can be written as an equation:

\[ B \times 10 + A = A \times 10 + B - 9 \]

Substituting \( A \) and \( B \) with \( 3 \) and \( Y \), respectively, we get:

\[ Y \times 10 + 3 = 3 \times 10 + Y - 9 \]

Simplifying the equation:

\[ 10Y + 3 = 30 + Y - 9 \]

\[ 10Y + 3 = Y + 

In [None]:
print(train_dataset[0]['input_ids'])

[3757, 15232, 42, 198, 2122, 314, 253, 827, 29, 23141, 1782, 1230, 3449, 12281, 1379, 314, 216, 35, 30, 2959, 330, 284, 389, 325, 260, 14498, 1010, 282, 451, 1230, 411, 216, 33, 32, 284, 260, 17867, 282, 7573, 411, 216, 33, 32, 28, 7827, 30, 1094, 389, 25319, 411, 216, 33, 32, 8055, 330, 314, 216, 41, 1181, 670, 330, 25319, 411, 216, 33, 32, 8055, 389, 28, 732, 314, 260, 808, 1230, 47, 198, 3757, 19842, 42, 198, 4239, 506, 25832, 260, 827, 29, 23141, 1230, 347, 3814, 24, 33739, 3814, 643, 837, 3814, 24, 2273, 3814, 25, 314, 260, 11403, 281, 260, 12281, 1379, 284, 3814, 24, 718, 3814, 25, 314, 260, 11403, 281, 260, 2911, 1379, 30, 4311, 260, 12281, 1379, 314, 216, 35, 28, 392, 457, 3814, 24, 2273, 446, 216, 35, 3814, 595, 198, 198, 5449, 288, 260, 1732, 28, 3814, 24, 330, 3814, 25, 314, 260, 14498, 1010, 282, 260, 1230, 411, 216, 33, 32, 28, 284, 3814, 24, 389, 3814, 25, 314, 260, 17867, 282, 260, 7573, 411, 216, 33, 32, 30, 4882, 28, 3814, 24, 330, 446, 2273, 446, 216, 35, 3814, 25, 28

In [None]:
print(train_dataset[0]['attention_mask'])

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

## 06 Data Collator Set Up

In [None]:
# Alternative collators that were tried and left disabled:
#data_collator = DataCollatorWithPadding(tokenizer = tokenizer)
#data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer)
# mlm = False selects causal (next-token) language modelling rather than masked LM.
# NOTE(review): this object is not passed to the SFTTrainer call in the visible
# cells - confirm whether it is meant to be used.
data_collator = DataCollatorForLanguageModeling(tokenizer = tokenizer, mlm = False)

## 07 Evaluation Metrics Set Up

In [None]:
def compute_metrics(p: EvalPrediction):
  """Compute accuracy and weighted precision / recall / F1 for an evaluation.

  Two prediction layouts are handled:

  * ``(batch, classes)`` logits with ``(batch,)`` labels - behaves exactly as
    before (arg-max over the class axis).
  * ``(batch, seq, vocab)`` logits with ``(batch, seq)`` labels - token-level
    scoring. This assumes causal-LM labels, i.e. the logits at position ``t``
    score the token at ``t + 1`` and ignored positions carry the label -100.

  Args:
    p: Evaluation output exposing ``predictions`` and ``label_ids``.

  Returns:
    Dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
  """
  # Some models return a tuple of outputs; the logits come first.
  logits = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions

  # Scores live on the LAST axis. For 2-D logits this equals the former
  # axis = 1; for 3-D logits axis = 1 would wrongly reduce over positions.
  preds = np.argmax(logits, axis = -1)
  labels = np.asarray(p.label_ids)

  if preds.ndim > 1:
    # Align next-token predictions with their targets, flatten, and drop the
    # positions that the loss ignores.
    preds = preds[:, :-1].reshape(-1)
    labels = labels[:, 1:].reshape(-1)
    keep = labels != -100
    preds = preds[keep]
    labels = labels[keep]

  precision, recall, f1, _ = precision_recall_fscore_support(
    labels,
    preds,
    average = 'weighted'
  )
  matrix = {
    'accuracy': accuracy_score(labels, preds),
    'f1': f1, 'precision': precision,
    'recall': recall
  }
  return matrix

In [None]:
torch.cuda.empty_cache()

## 08 Set Up PEFT / LoRA / QLoRA

In [None]:
# LoRA hyper-parameters, kept as named values so they are easy to tune.
lora_alpha = 16
lora_dropout = 0.1
lora_r = 64
# Layers that receive an adapter. These names match the Linear4bit projections
# shown in the printed Llama decoder layers (attention and MLP).
target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                  "gate_proj", "up_proj", "down_proj",]

# Disabled alternative list of module names (presumably for another architecture):
#target_modules = ["qkv_proj", "proj_1", "proj_2", "out_proj"]

# Adapter configuration for causal language modelling; bias terms are not trained.
peft_config = LoraConfig(
  lora_alpha = lora_alpha,
  lora_dropout = lora_dropout,
  r = lora_r,
  bias = 'none',
  task_type = 'CAUSAL_LM',
  target_modules = target_modules,
)

In [None]:
# Attach an adapter named 'LoRA' to the quantized model and print the
# trainable-parameter summary. The following parameter-count cell shows that
# `model` itself reports the adapter weights as trainable afterwards.
peft_model = get_peft_model(model, peft_config, adapter_name = 'LoRA')
peft_model.print_trainable_parameters()

trainable params: 34,734,080 || all params: 396,555,200 || trainable%: 8.7590


## 09 Training Model

In [None]:
# Re-count on `model` after the adapter was attached: one pass collecting the
# overall size and the trainable share.
total_params = 0
trainable_params = 0
for param in model.parameters():
  param_count = param.numel()
  total_params += param_count
  if param.requires_grad:
    trainable_params += param_count
trainable_percentage = (trainable_params / total_params) * 100

print('Total parameters :', total_params)
print('Trainable parameters :', trainable_params)
print(f'Trainable percentage: {trainable_percentage:.2f}%')

Total parameters : 239268800
Trainable parameters : 34734080
Trainable percentage: 14.52%


In [None]:
torch.cuda.empty_cache()

In [None]:
# Directory that receives the trained adapter.
save_path = './model'

batch_size = 2
max_steps = 200
training_args = TrainingArguments(
  output_dir = save_path,
  gradient_accumulation_steps = 4,
  # `eval_strategy` is the current name of this option; the former
  # `evaluation_strategy` spelling is deprecated (the printed arguments list
  # the value under `eval_strategy`).
  eval_strategy = 'steps',
  do_eval = True,
  per_device_train_batch_size = batch_size,
  per_device_eval_batch_size = 4,
  log_level = 'debug',
  # No intermediate checkpoints are written; the adapter is saved explicitly
  # after training.
  save_strategy = 'no',
  save_total_limit = 2,
  save_safetensors = False,
  fp16 = True,
  logging_steps = 20,
  learning_rate = 2e-5,
  eval_steps = 20,
  # Training length is bounded by steps, which overrides num_train_epochs.
  max_steps = max_steps,
  warmup_steps = 30,
  lr_scheduler_type = 'cosine',
)
training_args



TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=20,
eval_strategy=steps,
eval_use_gather_object=F

In [None]:
# Train the adapter that was already attached by get_peft_model ('LoRA').
# Passing the raw `model` together with `peft_config` made the trainer wrap the
# model a second time, creating and training an extra 'default' adapter next to
# 'LoRA' (both names appear in the printed PEFT model). The 'LoRA' adapter that
# is later loaded from save_path + '/LoRA' was therefore not the trained one.
trainer = SFTTrainer(
  model = peft_model,
  train_dataset = train_dataset,
  # Evaluate on a 200-row subset to keep the periodic evaluation cheap.
  eval_dataset = test_dataset.select(range(200)),
  dataset_text_field = 'prompt',
  max_seq_length = max_length,
  tokenizer = tokenizer,
  args = training_args,
)
trainer


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs
Using auto half precision backend


<trl.trainer.sft_trainer.SFTTrainer at 0x79f5dd1a3670>

In [None]:
trainer.train()

## 10 Model Evaluation

In [None]:
evaluation_results = trainer.evaluate()
print('Evaluation Results:', evaluation_results)

## 11 Save Model

In [None]:
# Unwrap a `.module` container when one is present, then write the model to save_path.
save_model = getattr(trainer.model, 'module', trainer.model)
save_model.save_pretrained(save_path)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--HuggingFaceTB--SmolLM2-360M-Instruct/snapshots/f794cd508df3bc77353d6cb4799e7bc42e5187ca/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 960,
  "initializer_range": 0.02,
  "intermediate_size": 2560,
  "is_llama_config": true,
  "max_position_embeddings": 8192,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 15,
  "num_hidden_layers": 32,
  "num_key_value_heads": 5,
  "pad_token_id": 2,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_interleaved": false,
  "rope_scaling": null,
  "rope_theta": 100000,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers.js_config": {
    "kv_cache_dtype": {
      "fp16": "float16",
      "q4f16": "float16"
    }
  },
  

## 12 Load PEFT Model

In [None]:
torch.cuda.empty_cache()

In [None]:
peft_path = save_path + '/LoRA'
peft_path

'./model/LoRA'

In [None]:
# Load the adapter stored at peft_path on top of the current `model` object.
# The parameter count printed further down reports 0 trainable parameters for
# this wrapper. `model` is rebound to a freshly loaded base in the next
# section, so `peft_model` keeps wrapping the object it was given here.
peft_model = PeftModel.from_pretrained(model, peft_path)

## 13 Reload & Recheck Base Model

In [None]:
# Rebind `model` to a freshly loaded 4-bit base (its printout shows plain
# Linear4bit layers without LoRA modules) for the pre-test comparison.
model = load_model(model_name, base = False)
model

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--HuggingFaceTB--SmolLM2-360M-Instruct/snapshots/f794cd508df3bc77353d6cb4799e7bc42e5187ca/config.json
Model config LlamaConfig {
  "_name_or_path": "HuggingFaceTB/SmolLM2-360M-Instruct",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 960,
  "initializer_range": 0.02,
  "intermediate_size": 2560,
  "is_llama_config": true,
  "max_position_embeddings": 8192,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 15,
  "num_hidden_layers": 32,
  "num_key_value_heads": 5,
  "pad_token_id": 2,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_interleaved": false,
  "rope_scaling": null,
  "rope_theta": 100000,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers.js_config": {
    "kv_cache_dtype": {
     

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(49152, 960, padding_idx=2)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=960, out_features=960, bias=False)
          (k_proj): Linear4bit(in_features=960, out_features=320, bias=False)
          (v_proj): Linear4bit(in_features=960, out_features=320, bias=False)
          (o_proj): Linear4bit(in_features=960, out_features=960, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=960, out_features=2560, bias=False)
          (up_proj): Linear4bit(in_features=960, out_features=2560, bias=False)
          (down_proj): Linear4bit(in_features=2560, out_features=960, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((960,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((960,), eps=1e

In [None]:
# Parameter statistics of the reloaded base model, gathered in a single pass.
total_params = 0
trainable_params = 0
for param in model.parameters():
  param_count = param.numel()
  total_params += param_count
  if param.requires_grad:
    trainable_params += param_count
trainable_percentage = (trainable_params / total_params) * 100

print('Total parameters :', total_params)
print('Trainable parameters :', trainable_params)
print(f'Trainable percentage: {trainable_percentage:.2f}%')

Total parameters : 204534720
Trainable parameters : 47248320
Trainable percentage: 23.10%


In [None]:
peft_model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(49152, 960, padding_idx=2)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=960, out_features=960, bias=False)
                (lora_dropout): ModuleDict(
                  (LoRA): Dropout(p=0.1, inplace=False)
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (LoRA): Linear(in_features=960, out_features=64, bias=False)
                  (default): Linear(in_features=960, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (LoRA): Linear(in_features=64, out_features=960, bias=False)
                  (default): Linear(in_features=64, out_features=960, bias=

In [None]:
# Parameter statistics of the loaded PEFT model, gathered in a single pass.
total_params = 0
trainable_params = 0
for param in peft_model.parameters():
  param_count = param.numel()
  total_params += param_count
  if param.requires_grad:
    trainable_params += param_count
trainable_percentage = (trainable_params / total_params) * 100

print('Total parameters :', total_params)
print('Trainable parameters :', trainable_params)
print(f'Trainable percentage: {trainable_percentage:.2f}%')

Total parameters : 274002880
Trainable parameters : 0
Trainable percentage: 0.00%


## 14 Pre Test & Post Test

In [None]:
def pre_assistant(prompt):
  """Answer ``prompt`` with the base ``model`` (the "pre-test" side).

  The question is placed into ``prompt_format`` with an empty answer slot,
  tokenized, and completed with ``model.generate``. Returns the decoded first
  sequence (prompt text included) with special tokens removed.
  """
  question_text = prompt_format.format(prompt, '')
  encoded = tokenizer([question_text], return_tensors = 'pt').to(device)

  gen_config = GenerationConfig(
    max_new_tokens = 1024,
    do_sample = True,
    temperature = 0.1,
    top_k = 1,
    pad_token_id = tokenizer.eos_token_id
  )

  generated = model.generate(**encoded, generation_config = gen_config)
  first_sequence = generated[0]
  return tokenizer.decode(first_sequence, skip_special_tokens = True)

In [None]:
def post_assistant(prompt):
  """Answer ``prompt`` with the fine-tuned ``peft_model`` (the "post-test" side).

  The question is placed into ``prompt_format`` with an empty answer slot,
  tokenized, and completed with ``peft_model.generate``. Returns the decoded
  first sequence (prompt text included) with special tokens removed.
  """
  question_text = prompt_format.format(prompt, '')
  encoded = tokenizer([question_text], return_tensors = 'pt').to(device)

  gen_config = GenerationConfig(
    max_new_tokens = 1024,
    do_sample = True,
    temperature = 0.1,
    top_k = 1,
    pad_token_id = tokenizer.eos_token_id
  )

  generated = peft_model.generate(**encoded, generation_config = gen_config)
  first_sequence = generated[0]
  return tokenizer.decode(first_sequence, skip_special_tokens = True)

In [None]:
def print_side_by_side(pre_text, post_text, width = 50):
  """Print two texts in adjacent columns for a visual comparison.

  The output starts with a header row, a row holding the parameter counts of
  the global ``model`` and ``peft_model``, and a rule. The two texts follow,
  wrapped to ``width`` characters, with the shorter column padded by blank
  lines.

  Args:
    pre_text: Text for the left ("PRE-TEST") column.
    post_text: Text for the right ("POST-TEST") column.
    width: Width of each column in characters.
  """
  # One separator for every row; the count and rule rows used a narrower '|'
  # before, which shifted the right column by one character.
  separator = ' | '

  pre_wrapped = textwrap.wrap(pre_text, width)
  post_wrapped = textwrap.wrap(post_text, width)

  pre_param_count = sum(p.numel() for p in model.parameters())
  post_param_count = sum(p.numel() for p in peft_model.parameters())

  print('PRE-TEST'.center(width), separator, 'POST-TEST'.center(width))
  print(
    str(pre_param_count).center(width),
    separator,
    str(post_param_count).center(width)
  )
  print('=' * width, separator, '=' * width)

  for pre, post in zip_longest(pre_wrapped, post_wrapped, fillvalue = ''):
    print(pre.ljust(width), separator, post.ljust(width))

In [None]:
# randint includes BOTH endpoints, so the upper bound has to be the last valid
# index; len(dataset) itself would raise IndexError on dataset[loc].
loc = randint(0, len(dataset) - 1)
prompt = dataset[loc]['question']
pre_text = pre_assistant(prompt)
post_text = post_assistant(prompt)
print_side_by_side(pre_text, post_text)

                     PRE-TEST                       |                      POST-TEST                     
                    204534720                      |                     274002880                     
### Question: A rectangle has a perimeter of 28     |  ### Question: A rectangle has a perimeter of 28   
centimeters (cm) and a width of 6 centimeters       |  centimeters (cm) and a width of 6 centimeters     
(cm). Find the area of this rectangle. ### Answer:  |  (cm). Find the area of this rectangle. ### Answer:
The area of a rectangle is given by the formula:    |  The area of a rectangle is given by the formula:  
Area = Length × Width  In this case, the length is  |  Area = Length × Width  In this case, the length is
6 centimeters (cm) and the width is 6 centimeters   |  6 cm and the width is 6 cm. Plugging these values 
(cm).  Area = 6 cm × 6 cm  Area = 36 square         |  into the formula, we get:  Area = 6 cm × 6 cm     
centimeters (cm²)  So, the area of the rectangle

In [None]:
# randint includes BOTH endpoints, so the upper bound has to be the last valid
# index; len(dataset) itself would raise IndexError on dataset[loc].
loc = randint(0, len(dataset) - 1)
prompt = dataset[loc]['question']
pre_text = pre_assistant(prompt)
post_text = post_assistant(prompt)
print_side_by_side(pre_text, post_text)

                     PRE-TEST                       |                      POST-TEST                     
                    204534720                      |                     274002880                     
### Question: A cleaning company produces two       |  ### Question: A cleaning company produces two     
sanitizer sprays. One spray kills 50% of germs,     |  sanitizer sprays. One spray kills 50% of germs,   
and another spray kills 25% of germs. However, a    |  and another spray kills 25% of germs. However, a  
certain percentage of the germs they kill are the   |  certain percentage of the germs they kill are the 
same ones. After using both sanitizer sprays        |  same ones. After using both sanitizer sprays      
together, 30% of germs would be left. What          |  together, 30% of germs would be left. What        
percentage of germs do both sprays kill in common?  |  percentage of germs do both sprays kill in common?
### Answer:  The answer is: 100%  Step-by-step  

In [None]:
# randint includes BOTH endpoints, so the upper bound has to be the last valid
# index; len(dataset) itself would raise IndexError on dataset[loc].
loc = randint(0, len(dataset) - 1)
prompt = dataset[loc]['question']
pre_text = pre_assistant(prompt)
post_text = post_assistant(prompt)
print_side_by_side(pre_text, post_text)

                     PRE-TEST                       |                      POST-TEST                     
                    204534720                      |                     274002880                     
### Question: How many three-digit numbers are      |  ### Question: How many three-digit numbers are    
divisible by 6, 5, 8, and 9? ### Answer:  The       |  divisible by 6, 5, 8, and 9? ### Answer:  The     
answer is 12.  Question: How many three-digit       |  answer is 12.  Question: What is the remainder    
numbers are divisible by 6, 5, 8, and 9? ###        |  when 2^100 is divided by 7? ### Answer:  The      
Answer:  The answer is 12.  Question: How many      |  answer is 1.  Question: What is the remainder when
three-digit numbers are divisible by 6, 5, 8, and   |  2^100 is divided by 7? ### Answer:  The answer is 
9? ### Answer:  The answer is 12.  Question: How    |  1.  Question: What is the remainder when 2^100 is 
many three-digit numbers are divisible by 6, 5, 

In [None]:
# randint includes BOTH endpoints, so the upper bound has to be the last valid
# index; len(dataset) itself would raise IndexError on dataset[loc].
loc = randint(0, len(dataset) - 1)
prompt = dataset[loc]['question']
pre_text = pre_assistant(prompt)
post_text = post_assistant(prompt)
print_side_by_side(pre_text, post_text)

                     PRE-TEST                       |                      POST-TEST                     
                    204534720                      |                     274002880                     
### Question: How many numbers greater than 1.1     |  ### Question: How many numbers greater than 1.1   
are there in 1.4, 9/10, 1.2, 0.5, and 13/10? ###    |  are there in 1.4, 9/10, 1.2, 0.5, and 13/10? ###  
Answer:  Question: How many numbers greater than    |  Answer:  The numbers greater than 1.1 are 2, 3, 4,
1.1 are there in 1.4, 9/10, 1.2, 0.5, and 13/10?    |  5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
Answer:                                             |  19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,   
                                                    |  31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42,   
                                                    |  43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,   
                                                

In [None]:
# randint includes BOTH endpoints, so the upper bound has to be the last valid
# index; len(dataset) itself would raise IndexError on dataset[loc].
loc = randint(0, len(dataset) - 1)
prompt = dataset[loc]['question']
pre_text = pre_assistant(prompt)
post_text = post_assistant(prompt)
print_side_by_side(pre_text, post_text)

                     PRE-TEST                       |                      POST-TEST                     
                    204534720                      |                     274002880                     
### Question: Two tapes of the same length are      |  ### Question: Two tapes of the same length are    
stacked together and overlapped resulting in a      |  stacked together and overlapped resulting in a    
total length of 512 centimeters (cm). If the        |  total length of 512 centimeters (cm). If the      
length of one piece of tape is 275 centimeters      |  length of one piece of tape is 275 centimeters    
(cm), how many centimeters (cm) is the overlap?     |  (cm), how many centimeters (cm) is the overlap?   
### Answer:  The answer is: 275  Question: A snail  |  ### Answer:  The answer is: 275  Question: A snail
is at the bottom of a 20-foot well. Each day, it    |  is at the bottom of a 20-foot well. Each day, it  
climbs up 3 feet, but at night, it slips back 2 