## 00 Import Modules

In [1]:
#!pip install --upgrade transformers
!pip install -q peft
!pip install -U -q bitsandbytes
!pip install -q datasets
!pip install -q trl

In [2]:
import os
import pathlib
import torch
import numpy as np
import textwrap

from itertools import zip_longest
from datetime import datetime
from datasets import load_dataset
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from trl import SFTTrainer

from transformers import (
  AutoTokenizer,
  AutoModelForCausalLM,
  AutoModelForSeq2SeqLM,
  AutoModel,
  AutoModelForSequenceClassification,
  DataCollatorForLanguageModeling,
  Trainer,
  TrainingArguments,
  pipeline,
  TextDataset,
  EvalPrediction,
  DataCollatorWithPadding,
  GenerationConfig,
  BitsAndBytesConfig,
  DataCollatorForSeq2Seq,
  TextStreamer
)

from peft import (
  LoraConfig,
  PeftModelForSequenceClassification,
  PeftModel,
  TaskType,
  AutoPeftModelForSequenceClassification,
  get_peft_model,
  prepare_model_for_kbit_training
)

if torch.cuda.is_available():
  print("GPU is available!")
else:
  print("GPU is not available.")

GPU is available!


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

## 01 Import Model

In [4]:
#url = 'https://huggingface.co/Qwen/Qwen2.5-0.5B'
#model_name = url.split('.co/')[-1]

model_name = 'unsloth/Llama-3.2-1B-Instruct'

In [5]:
bnb_config = BitsAndBytesConfig(
  load_in_4bit = True,
  bnb_4bit_quant_type = 'nf4',
  bnb_4bit_compute_dtype = torch.float16,
  bnb_4bit_use_double_quant = True,
)

model = AutoModelForCausalLM.from_pretrained(
  model_name,
  quantization_config = bnb_config,
  trust_remote_code = True
).to(device) #'''

config.json:   0%|          | 0.00/927 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now default to True since model is quantized.


model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

In [6]:
'''model = AutoModelForCausalLM.from_pretrained(
  model_name,
  torch_dtype = torch.float16,
  trust_remote_code = True
).to(device) #'''

'model = AutoModelForCausalLM.from_pretrained(\n  model_name,\n  torch_dtype = torch.float16,\n  trust_remote_code = True\n).to(device) #'

In [7]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-0

In [8]:
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
trainable_percentage = (trainable_params / total_params) * 100

print('Total parameters :', total_params)
print('Trainable parameters :', trainable_params)
print('Trainable percentage: {:.2f}%'.format(trainable_percentage))

Total parameters : 749275136
Trainable parameters : 262735872
Trainable percentage: 35.07%


## 02 Import Tokenizer

In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
#tokenizer

tokenizer_config.json:   0%|          | 0.00/54.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

## 03 Import Dataset

In [10]:
#url = 'https://huggingface.co/datasets/KingNish/reasoning-base-20k'
#dataset_name = url.split('datasets/')[-1]

dataset_name = 'mlabonne/FineTome-100k'

In [11]:
max_length = 1024

In [12]:
dataset = load_dataset(dataset_name, split = 'train')
dataset

README.md:   0%|          | 0.00/982 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/117M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/100000 [00:00<?, ? examples/s]

Dataset({
    features: ['conversations', 'source', 'score'],
    num_rows: 100000
})

In [13]:
dataset.select(range(5)).to_pandas().head()

Unnamed: 0,conversations,source,score
0,"[{'from': 'human', 'value': 'Explain what bool...",infini-instruct-top-500k,5.212621
1,"[{'from': 'human', 'value': 'Explain how recur...",infini-instruct-top-500k,5.157649
2,"[{'from': 'human', 'value': 'Explain what bool...",infini-instruct-top-500k,5.14754
3,"[{'from': 'human', 'value': 'Explain the conce...",infini-instruct-top-500k,5.053656
4,"[{'from': 'human', 'value': 'Print the reverse...",infini-instruct-top-500k,5.045648


In [14]:
dataset[0]

{'conversations': [{'from': 'human',
   'value': 'Explain what boolean operators are, what they do, and provide examples of how they can be used in programming. Additionally, describe the concept of operator precedence and provide examples of how it affects the evaluation of boolean expressions. Discuss the difference between short-circuit evaluation and normal evaluation in boolean expressions and demonstrate their usage in code. \n\nFurthermore, add the requirement that the code must be written in a language that does not support short-circuit evaluation natively, forcing the test taker to implement their own logic for short-circuit evaluation.\n\nFinally, delve into the concept of truthiness and falsiness in programming languages, explaining how it affects the evaluation of boolean expressions. Add the constraint that the test taker must write code that handles cases where truthiness and falsiness are implemented differently across different programming languages.'},
  {'from': 'gpt

In [15]:
features = list(dataset.features.keys())
print(features)

['conversations', 'source', 'score']


## 04 Text Formatting

In [16]:
def transform_conversations(example):
  role_map = {
    'human' : 'user',
    'gpt' : 'assistant'
  }

  transformed_conversations = [
    {
      'role' : role_map.get(turn['from'], turn['from']),
      'content' : turn['value']
    }
    for turn in example['conversations']
  ]
  return {'conversations': transformed_conversations}

In [17]:
formatted_dataset = dataset.map(transform_conversations, remove_columns = features)
formatted_dataset

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Dataset({
    features: ['conversations'],
    num_rows: 100000
})

In [18]:
print(formatted_dataset[0]['conversations'])

[{'content': 'Explain what boolean operators are, what they do, and provide examples of how they can be used in programming. Additionally, describe the concept of operator precedence and provide examples of how it affects the evaluation of boolean expressions. Discuss the difference between short-circuit evaluation and normal evaluation in boolean expressions and demonstrate their usage in code. \n\nFurthermore, add the requirement that the code must be written in a language that does not support short-circuit evaluation natively, forcing the test taker to implement their own logic for short-circuit evaluation.\n\nFinally, delve into the concept of truthiness and falsiness in programming languages, explaining how it affects the evaluation of boolean expressions. Add the constraint that the test taker must write code that handles cases where truthiness and falsiness are implemented differently across different programming languages.', 'role': 'user'}, {'content': 'Boolean operators are 

In [19]:
def format_conversation(example):
  for entry in example['conversations']:
    role = entry['role']
    content = entry['content']

    if role == 'user':
      formatted_text = f"<|start_header_id|>{role}<|end_header_id|>\n\n{content}\n<|eot_id|>"
    elif role == 'assistant':
      formatted_text += f"<|start_header_id|>{role}<|end_header_id|>\n\n{content}\n<|eot_id|>"

  return {'prompt': formatted_text}

In [20]:
formatted_dataset = formatted_dataset.map(
  format_conversation,
  remove_columns = list(formatted_dataset.features.keys())
)
formatted_dataset

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Dataset({
    features: ['prompt'],
    num_rows: 100000
})

In [21]:
print(formatted_dataset[0]['prompt'])

<|start_header_id|>user<|end_header_id|>

Explain what boolean operators are, what they do, and provide examples of how they can be used in programming. Additionally, describe the concept of operator precedence and provide examples of how it affects the evaluation of boolean expressions. Discuss the difference between short-circuit evaluation and normal evaluation in boolean expressions and demonstrate their usage in code. 

Furthermore, add the requirement that the code must be written in a language that does not support short-circuit evaluation natively, forcing the test taker to implement their own logic for short-circuit evaluation.

Finally, delve into the concept of truthiness and falsiness in programming languages, explaining how it affects the evaluation of boolean expressions. Add the constraint that the test taker must write code that handles cases where truthiness and falsiness are implemented differently across different programming languages.
<|eot_id|><|start_header_id|>a

## 05 Tokenization

In [22]:
def tokenize_data(example, max_length = max_length):
  return tokenizer(example['prompt'], truncation = True, padding = 'max_length', max_length = max_length)

In [23]:
tokenized_dataset = formatted_dataset.map(tokenize_data, batched = True)#, remove_columns = 'text')
tokenized_dataset

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Dataset({
    features: ['prompt', 'input_ids', 'attention_mask'],
    num_rows: 100000
})

In [24]:
print(tokenized_dataset[0]['prompt'])

<|start_header_id|>user<|end_header_id|>

Explain what boolean operators are, what they do, and provide examples of how they can be used in programming. Additionally, describe the concept of operator precedence and provide examples of how it affects the evaluation of boolean expressions. Discuss the difference between short-circuit evaluation and normal evaluation in boolean expressions and demonstrate their usage in code. 

Furthermore, add the requirement that the code must be written in a language that does not support short-circuit evaluation natively, forcing the test taker to implement their own logic for short-circuit evaluation.

Finally, delve into the concept of truthiness and falsiness in programming languages, explaining how it affects the evaluation of boolean expressions. Add the constraint that the test taker must write code that handles cases where truthiness and falsiness are implemented differently across different programming languages.
<|eot_id|><|start_header_id|>a

In [25]:
dataset = tokenized_dataset.train_test_split(test_size = 0.1, seed = 42)
dataset

DatasetDict({
    train: Dataset({
        features: ['prompt', 'input_ids', 'attention_mask'],
        num_rows: 90000
    })
    test: Dataset({
        features: ['prompt', 'input_ids', 'attention_mask'],
        num_rows: 10000
    })
})

In [26]:
train_dataset = dataset['train']
test_dataset = dataset['test']
train_dataset

Dataset({
    features: ['prompt', 'input_ids', 'attention_mask'],
    num_rows: 90000
})

In [27]:
train_dataset.select(range(5)).to_pandas().head()

Unnamed: 0,prompt,input_ids,attention_mask
0,<|start_header_id|>user<|end_header_id|>\n\nWh...,"[128004, 128004, 128004, 128004, 128004, 12800...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,<|start_header_id|>user<|end_header_id|>\n\nUs...,"[128004, 128004, 128004, 128004, 128004, 12800...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,<|start_header_id|>user<|end_header_id|>\n\nCr...,"[128004, 128004, 128004, 128004, 128004, 12800...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,<|start_header_id|>user<|end_header_id|>\n\nWr...,"[128004, 128004, 128004, 128004, 128004, 12800...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,<|start_header_id|>user<|end_header_id|>\n\nTh...,"[128004, 128004, 128004, 128004, 128004, 12800...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [28]:
print(train_dataset[0]['prompt'])

<|start_header_id|>user<|end_header_id|>

What is the algorithm to detect if a linked list contains a cycle, and if a cycle is present, determine the node at which the cycle begins?
<|eot_id|><|start_header_id|>assistant<|end_header_id|>

The algorithm to detect cycle in linked list is famously known as Floyd's Cycle-Finding Algorithm, or the Tortoise and the Hare algorithm.

Here's how the algorithm works:

1. Initialize two pointers, slow and fast at the head of the linked list.
2. Move slow pointer by one and fast pointer by two. If there's a cycle in the list, the fast pointer will eventually meet the slow pointer.
3. When they meet, reset the slow pointer to the head while leaving the fast pointer at the meeting point.
4. Now, advance both slow and fast pointers at the same pace, one step at a time. The point at which they meet now is the start of the cycle.

Here's the code in Python:

```python
def detectCycle(head):
    slow = fast = head
    while fast and fast.next:
        s

In [29]:
print(train_dataset[0]['input_ids'])

[128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004,

In [30]:
print(train_dataset[0]['attention_mask'])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

## 06 Data Collator Set Up

In [31]:
#data_collator = DataCollatorWithPadding(tokenizer = tokenizer)
#data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer)
data_collator = DataCollatorForLanguageModeling(tokenizer = tokenizer, mlm = False)

## 07 Evaluation Metrics Set Up

In [32]:
def compute_metrics(p: EvalPrediction):
  preds = np.argmax(p.predictions, axis = 1)
  precision, recall, f1, _ = precision_recall_fscore_support(
    p.label_ids,
    preds,
    average = 'weighted'
  )
  matrix = {
    'accuracy': accuracy_score(p.label_ids, preds),
    'f1': f1, 'precision': precision,
    'recall': recall
  }
  return matrix

In [33]:
torch.cuda.empty_cache()

## 08 Set Up PEFT / LoRA / QLoRA

In [34]:
lora_alpha = 16
lora_dropout = 0.1
lora_r = 64
target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                  "gate_proj", "up_proj", "down_proj",]
peft_config = LoraConfig(
  lora_alpha = lora_alpha,
  lora_dropout = lora_dropout,
  r = lora_r,
  bias = 'none',
  task_type = 'CAUSAL_LM',
  target_modules = target_modules,
)

In [35]:
peft_model = get_peft_model(model, peft_config, adapter_name = 'math')
peft_model.print_trainable_parameters()

trainable params: 45,088,768 || all params: 1,280,903,168 || trainable%: 3.5201


## 09 Training Model

In [36]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
            (lora_dropout): ModuleDict(
              (math): Dropout(p=0.1, inplace=False)
            )
            (lora_A): ModuleDict(
              (math): Linear(in_features=2048, out_features=64, bias=False)
            )
            (lora_B): ModuleDict(
              (math): Linear(in_features=64, out_features=2048, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (k_proj): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=2048, out_features=512, bias=False)
            (lora_dropout): ModuleDict(
             

In [37]:
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
trainable_percentage = (trainable_params / total_params) * 100

print('Total parameters :', total_params)
print('Trainable parameters :', trainable_params)
print('Trainable percentage: {:.2f}%'.format(trainable_percentage))

Total parameters : 794363904
Trainable parameters : 45088768
Trainable percentage: 5.68%


In [38]:
torch.cuda.empty_cache()

In [39]:
save_path = './model'

batch_size = 2
max_steps = 200
training_args = TrainingArguments(
  output_dir = save_path,
  gradient_accumulation_steps = 4,
  evaluation_strategy = 'steps',
  do_eval = True,
  per_device_train_batch_size = batch_size,
  per_device_eval_batch_size = 4,
  log_level = 'debug',
  save_strategy = 'no',
  save_total_limit = 2,
  save_safetensors = False,
  fp16 = True,
  logging_steps = 20,
  learning_rate = 2e-5,
  eval_steps = 20,
  max_steps = max_steps,
  warmup_steps = 30,
  lr_scheduler_type = 'cosine',
)
training_args



TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=20,
eval_strategy=steps,
eval_use_gather_object=F

In [40]:
trainer = SFTTrainer(
  model = model,
  train_dataset = train_dataset,#.select(range(10000)),
  eval_dataset = test_dataset.select(range(200)),
  dataset_text_field = 'prompt',
  max_seq_length = max_length,
  tokenizer = tokenizer,
  args = training_args,
  peft_config = peft_config,
)
trainer


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs
Using auto half precision backend


<trl.trainer.sft_trainer.SFTTrainer at 0x79968c0f3b50>

In [41]:
trainer.train()

Currently training with a batch size of: 2
The following columns in the training set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: prompt. If prompt are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 90,000
  Num Epochs = 1
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 4
  Total optimization steps = 200
  Number of trainable parameters = 45,088,768
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113589388888715, max=1.0…

Step,Training Loss,Validation Loss
20,1.5162,1.476866
40,1.3942,1.362893
60,1.3363,1.259972
80,1.2667,1.224184
100,1.1335,1.206838
120,1.2271,1.196908
140,1.1917,1.191518
160,1.2141,1.18886
180,1.1439,1.187911
200,1.1614,1.187795


The following columns in the evaluation set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: prompt. If prompt are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 200
  Batch size = 4
The following columns in the evaluation set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: prompt. If prompt are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 200
  Batch size = 4
The following columns in the evaluation set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: prompt. If prompt are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 200
  Batch size = 4
The following columns in the evaluation set don't have a c

TrainOutput(global_step=200, training_loss=1.2585067081451415, metrics={'train_runtime': 2724.0697, 'train_samples_per_second': 0.587, 'train_steps_per_second': 0.073, 'total_flos': 1.04528967892992e+16, 'train_loss': 1.2585067081451415, 'epoch': 0.017777777777777778})

## 10 Model Evaluation

In [42]:
evaluation_results = trainer.evaluate()
print('Evaluation Results:', evaluation_results)

The following columns in the evaluation set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: prompt. If prompt are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 200
  Batch size = 4


Evaluation Results: {'eval_loss': 1.1877952814102173, 'eval_runtime': 90.5461, 'eval_samples_per_second': 2.209, 'eval_steps_per_second': 0.552, 'epoch': 0.017777777777777778}


## 11 Save Model

In [43]:
save_model = trainer.model.module if hasattr(trainer.model, 'module') else trainer.model
save_model.save_pretrained(save_path)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unsloth--Llama-3.2-1B-Instruct/snapshots/50ea995812f20bf680a17a02cfbc4f90ff4d9c0e/config.json
Model config LlamaConfig {
  "_name_or_path": "meta-llama/Llama-3.2-1B-Instruct",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 16,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 32.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie

## 12 Load PEFT Model

In [44]:
torch.cuda.empty_cache()

In [45]:
peft_model = PeftModel.from_pretrained(model, save_path)
peft_model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 2048)
        (layers): ModuleList(
          (0-15): 16 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (math): Dropout(p=0.1, inplace=False)
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (math): Linear(in_features=2048, out_features=64, bias=False)
                  (default): Linear(in_features=2048, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (math): Linear(in_features=64, out_features=2048, bias=False)
                  (default): Linear(in_features=64, out_features=2048, bias=False)


In [46]:
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
trainable_percentage = (trainable_params / total_params) * 100

print('Total parameters :', total_params)
print('Trainable parameters :', trainable_params)
print('Trainable percentage: {:.2f}%'.format(trainable_percentage))

Total parameters : 839452672
Trainable parameters : 0
Trainable percentage: 0.00%


## 13 Pre Test & Post Test

In [47]:
def pre_assistant(prompt):
  messages = [
    {'role' : 'human', 'content' : prompt},
  ]
  inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True,
    return_tensors = 'pt',
  ).to('cuda')
  generation_config = GenerationConfig(
    do_sample = True,
    top_k = 1,
    temperature = 0.1,
    max_new_tokens = 1024,
    pad_token_id = tokenizer.eos_token_id
  )
  outputs = model.generate(
    input_ids = inputs,
    generation_config = generation_config
  )
  return tokenizer.decode(outputs[0])#, skip_special_tokens = True))

In [48]:
def post_assistant(prompt):
  messages = [
    {'role' : 'human', 'content' : prompt},
  ]
  inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True,
    return_tensors = 'pt',
  ).to('cuda')
  generation_config = GenerationConfig(
    do_sample = True,
    top_k = 1,
    temperature = 0.1,
    max_new_tokens = 1024,
    pad_token_id = tokenizer.eos_token_id
  )
  outputs = peft_model.generate(
    input_ids = inputs,
    generation_config = generation_config
  )
  return tokenizer.decode(outputs[0])#, skip_special_tokens = True))

In [49]:
def print_side_by_side(pre_text, post_text, width = 50):
  pre_wrapped = textwrap.wrap(pre_text, width)
  post_wrapped = textwrap.wrap(post_text, width)
    
  print('PRE-TEST'.center(width), ' | ', 'POST-TEST'.center(width))
  print('=' * width, '|', '=' * width)
    
  for pre, post in zip_longest(pre_wrapped, post_wrapped, fillvalue = ''):
    print(pre.ljust(width), ' | ', post.ljust(width))

In [50]:
prompt = '''
Summarize the following legal text in a few sentences:
'In the case of John Doe v. XYZ Corp, the plaintiff, John Doe, entered into a formal service contract with XYZ Corp in January 2022. The agreement stipulated a one-year commitment for IT support services, with John Doe providing on-site troubleshooting, software updates, and system maintenance. XYZ Corp agreed to pay a fixed monthly retainer along with additional fees for after-hours support. However, in June 2022, XYZ Corp terminated the contract without prior notice, claiming that an unexpected downturn in business operations left them financially unable to continue. The plaintiff alleges wrongful termination, asserting that XYZ Corp failed to adhere to the 60-day notice clause outlined in the contract. Additionally, the plaintiff contends that the early termination damaged his professional reputation and resulted in significant financial losses, including missed client opportunities and incurred expenses for certifications specific to XYZ Corp’s systems. John Doe is seeking compensation for the remaining contract balance, damages for reputational harm, and reimbursement for training and certification costs required under the agreement.
'''
pre_text = pre_assistant(prompt)
post_text = post_assistant(prompt)
print_side_by_side(pre_text, post_text)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


                     PRE-TEST                       |                      POST-TEST                     
<|begin_of_text|><|start_header_id|>system<|end_he  |  <|begin_of_text|><|start_header_id|>system<|end_he
ader_id|>  Cutting Knowledge Date: December 2023    |  ader_id|>  Cutting Knowledge Date: December 2023  
Today Date: 15 Nov 2024  <|eot_id|><|start_header_  |  Today Date: 15 Nov 2024  <|eot_id|><|start_header_
id|>human<|end_header_id|>  Summarize the           |  id|>human<|end_header_id|>  Summarize the         
following legal text in a few sentences: 'In the    |  following legal text in a few sentences: 'In the  
case of John Doe v. XYZ Corp, the plaintiff, John   |  case of John Doe v. XYZ Corp, the plaintiff, John 
Doe, entered into a formal service contract with    |  Doe, entered into a formal service contract with  
XYZ Corp in January 2022. The agreement stipulated  |  XYZ Corp in January 2022. The agreement stipulated
a one-year commitment for IT support services,

In [51]:
prompt = "What are the legal implications if a party violates a confidentiality agreement in the context of contract law? For example, consider a scenario where a contractor working with Tech Innovators Inc. shares proprietary technology information with a competitor. Explain in detail and cite relevant case law where possible."
pre_text = pre_assistant(prompt)
post_text = post_assistant(prompt)
print_side_by_side(pre_text, post_text)

                     PRE-TEST                       |                      POST-TEST                     
<|begin_of_text|><|start_header_id|>system<|end_he  |  <|begin_of_text|><|start_header_id|>system<|end_he
ader_id|>  Cutting Knowledge Date: December 2023    |  ader_id|>  Cutting Knowledge Date: December 2023  
Today Date: 15 Nov 2024  <|eot_id|><|start_header_  |  Today Date: 15 Nov 2024  <|eot_id|><|start_header_
id|>human<|end_header_id|>  What are the legal      |  id|>human<|end_header_id|>  What are the legal    
implications if a party violates a confidentiality  |  implications if a party violates a confidentiality
agreement in the context of contract law? For       |  agreement in the context of contract law? For     
example, consider a scenario where a contractor     |  example, consider a scenario where a contractor   
working with Tech Innovators Inc. shares            |  working with Tech Innovators Inc. shares          
proprietary technology information with a     

In [52]:
prompt = "Rephrase the following legal statement to make it more understandable for a general audience: 'Under the terms of the non-compete agreement, the defendant is barred from engaging in any business that competes with the plaintiff's business within a 50-mile radius for two years following the termination of employment.' Retain all key information while simplifying the language."
pre_text = pre_assistant(prompt)
post_text = post_assistant(prompt)
print_side_by_side(pre_text, post_text)

                     PRE-TEST                       |                      POST-TEST                     
<|begin_of_text|><|start_header_id|>system<|end_he  |  <|begin_of_text|><|start_header_id|>system<|end_he
ader_id|>  Cutting Knowledge Date: December 2023    |  ader_id|>  Cutting Knowledge Date: December 2023  
Today Date: 15 Nov 2024  <|eot_id|><|start_header_  |  Today Date: 15 Nov 2024  <|eot_id|><|start_header_
id|>human<|end_header_id|>  Rephrase the following  |  id|>human<|end_header_id|>  Rephrase the following
legal statement to make it more understandable for  |  legal statement to make it more understandable for
a general audience: 'Under the terms of the non-    |  a general audience: 'Under the terms of the non-  
compete agreement, the defendant is barred from     |  compete agreement, the defendant is barred from   
engaging in any business that competes with the     |  engaging in any business that competes with the   
plaintiff's business within a 50-mile radius f

In [53]:
prompt = "In the case where a defendant claims breach of contract due to unforeseeable events, how does the principle of 'force majeure' apply? For instance, if a company was unable to deliver contracted goods due to a natural disaster, provide a detailed explanation and outline any relevant conditions under which the force majeure principle might or might not be applicable."
pre_text = pre_assistant(prompt)
post_text = post_assistant(prompt)
print_side_by_side(pre_text, post_text)

                     PRE-TEST                       |                      POST-TEST                     
<|begin_of_text|><|start_header_id|>system<|end_he  |  <|begin_of_text|><|start_header_id|>system<|end_he
ader_id|>  Cutting Knowledge Date: December 2023    |  ader_id|>  Cutting Knowledge Date: December 2023  
Today Date: 15 Nov 2024  <|eot_id|><|start_header_  |  Today Date: 15 Nov 2024  <|eot_id|><|start_header_
id|>human<|end_header_id|>  In the case where a     |  id|>human<|end_header_id|>  In the case where a   
defendant claims breach of contract due to          |  defendant claims breach of contract due to        
unforeseeable events, how does the principle of     |  unforeseeable events, how does the principle of   
'force majeure' apply? For instance, if a company   |  'force majeure' apply? For instance, if a company 
was unable to deliver contracted goods due to a     |  was unable to deliver contracted goods due to a   
natural disaster, provide a detailed explanati

In [54]:
prompt = "Construct an argument in defense of a client accused of breaching intellectual property laws due to sharing copyrighted material in an educational setting. For example, a teacher shares portions of a textbook with students to support classroom discussion. Focus on any legal exceptions or defenses that may apply, such as the fair use doctrine in educational contexts."
pre_text = pre_assistant(prompt)
post_text = post_assistant(prompt)
print_side_by_side(pre_text, post_text)

                     PRE-TEST                       |                      POST-TEST                     
<|begin_of_text|><|start_header_id|>system<|end_he  |  <|begin_of_text|><|start_header_id|>system<|end_he
ader_id|>  Cutting Knowledge Date: December 2023    |  ader_id|>  Cutting Knowledge Date: December 2023  
Today Date: 15 Nov 2024  <|eot_id|><|start_header_  |  Today Date: 15 Nov 2024  <|eot_id|><|start_header_
id|>human<|end_header_id|>  Construct an argument   |  id|>human<|end_header_id|>  Construct an argument 
in defense of a client accused of breaching         |  in defense of a client accused of breaching       
intellectual property laws due to sharing           |  intellectual property laws due to sharing         
copyrighted material in an educational setting.     |  copyrighted material in an educational setting.   
For example, a teacher shares portions of a         |  For example, a teacher shares portions of a       
textbook with students to support classroom   