<a href="https://colab.research.google.com/github/azzindani/03_LLM/blob/main/00_LLM_Fine_Tune_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Modules

In [1]:
#!pip install --upgrade transformers
!pip install peft
!pip install -U bitsandbytes
!pip install datasets
!pip install trl



In [2]:
import os
import pathlib
import torch
import numpy as np

from datasets import load_dataset
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from trl import SFTTrainer

from transformers import (
  AutoTokenizer,
  AutoModelForCausalLM,
  AutoModelForSeq2SeqLM,
  AutoModel,
  AutoModelForSequenceClassification,
  DataCollatorForLanguageModeling,
  Trainer,
  TrainingArguments,
  pipeline,
  TextDataset,
  EvalPrediction,
  DataCollatorWithPadding,
  GenerationConfig,
  BitsAndBytesConfig
)

from peft import (
  LoraConfig,
  PeftModelForSequenceClassification,
  TaskType,
  AutoPeftModelForSequenceClassification,
  get_peft_model,
  prepare_model_for_kbit_training
)

if torch.cuda.is_available():
  print("GPU is available!")
else:
  print("GPU is not available.")

GPU is available!


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

## Import Model

In [4]:
#url = 'https://huggingface.co/Qwen/Qwen2.5-0.5B'
#model_name = url.split('.co/')[-1]

model_name = 'Qwen/Qwen2.5-0.5B-Instruct'

In [5]:
'''bnb_config = BitsAndBytesConfig(
  load_in_4bit = True,
  bnb_4bit_quant_type = 'nf4',
  bnb_4bit_compute_dtype = torch.float16,
  bnb_4bit_use_double_quant = True,
)

model = AutoModelForCausalLM.from_pretrained(
  model_path,
  quantization_config = bnb_config,
  trust_remote_code = True
)#.to(device) #'''

"bnb_config = BitsAndBytesConfig(\n  load_in_4bit = True,\n  bnb_4bit_quant_type = 'nf4',\n  bnb_4bit_compute_dtype = torch.float16,\n  bnb_4bit_use_double_quant = True,\n)\n\nmodel = AutoModelForCausalLM.from_pretrained(\n  model_path,\n  quantization_config = bnb_config,\n  trust_remote_code = True\n)#.to(device) #"

In [6]:
model = AutoModelForCausalLM.from_pretrained(
  model_name,
  torch_dtype = torch.float16,
  trust_remote_code = True
).to(device) #'''

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [7]:
model

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((

In [8]:
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
trainable_percentage = (trainable_params / total_params) * 100

print('Total parameters :', total_params)
print('Trainable parameters :', trainable_params)
print('Trainable percentage: {:.2f}%'.format(trainable_percentage))

Total parameters : 494032768
Trainable parameters : 494032768
Trainable percentage: 100.00%


## Import Tokenizer

In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

In [10]:
tokenizer

Qwen2TokenizerFast(name_or_path='Qwen/Qwen2.5-0.5B-Instruct', vocab_size=151643, model_max_length=131072, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|im_end|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, 

## Import Dataset

In [11]:
#url = 'https://huggingface.co/datasets/KingNish/reasoning-base-20k'
#dataset_name = url.split('datasets/')[-1]

dataset_name = 'migtissera/Synthia-v1.5-II'

In [12]:
max_length = 128

In [13]:
dataset = load_dataset(dataset_name, split = 'train')
dataset

Dataset({
    features: ['system', 'instruction', 'response'],
    num_rows: 21819
})

In [14]:
dataset.select(range(5)).to_pandas().head()

Unnamed: 0,system,instruction,response
0,You are an AI assistant. Provide a detailed an...,"Here is a detailed, multi-part question on a s...",Here is a detailed response to your multi-part...
1,\n Answer the Question by exploring multipl...,Here is a potential multi-part question on a s...,Here is my attempt at answering this complex m...
2,You are an AI assistant. User will you give yo...,Here is a multi-part graduate level question o...,Here is my analysis of the graduate-level cybe...
3,You are an AI assistant that follows instructi...,Here is a multi-part question on the topic of ...,This is a great question that delves into some...
4,You are an AI assistant that helps people find...,"Here is a detailed, multi-part question on an ...","Thank you for this excellent, comprehensive qu..."


In [15]:
dataset[0]

{'system': "You are an AI assistant. Provide a detailed answer so user don't need to search outside to understand the answer.",
 'instruction': "Here is a detailed, multi-part question on a specific topic within Blockchain and Distributed Ledger Technologies:\n\nConsensus mechanisms are critical components of blockchain systems that enable network participants to reach agreement on the state of the distributed ledger in a trustless, decentralized manner. Proof-of-Work (PoW) and Proof-of-Stake (PoS) are two widely used consensus algorithms.\n\na) Explain the fundamental differences between PoW and PoS consensus mechanisms, including their respective processes for validating transactions and adding new blocks to the blockchain. \n\nb) Analyze the security implications of each algorithm, considering potential attack vectors such as 51% attacks, selfish mining, and long-range attacks. Discuss how each algorithm's design features and economic incentives contribute to network security.\n\nc)

In [16]:
features = list(dataset.features.keys())
print(features)

['system', 'instruction', 'response']


In [17]:
def format_text(example):
  example["text"] = f"System: {example['system']} Instruction: {example['instruction']} Response: {example['response']}"
  return example

In [18]:
formatted_dataset = dataset.map(format_text, remove_columns = features)
formatted_dataset

Dataset({
    features: ['text'],
    num_rows: 21819
})

In [19]:
formatted_dataset[0]

{'text': 'System: You are an AI assistant. Provide a detailed answer so user don\'t need to search outside to understand the answer. Instruction: Here is a detailed, multi-part question on a specific topic within Blockchain and Distributed Ledger Technologies:\n\nConsensus mechanisms are critical components of blockchain systems that enable network participants to reach agreement on the state of the distributed ledger in a trustless, decentralized manner. Proof-of-Work (PoW) and Proof-of-Stake (PoS) are two widely used consensus algorithms.\n\na) Explain the fundamental differences between PoW and PoS consensus mechanisms, including their respective processes for validating transactions and adding new blocks to the blockchain. \n\nb) Analyze the security implications of each algorithm, considering potential attack vectors such as 51% attacks, selfish mining, and long-range attacks. Discuss how each algorithm\'s design features and economic incentives contribute to network security.\n\n

In [20]:
def tokenize_data(example, max_length = max_length):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=max_length)

In [21]:
tokenized_dataset = formatted_dataset.map(tokenize_data, batched = True)#, remove_columns = 'text')
tokenized_dataset

Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 21819
})

In [22]:
tokenized_dataset[0]

{'text': 'System: You are an AI assistant. Provide a detailed answer so user don\'t need to search outside to understand the answer. Instruction: Here is a detailed, multi-part question on a specific topic within Blockchain and Distributed Ledger Technologies:\n\nConsensus mechanisms are critical components of blockchain systems that enable network participants to reach agreement on the state of the distributed ledger in a trustless, decentralized manner. Proof-of-Work (PoW) and Proof-of-Stake (PoS) are two widely used consensus algorithms.\n\na) Explain the fundamental differences between PoW and PoS consensus mechanisms, including their respective processes for validating transactions and adding new blocks to the blockchain. \n\nb) Analyze the security implications of each algorithm, considering potential attack vectors such as 51% attacks, selfish mining, and long-range attacks. Discuss how each algorithm\'s design features and economic incentives contribute to network security.\n\n

In [23]:
dataset = tokenized_dataset.train_test_split(test_size = 0.1, seed = 42)
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'input_ids', 'attention_mask'],
        num_rows: 19637
    })
    test: Dataset({
        features: ['text', 'input_ids', 'attention_mask'],
        num_rows: 2182
    })
})

In [24]:
train_dataset = dataset['train']
test_dataset = dataset['test']
train_dataset

Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 19637
})

In [25]:
train_dataset.select(range(5)).to_pandas().head()

Unnamed: 0,text,input_ids,attention_mask
0,"System: Instruction: Here is a detailed, mult...","[2320, 25, 220, 29051, 25, 5692, 374, 264, 116...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,"System: Instruction: Here is a detailed, mult...","[2320, 25, 220, 29051, 25, 5692, 374, 264, 116...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,"System: Instruction: Here is a detailed, mult...","[2320, 25, 220, 29051, 25, 5692, 374, 264, 116...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,System: Instruction: Here is a graduate level...,"[2320, 25, 220, 29051, 25, 5692, 374, 264, 190...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,"System: Instruction: Here is a detailed, mult...","[2320, 25, 220, 29051, 25, 5692, 374, 264, 116...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [26]:
train_dataset[0]

{'text': 'System:  Instruction: Here is a detailed, multi-part question on an important topic in normative ethics:\n\nOne of the central challenges in normative ethics is reconciling the tension between agent-neutral and agent-relative moral reasons. \n\nPart 1: Explain the key differences between agent-neutral and agent-relative moral reasons, using specific examples to illustrate each category. Discuss how consequentialist moral theories tend to prioritize agent-neutral reasons, while deontological theories give more weight to agent-relative reasons.\n\nPart 2: Critically evaluate the plausibility and implications of ethical egoism, the view that moral agents ought to do what is in their own self-interest. Does ethical egoism represent a coherent and defensible moral theory? Or does it fail to properly incorporate agent-neutral moral considerations? Explain your reasoning.\n\nPart 3: Consider the demandingness objection often posed against consequentialist theories - the idea that ma

## Training Set Up

In [27]:
data_collator = DataCollatorWithPadding(tokenizer = tokenizer)
#data_collator = DataCollatorForLanguageModeling(tokenizer = tokenizer, mlm = False)

In [28]:
def compute_metrics(p: EvalPrediction):
  preds = np.argmax(p.predictions, axis = 1)
  precision, recall, f1, _ = precision_recall_fscore_support(
    p.label_ids,
    preds,
    average = 'weighted'
  )
  matrix = {
    'accuracy': accuracy_score(p.label_ids, preds),
    'f1': f1, 'precision': precision,
    'recall': recall
  }
  return matrix

In [29]:
model.config.pad_token_id = tokenizer.pad_token_id

In [30]:
torch.cuda.empty_cache()

## Training Model

In [31]:
model

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((

In [32]:
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
trainable_percentage = (trainable_params / total_params) * 100

print('Total parameters :', total_params)
print('Trainable parameters :', trainable_params)
print('Trainable percentage: {:.2f}%'.format(trainable_percentage))

Total parameters : 494032768
Trainable parameters : 494032768
Trainable percentage: 100.00%


In [33]:
torch.cuda.empty_cache()

In [34]:
save_path = './model'

batch_size = 8
max_steps = 1000
training_args = TrainingArguments(
  output_dir = save_path,
  gradient_accumulation_steps = batch_size,
  evaluation_strategy = 'steps',
  do_eval = True,
  per_device_train_batch_size = batch_size,
  per_device_eval_batch_size = batch_size,
  log_level = 'debug',
  save_strategy = 'no',
  save_total_limit = 2,
  save_safetensors = False,
  fp16 = False,
  logging_steps = 50,
  learning_rate = 2e-5,
  eval_steps = 50,
  max_steps = max_steps,
  warmup_steps = 30,
  lr_scheduler_type = 'cosine',
)
training_args



TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=50,
eval_strategy=steps,
eval_use_gather_object=False,
evaluation_strategy=steps,
fp16

In [35]:
trainer = SFTTrainer(
  model = model,
  train_dataset = train_dataset.select(range(10000)),
  eval_dataset = test_dataset.select(range(1000)),
  dataset_text_field = 'text',
  max_seq_length = max_length,
  tokenizer = tokenizer,
  args = training_args,
)
trainer


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
max_steps is given, it will override any value given in num_train_epochs


<trl.trainer.sft_trainer.SFTTrainer at 0x7f90bffb3340>

In [36]:
trainer.train()

Currently training with a batch size of: 8
The following columns in the training set don't have a corresponding argument in `Qwen2ForCausalLM.forward` and have been ignored: text. If text are not expected by `Qwen2ForCausalLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10,000
  Num Epochs = 7
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 8
  Total optimization steps = 1,000
  Number of trainable parameters = 494,032,768
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mterlupakan100[0m ([33mterlupakan100-[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss
50,8654190018.56,
100,0.0,
150,0.0,
200,0.0,
250,0.0,
300,0.0,
350,0.0,
400,0.0,
450,0.0,
500,0.0,


The following columns in the evaluation set don't have a corresponding argument in `Qwen2ForCausalLM.forward` and have been ignored: text. If text are not expected by `Qwen2ForCausalLM.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8
We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
The following columns in the evaluation set don't have a corresponding argument in `Qwen2ForCausalLM.forward` and have been ignored: text. If text are not expected by `Qwen2ForCausalLM.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `Qwen2ForCausalLM.forward` and have been ignored: text. I

TrainOutput(global_step=1000, training_loss=432709500.928, metrics={'train_runtime': 2046.6048, 'train_samples_per_second': 31.271, 'train_steps_per_second': 0.489, 'total_flos': 1.7591408001024e+16, 'train_loss': 432709500.928, 'epoch': 6.4})

## Model Evaluation

In [37]:
evaluation_results = trainer.evaluate()
print('Evaluation Results:', evaluation_results)

The following columns in the evaluation set don't have a corresponding argument in `Qwen2ForCausalLM.forward` and have been ignored: text. If text are not expected by `Qwen2ForCausalLM.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8


Evaluation Results: {'eval_loss': nan, 'eval_runtime': 8.9228, 'eval_samples_per_second': 112.072, 'eval_steps_per_second': 14.009, 'epoch': 6.4}


## Save Model

In [38]:
save_model = trainer.model.module if hasattr(trainer.model, 'module') else trainer.model
#save_model.save_pretrained(save_path)