## Importing Modules

In [1]:
import os
import pathlib
import torch

from datasets import load_dataset
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from trl import SFTTrainer

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoModel,
    AutoModelForSequenceClassification,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    pipeline,
    TextDataset,
    EvalPrediction,
    DataCollatorWithPadding,
    GenerationConfig,
    BitsAndBytesConfig
)

from peft import (
    LoraConfig,
    PeftModelForSequenceClassification,
    TaskType,
    AutoPeftModelForSequenceClassification,
    get_peft_model
)

if torch.cuda.is_available():
    print("GPU is available!")
else:
    print("GPU is not available.")

  from .autonotebook import tqdm as notebook_tqdm



GPU is available!


## Set Up Directories

In [2]:
MAIN_PATH = str(pathlib.Path().resolve())
DATASET_PATH = MAIN_PATH + '\\datasets'
MODEL_PATH = MAIN_PATH + '\\models'
SAVE_PATH = MAIN_PATH + '\\fine_tuned_models'
MODELS = 'D:\\AI\\LLM\\models'

## Load Model

In [3]:
models = os.listdir(MODELS)
models

['config.yaml',
 'gemma-2-2b',
 'Llama-3-8B-GPTQ-4-Bit.safetensors',
 'Llama-3-8B-Instruct-GPTQ-4-Bit.safetensors',
 'Llama-3.2-11B-Vision-Instruct-bnb-4bit',
 'Llama-3.2-1B-Instruct',
 'Llama-3.2-3B-Instruct',
 'Meta-Llama-3.1-8B-Instruct-GPTQ-INT4',
 'Phi-3-mini-128k-instruct',
 'Phi-3-mini-128k-instruct-onnx',
 'Phi-3-mini-4k-instruct-q4.gguf',
 'place-your-models-here.txt',
 'Qwen2.5-0.5B',
 'Qwen2.5-1.5B',
 'Qwen2.5-3B',
 'Qwen2.5-7B-Instruct-GPTQ-Int4']

In [4]:
model_path = MODELS + '\\' + models[12]
model_path

'D:\\AI\\LLM\\models\\Qwen2.5-0.5B'

In [5]:
torch.cuda.empty_cache()

In [6]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type = 'nf4',
    bnb_4bit_compute_dtype = torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config = bnb_config,
    trust_remote_code = True
)
model.config.use_cache = False

`low_cpu_mem_usage` was None, now set to True since model is quantized.


In [7]:
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
trainable_percentage = (trainable_params / total_params) * 100

print('Total parameters :', total_params)
print('Trainable parameters :', trainable_params)
print('Trainable percentage: {:.2f}%'.format(trainable_percentage))

Total parameters : 315119488
Trainable parameters : 136178560
Trainable percentage: 43.21%


## Load Tokenizer

In [8]:
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code = True)
tokenizer.pad_token = tokenizer.eos_token

## Import Dataset

In [9]:
dataset_name = 'meta-math/MetaMathQA'

dataset = load_dataset(dataset_name, split = 'train')
dataset.to_pandas().head()

Unnamed: 0,type,query,original_question,response
0,MATH_AnsAug,Gracie and Joe are choosing numbers on the com...,Gracie and Joe are choosing numbers on the com...,"The distance between two points $(x_1,y_1)$ an..."
1,GSM_Rephrased,What is the total cost of purchasing equipment...,The treasurer of a football team must buy equi...,"Each player requires a $25 jersey, a $15.20 pa..."
2,GSM_SV,Diego baked 12 cakes for his sister's birthday...,Diego baked 12 cakes for his sister's birthday...,"To solve this problem, we need to determine th..."
3,MATH_AnsAug,Convert $10101_3$ to a base 10 integer.,Convert $10101_3$ to a base 10 integer.,$10101_3 = 1 \cdot 3^4 + 0 \cdot 3^3 + 1 \cdot...
4,GSM_FOBAR,"Sue works in a factory and every 30 minutes, a...","Sue works in a factory and every 30 minutes, a...","We know that every 30 minutes, a machine produ..."


In [10]:
dataset

Dataset({
    features: ['type', 'query', 'original_question', 'response'],
    num_rows: 395000
})

In [11]:
dataset = dataset.train_test_split(test_size = 0.1, shuffle = True)
train_dataset = dataset['train']
test_dataset = dataset['test']

In [12]:
train_dataset

Dataset({
    features: ['type', 'query', 'original_question', 'response'],
    num_rows: 355500
})

## Set Up PEFT

In [13]:
lora_alpha = 16
lora_dropout = 0.1
lora_r = 64

peft_config = LoraConfig(
    lora_alpha = lora_alpha,
    lora_dropout = lora_dropout,
    r = lora_r,
    bias = 'none',
    task_type = 'CAUSAL_LM',
)

## Set Up Training

In [14]:
per_device_train_batch_size = 4
gradient_accumulation_steps = 4
optim = 'paged_adamw_32bit'
save_steps = 100
logging_steps = 10
learning_rate = 2e-4
max_grad_norm = 0.3
max_steps = 100
warmup_ratio = 0.03
lr_scheduler_type = 'constant'

training_arguments = TrainingArguments(
    output_dir = SAVE_PATH,
    per_device_train_batch_size = per_device_train_batch_size,
    gradient_accumulation_steps = gradient_accumulation_steps,
    optim = optim,
    save_steps = save_steps,
    logging_steps = logging_steps,
    learning_rate = learning_rate,
    fp16 = True,
    max_grad_norm = max_grad_norm,
    max_steps = max_steps,
    warmup_ratio = warmup_ratio,
    group_by_length = True,
    lr_scheduler_type = lr_scheduler_type,
)

In [15]:
max_seq_length = 512

trainer = SFTTrainer(
    model = model,
    train_dataset = train_dataset,
    eval_dataset = test_dataset,
    peft_config = peft_config,
    dataset_text_field = 'response',
    max_seq_length = max_seq_length,
    tokenizer = tokenizer,
    args = training_arguments,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
Map: 100%|██████████| 355500/355500 [00:40<00:00, 8681.21 examples/s]
Map: 100%|██████████| 39500/39500 [00:04<00:00, 8384.80 examples/s]
max_steps is given, it will override any value given in num_train_epochs


In [16]:
for name, module in trainer.model.named_modules():
    if 'norm' in name:
        module = module.to(torch.float32)

In [17]:
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
trainable_percentage = (trainable_params / total_params) * 100

print('Total parameters :', total_params)
print('Trainable parameters :', trainable_params)
print('Trainable percentage: {:.2f}%'.format(trainable_percentage))

Total parameters : 319444864
Trainable parameters : 4325376
Trainable percentage: 1.35%


In [18]:
trainer.train()

  attn_output = torch.nn.functional.scaled_dot_product_attention(
 10%|█         | 10/100 [01:13<07:59,  5.33s/it]

{'loss': 0.6063, 'grad_norm': 0.19644689559936523, 'learning_rate': 0.0002, 'epoch': 0.0}


 20%|██        | 20/100 [01:28<01:54,  1.43s/it]

{'loss': 0.7052, 'grad_norm': 0.19358541071414948, 'learning_rate': 0.0002, 'epoch': 0.0}


 30%|███       | 30/100 [01:58<04:48,  4.12s/it]

{'loss': 0.8199, 'grad_norm': 0.22289781272411346, 'learning_rate': 0.0002, 'epoch': 0.0}


 40%|████      | 40/100 [02:32<03:10,  3.18s/it]

{'loss': 0.8642, 'grad_norm': 0.2326933592557907, 'learning_rate': 0.0002, 'epoch': 0.0}


 50%|█████     | 50/100 [02:57<02:03,  2.47s/it]

{'loss': 0.9893, 'grad_norm': 0.4852098524570465, 'learning_rate': 0.0002, 'epoch': 0.0}


 60%|██████    | 60/100 [04:03<02:55,  4.38s/it]

{'loss': 0.5413, 'grad_norm': 0.15698479115962982, 'learning_rate': 0.0002, 'epoch': 0.0}


 70%|███████   | 70/100 [04:18<00:41,  1.38s/it]

{'loss': 0.6597, 'grad_norm': 0.18916648626327515, 'learning_rate': 0.0002, 'epoch': 0.0}


 80%|████████  | 80/100 [04:47<01:20,  4.04s/it]

{'loss': 0.7478, 'grad_norm': 0.21796047687530518, 'learning_rate': 0.0002, 'epoch': 0.0}


 90%|█████████ | 90/100 [05:22<00:31,  3.11s/it]

{'loss': 0.8381, 'grad_norm': 0.2530665695667267, 'learning_rate': 0.0002, 'epoch': 0.0}


100%|██████████| 100/100 [05:47<00:00,  2.55s/it]

{'loss': 0.9722, 'grad_norm': 0.5062568783760071, 'learning_rate': 0.0002, 'epoch': 0.0}


100%|██████████| 100/100 [05:47<00:00,  3.47s/it]

{'train_runtime': 347.58, 'train_samples_per_second': 4.603, 'train_steps_per_second': 0.288, 'train_loss': 0.7743999004364014, 'epoch': 0.0}





TrainOutput(global_step=100, training_loss=0.7743999004364014, metrics={'train_runtime': 347.58, 'train_samples_per_second': 4.603, 'train_steps_per_second': 0.288, 'total_flos': 605440622358528.0, 'train_loss': 0.7743999004364014, 'epoch': 0.00450070323488045})

In [19]:
save_model = trainer.model.module if hasattr(trainer.model, 'module') else trainer.model
#save_model.save_pretrained(SAVE_PATH)