## Importing Modules

In [1]:
import os
import pathlib
import torch

from datasets import load_dataset
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from trl import SFTTrainer

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoModel,
    AutoModelForSequenceClassification,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    pipeline,
    TextDataset,
    EvalPrediction,
    DataCollatorWithPadding,
    GenerationConfig,
    BitsAndBytesConfig
)

from peft import (
    LoraConfig,
    PeftModelForSequenceClassification,
    TaskType,
    AutoPeftModelForSequenceClassification,
    get_peft_model,
    prepare_model_for_kbit_training
)

if torch.cuda.is_available():
    print("GPU is available!")
else:
    print("GPU is not available.")

  from .autonotebook import tqdm as notebook_tqdm



GPU is available!


## Set Up Directories

In [2]:
MAIN_PATH = str(pathlib.Path().resolve())
DATASET_PATH = MAIN_PATH + '\\datasets'
MODEL_PATH = MAIN_PATH + '\\models'
SAVE_PATH = MAIN_PATH + '\\fine_tuned_models'
MODELS = 'D:\\AI\\LLM\\models'

## Load Model

In [3]:
models = os.listdir(MODELS)
models

['config.yaml',
 'gemma-2-2b',
 'Llama-3-8B-GPTQ-4-Bit.safetensors',
 'Llama-3-8B-Instruct-GPTQ-4-Bit.safetensors',
 'Llama-3.2-11B-Vision-Instruct-bnb-4bit',
 'Llama-3.2-1B-Instruct',
 'Llama-3.2-3B-Instruct',
 'Meta-Llama-3.1-8B-Instruct-GPTQ-INT4',
 'Phi-3-mini-128k-instruct',
 'Phi-3-mini-128k-instruct-onnx',
 'Phi-3-mini-4k-instruct-q4.gguf',
 'place-your-models-here.txt',
 'Qwen2.5-0.5B',
 'Qwen2.5-1.5B',
 'Qwen2.5-3B',
 'Qwen2.5-7B-Instruct-GPTQ-Int4']

In [4]:
model_path = MODELS + '\\' + models[5]
model_path

'D:\\AI\\LLM\\models\\Llama-3.2-1B-Instruct'

In [5]:
torch.cuda.empty_cache()

In [6]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [7]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type = 'nf4',
    bnb_4bit_compute_dtype = torch.float16,
    bnb_4bit_use_double_quant = True,
)

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config = bnb_config,
    trust_remote_code = True
)#.to(device)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


In [8]:
for param in model.parameters():
    print(param.dtype)
    break

torch.float16


In [9]:
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
trainable_percentage = (trainable_params / total_params) * 100

print('Total parameters :', total_params)
print('Trainable parameters :', trainable_params)
print('Trainable percentage: {:.2f}%'.format(trainable_percentage))

Total parameters : 749275136
Trainable parameters : 262735872
Trainable percentage: 35.07%


## Load Tokenizer

In [10]:
tokenizer = AutoTokenizer.from_pretrained(model_path, padding_side = 'left')
tokenizer.pad_token = tokenizer.eos_token

In [11]:
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token' : '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))
    model.config.pad_token_id = tokenizer.pad_token_id
    model.generation_config.pad_token_id = tokenizer.pad_token_id

### Model Preview

In [12]:
inputs = ['hello, my name is nitish pandey', 'hello']
input_tok = tokenizer(inputs, padding = True, truncation = True, return_tensors = 'pt').to(device)
output = model.generate(**input_tok, max_length = 300)
print(tokenizer.decode(output[1]))

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  attn_output = torch.nn.functional.scaled_dot_product_attention(
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


<|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|begin_of_text|>hello, I am using a 2015 2nd gen Mustang GT 3.2L V6 engine, with a 6-speed manual transmission. I'm looking for a reliable and affordable solution to upgrade the exhaust system. I've considered purchasing a used or new exhaust system from a reputable aftermarket manufacturer. I've researched and found several options, including the following:

* 4th Gen Mustang GT 4.0L V6 exhaust system with a 4-speed automatic transmission
* 4th Gen Mustang GT 4.0L V6 exhaust system with a 6-speed automatic transmission
* 4th Gen Mustang GT 4.0L V6 exhaust system with a 4-speed automatic transmission and a catalytic converter
* 4th Gen Mustang GT 4.0L V6 exhaust system with a 6-speed automatic transmission and a catalytic converter

I've also considered purchasing a used exhaust system from a reputable seller, such as:

* Ford Performance
* Edelbrock
* Garrett
* JE Motorsports

I'm looking for a reliable a

## Import Dataset

In [13]:
dataset_name = 'meta-math/MetaMathQA'

dataset = load_dataset(dataset_name, split = 'train')
dataset.to_pandas().head()

Unnamed: 0,type,query,original_question,response
0,MATH_AnsAug,Gracie and Joe are choosing numbers on the com...,Gracie and Joe are choosing numbers on the com...,"The distance between two points $(x_1,y_1)$ an..."
1,GSM_Rephrased,What is the total cost of purchasing equipment...,The treasurer of a football team must buy equi...,"Each player requires a $25 jersey, a $15.20 pa..."
2,GSM_SV,Diego baked 12 cakes for his sister's birthday...,Diego baked 12 cakes for his sister's birthday...,"To solve this problem, we need to determine th..."
3,MATH_AnsAug,Convert $10101_3$ to a base 10 integer.,Convert $10101_3$ to a base 10 integer.,$10101_3 = 1 \cdot 3^4 + 0 \cdot 3^3 + 1 \cdot...
4,GSM_FOBAR,"Sue works in a factory and every 30 minutes, a...","Sue works in a factory and every 30 minutes, a...","We know that every 30 minutes, a machine produ..."


In [14]:
dataset

Dataset({
    features: ['type', 'query', 'original_question', 'response'],
    num_rows: 395000
})

In [15]:
dataset = dataset.train_test_split(test_size = 0.1, shuffle = True)
train_dataset = dataset['train']
test_dataset = dataset['test']

In [16]:
train_dataset

Dataset({
    features: ['type', 'query', 'original_question', 'response'],
    num_rows: 355500
})

In [17]:
test_dataset

Dataset({
    features: ['type', 'query', 'original_question', 'response'],
    num_rows: 39500
})

## Set Up PEFT

In [18]:
lora_alpha = 16
lora_dropout = 0.1
lora_r = 64

peft_config = LoraConfig(
    lora_alpha = lora_alpha,
    lora_dropout = lora_dropout,
    r = lora_r,
    bias = 'none',
    task_type = 'CAUSAL_LM',
)

In [19]:
peft_model = get_peft_model(model, peft_config, adapter_name = 'math')
peft_model.print_trainable_parameters()

trainable params: 6,815,744 || all params: 1,242,630,144 || trainable%: 0.5485


## Set Up Training

In [20]:
data_collator = DataCollatorForLanguageModeling(tokenizer = tokenizer, mlm = False)

In [21]:
per_device_train_batch_size = 4
gradient_accumulation_steps = 4
optim = 'paged_adamw_32bit'
save_steps = 100
logging_steps = 10
learning_rate = 2e-4
max_grad_norm = 0.3
max_steps = 100
warmup_ratio = 0.03
lr_scheduler_type = 'constant'

training_arguments = TrainingArguments(
    output_dir = SAVE_PATH,
    per_device_train_batch_size = per_device_train_batch_size,
    gradient_accumulation_steps = gradient_accumulation_steps,
    optim = optim,
    save_steps = save_steps,
    logging_steps = logging_steps,
    learning_rate = learning_rate,
    fp16 = True,
    max_grad_norm = max_grad_norm,
    max_steps = max_steps,
    warmup_ratio = warmup_ratio,
    group_by_length = True,
    lr_scheduler_type = lr_scheduler_type,
)

In [22]:
max_seq_length = 512

trainer = SFTTrainer(
    model = model,
    train_dataset = train_dataset,
    eval_dataset = test_dataset,
    peft_config = peft_config,
    dataset_text_field = 'response',
    max_seq_length = max_seq_length,
    tokenizer = tokenizer,
    args = training_arguments,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
Map: 100%|██████████| 355500/355500 [00:37<00:00, 9559.93 examples/s] 
Map: 100%|██████████| 39500/39500 [00:04<00:00, 9330.91 examples/s] 
max_steps is given, it will override any value given in num_train_epochs


In [23]:
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
trainable_percentage = (trainable_params / total_params) * 100

print('Total parameters :', total_params)
print('Trainable parameters :', trainable_params)
print('Trainable percentage: {:.2f}%'.format(trainable_percentage))

Total parameters : 762906624
Trainable parameters : 6815744
Trainable percentage: 0.89%


In [None]:
trainer.train()

 10%|█         | 10/100 [02:13<14:23,  9.60s/it]

{'loss': 0.91, 'grad_norm': 0.13153253495693207, 'learning_rate': 0.0002, 'epoch': 0.0}


 20%|██        | 20/100 [03:17<08:10,  6.13s/it]

{'loss': 1.0031, 'grad_norm': 0.20793874561786652, 'learning_rate': 0.0002, 'epoch': 0.0}


 30%|███       | 30/100 [04:05<05:16,  4.53s/it]

{'loss': 0.96, 'grad_norm': 0.2766612768173218, 'learning_rate': 0.0002, 'epoch': 0.0}


 40%|████      | 40/100 [04:32<02:01,  2.02s/it]

{'loss': 1.0362, 'grad_norm': 0.2981817126274109, 'learning_rate': 0.0002, 'epoch': 0.0}


 50%|█████     | 50/100 [04:47<01:22,  1.65s/it]

{'loss': 1.2535, 'grad_norm': 1.8408682346343994, 'learning_rate': 0.0002, 'epoch': 0.0}


 60%|██████    | 60/100 [07:02<06:23,  9.58s/it]

{'loss': 0.6949, 'grad_norm': 0.18175795674324036, 'learning_rate': 0.0002, 'epoch': 0.0}


 70%|███████   | 70/100 [08:03<02:57,  5.93s/it]

{'loss': 0.7971, 'grad_norm': 0.21540848910808563, 'learning_rate': 0.0002, 'epoch': 0.0}


 80%|████████  | 80/100 [08:48<01:26,  4.32s/it]

{'loss': 0.9169, 'grad_norm': 0.24530212581157684, 'learning_rate': 0.0002, 'epoch': 0.0}


 90%|█████████ | 90/100 [09:14<00:19,  1.93s/it]

{'loss': 0.9769, 'grad_norm': 0.3442721664905548, 'learning_rate': 0.0002, 'epoch': 0.0}


100%|██████████| 100/100 [09:28<00:00,  1.65s/it]

{'loss': 1.1093, 'grad_norm': 0.6363228559494019, 'learning_rate': 0.0002, 'epoch': 0.0}


In [15]:
save_model = trainer.model.module if hasattr(trainer.model, 'module') else trainer.model
#save_model.save_pretrained(SAVE_PATH)