## Importing Modules

In [1]:
import os
import pathlib
import numpy as np
import pandas as pd
import nltk
from trl import SFTTrainer

import torch
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline, GPT2Config, TextDataset
from tqdm.auto import tqdm
import random
import datetime
import time
import statistics
from nltk.translate.bleu_score import sentence_bleu
from transformers import TrainingArguments, Trainer, set_seed, EvalPrediction, DataCollatorWithPadding
from datasets import load_dataset
from datasets import Dataset
from transformers import AutoModel, AutoTokenizer, TFAutoModel, AutoModelForSequenceClassification
from peft import LoraConfig, PeftModelForSequenceClassification, TaskType, AutoPeftModelForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

if torch.cuda.is_available():
    print("GPU is available!")
else:
    print("GPU is not available.")

  from .autonotebook import tqdm as notebook_tqdm



GPU is available!


In [2]:
MAIN_PATH = str(pathlib.Path().resolve())
DATASET_PATH = MAIN_PATH + '\\datasets'
MODEL_PATH = MAIN_PATH + '\\models'

In [3]:
models = os.listdir(MODEL_PATH)
models

['bert-base-cased',
 'bert-base-multilingual-cased',
 'bert-base-uncased',
 'bert-large-cased',
 'bert-large-uncased',
 'flan-t5-base',
 'flan-t5-large',
 'flan-t5-small',
 'gpt2',
 'gpt2-large',
 'gpt2-medium']

In [4]:
model_path = MODEL_PATH + '\\' + models[8]
model_path

'D:\\Python\\LLM_Environment\\models\\gpt2'

In [5]:
model = AutoModelForCausalLM.from_pretrained(model_path)

In [6]:
torch.cuda.empty_cache()

In [7]:
tokenizer = AutoTokenizer.from_pretrained(model_path)

## Import Dataset

In [8]:
dataset_name = "meta-math/MetaMathQA"

dataset = load_dataset(dataset_name, split='train')
dataset.to_pandas().head()

Unnamed: 0,type,query,original_question,response
0,MATH_AnsAug,Gracie and Joe are choosing numbers on the com...,Gracie and Joe are choosing numbers on the com...,"The distance between two points $(x_1,y_1)$ an..."
1,GSM_Rephrased,What is the total cost of purchasing equipment...,The treasurer of a football team must buy equi...,"Each player requires a $25 jersey, a $15.20 pa..."
2,GSM_SV,Diego baked 12 cakes for his sister's birthday...,Diego baked 12 cakes for his sister's birthday...,"To solve this problem, we need to determine th..."
3,MATH_AnsAug,Convert $10101_3$ to a base 10 integer.,Convert $10101_3$ to a base 10 integer.,$10101_3 = 1 \cdot 3^4 + 0 \cdot 3^3 + 1 \cdot...
4,GSM_FOBAR,"Sue works in a factory and every 30 minutes, a...","Sue works in a factory and every 30 minutes, a...","We know that every 30 minutes, a machine produ..."


In [9]:
dataset

Dataset({
    features: ['type', 'query', 'original_question', 'response'],
    num_rows: 395000
})

In [10]:
def prepare_datasets(example):
    example['prompt'] = f"""<|system|>
    You are a intelligent chatbot and expertise in Mathematics.</s>
    <|user|>
    {example['query']}.
    <|assistant|>
    {example['response']}"""
    return example

def tokenize_datasets(dataset):
    tokenized_dataset = dataset.map(
        lambda example: tokenizer(
            example['prompt'],
            truncation=True,
            max_length=512,
            ),
        batched=True,
        remove_columns=['prompt'])
    return tokenized_dataset

In [11]:
dataset = dataset.map(
    prepare_datasets, remove_columns=['query', 'response', "original_question", "type"]
)
dataset = dataset.shuffle(42).select(range(395000)).train_test_split(test_size=0.1, seed=42)

In [12]:
train_dataset = dataset['train']
test_dataset = dataset['test']

In [13]:
tokenizer.pad_token = tokenizer.eos_token

In [14]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
data_collator

DataCollatorForLanguageModeling(tokenizer=GPT2TokenizerFast(name_or_path='D:\Python\LLM_Environment\models\gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}, mlm=False, mlm_probability=0.15, pad_to_multiple_of=None, tf_experimental_compile=False, return_tensors='pt')

In [15]:
save_path = './model'
# Define your training arguments
batch_size = 4
max_steps = 100
training_args = TrainingArguments(
    output_dir=save_path,
    gradient_accumulation_steps=batch_size,
    evaluation_strategy="steps",
    do_eval=True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    log_level="debug",
    save_strategy="no",
    save_total_limit=2,
    save_safetensors=False,
    fp16=True,
    logging_steps=50,
    learning_rate=2e-5,
    eval_steps=50,
    max_steps=max_steps,
    warmup_steps=30,
    lr_scheduler_type="cosine",
)



In [16]:
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    dataset_text_field="prompt",
    max_seq_length=512,
    tokenizer=tokenizer,
    args=training_args,
)



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
max_steps is given, it will override any value given in num_train_epochs
Using auto half precision backend


In [17]:
trainer.train()

Currently training with a batch size of: 4
***** Running training *****
  Num examples = 355,500
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 4
  Total optimization steps = 100
  Number of trainable parameters = 124,439,808
  attn_output = torch.nn.functional.scaled_dot_product_attention(
 50%|█████     | 50/100 [00:24<00:22,  2.21it/s]
***** Running Evaluation *****
  Num examples = 39500
  Batch size = 4


{'loss': 2.3817, 'grad_norm': 3.72003173828125, 'learning_rate': 1.657938725939713e-05, 'epoch': 0.0}



 50%|█████     | 50/100 [07:01<00:22,  2.21it/s]   

{'eval_loss': 1.5409225225448608, 'eval_runtime': 396.7402, 'eval_samples_per_second': 99.561, 'eval_steps_per_second': 24.89, 'epoch': 0.0}


100%|██████████| 100/100 [07:25<00:00,  2.07it/s]  
***** Running Evaluation *****
  Num examples = 39500
  Batch size = 4


{'loss': 1.6179, 'grad_norm': 2.8454039096832275, 'learning_rate': 1.0069334586854106e-08, 'epoch': 0.0}



100%|██████████| 100/100 [13:58<00:00,  2.07it/s]  

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 100/100 [13:58<00:00,  8.38s/it]

{'eval_loss': 1.435025691986084, 'eval_runtime': 392.6447, 'eval_samples_per_second': 100.6, 'eval_steps_per_second': 25.15, 'epoch': 0.0}
{'train_runtime': 838.3314, 'train_samples_per_second': 1.909, 'train_steps_per_second': 0.119, 'train_loss': 1.9997915649414062, 'epoch': 0.0}





TrainOutput(global_step=100, training_loss=1.9997915649414062, metrics={'train_runtime': 838.3314, 'train_samples_per_second': 1.909, 'train_steps_per_second': 0.119, 'total_flos': 302157697536000.0, 'train_loss': 1.9997915649414062, 'epoch': 0.00450070323488045})