## Importing Modules

In [1]:
import os
import pathlib
import numpy as np
import pandas as pd
import nltk

import torch
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

from tqdm.auto import tqdm
import random
import datetime
import time
import statistics
from nltk.translate.bleu_score import sentence_bleu
from transformers import TrainingArguments, Trainer
from datasets import load_dataset

if torch.cuda.is_available():
    print("GPU is available!")
else:
    print("GPU is not available.")

  from .autonotebook import tqdm as notebook_tqdm



GPU is available!


In [2]:
MAIN_PATH = str(pathlib.Path().resolve())
DATASET_PATH = MAIN_PATH + '\\datasets'
MODEL_PATH = MAIN_PATH + '\\models'

In [3]:
models = os.listdir(MODEL_PATH)
models

['bert-base-cased',
 'bert-base-multilingual-cased',
 'bert-base-uncased',
 'bert-large-cased',
 'bert-large-uncased',
 'flan-t5-base',
 'flan-t5-large',
 'flan-t5-small',
 'gpt2',
 'gpt2-large',
 'gpt2-medium',
 'tuned_text_gen']

In [4]:
model_path = MODEL_PATH + '\\' + models[8]
model_path

'D:\\Python\\LLM_Environment\\models\\gpt2'

In [5]:
model = AutoModelForCausalLM.from_pretrained(model_path)

In [6]:
torch.cuda.empty_cache()

## Import Dataset

In [7]:
dataset_name = "databricks/databricks-dolly-15k"

In [8]:
dataset = load_dataset(dataset_name, split='train')
# dataset.to_pandas().sample(2000)
dataset.to_pandas()

Unnamed: 0,instruction,context,response,category
0,When did Virgin Australia start operating?,"Virgin Australia, the trading name of Virgin A...",Virgin Australia commenced services on 31 Augu...,closed_qa
1,Which is a species of fish? Tope or Rope,,Tope,classification
2,Why can camels survive for long without water?,,Camels use the fat in their humps to keep them...,open_qa
3,"Alice's parents have three daughters: Amy, Jes...",,The name of the third daughter is Alice,open_qa
4,When was Tomoaki Komorida born?,Komorida was born in Kumamoto Prefecture on Ju...,"Tomoaki Komorida was born on July 10,1981.",closed_qa
...,...,...,...,...
15006,How do i accept the change,,Embrace the change and see the difference,brainstorming
15007,What is a laser and who created it?,A laser is a device that emits light through a...,A laser is a device that emits light from an e...,summarization
15008,What is the difference between a road bike and...,,Road bikes are built to be ridden on asphalt a...,open_qa
15009,How does GIS help in the real estate investmen...,,"Real estate investors depend on precise, accur...",general_qa


In [9]:
def preprocess(example):
  # example['prompt'] = f'{example["instruction"]} {example["input"]} {example["output"]}'
  example['prompt'] = f'{example["context"]} {example["instruction"]} {example["response"]}'
  return example


def tokenize_datasets(dataset):
  tokenized_dataset = dataset.map(
      lambda example: tokenizer(
          example['prompt'],
          truncation=True,
          max_length=128,
          ),
      batched=True,
      remove_columns=['prompt']
  )
  return tokenized_dataset

In [10]:
dataset = dataset.map(
    preprocess, remove_columns=['context', 'instruction', 'response']
)
dataset = dataset.shuffle(42).select(range(15000)).train_test_split(test_size=0.1, seed=42)

In [11]:
dataset

DatasetDict({
    train: Dataset({
        features: ['category', 'prompt'],
        num_rows: 13500
    })
    test: Dataset({
        features: ['category', 'prompt'],
        num_rows: 1500
    })
})

In [12]:
train_dataset = dataset['train']
test_dataset = dataset['test']

In [13]:
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token

train_dataset = tokenize_datasets(train_dataset)
test_dataset = tokenize_datasets(test_dataset)

In [14]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
data_collator

DataCollatorForLanguageModeling(tokenizer=GPT2TokenizerFast(name_or_path='D:\Python\LLM_Environment\models\gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}, mlm=False, mlm_probability=0.15, pad_to_multiple_of=None, tf_experimental_compile=False, return_tensors='pt')

In [15]:
batch_size = 16
training_args = TrainingArguments(
    output_dir="./models/tuned_text_gen",
    num_train_epochs=3,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size
)


training_args

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=no,
eval_use_gather_object=False,
evaluation_strategy=None,
fp16=

In [16]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator
)

In [17]:
trainer.train()

  attn_output = torch.nn.functional.scaled_dot_product_attention(
 20%|█▉        | 500/2532 [02:21<09:38,  3.51it/s]

{'loss': 3.1434, 'grad_norm': 2.791034698486328, 'learning_rate': 4.012638230647709e-05, 'epoch': 0.59}


 39%|███▉      | 1000/2532 [04:47<07:20,  3.48it/s]

{'loss': 3.0011, 'grad_norm': 2.865220308303833, 'learning_rate': 3.0252764612954187e-05, 'epoch': 1.18}


 59%|█████▉    | 1500/2532 [07:12<04:58,  3.46it/s]

{'loss': 2.8567, 'grad_norm': 2.629260301589966, 'learning_rate': 2.037914691943128e-05, 'epoch': 1.78}


 79%|███████▉  | 2000/2532 [09:38<02:33,  3.48it/s]

{'loss': 2.7748, 'grad_norm': 2.4576025009155273, 'learning_rate': 1.0505529225908373e-05, 'epoch': 2.37}


 99%|█████████▊| 2500/2532 [12:03<00:08,  3.60it/s]

{'loss': 2.7585, 'grad_norm': 2.9253127574920654, 'learning_rate': 6.31911532385466e-07, 'epoch': 2.96}


100%|██████████| 2532/2532 [12:16<00:00,  3.44it/s]

{'train_runtime': 736.732, 'train_samples_per_second': 54.973, 'train_steps_per_second': 3.437, 'train_loss': 2.9050008403181464, 'epoch': 3.0}





TrainOutput(global_step=2532, training_loss=2.9050008403181464, metrics={'train_runtime': 736.732, 'train_samples_per_second': 54.973, 'train_steps_per_second': 3.437, 'total_flos': 2645581824000000.0, 'train_loss': 2.9050008403181464, 'epoch': 3.0})