## Importing Modules

In [1]:
import os
import pathlib
import numpy as np
import pandas as pd
import nltk

import torch
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline, GPT2Config, TextDataset
from tqdm.auto import tqdm
import random
import datetime
import time
import statistics
from nltk.translate.bleu_score import sentence_bleu
from transformers import TrainingArguments, Trainer, set_seed
from datasets import load_dataset

if torch.cuda.is_available():
    print("GPU is available!")
else:
    print("GPU is not available.")

  from .autonotebook import tqdm as notebook_tqdm



GPU is available!


In [2]:
MAIN_PATH = str(pathlib.Path().resolve())
DATASET_PATH = MAIN_PATH + '\\datasets'
MODEL_PATH = MAIN_PATH + '\\models'

In [3]:
models = os.listdir(MODEL_PATH)
models

['bert-base-cased',
 'bert-base-multilingual-cased',
 'bert-base-uncased',
 'bert-large-cased',
 'bert-large-uncased',
 'flan-t5-base',
 'flan-t5-large',
 'flan-t5-small',
 'gpt2',
 'gpt2-large',
 'gpt2-medium',
 'tuned_text_gen']

In [4]:
model_path = MODEL_PATH + '\\' + models[8]
model_path

'D:\\Python\\LLM_Environment\\models\\gpt2'

In [5]:
model = GPT2LMHeadModel.from_pretrained(model_path)

In [7]:
torch.cuda.empty_cache()

In [8]:
tokenizer = GPT2Tokenizer.from_pretrained(model_path)

## Import Dataset

In [9]:
filenames = os.listdir(DATASET_PATH)
filenames

['Html.csv', 'Recipes.csv', 'Recipes_1000.csv', 'Shakespeare_Dataset.txt']

In [10]:
file_path = DATASET_PATH + '\\' + filenames[3]
file_path

'D:\\Python\\LLM_Environment\\datasets\\Shakespeare_Dataset.txt'

In [11]:
# Load your Shakespeare dataset
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=file_path,
    block_size=128,
)



In [12]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # No masked language modeling for GPT-2
)

In [14]:
dataset[0]

tensor([ 5962, 22307,    25,   198,  8421,   356,  5120,   597,  2252,    11,
         3285,   502,  2740,    13,   198,   198,  3237,    25,   198,  5248,
          461,    11,  2740,    13,   198,   198,  5962, 22307,    25,   198,
         1639,   389,   477, 12939,  2138,   284,  4656,   621,   284,  1145,
          680,    30,   198,   198,  3237,    25,   198,  4965,  5634,    13,
        12939,    13,   198,   198,  5962, 22307,    25,   198,  5962,    11,
          345,   760,   327,  1872,   385,  1526, 28599,   318,  4039,  4472,
          284,   262,   661,    13,   198,   198,  3237,    25,   198,  1135,
          760,   470,    11,   356,   760,   470,    13,   198,   198,  5962,
        22307,    25,   198,  5756,   514,  1494,   683,    11,   290,   356,
         1183,   423, 11676,   379,   674,   898,  2756,    13,   198,  3792,
          470,   257, 15593,    30,   198,   198,  3237,    25,   198,  2949,
          517,  3375,   319,   470,    26,  1309,   340,   307])

In [15]:
tokenizer.decode(dataset[0])

"First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you know Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us kill him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be"

In [16]:
save_path = './model'
# Define training arguments
training_args = TrainingArguments(
    output_dir=save_path,
    overwrite_output_dir=True,
    num_train_epochs=10,  # Adjust the number of epochs based on your needs
    per_device_train_batch_size=4,  # Adjust batch size based on GPU memory
    save_steps=10_000,  # Adjust save steps based on your needs
)


In [17]:
# Create Trainer and fine-tune the model
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset
)

In [18]:
trainer.train()

  attn_output = torch.nn.functional.scaled_dot_product_attention(
  8%|▊         | 501/6600 [00:50<13:15,  7.66it/s]

{'loss': 3.7189, 'grad_norm': 4.2895188331604, 'learning_rate': 4.621212121212121e-05, 'epoch': 0.76}


 15%|█▌        | 1002/6600 [01:42<11:38,  8.01it/s]

{'loss': 3.4092, 'grad_norm': 4.196163177490234, 'learning_rate': 4.242424242424243e-05, 'epoch': 1.52}


 23%|██▎       | 1502/6600 [02:35<11:08,  7.62it/s]

{'loss': 3.2666, 'grad_norm': 4.276994228363037, 'learning_rate': 3.8636363636363636e-05, 'epoch': 2.27}


 30%|███       | 2002/6600 [03:27<09:55,  7.72it/s]

{'loss': 3.1405, 'grad_norm': 4.117074489593506, 'learning_rate': 3.484848484848485e-05, 'epoch': 3.03}


 38%|███▊      | 2501/6600 [04:21<09:52,  6.92it/s]

{'loss': 3.0075, 'grad_norm': 4.412722587585449, 'learning_rate': 3.106060606060606e-05, 'epoch': 3.79}


 45%|████▌     | 3001/6600 [05:13<08:26,  7.10it/s]

{'loss': 2.9211, 'grad_norm': 4.198641300201416, 'learning_rate': 2.7272727272727273e-05, 'epoch': 4.55}


 53%|█████▎    | 3502/6600 [06:06<06:43,  7.67it/s]

{'loss': 2.871, 'grad_norm': 4.077757835388184, 'learning_rate': 2.3484848484848487e-05, 'epoch': 5.3}


 61%|██████    | 4002/6600 [06:58<05:26,  7.96it/s]

{'loss': 2.7817, 'grad_norm': 4.315252780914307, 'learning_rate': 1.9696969696969697e-05, 'epoch': 6.06}


 68%|██████▊   | 4502/6600 [07:49<04:20,  8.05it/s]

{'loss': 2.716, 'grad_norm': 4.358773231506348, 'learning_rate': 1.590909090909091e-05, 'epoch': 6.82}


 76%|███████▌  | 5002/6600 [08:41<03:20,  7.96it/s]

{'loss': 2.6559, 'grad_norm': 4.383431434631348, 'learning_rate': 1.2121212121212122e-05, 'epoch': 7.58}


 83%|████████▎ | 5502/6600 [09:32<02:16,  8.05it/s]

{'loss': 2.6254, 'grad_norm': 4.950768947601318, 'learning_rate': 8.333333333333334e-06, 'epoch': 8.33}


 91%|█████████ | 6001/6600 [10:24<01:20,  7.45it/s]

{'loss': 2.5913, 'grad_norm': 4.435514450073242, 'learning_rate': 4.5454545454545455e-06, 'epoch': 9.09}


 99%|█████████▊| 6502/6600 [11:15<00:12,  7.95it/s]

{'loss': 2.5575, 'grad_norm': 4.987143039703369, 'learning_rate': 7.575757575757576e-07, 'epoch': 9.85}


100%|██████████| 6600/6600 [11:30<00:00,  9.56it/s]

{'train_runtime': 690.6982, 'train_samples_per_second': 38.222, 'train_steps_per_second': 9.556, 'train_loss': 2.937646738688151, 'epoch': 10.0}





TrainOutput(global_step=6600, training_loss=2.937646738688151, metrics={'train_runtime': 690.6982, 'train_samples_per_second': 38.222, 'train_steps_per_second': 9.556, 'total_flos': 1724527411200000.0, 'train_loss': 2.937646738688151, 'epoch': 10.0})

In [19]:
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)
    
set_seed(42)

response_model = generator("Before we proceed any further, hear me speak,", max_length=200, num_return_sequences=1)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [20]:
loaded_model = GPT2LMHeadModel.from_pretrained(save_path)
loaded_tokenizer = GPT2Tokenizer.from_pretrained(save_path)

# Now you can use the loaded model and tokenizer as before
loaded_generator = pipeline('text-generation', model=loaded_model, tokenizer=loaded_tokenizer)

response_model = loaded_generator("Before we proceed any further, hear me speak,", max_length=100, num_return_sequences=1)
print(response_model[0]["generated_text"])

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Before we proceed any further, hear me speak,:,?",, the,,,, a very simply,,, a, a. The first, We,, that, that, I, This, I, I, I, It, That the Director, I I, I, The Director, The Director, That, We, It,
