## Importing Modules

In [1]:
import os
import pathlib
import numpy as np
import pandas as pd
import nltk

import torch
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline, GPT2Config, TextDataset
from tqdm.auto import tqdm
import random
import datetime
import time
import statistics
from nltk.translate.bleu_score import sentence_bleu
from transformers import TrainingArguments, Trainer, set_seed
from datasets import load_dataset

if torch.cuda.is_available():
    print("GPU is available!")
else:
    print("GPU is not available.")

  from .autonotebook import tqdm as notebook_tqdm



GPU is available!


In [2]:
MAIN_PATH = str(pathlib.Path().resolve())
DATASET_PATH = MAIN_PATH + '\\datasets'
MODEL_PATH = MAIN_PATH + '\\models'

In [3]:
models = os.listdir(MODEL_PATH)
models

['bert-base-cased',
 'bert-base-multilingual-cased',
 'bert-base-uncased',
 'bert-large-cased',
 'bert-large-uncased',
 'flan-t5-base',
 'flan-t5-large',
 'flan-t5-small',
 'gpt2',
 'gpt2-large',
 'gpt2-medium',
 'tuned_text_gen']

In [4]:
model_path = MODEL_PATH + '\\' + models[8]
model_path

'D:\\Python\\LLM_Environment\\models\\gpt2'

In [5]:
model = GPT2LMHeadModel.from_pretrained(model_path)

In [6]:
torch.cuda.empty_cache()

In [7]:
tokenizer = GPT2Tokenizer.from_pretrained(model_path)

## Import Dataset

In [8]:
filenames = os.listdir(DATASET_PATH)
filenames

['cached_lm_GPT2Tokenizer_128_Shakespeare_Dataset.txt',
 'Html.csv',
 'Recipes.csv',
 'Recipes_1000.csv',
 'Shakespeare_Dataset.txt',
 'Taylor_Swift_Lyrics.csv']

In [9]:
file_path = DATASET_PATH + '\\' + filenames[5]
file_path

'D:\\Python\\LLM_Environment\\datasets\\Taylor_Swift_Lyrics.csv'

In [10]:
d = pd.read_csv(file_path, encoding='latin1')
text = []

for title in d['track_title'].unique():
    song = ""
    for l in d[d['track_title'] == title]['lyric']:
        song += l + ' \n '
    text.append(song + ' <|endoftext|>')

In [11]:
print(text[5])

I didn't know what I would find 
 When I went looking for a reason, I know 
 I didn't read between the lines 
 And, baby, I've got nowhere to go 
 I tried to take the road less traveled by 
 But nothing seems to work the first few times 
 Am I right? 
 So how can I ever try to be better? 
 Nobody ever lets me in 
 I can still see you, this ain't the best view 
 On the outside looking in 
 I've been a lot of lonely places 
 I've never been on the outside 
 You saw me there, but never knew 
 I would give it all up to be 
 A part of this, a part of you 
 And now it's all too late so you see 
 You could've helped if you had wanted to 
 But no one notices until it's too 
 Late to do anything 
 So how can I ever try to be better? 
 Nobody ever lets me in 
 I can still see you, this ain't the best view 
 On the outside looking in 
 I've been a lot of lonely places 
 I've never been on the outside 
 So how can I ever try to be better? 
 Nobody ever lets me in 
 I can still see you, this ain't 

In [12]:
class CustomTextDataset(Dataset):
    def __init__(self, tokenizer, text, block_size):
        self.examples = tokenizer.batch_encode_plus(
            text,
            add_special_tokens=True,
            max_length=block_size,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

    def __len__(self):
        return len(self.examples["input_ids"])

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.examples.items()}
        return item

In [13]:
tokenizer.pad_token = tokenizer.eos_token
    
tokenized_text = tokenizer.encode(text)

dataset = CustomTextDataset(tokenizer, text, block_size=128)  # Adjust block_size as per your requirements

In [14]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

In [15]:
save_path = './model'
# Define the training arguments
training_args = TrainingArguments(
    output_dir=save_path,
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    learning_rate=1e-4,
    save_steps=500,  # Save checkpoints every 500 steps
    fp16=True  # Only keep the last 2 checkpoints
)

In [16]:
# Create Trainer and fine-tune the model
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset
)

In [17]:
trainer.train()

  attn_output = torch.nn.functional.scaled_dot_product_attention(
100%|██████████| 18/18 [00:05<00:00,  3.32it/s]

{'train_runtime': 5.42, 'train_samples_per_second': 52.029, 'train_steps_per_second': 3.321, 'train_loss': 3.3757326338026257, 'epoch': 3.0}





TrainOutput(global_step=18, training_loss=3.3757326338026257, metrics={'train_runtime': 5.42, 'train_samples_per_second': 52.029, 'train_steps_per_second': 3.321, 'total_flos': 18421088256000.0, 'train_loss': 3.3757326338026257, 'epoch': 3.0})

In [18]:
def load_fine_tuned_model(model_dir):
    # Load the fine-tuned GPT-2 model and tokenizer
    model = GPT2LMHeadModel.from_pretrained(model_dir)
    tokenizer = GPT2Tokenizer.from_pretrained(model_dir)
    tokenizer.pad_token = tokenizer.eos_token
    
    return model, tokenizer

model, tokenizer = load_fine_tuned_model(save_path)

In [19]:
def generate_text(model, tokenizer, prompt_text, max_length=300):
    input_ids = tokenizer.encode(prompt_text, return_tensors="pt")
    output = model.generate(
        input_ids, 
        do_sample=True, 
        max_length=max_length, 
        top_p=0.92, 
        top_k=0
    )
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    
    return generated_text

In [20]:
prompt = "beautiful summer"
generated_text = generate_text(model, tokenizer, prompt)

print("Generated Text:")
print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Generated Text:
beautiful summer <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|>  And <|pad|> <|pad|>  In <|pad|> <|pad|>  The. I <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> -1 <|pad|>  the <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> , <|pad|> <|pad|>  and <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|>