# Load the model and tokenizer

In [66]:
from itertools import chain

import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline

# Checkpoint identifier handed to both from_pretrained calls below.
model_name = "gpt2"
# Tokenizer and LM-head model are loaded from the same checkpoint name.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)



# Test the base model on a sample prompt

In [67]:
# Build a text-generation pipeline around the already-loaded model/tokenizer.
# Use the first CUDA device when one is available and fall back to CPU
# (pipeline device index -1) otherwise, so the cell also runs without a GPU.
pipe = pipeline(
    'text-generation',
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

# Sample prompt that already contains two [IMAGE: ...] tags.
prompt = "Once upon a time in a mystical forest, there lived a [IMAGE: magical creature with wings]. One day, the creature found a [IMAGE: mysterious object] that changed everything."
# Sample up to 200 new tokens. The pad id is passed explicitly (EOS) so that
# generate() does not have to pick it implicitly and warn about it.
generated_text = pipe(
    prompt,
    max_new_tokens=200,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,
)
print(generated_text[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Once upon a time in a mystical forest, there lived a [IMAGE: magical creature with wings]. One day, the creature found a [IMAGE: mysterious object] that changed everything. This had just been there for a long time, he explained. But now he had discovered that something had changed it as well. The creature wanted to take the message...And now the message he sent to his son was coming...but even through its darkness, the message remained as it was.


In a distant world, a girl named Kondu was in the forest, and she had seen something strange. This was what she thought. When she came across this magical creature, she looked up and it turned from Kondu to its original shape, but it was no more. The creature did not like her. It seemed to have moved on to a new, faraway world that it had always wanted and had only been searching for for the very last. It appeared in its new position, and Kondu was terrified, and the creature took what became of her. In a way, it looked like the creature had

# Load Dataset

In [68]:
#from datasets import load_dataset

# Load the dataset
#dataset = load_dataset("Dwaraka/Training_Dataset_of_Project_Gutebberg_Gothic_Fiction")

from datasets import load_dataset, DatasetDict
# Load the already-processed dataset (the [IMAGE: ...] tags have been added)
# from a local directory — presumably the one written by the commented-out
# save_to_disk("./modified_datasets1") call further down; verify.
dataset=DatasetDict.load_from_disk("./modified_datasets1")


In [69]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 92100
    })
})

## Clean the dataset

In [70]:
import re

# Substrings whose presence causes an entry to be dropped by filter_strings.
strings_to_find = [
    "Act.", "*", "CHAPTER"
]


def filter_strings(strings, markers=None):
    """Return the entries of ``strings`` that pass all cleaning checks.

    An entry is dropped when it:
      * is empty or contains only whitespace,
      * contains any of the ``markers`` substrings (case-sensitive), or
      * satisfies ``str.isupper()`` (all of its cased characters are uppercase).

    Args:
        strings: Iterable of ``str`` entries to filter.
        markers: Optional iterable of substrings to reject. When omitted, the
            module-level ``strings_to_find`` list is used, so that list is the
            single place where the rejected substrings are configured.

    Returns:
        A new list with the surviving entries in their original order.
    """
    if markers is None:
        markers = strings_to_find
    elif isinstance(markers, str):
        # A bare string would otherwise be iterated character by character.
        markers = (markers,)
    markers = tuple(markers)

    filtered_strings = []
    for string in strings:
        # Empty / whitespace-only entry
        if not string.strip():
            continue
        # Entry contains one of the rejected substrings
        if any(marker in string for marker in markers):
            continue
        # Entry is written entirely in uppercase
        if string.isupper():
            continue
        filtered_strings.append(string)
    return filtered_strings

In [71]:
from datasets import Dataset
# Take the 'text' column of the 'train' split
text = dataset['train']['text']

# Skip the first 62 entries (presumably front matter at the start of the
# corpus — TODO confirm why 62) by slicing the column
remaining_data = text[62:]

# Apply the cleaning rules implemented in filter_strings
remaining_data = filter_strings(remaining_data)

# Create a new single-split DatasetDict from the remaining entries
remaining_dataset = DatasetDict({'train':Dataset.from_dict({'text': remaining_data}) })

# Rebind `dataset` to the cleaned data.
# NOTE(review): because `dataset` is overwritten here, re-running this cell
# slices off another 62 entries; reload the dataset before re-running.
dataset = remaining_dataset

In [72]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 79252
    })
})

# Generate [IMAGE:{desc}] tags in the dataset (optional: only needed for the unprocessed dataset; skip when loading the already-processed one)

## Initialize the pipeline for description generation

In [73]:
# Initialize text generation pipeline for description generation.
# Both model and tokenizer are given as the checkpoint name (model_name), not
# as the `model` / `tokenizer` objects created earlier, so the pipeline is
# built from the checkpoint independently of those objects.
description_generator = pipeline("text-generation", model=model_name, tokenizer=model_name)

### Function to generate image descriptions based on text context

In [74]:
def generate_description(text):
    """Generate a short image description for a passage of text.

    A fixed instruction is prepended to ``text`` and the result is sent to
    ``description_generator``, which is asked for at most 10 new tokens.

    Args:
        text: Context passage (``str``) the description should be based on.

    Returns:
        The generated continuation with surrounding whitespace stripped.
        May be an empty string if nothing but whitespace was generated.
    """
    prompt = "Describe the following scene in detail: " + text
    generated = description_generator(
        prompt,
        max_new_tokens=10,
        num_return_sequences=1,
        # Ask the pipeline for the continuation only. Stripping the prompt
        # afterwards with str.replace is fragile: it leaves the prompt in
        # place whenever the echoed text is not character-identical to it,
        # and it also deletes any later repetition of the prompt.
        return_full_text=False,
        # Take the pad id from the pipeline's own tokenizer so this function
        # does not depend on a separate global tokenizer object.
        pad_token_id=description_generator.tokenizer.eos_token_id,
    )
    return generated[0]['generated_text'].strip()

### Function to randomly insert image tags with descriptions into text

In [75]:
import random

def insert_image_tags(text, interval_range=(1, 100)):
    """Scatter ``[IMAGE:...]`` tags through ``text`` at random word offsets.

    The text is split on whitespace. Starting from the beginning, the cursor
    repeatedly jumps forward by ``random.randint(*interval_range)`` words; as
    long as it still lands inside the word list, a tag is inserted at that
    position. The tag's description comes from ``generate_description`` and
    is built from up to 50 words on either side of the cursor.

    Args:
        text: Passage to annotate.
        interval_range: ``(low, high)`` bounds, inclusive, for each jump.

    Returns:
        The words (with inserted tags) joined back together by single spaces.
    """
    tokens = text.split()
    cursor = 0
    while cursor < len(tokens):
        cursor += random.randint(*interval_range)
        if cursor >= len(tokens):
            # The jump went past the end of the text: nothing more to insert.
            break
        window_start = max(0, cursor - 50)
        surrounding = " ".join(tokens[window_start:cursor + 50])
        tag = f"[IMAGE:{generate_description(surrounding)}]"
        tokens.insert(cursor, tag)
        # Step over the tag that was just inserted.
        cursor += 1
    return " ".join(tokens)

### Apply the function to the dataset

In [76]:
def add_image_tags(examples):
    """Batched callback that runs ``insert_image_tags`` over the 'text' column.

    Args:
        examples: Mapping with a ``'text'`` entry holding a list of strings.

    Returns:
        The same ``examples`` object, with its ``'text'`` entry replaced by
        the tagged strings.
    """
    tagged = []
    for passage in examples['text']:
        tagged.append(insert_image_tags(passage))
    examples['text'] = tagged
    return examples

In [77]:
# Give generation an explicit padding id. The attribute read during generation
# is `pad_token_id` (singular). The tokenizer has no pad token assigned at this
# point in the notebook, so the EOS id is used instead.
model.generation_config.pad_token_id = tokenizer.eos_token_id

In [78]:
# Modify the dataset to include image tags
#modified_datasets = dataset.map(add_image_tags, batched=True)

In [79]:
#modified_datasets.save_to_disk("./modified_datasets1")

In [80]:
# The tag-generation map call is commented out, so the cleaned `dataset`
# is used directly as the tagged dataset.
modified_datasets=dataset

# Tokenize the dataset (split every sentence into tokens)

In [81]:
def tokenize_function(examples):
    """Batched callback: run the global ``tokenizer`` over the 'text' column.

    Args:
        examples: Mapping with a ``'text'`` entry holding a list of strings.

    Returns:
        Whatever ``tokenizer`` returns for that list when called with
        ``truncation=True``.
    """
    batch_texts = examples["text"]
    encoded = tokenizer(batch_texts, truncation=True)
    return encoded

# Tokenize in batches; the raw 'text' column is removed so that only the
# tokenizer's outputs remain as features.
tokenized_datasets = modified_datasets.map(tokenize_function, batched=True, remove_columns=["text"])

Map:   0%|          | 0/79252 [00:00<?, ? examples/s]

In [82]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 79252
    })
})

In [83]:
# Number of tokens in every training row produced by group_texts.
block_size = 128

def group_texts(examples):
    """Concatenate a batch of tokenized rows and re-cut it into equal blocks.

    For every key in ``examples`` the per-row lists are joined into one long
    list, which is then split into consecutive chunks of ``block_size`` items.
    The usable length is computed from the first key and applied to all keys;
    any tail shorter than ``block_size`` is discarded.

    Args:
        examples: Mapping from feature name to a list of per-row lists
            (e.g. ``input_ids`` and ``attention_mask``).

    Returns:
        A dict with the same keys whose values are lists of blocks, each
        exactly ``block_size`` long. An empty mapping yields an empty dict.
    """
    keys = list(examples.keys())
    if not keys:
        # Nothing to group; avoids indexing into an empty key list below.
        return {}

    # chain.from_iterable flattens in linear time, whereas sum(lists, [])
    # re-copies the growing accumulator for every row.
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in keys}

    # Drop the last block if it is smaller than block_size.
    total_length = len(concatenated_examples[keys[0]])
    total_length = (total_length // block_size) * block_size

    # Split into chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    return result

# Re-chunk the tokenized dataset into rows of block_size tokens, batch by batch.
lm_datasets = tokenized_datasets.map(group_texts, batched=True)

Map:   0%|          | 0/79252 [00:00<?, ? examples/s]

In [84]:
lm_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 10252
    })
})

In [87]:
from transformers import GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Fine-tune GPT-2 on the fixed-size token blocks in lm_datasets['train'].
# Tokenizer and model are reloaded from the base checkpoint here, which
# replaces the `tokenizer` / `model` objects created by earlier cells.
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Reuse the EOS token as the padding token (the alternative of adding a
# dedicated '[PAD]' token is left commented out below).
tokenizer.pad_token = tokenizer.eos_token
#tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# Use DataCollatorForLanguageModeling to dynamically pad the inputs received by the model
# (mlm=False turns off masked-language-model masking).
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# Training configuration: checkpoints go to ./results, logs to ./logs.
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=0.05,  # a fraction of one epoch (257 steps in the recorded output)
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    save_steps=10,
    save_total_limit=2,
    prediction_loss_only=True,
    logging_dir='./logs',
)

# No eval dataset is passed; the dataset shown above only has a 'train' split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets['train'],
    # eval_dataset=lm_datasets["validation"],
    data_collator=data_collator,
)

# Run training; the returned TrainOutput is displayed as the cell result.
trainer.train()

Step,Training Loss


TrainOutput(global_step=257, training_loss=4.19561791327213, metrics={'train_runtime': 60.4588, 'train_samples_per_second': 8.479, 'train_steps_per_second': 4.251, 'total_flos': 33576026112000.0, 'train_loss': 4.19561791327213, 'epoch': 0.050136558720249705})

# Save the model

In [88]:
# Write the model and the tokenizer files into the same directory so both
# can later be loaded from that one path.
model.save_pretrained("./gpt2-finetuned-fiction")
tokenizer.save_pretrained("./gpt2-finetuned-fiction")

('./gpt2-finetuned-fiction\\tokenizer_config.json',
 './gpt2-finetuned-fiction\\special_tokens_map.json',
 './gpt2-finetuned-fiction\\vocab.json',
 './gpt2-finetuned-fiction\\merges.txt',
 './gpt2-finetuned-fiction\\added_tokens.json')

# Test the newly saved model

In [89]:
# Load tokenizer and model back from the directory written by the save cell,
# replacing the in-memory objects, so the test below runs on the saved files.
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2-finetuned-fiction")
model = GPT2LMHeadModel.from_pretrained("./gpt2-finetuned-fiction")


In [90]:
# Build a text-generation pipeline around the reloaded fine-tuned model.
# Use the first CUDA device when one is available and fall back to CPU
# (pipeline device index -1) otherwise, so the cell also runs without a GPU.
pipe = pipeline(
    'text-generation',
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

# Same prompt as the base-model test, so the two outputs can be compared.
prompt = "Once upon a time in a mystical forest, there lived a [IMAGE: magical creature with wings]. One day, the creature found a [IMAGE: mysterious object] that changed everything."
# Sample up to 200 new tokens, with the pad id passed explicitly (EOS).
generated_text = pipe(
    prompt,
    max_new_tokens=200,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,
)
print(generated_text[0]['generated_text'])

Once upon a time in a mystical forest, there lived a [IMAGE: magical creature with wings]. One day, the creature found a [IMAGE: mysterious object] that changed everything. The strange creature, awakened by [IMAGE: magic of the past, present, and future, but it could not] its power, and proceeded to do all we might, in pursuit not of revenge butof some new and greater purpose, thegreat, true, oratorical ambition. One night, as [IMAGE: was being summoned, all thoughts became wild, and, looking towards him, became] aware of the disturbance, which was going on on the forest and in its surroundings, the thing drew nearer. A shadow from its midst fell on the ground, and, being startled, it went from the shadows, and, being heard and seen by the [IMAGE:the shadow and being seen, the shadow stopped its speech. It looked] inhabitants of thatplace, [IMAGE:the shadow, as they recognized him, did some] proceeded towards [IMAGE:the shadow, again, it ceased its speech.] the forest, [IMAGE:the shado