In [1]:
import json
import pandas as pd


In [2]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, DataCollatorForLanguageModeling, Trainer, TrainingArguments
import datasets
from datasets import load_dataset

# Base checkpoint shared by the tokenizer and the language-model head
base_checkpoint = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(base_checkpoint)
model = GPT2LMHeadModel.from_pretrained(base_checkpoint)

# When the tokenizer has no padding token, register one and grow the model's
# embedding matrix so it matches the enlarged vocabulary.
needs_pad_token = tokenizer.pad_token is None
if needs_pad_token:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

# Load BookCorpus dataset
#dataset = load_dataset("bookcorpus",trust_remote_code=True)



In [3]:
# The split/tokenize step below is intentionally disabled (kept inside a string
# literal for reference): the tokenized dataset already exists on disk and is
# loaded from there in a later cell, so it does not need to be rebuilt here.
'''
reduced_dataset = dataset['train'].train_test_split(train_size=0.3)['train']

# Tokenize the reduced dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)

tokenized_datasets = reduced_dataset.map(tokenize_function, batched=True, remove_columns=['text'])
'''

# Data collator for language modeling, built around the tokenizer loaded above.
# mlm=False switches masked-language-modeling off for the batches it produces.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)


In [4]:
# Load the already-tokenized dataset from disk (presumably the output of the
# tokenization step that is now disabled earlier in this notebook).
import os

from datasets import load_from_disk

# Directory that holds the saved tokenized dataset. Set the
# BOOKCORPUS_TOKENIZED_DIR environment variable to use a different location;
# when it is unset, the original machine-specific path is used unchanged.
save_directory = os.environ.get(
    "BOOKCORPUS_TOKENIZED_DIR",
    r"C:\Users\Arnav\Desktop\bookcorpus",
)

# Read the tokenized dataset back from that directory (nothing is written here).
tokenized_datasets = load_from_disk(save_directory)




Loading dataset from disk:   0%|          | 0/29 [00:00<?, ?it/s]

In [5]:
#print(tokenized_datasets)

In [6]:
# Fraction of the tokenized dataset kept for training: 0.000135, i.e. 0.0135%
# of the rows (not 30%).
subset_fraction = 0.000135
num_samples = int(subset_fraction * len(tokenized_datasets))

# Take the first `num_samples` rows as the training subset
subset_datasets = tokenized_datasets.select(range(num_samples))

In [7]:
# Training configuration for the GPT-2 fine-tuning run.
#
# Step-based evaluation is deliberately NOT configured: the Trainer built from
# these arguments receives only a train_dataset, and Trainer raises when asked
# to evaluate without an eval_dataset. Leaving the evaluation options unset
# keeps the library default (no evaluation) and also avoids the deprecated
# `evaluation_strategy` keyword. To evaluate during training, pass an
# eval_dataset to Trainer and set the evaluation strategy/steps here.
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned-bookcorpus",
    overwrite_output_dir=True,
    num_train_epochs=2,  # Number of epochs
    per_device_train_batch_size=8,  # Adjust batch size based on your GPU memory
    save_steps=20_000,
    save_total_limit=2,  # Limit the number of saved checkpoints
    logging_steps=500,  # Log every 500 steps
    warmup_steps=500,  # Gradual learning rate warmup
    weight_decay=0.01,  # Regularization
    fp16=True,  # Mixed precision training (only for GPUs that support it)
)



In [8]:
# Collect everything the Trainer needs: the model, its training arguments,
# the batch collator and the small subset selected for training.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=subset_datasets,
    data_collator=data_collator,
)
trainer = Trainer(**trainer_kwargs)

# Training is left disabled; uncomment the next line to fine-tune the model.
#trainer.train()

In [9]:
# Reload the fine-tuned checkpoint from disk rather than training in this session.
# The disabled save calls below presumably wrote that checkpoint in an earlier run.

#model.save_pretrained('./gpt2-finetuned-bookcorpus')
#tokenizer.save_pretrained('./gpt2-finetuned-bookcorpus')


from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Both the tokenizer and the model are read from the same checkpoint directory;
# note that this rebinds the `tokenizer` and `model` names.
finetuned_dir = './gpt2-finetuned-bookcorpus'
tokenizer = GPT2Tokenizer.from_pretrained(finetuned_dir)
model = GPT2LMHeadModel.from_pretrained(finetuned_dir)

# Confirmation message once both loads have returned
print("Model and tokenizer loaded successfully.")



Model and tokenizer loaded successfully.


In [10]:
from transformers import GenerationConfig

# Default generation settings for the model.
#
# A GenerationConfig built from scratch carries no special-token ids, and
# assigning it to the model replaces the config the model was loaded with.
# The bos/eos/pad ids are therefore copied from the tokenizer so that plain
# model.generate() calls can still stop at end-of-text and pad correctly.
generation_config = GenerationConfig(
    max_length=80,  # The maximum number of tokens for generation
    min_length=50,   # Ensure at least this many tokens are generated
    do_sample=True,  # Sampling for creativity
    temperature=0.8, # Adjust temperature for creativity
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)

# Assign this GenerationConfig to the model
model.generation_config = generation_config


In [None]:
# Interactive story builder: repeatedly generate candidate continuations, let
# the user pick one, and feed the chosen text back in as the next prompt.
from transformers import pipeline


def _ask_int(low, high, default=None):
    """Read an integer between `low` and `high` (inclusive) from standard input.

    Re-asks until the reply is a whole number inside the range, so a typo can
    neither crash the cell nor select an unintended (e.g. negative) index.
    An empty reply returns `default` when a default is provided.
    """
    while True:
        reply = input().strip()
        if not reply and default is not None:
            return default
        try:
            value = int(reply)
        except ValueError:
            value = None
        if value is not None and low <= value <= high:
            return value
        print(f"Please enter a whole number between {low} and {high}.")


# Initialize the text generation pipeline
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

# Starting text
input_text = "Once upon a time a king"

# Number of candidate continuations offered per round
num_candidates = 3

# Base max_length (in tokens, prompt included) for the first round
base_max_length = 80

# Number of rounds: actually read the answer to the question being asked.
# Pressing Enter without typing anything keeps the previous fixed value of 5.
print("On a scale of 1-20 how long do you want us to generate for you?")
num_iterations = _ask_int(1, 20, default=5)

# Generate the story
for i in range(num_iterations):
    # Each round raises the length budget by 20 tokens
    current_max_length = base_max_length + (i * 20)

    outputs = generator(
        input_text,
        max_length=current_max_length,
        num_return_sequences=num_candidates,
        do_sample=True,  # Several return sequences require sampling; do not rely on other cells for it
        eos_token_id=tokenizer.eos_token_id,
        temperature=0.7,  # Lower temperature for more focused generations
        top_k=50,  # Restrict to top 50 words
        top_p=0.9,  # Nucleus sampling
        no_repeat_ngram_size=2,  # Avoid repeating phrases
        pad_token_id=tokenizer.pad_token_id,
    )

    # Show every candidate, each followed by blank lines
    for candidate in outputs:
        print(candidate['generated_text'])
        print("\n")

    print("Which one do you like the most 0, 1 or 2?")
    print("\n")
    choice = _ask_int(0, len(outputs) - 1)

    # The selected text becomes the prompt for the next round
    input_text = outputs[choice]['generated_text']

print("The final story is:")
print("\n")
print(input_text)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


On a scale of 1-20 how long do you want us to generate for you?
Once upon a time a king had been murdered, the two men were still alive. '' i said. `` i don't believe it, '' '' jordan replied.''' `` nope, you're not dead. your body is still in the river, i thought, and you can still be alive in your own skin.. lorren, it's time for you


Once upon a time a king would be king, the first thing a man would do was to marry a woman. '' '' `` he said. ``... '' he added, shaking his head.'''.. - '''', `` you, '' i said, turning a. julian. " ''? '', i 'd say, and i know what i am saying.


Once upon a time a king could be crowned, i knew he would have the right to be king. '' '''s voice fell silent.''', '' he could call me 'a king, but i would never be the king of the city. `` i 'd never, `` she said. i didn't know. he 'll call my name, she've called my


Which one do you like the most 0, 1 or 2?


