In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from datasets import load_dataset

In [None]:
# Pick the compute device once. CUDA is only initialised when it is actually
# available: calling torch.cuda.init() on a machine without a usable GPU
# raises, which would make the CPU fallback below unreachable.
cuda_available = torch.cuda.is_available()
print(cuda_available)
if cuda_available:
    torch.cuda.init()
device = torch.device("cuda" if cuda_available else "cpu")
print(device)

In [None]:
# Model selection: `gpt_model` is the index into `gpt_models` that the
# tokenizer and model cells use to pick a checkpoint name.
gpt_models = ['distilgpt2', 'sshleifer/tiny-gpt2']
gpt_model = 0

# Sequence length every example is padded/truncated to during tokenisation.
tokenizer_max_length = 1024

# Set to False to skip the fine-tuning cell.
needs_training = True

# Corpus locations: the full text plus the train/test files derived from it.
data_file_path, data_train_file_path, data_test_file_path = (
    'corpus/grimms/grimms.txt',
    'corpus/grimms/grimms_train_data.txt',
    'corpus/grimms/grimms_test_data.txt',
)

In [None]:
# Read the corpus line by line. Every line is forced to end with a newline:
# the lines are shuffled by train_test_split and written back with
# writelines(), which adds no separators, so a final line lacking '\n' would
# otherwise be fused with whichever line happens to follow it.
with open(data_file_path, 'r', encoding='utf-8') as file:
    data = [line if line.endswith('\n') else line + '\n' for line in file]

# 80/20 split; random_state is fixed so the split is reproducible.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=0)

# Persist both splits so they can be loaded as 'text' datasets later.
with open(data_train_file_path, 'w', encoding='utf-8') as file:
    file.writelines(train_data)
with open(data_test_file_path, 'w', encoding='utf-8') as file:
    file.writelines(test_data)

In [None]:
from torch.utils.tensorboard import SummaryWriter

# TensorBoard writer used by custom_logging_callback below. It writes to
# ./logs, the same directory that is passed as logging_dir to
# TrainingArguments later in this notebook.
writer = SummaryWriter(log_dir="./logs")

def custom_logging_callback(log_args):
    """Write the training loss from one log record to TensorBoard.

    Args:
        log_args: Mapping describing one logging event. The "loss" entry is
            written as the scalar "Loss"; "global_step", when present, is
            used as the step index.

    Records without a "loss" entry are skipped instead of raising KeyError,
    so the function can be fed log records that carry other metrics only.

    NOTE(review): this is a plain function and its only use in the Trainer
    cell is commented out; Trainer's `callbacks` argument presumably expects
    TrainerCallback objects -- confirm before enabling it there.
    """
    loss = log_args.get("loss")
    if loss is None:
        # Nothing to plot for this record.
        return

    # Log loss to TensorBoard
    writer.add_scalar("Loss", loss, log_args.get("global_step"))

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained(gpt_models[gpt_model])

# Reuse end-of-sequence as the padding token so that padding='max_length'
# below has a token to pad with.
tokenizer.pad_token = tokenizer.eos_token

# Each line of the train/test files becomes one example.
dataset = load_dataset('text', data_files={'train': data_train_file_path, 'test': data_test_file_path})


def tokenize_batch(examples):
    """Tokenize a batch of lines, padding/truncating to tokenizer_max_length.

    No return_tensors is requested: datasets.map stores plain lists anyway.
    Fixed-length padding is kept because the Trainer cell passes no collator
    that could pad dynamically.
    """
    return tokenizer(
        examples['text'],
        padding='max_length',
        truncation=True,
        max_length=tokenizer_max_length,
    )


tokenized_datasets = dataset.map(tokenize_batch, batched=True)

# Drop blank lines and lines with fewer than two real tokens. After the
# causal-LM shift such rows have no target left, and once padding is masked
# out of the labels a batch made only of them would have nothing to average.
tokenized_datasets = tokenized_datasets.filter(
    lambda example: bool(example['text'].strip()) and sum(example['attention_mask']) >= 2
)


# Generate labels from tokenized data
def generate_labels(examples):
    """Build causal-LM labels: a copy of input_ids with padding set to -100.

    The attention mask (not the token id) decides what counts as padding,
    because the pad token is the same id as end-of-sequence. Positions
    labelled -100 are ignored by the loss, so training and eval_loss reflect
    real text only instead of being dominated by padding.
    """
    return {
        "labels": [
            [token if keep else -100 for token, keep in zip(ids, mask)]
            for ids, mask in zip(examples["input_ids"], examples["attention_mask"])
        ]
    }


tokenized_datasets = tokenized_datasets.map(generate_labels, batched=True)

In [None]:
# Load the pretrained weights for the checkpoint selected by `gpt_model`.
model = GPT2LMHeadModel.from_pretrained(gpt_models[gpt_model])

# Batch scale factor derived from the model index: 1 for index 0, 4 for index 1.
batch_multiplier = (gpt_model + 1) ** 2
per_device_batch = 4 * batch_multiplier

training_args = TrainingArguments(
    # --- run identity / outputs ---
    run_name="run01",
    output_dir="./results",
    overwrite_output_dir=True,
    # --- schedule and batch sizes ---
    num_train_epochs=15,
    warmup_steps=50,
    per_device_train_batch_size=per_device_batch,
    per_device_eval_batch_size=per_device_batch,
    gradient_accumulation_steps=8 * batch_multiplier,
    # --- evaluation and checkpointing, both once per epoch ---
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=10,
    save_safetensors=True,
    # Reload the checkpoint with the best eval_loss once training finishes.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    # --- logging ---
    logging_dir="./logs",
    logging_steps=100,
    log_level="info",
    report_to="tensorboard",
)

In [None]:
# Fine-tune the model (skipped when `needs_training` is False).
# Cached GPU memory is released first, regardless of whether training runs.
torch.cuda.empty_cache()

if needs_training:
    train_split = tokenized_datasets["train"]
    eval_split = tokenized_datasets["test"]

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_split,
        eval_dataset=eval_split,
    )
    trainer.train()

In [None]:
# Run this to load the trained model and tokenizer if not autoloaded after training completion
torch.cuda.empty_cache()
model = GPT2LMHeadModel.from_pretrained("./results/checkpoint-237")  # Path to your chosen checkpoint
# Use the device chosen at the top of the notebook rather than a hard-coded
# 'cuda', so this cell also works when only a CPU is available.
model = model.to(device)
tokenizer = GPT2Tokenizer.from_pretrained(gpt_models[gpt_model])  # Use the same tokenizer as during training

In [None]:
# Input text for generating continuation
prompts = ["Once upon a time, a clever orphan stole a magic dagger from an evil wizard. The child ran into a haunted forest and ",
           "A very strange thing happened one day as the baker woke up and began her day. She saw outside that "
]

# Sampling temperature passed to generate() for every prompt.
temperature = 0.85

# Collected (prompt, model config, generated text) tuples, one per prompt.
prompt_params_answers = []

for prompt in prompts:
    # Tokenize the prompt and place it on whatever device the model lives on.
    # Following model.device (instead of a hard-coded 'cuda') keeps the inputs
    # and the model together on CPU-only machines as well.
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    input_ids = input_ids.to(model.device)

    # Sample one continuation. min_length/max_length are passed as-is to
    # generate(); pad_token_id is set explicitly to the end-of-sequence id.
    output = model.generate(
        input_ids,
        min_length=128,
        max_length=256,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
        temperature=temperature,
        do_sample=True,
        no_repeat_ngram_size=10,  # Adjust the value as needed
    )

    # Decode the single returned sequence back to text.
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    # Store the prompt, the model configuration, and the generated answer.
    prompt_params_answers.append((prompt, model.config, generated_text))

# Results file. It is opened in append mode, so output from earlier runs is kept.
file_path = "prompt_answer_params.txt"

# Layout of one saved record: prompt, generated text, then the stored parameters.
entry_template = (
    "Prompt:\n{prompt}\n\n"
    "Generated Answer:\n{answer}\n\n"
    "Model Parameters:\n{params}\n\n"
)

with open(file_path, "a", encoding="utf-8") as out_file:
    for prompt, params, answer in prompt_params_answers:
        out_file.write(entry_template.format(prompt=prompt, answer=answer, params=params))
        # Echo each generated text to the cell output, followed by a blank line.
        print(answer, end="\n\n")

print(f"Results saved to {file_path}")