In [1]:
import torch
print(torch.__version__)
print(torch.cuda.is_available())

2.3.1
False


In [2]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import Dataset

processed_output_file = "../data/processed/processed_personas_text.txt"

with open(processed_output_file, 'r', encoding='utf-8') as f:
    processed_text = f.read()

max_length = 512
text_chunks = [processed_text[i:i+max_length] for i in range(0, len(processed_text), max_length)]

dataset = Dataset.from_dict({"text": text_chunks})

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Set the pad_token to eos_token
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    encoding = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=max_length)
    encoding["labels"] = encoding["input_ids"].copy()
    return encoding

tokenized_dataset = dataset.map(tokenize_function, batched=True)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=10_000,
    save_total_limit=2
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

trainer.train()

Map:   0%|          | 0/1352 [00:00<?, ? examples/s]

Step,Training Loss
500,0.6011
1000,0.4172
1500,0.3782
2000,0.3594


TrainOutput(global_step=2028, training_loss=0.43732315360676843, metrics={'train_runtime': 8962.9835, 'train_samples_per_second': 0.453, 'train_steps_per_second': 0.226, 'total_flos': 1059800481792000.0, 'train_loss': 0.43732315360676843, 'epoch': 3.0})

In [None]:
model.save_pretrained("../models/fine-tuned-gpt2")
tokenizer.save_pretrained("../models/fine-tuned-gpt2")