In [None]:
!pip install transformers

In [None]:
# Import necessary libraries
import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel

# Seed torch's global RNG so repeated runs start from the same state
torch.manual_seed(42)

# Tokenizer for 'gpt2-medium' with explicit special tokens.
# NOTE(review): bos/eos/pad are all empty strings here. That looks like
# markup-stripped placeholders (e.g. '<|...|>' style tokens) - confirm
# against the tokenizer that was used to produce the checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained(
    'gpt2-medium',
    bos_token='',
    eos_token='',
    pad_token='',
)

# Read the review/summary table and pull out the two text columns
df = pd.read_csv('/kaggle/input/cleaned-amazon-reviews/your_file.csv')
descriptions, summaries = df['Text'], df['Summary']

# Marker placed between a review and its summary; registered as an extra
# special token on the tokenizer
sep_token = '<|summary|>'
tokenizer.add_special_tokens({'additional_special_tokens': [sep_token]})

In [None]:
# Define your dataset class for review-summary pairs
class ReviewSummaryDataset(Dataset):
    """Tokenized "review <sep> summary" pairs for causal-LM fine-tuning.

    Every pair is joined as ``review + ' ' + sep_token + ' ' + summary``,
    tokenized with truncation and padding to ``max_length``, and kept in
    memory as three tensors: input ids, attention mask and labels.

    Labels equal the input ids except at positions whose attention mask is
    0 (padding); those are set to ``IGNORE_INDEX`` so the loss is not
    computed on padding.

    NOTE(review): truncation is applied to the joined text, so a review
    long enough to fill ``max_length`` leaves no room for the separator
    and summary - confirm this is acceptable for the data at hand.

    Args:
        reviews: Iterable of review strings.
        summaries: Iterable of summary strings, paired positionally with
            ``reviews`` (iteration stops at the shorter of the two).
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            max_length=..., padding="max_length")`` that returns a mapping
            with ``'input_ids'`` and ``'attention_mask'`` entries.
        max_length: Length every example is truncated/padded to.
        sep_token: Text placed between review and summary. Should match
            the separator registered on the tokenizer.
    """

    # Label value skipped by the cross-entropy loss (PyTorch's default
    # ``ignore_index``).
    IGNORE_INDEX = -100

    def __init__(self, reviews, summaries, tokenizer, max_length, sep_token='<|summary|>'):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        for review, summary in zip(reviews, summaries):
            combined_text = review + ' ' + sep_token + ' ' + summary
            encodings_dict = tokenizer(combined_text, truncation=True, max_length=max_length, padding="max_length")
            input_ids = torch.tensor(encodings_dict['input_ids'])
            attn_mask = torch.tensor(encodings_dict['attention_mask'])

            # Independent copy of the ids; padded positions are masked out
            # so they do not contribute to the training loss.
            labels = input_ids.clone()
            labels[attn_mask == 0] = self.IGNORE_INDEX

            self.input_ids.append(input_ids)
            self.attn_masks.append(attn_mask)
            self.labels.append(labels)

    def __len__(self):
        """Return the number of encoded pairs."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, labels)`` for example ``idx``."""
        return self.input_ids[idx], self.attn_masks[idx], self.labels[idx]

# Encode every review/summary pair up front.
# 1024 is GPT-2's default context length; assumes the checkpoint loaded
# below keeps that limit - TODO confirm against its config.
max_length = 1024  # Adjust as needed
dataset = ReviewSummaryDataset(descriptions, summaries, tokenizer, max_length=max_length)

# 90% / 10% train/validation split.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
# A dedicated seeded generator makes the partition identical every time this
# cell runs, regardless of how much global RNG state earlier cells consumed.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(dataset, [train_size, val_size], generator=split_generator)

# Despite its name, this is the checkpoint the model is loaded FROM here.
model_save_path = "/kaggle/input/gpt2-pt3/results/checkpoint-5000"
# Load the model from the saved checkpoint
model = GPT2LMHeadModel.from_pretrained(model_save_path)
# Match the embedding matrix to the tokenizer's current vocabulary size,
# which includes the special tokens added to it.
model.resize_token_embeddings(len(tokenizer))

In [None]:
# Hyperparameters and bookkeeping options for the Trainer run
training_args = TrainingArguments(
    # Output locations
    output_dir='./results',
    logging_dir='./logs',
    # Logging / checkpoint cadence (in optimizer steps) and reporting target
    logging_steps=100,
    save_steps=5000,
    report_to='none',
    # Schedule and regularisation
    num_train_epochs=3,  # Adjust number of epochs as needed
    warmup_steps=10,
    weight_decay=0.05,
    # Per-device batch sizes
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
)

# Define your data collator function
def my_data_collator(data):
    """Stack a list of dataset examples into one batch dictionary.

    Args:
        data: Sequence of examples, each indexable so that position 0 holds
            the input ids tensor, position 1 the attention mask tensor and
            position 2 the labels tensor. Tensors in the same position must
            be stackable with ``torch.stack``.

    Returns:
        Dict with keys ``'input_ids'``, ``'attention_mask'`` and
        ``'labels'``, each a tensor with a new leading batch dimension.
    """
    # Position inside each example -> name of the batch entry it feeds
    field_names = ('input_ids', 'attention_mask', 'labels')
    return {
        name: torch.stack([example[position] for example in data])
        for position, name in enumerate(field_names)
    }

# Wire the model, hyperparameters, data splits and batching function together
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=my_data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Run the fine-tuning loop
trainer.train()

In [None]:
# Persist the fine-tuned weights and the tokenizer files side by side in
# one directory
model_save_path = "/kaggle/working/finetuned-gpt2-summary"
model.save_pretrained(save_directory=model_save_path)
tokenizer.save_pretrained(save_directory=model_save_path)