In [1]:
import re
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW
import torch
from torch.utils.data import DataLoader

In [None]:
# Step 1: Load and preprocess the dataset
# Fetch the "train" split of the "pubmed" dataset. trust_remote_code=True is
# passed through to the loader unchanged.
dataset = load_dataset(
    path="pubmed",
    split="train",
    trust_remote_code=True,
)

def clean_text(text):
    """Strip citation markers and digits from ``text`` and normalise whitespace.

    Args:
        text: The raw string to clean.

    Returns:
        The string with bracketed numeric citations (e.g. ``[12]``) and all
        remaining digit runs removed, every whitespace run collapsed to a
        single space, and leading/trailing whitespace stripped.
    """
    text = re.sub(r'\[\d+\]', '', text)  # Remove citations
    text = re.sub(r'\d+', '', text)  # Remove numbers
    # Collapse whitespace last: the two removals above can leave adjacent
    # spaces behind (e.g. "a 12 b" -> "a  b"), which must be merged afterwards.
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    return text.strip()

# Clean and preprocess the dataset
# NOTE(review): assumes every record exposes a "text" column — confirm against
# the schema of the loaded dataset.
dataset = dataset.map(lambda x: {"text": clean_text(x["text"])})

# Step 2: Initialize Tokenizer and Model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# The pretrained GPT-2 tokenizer defines no padding token, so any call with
# padding enabled raises. Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained('gpt2')
# Keep the model config in sync with the tokenizer's padding token.
model.config.pad_token_id = tokenizer.pad_token_id

# Tokenize dataset for training
def tokenize_function(examples):
    """Encode the ``'text'`` entry of ``examples`` with the module-level tokenizer.

    Every sequence is truncated to at most 512 tokens and padded up to that
    same length, so all encoded examples share one fixed size.

    Args:
        examples: Mapping that provides the raw text under the ``'text'`` key.

    Returns:
        The tokenizer's output for the given text.
    """
    raw_text = examples['text']
    encoded = tokenizer(
        raw_text,
        truncation=True,
        max_length=512,
        padding='max_length',
    )
    return encoded

tokenized_dataset = dataset.map(tokenize_function, batched=True)
# Expose only the model inputs, as torch tensors. Without this the DataLoader's
# default collate receives plain Python lists and produces a list of
# per-position tensors, which has no `.to(device)`.
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

# Convert the tokenized dataset to a PyTorch DataLoader
train_loader = DataLoader(tokenized_dataset, batch_size=4, shuffle=True)

# Step 3: Fine-tune the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# torch.optim.AdamW replaces the deprecated transformers.AdamW. weight_decay is
# pinned to 0.0 because torch's default (0.01) differs from the old optimizer's.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.0)

log_every = 100  # Print the loss once per this many steps to keep output readable

model.train()
for epoch in range(3):  # Training for 3 epochs
    for step, batch in enumerate(train_loader):
        inputs = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # The model shifts labels by one position internally, so position 0 is
        # never a target. A batch with no real token past position 0 has no
        # targets at all and would yield a NaN loss; skip it.
        if not attention_mask[:, 1:].any():
            continue

        # Causal LM labels are the inputs themselves, but padded positions are
        # set to -100 so the loss ignores them instead of learning to emit padding.
        labels = inputs.clone()
        labels[attention_mask == 0] = -100

        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        if step % log_every == 0:
            print(f"Epoch: {epoch}, Loss: {loss.item()}")

# Step 4: Save the fine-tuned model and tokenizer
model.save_pretrained("./fine-tuned-gpt2-pytorch")
tokenizer.save_pretrained("./fine-tuned-gpt2-pytorch")

Downloading data:   0%|          | 0/1219 [00:00<?, ?files/s]