In [1]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import PreTrainedTokenizerFast, AutoTokenizer
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.metrics import accuracy_score
import numpy as np
import math

import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load and preprocess the text data
with open("/kaggle/input/frost-poems/frost_poems.txt", "r") as f:
    corpus = f.readlines()

text = " ".join(line.strip() for line in corpus if line.strip())
text = text.replace('.','').replace(',','')

In [3]:
# Initialize tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.train()



GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [4]:
# Tokenize and chunk text
max_len = 50
tokens = tokenizer.tokenize(text)
input_ids = [tokenizer.convert_tokens_to_ids(tokens[i:i + max_len]) for i in range(0, len(tokens), max_len)]
input_ids = [torch.tensor(ids) for ids in input_ids]

In [5]:
# Custom Dataset Class
class PoemDataset(Dataset):
    def __init__(self, input_ids, max_len=512):
        self.input_ids = input_ids
        self.max_len = max_len

    def __len__(self):
        return len(self.input_ids) - 1

    def __getitem__(self, idx):
        input_ids = self.input_ids[idx]
        labels = self.input_ids[idx + 1]

        # Pad to max_len
        input_ids = torch.cat([input_ids, torch.zeros(self.max_len - len(input_ids), dtype=torch.long)])
        labels = torch.cat([labels, torch.zeros(self.max_len - len(labels), dtype=torch.long)])

        return input_ids, labels

In [6]:
# Dataset and DataLoader
dataset = PoemDataset(input_ids)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [7]:
# Define optimizer
optimizer = AdamW(model.parameters(), lr=1e-3)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cpu


In [None]:
# Move model to the selected device
model = model.to(device)

# Training loop
num_epochs = 10
print("Starting: ")
for epoch in range(num_epochs):
    for input_ids, labels in train_loader:
        # Move input_ids and labels to the same device as the model
        input_ids, labels = input_ids.to(device), labels.to(device)
        optimizer.zero_grad()
        # Forward pass
        outputs = model(input_ids=input_ids, labels=labels)
        loss = outputs.loss
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

Starting: 
Epoch 1, Loss: 0.6258116364479065
Epoch 2, Loss: 0.6213662028312683


In [None]:
# Evaluation
model.eval()

def compute_perplexity(log_probs):
    return math.exp(-np.mean(log_probs))

In [None]:
all_labels, all_preds, log_probs = [], [], []

with torch.no_grad():
    for input_ids, labels in test_loader:
        input_ids, labels = input_ids.to(device), labels.to(device)
        outputs = model(input_ids=input_ids, labels=labels)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

        # Calculate log probabilities
        log_probs_batch = torch.gather(logits, 2, labels.unsqueeze(-1)).squeeze().cpu().tolist()
        log_probs.extend(log_probs_batch)

        all_labels.extend(labels.flatten().tolist())
        all_preds.extend(predictions.flatten().tolist())


accuracy = accuracy_score(all_labels, all_preds)
perplexity = compute_perplexity(log_probs)

print(f"Test Accuracy: {accuracy}")
print(f"Test Perplexity: {perplexity}")

In [None]:
# Text Generation Function
def generate_text(model, tokenizer, seed_text, next_words=20, max_len=512, num_beams=5):
    model.eval()
    
    # Set pad token ID if it is not already set
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id or 0  # Defaults to 0 if EOS token is also None
    
    input_ids = tokenizer.encode(seed_text, return_tensors="pt", max_length=max_len, truncation=True)
    
    # Add attention mask to prevent unexpected behavior with padding
    attention_mask = (input_ids != tokenizer.pad_token_id).long()

    with torch.no_grad():
        output = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=max_len + next_words,
            num_beams=num_beams,  # Enables beam search
            no_repeat_ngram_size=2,
            early_stopping=True,
            pad_token_id=tokenizer.pad_token_id
        )

    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text

# Generate example text
print(generate_text(model, tokenizer, "I wish", next_words=20))