In [1]:
import torch
import torch.nn.functional as F
from gpt2 import GPT2LanguageModel

In [2]:
# Hyperparameters
# --- model architecture (forwarded to GPT2LanguageModel below) ---
hidden_size = 384                    # given to the model as d_model
num_hidden_layers = 6                # given to the model as num_layers
num_attention_head = 12              # given to the model as num_heads
intermediate_size = hidden_size * 4  # given to the model as fully_connected_dim
max_positional_embeddings = 256      # maximum_position_encoding; also the window length sampled by get_batch
dropout = 0.1                        # given to the model as dropout_rate
layer_norm_eps = 1e-12               # given to the model as layernorm_eps
# --- training / evaluation ---
batch_size = 128                     # windows sampled per call to get_batch
learning_rate = 3e-4                 # lr handed to AdamW
num_epochs = 5000                    # training-loop iterations (one sampled batch each), not passes over the corpus
display_step = 500                   # evaluate and print the losses every this many iterations
eval_iters = 200                     # batches averaged per split inside estimate_loss
# Use the GPU when one is visible to PyTorch, otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
torch.manual_seed(3)

# The corpus can be fetched once with:
# !wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Character-level vocabulary: every distinct character of the corpus, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)

# Lookup tables between characters and their integer ids
itos = dict(enumerate(chars))
stoi = {ch: i for i, ch in itos.items()}

def encode(s):
    """Encoder: turn a string into the list of integer ids of its characters."""
    return [stoi[c] for c in s]

def decode(l):
    """Decoder: turn a sequence of integer ids back into the string they spell."""
    return ''.join(itos[i] for i in l)

# Encode the whole corpus once; the first 90% is used for training, the rest for validation
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data))
train_data, val_data = data[:n], data[n:]

# Load Data
def get_batch(split):
    """Sample a random batch of (input, target) windows from one data split.

    `split == 'train'` draws from `train_data`; any other value draws from
    `val_data`. Each of the `batch_size` rows is a window of
    `max_positional_embeddings` consecutive token ids, and the matching
    target row is the same window shifted one position to the right.
    Both tensors are moved to `device` before being returned.
    """
    source = train_data if split == 'train' else val_data
    window = max_positional_embeddings
    # The exclusive upper bound leaves room for the one-token-shifted target window
    starts = torch.randint(len(source) - window, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + window])
        targets.append(source[start + 1:start + window + 1])
    return torch.stack(inputs).to(device), torch.stack(targets).to(device)

In [4]:
def loss_fn(logits, targets):
    """Mean cross-entropy between per-position logits and target class ids.

    Args:
        logits: tensor of shape (B, T, C) holding one score per class for
            each of the B*T positions.
        targets: integer tensor with B*T elements (e.g. shape (B, T)) holding
            the class index expected at each position.

    Returns:
        Scalar tensor: the cross-entropy averaged over all B*T positions.
    """
    B, T, C = logits.shape
    # reshape (rather than view) also accepts non-contiguous tensors, e.g. the
    # result of a transpose; for contiguous inputs it is the same no-copy view.
    logits = logits.reshape(B*T, C)
    targets = targets.reshape(B*T)
    return F.cross_entropy(logits, targets)

@torch.no_grad()
def estimate_loss():
    """Average the loss over `eval_iters` random batches for each data split.

    Gradient tracking is disabled by the decorator. The global `model` is put
    into eval mode for the measurement and switched back to training mode
    before returning.

    Returns:
        dict mapping 'train' and 'val' to a 0-dim tensor with the mean loss
        measured on that split.
    """
    mean_losses = {}
    model.eval()
    for split_name in ('train', 'val'):
        batch_losses = []
        for _ in range(eval_iters):
            inputs, targets = get_batch(split_name)
            batch_losses.append(loss_fn(model(inputs), targets).item())
        mean_losses[split_name] = torch.tensor(batch_losses).mean()
    model.train()
    return mean_losses

# Build the language model from the hyperparameters and place it on the selected device
model = GPT2LanguageModel(
    # size of the network
    num_layers=num_hidden_layers,
    num_heads=num_attention_head,
    d_model=hidden_size,
    fully_connected_dim=intermediate_size,
    # vocabulary and context length
    input_vocab_size=vocab_size,
    maximum_position_encoding=max_positional_embeddings,
    # regularisation / numerics
    dropout_rate=dropout,
    layernorm_eps=layer_norm_eps,
).to(device)

In [5]:
# AdamW over every model parameter
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Note: each "epoch" here is one optimizer step on one randomly sampled batch
for epoch in range(num_epochs):
    # Drop the gradients left over from the previous step
    optimizer.zero_grad(set_to_none=True)

    # Forward pass on a freshly sampled training batch
    xb, yb = get_batch('train')
    logits = model(xb)
    loss = loss_fn(logits, yb)

    # Backward pass and parameter update
    loss.backward()
    optimizer.step()

    # Report the averaged train/val loss periodically and after the final step
    if epoch % display_step == 0 or epoch + 1 == num_epochs:
        losses = estimate_loss()
        print(f"Epoch {epoch}: Training Loss {losses['train']:.4f} | Val Loss {losses['val']:.4f}")

Epoch 0: Training Loss 4.2533 | Val Loss 4.2470
Epoch 500: Training Loss 1.6739 | Val Loss 1.8255
Epoch 1000: Training Loss 1.3620 | Val Loss 1.5880
Epoch 1500: Training Loss 1.2099 | Val Loss 1.5169
Epoch 2000: Training Loss 1.0913 | Val Loss 1.5076
Epoch 2500: Training Loss 0.9600 | Val Loss 1.5691
Epoch 3000: Training Loss 0.8108 | Val Loss 1.6691
Epoch 3500: Training Loss 0.6528 | Val Loss 1.8218
Epoch 4000: Training Loss 0.4944 | Val Loss 2.0214
Epoch 4500: Training Loss 0.3667 | Val Loss 2.2351
Epoch 4999: Training Loss 0.2736 | Val Loss 2.4311


In [6]:
# Generate from the model
# estimate_loss() finishes with model.train(), so the model is still in training
# mode after the loop above. Switch to eval mode so that train-only behaviour
# (the model is configured with dropout_rate=0.1) does not perturb sampling.
# The model is intentionally left in eval mode afterwards.
model.eval()

# Prompt: a single sequence holding only token id 0
context = torch.zeros((1, 1), dtype=torch.long, device=device)

# Sampling needs no gradients; no_grad avoids building an autograd graph
with torch.no_grad():
    generated = model.generate(context, max_new_tokens=500)

print(decode(generated[0].tolist()))



BUCKINGHAM:
Without his great content?

Messenger:
He sends you faint of fair William's worder with
his villains, whom you have this slave pardon, you
would have fill't with my friendship throne than hold:
if he like you son should have been under a name,
could buy quite in quiet to encounter him.

First Senator:
You, my lord, I am a king beasts; you may, I love.

Second Servingman:
What consul? what, canst nonclude what raintor
should he do, with a little worthy particular,
makes from his leag


In [7]:
# Save model for future reference
# NOTE(review): this pickles the whole module object, so loading the file later
# requires the `gpt2` module to be importable; saving model.state_dict() is the
# more portable alternative if the checkpoint format may change.
torch.save(obj=model, f='gpt2.pth')