# GPT For Bigram Language Model

Here we build a complete, working Generative Pre-trained Transformer (GPT) model from scratch, capable of generating text in the style of Shakespeare.

## 1. Setup and Hyperparameters

In [2]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [3]:
# Hyperparameters for the model and the training loop.
batch_size = 16  # how many independent sequences will we process in parallel
block_size = 32  # what is the maximum context length for predictions
max_iters = 5000  # how many training iterations
eval_interval = 100  # evaluate the loss every eval_interval iterations
learning_rate = 1e-3  # learning rate for optimization
# Use the GPU when PyTorch reports one is available; otherwise fall back to CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200  # how many batches to use for evaluation
n_embd = 64  # the embedding dimension
n_head = 4  # the number of heads in the multi-head attention model
n_layer = 4  # the number of layers in the transformer
dropout = 0.0  # the dropout rate

## 2. Data Loading and Tokenization

In [4]:
# Read the whole training corpus into memory as a single string.
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Vocabulary: every distinct character in the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)
# Lookup tables: character -> integer id (stoi) and integer id -> character
# (itos). A character's id is its position in the sorted `chars` list.
stoi = dict(zip(chars, range(vocab_size)))
itos = dict(enumerate(chars))
def encode(s):
    """Encoder: take a string, output a list of integers.

    Each character of `s` is looked up in the module-level `stoi` table;
    a character that is not a key of `stoi` raises KeyError.
    """
    return [stoi[ch] for ch in s]
def decode(l):
    """Decoder: take a list of integers, output a string.

    Each integer in `l` is looked up in the module-level `itos` table and
    the resulting characters are concatenated in order; an integer that is
    not a key of `itos` raises KeyError.
    """
    return ''.join(itos[idx] for idx in l)
# Encode the full corpus as a tensor of integer ids, then split it in order
# (no shuffling): the first 90% is training data, the remainder validation.
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))  # index at which the validation portion begins
train_data, val_data = data[:n], data[n:]

## 3. Data Batching and Loss Estimation


### Data Loading Function (`get_batch`)


In [5]:
# data loading
def get_batch(split):
    """Sample a random batch of input/target sequences from one data split.

    Args:
        split: 'train' draws from `train_data`; any other value draws
            from `val_data`.

    Returns:
        A tuple `(x, y)` of tensors moved to `device`. Each stacks
        `batch_size` windows of `block_size` consecutive items; every
        window in `y` is the matching window in `x` shifted forward by
        one position, so `y` holds the next-token targets for `x`.
    """
    source = val_data if split != 'train' else train_data
    # Start offsets are drawn from [0, len(source) - block_size), which keeps
    # the one-position-shifted target window inside the data.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return inputs.to(device), targets.to(device)