In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so files
# stored under /content/drive/MyDrive can be opened with ordinary file I/O.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# ---------------- hyperparameters ----------------
# Batching
batch_size = 64   # independent sequences processed in parallel per step
block_size = 256  # maximum context length (in tokens) used for a prediction

# Model size
n_embed = 384     # embedding width; each head gets n_embed // n_head = 64 dims
n_head = 6        # attention heads per transformer block
n_layer = 6       # number of stacked transformer blocks
dropout = 0.2     # dropout probability used throughout the model

# Optimisation
max_iter = 5000       # total number of training steps
learning_rate = 3e-4  # AdamW step size

# Evaluation
eval_interval = 500   # estimate train/val loss every this many steps
eval_iters = 200      # batches averaged per split for each loss estimate

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# -------------------------------------------------

# Fixed seed so parameter initialisation and batch sampling are reproducible.
torch.manual_seed(1337)

# Read the whole training corpus into memory as a single string.
with open('/content/drive/MyDrive/Shakespeare.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Character-level vocabulary: every distinct character, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)

# Lookup tables between characters and their integer token ids.
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Return the list of integer token ids for the characters of string *s*."""
    return [stoi[c] for c in s]


def decode(l):
    """Return the string spelled by the integer token ids in *l*."""
    return ''.join(itos[i] for i in l)


# Encode the full corpus once, then hold out the final 10% for validation.
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

def get_batch(split):
    """Sample a random batch of (input, target) windows from one data split.

    Args:
        split: 'train' selects the training data; any other value selects
            the validation data.

    Returns:
        A pair ``(x, y)`` of ``(batch_size, block_size)`` tensors on
        ``device``; ``y`` is ``x`` shifted one position to the right, i.e.
        the next-token target for every position of ``x``.
    """
    source = train_data if split == 'train' else val_data
    # Random window starts, leaving room for block_size tokens plus the
    # one-step-shifted target.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return inputs.to(device), targets.to(device)

# Smooth the loss estimate by averaging over many random batches.
@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss of ``model`` on the train and val splits.

    The model is switched to eval mode for the measurement and put back
    into train mode afterwards. Gradients are disabled by the decorator.

    Returns:
        Dict with keys 'train' and 'val', each a 0-dim tensor holding the
        mean loss over ``eval_iters`` random batches of that split.
    """
    model.eval()
    averages = {}
    for split in ('train', 'val'):
        per_batch = torch.zeros(eval_iters)
        for i in range(eval_iters):
            inputs, targets = get_batch(split)
            _, batch_loss = model(inputs, targets)
            per_batch[i] = batch_loss.item()
        averages[split] = per_batch.mean()
    model.train()
    return averages

class Head(nn.Module):
    """One head of causal (masked) self-attention.

    Args:
        head_size: output width of the key, query and value projections.
    """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embed, head_size, bias=False)
        self.query = nn.Linear(n_embed, head_size, bias=False)
        self.value = nn.Linear(n_embed, head_size, bias=False)
        # Lower-triangular mask; a buffer so it follows the module across
        # devices without being treated as a trainable parameter.
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply masked self-attention.

        Args:
            x: tensor of shape (B, T, C) with C == n_embed and
                T <= block_size.

        Returns:
            Tensor of shape (B, T, head_size).
        """
        B, T, C = x.shape
        k = self.key(x)    # (B, T, head_size)
        # Queries must come from their own projection; reusing self.key here
        # would make q == k and leave self.query untrained.
        q = self.query(x)  # (B, T, head_size)
        # Attention scores (affinities), scaled by 1/sqrt(d_k) where d_k is
        # the key/query width (head_size), not the embedding width C.
        wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5  # (B, T, T)
        # Causal mask: position t may only attend to positions <= t.
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        # Weighted aggregation of the values.
        v = self.value(x)  # (B, T, head_size)
        out = wei @ v      # (B, T, head_size)
        return out



class MultiHeadAttention(nn.Module):
    """Several self-attention heads run in parallel, then mixed linearly.

    Args:
        num_heads: number of ``Head`` modules to run side by side.
        head_size: output width of each head.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # The concatenated head outputs have num_heads * head_size features.
        # That equals n_embed only when n_embed is divisible by num_heads, so
        # size the projection from the actual concatenated width.
        self.proj = nn.Linear(num_heads * head_size, n_embed)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return the projected concatenation of all head outputs.

        Each head is applied to the same input ``x``; results are joined on
        the last dimension, projected to ``n_embed`` features and passed
        through dropout.
        """
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out


class FeedForward(nn.Module):
    """Two-layer MLP: expand to 4x width, ReLU, project back, then dropout.

    Args:
        n_embed: input and output feature width.
    """

    def __init__(self, n_embed):
        super().__init__()
        hidden = 4 * n_embed
        layers = [
            nn.Linear(n_embed, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embed),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the MLP; the last dimension keeps its size."""
        return self.net(x)


class Block(nn.Module):
    """Transformer block: self-attention (communication), then an MLP (computation).

    Both sub-layers are applied to a layer-normalised copy of their input
    and added back to it as a residual.

    Args:
        n_embed: embedding dimension.
        n_head: number of attention heads; each gets n_embed // n_head dims.
    """

    def __init__(self, n_embed, n_head):
        super().__init__()
        self.sa = MultiHeadAttention(n_head, n_embed // n_head)
        self.ffwd = FeedForward(n_embed)
        self.ln1 = nn.LayerNorm(n_embed)
        self.ln2 = nn.LayerNorm(n_embed)

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual updates."""
        attended = x + self.sa(self.ln1(x))
        return attended + self.ffwd(self.ln2(attended))

class BigramLanguageModel(nn.Module):
    """Character-level transformer language model.

    Despite the name, this is not a plain bigram lookup table: token and
    learned position embeddings are summed, passed through ``n_layer``
    transformer ``Block``s and a final LayerNorm, then projected to logits
    over the vocabulary.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        self.position_embedding_table = nn.Embedding(block_size, n_embed)
        self.blocks = nn.Sequential(*[Block(n_embed, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embed)  # final layer norm
        self.lm_head = nn.Linear(n_embed, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the training loss.

        Args:
            idx: (B, T) integer tensor of token ids, with T <= block_size
                (the position table only has block_size rows).
            targets: optional (B, T) integer tensor of next-token ids.

        Returns:
            ``(logits, loss)``. Without targets, logits has shape
            (B, T, vocab_size) and loss is None. With targets, logits is
            flattened to (B*T, vocab_size) and loss is the cross-entropy
            against the flattened targets.
        """
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)  # (B, T, C)
        # Build the position indices on the same device as the input rather
        # than on the module-level `device`, so the model works wherever its
        # inputs live.
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))  # (T, C)
        x = tok_emb + pos_emb  # (B, T, C), broadcast over the batch
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            # cross_entropy expects (N, C) logits and (N,) targets.
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample ``max_new_tokens`` tokens after ``idx``.

        Sampling runs without gradient tracking and in eval mode, so dropout
        does not perturb the predicted distribution; the module's previous
        train/eval mode is restored afterwards.

        Args:
            idx: (B, T) integer tensor holding the current context.
            max_new_tokens: number of tokens to append to each sequence.

        Returns:
            (B, T + max_new_tokens) integer tensor: the context followed by
            the sampled tokens.
        """
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                # Crop the context to the last block_size tokens.
                idx_cond = idx[:, -block_size:]
                logits, _ = self(idx_cond)
                # Keep only the prediction for the last time step.
                logits = logits[:, -1, :]  # (B, C)
                # Softmax turns logits into probabilities.
                probs = F.softmax(logits, dim=-1)  # (B, C)
                # Sample the next token from that distribution.
                idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
                # Append the sampled token to the running sequence.
                idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        finally:
            self.train(was_training)
        return idx

# Build the model and move its parameters to the selected device.
# `m` and `model` refer to the same module (nn.Module.to returns self).
model = BigramLanguageModel()
m = model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# NOTE: the loop variable is named `step`, not `iter`, so the builtin
# `iter` is not shadowed for the rest of the notebook session.
for step in range(max_iter):

    # Evaluate the loss every eval_interval steps, and also on the last step
    # so the loss at the end of training is reported too.
    if step % eval_interval == 0 or step == max_iter - 1:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # Sample a batch of training data.
    xb, yb = get_batch('train')

    # Forward pass, then one optimiser update.
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Generate from the model, starting from a single token with id 0.
context = torch.zeros((1,1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))

step 0: train loss 4.2841, val loss 4.2815
step 500: train loss 2.1700, val loss 2.2206
step 1000: train loss 1.7420, val loss 1.8996
step 1500: train loss 1.5349, val loss 1.7188
step 2000: train loss 1.4206, val loss 1.6385
step 2500: train loss 1.3535, val loss 1.5795
step 3000: train loss 1.3030, val loss 1.5483
step 3500: train loss 1.2626, val loss 1.5143
step 4000: train loss 1.2335, val loss 1.5029
step 4500: train loss 1.2041, val loss 1.4920

Dood so aproams alimason. I am spectary to
Let you burn of it? You preversel
That he I'll not grant for us herw'd it,
Who mine all and gother's employ'd.
But, that last'st mal, which were they strike.
Which so it book with my hearty, rinan;
I had sob a done ere too sigh things bones,
All the falseh of hither.
O, wife, boy! Therefore the goost fancy, Carl wrong it
Is being night dail power: the bishop.
Bhisens, I  take not haste, my kingdom,
You shall be of and harse thanks, brother,
The grave I 


In [None]:
# Start generation from a single sequence containing only token id 0.
context = torch.zeros((1, 1), dtype=torch.long, device=device)

# Sample 5000 new tokens and print the first (only) sequence as text.
sampled = m.generate(context, max_new_tokens=5000)
print(decode(sampled[0].tolist()))


Lord:
Soothing Belus, you must withershal: aughous
My departs I have see where I think
I nor so of inshread my pedposed mine of and your air
To save heart, anothersted, when follows false;
Forbear the proclaimity of loving does you
A frong spar'd and to husband with thee already:
Good know now; be come to biding not shrong.

ISABELLA:
Fith, where
I am my good lord, as yet perforced and by charge
Than cluelent upon yours' unsteposest made
Stand o'er dign either by him; and then 'e,
With the viry
In a great man: we retues and honour, away:
So, now ask it is, sir, you were lineal.

Lord Auban;
And thus e'en vouchsafest will have been gone,
This many could do blew my wife rask;
Yet would in, refrend manable other their own:
I dare'd the conveyance to tumblush him cries
May could be sword as mine eas. I am fairly,
Where is the heaven to this virtue deformine,
Ure that what stubdue by your brind me in Svinior grieute
And I'ld boistements to rather your from my power!
Haste the pusse on the 