In [138]:
from torch.utils.data import Dataset
import os
import torch
import tiktoken
import sentencepiece as spm
import subprocess
import wandb
import nltk

import torch.nn as nn
import torch.nn.functional as F

In [139]:
# !pip freeze > requirements.txt

In [166]:
wandb.login(key="2b242cad61896bc77d8053286a9c3e79f01c9127")

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /Users/aria/.netrc


True

In [141]:
def check_requirements() -> bool:
    try:
        if not os.path.exists("requirements.txt"):
            raise FileNotFoundError("requirements.txt not found")

        result = subprocess.run(
            ["pip", "install", "-r", "requirements.txt"],
            check=True,  # Raise an exception if the command fails
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True
        )
        print(result.stdout)
        return True

    except FileNotFoundError as e:
        print(f"Error: {e}")
        return False

    except subprocess.CalledProcessError as e:
        print(f"Error installing requirements: {e.stderr}")
        return False

In [142]:
# check_requirements()

In [143]:
# hyperparameters
batch_size = 32
block_size = 8
max_iters = 3000
eval_interval = 300
learning_rate = 1e-2
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embed = 32
# ------------

In [144]:
if not os.path.exists("Dataset.txt"):
    os.system("wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt")
    os.rename("input.txt", 'Dataset.txt')

In [145]:
class CharDataset(Dataset):
    """
    Emits batches of characters.

    Adapted from "https://github.com/karpathy/minGPT".
    """

    def __init__(self, data: str, mode: str = "normal"):

        self.tokens = set(nltk.word_tokenize(data))
        self.mode = mode

        if mode == "normal":
            self.chars = sorted(set(train_text))  # get characters from the input data

            self.stoi = {ch: i for i, ch in enumerate(self.chars)}  # map characters to integer indices
            self.itos = {i: ch for i, ch in enumerate(self.chars)}  # map integer indices to characters
            self.vocab_size = len(self.chars)

        elif mode == "sentencepiece":
            self.vocab_size = min(len(self.tokens), 10770)
            spm.SentencePieceTrainer.train(model_prefix='shakespeare', input='Dataset.txt',
                                           vocab_size=10770, unk_id=0, bos_id=1, eos_id=2, pad_id=3)


        elif mode == "tiktoken":
            self.enc = tiktoken.get_encoding("gpt2")
            self.vocab_size = self.enc.max_token_value + 1

    def encode(self, text):
        if self.mode == "normal":
            return [self.stoi[s] for s in text]
        elif self.mode == "sentencepiece":
            sp = spm.SentencePieceProcessor(model_file='shakespeare.model')
            return sp.encode(text)
        elif self.mode == "tiktoken":
            return self.enc.encode(text)

    def decode(self, tokens):
        if self.mode == "normal":
            return ''.join([self.itos[t] for t in tokens])
        elif self.mode == "sentencepiece":
            sp = spm.SentencePieceProcessor(model_file='shakespeare.model')
            return sp.decode(tokens)
        elif self.mode == "tiktoken":
            return self.enc.decode(tokens)

    def get_vocab_size(self):
        return self.vocab_size

    def __len__(self):
        return self.vocab_size

    def __getitem__(self, idx):
        # grab a chunk of (block_size + 1) characters from the data
        # encode every character to an integer
        # return the chunk and the shifted version as tensors
        pass

    # ```

In [146]:
with open("Dataset.txt", "r") as file:
    train_text = file.read()

print(train_text[:500])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor


In [147]:
normal_encoding = CharDataset(train_text, mode="normal")
sent_piece = CharDataset(train_text, mode="sentencepiece")
tiktoken_encoding = CharDataset(train_text, mode="tiktoken")

In [148]:
print(
    f"Normal encoding: Length of sequence = {len(normal_encoding.encode(train_text))}, Vocab size = {normal_encoding.get_vocab_size()}")

print(
    f"SentencePiece encoding: Length of sequence = {len(sent_piece.encode(train_text))}, Vocab size = {sent_piece.get_vocab_size()}")

print(
    f"TikToken encoding: Length of sequence = {len(tiktoken_encoding.encode(train_text))}, Vocab size = {tiktoken_encoding.get_vocab_size()}")

Normal encoding: Length of sequence = 1115394, Vocab size = 65
SentencePiece encoding: Length of sequence = 290364, Vocab size = 10770
TikToken encoding: Length of sequence = 338025, Vocab size = 50257


In [149]:
data = torch.tensor(normal_encoding.encode(train_text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

In [150]:
data2 = torch.tensor(sent_piece.encode(train_text), dtype=torch.long)
print(data2.shape, data2.dtype)
print(data2[:1000])

torch.Size([290364]) torch.int64
tensor([  160,   346,     5,  1001,    54,  1671,   208,   953,     4,   181,
           27,   147,     6,   421,     5,   997,     4,   147,     6,   160,
          346,     5,   112,    58,    47,  1968,   540,    10,   292,   117,
           10,  4422,    19,   421,     5,  7385,     6,  1968,     6,   160,
          346,     5,   160,     4,    15,   109,  1602,   479,    26,  1795,
          785,    10,     7,   397,     6,   421,     5,   184,   109,     8,
           72,     4,    54,   109,     8,    72,     6,   160,   346,     5,
          248,    96,   461,    37,     4,    11,    54,     8,    65,    34,
         1763,    78,    59,   227,  3029,     6,   244,     8,    72,    16,
         7476,    19,   421,     5,   165,    73,  4082,    64,     8,    72,
           13,   107,    29,    28,   230,     5,   293,     4,   293,    21,
           92,   282,   346,     5,   727,   314,     4,    68,  1339,     6,
          160,   346,     5,   

In [151]:
n = int(0.9 * len(data))

train_data = data[:n]
val_data = data[n:]

In [152]:
context_length = 8

print(train_data[:context_length])
print(normal_encoding.decode(train_data[:context_length].tolist()))

tensor([18, 47, 56, 57, 58,  1, 15, 47])
First Ci


In [153]:
train_data2 = data2[:n]
val_data2 = data2[n:]

print(train_data2[:context_length])
print(sent_piece.decode(train_data2[:context_length].tolist()))

tensor([ 160,  346,    5, 1001,   54, 1671,  208,  953])
First Citizen: Before we proceed any further


In [154]:
x = train_data[:context_length]
y = train_data[1:context_length + 1]

for context in range(1, context_length):
    print(f"context = {context}, input = {x[:context].tolist()}, target = {y[context - 1]}")

context = 1, input = [18], target = 47
context = 2, input = [18, 47], target = 56
context = 3, input = [18, 47, 56], target = 57
context = 4, input = [18, 47, 56, 57], target = 58
context = 5, input = [18, 47, 56, 57, 58], target = 1
context = 6, input = [18, 47, 56, 57, 58, 1], target = 15
context = 7, input = [18, 47, 56, 57, 58, 1, 15], target = 47


In [155]:
torch.manual_seed(1337)
batch_size = 4
context_length = 8


def get_batch(data):
    start_idx = torch.randint(low=0, high=len(data) - context_length, size=(batch_size,))
    x = torch.stack([data[i: i + context_length] for i in start_idx])
    y = torch.stack([data[i + 1: i + 1 + context_length] for i in start_idx])
    x, y = x.to(device), y.to(device)
    return x, y


xb, yb = get_batch(train_data)
print("input")
print(xb.shape)
print(xb)
print("target")
print(yb.shape)
print(yb)

input
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
target
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])


In [156]:
torch.manual_seed(1337)


class BigramLangModel(nn.Module):
    def __init__(self, vocab_size, n_embed=32):
        super().__init__()
        self.n_embed = n_embed
        self.token_embedding = nn.Embedding(vocab_size, n_embed)
        self.position = nn.Embedding(context_length, n_embed)
        self.langhead = nn.Linear(n_embed, vocab_size)

    def forward(self, indices, targets=None):
        # T: sequence length (number of tokens) , B: batch size (number of sequences)
        B, T = indices.shape
        tok_embeds = self.token_embedding(indices)  # (B, T, n_embed)
        pos_embeds = self.position(torch.arange(T, device=indices.device))  # (T, n_embed)
        x = tok_embeds + pos_embeds  # (B, T, n_embed)
        logits = self.langhead(self.token_embedding(indices))  # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            loss = F.cross_entropy(logits.view(B * T, C), targets.view(B * T))

        return logits, loss

    def generate(self, init_token, max_new_tokens):
        sequence = init_token
        for itr in range(max_new_tokens):
            logits, loss = self(sequence)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            # next_token = torch.argmax(probs, dim=-1)
            # next_token = next_token.unsqueeze(1)
            sequence = torch.cat((sequence, next_token), dim=1)
        return sequence

In [157]:
model = BigramLangModel(normal_encoding.get_vocab_size(), n_embed)
m = model.to(device)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)
initial_token = torch.tensor(normal_encoding.encode('\n'), dtype=torch.long, device=device).unsqueeze(0)
# 0 == new line char
print(
    f"Generated Sequence : {normal_encoding.decode(m.generate(init_token=initial_token, max_new_tokens=100)[0].tolist())}")

torch.Size([4, 8, 65])
tensor(4.4922, grad_fn=<NllLossBackward0>)
Generated Sequence : 
lN!BJ'kysLCMFJPKOL?DP-QWwrEoL?jLDJQOL.f'RIHD'Hdhs Yv,wxatnscMZwtEOS'palkq3ssZeAvzF-QT;eMk;x.gQSFCLgx


In [158]:
torch.manual_seed(1337)
xb2, yb2 = get_batch(train_data2)
m2 = BigramLangModel(sent_piece.get_vocab_size(), n_embed)
m2 = m2.to(device)
logits, loss = m2(xb2, yb2)
print(logits.shape)
print(loss)
initial_token = torch.tensor(sent_piece.encode('I Love You'), dtype=torch.long, device=device).unsqueeze(0)

print(
    f"Generated Sequence : {sent_piece.decode(m2.generate(init_token=initial_token, max_new_tokens=100)[0].tolist())}")

torch.Size([4, 8, 10770])
tensor(9.4203, grad_fn=<NllLossBackward0>)
Generated Sequence : I Love You magn cruel judgecarce squawelve burial livery honey Herequal searshedthreerman publi apparent vast fester furnished Katect Env sends quaimes spot strange metal stirring unfeiding impedimentwould manaclerefulbishopconddeserving iticed distinguish create obscured suit difference flour meekBA bettercannon visitor goddessdestABHold valour embassies moveouls darings steeds Olymp region intobuil hatefulpeaceRK glori necessaries roaderial legundation chair whose nail four detest talkingGating shot Ingrat louder butterflyam sunderozenWhy Lassel mann accompanied perdition hadst safety soever


In [159]:
optimizer1 = torch.optim.Adam(m.parameters(), lr=learning_rate)

In [160]:
@torch.no_grad()
def estimate_loss(train_data, val_data):
    out = {}
    model.eval()
    for data in [train_data, val_data]:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(data)
            logits, loss = model(X, Y)
            losses[k] = loss.item()

        out['train' if data is train_data else 'val'] = losses.mean()
    model.train()
    return out

In [161]:
batch_size == 32


def train(model, data, optimizer, max_iters=3000, epochs=10, steps=100, eval_interval=300, wandb_log=True):
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for iter in range(max_iters):
            # every once in a while evaluate the loss on train and val sets
            if iter % eval_interval == 0:
                losses = estimate_loss(train_data, val_data)
                print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
                total_loss += losses['train']

                wandb.log({"Iteration": iter, "Average Loss": losses['train'], "Val Loss": losses['val']})

            xb, yb = get_batch(data)
            logits, loss = model(xb, yb)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print(f"epoch {epoch}: avg loss: {total_loss * eval_interval / max_iters}")
        print("-" * 50)

        if wandb_log:
            wandb.log({"Epoch": epoch, "Total Loss": total_loss * eval_interval / max_iters})


In [162]:
wandb.init(
    project="LLM",
    config={
        "learning_rate": learning_rate,
        "architecture": "Simple BigramLangModel",
        "dataset": "Shakespeare",
    },
    name="Normal Encoding"
)

train(m, train_data, optimizer1)

  Expected `list[str]` but got `tuple` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(


step 0: train loss 4.3769, val loss 4.3673
step 300: train loss 2.6203, val loss 2.6888
step 600: train loss 2.6019, val loss 2.6276
step 900: train loss 2.5759, val loss 2.5812
step 1200: train loss 2.5494, val loss 2.5526
step 1500: train loss 2.5701, val loss 2.6050
step 1800: train loss 2.5565, val loss 2.5492
step 2100: train loss 2.5932, val loss 2.6070
step 2400: train loss 2.5580, val loss 2.5640
step 2700: train loss 2.5348, val loss 2.5481
epoch 0: avg loss: 2.753690004348755
--------------------------------------------------
step 0: train loss 2.5442, val loss 2.5824
step 300: train loss 2.5174, val loss 2.5132
step 600: train loss 2.5208, val loss 2.5361
step 900: train loss 2.5105, val loss 2.5478
step 1200: train loss 2.5107, val loss 2.5505
step 1500: train loss 2.5220, val loss 2.5511
step 1800: train loss 2.5733, val loss 2.5594
step 2100: train loss 2.5349, val loss 2.5415
step 2400: train loss 2.5280, val loss 2.5352
step 2700: train loss 2.5363, val loss 2.5723
epoc

In [163]:
initial_token = torch.tensor(normal_encoding.encode('\n'), dtype=torch.long).unsqueeze(0)

generated_text = normal_encoding.decode(m.generate(init_token=initial_token, max_new_tokens=100)[0].tolist())

print(
    f"Generated Sequence : {generated_text}")

wandb.log({"Generated Text": generated_text})
wandb.finish()

Generated Sequence : 
Mugotil my nt d sis? bed'd n aslng, m he ou thayo th my: ncet buse ETIOLEOLUKES:


BUS prem CI willo


0,1
Average Loss,█▇▆▅▇▂▂▃▄▃▂▅▂▃▂▄▂▄▄▄▄▂▃▄▃▃▃▃▂▃▂▁▂▃▂▃▂▁▂▃
Epoch,▁▂▃▃▄▅▆▆▇█
Iteration,▂▆█▁▃▅▆▇▃▆█▄▅▆▆▁▂▃▅█▂▅▆▆█▂▃▆█▂▃▄▆▆▁▆█▂▃▆
Total Loss,█▂▂▁▁▁▁▁▁▁
Val Loss,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
Average Loss,2.53338
Epoch,9
Generated Text,Mugotil my nt d sis...
Iteration,2700
Total Loss,2.51881
Val Loss,2.53505


In [164]:
wandb.init(
    project="LLM",
    config={
        "learning_rate": learning_rate,
        "architecture": "Simple BigramLangModel",
        "dataset": "Shakespeare",
    },
    name="Sentencepiece Encoding"
)

optimizer2 = torch.optim.Adam(m2.parameters(), lr=learning_rate)
train(m2, train_data2, optimizer2, max_iters=100, epochs=1, steps=10, eval_interval=10)

  Expected `list[str]` but got `tuple` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(


step 0: train loss 2.4849, val loss 2.5206
step 10: train loss 2.5267, val loss 2.5294
step 20: train loss 2.4965, val loss 2.5253
step 30: train loss 2.4867, val loss 2.5429
step 40: train loss 2.5397, val loss 2.5361
step 50: train loss 2.5019, val loss 2.5382
step 60: train loss 2.5200, val loss 2.5249
step 70: train loss 2.5044, val loss 2.5426
step 80: train loss 2.5184, val loss 2.5091
step 90: train loss 2.5228, val loss 2.5312
epoch 0: avg loss: 2.510204792022705
--------------------------------------------------


In [165]:
initial_token = torch.tensor(sent_piece.encode('I love'), dtype=torch.long, device=device).unsqueeze(0)

generated_text = sent_piece.decode(m2.generate(init_token=initial_token, max_new_tokens=100)[0].tolist())
wandb.log({"Generated Sequence": generated_text})

print(f"Generated Sequence : {generated_text}")
wandb.finish()

Generated Sequence : I love  utteredsRUedit officious give How Hence hatefulig lead ornaments Anointed suborn showing batt cleans abide bore Bohemia tribute usagesequencesteem By patroness as honour long herdsm strengthreserved ambling deed wai tricks Christendomtinctkly counterpoi returneddeck veri hoa nest obedient trudge justice stark scholar things fret taper TushRE godrevailallow ground her allegiance steward grim after By No buzzard Miranda seize characters forcAU stray maidenheadsmperlemish endow savage namedessenger bells wives ju kinsman Repair obtain long pestilence keys Lead nip breathing fully bustle blot puritencomb revive


0,1
Average Loss,▁▆▂▁█▃▅▃▅▆
Epoch,▁
Iteration,▁▂▃▃▄▅▆▆▇█
Total Loss,▁
Val Loss,▃▅▄█▇▇▄█▁▆

0,1
Average Loss,2.5228
Epoch,0
Generated Sequence,I love utteredsRUed...
Iteration,90
Total Loss,2.5102
Val Loss,2.53118


In [189]:
torch.manual_seed(1337)
B, T, C = 4, 8, 32  # batch, time, channels
x = torch.randn(B, T, C)

# single Head perform self-attention:
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

key = key(x)  # (B, T, head_size)
query = query(x)  # (B, T, head_size)
v = value(x)  # (B, T, head_size)

tril = torch.tril(torch.ones(T, T))
wei = query @ key.transpose(-2,
                            -1) * head_size ** 0.5  # dimension -2:T,-1:head_size.  (B,T,head)@(B,T,head) = (B, T, T)
wei = wei.masked_fill(tril == 0, float('-inf'))

temprature = 1
wei = F.softmax(wei / temprature, dim=-1)

out = wei @ v
# out = wei @ x

out.shape

torch.Size([4, 8, 16])

In [190]:
wei

tensor([[[1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
          0.0000e+00, 0.0000e+00, 0.0000e+00],
         [1.2155e-03, 9.9878e-01, 0.0000e+00, 0.0000e+00, 0.0000e+00,
          0.0000e+00, 0.0000e+00, 0.0000e+00],
         [1.2126e-02, 4.6801e-03, 9.8319e-01, 0.0000e+00, 0.0000e+00,
          0.0000e+00, 0.0000e+00, 0.0000e+00],
         [9.8567e-01, 1.7389e-03, 1.1154e-02, 1.4340e-03, 0.0000e+00,
          0.0000e+00, 0.0000e+00, 0.0000e+00],
         [1.9030e-06, 3.1255e-04, 1.2404e-05, 1.4743e-06, 9.9967e-01,
          0.0000e+00, 0.0000e+00, 0.0000e+00],
         [4.3232e-07, 2.3691e-02, 9.7002e-07, 2.8430e-08, 9.7631e-01,
          6.2551e-11, 0.0000e+00, 0.0000e+00],
         [2.7302e-02, 9.1365e-01, 1.2346e-04, 9.9611e-05, 4.0265e-03,
          5.4756e-02, 3.9283e-05, 0.0000e+00],
         [2.0096e-05, 5.2594e-03, 9.8801e-04, 2.8998e-01, 1.1226e-03,
          2.6327e-03, 3.5913e-01, 3.4087e-01]],

        [[1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.00

In [169]:
k = torch.randn(B, T, head_size)
q = torch.randn(B, T, head_size)
wei = q @ k.transpose(-2, -1) * head_size ** -0.5

In [170]:
k.var()

tensor(1.0449)

In [171]:
q.var()

tensor(1.0700)

In [172]:
wei.var()

tensor(1.0918)

In [173]:
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]), dim=-1)

tensor([0.1925, 0.1426, 0.2351, 0.1426, 0.2872])

In [174]:
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]) * 8, dim=-1)  # gets too peaky, converges to one-hot

tensor([0.0326, 0.0030, 0.1615, 0.0030, 0.8000])

In [175]:
x[:, 0].mean(), x[:, 0].std()  # mean,std of one feature across all batch inputs

(tensor(-0.1431), tensor(1.0705))

In [176]:
x[0, :].mean(), x[0, :].std()  # mean,std of a single input from the batch, of its features

(tensor(0.0073), tensor(1.0177))

# _______________________________________

In [13]:
# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i + block_size] for i in ix])
    y = torch.stack([data[i + 1:i + block_size + 1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

In [14]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [15]:
class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)  # (B,T,C)
        q = self.query(x)  # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2, -1) * C ** -0.5  # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))  # (B, T, T)
        wei = F.softmax(wei / Tempreture, dim=-1)  # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x)  # (B,T,C)
        out = wei @ v  # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

In [16]:
class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

In [17]:
class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

In [18]:
class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

In [19]:
# super simple bigram model
class BigramLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)  # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx)  # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))  # (T,C)
        x = tok_emb + pos_emb  # (B,T,C)
        x = self.blocks(x)  # (B,T,C)
        x = self.ln_f(x)  # (B,T,C)
        logits = self.lm_head(x)  # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :]  # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx

In [44]:
wandb.login(key="2b242cad61896bc77d8053286a9c3e79f01c9127")



True

In [45]:
wandb.init(
    project="LLM",
    config={
        "learning_rate": learning_rate,
        "architecture": "First Full Model",
        "dataset": "Shakespeare",
    },
    name="First Full Model (Normal Encoding)"
)

model = BigramLanguageModel()
m = model.to(device)

print(sum(p.numel() for p in m.parameters()) / 1e6, 'M parameters')

0.209729 M parameters


In [46]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [47]:
# hyperparameters
batch_size = 16  # how many independent sequences will we process in parallel?
block_size = 32  # what is the maximum context length for predictions?
max_iters = 5000
eval_interval = 100
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 64
n_head = 4
n_layer = 4
dropout = 0.0
tempreture = 1.0  # Tempreture in softmax
# ------------

In [48]:
wandb.log({"Context Length": block_size, "Batch Size": batch_size, "Learning Rate": learning_rate, "Dropout": dropout,
           "Tempreture": tempreture, "Number of Heads": n_head, "Number of Layers": n_layer, "Embedding Size": n_embd,
           "Eval Iterations": eval_iters, "Eval Intervals": eval_interval, "Number of Iterations": max_iters})


In [49]:
for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        wandb.log({"Iteration": iter, "Train Loss": losses['train'], "Val Loss": losses['val']})

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

step 0: train loss 4.3139, val loss 4.3097
step 100: train loss 2.6329, val loss 2.6476
step 200: train loss 2.5032, val loss 2.5217
step 300: train loss 2.4124, val loss 2.4178
step 400: train loss 2.3461, val loss 2.3498
step 500: train loss 2.3009, val loss 2.3141
step 600: train loss 2.2415, val loss 2.2508
step 700: train loss 2.2011, val loss 2.2086
step 800: train loss 2.1572, val loss 2.1745
step 900: train loss 2.1175, val loss 2.1533
step 1000: train loss 2.0828, val loss 2.1311
step 1100: train loss 2.0557, val loss 2.1001
step 1200: train loss 2.0199, val loss 2.0709
step 1300: train loss 2.0049, val loss 2.0613
step 1400: train loss 1.9733, val loss 2.0570
step 1500: train loss 1.9467, val loss 2.0338
step 1600: train loss 1.9391, val loss 2.0320
step 1700: train loss 1.9186, val loss 2.0116
step 1800: train loss 1.9046, val loss 2.0019
step 1900: train loss 1.8887, val loss 2.0037
step 2000: train loss 1.8563, val loss 1.9713
step 2100: train loss 1.8533, val loss 1.9679


In [57]:
# generate from the model
wandb.log({"Device": device})
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(normal_encoding.decode(m.generate(context, max_new_tokens=2000)[0].tolist()))
wandb.log({"Generated Sequence": normal_encoding.decode(m.generate(context, max_new_tokens=2000)[0].tolist())})

wandb.finish()


great me rack tower: if you give
Befewed the goods me theaking.
Second lialt thou. Carmessire! my streaders;

KING RICIUS:
The, when: so lords; my seat may king!

KING EKE OF GlES

DATHARD IIA:
My came.

ELY:

LADY CAPIO:
So palsess, your confal noce; to being wine a mant-of is bear,
And bringbred, id me, at to gently?

LEONTES:
Near I im knowits: Retbress mead.

SICINIUS:
I live, and when, have you, names, do I know yield fight.'
Proke: a let I faint--befitt, Reagse rongbrees,
Tent ceensedings and no counnt, fatcouse and his theer
And whesesh will to your hightly: Politiong I behapowsat we a heards:
Harwfell! for sick you.
The marry'st redesings and featheds,
And you anboothing weopps? call could him the comforn how heirss;
So do unger's to York; good my mover boes,
Thei: going abjecaudience; you
wre competibument soun prot dinesity!

CORIOLA:
Appilitss me!--low it made:
A deathth thee, ifforges, my paaint,
Where he can amernise. Dry own, who prin impation's chame,
To I farther leave