# Links

- https://openai.com/index/chatgpt/


# Exercises


In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameters
batch_size = 16 # how many independent sequences will we process in parallel?
block_size = 32 # what is the maximum context length for predictions?
max_iters = 5000
eval_interval = 100
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 64
n_head = 4
n_layer = 4
dropout = 0.0
# ------------

torch.manual_seed(1337)

# wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

# Train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimate_loss(model):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

# super simple bigram model
class BigramLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx


model = BigramLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

0.209729 M parameters


In [None]:
for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss(model)
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))

EX1: The n-dimensional tensor mastery challenge: Combine the `Head` and `MultiHeadAttention` into one class that processes all the heads in parallel, treating the heads as another batch dimension (answer is in nanoGPT).


In [2]:
class ParallelMultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads):
        super().__init__()
        self.n_head = num_heads
        self.c_attn = nn.Linear(n_embd, 3 * n_embd, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.head_dropout = nn.Dropout(dropout)

        self.proj = nn.Linear(n_embd, n_embd)
        self.proj_dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        q, k, v  = self.c_attn(x).split(n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, nh, T, hs) @ (B, nh, hs, T) -> (B, nh, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, nh, T, T)
        wei = F.softmax(wei, dim=-1) # (B, nh, T, T)
        wei = self.head_dropout(wei)
        # perform the weighted aggregation of the values
        out = wei @ v # (B, nh, T, T) @ (B, nh, T, hs) -> (B, nh, T, hs)
        out = out.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
        out = self.proj(out)
        out = self.proj_dropout(out)
        return out


class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        self.sa = ParallelMultiHeadAttention(n_head)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

model = BigramLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

0.209729 M parameters


In [None]:
for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss(model)
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))

EX2: Train the GPT on your own dataset of choice! What other data could be fun to blabber on about? (A fun advanced suggestion if you like: train a GPT to do addition of two numbers, i.e. a+b=c. You may find it helpful to predict the digits of c in reverse order, as the typical addition algorithm (that you're hoping it learns) would proceed right to left too. You may want to modify the data loader to simply serve random problems and skip the generation of train.bin, val.bin. You may want to mask out the loss at the input positions of a+b that just specify the problem using y=-1 in the targets (see CrossEntropyLoss ignore_index). Does your Transformer learn to add? Once you have this, swole doge project: build a calculator clone in GPT, for all of +-\*/. Not an easy problem. You may need Chain of Thought traces.)


In [3]:
# generate all possible 1-digit addition problems as strings
import itertools
import numpy as np
import random

# hyperparameters
batch_size = 32 # how many independent sequences will we process in parallel?
block_size = 32 # what is the maximum context length for predictions?
max_iters = 5000
eval_interval = 100
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 128
n_head = 8
n_layer = 8
dropout = 0.0
# ------------

random.seed(1337)

sep = '.'
combs = list(itertools.product(range(100), range(100)))
procombsblems = np.array(combs)
random.shuffle(combs)
problems = []
for a, b in combs:
    problems.append(f"{a}+{b}={str(a+b)[::-1]}")
    problems.append(f"{b}+{a}={str(a+b)[::-1]}")
    problems.append(f"{a}-{b}={str(a-b)[::-1]}")
    problems.append(f"{b}-{a}={str(b-a)[::-1]}")
    problems.append(f"{a}*{b}={str(a*b)[::-1]}")
    problems.append(f"{b}*{a}={str(a*b)[::-1]}")
    if b != 0:
        problems.append(f"{a}/{b}={str(a/b)[::-1]}")
    if a != 0:
        problems.append(f"{b}/{a}={str(b/a)[::-1]}")

text = sep.join(problems)
print(len(problems), random.sample(problems, 5))

chars = sorted(list(set(text)))
vocab_size = len(chars)
print(chars, vocab_size)

# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

# Train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape)

# data loading
def get_batch():
    # generate a small batch of data of inputs x and targets y
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    
    # mask input tokens in the target sequence
    for i, seq in enumerate(y):
        last_sep_idx = 0
        for j, token in enumerate(seq):
            if token == stoi['=']:
                y[i, last_sep_idx:j+1] = -100  # F.cross_entropy ignore_index default
            elif token == stoi['.']:
                last_sep_idx = j

    x, y = x.to(device), y.to(device)
    return x, y


@torch.no_grad()
def estimate_loss(model):
    model.eval()
    losses = torch.zeros(eval_iters)
    for k in range(eval_iters):
        X, Y = get_batch()
        _, loss = model(X, Y)
        losses[k] = loss.item()
    model.train()
    return losses.mean()

79800 ['97*73=1807', '87*54=8964', '54-48=6', '37/91=14.0', '16+83=99']
['*', '+', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '='] 16
torch.Size([787163])


In [4]:
x, y = get_batch()
x

tensor([[ 0, 11, 12,  ...,  5,  3, 14],
        [ 4, 13,  6,  ..., 13,  6,  1],
        [ 2,  3, 13,  ..., 15, 10,  3],
        ...,
        [ 7, 14, 15,  ..., 15, 11,  6],
        [ 7,  1,  8,  ..., 10,  2, 12],
        [ 8, 14,  4,  ..., 10,  5,  2]], device='cuda:0')

In [5]:
y

tensor([[-100, -100, -100,  ...,    3,   14,    6],
        [-100, -100, -100,  ...,    6,    1,    8],
        [-100, -100, -100,  ...,   10,    3,    6],
        ...,
        [-100, -100,    9,  ...,   11,    6,    2],
        [-100, -100, -100,  ...,    2,   12,    7],
        [-100, -100, -100,  ...,    5,    2,    6]], device='cuda:0')

In [6]:
model = BigramLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

1.591568 M parameters


In [None]:
for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        loss = estimate_loss(model)
        print(f"step {iter}: loss {loss:.4f}")

    # sample a batch of data
    xb, yb = get_batch()

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
context[0, 0] = stoi['1']
print(decode(m.generate(context, max_new_tokens=200)[0].tolist()))

EX3: Find a dataset that is very large, so large that you can't see a gap between train and val loss. Pretrain the transformer on this data, then initialize with that model and finetune it on tiny shakespeare with a smaller number of steps and lower learning rate. Can you obtain a lower validation loss by the use of pretraining?


In [7]:
from datasets import load_dataset

ds = load_dataset("HuggingFaceFW/fineweb-edu", "sample-10BT")

Resolving data files:   0%|          | 0/1630 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

Error while downloading from https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu/resolve/5b89d1ea9319fe101b3cbdacd89a903aca1d6052/sample/10BT/000_00000.parquet: HTTPSConnectionPool(host='cdn-lfs-us-1.huggingface.co', port=443): Read timed out.
Trying to resume download...


Downloading data:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

Error while downloading from https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu/resolve/5b89d1ea9319fe101b3cbdacd89a903aca1d6052/sample/10BT/003_00000.parquet: HTTPSConnectionPool(host='cdn-lfs-us-1.huggingface.co', port=443): Read timed out.
Trying to resume download...
Error while downloading from https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu/resolve/5b89d1ea9319fe101b3cbdacd89a903aca1d6052/sample/10BT/003_00000.parquet: HTTPSConnectionPool(host='cdn-lfs-us-1.huggingface.co', port=443): Read timed out.
Trying to resume download...


Downloading data:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

Error while downloading from https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu/resolve/5b89d1ea9319fe101b3cbdacd89a903aca1d6052/sample/10BT/005_00000.parquet: HTTPSConnectionPool(host='cdn-lfs-us-1.huggingface.co', port=443): Read timed out.
Trying to resume download...


Downloading data:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

Error while downloading from https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu/resolve/5b89d1ea9319fe101b3cbdacd89a903aca1d6052/sample/10BT/007_00000.parquet: HTTPSConnectionPool(host='cdn-lfs-us-1.huggingface.co', port=443): Read timed out.
Trying to resume download...
Error while downloading from https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu/resolve/5b89d1ea9319fe101b3cbdacd89a903aca1d6052/sample/10BT/007_00000.parquet: HTTPSConnectionPool(host='cdn-lfs-us-1.huggingface.co', port=443): Read timed out.
Trying to resume download...


Downloading data:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

Error while downloading from https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu/resolve/5b89d1ea9319fe101b3cbdacd89a903aca1d6052/sample/10BT/009_00000.parquet: HTTPSConnectionPool(host='cdn-lfs-us-1.huggingface.co', port=443): Read timed out.
Trying to resume download...


Downloading data:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

Error while downloading from https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu/resolve/5b89d1ea9319fe101b3cbdacd89a903aca1d6052/sample/10BT/011_00000.parquet: HTTPSConnectionPool(host='cdn-lfs-us-1.huggingface.co', port=443): Read timed out.
Trying to resume download...


Downloading data:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/541M [00:00<?, ?B/s]

Error while downloading from https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu/resolve/5b89d1ea9319fe101b3cbdacd89a903aca1d6052/sample/10BT/013_00000.parquet: HTTPSConnectionPool(host='cdn-lfs-us-1.huggingface.co', port=443): Read timed out.
Trying to resume download...


Generating train split:   0%|          | 0/9672101 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/98 [00:00<?, ?it/s]

In [11]:
ds['train'][0]["text"], ds['train'][:2]['text']

'The Independent Jane\nFor all the love, romance and scandal in Jane Austen’s books, what they are really about is freedom and independence. Independence of thought and the freedom to choose.\nElizabeth’s refusal of Mr. Collins offer of marriage showed an independence seldom seen in heroines of the day. Her refusal of Mr. Darcy while triggered by anger showed a level of independence that left him shocked and stunned.\nThe freedom she exhibited in finally accepting him in direct defiance of Lady Catherine and knowing her father would disapprove was unusual even for Austen. In her last book Anne Elliot is persuaded to refuse Captain Wentworth at Lady Russel’s insistence.\nAlthough Jane played by the rules of the day, all of her writing is infused with how she wanted life to be. She ‘screams’ her outrage at the limitations for women in Emma.\nWhen accosted by Mrs. Elton, Jane Fairfax says,\n“Excuse me, ma’am, but this is by no means my intention; I make no inquiry myself, and should be so

In [29]:
# hyperparameters
batch_size = 16 # how many independent sequences will we process in parallel?
block_size = 32 # what is the maximum context length for predictions?
max_iters = 5000
eval_interval = 100
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 64
n_head = 4
n_layer = 4
dropout = 0.0
# ------------

sep = '.'
max_samples = 100000
samples = ds['train'][:max_samples]['text']
text = sep.join(samples)
print(len(samples), random.sample(samples, 5))

chars = sorted(list(set(text)))
vocab_size = len(chars)
print(chars, vocab_size)

# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

# Train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape)

# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimate_loss(model):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

model = BigramLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

100000 ["In this section we will learn Spring IoC with the help of many articles and ready to test example code. In this section we are exploring IOC container of the Spring 3 framework. The IOC container is the main component of the Spring framework. It provides the main IoC container and AOP framework. The core container of the Spring Framework provides important functionality including dependency injection and bean lifecycle management.\nThe core container is responsible for providing essential functionality to the Spring framework. The BeanFactory is the primary component of the core container. The BeanFactory is an implementation of the Factory pattern. The core container of the Spring Framework provides Dependency Injection and Inversion of Control (IOC) functionalities.\nModules of Core Container:\nFollowing are the modules of the Spring Core Container:\nThe IoC or Inversion of Control is the core features of the Spring Framework. Developers uses the IoC container to manage the 

In [30]:
for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss(model)
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))

step 0: train loss 8.8140, val loss 8.8047
step 100: train loss 3.0346, val loss 3.0546
step 200: train loss 2.6364, val loss 2.6357
step 300: train loss 2.4803, val loss 2.4836
step 400: train loss 2.4034, val loss 2.4189
step 500: train loss 2.3438, val loss 2.3495
step 600: train loss 2.2790, val loss 2.3068
step 700: train loss 2.2253, val loss 2.2543
step 800: train loss 2.1913, val loss 2.2065
step 900: train loss 2.1483, val loss 2.1914
step 1000: train loss 2.1166, val loss 2.1553
step 1100: train loss 2.0791, val loss 2.1330
step 1200: train loss 2.0661, val loss 2.1045
step 1300: train loss 2.0344, val loss 2.0772
step 1400: train loss 1.9932, val loss 2.0504
step 1500: train loss 1.9880, val loss 2.0342
step 1600: train loss 1.9557, val loss 2.0331
step 1700: train loss 1.9273, val loss 2.0047
step 1800: train loss 1.9067, val loss 1.9883
step 1900: train loss 1.9036, val loss 2.0164
step 2000: train loss 1.8781, val loss 1.9837
step 2100: train loss 1.8699, val loss 1.9717


In [31]:
# hyperparameters
batch_size = 16 # how many independent sequences will we process in parallel?
block_size = 32 # what is the maximum context length for predictions?
max_iters = 1000
eval_interval = 100
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 64
n_head = 4
n_layer = 4
dropout = 0.0
# ------------

torch.manual_seed(1337)

# wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

# Train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

0.92555 M parameters


In [32]:
for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss(model)
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))

step 0: train loss 1.6532, val loss 1.8181
step 100: train loss 1.6100, val loss 1.7749
step 200: train loss 1.6082, val loss 1.7859
step 300: train loss 1.5929, val loss 1.7660
step 400: train loss 1.6026, val loss 1.7600
step 500: train loss 1.6005, val loss 1.7587
step 600: train loss 1.5892, val loss 1.7757
step 700: train loss 1.5917, val loss 1.7664
step 800: train loss 1.5872, val loss 1.7548
step 900: train loss 1.5916, val loss 1.7492
step 999: train loss 1.5939, val loss 1.7651

While if this own, which yet King Percuta,
O evant Onvy the gate
My Lordips unwick, tet? Frether ane away, my faftl's unzreound of the office your milend;
Who eseigheds, I in latest in overs, and Warwick on you moself:
This courtise wond my speak; and plaw you:
My less Boopener'd gone:
Give demberlal on in on him eigh-sound myself?

TRUTUS:
He good most forgued king thrust for tream. Why:
Eold you ext the pair.

COMINIUS:
He-Now, art you are adsabraciang for him best any and
Hirl to change carel's me:

EX4: Read some transformer papers and implement one additional feature or change that people seem to use. Does it improve the performance of your GPT?


In [37]:
# LM = Few-Shot Learners: Model Training Section
# 1) Adam with β1 = 0.9, β2 = 0.95, and  = 10−8
# 2) Use cosine decay for learning rate down to 10% of its value, over 260 billion tokens (after 260
# billion tokens, training continues at 10% of the original learning rate).
# 3) There is a linear LR warmup over the first 375 million tokens.
# 4) We also gradually increase the batch size linearly from a small value (32k tokens) to the full value over
# the first 4-12 billion tokens of training, depending on the model size. 

import math

# hyperparameters
batch_size = 16 # how many independent sequences will we process in parallel?
block_size = 32 # what is the maximum context length for predictions?
max_iters = 5000
eval_interval = 100
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 64
n_head = 4
n_layer = 4
dropout = 0.0
# ------------

torch.manual_seed(1337)

# wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

# Train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

max_batch_size = len(train_data) // block_size
batch_inc_per_iter = (max_batch_size - batch_size) // max_iters

# data loading
def get_batch(split, bs):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (bs,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimate_loss(model, bs):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split, bs)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

model = BigramLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, betas=(0.9, 0.95), eps=1e-8)

# create a learning rate scheduler
warmup_steps = int(0.1 * max_iters)
decay_start = int(0.6 * max_iters)
total_decay_steps = max_iters - decay_start

# Create a scheduler with combined warmup and cosine decay
lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, 
    lambda step: (step / warmup_steps) if step < warmup_steps else 
                 (0.1 + 0.9 * (1 + math.cos(math.pi * (step - decay_start) / total_decay_steps)) / 2) 
                 if step < decay_start else 0.1)

0.209729 M parameters


In [38]:
for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss(model, batch_size)
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    batch_size += batch_inc_per_iter

    # sample a batch of data
    xb, yb = get_batch('train', batch_size)

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    lr_scheduler.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))

step 0: train loss 4.3846, val loss 4.3768
step 100: train loss 3.3623, val loss 3.3901
step 200: train loss 2.6645, val loss 2.6755
step 300: train loss 2.3833, val loss 2.3892
step 400: train loss 2.2128, val loss 2.2332
step 500: train loss 2.0766, val loss 2.1168
step 600: train loss 2.0318, val loss 2.0809
step 700: train loss 1.9918, val loss 2.0512
step 800: train loss 1.9589, val loss 2.0260
step 900: train loss 1.9300, val loss 2.0063
step 1000: train loss 1.9048, val loss 1.9874
step 1100: train loss 1.8807, val loss 1.9720
step 1200: train loss 1.8557, val loss 1.9550
step 1300: train loss 1.8311, val loss 1.9402
step 1400: train loss 1.8035, val loss 1.9240
step 1500: train loss 1.7764, val loss 1.9066
step 1600: train loss 1.7486, val loss 1.8884
step 1700: train loss 1.7244, val loss 1.8742
step 1800: train loss 1.6981, val loss 1.8563
step 1900: train loss 1.6762, val loss 1.8428
step 2000: train loss 1.6567, val loss 1.8307
step 2100: train loss 1.6367, val loss 1.8135
