In [2]:
# We always start with a dataset to train on. Let's download the tiny shakespeare dataset
! wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

zsh:1: command not found: wget


## Dataset Exploration

In [3]:
with open('input.txt') as f:
    text = f.read()

In [4]:
print("Length of text: {}".format(len(text)))

Length of text: 1115394


In [5]:
print(text[:250])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



## Tokenization

In [6]:
# The unique characters in the file
vocab = sorted(list(set(text)))
vocab_size = len(vocab)
print("There are {} unique characters in the dataset".format(vocab_size))
print("".join(vocab))


There are 65 unique characters in the dataset

 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [7]:
# Mapping of chrachters to indices and vice versa
chr_idx_map = {c: i for i, c in enumerate(vocab)}
idx_chr_map = {i: c for i, c in enumerate(vocab)}
# Later we will use our custom tokenizer
encode =  lambda x: [chr_idx_map[c] for c in x]
decode = lambda x: ''.join([idx_chr_map[i] for i in x])

print(encode('hola maami'))
print(decode(encode('hola maami')))

[46, 53, 50, 39, 1, 51, 39, 39, 51, 47]
hola maami


## Experiment with tensors

In [8]:
import torch

In [9]:
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape)
print(data[:10])

torch.Size([1115394])
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47])


### Train Test split of data

In [10]:
n = data.size(0)
train_data = data[:int(n*0.9)]
val_data = data[int(n*0.9):]


In [15]:
block_size = 8

x = torch.randint(1,90, (block_size,))
print(x)
for i in range(len(x)):
    if i!=len(x)-1:
        print(f"{x[:i+1]} -> {x[i+1]}")

tensor([69, 15, 26, 66, 18, 15, 75, 26])
tensor([69]) -> 15
tensor([69, 15]) -> 26
tensor([69, 15, 26]) -> 66
tensor([69, 15, 26, 66]) -> 18
tensor([69, 15, 26, 66, 18]) -> 15
tensor([69, 15, 26, 66, 18, 15]) -> 75
tensor([69, 15, 26, 66, 18, 15, 75]) -> 26


In [16]:
x = train_data[:block_size]
y = train_data[1:block_size+1]

for i in range(len(x)):
    print(f"{x[:i+1]} -> {y[i]}")
    

tensor([18]) -> 47
tensor([18, 47]) -> 56
tensor([18, 47, 56]) -> 57
tensor([18, 47, 56, 57]) -> 58
tensor([18, 47, 56, 57, 58]) -> 1
tensor([18, 47, 56, 57, 58,  1]) -> 15
tensor([18, 47, 56, 57, 58,  1, 15]) -> 47
tensor([18, 47, 56, 57, 58,  1, 15, 47]) -> 58


### Data batch loading

In [25]:
torch.manual_seed(1337)
batch_size = 4
block_size = 8

x = torch.randint(0,50,(batch_size, block_size))
print(x)
for b in range(batch_size):
    for t in range(block_size):
        if t!=0:
            print(f"{x[b,:t]} -> {x[b,t]}")
    print("----")

tensor([[15,  7, 42,  0, 45,  3, 15, 10],
        [34, 40, 12, 20, 47, 36, 40, 38],
        [ 1, 14,  9, 15, 13, 46, 22,  0],
        [32, 11, 46,  5, 39,  4,  5, 49]])
tensor([15]) -> 7
tensor([15,  7]) -> 42
tensor([15,  7, 42]) -> 0
tensor([15,  7, 42,  0]) -> 45
tensor([15,  7, 42,  0, 45]) -> 3
tensor([15,  7, 42,  0, 45,  3]) -> 15
tensor([15,  7, 42,  0, 45,  3, 15]) -> 10
----
tensor([34]) -> 40
tensor([34, 40]) -> 12
tensor([34, 40, 12]) -> 20
tensor([34, 40, 12, 20]) -> 47
tensor([34, 40, 12, 20, 47]) -> 36
tensor([34, 40, 12, 20, 47, 36]) -> 40
tensor([34, 40, 12, 20, 47, 36, 40]) -> 38
----
tensor([1]) -> 14
tensor([ 1, 14]) -> 9
tensor([ 1, 14,  9]) -> 15
tensor([ 1, 14,  9, 15]) -> 13
tensor([ 1, 14,  9, 15, 13]) -> 46
tensor([ 1, 14,  9, 15, 13, 46]) -> 22
tensor([ 1, 14,  9, 15, 13, 46, 22]) -> 0
----
tensor([32]) -> 11
tensor([32, 11]) -> 46
tensor([32, 11, 46]) -> 5
tensor([32, 11, 46,  5]) -> 39
tensor([32, 11, 46,  5, 39]) -> 4
tensor([32, 11, 46,  5, 39,  4]) -> 5


In [28]:
torch.manual_seed(1337)
batch_size = 4
block_size = 8

def get_data_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x,y 

xb, yb = get_data_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

print('----')

for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
----
when input is [24] the target: 43
when input is [24, 43] the target: 58
when input is [24, 43, 58] the target: 5
when input is [24, 43, 58, 5] the target: 57
when input is [24, 43, 58, 5, 57] the target: 1
when input is [24, 43, 58, 5, 57, 1] the target: 46
when input is [24, 43, 58, 5, 57, 1, 46] the target: 43
when input is [24, 43, 58, 5, 57, 1, 46, 43] the target: 39
when input is [44] the target: 53
when input is [44, 53] the target: 56
when input is [44, 53, 56] the target: 1
when input is [44, 53, 56, 1] the target: 58
when input is [44, 53, 56, 1, 58] the target: 46
when input is [44, 53

### Model Experimentation

In [29]:
import torch.nn as nn 
import torch.nn.functional as F 


In [46]:
class TimePassGPT(nn.Module):
    def __init__(self, vocab_size) -> None:
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, vocab_size)
        
    def forward(self, idx, targets=None):
        # idx: (B, T), targets: (B, T)
        logits = self.token_embedding(idx)
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        # idx: (B, T)
        with torch.no_grad():
            for _ in range(max_new_tokens):
                logits, loss = self(idx)
                # logits: (B, T, C)
                logits = logits[:, -1, :] 
                probs = F.softmax(logits, dim=-1)
                idx_next = torch.multinomial(probs, num_samples=1)
                idx = torch.cat((idx, idx_next),dim=1)
                # print(idx)
        return idx
model = TimePassGPT(vocab_size)
logits, loss = model(xb, yb)

print(logits.shape)
print(loss.item())
decode(model.generate(idx=torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist())
                        

torch.Size([32, 65])
4.534911632537842


"\nKH,FlD!Hc&jWbYUhie n PPCdpVzu\nnH3$hAuGFKmOW!'Ns XANf;MwPpGC.!o,e ttX.!\nzTr&higRKASjOtE\nKu&l;zt\n;;MH3"

In [47]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)


In [49]:
batch_size = 32
for steps in range(100):
    xb, yb = get_data_batch('train')
    logits, loss = model(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if steps % 10 == 0:
        print(f"Step: {steps}, Loss: {loss.item()}")


Step: 0, Loss: 4.513947010040283
Step: 10, Loss: 4.530942440032959
Step: 20, Loss: 4.499247074127197
Step: 30, Loss: 4.458986282348633
Step: 40, Loss: 4.393402576446533
Step: 50, Loss: 4.463761806488037
Step: 60, Loss: 4.517825126647949
Step: 70, Loss: 4.476200103759766
Step: 80, Loss: 4.521479606628418
Step: 90, Loss: 4.386441707611084


In [51]:
print(decode(model.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=500)[0].tolist()))


kdpABz!FUZ?ZQUk&iGeg!VFGFj?!vznQ,TLbnRoHkGmOv-O;mg3g3db;'NzTe;bc&ELiAaXa,llW3dVNEavN'!J!oAN'BrbHuSIGxRMI$OJIgaZwz!DxbMSBhJbNQQ;NzViXaw&33!sUcOFfDH's-Ik:x:dbl-Oz
PP&;,'ovon$VNJmmC-aOd&CRo&TUyS'?Px3;-g?s-phhj AsY.3ExFs ezTLunFvSn:TU!VsYXHLuP.
 AZkymmCXjdpIpVPNZM3mO?fHXGeugeyFlpIPpyNVWaZn'rKiglWTLFsXZygR!Xw ThieDO-PVHg3dxnx?sEEwjkvtkLQIJWu
:gPpiZZI.Znbqp
FlsVThyjBwQHg3QxM'VH3IinaOkOJXmOFleXg.
XGF;l'YJ-dgTLYQjjs tII.uu
iZE:&!,.!nHg3;?A$pg3ekftm-T$aarRZamO&gKPvbmREcMH3;-RWI!;zT&Ka&gWauGFJKlQ;;bwBrsng


## Attention

#### Brute Force of averaging previous vectors

In [63]:
x = torch.randint(0,10, (4,2)) # block_size = 4, channels = 2
# Each token is represented by a vector of size 2 and there are 4 tokens
# Attention - average of the previous tokens across the dimensions
print(x)
for i in range(x.size(0)):
    print(f"Token {i}: {x[i]} -> {x[:i+1].mean(dim=0, dtype=torch.float)}")
    

tensor([[0, 3],
        [8, 4],
        [9, 7],
        [5, 0]])
Token 0: tensor([0, 3]) -> tensor([0., 3.])
Token 1: tensor([8, 4]) -> tensor([4.0000, 3.5000])
Token 2: tensor([9, 7]) -> tensor([5.6667, 4.6667])
Token 3: tensor([5, 0]) -> tensor([5.5000, 3.5000])


#### Matrix multiplication Tricks

In [78]:
x = torch.randint(0,5, (4,2), dtype=torch.float)
wei = torch.tril(torch.ones(4,4))
print(x)
print("-----")
print(wei)
print("-----")
# Multiplying using a lower triangular matrix will make the tokens to attend to the previous tokens
print(wei@x)
# We can avergae it out by summing the weights in each layer
print(wei/torch.sum(wei, dim=1,keepdim=True))
wei1 = wei/torch.sum(wei, dim=1,keepdim=True)
print(wei1@x)


tensor([[2., 4.],
        [1., 1.],
        [2., 3.],
        [4., 3.]])
-----
tensor([[1., 0., 0., 0.],
        [1., 1., 0., 0.],
        [1., 1., 1., 0.],
        [1., 1., 1., 1.]])
-----
tensor([[ 2.,  4.],
        [ 3.,  5.],
        [ 5.,  8.],
        [ 9., 11.]])
tensor([[1.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500]])
tensor([[2.0000, 4.0000],
        [1.5000, 2.5000],
        [1.6667, 2.6667],
        [2.2500, 2.7500]])


In [86]:
# We can use softmax as well for finding the attention weights
x = torch.randint(0,5, (4,2), dtype=torch.float)
wei = torch.tril(torch.ones(4,4))
print(x)
print("-----")
print(wei)
print("-----")
# Multiplying using a lower triangular matrix will make the tokens to attend to the previous tokens
print(wei@x)
# using softmax
wei = wei.masked_fill(wei==0, float('-inf'))
print(F.softmax(wei, dim = 1))
wei2 = F.softmax(wei, dim = 1)
print(wei2@x)

tensor([[4., 4.],
        [3., 3.],
        [1., 4.],
        [4., 4.]])
-----
tensor([[1., 0., 0., 0.],
        [1., 1., 0., 0.],
        [1., 1., 1., 0.],
        [1., 1., 1., 1.]])
-----
tensor([[ 4.,  4.],
        [ 7.,  7.],
        [ 8., 11.],
        [12., 15.]])
tensor([[1.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500]])
tensor([[4.0000, 4.0000],
        [3.5000, 3.5000],
        [2.6667, 3.6667],
        [3.0000, 3.7500]])


#### Self Attention

In [94]:
torch.manual_seed(1337)
B,T,C = 4,8,32
x = torch.randn((B,T,C))

### Attention = softmax(query * key / sqrt(d_k)) * value
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)


k = key(x)
q = query(x)

print(k.shape)
print(q.shape)

wei = q@k.transpose(2,1)
print(wei.shape)
wei = wei*head_size**-0.5
tril = torch.tril(torch.ones(T,T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)

v = value(x)

out = wei@v 
print(out.shape)


torch.Size([4, 8, 16])
torch.Size([4, 8, 16])
torch.Size([4, 8, 8])
torch.Size([4, 8, 16])


## Layer normalization

In [97]:
x = torch.randn((2,4))
print(x)
# Let's check the mean
print(x.mean(dim=1))
print(x.var(dim=1))



tensor([[ 0.6402,  0.8511,  0.0207, -1.6763],
        [ 1.2977, -1.1287,  0.7366, -1.7447]])
tensor([-0.0411, -0.2098])
tensor([1.3126, 2.1228])


In [101]:
## Let' try out layer normalization
# Layer normalization is a normalization technique that normalizes the activations of a layer for each given example in a mini-batch.
# out = (x - mean) / sqrt(var + eps) * gamma + beta
# where gamma and beta are learnable parameters

x = torch.randn((2,4))
gamma = torch.ones(4)
beta = torch.zeros(4)
eps = 1e-6
mean = x.mean(dim=1, keepdim=True)
var = x.var(dim=1, keepdim=True)

out = (x - mean) / torch.sqrt(var + eps) * gamma + beta

print(x)
print(out)

print(out.mean(dim=1))
print(out.var(dim=1))


tensor([[-0.0161,  0.1886, -0.4992, -0.1440],
        [-0.1002,  0.7558, -0.6298, -1.8367]])
tensor([[ 0.3516,  1.0602, -1.3206, -0.0912],
        [ 0.3249,  1.1139, -0.1632, -1.2756]])
tensor([3.7253e-08, 0.0000e+00])
tensor([1.0000, 1.0000])


In [222]:
class LayerNorm1d(nn.Module):
    def __init__(self, dim, eps = 1e-6) -> None:
        super().__init__()
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)
        self.eps = eps
        
    def forward(self, x):
        mean = x.mean(dim=1, keepdim=True)
        var = x.var(dim=1, keepdim=True)

        x_hat = (x - mean) / torch.sqrt(var + eps) 
        out = x_hat* self.gamma + self.beta
        return out


In [223]:
torch.manual_seed(1337)
ln = LayerNorm1d(100)
x = torch.randn(32, 100) # batch size 32 of 100-dimensional vectors
x = ln(x)
print(x)
x.shape

tensor([[ 0.1335, -0.1059, -0.3824,  ..., -1.3422, -0.1971,  0.8795],
        [-0.0353, -0.7440, -0.3371,  ..., -0.6276, -0.4846,  0.4557],
        [ 0.3069, -1.5011,  1.4898,  ..., -0.6819,  0.9993,  0.8382],
        ...,
        [-1.6081, -1.6324, -0.7634,  ..., -0.9847,  0.0039, -0.8610],
        [-0.2273,  0.0066, -0.2763,  ..., -0.8705, -1.2443, -0.7531],
        [ 0.3054, -0.1505, -0.3809,  ..., -1.4962, -0.7711, -1.0681]])


torch.Size([32, 100])

In [224]:
x[:,0].mean(), x[:,0].std() # mean,std of one feature across all batch inputs

(tensor(0.1469), tensor(0.8803))

In [225]:
x[0,:].mean(), x[0,:].std() # mean,std of a single input from the batch, of its features

(tensor(-3.5763e-09), tensor(1.0000))

## Multihead Attention

In [226]:
# Multihead attention
## It consists of multiple attention heads, each of which is a scaled dot-product attention mechanism
## Each head is a linear transformation of the input followed by a scaled dot-product attention
## The outputs of the attention heads are concatenated and linearly transformed to produce the final output
## There is a residual connection around each of the sub-layers, followed by layer normalization
## The output of the final multi-head attention layer is passed through a feed-forward neural network, followed by another layer normalization

class SelfAttention(nn.Module):
    def __init__(self, embed_size, head_size, dropout_val = 0.1) -> None:
        super().__init__()
        self.key = nn.Linear(embed_size, head_size, bias=False)
        self.query = nn.Linear(embed_size, head_size, bias=False)
        self.value = nn.Linear(embed_size, head_size, bias=False)
        self.dropout = nn.Dropout(dropout_val)
    
    def forward(self, x):
        B, T, C = x.shape 
        k = self.key(x) # (B, T, H)
        q = self.query(x) # (B, T, H)
        
        wei = q@k.transpose(2,1) # (B, T, T)
        wei = wei / head_size**-0.5
        tril = torch.tril(torch.ones(T,T))
        wei = wei.masked_fill(tril == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        
        v = self.value(x) # (B, T, H)
        out = wei@v # (B, T, T) @ (B, T, H) -> (B, T, H)
        return out
        

In [227]:
x = torch.randn((2,4,32))
head_size = 16
attn = SelfAttention(32, head_size)
out = attn(x)
print(out.shape)

torch.Size([2, 4, 16])


In [228]:
class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, embed_dim, head_size, attn_dropout_rate = 0.1) -> None:
        super().__init__()
        self.attn_heads = nn.ModuleList([SelfAttention(embed_size=embed_dim, head_size=head_size) for _ in range(num_heads)])
        self.projection = nn.Linear(num_heads*head_size, embed_dim)
        self.dropout = nn.Dropout(attn_dropout_rate)
    def forward(self, x):
        B, T, C = x.shape
        out = torch.cat([attn_head(x) for attn_head in self.attn_heads], dim = -1)
        out = self.dropout(self.projection(out))
        return out

In [229]:
x = torch.randn((2,4,32))
mh = MultiHeadAttention(8, 32, 16)
out = mh(x)
print(out.shape)

torch.Size([2, 4, 32])


## Decoder Block

In [251]:
class DecoderBlock(nn.Module):
    def __init__(self, embed_dim, num_heads, attn_dropout_rate = 0.1) -> None:
        super().__init__()
        head_size = embed_dim // num_heads
        self.mha = MultiHeadAttention(num_heads, embed_dim, head_size, attn_dropout_rate)
        self.ln1 = nn.LayerNorm(embed_dim)
        self.ffn = nn.Sequential(
            nn.Linear(embed_dim, 4*embed_dim),
            nn.ReLU(),
            nn.Linear(4*embed_dim, embed_dim),
            nn.Dropout(attn_dropout_rate)
        )
        self.ln2 = nn.LayerNorm(embed_dim)
    
    def forward(self, x):
        B, T, C =  x.shape 
        out = self.mha(x)
        # print(out)
        out = self.ln1(out + x)
        # print(out)
        out = self.ffn(out)
        return out

In [252]:
x = torch.zeros((1, 1, 128))
db = nn.Sequential(*[DecoderBlock(128, 8) for _ in range(6)])
out = db(x)
print(out.shape)
# print(out)

torch.Size([1, 1, 128])


## GPTScratch

In [266]:
class GPTScratch(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, num_layers, block_size, attn_dropout_rate = 0.1) -> None:
        super().__init__()
        
        self.token_embedding = nn.Embedding(vocab_size, embed_dim)
        self.positional_embedding = nn.Embedding(block_size, embed_dim)
        self.decoder_blocks = nn.Sequential(*[DecoderBlock(embed_dim, num_heads, attn_dropout_rate) for _ in range(num_layers)])
        self.ln = nn.LayerNorm(embed_dim)
        self.lm_head = nn.Linear(embed_dim, vocab_size)
        
    def forward(self, idx, targets = None):
        B, T = idx.shape 
        tok_emb = self.token_embedding(idx) # (B,T) -> (B,T,C)
        pos_emb = self.positional_embedding(torch.arange(T, device=idx.device)) # (T,C)
        x = tok_emb + pos_emb
        # print(x.shape)
        x = self.decoder_blocks(x)
        x = self.ln(x)
        logits = self.lm_head(x)
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C) # (B,T,C) -> (B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        
        return logits, loss
        
        

In [288]:
batch_size = 8
block_size = 32
embed_dim = 128
num_heads = 4
num_layers = 8
dropout = 0.1
learning_rate = 0.01
device = 'cpu'
max_iter = 1000


In [289]:
gptModel = GPTScratch(vocab_size, embed_dim, num_heads, num_layers, block_size, dropout).to(device)

In [290]:
print(sum(p.numel() for p in gptModel.parameters())/1e6, 'M parameters')


1.604161 M parameters


In [291]:
optimizer = torch.optim.AdamW(gptModel.parameters(), lr=learning_rate)

In [292]:
for iter in range(max_iter):
    xb, yb = get_data_batch('train')
    
    logits, loss = gptModel(xb.to(device), yb.to(device))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if iter % 50 == 0:
        print(f"Iter: {iter}, Loss: {loss.item()}")

Iter: 0, Loss: 4.45413064956665
Iter: 50, Loss: 3.4570345878601074
Iter: 100, Loss: 3.2940514087677
Iter: 150, Loss: 3.359337329864502
Iter: 200, Loss: 3.4028513431549072
Iter: 250, Loss: 3.2585270404815674
Iter: 300, Loss: 3.381510019302368
Iter: 350, Loss: 3.33134388923645
Iter: 400, Loss: 3.368924140930176
Iter: 450, Loss: 3.2642624378204346
Iter: 500, Loss: 3.2841086387634277
Iter: 550, Loss: 3.329723358154297
Iter: 600, Loss: 3.315089464187622
Iter: 650, Loss: 3.2320597171783447
Iter: 700, Loss: 3.4392151832580566
Iter: 750, Loss: 3.280527114868164
Iter: 800, Loss: 3.225339412689209
Iter: 850, Loss: 3.333913564682007
Iter: 900, Loss: 3.422816038131714
Iter: 950, Loss: 3.3597142696380615


In [293]:
def generate(model, idx, max_new_tokens):
    with torch.no_grad():
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            # print(idx_cond)
            logits, loss  = model(idx_cond)
            logits = logits[:, -1, :] # (B,T,V) -> (B,V)
            # print(logits.shape)
            # print(logits)
            probs = F.softmax(logits, dim=-1)
            # print(probs)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
    return idx

In [287]:
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(context[0].tolist()))

print(decode(generate(gptModel, context, max_new_tokens=2000)[0].tolist()))




 sn
wKt  tm tLi sfssufnshlaunt,vToO ntrdoo
etryvbde lue Ue t i.eaiaaeeC ro mawtreUteskwpef Orroe dten ninoeeEa
Ah,dOe
aiansm 
e nocOfuE uUoemastvdegtryM sto'tdyIr nraetrh imeaereaosceewtIm ehwFaf   emrdyyr wooeht aoh uakd' ahococdta oerew nayheafodd i
t

ne
kt
jme eurrsiaJthneh ud!s re;  obd  tuIIre!oeL:eaeeaanRs:pne.hsml holohryiertuoay u uerftht ajyl,uW 
rNp unhulfgstluplssgIes!nTsEtt rBau'
 hhoToN
l
w c a e  tscsnt ifog,tcsu ht aiueshegSVao .e;t hwyfnlodeaeAts
 tt  r e ensne'a   ma  yp at.hm
u nth LD ntr ltmo n;eT uo 
a  Y na
d:haenuWt Bf nu hoOe uirt a
 t: tsMo u TArd A:eol r eBa.psl  iard
nhaitn ieaa att,eda r  engn:ttr s  IhBrh
slcseaibeDfdtsnis
,ht dlwaIhd, pnCatppe rrednne rudlaayml
!ehhoLo trrt trDe
l, paeeyReIIl ,,t eo. rcde se utaachl m otacdhl daAudg nih s ahyhlf    deai e e:da'nou uhTGb
ee Uai e,
eudgt y ahielr w tn Ryo'un lp
OtlIRahos pg  leeo ie aut ilcsaIttda,tssWrE,rTtdrsue 
wl?rrmykaUaebcm:l;tgt.oc,ara
l
pauEhlyaotratig,apugdCdarauh peoou,iookyhpiosO seghhpd

tsNat