In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

# hyperparameters
batch_size = 64
block_size = 128
max_iters = 10000
eval_interval = 500
learning_rate = 3e-4
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_head = 8
n_embd = batch_size * n_head
LAYERS = 6
dropout = 0.2

In [2]:
with open('/kaggle/input/bigram/war_and_peace.txt', 'r', encoding='utf-8') as f:
    text=f.read()

In [3]:
len(text)

3198690

In [4]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print([i for i in chars], vocab_size)

['\n', ' ', '!', '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'À', 'Á', 'É', 'à', 'á', 'â', 'ä', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'í', 'î', 'ï', 'ó', 'ô', 'ö', 'ú', 'ü', 'ý', 'œ', '—', '‘', '’', '“', '”'] 104


#### encoding, decoding

In [5]:

stoi = {ch:i for i,ch in enumerate(chars)}
itos = {i:ch for i,ch in enumerate(chars)}
encode=lambda s : [stoi[c] for c in s]
decode= lambda s : ''.join([itos[i] for i in s])

In [6]:
print(encode('hello bye'))
print(decode(encode('hello bye')))

[57, 54, 61, 61, 64, 1, 51, 74, 54]
hello bye


In [7]:
data=torch.tensor(encode(text), dtype=torch.long)
print(data[:400])
print(data.shape)

tensor([ 25,  38,  38,  34,   1,  38,  37,  28,  20,   1,  11,  18,  10,  15,
          0,  26,  31,  24,  39,  43,  28,  41,   1,  32,   0, 102,  46,  54,
         61,  61,   6,   1,  39,  67,  58,  63,  52,  54,   6,   1,  68,  64,
          1,  30,  54,  63,  64,  50,   1,  50,  63,  53,   1,  35,  70,  52,
         52,  50,   1,  50,  67,  54,   1,  63,  64,  72,   1,  59,  70,  68,
         69,   1,  55,  50,  62,  58,  61,  74,   1,  54,  68,  69,  50,  69,
         54,  68,   1,  64,  55,   1,  69,  57,  54,   1,  25,  70,  64,  63,
         50,  65,  50,  67,  69,  54,  68,   8,   1,  25,  70,  69,   1,  32,
          1,  72,  50,  67,  63,   1,  74,  64,  70,   6,   1,  58,  55,   1,
         74,  64,  70,   1,  53,  64,  63, 101,  69,   1,  69,  54,  61,  61,
          1,  62,  54,   1,  69,  57,  50,  69,   1,  69,  57,  58,  68,   1,
         62,  54,  50,  63,  68,   1,  72,  50,  67,   6,   1,  58,  55,   1,
         74,  64,  70,   1,  68,  69,  58,  61,  61,   1,  69,  

In [8]:
n = int(0.9 * len(data))

train_d=data[:n]
val_d=data[n:]
len(train_d), len(val_d)

(2878821, 319869)

In [9]:
# x=train[:block_size]
# y=train[1:block_size+1]      # offset by 1

# for i in range(block_size):
    
#     context=x[:i+1]
#     target=y[i]
    
#     print(f'{context} : {target}')

#### Generate batches

In [10]:
def gen_batch(split):
    
    data=train_d if split=='train' else val_d
    ix = torch.randint( len(data)-block_size,
                        (batch_size, ))                     # choose {batch_size} #integers from 0 to data.len - block_s
    
    x=torch.stack([data[i: i+block_size] for i in ix])
    y=torch.stack([data[i+1: i+block_size+1] for i in ix])  
    xb,yb=x.to(device),y.to(device)

    return xb,yb

In [11]:
@torch.no_grad()
def estimate_loss():
    '''
    Returns a dict of mean train & val split losses.
    '''
    out = {}
    m.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = gen_batch(split)
            X, Y = X.to(device), Y.to(device)
            logits, loss = m(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    m.train()
    return out

### Model Building

In [12]:
class Head (nn.Module) :
    ''' 
    single head of MHA
    '''
    
    def __init__(self, head_size):
        super().__init__()
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.key = nn.Linear(n_embd, head_size, bias=False)        
        self.value = nn.Linear(n_embd, head_size, bias=False)        
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))   # dont register as param
        self.dropout = nn.Dropout(dropout)
        
        
    def forward(self, x):
        B,T,C = x.shape
        
        # attention(q,k,v) = softmax(q.kT/sqrt(d_k))*v
        
        k = self.key(x)
        q = self.query (x)
        wt = q @ k.transpose(-2, -1) * C ** -0.5
        
        wt = wt.masked_fill(self.tril[:T,:T]==0, float('-inf'))
        wt = F.softmax(wt, dim=-1)
        wt = self.dropout(wt)
        
        v = self.value(x)
        out = wt @ v
        return out

In [13]:
class MultiHeadedAttention (nn.Module) :
    
    '''
    multiheaded att. 
    -> calculate Head() for each head.
    -> linear layer pass.
    -> concatenate.
    -> dropout.
    '''
    
    def __init__(self, n_head, head_size):
        
        super().__init__()
        self.heads =nn.ModuleList([Head(head_size) for _ in range(n_head)])
        self.proj=nn.Linear(n_embd, n_embd)
        self.dropout=nn.Dropout(dropout)
        
    def forward(self,x):
        
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        
        return out

In [14]:
class FeedForward (nn.Module) :
    '''
    feed forward layer.
    linear: n_embd --> [] [] [] []
    relu
    linear: [] [] [] [] --> n_embd
    dropout
    '''
    def __init__(self, n_embd) :
        super().__init__()
        self.net = nn.Sequential(
            
            nn.Linear(n_embd, 4*n_embd),
            nn.ReLU(),
            nn.Linear(4*n_embd, n_embd),
            nn.Dropout(dropout),
        )
        
    def forward(self, x):
        return self.net(x)

In [15]:
class Block (nn.Module) :
    '''
    transformer block. 
    X = MHA( LayerNorm (X) ) + FF ( LayerNorm (X) )
    '''
    def __init__(self, n_embd, n_head) :
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadedAttention(n_head, head_size)
        self.ff = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)
        
    def forward(self, x) :
        x = x + self.sa(self.ln1(x))
        x = x +  self.ff(self.ln2(x))
        return x

In [16]:
class BGM(nn.Module) :
    def __init__(self,vocab_size):
        '''
        token_emb_tab : Token Embedding Table with V embeddings of size V.
        pos_emb_tab : Positional embedding table.
        blocks : {LAYERS} number of blocks.
        ln_final : final ln layer
        lm_head : 
        '''
        super().__init__()
        self.token_emb_tab = nn.Embedding(num_embeddings=vocab_size,
                                          embedding_dim=n_embd)
        self.pos_emb_tab = nn.Embedding (block_size, n_embd)
        self.blocks = nn.Sequential ( *[Block(n_embd, n_head=n_head) for _ in range(LAYERS)])
        self.ln_final = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward( self, index, targets=None) :
        '''fpass : squeeze logits, targets then calc celoss'''
        
        B,T = index.shape
        tok_emb=self.token_emb_tab(index)
        pos_emb=self.pos_emb_tab(torch.arange(T, device=device))
        x = tok_emb + pos_emb # !!!!
        x = self.blocks(x)
        x = self.ln_final(x)
        
        logits = self.lm_head(x)
        
        
        if targets==None:
            loss=None
        else:
            b,t,c=logits.shape
            logits=logits.view(b*t, c)
            targets=targets.view(b*t)
            loss=F.cross_entropy(logits,targets)
            
        return logits,loss
    
    def generate( self, index, max_new_tokens):
        '''
            do fpass for index. take last logit. softmax. 
            calc new index by MN sampling. cat to previous index.
            repeat for max_token length.
        '''
        for _ in range(max_new_tokens):
            
            # crop idx to the last block_size tokens
            idx_cropd = index[:, -block_size:]
            
            logits, loss = self(idx_cropd) 
            logits = logits [:, -1, :] 
            p=F.softmax(logits, dim=-1)
            
            next_i = torch.multinomial(p, num_samples=1)
            
            index = torch.cat((index, next_i),dim=1)
            
        return index

## HyperParams List

In [17]:
m = BGM(vocab_size)
m = m.to(device)
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

19.078248 M parameters


In [18]:
optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)

In [19]:
for iter in range(max_iters):
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"Epoch {iter}: Training loss : {losses['train']:.4f}, Validation loss : {losses['val']:.4f}")

    xb, yb = gen_batch('train')
    xb, yb = xb.to(device), yb.to(device)
    
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

Epoch 0: Training loss : 4.8044, Validation loss : 4.8009
Epoch 500: Training loss : 1.7418, Validation loss : 1.7554
Epoch 1000: Training loss : 1.4325, Validation loss : 1.4628
Epoch 1500: Training loss : 1.3059, Validation loss : 1.3563
Epoch 2000: Training loss : 1.2390, Validation loss : 1.3024
Epoch 2500: Training loss : 1.1886, Validation loss : 1.2661
Epoch 3000: Training loss : 1.1545, Validation loss : 1.2382
Epoch 3500: Training loss : 1.1307, Validation loss : 1.2148
Epoch 4000: Training loss : 1.1067, Validation loss : 1.1984
Epoch 4500: Training loss : 1.0849, Validation loss : 1.1914
Epoch 5000: Training loss : 1.0686, Validation loss : 1.1801
Epoch 5500: Training loss : 1.0540, Validation loss : 1.1696
Epoch 6000: Training loss : 1.0394, Validation loss : 1.1627
Epoch 6500: Training loss : 1.0256, Validation loss : 1.1584
Epoch 7000: Training loss : 1.0124, Validation loss : 1.1473
Epoch 7500: Training loss : 1.0044, Validation loss : 1.1543
Epoch 8000: Training loss : 

In [21]:
context = torch.zeros((1,1) , dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))



His were all south and his elbow gracious full with unalm, Márya Dmítrievna.

Her husband came in to a white, evide-de-camp. Thus gunples on which tried throughbrehand their power, eccepting in doling boss. He was just easier than the facts was seliong, and smiled eagerly downy with at the Russians that was a spuffortune and a pilg, conjuring consideration which spirits of the soldiers passed by the soldiers in an hurre—curred an arm. From which the latters soldier they were not finishing the campaign and undecided that Rumyántsev was already desirent for a duc’siance to marque. It came to the count would not forget all there but a man with a man in a dangerous world with Cossack. Kutúzov was very pain. From the fight at Poland, said Rostopchín a restless of men over the dressing, lay a glove man with him. Túshin was mentioned in white handkerchief with the some click note and torment but made his eye opening a wit.

Our vodka cannot known it, he wished to Pault! That are only he not

In [22]:
torch.save(m.state_dict(), '/kaggle/working/b64_bl128_3e4_h8_l6_do0.2_10k.pth')