In [2]:
# ## A small ChatGPT style Transformer
# * concepts in NLP and Transformers 
# * generative mdoels
# https://github.com/rcalix1/PyTorch/blob/main/DeepLearning/Transformers/GPTs/GenerativeTransformerTinyGPT.py

############################################################ 

import torch
import numpy as np
import requests
## import tiktoken
import torch.nn as nn
import copy

import time

from torch.nn import functional as F

############################################################

## !pip install requests
## !pip install tiktoken    ## requires python   >    3.9

############################################################

torch.manual_seed(1337)

device = 'cuda' if torch.cuda.is_available() else 'cpu'

############################################################

input_file_path = 'input.txt'

## data_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'

with open(input_file_path, 'r', encoding='utf-8') as f:
    text = f.read()
    
############################################################

print("length of data in characters")
len(text)

############################################################

chars = sorted(     list(set(text))   )

vocab_size = len(chars)

print(  ''.join(chars)  )

############################################################# 
## tokenizer

stoi = { ch:i for i, ch in enumerate(chars) }
itos = { i:ch for i, ch in enumerate(chars) }

encode = lambda s: [ stoi[c]          for c in s   ]    ## encoder: string to integer
decode = lambda l: ''.join(   itos[i] for i in l   )    ## decoder: interger to string

#############################################################

data = torch.tensor(   encode(text), dtype=torch.long   )
n    = int(   0.9*len(data)   )
train_data = data[:n]
val_data   = data[n:]

#############################################################

def get_batch(split):
    if split == "train":
        data = train_data
    else:
        data = val_data
    ix = torch.randint(   len(data) - block_size, (batch_size,)   )
    x  = torch.stack(    [  data[ i : i+block_size ]   for i in ix]    ) 
    y  = torch.stack(    [  data[ i+1 : i+1+block_size ]   for i in ix]    )
    
    x, y = x.to(device), y.to(device)

    return x, y

############################################################

@torch.no_grad()    ## for efficiency
def estimate_loss(only_validation=False):
    out = {}
    model.eval()   ## no training
    if only_validation:
        arr=[ 'val']
    else:
        arr=['train', 'val']
    for split in arr:
        print("split",split)
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            # print(k,range(eval_iters))
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()  ## back to training
    return out

##########################################################################################


class Head(nn.Module):
    """ one head of self-attention """
    
    def __init__(self, head_size):
        super().__init__()
        self.key   = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        
        ## the mask tril is not part of the graph since only for masking
        ## so register buffer makes it a thing out of the graph
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)              ## (B, T, C)
        q = self.query(x)            ## (B, T, C)
        
        wei = q @ k.transpose(-2, -1) * C**-0.5       ## (B, T, C) @ (B, C, T)  -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))     ## (B, T, T)
        wei = F.softmax(wei, dim= -1)           ## (B, T, T)
        wei = self.dropout(   wei   )
        
        ## perform the weighted aggregation of the values
        v   = self.value(  x  )   ## (B, T, C)
        out = wei @ v             ## (B, T, T) @ (B, T, C) -> (B, T, C)
        
        return out
        
##########################################################################################


class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """
    
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList(  [Head(head_size) for _ in range(num_heads) ] )
        self.proj  = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        out = torch.cat(   [ h(x) for h in self.heads], dim = -1   )
        out = self.proj(  out   )
        out = self.dropout(   out   )
        return out

##########################################################################################

class FeedForward(nn.Module):
    """ a simple linear layer followed by a non-linearity """
    
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )
        
    def forward(self, x):
        return self.net(x)

##########################################################################################

class Block(nn.Module):
    """ Transformer block: comuunication followed by computation """
    
    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa   = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward( n_embd)
        self.ln1  = nn.LayerNorm(n_embd)
        self.ln2  = nn.LayerNorm(n_embd)
        
    def forward(self, x):
        ## these normalizations (ln1, ln2) are about the only thing different from
        ## the original Vaswani paper. In the paper, they are done at the end of forward
        ## but now they are usually done at the beginning of forward
        x = x + self.sa(     self.ln1(x)      )
        x = x + self.ffwd(   self.ln2(x)      )
        return x
    
##########################################################################################


class BigramLanguageModel(nn.Module):
    
    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)     ## positional encoding 
        self.blocks = nn.Sequential(
                *[   Block(n_embd, n_head=n_head) for _ in range(n_layer)    ]
        )
        self.ln_f    = nn.LayerNorm(  n_embd    )        ## final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)
        
    
    def forward(self, idx, targets=None):
        
        B, T = idx.shape
        
        ## ids and targets are both (B, T) tensor of integers
        tok_emb = self.token_embedding_table(idx)      ## batch, time, embed (4, 8, 32) 
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))      ## (T, C)
        x = tok_emb + pos_emb    ## (B, T, C)
        x = self.blocks(  x  )   ## (B, T, C)        
        x = self.ln_f(x)         ## (B, T, C)
        logits = self.lm_head(x)                 ## (B, T, vocab_sice)
        
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits  = logits.view(B*T, C)
            targets  = targets.view(B*T)
            loss   = F.cross_entropy(logits, targets)
        
        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        
        ## idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            
            ## crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            ## get the predictions
            logits, loss = self(idx_cond)
            ## focus only on last time stamp
            logits = logits[:, -1, :]           ## becomes (B, C)
            ## apply softmax to get probs
            probs = F.softmax(logits, dim= -1)    ## (B, C)
            ## sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)     ## (B, 1)
            ## append sample to the running sequence
            idx = torch.cat(  (idx, idx_next), dim=1  )            ## (B, T+1)
        return idx
            
            
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def calc_compression(model):
    total=0
    total_zero=0
    for p in model.parameters(): 
        if p.requires_grad:
            total+=p.numel()
            total_zero+=(p.numel()-torch.count_nonzero(p).item())
    return total_zero/total

            

def set_zero(model):
    for p in model.parameters(): 
        if p.requires_grad:
            p.data=torch.zeros(p.shape)

def sum_till_n(n):
    sum_val=0
    for i in range(n):
        sum_val+=i
    return sum_val
def update_weights(m_src,m_dest,epochNum):
    sum_val_prev=sum_till_n(epochNum+1)
    sum_val=sum_till_n(epochNum+2)
    # print(sum_val_prev,sum_val)
    # as epochNum starts from 1
    # we consider that the epochnum starts from 0
    print_turn=0
    for p_src,q_dest in zip(m_src.parameters(), m_dest.parameters()):
        
        if p_src.requires_grad and q_dest.requires_grad:
            # print(p_src.data.shape,q_dest.data.shape)
            prev_total=q_dest.data*sum_val_prev
            new_vals=p_src.data*(epochNum+1)
            new_average=(prev_total+new_vals)/sum_val
            q_dest.data=new_average
            # if print_turn==0:
            #     # print(sum_val_prev,sum_val)
            #     print(p_src.data[0][0])
            #     print(new_average[0][0])
            # print_turn+=1
                        




length of data in characters

 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [3]:
######################################################################

n_embd  = 384                  ## every id gets embedded to vector of this size
n_head  = 6
n_layer = 6
dropout = 0.0
block_size = 256      ## max content length for predictions
learning_rate = 3e-4             ## 0.001
model   = BigramLanguageModel()
m = model.to(device)

######################################################################

optimizer = torch.optim.Adam(  m.parameters(), lr=learning_rate   )

######################################################################
print(f"The model has {count_parameters(model):,} trainable parameters")            



The model has 10,788,929 trainable parameters


In [2]:
# print(calc_compression(model))
# for p in model.parameters(): 
#     if p.requires_grad:
#         p.data=torch.zeros(p.shape)

print(calc_compression(model))

0.0004627892166126962


In [3]:

batch_size = 64 
max_iters  = 5000
eval_interval = 10


eval_iters = 200
vocab_size = 65




In [4]:
# print("Parameters and their sizes:")
# for name, param in model.named_parameters():
#     print(f"{name}: {param.size()}")


# for p in model.parameters():
#     if p.requires_grad:
#         print(p.shape)

In [5]:
model_clone = BigramLanguageModel()
model_clone.load_state_dict(copy.deepcopy(m.state_dict()))



set_zero(model_clone)
model_clone=model_clone.to(device)
print(calc_compression(model_clone))

1.0


In [None]:
for iter in range(max_iters):
    # if iter % eval_interval == 0:
    #     losses = estimate_loss()
    #     print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
    # start=time.time()
    xb, yb = get_batch('train')
    
    ## evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)   ## zero out
    loss.backward()
    optimizer.step()
 
    update_weights(m,model_clone,iter)
    # end=time.time()
    # print("one iter",end-start)
    


In [None]:
# Specify the path to save the model parameters
model_path = f'model_parameters_{max_iters}.pth'

# Save the model parameters
torch.save(m.state_dict(), model_path)


In [None]:
# Specify the path to save the model parameters
model_path = f'model_gradient_parameters_{max_iters}.pth'

# Save the model parameters
torch.save(model_clone.state_dict(), model_path)


In [17]:
################################################################
#### now, regenerate after some training


## Kick off generation with some starting token. In this case id 0

context = torch.zeros(  (2, 2),  dtype=torch.long, device=device   )

gen_text = m.generate(context, max_new_tokens=500)[0].tolist()

print(  decode(gen_text)   )











Second Citizen:
Ay, buckins, service; for we will do this,
Your presented where he we shall be breathed him.

First Murderer:
I think his face, I have dream'd offended.

Second Murderer:
Ho, ho! a poor gentlewoman: what he doth good
With one of you all this convoice, 'tis no limp
So take with him before as the gates all,
As 'tis like i' the gap of lime?

Second Musician:
Pray it well.

Prieve it is your noise, Juliet.
Take you not these partnical nice your parts.

Huntsman:
I truly, through
y wi


In [18]:
context[0][0]=1
print(context)
gen_text = m.generate(context, max_new_tokens=500)[0].tolist()

print(  decode(gen_text)   )

tensor([[1, 0],
        [0, 0]], device='cuda:0')
 
BUCKINGHAM:
Then, his life is better in the face,
When I must speak alour for him, your are none.

BLUCKIO:
One word, O, would I were such the gross
So moodesty achieved a screw.

DUKE VINCENTIO:
'Tis most so lovely her?

DUKE VINCENTIO:
Loved her, on my lord, and Montague,
To this worthy servant with herself companies
To taught up her before to lose her proud.

ISABELLA:

All her both.

DUKE VINCENTIO:
'Tis not men to her; this is the Tower.

LUCIO:
Not a hury business, but I shall quit his foo


In [25]:
losses = estimate_loss(only_validation=True)
print(f" val loss {losses}")


split val
 val loss {'val': tensor(2.0060)}
