In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from dataclasses import dataclass
import math
from IPython.display import display, Image
import tiktoken
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from typing import TypedDict

In [2]:
# Show whether PyTorch can see a CUDA device in this kernel.
torch.cuda.is_available()

True

In [3]:
# Render a remotely hosted reference image inline, sized 400x500.
url = "https://substackcdn.com/image/fetch/w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F81c2aa73-dd8c-46bf-85b0-90e01145b0ed_1422x1460.png"
display(Image(url=url,
              width=400,
              height=500))

In [4]:
# class Config(TypedDict):
#     n_layers:int = 12
#     d_model:int = 768
#     eps:float = 1e-5
#     hidden_size_multiplier:int = 4 
#     num_heads:int = 12
#     context_len:int = 1024
#     dropout:float = 0.1
#     qkv_bias:bool = False
#     vocab_size:int = 50257
    
# config = Config({
#     "n_layers": 12,
#     "d_model": 768,
#     "eps": 1e-5,
#     "hidden_size_multiplier": 4,
#     "num_heads": 12,
#     "context_len": 1024,
#     "dropout": 0.1,
#     "qkv_bias": False,
#     "vocab_size": 50257
# })
# Hyperparameters consumed by every layer constructor in this notebook.
config = dict(
    n_layers=12,               # number of stacked TransformerBlock instances
    d_model=768,               # embedding / residual-stream width
    eps=1e-5,                  # numerical-stability term used by LayerNormalization
    hidden_size_multiplier=4,  # feed-forward hidden width = multiplier * d_model
    num_heads=12,              # attention heads (d_model must be divisible by this)
    context_len=1024,          # size of the positional table and causal mask
    dropout=0.1,               # dropout probability shared by all dropout layers
    qkv_bias=True,             # whether the query/key/value projections carry a bias
    vocab_size=50257,          # rows of the token embedding / width of the logits
)

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [5]:
# Quick look at the configured maximum sequence length.
config['context_len']

1024

In [6]:
# Notebook-wide tokenizer using tiktoken's 'gpt2' encoding; also the default tokenizer of generate().
tokenizer = tiktoken.get_encoding('gpt2')

In [7]:
# Layer Normalization layer
class LayerNormalization(nn.Module):
    """Layer normalisation over the last dimension with a learnable scale and shift.

    Each feature vector is normalised as ``(x - mean) / sqrt(var + eps)`` using
    the biased (population) variance, then multiplied by ``scale`` and offset by
    ``shift``. This is the same formula ``torch.nn.LayerNorm`` uses, so weights
    trained against standard layer norm reproduce their original activations.

    Args:
        config: mapping providing ``'eps'`` (stability term added to the
            variance) and ``'d_model'`` (size of the normalised dimension).
    """

    def __init__(self, config):
        super().__init__()
        self.eps = config['eps']
        self.scale = nn.Parameter(torch.ones(config['d_model']))
        self.shift = nn.Parameter(torch.zeros(config['d_model']))

    def forward(self, x: torch.Tensor):
        """Normalise ``x`` along its last dimension and apply scale/shift."""
        x_mean = x.mean(dim=-1, keepdim=True)
        # Biased variance: the Bessel-corrected estimate differs from standard
        # layer norm and is NaN when the normalised dimension has one element.
        x_var = x.var(dim=-1, keepdim=True, unbiased=False)
        # eps goes inside the square root so a zero-variance row stays finite.
        x_norm = (x - x_mean) / torch.sqrt(x_var + self.eps)
        return x_norm * self.scale + self.shift

In [8]:
# FeedForward Layer
class FeedForwardLayer(nn.Module):
    """Position-wise feed-forward network: expand, GELU, project back.

    Args:
        config: mapping providing ``'d_model'`` (input/output width) and
            ``'hidden_size_multiplier'`` (hidden width as a multiple of
            ``d_model``).
    """

    def __init__(self, config):
        super().__init__()
        width = config['d_model']
        hidden_width = config['hidden_size_multiplier'] * width

        # Built in forward order so parameter initialisation order (and the
        # ff_layer.0 / ff_layer.2 parameter names) stay as they were.
        expand = nn.Linear(width, hidden_width)
        activation = nn.GELU()
        contract = nn.Linear(hidden_width, width)
        self.ff_layer = nn.Sequential(expand, activation, contract)

    def forward(self, x):
        """Apply the network to the last dimension of ``x``."""
        hidden = self.ff_layer(x)
        return hidden

In [9]:
class SelfAttentionLayer(nn.Module):
    """Multi-head self-attention in which a position cannot attend to later ones.

    Args:
        config: mapping providing ``'d_model'``, ``'num_heads'`` (must divide
            ``d_model``), ``'qkv_bias'`` (bias on the q/k/v projections),
            ``'dropout'`` (applied to the attention weights) and
            ``'context_len'`` (side length of the precomputed mask).
    """

    def __init__(self, config):
        super().__init__()
        d_model = config['d_model']
        use_bias = config['qkv_bias']

        self.query_weights = nn.Linear(d_model, d_model, bias=use_bias)
        self.key_weights = nn.Linear(d_model, d_model, bias=use_bias)
        self.value_weights = nn.Linear(d_model, d_model, bias=use_bias)
        self.out_proj = nn.Linear(d_model, d_model)

        self.num_heads = config['num_heads']
        assert d_model % self.num_heads == 0, "d_model should be divisible by num_heads"
        self.h_dmodel = d_model // self.num_heads  # per-head width

        # Large negative fill value for masked scores (softmax sends it to ~0).
        self.neg_inf = - 1e+5
        self.drop_out = torch.nn.Dropout(config['dropout'])

        # True strictly above the diagonal, i.e. where key index > query index.
        context_len = config['context_len']
        future_positions = torch.triu(torch.ones(context_len, context_len), diagonal=1).bool()
        self.register_buffer('casual_mask', tensor=future_positions)

    def forward(self, x):
        """Attend over ``x`` of shape (B, S, d_model).

        Returns:
            A tuple ``(output, attention_weights)``: output is (B, S, d_model)
            after the output projection, and the weights are
            (B, num_heads, S, S) after softmax and dropout.
        """
        queries = self.query_weights(x)
        keys = self.key_weights(x)
        values = self.value_weights(x)
        batch_size, seq_len, _ = x.size()

        def split_heads(projected):
            # (B, S, d_model) -> (B, S, num_heads, h_dmodel) -> (B, num_heads, S, h_dmodel)
            per_head = projected.view(batch_size, seq_len, self.num_heads, self.h_dmodel)
            return per_head.transpose(1, 2)

        queries, keys, values = split_heads(queries), split_heads(keys), split_heads(values)

        # Crop the precomputed mask to the current sequence length.
        mask = self.casual_mask[:seq_len, :seq_len]
        attention_scores = self.calculate_attention_score(queries, keys, mask)
        per_head_context = attention_scores @ values

        # (B, num_heads, S, h_dmodel) -> (B, S, num_heads, h_dmodel) -> (B, S, d_model)
        merged = per_head_context.transpose(1, 2).reshape(
            batch_size, seq_len, self.num_heads * self.h_dmodel)
        return (self.out_proj(merged), attention_scores)

    def calculate_attention_score(self, qeury, key, mask):
        """Scaled dot-product weights with ``mask`` positions suppressed.

        (B, num_heads, S, h_dmodel) @ (B, num_heads, h_dmodel, S) -> (B, num_heads, S, S)
        """
        scale = math.sqrt(key.size(-1))
        raw_scores = torch.matmul(qeury, key.transpose(-1, -2)) / scale
        masked_scores = raw_scores.masked_fill(mask, self.neg_inf)
        weights = masked_scores.softmax(dim=-1)
        return self.drop_out(weights)

In [10]:
class GPTDataset(Dataset):
    """Sliding-window next-token dataset built from one text string.

    The text is tokenised once. Each sample is a window of ``max_len`` token ids
    paired with the same window shifted one position to the right.

    Args:
        text: raw text to tokenise.
        tokenizer: object whose ``encode(text, allowed_special=...)`` returns a
            sequence of token ids.
        max_len: number of tokens per sample.
        stride: distance between the starts of consecutive windows.
    """

    def __init__(self, text, tokenizer, max_len, stride):
        super().__init__()
        token_ids = tokenizer.encode(text, allowed_special={'<|endoftext|>'})

        # Stop early enough that every window still has a full shifted target.
        window_starts = range(0, len(token_ids) - max_len, stride)
        self.input_ids = [
            torch.tensor(token_ids[start: start + max_len]) for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1: start + max_len + 1]) for start in window_starts
        ]

    def __len__(self):
        """Number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return the ``(input_ids, target_ids)`` pair at ``index``."""
        inputs = self.input_ids[index]
        targets = self.target_ids[index]
        return inputs, targets

In [11]:
def create_dataloader(txt, batch_size=4, max_len=256,
                      stride=256, shuffle=True,
                      drop_last=True, num_workers=0):
    """Tokenise ``txt`` with the 'gpt2' encoding and wrap it in a DataLoader.

    Args:
        txt: raw text to turn into (input, target) windows.
        batch_size: samples per batch.
        max_len: tokens per sample.
        stride: distance between the starts of consecutive windows.
        shuffle: whether the loader shuffles samples.
        drop_last: whether a final, smaller batch is dropped.
        num_workers: worker processes used by the loader.

    Returns:
        A ``DataLoader`` over a ``GPTDataset`` with ``pin_memory=True``.
    """
    windows = GPTDataset(
        text=txt,
        tokenizer=tiktoken.get_encoding('gpt2'),
        max_len=max_len,
        stride=stride,
    )
    return DataLoader(
        dataset=windows,
        batch_size=batch_size,
        shuffle=shuffle,
        pin_memory=True,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [12]:
class TransformerBlock(nn.Module):
    """One transformer block: attention and feed-forward, each normalised first.

    Both sub-layers follow the pattern ``x + dropout(sublayer(layer_norm(x)))``.

    Args:
        config: hyperparameter mapping forwarded unchanged to the attention,
            layer-norm and feed-forward constructors; ``'dropout'`` also sets
            the residual dropout probability.
    """

    def __init__(self,config):
        super().__init__()
        self.attention = SelfAttentionLayer(config)
        self.layer_norm1 = LayerNormalization(config)
        self.layer_norm2 = LayerNormalization(config)
        self.feedforward = FeedForwardLayer(config)
        self.dropout = nn.Dropout(config['dropout'])

    def forward(self, x):
        """Run both residual sub-layers and return a tensor shaped like ``x``."""
        # Attention sub-layer; the attention weights are not needed here.
        attended, _weights = self.attention(self.layer_norm1(x))
        x = x + self.dropout(attended)

        # Feed-forward sub-layer.
        transformed = self.feedforward(self.layer_norm2(x))
        return self.dropout(transformed) + x

In [13]:
class GPTModel(nn.Module):
    """Decoder-only GPT-style language model.

    Token and learned positional embeddings are summed, passed through
    ``n_layers`` transformer blocks and a final layer norm, then projected to
    vocabulary logits.

    Args:
        config: mapping providing ``'vocab_size'``, ``'context_len'``,
            ``'d_model'``, ``'dropout'`` and ``'n_layers'``; it is also
            forwarded to every block and to the final layer norm.
    """

    def __init__(self, config):
        super().__init__()
        self.token_embedding = nn.Embedding(config['vocab_size'], embedding_dim=config['d_model'])
        self.pos_embedding = nn.Embedding(config['context_len'], embedding_dim=config['d_model'])
        self.dropout = nn.Dropout(config['dropout'])

        self.decoder_block = nn.Sequential(
            *[TransformerBlock(config) for _ in range(config['n_layers'])]
            )

        self.final_layernorm = LayerNormalization(config)
        self.final_linear = nn.Linear(config['d_model'], config['vocab_size'])

    def forward(self, x):
        """Map token ids of shape (B, S) to logits of shape (B, S, vocab_size)."""
        _, seq_len = x.size()
        token_embedding = self.token_embedding(x)
        positions = torch.arange(0, seq_len, device=x.device)
        pos_embedding = self.pos_embedding(positions)

        # Embedding dropout belongs on the summed embeddings before the first
        # block. Applying it after the last block instead would randomly zero
        # features of the final representation right before the output head.
        x = self.dropout(token_embedding + pos_embedding)
        x = self.decoder_block(x)
        x = self.final_layernorm(x)
        logits = self.final_linear(x)
        return logits

In [14]:
# model = GPTModel(config=config)
# model.token_embedding.weight

In [15]:
# torch.manual_seed(10)
# x = torch.randint(high=10,size=(1,5),dtype=torch.int)

# with torch.no_grad():
#     out = model(x)
# out

In [16]:

# # Generate text
# tokenizer = tiktoken.get_encoding('gpt2')
# def genarate_text(input, max_len, context_len):
#     model.eval()
#     for _ in range(max_len):
#         input = input[:, : context_len]
        
#         with torch.no_grad():
            
#             logits = model(input)
#             last_token_logits = logits[:, -1, :]
#             last_token_probs = torch.softmax(last_token_logits, dim=-1)        
#             top_prob_token = torch.argmax(last_token_probs, dim=-1, keepdim=True)
#             input = torch.cat([input, top_prob_token], dim=-1)
    
#     return input

# input = 'my name is'
# input = tokenizer.encode(input)
# input = torch.tensor(input).unsqueeze(0)
# print(input[:, :10])
# print(f'initial input : {input}')
# out = genarate_text(input, max_len=5, context_len=1024)
# print('out token', out)
# tokenizer.decode(out.squeeze().tolist())

In [17]:
# Read dataset
# Decode explicitly as UTF-8 so the text (and therefore the token ids) does not
# depend on the platform's default encoding.
# NOTE(review): assumes theverdict.txt is stored as UTF-8; confirm.
with open('theverdict.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [18]:
# dataloader = create_dataloader(text, batch_size=4, num_workers=0)
# for batch, target in dataloader:
#     break

In [19]:

# model = GPTModel(config)
# input = torch.tensor(tokenizer.encode('hai')).unsqueeze(0)
# with torch.no_grad():
#     print(model(input))

In [20]:
def generate(model,
            starting_context:str='i am a good',
            tokenizer=tokenizer,
            max_len=10,
            sampling=True,
            temperature=0.0,
            top_k=None,
            eos_id=None):
    """Autoregressively extend ``starting_context`` and return the decoded text.

    Args:
        model: callable mapping token ids (1, S) to logits (1, S, vocab).
        starting_context: prompt text; must encode to at least one token.
        tokenizer: object with ``encode(str)`` and ``decode(list[int])``.
        max_len: maximum number of new tokens to append.
        sampling: draw from the softmax distribution when True, otherwise pick
            the highest-scoring token.
        temperature: when > 0, logits are divided by it before sampling.
        top_k: when set, only the ``top_k`` highest logits stay eligible.
        eos_id: when set, generation stops as soon as this token id is
            produced; the end-of-sequence token itself is not appended.

    Returns:
        The prompt plus the generated continuation, decoded to a string.

    Raises:
        AssertionError: if ``temperature``/``top_k`` are given with
            ``sampling=False``.
        ValueError: if the prompt encodes to zero tokens.
    """
    if not sampling:
        assert temperature==0.0 and top_k is None, "You can't set temperature or topk if sampling=False"

    prompt_ids = tokenizer.encode(starting_context)
    if len(prompt_ids) == 0:
        raise ValueError("starting_context must encode to at least one token")

    # Put the inputs on the model's own device; fall back to the notebook-wide
    # device only for a model without parameters.
    try:
        model_device = next(model.parameters()).device
    except StopIteration:
        model_device = device
    input_ids = torch.tensor(prompt_ids, dtype=torch.long).unsqueeze(0).to(model_device)

    # If the model exposes a positional embedding table, never feed it more
    # positions than the table has rows.
    context_len = getattr(getattr(model, 'pos_embedding', None), 'num_embeddings', None)

    # Restore the caller's train/eval mode afterwards so that sampling in the
    # middle of training does not leave dropout disabled.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for _ in range(max_len):
                context = input_ids if context_len is None else input_ids[:, -context_len:]
                logits = model(context)[:, -1, :]

                if sampling:
                    if top_k:
                        k = min(top_k, logits.size(-1))
                        # Smallest logit that is still inside the top-k set.
                        kth_logit = torch.topk(logits, k=k, dim=-1).values[:, -1:]
                        logits = logits.masked_fill(logits < kth_logit, float('-inf'))
                    if temperature>0.0:
                        logits = logits / temperature

                    probas = F.softmax(logits, dim=-1)
                    idx_next = torch.multinomial(probas, num_samples=1)
                else:
                    idx_next = torch.argmax(logits, dim=-1, keepdim=True)

                if eos_id is not None and bool((idx_next == eos_id).all()):
                    break
                input_ids = torch.cat([input_ids, idx_next], dim=-1)
    finally:
        model.train(was_training)

    # squeeze(0) drops only the batch axis, so a one-token result stays a list.
    return tokenizer.decode(input_ids.squeeze(0).tolist())

In [21]:
# Training
def train(model, train_dataloader:DataLoader, val_dataloader, optimizer:AdamW, epochs, val_freq:int, num_iter:int, device_man:str):
    """Train ``model`` and periodically report train/validation loss.

    Args:
        model: network to optimise.
        train_dataloader: yields ``(batch, target)`` pairs for training.
        val_dataloader: yields ``(batch, target)`` pairs for validation.
        optimizer: optimiser already bound to ``model``'s parameters.
        epochs: number of passes over ``train_dataloader``.
        val_freq: evaluate whenever the step counter is a multiple of this.
        num_iter: forwarded to ``evaluate_model``.
        device_man: explicit device for the model and training batches; when
            falsy, the notebook-wide ``device`` is used.

    Returns:
        ``(train_loss_history, val_loss_history)``: one entry per evaluation,
        accumulated across all epochs.
    """
    # Resolve the device once so the model and every batch end up together.
    target_device = device_man if device_man else device
    model.to(target_device)

    global_step = 1
    train_loss_history = []
    val_loss_history = []
    for epoch in range(epochs):
        model.train()

        for batch, target in train_dataloader:
            batch, target = batch.to(target_device), target.to(target_device)
            loss = calculate_batch_loss(model, batch, target)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            global_step+=1

            # validation
            if global_step % val_freq == 0:
                train_loss, val_loss = evaluate_model(model, train_dataloader, val_dataloader, num_iter)
                print(f"epoch: {epoch}: train_loss: {train_loss:.3f}, val_loss:{val_loss:.3f}")
                train_loss_history.append(train_loss)
                val_loss_history.append(val_loss)
                print("sample generation: ", generate(model))
                # Sampling may leave the model in eval mode; switch back so
                # dropout is active for the remaining batches of this epoch.
                model.train()

    return train_loss_history, val_loss_history
            
 
def calculate_batch_loss(model, batch, target):
    """Cross-entropy between ``model(batch)`` and ``target`` over all tokens.

    The first two dimensions of the logits are merged and ``target`` is
    flattened, so every position counts as one classification example.

    Returns:
        A scalar loss tensor (mean over positions).
    """
    per_token_logits = torch.flatten(model(batch), start_dim=0, end_dim=1)
    per_token_target = torch.flatten(target)
    return torch.nn.functional.cross_entropy(input=per_token_logits, target=per_token_target)

def evaluate_model(model, train_dataloader, val_dataloader, num_iter):
    """Measure loss on the training and validation loaders without gradients.

    The model is switched to eval mode for the measurement and back to train
    mode afterwards.

    Returns:
        ``(train_loss, val_loss)`` as computed by ``calculate_dataloader_loss``.
    """
    model.eval()
    with torch.no_grad():
        losses = [
            calculate_dataloader_loss(model, loader, num_iter=num_iter)
            for loader in (train_dataloader, val_dataloader)
        ]
    model.train()
    return tuple(losses)
               
def calculate_dataloader_loss(model, dataloader, num_iter):
    """Average cross-entropy of ``model`` over at most ``num_iter`` batches.

    Args:
        model: network mapping a batch of token ids to per-token logits.
        dataloader: iterable of ``(batch, target)`` pairs.
        num_iter: maximum number of batches to evaluate; ``None`` evaluates
            every batch.

    Returns:
        The mean loss over the batches actually evaluated, or ``nan`` when no
        batch was evaluated (empty loader or ``num_iter <= 0``).
    """
    model.eval()

    # Evaluate on the model's own device; fall back to the notebook-wide device
    # only for a model without parameters.
    try:
        target_device = next(model.parameters()).device
    except StopIteration:
        target_device = device

    total_loss = 0.0
    batches_seen = 0
    with torch.no_grad():
        for batch, target in dataloader:
            if num_iter is not None and batches_seen >= num_iter:
                break
            batch, target = batch.to(target_device), target.to(target_device)
            out = model(batch)
            loss = torch.nn.functional.cross_entropy(out.flatten(0, 1), target=target.flatten())
            total_loss += loss.item()
            batches_seen += 1

    if batches_seen == 0:
        return float('nan')
    # Divide by the number of batches that were summed, not by the requested
    # cap, so the result is a true mean.
    return total_loss / batches_seen



In [22]:
# split data
# Tokenise once, then cut the token stream 80/20 into train and validation.
token_ids = tokenizer.encode(text)
split_at = int(0.8 * len(token_ids))
train_data, val_data = token_ids[:split_at], token_ids[split_at:]

# Turn each part back into text because create_dataloader expects raw text.
train_text, val_text = tokenizer.decode(train_data), tokenizer.decode(val_data)

In [23]:
# Build a randomly initialised GPTModel from `config` and move it to `device`.
model = GPTModel(config)
model.to(device);

In [24]:
# Non-overlapping 256-token windows (stride == max_len) for both splits;
# the validation loader uses a smaller batch size.
train_dataloader = create_dataloader(txt=train_text, batch_size=4, max_len=256, stride=256)
val_dataloader = create_dataloader(txt=val_text, batch_size=2, max_len=256, stride=256)

In [25]:
# NOTE(review): the notebook imports AdamW and train() annotates its optimizer
# as AdamW, but plain Adam is created here — confirm which one is intended.
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)

In [26]:
# train(model=model,
#       train_dataloader=train_dataloader,
#       val_dataloader=val_dataloader,
#       optimizer=optimizer,
#       epochs=15,
#       val_freq=2,
#       num_iter=5,
#       device_man=None)

In [27]:
# Sample 20 tokens from `model`; the training call above is commented out, so
# unless it was run separately this samples from untrained weights.
generate(model=model, starting_context="hey", tokenizer=tokenizer, max_len=20, temperature=.8, top_k=20)

'heyInternational Daysilli grams\'." Originallystem Eag delayMagazine baffulatesBuilt scourge tallest pioneering ANG perhaps neuronal spontaneous'

In [28]:
# Show which device was selected in the configuration cell.
device

device(type='cuda')

In [29]:
# torch.save(model.state_dict(), 'model.pth')

In [30]:
from gpt_download import download_and_load_gpt2

In [31]:
# Load the 124M GPT-2 checkpoint from the local 'gpt2' directory (downloading
# it when missing, judging by the "already exists" messages printed below).
# NOTE(review): `settigns` looks like a typo for `settings`; left unchanged in
# case later cells refer to it under this name.
settigns, params = download_and_load_gpt2(model_size='124M', models_dir='gpt2')

File already exists and is up-to-date: gpt2\124M\checkpoint
File already exists and is up-to-date: gpt2\124M\encoder.json
File already exists and is up-to-date: gpt2\124M\hparams.json
File already exists and is up-to-date: gpt2\124M\model.ckpt.data-00000-of-00001
File already exists and is up-to-date: gpt2\124M\model.ckpt.index
File already exists and is up-to-date: gpt2\124M\model.ckpt.meta
File already exists and is up-to-date: gpt2\124M\vocab.bpe


In [32]:
# Second, freshly initialised model instance, switched to eval mode so its
# dropout layers are inactive.
gpt = GPTModel(config)
gpt.eval();

In [33]:
def assign(left, right):
    """Return a new ``Parameter`` holding ``right``, validated against ``left``.

    Args:
        left: existing parameter that defines the expected shape, dtype and
            device.
        right: array-like (NumPy array or tensor) with the values to load.

    Returns:
        A ``torch.nn.Parameter`` containing an independent, contiguous copy of
        ``right`` with ``left``'s dtype and device.

    Raises:
        ValueError: if the shapes of ``left`` and ``right`` differ.
    """
    if left.shape != right.shape:
        raise ValueError(f"Shape mismatch. Left: {left.shape}, Right: {right.shape}")
    # as_tensor accepts arrays and tensors alike (torch.tensor warns when given
    # a tensor). Casting to left's dtype/device keeps the replaced parameter
    # compatible with the rest of the model.
    values = torch.as_tensor(right, dtype=left.dtype, device=left.device)
    # Always copy so the parameter never aliases the caller's buffer, and make
    # it contiguous because transposed arrays arrive with swapped strides.
    return torch.nn.Parameter(values.detach().clone(memory_format=torch.contiguous_format))

In [34]:
import numpy as np

def load_weights_into_gpt(gpt, params):
    """Copy a GPT-2 parameter dictionary into a ``GPTModel`` in place.

    Args:
        gpt: model whose parameters are replaced.
        params: nested mapping with ``'wpe'``, ``'wte'``, ``'g'``, ``'b'`` and
            a ``'blocks'`` sequence. Each block provides ``attn.c_attn``,
            ``attn.c_proj``, ``mlp.c_fc`` and ``mlp.c_proj`` (each with ``'w'``
            and ``'b'``), plus ``ln_1`` and ``ln_2`` (each with ``'g'`` and
            ``'b'``).

    Raises:
        ValueError: from ``assign`` when a source array's shape does not match
            the parameter it replaces.
    """
    gpt.pos_embedding.weight = assign(gpt.pos_embedding.weight, params['wpe'])
    gpt.token_embedding.weight = assign(gpt.token_embedding.weight, params['wte'])

    for b in range(len(params["blocks"])):
        source = params["blocks"][b]
        block = gpt.decoder_block[b]
        attention = block.attention
        ff_layer = block.feedforward.ff_layer

        # c_attn stores query, key and value side by side along the last axis.
        # Weight matrices are transposed to match the shape of the nn.Linear
        # weights they replace (assign() checks the shapes).
        q_w, k_w, v_w = np.split(source["attn"]["c_attn"]["w"], 3, axis=-1)
        attention.query_weights.weight = assign(attention.query_weights.weight, q_w.T)
        attention.key_weights.weight = assign(attention.key_weights.weight, k_w.T)
        attention.value_weights.weight = assign(attention.value_weights.weight, v_w.T)

        q_b, k_b, v_b = np.split(source["attn"]["c_attn"]["b"], 3, axis=-1)
        attention.query_weights.bias = assign(attention.query_weights.bias, q_b)
        attention.key_weights.bias = assign(attention.key_weights.bias, k_b)
        attention.value_weights.bias = assign(attention.value_weights.bias, v_b)

        attention.out_proj.weight = assign(
            attention.out_proj.weight, source["attn"]["c_proj"]["w"].T)
        attention.out_proj.bias = assign(
            attention.out_proj.bias, source["attn"]["c_proj"]["b"])

        # Feed-forward: index 0 is the expanding linear layer, index 2 the
        # projection back to d_model (index 1 is the activation).
        ff_layer[0].weight = assign(ff_layer[0].weight, source["mlp"]["c_fc"]["w"].T)
        ff_layer[0].bias = assign(ff_layer[0].bias, source["mlp"]["c_fc"]["b"])
        ff_layer[2].weight = assign(ff_layer[2].weight, source["mlp"]["c_proj"]["w"].T)
        ff_layer[2].bias = assign(ff_layer[2].bias, source["mlp"]["c_proj"]["b"])

        block.layer_norm1.scale = assign(block.layer_norm1.scale, source["ln_1"]["g"])
        block.layer_norm1.shift = assign(block.layer_norm1.shift, source["ln_1"]["b"])
        block.layer_norm2.scale = assign(block.layer_norm2.scale, source["ln_2"]["g"])
        block.layer_norm2.shift = assign(block.layer_norm2.shift, source["ln_2"]["b"])

    gpt.final_layernorm.scale = assign(gpt.final_layernorm.scale, params["g"])
    gpt.final_layernorm.shift = assign(gpt.final_layernorm.shift, params["b"])

    # The output projection reuses the token-embedding matrix. Nothing in
    # `params` is loaded into an output bias, so a bias left at its random
    # initialisation would add noise to every logit; zero it instead.
    gpt.final_linear.weight = assign(gpt.final_linear.weight, params["wte"])
    if gpt.final_linear.bias is not None:
        gpt.final_linear.bias = torch.nn.Parameter(
            torch.zeros_like(gpt.final_linear.bias.detach()))
    
    
# Overwrite gpt's parameters with the downloaded values, then move the model
# to `device`.
load_weights_into_gpt(gpt, params)
gpt.to(device);

In [60]:
# Sample 100 tokens from the weight-loaded model. temperature=1.5 divides the
# logits by 1.5 before sampling (a flatter distribution) and no top_k filter is
# applied.
generate(model=gpt, 
         starting_context="""sidu is a ediot""", 
         tokenizer=tokenizer, 
         max_len=100, 
         sampling=True,
         temperature=1.5)

'sidu is a ediotronic erratic smell, Xin Long believed complete cyananthenThis publication describes sneezes from wogan and notion bagbies light brandedinitial Mad horror1Annaiderverse urges Ur Lupusors Eugene Stephenson and his Lakrellian Southern friendshas pictured ficcoberas72 blanch bolt HeFest BarnHummingbirds36botcopryology coGroup ign patient coughle skipping Partscity39β kemia worseNR20134 image licamp voters science CalTwoNotForm olderLyp north picking'