In [386]:
import torch
from torch import nn
import pytorch_warmup as warmup


In [387]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [3]:
#Data

In [3]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2024-06-17 01:04:02--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2024-06-17 01:04:02 (20.5 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [448]:
with open('data/input.txt', 'r', encoding='utf-8') as f:
    text = f.read()


#Train BPE

# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string


In [446]:
#Hyperparameters

block_size = 32
batch_size = 16
embeddings_dims = 64
attn_dropout = 0.1,
no_of_heads = 4,
dropout = 0.1
epochs = 30
max_lr = 2.5e-4
no_of_decoder_layers = 4
attn_dropout = 0.1

# Train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    idx = torch.stack([data[i:i+block_size] for i in ix])
    targets = torch.stack([data[i+1:i+block_size+1] for i in ix])
    idx, targets = idx.to(device), targets.to(device)
    return idx, targets

In [450]:
idx, targets = get_batch('test')

In [451]:
idx.shape

torch.Size([16, 32])

In [460]:
idx[0]

tensor([21,  1, 51, 59, 57, 58,  1, 40, 43,  1, 45, 53, 52, 43,  8,  0,  0, 24,
        33, 15, 17, 26, 32, 21, 27, 10,  0, 18, 39, 47, 58, 46],
       device='cuda:0')

In [461]:
targets[0]

tensor([ 1, 51, 59, 57, 58,  1, 40, 43,  1, 45, 53, 52, 43,  8,  0,  0, 24, 33,
        15, 17, 26, 32, 21, 27, 10,  0, 18, 39, 47, 58, 46,  6],
       device='cuda:0')

In [391]:
len(train_data) / 64

15685.21875

In [392]:
# Text embeddings
class TextEmbeddings(nn.Module):
    def __init__(
        self,
        vocab_size: int = vocab_size,
        embeddings_dims = embeddings_dims
    ):
        super().__init__()
        self.embeddings_table = nn.Embedding(num_embeddings = vocab_size, embedding_dim=embeddings_dims, device=device) #Just a look up table to convert the toekns_ids to some numbers
        # nn.init.normal_(self.embeddings_table.weight.data, mean=0, std=0.02)
        
    def forward(self, x):
        return self.embeddings_table(x)

In [393]:
#Position embeddings
class PositionEmbeddings(nn.Module):
    def __init__(
        self,
        embeddings_dims = embeddings_dims,
        block_size = block_size
    ):
        super().__init__()
        
        self.position_embeddings = nn.Embedding(num_embeddings = block_size, embedding_dim=embeddings_dims, device=device) #To give positional embeddings to each token of the input text, hence num_embeddings=block_size
        # nn.init.normal_(self.position_embeddings.weight.data, mean=0, std=0.02)
        
    def forward(self, x):
        return self.position_embeddings(x)

In [394]:
#Layer Normalization

class LayerNormalization(nn.Module):
    def __init__(
        self,
        embeddings_dims = embeddings_dims
    ):
        super().__init__()
        
        self.layer_norm = nn.LayerNorm(normalized_shape=embeddings_dims,device=device)
        
    def forward(self, x):
        return self.layer_norm(x)

In [395]:
#FeedForward Neural Network

class MLPBlock(nn.Module):
    def __init__(
        self,
        dropout = 0.1,
        embeddings_size = embeddings_dims,
        # inner_dimensional_states: int = 3072
    ):
        super().__init__()
        
        self.mlp = nn.Sequential(
            nn.Linear(device=device, in_features=embeddings_size, out_features= 4 * embeddings_dims),
            nn.GELU(),
            nn.Linear(device=device, in_features= 4 * embeddings_dims, out_features=embeddings_size), 
            nn.Dropout(p = dropout)     
        )
        
    def forward(self, x):
        # mlp_weights_init = self.mlp.apply(weights_init)
        return self.mlp(x)

In [396]:
# #Weights Initilization (for MLP Block)
# def weights_init(m):
#     classname = m.__class__.__name__
#     if classname.find('Linear') != -1:
#         nn.init.normal_(m.weight.data, 0.0, 0.02)  #mean = 0, std = 0.02
        


In [397]:
embeddings_dims // no_of_heads[0]

16

In [398]:
embeddings_dims

64

In [419]:
no_of_heads

(4,)

In [420]:

class AttentionHead(nn.Module):
    def __init__(
        self,
        attn_dropout = 0.1,
        embeddings_dims = embeddings_dims,
        no_of_heads = no_of_heads,
    ):
        super().__init__()
        self.head_size = embeddings_dims // no_of_heads[0]
        self.query = nn.Linear(in_features=embeddings_dims, out_features=self.head_size, device=device, bias=False)
        self.keys = nn.Linear(in_features=embeddings_dims, out_features=self.head_size, device=device, bias=False)
        self.values = nn.Linear(in_features=embeddings_dims, out_features=self.head_size, device=device, bias=False)
        self.dropout = nn.Dropout(p = attn_dropout)
        
        
    def forward(self, x):
        batch, block_size, head_size = x.shape
        masked_table = torch.tril(torch.ones(block_size, block_size, device=device))
        weights = self.query(x) @ torch.transpose(self.keys(x), dim0=-2, dim1=-1)
        masked_values = weights.masked_fill(masked_table[: block_size, : block_size] == 0, float('-inf'))
        scaled_weights = masked_values * head_size ** -0.5
        weights_normalized = nn.functional.softmax(scaled_weights, dim=-1) #Normalize along the embeddings dimension for all the tokens
        weights_normalized = self.dropout(weights_normalized)
        value_vector = weights_normalized @ self.values(x)
        return value_vector

In [421]:
# MHA

class MHA(nn.Module):
    def __init__(
        self,
        attn_dropout = 0.1,
        embeddings_dims = embeddings_dims,
        no_of_heads = no_of_heads,
    ):
        super().__init__()
        self.heads = nn.ModuleList([AttentionHead(attn_dropout=attn_dropout, embeddings_dims=embeddings_dims, no_of_heads=no_of_heads) for _ in range(no_of_heads[0])])
        self.dropout = nn.Dropout(p = attn_dropout)
        self.linear = nn.Linear(in_features=embeddings_dims, out_features=embeddings_dims, device=device, bias=False) # 12 (no of heads) * (batch_size) 64 = 768 -> gives out the text embeddings 
        
    def forward(self, x):
        concat = torch.cat([head(x) for head in self.heads], dim=-1)
        linear_layer = self.linear(concat)
        out = self.dropout(linear_layer)
        return out

In [422]:
# Decoder Block

class TransformerDecoderBlock(nn.Module):
    def __init__(
        self,
        attn_dropout = 0.1,
        embeddings_dims = embeddings_dims,
        no_of_heads = no_of_heads,
        dropout = 0.1,
        vocab_size = vocab_size
    ):
        super().__init__()
        
        self.mha = MHA(attn_dropout=attn_dropout, embeddings_dims=embeddings_dims, no_of_heads=no_of_heads)
        self.layer_norm1 = LayerNormalization(embeddings_dims=embeddings_dims)
        self.layer_norm2 = LayerNormalization(embeddings_dims=embeddings_dims)
        self.mlp_block = MLPBlock(dropout=dropout, embeddings_size=embeddings_dims)
        self.text_embds = TextEmbeddings(vocab_size=vocab_size, embeddings_dims=embeddings_dims)
        self.pos = PositionEmbeddings(embeddings_dims=embeddings_dims)
        self.apply(self._init_weights)

    def _init_weights(self, module):  #Weight Initialization
            if isinstance(module, nn.Linear):
                torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
                if module.bias is not None:
                    torch.nn.init.zeros_(module.bias)
            elif isinstance(module, nn.Embedding):
                torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
        
    def forward(self, x):
        x = self.mha(x)
        x = self.layer_norm1(x)
        x = self.mlp_block(x) + x
        out = self.layer_norm2(x)
        
        return out

In [436]:
# Decoder Block

class DecoderModel(nn.Module):
    def __init__(
        self,
        attn_dropout = 0.1,
        embeddings_dims = embeddings_dims,
        no_of_heads = no_of_heads,
        block_size = block_size,
        dropout = 0.1,
        no_of_decoder_layers = no_of_decoder_layers,
        vocab_size = vocab_size
    ):
        super().__init__()
        
        self.positional_embeddings = nn.Parameter(torch.randn(1, block_size, embeddings_dims, device=device), requires_grad=True)
        self.text_embds = TextEmbeddings(vocab_size=vocab_size, embeddings_dims=embeddings_dims)
        # self.decoder = TransformerDecoderBlock(attn_dropout=attn_dropout, embeddings_dims=embeddings_dims, no_of_heads=no_of_heads, block_size=block_size, dropout=dropout)
        # self.pos = PositionEmbeddings(block_size=block_size, embeddings_dims=embeddings_dims)
        self.decoder_layers = nn.Sequential(*[TransformerDecoderBlock(attn_dropout=attn_dropout, embeddings_dims=embeddings_dims, no_of_heads=no_of_heads, dropout=dropout, vocab_size=vocab_size) for _ in range(no_of_decoder_layers)])
        
    def forward(self, x):
        x = self.text_embds(x)
        x = x + self.positional_embeddings
        # pos_emb = self.pos(torch.arange(block_size, device=device)) # (T,C)
        # x = x + pos_emb # (B,T,C)
        out = self.decoder_layers(x)
        return out

In [472]:
#Instantiating the model
model = DecoderModel(attn_dropout=attn_dropout, embeddings_dims=embeddings_dims, no_of_heads=no_of_heads, block_size=block_size, dropout=dropout, no_of_decoder_layers=no_of_decoder_layers, vocab_size=vocab_size)
model = model.to(device)

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [456]:
#Printing a summary of the architecture
from torchinfo import summary
idx, targets = get_batch('test')
idx = idx.to(device)
summary(model=model,
        input_data=idx,
        col_names=["input_size", "output_size", "num_params", "trainable"],
        col_width=20,
        row_settings=["var_names"])

Layer (type (var_name))                            Input Shape          Output Shape         Param #              Trainable
DecoderModel (DecoderModel)                        [16, 32]             [16, 32, 64]         2,048                True
├─TextEmbeddings (text_embds)                      [16, 32]             [16, 32, 64]         --                   True
│    └─Embedding (embeddings_table)                [16, 32]             [16, 32, 64]         4,160                True
├─Sequential (decoder_layers)                      [16, 32, 64]         [16, 32, 64]         --                   True
│    └─TransformerDecoderBlock (0)                 [16, 32, 64]         [16, 32, 64]         6,208                True
│    │    └─MHA (mha)                              [16, 32, 64]         [16, 32, 64]         16,384               True
│    │    └─LayerNormalization (layer_norm1)       [16, 32, 64]         [16, 32, 64]         128                  True
│    │    └─MLPBlock (mlp_block)           

In [469]:
# Test the MHA module with a sample input tensor

# temp_data = torch.randn(batch_size, block_size, embeddings_dims).to(device)
# model = DecoderModel()
# model = MHA()
idx = idx.to(device)
output = model(idx)
B,T,C = output.shape
print(output.shape)
print(output.view(B*T, C).shape)

torch.Size([16, 32, 64])
torch.Size([512, 64])


In [465]:
output[0]
targets.shape

torch.Size([16, 32])

In [475]:
# Optimizer setup and scheduler steup

optimizer = torch.optim.Adam(model.parameters(), lr=max_lr)
loss_ce = nn.CrossEntropyLoss()
initial_iters = 2000
total_steps = 10000
eval_iters = 300
# warmup_scheduler = warmup.LinearWarmup(optimizer, warmup_period=2000)
# lr_scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer=optimizer, T_max= total_steps - initial_iters)
# lr_scheduler_linear = torch.optim.lr_scheduler.LinearLR(optimizer=optimizer, total_iters=initial_iters)

@torch.inference_mode()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            idx, targets = get_batch(split=split)
            logits = model(idx)
            batch_size, block_size, embeddings_dims = logits.shape
            logits = logits.view(batch_size*block_size, embeddings_dims) # Total tokens(words) => batch_size * block_size
            targets = targets.view(batch_size * block_size)
            loss = loss_ce(logits, targets)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [476]:
#Train the  model
model.train()
loss_values = []
for step in range(total_steps):
    
    if step % 300 == 0 or step == total_steps -1 :
        loss_values = estimate_loss()
        print("Train Loss (over 300 iterations): ", loss_values['train'], "Val Loss (over 300 iterations): ", loss_values['val'])
        
    idx, targets = get_batch(split='train')
    logits = model(idx)
    batch_size, block_size, embeddings_dims = logits.shape
    logits = logits.view(batch_size*block_size, embeddings_dims)
    targets = targets.view(batch_size * block_size)
    loss = loss_ce(logits, targets)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
#Predictions


In [67]:
import torch

tensor1 = torch.tensor([[[1, 2, 3], [3,4,5], [6,8,9]]])
tensor2 = torch.tensor([[[1, 2, 3], [3,4,5], [6,8,9]]])

concatenated = torch.cat([tensor1, tensor2], dim=-1)
print(concatenated)
# Output: tensor([[1, 2, 3],
#                 [4, 5, 6]])
print(tensor1.shape)
print(concatenated.shape)
# Output: torch.Size([2, 3])


tensor([[[1, 2, 3, 1, 2, 3],
         [3, 4, 5, 3, 4, 5],
         [6, 8, 9, 6, 8, 9]]])
torch.Size([1, 3, 3])
torch.Size([1, 3, 6])
