In [2]:
import torch
from torch import nn
from pathlib import Path
from tokenizers import Tokenizer

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
# device = "cpu"

In [4]:
#Data

In [5]:
#Collab setup

data_path = Path('data')
data_path.mkdir(exist_ok=True)
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
!cp input.txt data/input.txt


--2025-02-08 00:57:35--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.2’


2025-02-08 00:57:36 (6.19 MB/s) - ‘input.txt.2’ saved [1115394/1115394]



In [6]:
#Datasets

# Using tinyshakespeare

with open('data/input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

####################################################################

#Using BookCorpus
# from datasets import load_dataset
# data = load_dataset('bookcorpus/bookcorpus')

In [7]:
#Datasets

# Using tinyshakespeare

with open('data/input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

####################################################################

#Using BookCorpus
# from datasets import load_dataset
# data = load_dataset('bookcorpus/bookcorpus')

In [8]:
# # Extracting the content of  the Dataset
# # Open a file for writing
# with open('data/input.txt', 'w', encoding='utf-8') as f:
#     # Traverse the dataset and write text data to the file
#     for record in data['train']['text']:
#         f.write(record)

# print("Writing to file complete.")




In [9]:

#Loading custom trained BPE
# Load the tokenizer
# tokenizer = Tokenizer.from_file("bpe_tokenizer_tinyshakespeare_20k.json")
# vocab_size = tokenizer.get_vocab_size()
# Encode and decode functions
# encode = lambda s: tokenizer.encode(s).ids
# decode = lambda l: tokenizer.decode(l)

###############################################################################
#Character level tokenization

# # here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)


# create a mapping from characters to integers
stoi = { ch: i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string


In [10]:
#Hyperparameters

block_size = 128
batch_size = 64
embeddings_dims = 384
attn_dropout = 0.1
no_of_heads = 6 #IMP needs to be thoroughly calculated
dropout = 0.1
epochs = 100
max_lr = 3e-4
no_of_decoder_layers = 6 #IMP needs to be thoroughly calculated
attn_dropout = 0.1
weight_decay_optim = 0.01
experts=8
top_experts=2

In [11]:
# Train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

In [12]:
# Text embeddings
class TextEmbeddings(nn.Module):
    def __init__(
        self,
        vocab_size = vocab_size,
        embeddings_dims = embeddings_dims
    ):
        super().__init__()
        self.embeddings_table = nn.Embedding(num_embeddings = vocab_size, embedding_dim=embeddings_dims, device=device) #Just a look up table to convert the toekns_ids to some numbers
        # nn.init.normal_(self.embeddings_table.weight.data, mean=0, std=0.02)

    def forward(self, x):
        return self.embeddings_table(x)

In [13]:
#Layer Normalization

class LayerNormalization(nn.Module):
    def __init__(
        self,
        embeddings_dims = embeddings_dims
    ):
        super().__init__()

        self.layer_norm = nn.LayerNorm(normalized_shape=embeddings_dims)

    def forward(self, x):
        return self.layer_norm(x)

In [14]:
class Swish(nn.Module):
    def __init__(
        self,
        block_size: int = block_size,
        embeddings_dims: int = embeddings_dims
    ):
        super().__init__()
        
        self.sig = torch.nn.Sigmoid()
        
        
    def forward(self, x):
        swish = x * self.sig(x)
        
        return swish
         

In [15]:
class SWiGLUExpertMoE(nn.Module):
    def __init__(
        self,
        block_size: int = block_size,
        embeddings_dims: int = embeddings_dims
    ):
        super().__init__()
        
        self.swish = Swish(block_size=block_size, embeddings_dims=embeddings_dims)
        self.linear_layer1 = nn.Linear(in_features=embeddings_dims, out_features=embeddings_dims, device=device, bias=False, dtype=torch.float32)
        self.linear_layer2 = nn.Linear(in_features=embeddings_dims, out_features=embeddings_dims, device=device, bias=False, dtype=torch.float32)
        self.linear_layer3 = nn.Linear(in_features=embeddings_dims, out_features=embeddings_dims, device=device, bias=False, dtype=torch.float32)

        
        
        
    def forward(self, x):
        swish_res = self.swish(self.linear_layer1(x))
        x_V = self.linear_layer2(x)
        res = torch.mul(swish_res, x_V)
        out = self.linear_layer3(res)
        return out
         

In [16]:
# #MoE Neural Network

# class MoeExpert(nn.Module):
#     def __init__(
#         self,
#         dropout = dropout,
#         embeddings_size = embeddings_dims,
#         # inner_dimensional_states: int = 3072
#     ):
#         super().__init__()

#         self.mlp = nn.Sequential(
#             nn.Linear(device=device, in_features=embeddings_size, out_features= 4 * embeddings_dims),
#             nn.GELU(),
#             nn.Linear(device=device, in_features= 4 * embeddings_dims, out_features=embeddings_size),
#             nn.Dropout(p = dropout)
#         )

#     def forward(self, x):
#         # mlp_weights_init = self.mlp.apply(weights_init)
#         return self.mlp(x)

In [45]:
#MoE Layer

class MoeLayer(nn.Module):
    def __init__(
        self,
        dropout = dropout,
        embeddings_size = embeddings_dims,
        # inner_dimensional_states: int = 3072
    ):
        super().__init__()

        self.heads = nn.ModuleList([SWiGLUExpertMoE() for _ in range(experts)])
        self.gate = nn.Linear(in_features=embeddings_dims, out_features=experts)
        self.outputs = torch.zeros((block_size, embeddings_size))
        
    def forward(self, x):
        # mlp_weights_init = self.mlp.apply(weights_init)
        self.gate_out = self.gate(x)
        top_k_values, top_k_indices = torch.topk(self.gate_out, k=top_experts)
        probs = torch.nn.functional.softmax(top_k_values)
        # print(top_k_indices[11])
        # print(top_k_values[20])
        # print(probs[20])
        out = 0
        for batch in range(batch_size):
            for i in range(block_size):
                for j in range(top_experts):
                    # print(i.shape)
                    # print('X batched shape: ', x[batch].shape)
                    # print('X shape: ', x.shape)
                    head_out = self.heads[i](x)
                    
                    # print('Softmax shape: ', torch.nn.functional.softmax(top_k_values[top_k_indices[i]]).shape)
                    # print('Head out shape: ', head_out.shape)
                    self.outputs[batch,i] = probs[top_k_indices[i]]
        print(self.outputs.shape)
        out += head_out * self.outputs
            
        return out
        

In [46]:

class AttentionHead(nn.Module):
    def __init__(
        self,
        attn_dropout = attn_dropout,
        embeddings_dims = embeddings_dims,
        no_of_heads = no_of_heads,
    ):
        super().__init__()
        self.head_size = embeddings_dims // no_of_heads
        self.query = nn.Linear(in_features=embeddings_dims, out_features=self.head_size, device=device, bias=False)
        self.keys = nn.Linear(in_features=embeddings_dims, out_features=self.head_size,device=device, bias=False)
        self.values = nn.Linear(in_features=embeddings_dims, out_features=self.head_size, device=device,bias=False)
        self.dropout = nn.Dropout(p = attn_dropout)


    def forward(self, x):
        batch, block_size, embd_dims = x.shape
        k = self.keys(x)
        q = self.query(x)
        v = self.values(x)
        masked_table = torch.tril(torch.ones(block_size, block_size, device=device))
        weights = q @ torch.transpose(k, dim0=-2, dim1=-1) * (k.shape[-1] ** -0.5)
        masked_values = weights.masked_fill(masked_table[: block_size, : block_size] == 0, float('-inf'))
        weights_normalized = nn.functional.softmax(masked_values, dim=-1) #Normalize along the embeddings dimension for all the tokens
        weights_normalized = self.dropout(weights_normalized)
        out = weights_normalized @ v
        return out



In [47]:
# MHA




class MHA(nn.Module):
    def __init__(
        self,
        attn_dropout = attn_dropout,
        embeddings_dims = embeddings_dims,
        no_of_heads = no_of_heads,
    ):
        super().__init__()
        self.heads = nn.ModuleList([AttentionHead(attn_dropout=attn_dropout, embeddings_dims=embeddings_dims, no_of_heads=no_of_heads) for _ in range(no_of_heads)])
        self.dropout = nn.Dropout(p = attn_dropout)
        self.linear = nn.Linear(in_features=embeddings_dims, out_features=embeddings_dims, device=device, bias=False) # 12 (no of heads) * (batch_size) 64 = 768 -> gives out the text embeddings

    def forward(self, x):
        concat = torch.cat([head(x) for head in self.heads], dim=-1)
        linear_layer = self.linear(concat)
        out = self.dropout(linear_layer)
        return out

In [48]:
# Decoder Block

class TransformerDecoderBlock(nn.Module):
    def __init__(
        self,
        attn_dropout = attn_dropout,
        embeddings_dims = embeddings_dims,
        no_of_heads = no_of_heads,
        dropout = dropout,
        vocab_size = vocab_size
    ):
        super().__init__()

        self.mha = MHA(attn_dropout=attn_dropout, embeddings_dims=embeddings_dims, no_of_heads=no_of_heads)
        self.layer_norm1 = LayerNormalization(embeddings_dims=embeddings_dims)
        self.layer_norm2 = LayerNormalization(embeddings_dims=embeddings_dims)
        self.moe_block = MoeLayer(dropout=dropout, embeddings_size=embeddings_dims)

    def forward(self, x):
        # x = self.mha(x)
        # x = x + self.layer_norm1(x)
        # x = x + self.mlp_block(x)
        # out = self.layer_norm2(x)
        x = x + self.mha(self.layer_norm1(x))  #Very important step -> Layer Norm on input and then passes it to the subsequent blocks
        x = x + self.moe_block(self.layer_norm2(x)) #Very important step

        return x

In [49]:
# Decoder Block

class DecoderModel(nn.Module):
    def __init__(
        self,
        attn_dropout = attn_dropout,
        embeddings_dims = embeddings_dims,
        no_of_heads = no_of_heads,
        block_size = block_size,
        dropout = dropout,
        no_of_decoder_layers = no_of_decoder_layers,
        vocab_size = vocab_size
    ):
        super().__init__()

        self.positional_embeddings = nn.Parameter(torch.randn(1, block_size, embeddings_dims, device=device), requires_grad=True) #To give positional embeddings to each token of the input text, hence num_embeddings=block_size
        torch.nn.init.normal_(self.positional_embeddings, mean=0.0, std=0.02)
        self.text_embds = TextEmbeddings(vocab_size=vocab_size, embeddings_dims=embeddings_dims)
        self.linear_layer = nn.Linear(in_features=embeddings_dims, out_features=vocab_size, device=device, bias=False) # Takes in logits of dimensions- embeds_dims and converts it into dimension of vocab_size (logits in range of vocab_size)
        # self.layer_norm = LayerNormalization(embeddings_dims=embeddings_dims)
        self.decoder_layers = nn.Sequential(*[TransformerDecoderBlock(attn_dropout=attn_dropout, embeddings_dims=embeddings_dims, no_of_heads=no_of_heads, dropout=dropout, vocab_size=vocab_size) for _ in range(no_of_decoder_layers)])
        self.apply(self._init_weights)

    def _init_weights(self, module):  #Weight Initialization
            if isinstance(module, nn.Linear):
                torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
                if module.bias is not None:
                    torch.nn.init.zeros_(module.bias)
            elif isinstance(module, nn.Embedding):
                torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, x):
        x = self.text_embds(x)
        x = x + self.positional_embeddings
        x = self.decoder_layers(x)
        # x = self.layer_norm(x)
        out = self.linear_layer(x)
        return out

In [50]:
#Instantiating the model
model = DecoderModel(attn_dropout=attn_dropout, embeddings_dims=embeddings_dims, no_of_heads=no_of_heads, block_size=block_size, dropout=dropout, no_of_decoder_layers=no_of_decoder_layers, vocab_size=vocab_size)
model = model.to(device)

In [51]:
#Printing a summary of the architecture
from torchinfo import summary
idx, targets = get_batch('test')
# idx = idx.to(device)
summary(model=model,
        input_data=idx,
        col_names=["input_size", "output_size", "num_params", "trainable"],
        col_width=20,
        row_settings=["var_names"])

  probs = torch.nn.functional.softmax(top_k_values)


RuntimeError: Failed to run torchinfo. See above stack traces for more details. Executed layers up to: [TextEmbeddings: 1, Embedding: 2, LayerNormalization: 3, LayerNorm: 4, MHA: 3, AttentionHead: 5, Linear: 6, Linear: 6, Linear: 6, Dropout: 6, AttentionHead: 5, Linear: 6, Linear: 6, Linear: 6, Dropout: 6, AttentionHead: 5, Linear: 6, Linear: 6, Linear: 6, Dropout: 6, AttentionHead: 5, Linear: 6, Linear: 6, Linear: 6, Dropout: 6, AttentionHead: 5, Linear: 6, Linear: 6, Linear: 6, Dropout: 6, AttentionHead: 5, Linear: 6, Linear: 6, Linear: 6, Dropout: 6, Linear: 4, Dropout: 4, LayerNormalization: 3, LayerNorm: 4, Linear: 4, SWiGLUExpertMoE: 5, Linear: 6, Swish: 6, Sigmoid: 7, Linear: 6, Linear: 6]

: 

In [500]:
# # Test the MHA module with a sample input tensor

logits = model(idx)
batch_size, block_size, embeddings_dims = logits.shape
logits = logits.view(batch_size*block_size, embeddings_dims)
targets = targets.view(batch_size * block_size)
loss = nn.functional.cross_entropy(logits, targets)


OutOfMemoryError: CUDA out of memory. Tried to allocate 24.00 MiB. GPU 

In [27]:
# Optimizer setup and scheduler steup

optimizer = torch.optim.AdamW(model.parameters(), lr=max_lr)
# optimizer = torch.optim.Adam(model.parameters(), lr=max_lr, weight_decay=weight_decay_optim)
initial_iters = 2000
total_steps = 10000
eval_iters = 100

@torch.inference_mode()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            idx, targets = get_batch(split=split)
            logits = model(idx)
            batch_size, block_size, embeddings_dims = logits.shape
            logits = logits.view(batch_size*block_size, embeddings_dims) # Total tokens(words) => batch_size * block_size
            targets = targets.view(batch_size * block_size)
            loss = nn.functional.cross_entropy(logits, targets)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [None]:
#Train the  model
from tqdm import tqdm

model.train()
for step in tqdm(range(total_steps)):

    # every once in a while evaluate the loss on train and val sets
    if (step  % eval_iters == 0 and step != 0) or step == total_steps - 1:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")


    idx, targets = get_batch(split='train')
    logits = model(idx)
    batch_size, block_size, embeddings_dims = logits.shape
    logits = logits.view(batch_size*block_size, embeddings_dims)
    targets = targets.view(batch_size * block_size)
    loss = nn.functional.cross_entropy(logits, targets)

    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    # print(loss.item())
    # break

    # if step != 0 and (step % eval_iters == 0 or step == total_steps -1) :
    #     loss_values = estimate_loss()
    #     print("Train Loss at {} steps : {}".format(step, loss.item()), "Val Loss at {} steps : {}".format(step, loss_values['val']))

  1%|          | 100/10000 [01:07<1:50:06,  1.50it/s]

step 100: train loss 2.4663, val loss 2.4803


  2%|▏         | 200/10000 [02:49<1:49:32,  1.49it/s]

step 200: train loss 2.3936, val loss 2.4203


  3%|▎         | 300/10000 [04:32<1:48:08,  1.49it/s]

step 300: train loss 2.2675, val loss 2.3109


  3%|▎         | 305/10000 [05:11<8:46:44,  3.26s/it] 