In [1]:
import torch
from torch import nn
from pathlib import Path
from tokenizers import Tokenizer
from huggingface_hub import PyTorchModelHubMixin

In [2]:
# Select the compute device once; modules and batches below are created on it.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
# device = "cpu"

In [3]:
# Shell out to nvidia-smi to show the driver/CUDA versions and current GPU usage.
!nvidia-smi

Fri Jun 21 05:28:24 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 555.42.03              Driver Version: 555.85         CUDA Version: 12.5     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4050 ...    On  |   00000000:01:00.0 Off |                  N/A |
| N/A   42C    P3             11W /   70W |       0MiB /   6141MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [5]:
#Data

In [11]:
#Collab setup

data_path = Path('data')
data_path.mkdir(exist_ok=True)
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
!cp input.txt data/input.txt


--2024-06-19 13:21:04--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2024-06-19 13:21:04 (145 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [8]:
#Datasets

# Using tinyshakespeare

# Load the whole corpus into memory as one UTF-8 decoded string.
text = Path('data/input.txt').read_text(encoding='utf-8')

####################################################################

#Using BookCorpus
# from datasets import load_dataset
# data = load_dataset('bookcorpus/bookcorpus')

In [7]:
# NOTE(review): no live code above assigns `data` -- the only earlier assignment
# is the commented-out `load_dataset('bookcorpus/bookcorpus')` call, so on a
# fresh kernel this line raises NameError. The displayed output presumably comes
# from a run in which that call was enabled -- confirm before relying on it.
data['train']

Dataset({
    features: ['text'],
    num_rows: 74004228
})

In [24]:
# Extracting the content of  the Dataset
# Open a file for writing
# with open('bookcorpus_text.txt', 'w', encoding='utf-8') as f:
#     # Traverse the dataset and write text data to the file
#     for record in data['train']['text']:
#         f.write(record)

# print("Writing to file complete.")

# Read only the leading part of the extracted BookCorpus text into one string.
# The cap is counted in characters (the file is opened in text mode), not bytes.
bookcorpus_max_chars = 225000000
with open('bookcorpus_text.txt', 'r', encoding='utf-8') as f:
    # read(n) returns at most n characters, so the remainder of the file is
    # never loaded into memory just to be thrown away by a slice afterwards.
    concatenated_text = f.read(bookcorpus_max_chars)

# print("Reading from file and concatenation complete.")
# print(concatenated_text[:1000])  # Print the first 1000 characters
# print(f"Total characters: {len(concatenated_text)}")
# print("Total words: ", len(concatenated_text.split()))

# Per the original note: the full file holds about 4.2 billion characters and
# the 225 million character prefix kept here contains 45756831 words.
# print("Total words: ", len(concatenated_text.split()))


In [9]:

###############################################################################

#Subword level tokenization

#Loading custom trained BPE
# Load the tokenizer
# Load the custom-trained BPE tokenizer from its serialized JSON file.
tokenizer = Tokenizer.from_file("bpe_tokenizer_tinyshakespeare_1k.json")
vocab_size = tokenizer.get_vocab_size()


def encode(s):
    """Tokenize the string `s` and return its list of token ids."""
    return tokenizer.encode(s).ids


def decode(l):
    """Turn the list of token ids `l` back into a string."""
    return tokenizer.decode(l)

###############################################################################

#Character level tokenization

# here are all the unique characters that occur in this text
# chars = sorted(list(set(text)))
# vocab_size = len(chars)


# create a mapping from characters to integers
# stoi = { ch: i for i,ch in enumerate(chars) }
# itos = { i:ch for i,ch in enumerate(chars) }
# encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
# decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string


In [27]:
#Hyperparameters

block_size = 512             # tokens per training window (length of the positional table)
batch_size = 64              # windows sampled per optimisation step
embeddings_dims = 768        # width of the token / positional embeddings
attn_dropout = 0.1           # dropout probability used inside the attention modules
no_of_heads = 12             # attention heads per decoder block
dropout = 0.1                # dropout probability for the MLP blocks and the embedding sum
epochs = 100
max_lr = 2.5e-4              # learning rate handed to the optimizer
no_of_decoder_layers = 12    # number of stacked decoder blocks
weight_decay_optim = 0.01

# Every head projects to embeddings_dims // no_of_heads channels and the heads
# are concatenated back into an embeddings_dims-wide tensor, so the division
# has to be exact or the output projection receives the wrong width.
assert embeddings_dims % no_of_heads == 0, "embeddings_dims must be divisible by no_of_heads"

In [11]:
# Train / validation split: encode the corpus once, then cut it at the 90% mark.
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))  # index of the first validation token
train_data, val_data = data[:n], data[n:]

# data loading
def get_batch(split):
    """Sample a random batch of (input, target) token windows from one split.

    `split == 'train'` draws from `train_data`; any other value draws from
    `val_data`. Targets are the inputs shifted one token to the right. Both
    returned tensors have shape (batch_size, block_size) and are moved to
    `device`.
    """
    source = train_data if split == 'train' else val_data
    # One random start offset per window, leaving room for the shifted target.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    return torch.stack(inputs).to(device), torch.stack(targets).to(device)

In [12]:
len(data) 
# Rough step budget from the token count shown in the output:
#   steps_per_epoch = 383747 / 64 ~= 6000
#   total steps = epochs * steps_per_epoch = 1 * 6000 = 6000
# NOTE(review): get_batch samples batch_size windows of block_size tokens each,
# so one step touches batch_size * block_size tokens -- confirm that dividing
# by 64 alone is the intended definition of an epoch.

383747

In [28]:
# Text embeddings
class TextEmbeddings(nn.Module):
    """Token-embedding lookup: maps token ids to `embeddings_dims`-wide vectors."""

    def __init__(self, vocab_size=vocab_size, embeddings_dims=embeddings_dims):
        super().__init__()
        # One learnable row per vocabulary entry, allocated directly on `device`.
        self.embeddings_table = nn.Embedding(vocab_size, embeddings_dims, device=device)
        # nn.init.normal_(self.embeddings_table.weight.data, mean=0, std=0.02)

    def forward(self, x):
        """Return the embedding vectors for the token ids in `x`."""
        token_vectors = self.embeddings_table(x)
        return token_vectors

In [29]:
# #Position embeddings
# class PositionEmbeddings(nn.Module):
#     def __init__(
#         self,
#         embeddings_dims = embeddings_dims,
#         block_size = block_size
#     ):
#         super().__init__()

#         self.position_embeddings = nn.Parameter(torch.randn(1, block_size, embeddings_dims, device=device), requires_grad=True) #To give positional embeddings to each token of the input text, hence num_embeddings=block_size
#         # nn.init.normal_(self.position_embeddings.weight.data, mean=0, std=0.02)

#     def forward(self):
#         return self.position_embeddings

In [30]:
#Layer Normalization

class LayerNormalization(nn.Module):
    """Thin wrapper around `nn.LayerNorm` with `embeddings_dims` as the normalized shape."""

    def __init__(self, embeddings_dims=embeddings_dims):
        super().__init__()
        self.layer_norm = nn.LayerNorm(embeddings_dims)

    def forward(self, x):
        """Normalize `x` and return the result."""
        normalized = self.layer_norm(x)
        return normalized

In [31]:
#FeedForward Neural Network

class MLPBlock(nn.Module):
    """Position-wise feed-forward network: Linear -> GELU -> Linear -> Dropout.

    Args:
        dropout: Dropout probability applied to the block output.
        embeddings_size: Input and output width; the hidden layer is four
            times as wide.
    """

    def __init__(
        self,
        dropout = dropout,
        embeddings_size = embeddings_dims,
        # inner_dimensional_states: int = 3072
    ):
        super().__init__()

        # Size the hidden layer from the constructor argument rather than the
        # module-level `embeddings_dims`, so a non-default width is honoured.
        hidden_size = 4 * embeddings_size
        self.mlp = nn.Sequential(
            nn.Linear(device=device, in_features=embeddings_size, out_features=hidden_size),
            nn.GELU(),
            nn.Linear(device=device, in_features=hidden_size, out_features=embeddings_size),
            nn.Dropout(p = dropout)
        )

    def forward(self, x):
        """Apply the feed-forward stack to `x` and return the result."""
        # mlp_weights_init = self.mlp.apply(weights_init)
        return self.mlp(x)

In [32]:
# #Weights Initilization (for MLP Block)
# def weights_init(m):
#     classname = m.__class__.__name__
#     if classname.find('Linear') != -1:
#         nn.init.normal_(m.weight.data, 0.0, 0.02)  #mean = 0, std = 0.02



In [33]:

class AttentionHead(nn.Module):
    """A single causal self-attention head.

    Args:
        attn_dropout: Dropout probability applied to the head output.
        embeddings_dims: Width of the incoming token representations.
        no_of_heads: Total number of heads; this head projects to
            `embeddings_dims // no_of_heads` channels.
    """

    def __init__(
        self,
        attn_dropout = attn_dropout,
        embeddings_dims = embeddings_dims,
        no_of_heads = no_of_heads,
    ):
        super().__init__()
        self.head_size = embeddings_dims // no_of_heads
        self.query = nn.Linear(in_features=embeddings_dims, out_features=self.head_size, device=device, bias=False)
        self.keys = nn.Linear(in_features=embeddings_dims, out_features=self.head_size,device=device, bias=False)
        self.values = nn.Linear(in_features=embeddings_dims, out_features=self.head_size, device=device,bias=False)
        self.dropout = nn.Dropout(p = attn_dropout)


    def forward(self, x):
        """Attend over `x` of shape (batch, seq_len, embeddings_dims).

        Returns a (batch, seq_len, head_size) tensor in which every position
        only mixes values from itself and earlier positions.
        """
        # Named seq_len (not block_size) so the hyperparameter is not shadowed.
        _, seq_len, _ = x.shape
        k = self.keys(x)
        q = self.query(x)
        v = self.values(x)
        # Build the lower-triangular mask on the input's own device so the head
        # keeps working if the module is moved after construction.
        causal_mask = torch.tril(torch.ones(seq_len, seq_len, device=x.device))
        # Scaled dot-product scores, shape (batch, seq_len, seq_len).
        weights = q @ torch.transpose(k, dim0=-2, dim1=-1) * (k.shape[-1] ** -0.5)
        # Positions above the diagonal (the future) get -inf, i.e. zero weight.
        masked_values = weights.masked_fill(causal_mask == 0, float('-inf'))
        # Normalize over the key positions (last axis) for every query position.
        weights_normalized = nn.functional.softmax(masked_values, dim=-1)
        # weights_normalized = self.dropout(weights_normalized)
        out = weights_normalized @ v
        # Dropout is applied to the head output, not to the attention weights.
        out = self.dropout(out)
        return out

# class AttentionHead(nn.Module):
#     """ one head of self-attention """

#     def __init__(
#         self,
#         attn_dropout = attn_dropout,
#         embeddings_dims = embeddings_dims,
#         no_of_heads = no_of_heads,
#     ):
#         super().__init__()
#         self.head_size = embeddings_dims // no_of_heads
#         self.key = nn.Linear(embeddings_dims, self.head_size, bias=False)
#         self.query = nn.Linear(embeddings_dims, self.head_size, bias=False)
#         self.value = nn.Linear(embeddings_dims, self.head_size, bias=False)
#         self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

#         self.dropout = nn.Dropout(attn_dropout)

#     def forward(self, x):
#         # input of size (batch, time-step, channels)
#         # output of size (batch, time-step, head size)
#         B,T,C = x.shape
#         k = self.key(x)   # (B,T,hs)
#         q = self.query(x) # (B,T,hs)
#         # compute attention scores ("affinities")
#         wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 # (B, T, hs) @ (B, hs, T) -> (B, T, T)
#         wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
#         wei = nn.functional.softmax(wei, dim=-1) # (B, T, T)
#         wei = self.dropout(wei)
#         # perform the weighted aggregation of the values
#         v = self.value(x) # (B,T,hs)
#         out = wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs)
#         return out

In [34]:
# MHA

# class MultiHeadAttention(nn.Module):
#     """ multiple heads of self-attention in parallel """
#     def __init__(self,
#                   attn_dropout = attn_dropout,
#                   embeddings_dims = embeddings_dims,
#                   no_of_heads = no_of_heads,
#                  ):
#         super().__init__()
#         self.heads = nn.ModuleList([AttentionHead(attn_dropout=attn_dropout, embeddings_dims=embeddings_dims, no_of_heads=no_of_heads) for _ in range(num_heads)])
#         self.proj = nn.Linear(embeddings_dims, embeddings_dims)
#         self.dropout = nn.Dropout(dropout)

#     def forward(self, x):
#         out = torch.cat([h(x) for h in self.heads], dim=-1)
#         out = self.dropout(self.proj(out))
#         return out



class MHA(nn.Module):
    """Multi-head self-attention: run the heads side by side, concatenate, project."""

    def __init__(self, attn_dropout=attn_dropout, embeddings_dims=embeddings_dims, no_of_heads=no_of_heads):
        super().__init__()
        head_modules = [
            AttentionHead(
                attn_dropout=attn_dropout,
                embeddings_dims=embeddings_dims,
                no_of_heads=no_of_heads,
            )
            for _ in range(no_of_heads)
        ]
        self.heads = nn.ModuleList(head_modules)
        self.dropout = nn.Dropout(p=attn_dropout)
        # Output projection. The concatenated heads are no_of_heads * head_size
        # wide (12 * 64 = 768 with the default hyperparameters).
        self.linear = nn.Linear(embeddings_dims, embeddings_dims, bias=False, device=device)

    def forward(self, x):
        """Return the projected, dropout-regularized concatenation of all head outputs."""
        per_head = [head(x) for head in self.heads]
        projected = self.linear(torch.cat(per_head, dim=-1))
        return self.dropout(projected)

In [None]:
# Decoder Block

class TransformerDecoderBlock(nn.Module):
    """One decoder block: self-attention and MLP sublayers, each with a residual
    connection followed by its own layer normalization (post-norm).

    Args:
        attn_dropout: Dropout probability forwarded to the attention module.
        embeddings_dims: Width of the token representations.
        no_of_heads: Number of attention heads.
        dropout: Dropout probability forwarded to the MLP block.
        vocab_size: Accepted for signature compatibility; not used by the block.
    """

    def __init__(
        self,
        attn_dropout = attn_dropout,
        embeddings_dims = embeddings_dims,
        no_of_heads = no_of_heads,
        dropout = dropout,
        vocab_size = vocab_size
    ):
        super().__init__()

        self.mha = MHA(attn_dropout=attn_dropout, embeddings_dims=embeddings_dims, no_of_heads=no_of_heads)
        self.layer_norm1 = LayerNormalization(embeddings_dims=embeddings_dims)
        self.layer_norm2 = LayerNormalization(embeddings_dims=embeddings_dims)
        self.mlp_block = MLPBlock(dropout=dropout, embeddings_size=embeddings_dims)

    def forward(self, x):
        """Run `x` through the attention and MLP sublayers and return the result."""
        # Pre-norm alternative, kept for reference:
        # x = x + self.mha(self.layer_norm1(x))
        # x = x + self.mlp_block(self.layer_norm2(x))
        x = self.layer_norm1(x + self.mha(x))
        # The MLP sublayer has its own normalization parameters; layer_norm1
        # must not be reused here, otherwise layer_norm2 is never trained.
        x = self.layer_norm2(x + self.mlp_block(x))

        return x

In [None]:
# Decoder Block

class DecoderModel(nn.Module):
    """Decoder-only transformer language model.

    Token embeddings plus learned positional embeddings pass through dropout,
    a stack of `TransformerDecoderBlock`s and a final layer normalization, and
    are then projected to vocabulary logits.

    Args:
        attn_dropout: Dropout probability forwarded to every attention module.
        embeddings_dims: Width of the token representations.
        no_of_heads: Attention heads per decoder block.
        block_size: Number of positions in the learned positional table, i.e.
            the longest sequence the model accepts.
        dropout: Dropout probability for the embedding sum and the MLP blocks.
        no_of_decoder_layers: Number of stacked decoder blocks.
        vocab_size: Number of token ids; also the width of the output logits.
    """

    def __init__(
        self,
        attn_dropout = attn_dropout,
        embeddings_dims = embeddings_dims,
        no_of_heads = no_of_heads,
        block_size = block_size,
        dropout = dropout,
        no_of_decoder_layers = no_of_decoder_layers,
        vocab_size = vocab_size
    ):
        super().__init__()

        # One learned vector per position, shape (1, block_size, embeddings_dims).
        self.positional_embeddings = nn.Parameter(torch.randn(1, block_size, embeddings_dims, device=device), requires_grad=True)
        torch.nn.init.normal_(self.positional_embeddings, mean=0.0, std=0.02)
        self.text_embds = TextEmbeddings(vocab_size=vocab_size, embeddings_dims=embeddings_dims)
        # Projects embeddings_dims-wide hidden states to vocab_size logits.
        self.linear_layer = nn.Linear(in_features=embeddings_dims, out_features=vocab_size, device=device, bias=False)
        self.layer_norm = LayerNormalization(embeddings_dims=embeddings_dims)
        self.decoder_layers = nn.Sequential(*[TransformerDecoderBlock(attn_dropout=attn_dropout, embeddings_dims=embeddings_dims, no_of_heads=no_of_heads, dropout=dropout, vocab_size=vocab_size) for _ in range(no_of_decoder_layers)])
        self.apply(self._init_weights)
        self.dropout = nn.Dropout(p = dropout)

    def _init_weights(self, module):
        """Initialize Linear / Embedding weights from N(0, 0.02) and zero any Linear bias."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, x):
        """Map token ids of shape (batch, seq_len) to logits of shape (batch, seq_len, vocab_size)."""
        x = self.text_embds(x)
        # Use only the first seq_len positional vectors so inputs shorter than
        # block_size (e.g. a short prompt) broadcast correctly.
        seq_len = x.shape[1]
        x = x + self.positional_embeddings[:, :seq_len, :]
        x = self.dropout(x)
        x = self.decoder_layers(x)
        x = self.layer_norm(x)
        out = self.linear_layer(x)
        return out

In [37]:
# Build the model from the hyperparameters defined above and place it on `device`.
model = DecoderModel(
    attn_dropout=attn_dropout,
    embeddings_dims=embeddings_dims,
    no_of_heads=no_of_heads,
    block_size=block_size,
    dropout=dropout,
    no_of_decoder_layers=no_of_decoder_layers,
    vocab_size=vocab_size,
).to(device)

In [38]:
#Printing a summary of the architecture
from torchinfo import summary
# get_batch has no 'test' split (anything other than 'train' yields validation
# data), so request 'val' explicitly for the sample input.
idx, targets = get_batch('val')
# idx = idx.to(device)
summary(model=model,
        input_data=idx,
        col_names=["input_size", "output_size", "num_params", "trainable"],
        col_width=20,
        row_settings=["var_names"])

Layer (type (var_name))                            Input Shape          Output Shape         Param #              Trainable
DecoderModel (DecoderModel)                        [64, 512]            [64, 512, 1000]      393,216              True
├─TextEmbeddings (text_embds)                      [64, 512]            [64, 512, 768]       --                   True
│    └─Embedding (embeddings_table)                [64, 512]            [64, 512, 768]       768,000              True
├─Dropout (dropout)                                [64, 512, 768]       [64, 512, 768]       --                   --
├─Sequential (decoder_layers)                      [64, 512, 768]       [64, 512, 768]       --                   True
│    └─TransformerDecoderBlock (0)                 [64, 512, 768]       [64, 512, 768]       --                   True
│    │    └─LayerNormalization (layer_norm1)       [64, 512, 768]       [64, 512, 768]       1,536                True
│    │    └─MHA (mha)                        

In [39]:
# Optimizer setup and scheduler steup

# AdamW with the configured learning rate; the weight decay is taken from the
# hyperparameter cell instead of being left to the optimizer's default.
optimizer = torch.optim.AdamW(model.parameters(), lr=max_lr, weight_decay=weight_decay_optim)
# optimizer = torch.optim.Adam(model.parameters(), lr=max_lr, weight_decay=weight_decay_optim)
# initial_iters = 2000
total_steps = 5000  # number of optimisation steps run by the training loop
eval_iters = 100    # batches averaged per split in estimate_loss; also the evaluation/checkpoint interval
# warmup_scheduler = warmup.LinearWarmup(optimizer, warmup_period=2000)
# lr_scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer=optimizer, T_max= total_steps - initial_iters)
# lr_scheduler_linear = torch.optim.lr_scheduler.LinearLR(optimizer=optimizer, total_iters=initial_iters)

# @torch.inference_mode()
# def estimate_loss():
#     out = {}
#     model.eval()
#     for split in ['val']:
#         # losses = torch.zeros(eval_iters)
#         # for k in range(eval_iters):
#         idx, targets = get_batch(split=split)
#         logits = model(idx)
#         batch_size, block_size, embeddings_dims = logits.shape
#         logits = logits.view(batch_size*block_size, embeddings_dims) # Total tokens(words) => batch_size * block_size
#         targets = targets.view(batch_size * block_size)
#         loss = nn.functional.cross_entropy(logits, targets)
#         # losses[k] = loss.item()
#       # out[split] = losses.mean()
#         out[split] = loss.item()
#     model.train()
#     return out
@torch.inference_mode()
def estimate_loss():
    """Average the cross-entropy loss over `eval_iters` random batches per split.

    Switches `model` to eval mode for the measurement and back to train mode
    before returning. Returns a dict with the keys 'train' and 'val', each
    holding the mean loss as a 0-dim tensor.
    """
    model.eval()
    out = {}
    for split in ('train', 'val'):
        per_batch = torch.zeros(eval_iters)
        for k in range(eval_iters):
            idx, targets = get_batch(split=split)
            logits = model(idx)
            # Collapse (sequences, tokens, classes) to one row per token.
            n_seqs, n_tokens, n_classes = logits.shape
            flat_logits = logits.view(n_seqs * n_tokens, n_classes)
            flat_targets = targets.view(n_seqs * n_tokens)
            per_batch[k] = nn.functional.cross_entropy(flat_logits, flat_targets).item()
        out[split] = per_batch.mean()
    model.train()
    return out

In [40]:
#Train the model
from tqdm import tqdm

# torch.save does not create missing directories, so make sure the checkpoint
# folder exists before the first evaluation step tries to write into it.
checkpoint_dir = Path('weights')
checkpoint_dir.mkdir(parents=True, exist_ok=True)

model.train()
for step in tqdm(range(total_steps)):

    # every once in a while evaluate the loss on train and val sets
    if (step  % eval_iters == 0 and step != 0) or step == total_steps - 1:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        torch.save(model.state_dict(), checkpoint_dir / ('gpt_86M_steps_%d.pth' % (step)))

    idx, targets = get_batch(split='train')
    logits = model(idx)
    # Flatten (batch, time, vocab) to one row per token without unpacking the
    # shape into batch_size / block_size / embeddings_dims: at notebook scope
    # that would overwrite the hyperparameters of the same names (and
    # embeddings_dims would silently become the vocabulary size).
    flat_logits = logits.view(-1, logits.size(-1))
    flat_targets = targets.view(-1)
    loss = nn.functional.cross_entropy(flat_logits, flat_targets)

    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    # print(loss.item())
    # break

    # if step != 0 and (step % eval_iters == 0 or step == total_steps -1) :
    #     loss_values = estimate_loss()
    #     print("Train Loss at {} steps : {}".format(step, loss.item()), "Val Loss at {} steps : {}".format(step, loss_values['val']))

  2%|▏         | 100/5000 [02:13<1:51:20,  1.36s/it]

step 100: train loss 5.5448, val loss 5.5904


  4%|▍         | 200/5000 [05:59<1:50:37,  1.38s/it] 

step 200: train loss 4.7255, val loss 4.9167


  6%|▌         | 300/5000 [09:50<1:47:53,  1.38s/it] 

step 300: train loss 4.3034, val loss 4.6327


  8%|▊         | 400/5000 [13:39<1:46:15,  1.39s/it] 

step 400: train loss 4.0337, val loss 4.5531


 12%|█▏        | 600/5000 [21:18<1:40:22,  1.37s/it] 

step 600: train loss 3.0600, val loss 4.6718


 14%|█▍        | 700/5000 [25:08<1:39:11,  1.38s/it] 

step 700: train loss 2.0392, val loss 5.0161


 16%|█▌        | 800/5000 [28:59<1:36:49,  1.38s/it] 

step 800: train loss 0.9800, val loss 5.6312


 16%|█▌        | 803/5000 [30:36<2:39:58,  2.29s/it] 

KeyboardInterrupt



In [5]:
#Encoder

In [None]:
# Text embeddings
class InputEmbeddings(nn.Module):
    """Token-embedding lookup for the encoder: token ids to `embeddings_dims`-wide vectors."""

    def __init__(self, vocab_size=vocab_size, embeddings_dims=embeddings_dims):
        super().__init__()
        # One learnable row per vocabulary entry, allocated directly on `device`.
        self.embeddings_table = nn.Embedding(vocab_size, embeddings_dims, device=device)

    def forward(self, x):
        """Return the embedding vectors for the token ids in `x`."""
        token_vectors = self.embeddings_table(x)
        return token_vectors

In [None]:
#Position Embeddings

class PositionEmbeddings(nn.Module):
    """Learned position-embedding lookup with one row per position in `block_size`."""

    def __init__(self, block_size=block_size, embeddings_dims=embeddings_dims):
        super().__init__()
        # Table of block_size rows, each embeddings_dims wide, created on `device`.
        self.embeddings_table = nn.Embedding(block_size, embeddings_dims, device=device)

    def forward(self, x):
        """Return the embedding rows selected by the indices in `x`."""
        position_vectors = self.embeddings_table(x)
        return position_vectors

In [25]:
# Pushing model to huggingface repo
# interpreter_login() prompts interactively for a Hugging Face access token
# (see the captured output below); nothing is read from the notebook itself.
from huggingface_hub import interpreter_login
interpreter_login() 


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .


Enter your token (input will not be visible):  ········
Add token as git credential? (Y/n)  n


Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Hub repository id ("<user>/<repo>") that receives the trained weights.
model_name = 'YuvrajSingh9886/GPT-86M'
# NOTE(review): push_to_hub is presumably meant to come from the
# PyTorchModelHubMixin imported in the first cell, but DecoderModel derives
# from nn.Module only -- confirm the model class mixes it in before running.
model.push_to_hub(model_name)

In [None]:
# Reload the pushed checkpoint through the transformers Auto* classes and run
# a single forward pass on a sample sentence.
# NOTE(review): AutoModelForCausalLM presumably needs a transformers-compatible
# config/checkpoint in the repo -- confirm that a plain nn.Module push provides one.
from transformers import AutoTokenizer, AutoModelForCausalLM

# Specify the path to your custom model directory
# (here the Hub repository id assigned to model_name in the push cell)
model_path = model_name

# Load the tokenizer
# tokenizer = AutoTokenizer.from_pretrained(model_path)

# Load the model
# NOTE: this rebinds `model`, replacing the DecoderModel instance trained above.
model = AutoModelForCausalLM.from_pretrained(model_path)

# Example usage
# NOTE: this rebinds `text`, which previously held the training corpus.
text = "This is a sample text."
# NOTE(review): with the AutoTokenizer line commented out, `tokenizer` is still
# the object created by Tokenizer.from_file(...) earlier -- confirm it can be
# called like this and accepts return_tensors.
inputs = tokenizer(text, return_tensors="pt")

# Get model predictions
with torch.no_grad():
    outputs = model(**inputs)

print(outputs)


In [None]:
#Evaluation
# Computes perplexity with the `evaluate` library's "perplexity" metric.
# NOTE(review): `predictions` is not assigned anywhere in the visible notebook,
# so this cell raises NameError as written -- presumably it should be a list of
# generated or held-out strings; confirm.
# NOTE(review): model_id='gpt2' looks like it scores the texts with pretrained
# GPT-2 rather than the model trained above -- confirm that is intended.
!pip install evaluate
from evaluate import load
perplexity = load("perplexity", module_type="metric")
results = perplexity.compute(predictions=predictions, model_id='gpt2')