In [None]:

import torch
# Choose the compute backend. Probes run in priority order and stop at the
# first one that reports availability: CUDA, then Habana HPU (only probed
# when this torch build exposes `torch.hpu`), then Apple MPS, else CPU.
device = None
device = (
    'cuda' if torch.cuda.is_available()
    else 'hpu' if (hasattr(torch, "hpu") and torch.hpu.is_available())
    else 'mps' if torch.backends.mps.is_available()  # mac
    else 'cpu'
)

print(f'Using device  : {device}')



In [None]:
# Dataset source: Hugging Face
# https://huggingface.co/datasets/leonweber/teaching_motivational_quotes/tree/main

# Batch size and vocabulary size settings.
batch_size = 16
vocab_size = 1000

# Seed torch's random number generator.
torch.manual_seed(12131306)

# Read the whole quotes file into memory as one UTF-8 string.
text = None
with open('data/motivational_quotes.txt', mode='r', encoding='utf-8') as f:
    text = f.read()




In [None]:

class Head(torch.nn.Module):  # attention head
    """One causal head of the self-attention block.

    The query/key/value projections are ``embedding_dimension x head_size``
    matrices, implemented as ``Linear`` layers with ``bias=False``.
    ``head_size`` is typically ``embedding_dimension / num_heads`` so that the
    concatenated output of all heads has the embedding dimension again
    (e.g. 768 / 12 = 64 for BERT-base / GPT-2 small).

    Args:
        head_size: Output width of the Q, K and V projections.
        embedding_dimension: Width of each input token vector.
        context_length: Maximum sequence length; sizes the causal mask.
    """

    def __init__(self, head_size, embedding_dimension, context_length):
        super().__init__()
        self.head_size = head_size

        # Separate weights for the Q, K and V projections.
        self.Qw = torch.nn.Linear(embedding_dimension, self.head_size, bias=False)
        self.Kw = torch.nn.Linear(embedding_dimension, self.head_size, bias=False)
        self.Vw = torch.nn.Linear(embedding_dimension, self.head_size, bias=False)

        # Lower-triangular mask for auto-regressive training. Registered as a
        # buffer so it is part of the module state (moves with .to(device))
        # but is not a trainable parameter.
        self.register_buffer('tril', torch.tril(torch.ones(context_length, context_length)))

        # Dropout on the attention weights to reduce overfitting
        # (torch default is 0.5, we use 0.2).
        self.dropout = torch.nn.Dropout(0.2)

    def forward(self, x):
        """Apply masked scaled dot-product attention.

        Attention(Q, K, V) = softmax(mask(Q @ K^T / sqrt(head_size))) @ V

        Args:
            x: Tensor of shape (batch_size, token_length, embedding_dimension)
                with ``token_length <= context_length``.

        Returns:
            Tensor of shape (batch_size, token_length, head_size).
        """
        _, token_length, _ = x.shape

        Q = self.Qw(x)  # (batch_size, token_length, head_size)
        K = self.Kw(x)  # (batch_size, token_length, head_size)
        V = self.Vw(x)  # (batch_size, token_length, head_size)

        # Q @ K^T: swap the last two dimensions of K (equivalently
        # K.permute(0, 2, 1)) -> (batch_size, token_length, token_length).
        scores = Q @ K.transpose(-2, -1)
        # Scale by sqrt(head_size) to keep the softmax from saturating.
        scores = scores / (K.shape[-1] ** 0.5)
        # Causal mask: position i may only attend to positions <= i. Masked
        # entries become -inf so softmax gives them zero weight.
        causal_mask = self.tril[:token_length, :token_length] == 0
        scores = scores.masked_fill(causal_mask, float('-inf'))
        # Normalise over the key dimension so each query row sums to 1.
        weights = torch.softmax(scores, dim=-1)
        weights = self.dropout(weights)
        # (B, T, T) @ (B, T, head_size) -> (B, T, head_size)
        return weights @ V
        

        





In [None]:
"""Multi-headed attention , We need to have multi-headed attention , for this to preserve the dimensionality this number is chosen so that its divisible by the embedding dimension
for example, BERT-base and GPT-2 (small) use an embedding dimension of 768 with 12 attention heads, hence the size of each head is 768/12 == 64
thus at the output of the attention block we can concatenate all the attention heads to arrive at the output dimension 768 (12*64)"""
class MultiHeadedAttention(torch.nn.Module):
    """Run several attention heads in parallel and merge their outputs.

    Each head produces ``head_size`` features; the heads are concatenated
    along the feature dimension, passed through a trainable linear
    projection back to ``embedding_dimension``, and then through dropout.

    Args:
        num_head: Number of parallel attention heads.
        head_size: Output width of each head.
        embedding_dimension: Width of the input and output token vectors.
        context_length: Maximum sequence length, forwarded to each head.
    """

    def __init__(self, num_head, head_size, embedding_dimension, context_length):
        super().__init__()
        self.num_head = num_head
        # ModuleList so every head's parameters are registered with this module.
        self.heads = torch.nn.ModuleList(
            [Head(head_size, embedding_dimension, context_length) for _ in range(num_head)]
        )
        self.projection = torch.nn.Linear(head_size * num_head, embedding_dimension)
        self.dropout = torch.nn.Dropout(0.2)

    def forward(self, x):
        """Concatenate all head outputs, project them, and apply dropout.

        Args:
            x: Input tensor; every head receives the same ``x``.

        Returns:
            The projected tensor, whose last dimension is
            ``embedding_dimension``.
        """
        # Concatenate along the last dimension, i.e. the head_size axis.
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.projection(out)
        return self.dropout(out)
    
   


In [None]:
""" The output of the attention block is fed to a Fully connected feed forward network with relu activation"""
class FeedForward(torch.nn.Module):
    """Position-wise fully connected network with ReLU activation.

    Expands each token vector to ``4 * embedding_dimension``, applies ReLU,
    projects back to ``embedding_dimension`` and applies dropout (p=0.2).

    Args:
        embedding_dimension: Width of the input and output token vectors.
    """

    def __init__(self, embedding_dimension):
        super().__init__()
        hidden_dimension = 4 * embedding_dimension
        self.fc = torch.nn.Sequential(
            torch.nn.Linear(embedding_dimension, hidden_dimension),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_dimension, embedding_dimension),
            torch.nn.Dropout(0.2),
        )

    def forward(self, x):
        """Return the network output; the last dimension of ``x`` is preserved."""
        return self.fc(x)
        


In [None]:
"""Stiching the multi-headed attention to a Feedforward and adding layer norm , also adding the residual connections - The Transformers block"""
class TransformerBlock(torch.nn.Module):
    """Decoder block: multi-headed attention plus feed-forward, pre-LayerNorm.

    Data flow for an input ``x`` of positional-embedded token vectors:

    1. Layer Norm
    2. Multi-headed scaled dot-product attention (SDPA); dropout is already
       applied inside MultiHeadedAttention
    3. Residual connection: add the attention output to ``x``
    4. Layer Norm
    5. Feed-forward network (ends with dropout)
    6. Residual connection: add the feed-forward output

    Args:
        num_head: Number of attention heads.
        head_size: Output width of each attention head.
        embedding_dimension: Width of the token vectors.
        context_length: Maximum sequence length.
    """

    def __init__(self, num_head, head_size, embedding_dimension, context_length):
        super().__init__()
        self.sdpa = MultiHeadedAttention(num_head, head_size, embedding_dimension, context_length)
        self.ffwdnet = FeedForward(embedding_dimension)
        self.layer_norm_1 = torch.nn.LayerNorm(embedding_dimension)
        self.layer_norm_2 = torch.nn.LayerNorm(embedding_dimension)

    def forward(self, x):
        """Apply attention and feed-forward sub-layers, each with a residual.

        Args:
            x: Tensor whose last dimension is ``embedding_dimension``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Normalise before each sub-layer; the residual path stays un-normalised.
        x = x + self.sdpa(self.layer_norm_1(x))
        out = x + self.ffwdnet(self.layer_norm_2(x))
        return out




In [None]:
"""Making the GPT2 Decoder Block 
The decoder Block consists of the following 
1. Tokenizer (eg: BPE, sentence-piece, GPT uses BPE aka Byte pair encoding)
2. Word Embedding (word2vec)
3. Positional Embedding 
4. 12 * Transformer Blocks
5. Layer Normalization
6. Feed Forward"""

"""The output of the GPT2Decoder can have many other heads such as summarization head, causal language modelling, translation head etc.
1. Classification Head
This head is used for tasks where the model needs to classify the input into predefined categories. It typically consists of one or more fully connected layers followed by a softmax activation function to output probabilities for each class.
2. Regression Head
Used for tasks that require predicting continuous values, such as predicting prices or coordinates. This head usually consists of fully connected layers without an activation function at the output layer (or with a linear activation).
3. Sequence-to-Sequence Head
This head is suitable for tasks like translation or summarization, where the model generates a sequence of outputs based on the input sequence. It often involves a decoder structure that can process the output from the GPT-2 block.
4. Token Classification Head
Used for tasks like Named Entity Recognition (NER) or Part-of-Speech (POS) tagging, where each token in the input sequence needs to be classified. This head typically outputs a classification for each token.
5. Multi-Task Head
This head can handle multiple tasks simultaneously, combining different types of outputs (e.g., classification and regression) from a single model. It may involve multiple branches in the architecture.
6. Attention Heads
Additional attention heads can be added for specialized attention mechanisms, allowing the model to focus on different aspects of the input data. This is particularly useful in multi-head attention setups.
7. Mixture of Experts Head
This head leverages multiple expert models, activating only a subset of them for each input, which can improve efficiency and performance on diverse tasks.
8. Contrastive Learning Head
Used in self-supervised learning scenarios, this head is designed to maximize the similarity between positive pairs and minimize it between negative pairs, often implemented using a projection head followed by a contrastive loss.
9. Generative Head
For tasks involving text generation, this head can be designed to produce sequences based on the learned representations, often using techniques like beam search or sampling.
"""

class GPTLanguageModel(torch.nn.Module):
    """GPT-2 style decoder-only language model.

    Pipeline:
        token ids -> word embedding + positional embedding
        -> ``num_transformer_blocks`` x TransformerBlock
        -> final layer normalization
        -> linear head producing one logit per vocabulary entry.

    Args:
        vocab_size: Number of distinct tokens.
        context_length: Maximum number of tokens the model attends over.
        embedding_dimension: Width of the token vectors.
        num_transformer_blocks: Number of stacked transformer blocks.
        num_attention_heads: Attention heads per block; must evenly divide
            ``embedding_dimension``.

    Raises:
        ValueError: If ``embedding_dimension`` is not divisible by
            ``num_attention_heads``.
    """

    def __init__(self, vocab_size, context_length, embedding_dimension, num_transformer_blocks, num_attention_heads):
        super().__init__()

        if embedding_dimension % num_attention_heads != 0:
            raise ValueError(
                f"embedding_dimension ({embedding_dimension}) must be divisible by "
                f"num_attention_heads ({num_attention_heads})"
            )

        self.vocab_size = vocab_size
        self.context_length = context_length
        self.embedding_dimension = embedding_dimension
        self.num_transformer_blocks = num_transformer_blocks
        self.num_attention_heads = num_attention_heads
        # Integer division: layer sizes must be ints, not floats.
        self.head_dimension = embedding_dimension // num_attention_heads

        # https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html
        # Word embedding: one vector per vocabulary entry.
        self.token_embedding_table = torch.nn.Embedding(self.vocab_size, self.embedding_dimension)
        # Positional embedding: one vector per position in the context window.
        self.position_embedding_table = torch.nn.Embedding(self.context_length, self.embedding_dimension)
        # Stack of transformer blocks.
        self.self_attention_blocks = torch.nn.Sequential(
            *[
                TransformerBlock(
                    self.num_attention_heads,
                    self.head_dimension,
                    self.embedding_dimension,
                    self.context_length,
                )
                for _ in range(num_transformer_blocks)
            ]
        )
        # Layer norm after the transformer blocks.
        self.ln_f = torch.nn.LayerNorm(self.embedding_dimension)
        # Language-modelling head: projects to vocabulary logits.
        self.lm_head = torch.nn.Linear(self.embedding_dimension, self.vocab_size)

        # Re-initialise Linear/Embedding weights with a small normal distribution.
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear and Embedding weights from N(0, 0.02); zero the biases."""
        if isinstance(module, torch.nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, torch.nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, input_tokens, targets=None):
        """Compute logits and, when targets are given, the cross-entropy loss.

        Args:
            input_tokens: Integer tensor of shape (batch_size, context_length).
            targets: Optional integer tensor with the same shape holding the
                next-token ids.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` has shape
            (batch_size, context_length, vocab_size) and ``loss`` is ``None``.
            With targets, ``logits`` is flattened to
            (batch_size * context_length, vocab_size) and ``loss`` is a scalar.
        """
        batch_size, context_length = input_tokens.shape

        tok_emb = self.token_embedding_table(input_tokens)  # (B, T, D)
        # Build position ids on the same device as the input so the model does
        # not depend on a module-level `device` variable.
        positions = torch.arange(context_length, device=input_tokens.device)
        pos_emb = self.position_embedding_table(positions)  # (T, D), broadcast over batch
        x = tok_emb + pos_emb  # (B, T, D)
        x = self.self_attention_blocks(x)  # (B, T, D)
        x = self.ln_f(x)  # (B, T, D)
        logits = self.lm_head(x)  # raw scores, (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            # cross_entropy expects (num_samples, num_classes) logits and
            # (num_samples,) targets, so flatten the batch and time dimensions.
            batch_size, context_length, vocab_size = logits.shape
            logits = logits.view(batch_size * context_length, vocab_size)
            targets = targets.reshape(batch_size * context_length)
            loss = torch.nn.functional.cross_entropy(logits, targets)

        return logits, loss

    def generate_tokens(self, input, max_size_of_generated_tokens):
        """Auto-regressively sample new tokens and append them to ``input``.

        Example: the prompt "The world is a" may be continued as
        "The world is a mystery in itself".

        Args:
            input: Integer tensor of prompt token ids, shape
                (batch_size, prompt_length).
            max_size_of_generated_tokens: Number of tokens to generate.

        Returns:
            Tensor of shape
            (batch_size, prompt_length + max_size_of_generated_tokens).
        """
        # Sampling needs no gradients; skip building the autograd graph.
        with torch.no_grad():
            for _ in range(max_size_of_generated_tokens):
                # Keep only the most recent tokens that fit the context window.
                input_tokens = input[:, -self.context_length:]
                # logits: (batch_size, context_length, vocab_size)
                logits, _ = self(input_tokens)
                # Only the last position predicts the next token -> (batch_size, vocab_size).
                logits = logits[:, -1, :]
                # Softmax over the vocabulary so each row is a probability distribution.
                probs = torch.nn.functional.softmax(logits, dim=-1)
                # Draw one token id per batch row from that distribution.
                # https://pytorch.org/docs/stable/generated/torch.multinomial.html
                input_next = torch.multinomial(probs, num_samples=1)  # (batch_size, 1)
                # Append the sampled token to the running sequence.
                input = torch.cat((input, input_next), dim=1)
        return input


