In [101]:
from transformers import AutoTokenizer
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling  
from transformers import Trainer, TrainingArguments 
import torch  
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer 
import torch.nn.functional as F 
import os
from math import sqrt

In [102]:
from torch import nn 
from transformers import AutoConfig

model_ckpt = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# If the tokenizer does not have a padding token, set it to be the same as the EOS token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    
config = AutoConfig.from_pretrained(model_ckpt)

Using pad_token, but it is not set yet.


### DATA

In [103]:
sentences = [  
    "Sachin Tendulkar is regarded as one of the greatest batsmen in the history of cricket.",  
    "He holds numerous records, including the highest number of runs scored in both Test and One-Day Internationals.",  
    "Tendulkar made his debut for the Indian cricket team in 1989 and played for 24 years before retiring in 2013.",  
    "Throughout his career, he received numerous awards and accolades, cementing his legacy as a cricketing legend."  
]  
  
text = " ".join(sentences)

In [104]:
# I need to count the number of words in the text using the split() method

words = text.split()
print('Number of words in text:', len(words))

Number of words in text: 68


## TRAIN AND TARGET SEQUENCES

In [105]:
train_len = 9

text_sequences = []
for i in range(train_len, len(words)):
    seq = words[i-train_len:i]
    text_sequences.append(" ".join(seq))

print('Number of text sequences:', len(text_sequences))

target = []
for i in range(train_len, len(words)):
    target.append(words[i])

print('Number of targets:', len(target))

Number of text sequences: 59
Number of targets: 59


In [106]:
train_sequences = text_sequences[0]
print('First training sequence:', train_sequences)
target_sequences = target[0]
print('First target sequence:', target_sequences)

First training sequence: Sachin Tendulkar is regarded as one of the greatest
First target sequence: batsmen


In [107]:
class GPT2Dataset(Dataset):  
    def __init__(self, train_text,target_text ,tokenizer,max_len):  
        self.tokenizer = tokenizer
        self.train_sequences = train_text
        self.labels = target_text
        self.max_len = max_len
  
    def __len__(self):  
        return len(self.train_sequences) 
  
    def __getitem__(self, idx):
        train_seq = str(self.train_sequences[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            train_seq,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        encoding_label = self.tokenizer.encode_plus(
            label,
            add_special_tokens=True,
            max_length=1,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        
        #input_ids=encoding['input_ids'] #shape: (1,9)
        #print(f' shape of input_ids: {input_ids.shape}') 
        #input_ids=encoding['input_ids'].flatten() #shape: (9,)
        #print(f' shape of input_ids after flattening: {input_ids.shape}')
        return dict(  
            input_ids=encoding['input_ids'].flatten(), 
            attention_mask=encoding['attention_mask'].flatten(),
            label=encoding_label['input_ids'].flatten()
        )





In [108]:
#sample = GPT2Dataset(train_sequences, target_sequences, tokenizer, 9)
#print(len(sample))


## Embedding

In [109]:
class Embeddings(nn.Module):
  """
  Creates a single Dense Embedding for each token --> Token Embedding + Positional Embedding
  """
  def __init__(self,config):
    super().__init__()
    self.token_embedding = nn.Embedding(config.vocab_size, config.hidden_size)
    self.position_embedding = nn.Embedding(config.n_positions, config.hidden_size)
    self.layer_norm = nn.LayerNorm(config.hidden_size, eps= 1e-12)
    self.dropout = nn.Dropout()

  def forward(self,input_ids):
    token_embeddings = self.token_embedding(input_ids)
    seq_length = token_embeddings.size(1)
    position_ids = torch.arange(seq_length, dtype=torch.long).unsqueeze(0) #shape: [1,seq_length]
    position_embeddings = self.position_embedding(position_ids) #shape: [1,seq_length,embedding_dim]
    combined_embeddings = token_embeddings + position_embeddings #shape: [1,seq_length,embedding_dim]
    normalized_embedding = self.layer_norm(combined_embeddings) #shape: [1,seq_length,embedding_dim]
    normalized_embedding = self.dropout(normalized_embedding) #shape: [1,seq_length,embedding_dim]
    return normalized_embedding #shape: [1,seq_length,embedding_dim]

### Output of embedding 
- Intital sentence is tokenized and input ids are passed to embedding layer

In [110]:
#calculate same embedding for all the tokens in the sequence

#sample input ids
input_ids = torch.tensor([[31,51,99],[15,5,0]])
print(input_ids.size())
embeddings = Embeddings(config)
embeddings(input_ids).size()

torch.Size([2, 3])


torch.Size([2, 3, 768])

## Attention Head

In [111]:
from torch import nn
class AttentionHead(nn.Module):
  def __init__(self, embed_dim, head_dim):
    super().__init__()
    self.head_dim = head_dim #dimension of one head 
    #infeatures=embed_dim
    #outfeatures=head_dim
    self.q = nn.Linear(embed_dim, head_dim)
    self.k = nn.Linear(embed_dim, head_dim)
    self.v = nn.Linear(embed_dim, head_dim)
    
  
  def causal_mask(self,batch_size,size, dtype):  
    mask = torch.tril(torch.ones(size,size)).unsqueeze(0)
    return mask
    
  
      
  def scaled_dot_product_attention(self,query, key, value):
    dim_k = query.size(dim=-1)  
    #print(dim_k)    
    #print(f'Dimension of the q,k,v Matrix [Batch_size, seq_len, Head_dim] of One Head {dim_k}')
    scores = torch.bmm(query,key.transpose(1,2))/ sqrt(dim_k)  #[(1,5,768)*(1,768,5)]/sqrt(768) >>> [batch_size,5,5] 
    
    mask = self.causal_mask(scores.size(0),scores.size(1),dtype=torch.int32)
    #print(mask)
    scores = scores.masked_fill(mask==0, float(0)) 
    weights = F.softmax(scores, dim=-1) #[batch_size,5,5]
    #print(weights)
    #print(f'Softmax for each column across one row {weights.shape}')
    weights_dot_values = torch.bmm(weights,value) 
    #print(f'Last Step is to multiply weights and values {weights_dot_values.shape}')
    return weights_dot_values 

  def forward(self, hidden_state):
    #print(f'Input Embedding for Each Token with X Matrix {hidden_state.size()}')
    #q = X*W_q
    q = self.q(hidden_state)
    #print(f'Shape of the Query Matrix W_q {q.size()}')
    k = self.k(hidden_state)
    #print(f'Shape of the Key Matrix W_k {k.size()}')
    v = self.k(hidden_state)
    #print(f'Shape of the Value Matrix W_k {v.size()}')
    #print('-----------------Calculating Self Attention--------------------')
    attn_outputs = self.scaled_dot_product_attention(q,k,v)
    #print(f'Shape of the attention Output with one Head and Head Dimension {self.head_dim} is {attn_outputs.size()}')
    return attn_outputs

### one head output example

In [112]:
from torch import nn 
from transformers import AutoConfig
#
text=sentences[0][0:4]
config = AutoConfig.from_pretrained(model_ckpt)
#
inputs = tokenizer(text, return_tensors='pt', add_special_tokens=False)
token_embedding = nn.Embedding(config.vocab_size, config.hidden_size)
#infeatures= embed_dim---> making of the X matrix 
input_embedding = token_embedding(inputs.input_ids)
head_1 = AttentionHead(768,64)
attn_outputs_1 = head_1(input_embedding)
attn_outputs_1.size()

torch.Size([1, 2, 64])

## Multi Head Attention

In [113]:
class MultiHeadAttention(nn.Module):
  def __init__(self,config):
    super().__init__()
    embed_dim = config.hidden_size
    num_heads = config.num_attention_heads
    head_dim = embed_dim // num_heads
    self.heads = [AttentionHead(embed_dim, head_dim) for _ in range(num_heads)]
    self.w_0 = nn.Linear(embed_dim,embed_dim)

  def forward(self,hidden_state):
    '''
    hidden_state: Input Embedding with dimensions [batch_size, seq_len, embedding_dimension]
    '''
    attention_outputs = [head(hidden_state) for head in self.heads] #Calculating Self-Attention on each head
    contcat_attn_outputs_allheads = torch.cat(attention_outputs, dim=-1) #[batch_size,seq_len, embed_dim]
    Z =   self.w_0(contcat_attn_outputs_allheads) #[batch_size, seq_len, embed_dim]
    return Z

## Feedforward 

In [114]:
class FeedForward(nn.Module):
  def __init__(self,config):
    super().__init__()
    self.linear1 = nn.Linear(config.hidden_size, 3072)
    self.linear2 = nn.Linear(3072, config.hidden_size)
    self.gelu = nn.GELU()
    self.dropout = nn.Dropout(config.embd_pdrop)
  
  def forward(self, attention_outputs):
    output_l1 = self.linear1(attention_outputs)
    activated_outputs = self.gelu(output_l1)
    output_l2 = self.linear2(activated_outputs)
    output = self.dropout(output_l2)
    return output

In [115]:
config.n_positions

1024

## One layer of the Decoder Transformer 
- consist of Multihead Attention: concatenation of all individual attention heads
- Feedforward layer: final output layer
- Input Embedding : size --> [batch_size, seq_len, embedding_dimension]

In [116]:
class TransformerDecoderLayer(nn.Module):
  def __init__(self, config):
    super(TransformerDecoderLayer,self).__init__()
    self.layer_norm1 = nn.LayerNorm(config.hidden_size)
    self.layer_norm2 = nn.LayerNorm(config.hidden_size)
    self.multi_attention = MultiHeadAttention(config)
    self.feedforward = FeedForward(config)

  def forward(self, input_embeddings):
     #pre-layer normalization approach
     
     #Step 1: Applying Layer Normalization to Input Embeddings
     normalized_input_embeddings = self.layer_norm1(input_embeddings)
     
     #Step 2: Applying MultiHeadAttention to Normalized Output
     multi_head_attn = self.multi_attention(normalized_input_embeddings)
     
     #Step 3: Add input embeddings to the Multihead Attention Output
     skip_connection_1 = input_embeddings + multi_head_attn

     #step 4: Pass the output to another Layer Normalization 
     layer_norm_2 = self.layer_norm2(skip_connection_1)

     #Step 5: Adding skip connection 1 outputs to the output of the FeedForward Network (applied on Step 4)
     skip_connection_2 = skip_connection_1 + self.feedforward(layer_norm_2)
     #print(f'output of MultiHeadAttention and FeedForward Network is {skip_connection_2.shape}')
     return skip_connection_2

## Transformer Decoder Module
- n_layers: number of layers of the decoder block

In [117]:
class TransferDecoder(nn.Module):
  def __init__(self,config):
    super().__init__()
    self.embedding = Embeddings(config)
    self.layers = nn.ModuleList([TransformerDecoderLayer(config) for _ in range(config.n_layer)])
                                
  def forward(self, input_ids):
    embeddings = self.embedding(input_ids)
    for layer in self.layers:
      embeddings = layer(embeddings)
    return embeddings

In [118]:
config.hidden_size

768

In [119]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn import functional as F
from pytorch_lightning import LightningModule, Trainer
import torch.nn as nn

class TransformerDecoderForNextTokenPrediction(LightningModule):
  def __init__(self,config):
    super().__init__()
    self.decoder_embeddings = TransferDecoder(config)
    self.dropout = nn.Dropout(config.embd_pdrop)
    self.classifier = nn.Linear(config.hidden_size, config.vocab_size)

  def forward(self, input_ids):
    N, L = input_ids.shape  # get the batch size and sequence length
    print(f'Batch Size {N} and Sequence Length {L}')
    decoder_embeddings = self.decoder_embeddings(input_ids) #shape: [batch_size, seq_len, embedding_dim]
    drop = self.dropout(decoder_embeddings) #shape: [batch_size, seq_len, embedding_dim]

    # Reshape drop to [-1, drop.size(-1)] before applying the classifier
    drop = drop.view(-1, drop.size(-1)) #shape: [batch_size*seq_len, embedding_dim]

    # Reshape classify back to [N, L, C]
    classify =  self.classifier(drop) #shape: [batch_size*seq_len, vocab_size]
    
    classify = classify.view(N, L, -1) #shape: [batch_size, seq_len, vocab_size]

    # Average over the sequence dimension
    logits = classify.mean(dim=1) #shape: [batch_size, vocab_size]

    return logits
  
  def training_step(self, batch):
    input_ids = batch['input_ids']
    labels = batch['label']
    logits = self.forward(input_ids) #shape: [batch_size, vocab_size]
    labels = labels.view(-1)   #shape: [batch_size, vocab_size]
    loss = F.cross_entropy(logits, labels) #shape: [batch_size, vocab_size]

  def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.001)



    
model = TransformerDecoderForNextTokenPrediction(config)
train_dataset = GPT2Dataset(train_text=text_sequences,target_text=target,tokenizer=tokenizer,max_len=9)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=False)
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=1,              # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=1,
)
trainer = Trainer(
                  max_epochs=1
                  )
trainer.fit(model, train_loader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name               | Type            | Params
-------------------------------------------------------
0 | decoder_embeddings | TransferDecoder | 103 M 
1 | dropout            | Dropout         | 0     
2 | classifier         | Linear          | 38.6 M
-------------------------------------------------------
141 M     Trainable params
0         Non-trainable params
141 M     Total params
567.305   Total estimated model params size (MB)
  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Batch Size 8 and Sequence Length 9
Batch Size 8 and Sequence Length 9
Batch Size 8 and Sequence Length 9
Batch Size 8 and Sequence Length 9
Batch Size 8 and Sequence Length 9
Batch Size 8 and Sequence Length 9
Batch Size 8 and Sequence Length 9
Batch Size 3 and Sequence Length 9


`Trainer.fit` stopped: `max_epochs=1` reached.


In [120]:
trainer.save_checkpoint("gpt2_model.ckpt")


In [121]:
#predicting the next word using a sample text

text = "Sachin Tendulkar is regarded as one of "
inputs = tokenizer(text, return_tensors='pt', add_special_tokens=False)
#load the checkpoint
model = TransformerDecoderForNextTokenPrediction.load_from_checkpoint("gpt2_model.ckpt",config=config)
logits = model(inputs.input_ids)
predicted_index = torch.argmax(logits, dim=-1).item()
predicted_text = tokenizer.decode(predicted_index)
print(predicted_text)


Batch Size 1 and Sequence Length 12
leg


In [122]:
predicted_index

1455