# This is a encoder decode Transformer used for abstractive Summarization
## It uses subword tokenization pretrained by bert-base-cased
## Moreover, It relies on the articles of CNN DailyMail Dataset
## It's trained using A P100 GPU for 10000 batches
## All hyperparameters are down 

In [1]:
import numpy as np
from numpy import array
from numpy import asarray
from numpy import zeros
import torch
!pip install transformers
from transformers import AutoTokenizer
import pandas as pd
import tensorflow as tf
import time
import re
import pickle
import torch
from torch.utils.data import Dataset, DataLoader
import random




[notice] A new release of pip is available: 23.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cpu


### Loading Data

In [40]:
news = pd.read_excel("../../datasets/news.xlsx")

In [41]:
news.drop(['Source ', 'Time ', 'Publish Date'], axis=1, inplace=True)

In [42]:
news.head()

Unnamed: 0,Headline,Short
0,4 ex-bank officials booked for cheating bank o...,The CBI on Saturday booked four former officia...
1,Supreme Court to go paperless in 6 months: CJI,Chief Justice JS Khehar has said the Supreme C...
2,"At least 3 killed, 30 injured in blast in Sylh...","At least three people were killed, including a..."
3,Why has Reliance been barred from trading in f...,Mukesh Ambani-led Reliance Industries (RIL) wa...
4,Was stopped from entering my own studio at Tim...,TV news anchor Arnab Goswami has said he was t...


In [43]:
news.shape

(55104, 2)

In [44]:
document = news['Short']
summary = news['Headline']

In [45]:
document[30], summary[30]

('According to the Guinness World Records, the most generations alive in a single family have been seven.  The difference between the oldest and the youngest person in the family was about 109 years, when Augusta Bunge&#39;s great-great-great-great grandson was born on January 21, 1989. The family belonged to the United States of America.',
 'The most generations alive in a single family have been 7')

#### Obtaining insights on lengths for defining maxlen

In [46]:
document_lengths = pd.Series([len(x) for x in document])
summary_lengths = pd.Series([len(x) for x in summary])

In [47]:
document_lengths.describe()

count    55104.000000
mean       368.003049
std         26.235510
min        280.000000
25%        350.000000
50%        369.000000
75%        387.000000
max        469.000000
dtype: float64

In [48]:
summary_lengths.describe()

count    55104.000000
mean        51.620282
std          7.267463
min          8.000000
25%         47.000000
50%         51.000000
75%         57.000000
max         84.000000
dtype: float64

In [49]:
# maxlen
# taking values > and round figured to 75th percentile
# at the same time not leaving high variance
encoder_maxlen = 512
decoder_maxlen = 128

#### Tokenizing the texts into integer tokens

In [50]:
# since < and > from default tokens cannot be removed
filters = '!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n'
oov_token = '[UNK]'

In [51]:
tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-cased",
    use_fast=True,
    max_size = 1000,
    unk_token=oov_token,
)
# Get the sos token
sos_token = tokenizer.cls_token

# Get the eos token
eos_token = tokenizer.sep_token

### Preprocessing before tokenizing

In [52]:
# for decoder sequence
# remove all the filters from documents
document = document.apply(lambda x: x.translate(str.maketrans('', '', filters)))
# remove numbers between words which are noisy as governme39ts
document = document.apply(lambda x: re.sub(r'(?<=\w)\d+(?=\w)', '', x))


# remove all the filters from summaries and add sos and eos tokens
summary = summary.apply(lambda x: sos_token + ' ' + x.translate(str.maketrans('', '', filters)) + ' '+ eos_token)
# remove numbers between words which are noisy as governme39ts
summary = summary.apply(lambda x: re.sub(r'(?<=\w)\d+(?=\w)', '', x))


summary.head(), document.head()

(0    [CLS] 4 exbank officials booked for cheating b...
 1    [CLS] Supreme Court to go paperless in 6 month...
 2    [CLS] At least 3 killed 30 injured in blast in...
 3    [CLS] Why has Reliance been barred from tradin...
 4    [CLS] Was stopped from entering my own studio ...
 Name: Headline, dtype: object,
 0    The CBI on Saturday booked four former officia...
 1    Chief Justice JS Khehar has said the Supreme C...
 2    At least three people were killed including a ...
 3    Mukesh Ambaniled Reliance Industries RIL was b...
 4    TV news anchor Arnab Goswami has said he was t...
 Name: Short, dtype: object)

In [53]:
# Train the tokenizer on the dataset
tokenizer = tokenizer.train_new_from_iterator(document + summary, vocab_size=1000, show_progress=True)

In [54]:
tokenizer.vocab_size

1000

In [55]:
# Tokenize the strings
# applying Padding/Truncating sequences for identical sequence lengths
tokenized_data_inputs = [torch.tensor(tokenizer.encode(text, truncation="longest_first", padding='max_length', max_length=encoder_maxlen, add_special_tokens=False)).to(device) for text in document]
tokenized_data_outputs = [torch.tensor(tokenizer.encode(text, truncation="longest_first", padding='max_length', max_length=decoder_maxlen, add_special_tokens=False)).to(device) for text in summary]

In [56]:
# Print the first encoded article and its summary
print(tokenized_data_inputs[0], tokenizer.decode(tokenized_data_inputs[0]))
print(tokenized_data_outputs[0], tokenizer.decode(tokenized_data_outputs[0]))


tensor([414,  17, 235, 236, 395, 988, 471,  44, 695, 367, 996, 411, 598, 820,
        207, 386,  33, 218, 380, 385, 485,  16, 614, 392,  61, 934, 807, 207,
        411, 684, 203, 790, 411, 726, 558, 499, 429, 366, 377, 983, 209, 403,
        413, 218, 392,  45, 212, 444, 378, 199, 232, 227, 659,  54, 464, 207,
        382, 368, 889, 210, 448,  44, 614, 414, 620, 987, 565,  43, 217, 605,
        367,  50, 675,  54, 204, 576, 392, 499, 367, 383, 472,  33, 218, 380,
        385, 485,  16, 614, 395, 368,  44, 379, 381, 386, 411, 693, 392,  48,
        462, 759, 556, 944, 206, 456, 741, 414, 879,  48, 643, 207, 587,  48,
        449, 490, 466, 948, 770, 205, 207, 964, 408, 213, 382, 368, 586, 370,
        492,  57, 958, 367, 443, 368, 620, 987, 703, 551, 205, 207,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  

In [57]:
len(tokenized_data_inputs[0])

512

In [135]:
tokenizer.encode("This is a test")

[2, 808, 433, 43, 720, 404, 3]

In [136]:
tokenizer.decode([2, 800, 450, 64, 740, 420, 3])

'[CLS] Sing at vaim re [SEP]'

In [137]:
vocab_size = tokenizer.vocab_size
vocab_size

1000

### Creating dataset pipeline

In [138]:

class MyDataset(Dataset):
    def __init__(self, inputs, targets):
        self.inputs = inputs
        self.targets = targets

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return self.inputs[idx], self.targets[idx]

batch_size = 64
shuffle = 20000

dataset = MyDataset(tokenized_data_inputs, tokenized_data_outputs)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)


In [139]:
for inputs, outputs in dataloader:
    print(len(inputs), len(inputs[0]))
    print(len(outputs), len(outputs[0]))
    break

64 512
64 128


### Masking

- Padding mask for masking "pad" sequences
- Lookahead mask for masking future words from contributing in prediction of current words in self attention

In [10]:
def create_padding_mask(input_sequence):
    # input_sequence : B, T
    mask = (input_sequence != 0).unsqueeze(-2).float()
    return mask

# # B, T, T
# mask = create_padding_mask(inputs[0:2])
# tx = torch.randn(2, 10, 10).to('cuda')
# tx = tx.masked_fill(mask == 0, float('-inf'))
# mask

In [11]:
def create_look_ahead_mask(size):
    return torch.tril(torch.ones(size, size, device=device))

# create_look_ahead_mask(3)

### Building the Model

In [12]:
# tril = torch.tril(torch.ones(8, 8))

# T = 3
# wei = torch.ones(T, T) * 5
# wei = wei.masked_fill(tril[:T, :T] == 0, float('-inf')) # (B, T, T)
# tril, wei

#### Hyperparameters

In [15]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameters
batch_size = 64 # how many independent sequences will we process in parallel?
encoder_block_size = encoder_maxlen # what is the maximum context length for predictions?
decoder_block_size = decoder_maxlen # what is the maximum context length for predictions?
max_iters = 10000
eval_interval = 500
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
eval_iters = 200
n_embd = 192
n_head = 6
n_layer = 3
dropout = 0.2

cpu


#### Head

In [16]:
class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.head_size = head_size
        
        # TODO: move this tril to decoder block and pass it in ma attention better as a mask
        # I'm creating this Trill variable Trill is not a parameter of the module so in sort of pytorch
        # conventions this is called a buffer it's not a parameter and you have to call it you have to assign it to the module
        # using a register buffer so that creates the trail, the triangle lower triangular Matrix
        # self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        # we can also drop out here when we calculate the basically affinities and after the softmax we can drop out
        # some of those so we can randomly prevent some of the nodes from communicating
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask, key_value = None):
        B,T,C = x.shape
        if key_value is None:
            # key_value will be None if its self attention, but will be with value if it comes from other source as cross attention 
            key_value = x

        k = self.key(key_value)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * self.head_size**-0.5  # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(mask[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(key_value) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

#### Multi-Headed Attention

In [17]:
# class MultiHeadAttention(tf.keras.layers.Layer):
#     def __init__(self, d_model, num_heads):
#         super(MultiHeadAttention, self).__init__()
#         self.num_heads = num_heads
#         self.d_model = d_model

#         assert d_model % self.num_heads == 0

#         self.depth = d_model // self.num_heads

#         self.wq = tf.keras.layers.Dense(d_model)
#         self.wk = tf.keras.layers.Dense(d_model)
#         self.wv = tf.keras.layers.Dense(d_model)

#         self.dense = tf.keras.layers.Dense(d_model)
        
#     def split_heads(self, x, batch_size):
#         # size before : Batch, SequenceLength, Embedding : (64, 400, 128) B,T,C
#         x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
#         # size now : Batch, SequenceLength, #heads, Embedding/#heads : (64, 400, 8, 16) 
        
#         # returned indices : Batch, #heads, SequenceLength, Embedding/#heads: (64, 8, 400, 16)
#         return tf.transpose(x, perm=[0, 2, 1, 3])


    
#     def call(self, v, k, q, mask):
#         batch_size = tf.shape(q)[0]

#         q = self.wq(q)
#         k = self.wk(k)
#         v = self.wv(v)

#         q = self.split_heads(q, batch_size)
#         k = self.split_heads(k, batch_size)
#         v = self.split_heads(v, batch_size)

#         scaled_attention, attention_weights = scaled_dot_product_attention(
#             q, k, v, mask)

#         scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])

#         concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))
#         output = self.dense(concat_attention)
            
#         return output, attention_weights

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # a linear transformation layer that projects the concatenated output from the self.heads module to the original embedding size n_embd.
        self.proj = nn.Linear(n_embd, n_embd)
        
        # a dropout layer that randomly sets some of the output values to zero during training to prevent overfitting.
        # Dropout is something that you can add right before the residual connection back or right before the connection back into the original pathway
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask, key_value = None):
        # mask : mask we will be applying for each head
        # key_value : can be passed in case of cross attention.

        # we run all of the heads in parallel into a list and simply concatenate all of the outputs and we're concatenating over the channel dimension
        out = torch.cat([h(x, mask, key_value) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

### Feed Forward Network

In [18]:
# def point_wise_feed_forward_network(d_model, dff):
#     return tf.keras.Sequential([
#         tf.keras.layers.Dense(dff, activation='relu'),
#         tf.keras.layers.Dense(d_model)
#     ])



class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        # The module is defined using the PyTorch nn.Sequential class, which allows us to define a sequence of layers that are applied to the input in order.
        # a feedforward neural network module with two linear layers, a ReLU activation function, and a dropout layer. 
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            # a dropout layer that randomly sets some of the output values to zero during training to prevent overfitting.
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

#### Fundamental Unit of Transformer encoder

In [19]:
class EncoderLayer(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        # n_head heads each of head_size-dimensional self attention running in parallel
        self.sa = MultiHeadAttention(n_head, head_size)

        # feedforward neural network purpose: before when we had the multi-headed self-attention only that did the communication, we went way too fast
        # to calculate the logits so the tokens looked at each other but didn't really have a lot of time to think on what they found from the other tokens
        # notice: that the feed forward here when it's applying linear this is on a per token level all the tokens do this independently so the self-attention is the communication and 
        # then once they've gathered all the data now they need to think on that data individually and so that's what feed forward is doing
        self.ffwd = FeedFoward(n_embd)

        # the size of the layer Norm here is n_embd of 32. so when the layer Norm is normalizing our features it is the normalization here
        # happens the mean and the variance are taking over 32 numbers so the batch and the time act as batch Dimensions both of
        # them so this is kind of like a per token transformation that just normalizes the features and makes them a unit mean unit gaussian at initialization
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x, mask):
        # this is actually something that slightly departs from the original paper you see that the [ADD and Norm] is applied after the transformation
        # but um in now it is a bit more basically common to apply the layer Norm before the transformation so there's a reshuffling of the layer Norms 
        # so this is called the [pre-norm formulation] and that's the one that we're going to implement as well
        x = x + self.sa(self.ln1(x), mask)
        x = x + self.ffwd(self.ln2(x))
        return x


#### Fundamental Unit of Transformer decoder

In [20]:
class DecoderLayer(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        # n_head heads each of head_size-dimensional self attention running in parallel
        self.self_attention = MultiHeadAttention(n_head, head_size)
        
        # cross attention
        self.cross_attention = MultiHeadAttention(n_head, head_size)
        
        # feedforward neural network purpose: before when we had the multi-headed self-attention only that did the communication, we went way too fast
        # to calculate the logits so the tokens looked at each other but didn't really have a lot of time to think on what they found from the other tokens
        # notice: that the feed forward here when it's applying linear this is on a per token level all the tokens do this independently so the self-attention is the communication and 
        # then once they've gathered all the data now they need to think on that data individually and so that's what feed forward is doing
        self.ffwd = FeedFoward(n_embd)

        # the size of the layer Norm here is n_embd of 32. so when the layer Norm is normalizing our features it is the normalization here
        # happens the mean and the variance are taking over 32 numbers so the batch and the time act as batch Dimensions both of
        # them so this is kind of like a per token transformation that just normalizes the features and makes them a unit mean unit gaussian at initialization
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)
        self.ln3 = nn.LayerNorm(n_embd)


    def forward(self, x, encoder_mask, decoder_mask, encoder_output):
        # this is actually something that slightly departs from the original paper you see that the [ADD and Norm] is applied after the transformation
        # but um in now it is a bit more basically common to apply the layer Norm before the transformation so there's a reshuffling of the layer Norms 
        # so this is called the [pre-norm formulation] and that's the one that we're going to implement as well
        x = x + self.self_attention(self.ln1(x), decoder_mask)
        x = x + self.cross_attention(self.ln2(x), encoder_mask, encoder_output)
        x = x + self.ffwd(self.ln3(x))
        return x



#### Encoder consisting of multiple EncoderLayer(s)

In [21]:
class Encoder(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        # take care now embedding size (= n_embd) != vocab size 
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        # positional encoding lookup 
        self.position_embedding_table = nn.Embedding(encoder_block_size, n_embd)

        self.blocks = nn.Sequential(*[EncoderLayer(n_embd, n_head) for _ in range(n_layer)])
        

    def forward(self, idx, encoder_mask):
        B, T = idx.shape

        # idx is both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        
        # now total embedding = token embedding + positional embedding
        x = tok_emb + pos_emb # (B,T,C)
        
        # pass x into
        for i in range(n_layer):
            x = self.blocks[i](x, encoder_mask) # (B,T,C)
        return x

#### Decoder consisting of multiple DecoderLayer(s)

In [22]:
class Decoder(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        # take care now embedding size (= n_embd) != vocab size 
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        # positional encoding lookup 
        self.position_embedding_table = nn.Embedding(decoder_block_size, n_embd)

        self.blocks = nn.Sequential(*[DecoderLayer(n_embd, n_head=n_head) for _ in range(n_layer)])

    def forward(self, idx, encoder_mask, decoder_mask, encoder_output):
        B, T = idx.shape

        # idx is both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        
        # now total embedding = token embedding + positional embedding
        x = tok_emb + pos_emb # (B,T,C)
        
        # pass x into
        # pass x into
        for i in range(n_layer):
          x = self.blocks[i](x, encoder_mask, decoder_mask, encoder_output) # (B,T,C)
        return x

#### Finally, the Transformer

In [24]:
class Transformer(nn.Module):

    def __init__(self):
        super().__init__()
        
        self.encoder = Encoder()
        self.decoder = Decoder()
        # final layer norm at the end of the transfomer
        self.ln_f = nn.LayerNorm(n_embd)
        
        # a fully connected (linear) layer by performing a linear transformation on the input tensor
        # with a weight matrix of size (n_embd, vocab_size) and adding a bias vector of size (vocab_size,)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, encoder_input_idx, encoding_mask, decoder_input_idx, decoder_target_idx, decoder_mask):
        
        encoder_output = self.encoder(encoder_input_idx, encoding_mask)

        decoder_output = self.decoder(decoder_input_idx, encoding_mask, decoder_mask, encoder_output)


        # pass x into
        decoder_output = self.ln_f(decoder_output) # (B,T,C)


        # logits is the ouput of the fully connected (linear) layer now given input decoder_output
        logits = self.lm_head(decoder_output) # (B,T,vocab_size)
        
        if decoder_target_idx is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            decoder_target_idx = decoder_target_idx.reshape(B*T)
            mask = (decoder_target_idx != 0) # create a mask of non-padding tokens
            loss = F.cross_entropy(logits[mask], decoder_target_idx[mask])

        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):

            # because now we're using positional embeddings we can never have more than block size coming in because if idx is
            # more than block size then our position embedding table is going to run out of scope because it only has embeddings for up to block size 
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -decoder_block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx




#### Adam optimizer with custom learning rate scheduling

In [25]:
# class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
#     def __init__(self, d_model, warmup_steps=4000):
#         super(CustomSchedule, self).__init__()

#         self.d_model = d_model
#         self.d_model = tf.cast(self.d_model, tf.float32)

#         self.warmup_steps = warmup_steps
    
#     def __call__(self, step):
#         arg1 = tf.math.rsqrt(step)
#         arg2 = step * (self.warmup_steps ** -1.5)

#         return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)


In [28]:
model = Transformer()
m = model.to(device)

#### Masks

In [29]:
def create_masks(input, target):
    encoder_mask = create_padding_mask(input)
    look_ahead_mask = create_look_ahead_mask(target.shape[1])
    dec_target_padding_mask = create_padding_mask(target)
    decoder_mask = torch.minimum(dec_target_padding_mask, look_ahead_mask)
  
    return encoder_mask, decoder_mask


#### Defining losses and other metrics 

In [30]:
for inputs, outputs in dataloader:
    print(len(inputs), len(inputs[0]))
    print(len(outputs), len(outputs[0]))
    break

NameError: name 'dataloader' is not defined

In [178]:
# data loading
def get_batch():#split):
    # generate a small batch of data of inputs x and targets y
    # data = train_data if split == 'train' else val_data
    batch = random.choice(list(dataloader))
    x, y = batch
    x, y = x.to(device), y.to(device)
    return x, y


# @torch.no_grad() this line says to pytorch to prevent backprop since we will be evaluating not real training
@torch.no_grad()
def estimate_loss():
    out = {}
    # let model be in evaluation phase so layers like normalization, .. change their behaviour at inference time
    model.eval()
    for split in ['train']: #, 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch() #split)
            target_input = Y[:, :-1]
            target_real = Y[:, 1:]
            encoder_mask, decoder_mask = create_masks(X, target_input)
            logits, loss = model(X, encoder_mask, target_input, target_real, decoder_mask)
                                
            losses[k] = loss.item()
        out[split] = losses.mean()
    # back to training phase
    model.train()
    return out

  
    
    
 

In [31]:
# learning_rate = CustomSchedule(float(d_model))
# optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)



3.810088 M parameters


In [180]:
# loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
# def loss_function(real, pred):
#     mask = tf.math.logical_not(tf.math.equal(real, 0))
#     loss_ = loss_object(real, pred)

#     mask = tf.cast(mask, dtype=loss_.dtype)
#     loss_ *= mask

#     return tf.reduce_sum(loss_)/tf.reduce_sum(mask)


In [181]:
# train_loss = tf.keras.metrics.Mean(name='train_loss')

#### Transformer

#### Training steps

In [184]:
def train_step(input, target):
    target_input = target[:, :-1]
    target_real = target[:, 1:]

    encoder_mask, decoder_mask = create_masks(input, target_input)
    
    
    logits, loss = model(
        input,
        encoder_mask,
        target_input,
        target_real,
        decoder_mask,
    )
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

In [185]:
for epoch in range(5):
    start = time.time()

    # train_loss.reset_states()
    for (batch, (inputs, outputs)) in enumerate(dataloader):
        
        # (inputs, outputs) here represent batch of examples
        # inputs: Batch_size * input_sequence_length
        # outputs: Batch_size * output_sequence_length
        
        # every once in a while evaluate the loss on train and val sets
        if batch % eval_interval == 0 or batch == max_iters - 1:
            losses = estimate_loss()
            print(f"step {batch}: train loss {losses['train']:.4f}") #, val loss {losses['val']:.4f}")
        
        
        train_step(inputs, outputs)
      
    # if (epoch + 1) % 5 == 0:
    #     ckpt_save_path = ckpt_manager.save()
    #     print ('Saving checkpoint for epoch {} at {}'.format(epoch+1, ckpt_save_path))
    
    # print ('Epoch {} Loss {:.4f}'.format(epoch + 1, train_loss.result()))

    print ('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))


step 0: train loss 7.0860
step 500: train loss 3.6643
Time taken for 1 epoch: 505.03278613090515 secs

step 0: train loss 3.1695
step 500: train loss 2.7930
Time taken for 1 epoch: 492.1347658634186 secs

step 0: train loss 2.6130
step 500: train loss 2.3931
Time taken for 1 epoch: 515.6874475479126 secs

step 0: train loss 2.1328
step 500: train loss 1.7971
Time taken for 1 epoch: 517.3843352794647 secs

step 0: train loss 1.6653
step 500: train loss 1.5609
Time taken for 1 epoch: 493.74125814437866 secs



### Checkpointing

In [8]:
checkpoint_path = "./checkpoints/model.pt"

def save(epoch, model, optimizer, loss):
  checkpoint = {
      'epoch': epoch,
      'model_state_dict': model.state_dict(),
      'optimizer_state_dict': optimizer.state_dict(),
      'loss': loss,
  }
  torch.save(checkpoint, checkpoint_path)

# save model, optimizer, losses['train']
save(epoch + 1, model, optimizer, losses['train'])

NameError: name 'epoch' is not defined

### Inference

#### Predicting one word at a time at the decoder and appending it to the output; then taking the complete sequence as an input to the decoder and repeating until maxlen or stop keyword appears

In [186]:
decoder_input = tokenizer.encode('[CLS]', add_special_tokens=False)  # will be list of 1 
# convert it to a tensor and add new dimension to be like batchsize of 1 and sequence
output = torch.tensor(decoder_input, device=device).unsqueeze(0)


In [36]:
# turn of grad while evaluaring model
@torch.no_grad()
def evaluate(input_document):
  
    # let model be in evaluation phase so layers like normalization, .. change their behaviour at inference time
    model.eval()
    
    # TODO:: must add add_special_tokens=False later after this session
    input_document = torch.tensor(tokenizer.encode(input_document, truncation="longest_first", padding='max_length', max_length=encoder_maxlen), device=device)
    
    # make it of size (1, input_document_size) to match transformers convention of (batch, input_documents_size)
    encoder_input = input_document.unsqueeze(0)

    
    decoder_input = tokenizer.encode("[CLS]", add_special_tokens=False)  # will be list of 1 
    
    # make it of size (1, input_document_size) to match transformers convention of (batch, input_documents_size)
    output = torch.tensor(decoder_input, device=device).unsqueeze(0)
    
    for i in range(decoder_maxlen):
        encoder_mask, decoder_mask = create_masks(encoder_input, output)
        
        logits, _ = model(encoder_input, encoder_mask, output, None, decoder_mask)
        # in this case : logits size (B,T,vocab_size)
        
        # predictions, attention_weights = transformer(
        #     encoder_input, 
        #     output,
        #     False,
        #     enc_padding_mask,
        #     combined_mask,
        #     dec_padding_mask
        # )

        logits = logits[: ,-1:, :]
        predicted_id = torch.argmax(logits, dim=-1)

        if predicted_id == tokenizer.sep_token_id:
            return output

        output = torch.cat((output, predicted_id), dim=-1)

    return output


In [37]:
def summarize(input_document):
    # not considering attention weights for now, can be used to plot attention heatmaps in the future
    summarized = evaluate(input_document=input_document)
    return tokenizer.decode(summarized.view(-1))

In [211]:
for i in range(10):
  # generate a random number between low and high
  index = random.uniform(0, len(document))//1
  print("index: ",index)
  print("Document: \n",document[index])
  print("Real Summary: \n", summary[index])
  print("Model Summary: \n", summarize(document[index]))
  print("--------------------------------------\n")


index:  10977.0
Document: 
 The UK government is set to construct a tunnel underneath Stonehenge in an effort to reduce traffic congestion Transport Secretary Chris Grayling said the 2kilometre tunnel could improve the environment around the 5yearold World Heritage Site and help the local economy by linking people with jobs Meanwhile the decision has been criticised by environmentalists and historians
Real Summary: 
 [CLS] UK to build tunnel underneath Stonehenge [SEP]
Model Summary: 
 [CLS] UK govt to constructure underneath Stonehenge
--------------------------------------

index:  52456.0
Document: 
 Turkmenistan President Kurbanguly Berdymukhamedov has drafted a new Constitution to remove the upper age limit for Presidential candidates and extend the Presidential term to seven years from the current fiveyearterm With no limit on the number of terms a President can serve under the existing Constitution the new Constitution will potentially allow President Berdymukhamedov to rule for

# Loading model

In [32]:
checkpoint_path = "./checkpoints/model.pt"

def load():
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  model = Transformer().to(device)  # create an instance of your model
  optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
  
  checkpoint = torch.load(checkpoint_path, map_location=device)
  model.load_state_dict(checkpoint['model_state_dict'])
  optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
  epoch = checkpoint['epoch']
  loss = checkpoint['loss']

  model.to(device)
  return model, optimizer, epoch, loss
  
# test load
model, optimizer, epoch, loss = load()

In [33]:
print(loss, epoch)

tensor(1.5609) 4


In [19]:
vocab_size=1000

In [59]:
summarize("brazilian police on wednesday arrested the head of the european olympic committees patrick hickey in rio de janeiro over illegal sales of olympic tickets police said hickey and at least six others are accused of illegally passing on tickets for the games to be sold on at extortionate prices hickey was taken to hospital after his arrest")

'[CLS] WPv does the wednfing head of hickey in riots'