# This is a encoder decode Transformer used for abstractive Summarization
## It uses subword tokenization pretrained by bert-base-cased
## Moreover, It relies on the articles of CNN DailyMail Dataset
## It's trained using A P100 GPU for 10000 batches
## All hyperparameters are down 

In [1]:
import numpy as np
from numpy import array
from numpy import asarray
from numpy import zeros
import torch
from transformers import AutoTokenizer
import pandas as pd
import tensorflow as tf
import time
import re
import pickle
import torch
from torch.utils.data import Dataset, DataLoader
import random
from datasets import load_dataset



In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


### Loading Data

In [3]:
# load the CNN/DailyMail dataset
dataset = load_dataset('cnn_dailymail', '3.0.0')

# print the first example
print(dataset['train'][0])


Downloading builder script:   0%|          | 0.00/3.51k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

Downloading and preparing dataset cnn_dailymail/3.0.0 (download: 558.32 MiB, generated: 1.28 GiB, post-processed: Unknown size, total: 1.82 GiB) to /root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234...


Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/159M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/376M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/572k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/661k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/5 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

Dataset cnn_dailymail downloaded and prepared to /root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

{'article': 'It\'s official: U.S. President Barack Obama wants lawmakers to weigh in on whether to use military force in Syria. Obama sent a letter to the heads of the House and Senate on Saturday night, hours after announcing that he believes military action against Syrian targets is the right step to take over the alleged use of chemical weapons. The proposed legislation from Obama asks Congress to approve the use of military force "to deter, disrupt, prevent and degrade the potential for future uses of chemical weapons or other weapons of mass destruction." It\'s a step that is set to turn an international crisis into a fierce domestic political battle. There are key questions looming over the debate: What did U.N. weapons inspectors find in Syria? What happens if Congress votes no? And how will the Syrian government react? In a televised address from the White House Rose Garden earlier Saturday, the president said he would take his case to Congress, not because he has to -- but bec

## converting dataset to pandas

In [4]:
train = pd.DataFrame(dataset['train'])
document = train['article']
summary = train['highlights']


In [5]:
document[30], summary[30]

('(CNN) -- The big winners of this Formula One season could be road drivers rather than F1 racers, according to one former world champion. Jody Scheckter, who took the drivers\' title in 1979, hopes a raft of technological changes -- notably smaller, hybrid engines that promise greater fuel efficiency -- will help improve road cars\' performance. "It\'s very positive for the sport, this is the first time you\'ve seen the sport bring in regulations that really push the envelope of technology for every type of car," the South African told CNN. "They are trying to take efficiency from everywhere they can on a car." This year\'s race cars will boast an enhanced Energy Recovery System (ERS) and 1.6-liter V6 engines, compared to the 2.4-liter V8s on show last year. The ERS uses heat generated when braking and thermal energy from exhaust gases to create extra power. The Kinetic Energy Recovery System (KERS) has been used in F1 since 2009, but Scheckter says these latest advancements in the sp

#### Obtaining insights on lengths for defining maxlen

In [6]:
document = document[:100000]
summary = summary[:100000]

In [7]:
# words count in each 
document_lengths = pd.Series([len(x.split()) for x in document])
summary_lengths = pd.Series([len(x.split()) for x in summary])

In [8]:
document_lengths.describe()

count    100000.000000
mean        662.352980
std         341.325023
min          18.000000
25%         393.000000
50%         612.000000
75%         873.000000
max        1908.000000
dtype: float64

In [9]:
summary_lengths.describe()

count    100000.000000
mean         46.397610
std          12.335318
min           7.000000
25%          38.000000
50%          46.000000
75%          54.000000
max         474.000000
dtype: float64

In [10]:
# maxlen
# taking values > and round figured to 75th percentile
# at the same time not leaving high variance
encoder_maxlen = 750
decoder_maxlen = 100

#### Tokenizing the texts into integer tokens

In [11]:
# since < and > from default tokens cannot be removed
filters = '!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n'
oov_token = '[UNK]'

In [12]:
tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-cased",
    use_fast=True,
    max_size = 2000,
    unk_token=oov_token,
)
# Get the sos token
sos_token = tokenizer.cls_token

# Get the eos token
eos_token = tokenizer.sep_token

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

### Preprocessing before tokenizing

In [13]:
# for decoder sequence
# remove all the filters from documents
document = document.apply(lambda x: x.translate(str.maketrans('', '', filters)))
# remove numbers between words which are noisy as governme39ts
document = document.apply(lambda x: re.sub(r'(?<=\w)\d+(?=\w)', '', x))


# remove all the filters from summaries and add sos and eos tokens
summary = summary.apply(lambda x: sos_token + ' ' + x.translate(str.maketrans('', '', filters)) + ' '+ eos_token)
# remove numbers between words which are noisy as governme39ts
summary = summary.apply(lambda x: re.sub(r'(?<=\w)\d+(?=\w)', '', x))


summary.head(), document.head()

(0    [CLS] Syrian official Obama climbed to the top...
 1    [CLS] Usain Bolt wins third gold of world cham...
 2    [CLS] The employee in agency's Kansas City off...
 3    [CLS] NEW A Canadian doctor says she was part ...
 4    [CLS] Another arrest made in gang rape outside...
 Name: highlights, dtype: object,
 0    It's official US President Barack Obama wants ...
 1    CNN  Usain Bolt rounded off the world champion...
 2    Kansas City Missouri CNN  The General Services...
 3    Los Angeles CNN  A medical doctor in Vancouver...
 4    CNN  Police arrested another teen Thursday the...
 Name: article, dtype: object)

In [14]:
document[30], summary[30]

("CNN  The big winners of this Formula One season could be road drivers rather than F1 racers according to one former world champion Jody Scheckter who took the drivers' title in 19 hopes a raft of technological changes  notably smaller hybrid engines that promise greater fuel efficiency  will help improve road cars' performance It's very positive for the sport this is the first time you've seen the sport bring in regulations that really push the envelope of technology for every type of car the South African told CNN They are trying to take efficiency from everywhere they can on a car This year's race cars will boast an enhanced Energy Recovery System ERS and 1liter V6 engines compared to the 2liter Vs on show last year The ERS uses heat generated when braking and thermal energy from exhaust gases to create extra power The Kinetic Energy Recovery System KERS has been used in F1 since 29 but Scheckter says these latest advancements in the sport will only benefit everyday drivers Whereve

In [15]:
# Tokenize the strings
# applying Padding/Truncating sequences for identical sequence lengths
tokenized_data_inputs = [torch.tensor(tokenizer.encode(text, truncation="longest_first", padding='max_length', max_length=encoder_maxlen, add_special_tokens=False)).to(device) for text in document]
tokenized_data_outputs = [torch.tensor(tokenizer.encode(text, truncation="longest_first", padding='max_length', max_length=decoder_maxlen, add_special_tokens=False)).to(device) for text in summary]

# Define a function to tokenize a single document
# def tokenize(text):
#     return torch.tensor(tokenizer.encode(text, truncation="longest_first", padding='max_length', max_length=encoder_maxlen, add_special_tokens=False)).to(device)

# # Tokenize the data using the GPU
# tokenized_data_inputs = [tokenize(text) for text in document]
# tokenized_data_outputs = [tokenize(text) for text in summary]




In [16]:
# Print the first encoded article and its summary
print(tokenized_data_inputs[0], tokenizer.decode(tokenized_data_inputs[0]))
print(tokenized_data_outputs[0], tokenizer.decode(tokenized_data_outputs[0]))


tensor([ 1135,   112,   188,  2078,  1646,  1697, 14319,  7661,  3349,  1644,
        11877,  1106, 18678,  1107,  1113,  2480,  1106,  1329,  1764,  2049,
         1107,  7303,  7661,  1850,   170,  2998,  1106,  1103,  4075,  1104,
         1103,  1585,  1105,  3279,  1113,  4306,  1480,  2005,  1170, 14563,
         1115,  1119,  6616,  1764,  2168,  1222,  8697,  7539,  1110,  1103,
         1268,  2585,  1106,  1321,  1166,  1103,  6351,  1329,  1104,  5297,
         3595,  1109,  3000,  5626,  1121,  7661,  4390,  2757,  1106, 14942,
         1103,  1329,  1104,  1764,  2049,  1106,  1260,  2083, 26499,  3843,
         1105,  1260, 24633,  1103,  3209,  1111,  2174,  2745,  1104,  5297,
         3595,  1137,  1168,  3595,  1104,  3367,  5915,  1135,   112,   188,
          170,  2585,  1115,  1110,  1383,  1106,  1885,  1126,  1835,  5532,
         1154,   170,  9250,  4500,  1741,  2321,  1247,  1132,  2501,  3243,
        27003,  1166,  1103,  5655,  1327,  1225,  7414,  3595, 

In [17]:
len(tokenized_data_inputs[0])

750

In [18]:
tokenizer.encode("This is a test")

[101, 1188, 1110, 170, 2774, 102]

In [19]:
tokenizer.decode([101, 1188, 1110, 170, 2774, 102])

'[CLS] This is a test [SEP]'

In [20]:
vocab_size = tokenizer.vocab_size
vocab_size

28996

### Creating dataset pipeline

In [21]:

class MyDataset(Dataset):
    def __init__(self, inputs, targets):
        self.inputs = inputs
        self.targets = targets

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return self.inputs[idx], self.targets[idx]

batch_size = 64
shuffle = 20000

dataset = MyDataset(tokenized_data_inputs, tokenized_data_outputs)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)


In [22]:
for inputs, outputs in dataloader:
    print(len(inputs), len(inputs[0]))
    print(len(outputs), len(outputs[0]))
    break

64 750
64 100


### Masking

- Padding mask for masking "pad" sequences
- Lookahead mask for masking future words from contributing in prediction of current words in self attention

In [23]:
def create_padding_mask(input_sequence):
    # input_sequence : B, T
    mask = (input_sequence != 0).unsqueeze(-2).float()
    return mask

# # B, T, T
# mask = create_padding_mask(inputs[0:2])
# tx = torch.randn(2, 10, 10).to('cuda')
# tx = tx.masked_fill(mask == 0, float('-inf'))
# mask

In [24]:
def create_look_ahead_mask(size):
    return torch.tril(torch.ones(size, size, device=device))

# create_look_ahead_mask(3)

### Building the Model

In [25]:
# tril = torch.tril(torch.ones(8, 8))

# T = 3
# wei = torch.ones(T, T) * 5
# wei = wei.masked_fill(tril[:T, :T] == 0, float('-inf')) # (B, T, T)
# tril, wei

#### Hyperparameters

In [26]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameters
batch_size = 32 # how many independent sequences will we process in parallel?
encoder_block_size = encoder_maxlen # what is the maximum context length for predictions?
decoder_block_size = decoder_maxlen # what is the maximum context length for predictions?
max_iters = 10000
eval_interval = 500
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
eval_iters = 100
n_embd = 192
n_head = 6
n_layer = 3
dropout = 0.2

cuda


#### Head

In [27]:
class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.head_size = head_size
        
        # TODO: move this tril to decoder block and pass it in ma attention better as a mask
        # I'm creating this Trill variable Trill is not a parameter of the module so in sort of pytorch
        # conventions this is called a buffer it's not a parameter and you have to call it you have to assign it to the module
        # using a register buffer so that creates the trail, the triangle lower triangular Matrix
        # self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        # we can also drop out here when we calculate the basically affinities and after the softmax we can drop out
        # some of those so we can randomly prevent some of the nodes from communicating
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask, key_value = None):
        B,T,C = x.shape
        if key_value is None:
            # key_value will be None if its self attention, but will be with value if it comes from other source as cross attention 
            key_value = x

        k = self.key(key_value)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * self.head_size**-0.5  # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(mask[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(key_value) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

#### Multi-Headed Attention

In [28]:
class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # a linear transformation layer that projects the concatenated output from the self.heads module to the original embedding size n_embd.
        self.proj = nn.Linear(n_embd, n_embd)
        
        # a dropout layer that randomly sets some of the output values to zero during training to prevent overfitting.
        # Dropout is something that you can add right before the residual connection back or right before the connection back into the original pathway
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask, key_value = None):
        # mask : mask we will be applying for each head
        # key_value : can be passed in case of cross attention.

        # we run all of the heads in parallel into a list and simply concatenate all of the outputs and we're concatenating over the channel dimension
        out = torch.cat([h(x, mask, key_value) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

### Feed Forward Network

In [29]:


class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        # The module is defined using the PyTorch nn.Sequential class, which allows us to define a sequence of layers that are applied to the input in order.
        # a feedforward neural network module with two linear layers, a ReLU activation function, and a dropout layer. 
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            # a dropout layer that randomly sets some of the output values to zero during training to prevent overfitting.
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

#### Fundamental Unit of Transformer encoder

In [30]:
class EncoderLayer(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        # n_head heads each of head_size-dimensional self attention running in parallel
        self.sa = MultiHeadAttention(n_head, head_size)

        # feedforward neural network purpose: before when we had the multi-headed self-attention only that did the communication, we went way too fast
        # to calculate the logits so the tokens looked at each other but didn't really have a lot of time to think on what they found from the other tokens
        # notice: that the feed forward here when it's applying linear this is on a per token level all the tokens do this independently so the self-attention is the communication and 
        # then once they've gathered all the data now they need to think on that data individually and so that's what feed forward is doing
        self.ffwd = FeedFoward(n_embd)

        # the size of the layer Norm here is n_embd of 32. so when the layer Norm is normalizing our features it is the normalization here
        # happens the mean and the variance are taking over 32 numbers so the batch and the time act as batch Dimensions both of
        # them so this is kind of like a per token transformation that just normalizes the features and makes them a unit mean unit gaussian at initialization
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x, mask):
        # this is actually something that slightly departs from the original paper you see that the [ADD and Norm] is applied after the transformation
        # but um in now it is a bit more basically common to apply the layer Norm before the transformation so there's a reshuffling of the layer Norms 
        # so this is called the [pre-norm formulation] and that's the one that we're going to implement as well
        x = x + self.sa(self.ln1(x), mask)
        x = x + self.ffwd(self.ln2(x))
        return x


#### Fundamental Unit of Transformer decoder

In [31]:
class DecoderLayer(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        # n_head heads each of head_size-dimensional self attention running in parallel
        self.self_attention = MultiHeadAttention(n_head, head_size)
        
        # cross attention
        self.cross_attention = MultiHeadAttention(n_head, head_size)
        
        # feedforward neural network purpose: before when we had the multi-headed self-attention only that did the communication, we went way too fast
        # to calculate the logits so the tokens looked at each other but didn't really have a lot of time to think on what they found from the other tokens
        # notice: that the feed forward here when it's applying linear this is on a per token level all the tokens do this independently so the self-attention is the communication and 
        # then once they've gathered all the data now they need to think on that data individually and so that's what feed forward is doing
        self.ffwd = FeedFoward(n_embd)

        # the size of the layer Norm here is n_embd of 32. so when the layer Norm is normalizing our features it is the normalization here
        # happens the mean and the variance are taking over 32 numbers so the batch and the time act as batch Dimensions both of
        # them so this is kind of like a per token transformation that just normalizes the features and makes them a unit mean unit gaussian at initialization
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)
        self.ln3 = nn.LayerNorm(n_embd)


    def forward(self, x, encoder_mask, decoder_mask, encoder_output):
        # this is actually something that slightly departs from the original paper you see that the [ADD and Norm] is applied after the transformation
        # but um in now it is a bit more basically common to apply the layer Norm before the transformation so there's a reshuffling of the layer Norms 
        # so this is called the [pre-norm formulation] and that's the one that we're going to implement as well
        x = x + self.self_attention(self.ln1(x), decoder_mask)
        x = x + self.cross_attention(self.ln2(x), encoder_mask, encoder_output)
        x = x + self.ffwd(self.ln3(x))
        return x



#### Encoder consisting of multiple EncoderLayer(s)

In [32]:
class Encoder(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        # take care now embedding size (= n_embd) != vocab size 
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        # positional encoding lookup 
        self.position_embedding_table = nn.Embedding(encoder_block_size, n_embd)

        self.blocks = nn.Sequential(*[EncoderLayer(n_embd, n_head) for _ in range(n_layer)])
        

    def forward(self, idx, encoder_mask):
        B, T = idx.shape

        # idx is both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        
        # now total embedding = token embedding + positional embedding
        x = tok_emb + pos_emb # (B,T,C)
        
        # pass x into
        for i in range(n_layer):
            x = self.blocks[i](x, encoder_mask) # (B,T,C)
        return x

#### Decoder consisting of multiple DecoderLayer(s)

In [33]:
class Decoder(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        # take care now embedding size (= n_embd) != vocab size 
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        # positional encoding lookup 
        self.position_embedding_table = nn.Embedding(decoder_block_size, n_embd)

        self.blocks = nn.Sequential(*[DecoderLayer(n_embd, n_head=n_head) for _ in range(n_layer)])

    def forward(self, idx, encoder_mask, decoder_mask, encoder_output):
        B, T = idx.shape

        # idx is both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        
        # now total embedding = token embedding + positional embedding
        x = tok_emb + pos_emb # (B,T,C)
        
        # pass x into
        # pass x into
        for i in range(n_layer):
          x = self.blocks[i](x, encoder_mask, decoder_mask, encoder_output) # (B,T,C)
        return x

#### Finally, the Transformer

In [34]:

class Transformer(nn.Module):

    def __init__(self):
        super().__init__()
        
        self.encoder = Encoder()
        self.decoder = Decoder()
        # final layer norm at the end of the transfomer
        self.ln_f = nn.LayerNorm(n_embd)
        
        # a fully connected (linear) layer by performing a linear transformation on the input tensor
        # with a weight matrix of size (n_embd, vocab_size) and adding a bias vector of size (vocab_size,)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, encoder_input_idx, encoding_mask, decoder_input_idx, decoder_target_idx, decoder_mask):
        
        encoder_output = self.encoder(encoder_input_idx, encoding_mask)

        decoder_output = self.decoder(decoder_input_idx, encoding_mask, decoder_mask, encoder_output)


        # pass x into
        decoder_output = self.ln_f(decoder_output) # (B,T,C)


        # logits is the ouput of the fully connected (linear) layer now given input decoder_output
        logits = self.lm_head(decoder_output) # (B,T,vocab_size)
        
        if decoder_target_idx is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            decoder_target_idx = decoder_target_idx.reshape(B*T)
            mask = (decoder_target_idx != 0) # create a mask of non-padding tokens
            loss = F.cross_entropy(logits[mask], decoder_target_idx[mask])

        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):

            # because now we're using positional embeddings we can never have more than block size coming in because if idx is
            # more than block size then our position embedding table is going to run out of scope because it only has embeddings for up to block size 
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -decoder_block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx




#### Adam optimizer with custom learning rate scheduling

In [35]:
# class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
#     def __init__(self, d_model, warmup_steps=4000):
#         super(CustomSchedule, self).__init__()

#         self.d_model = d_model
#         self.d_model = tf.cast(self.d_model, tf.float32)

#         self.warmup_steps = warmup_steps
    
#     def __call__(self, step):
#         arg1 = tf.math.rsqrt(step)
#         arg2 = step * (self.warmup_steps ** -1.5)

#         return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)


In [36]:
model = Transformer()
m = model.to(device)

#### Masks

In [37]:
def create_masks(input, target):
    encoder_mask = create_padding_mask(input)
    look_ahead_mask = create_look_ahead_mask(target.shape[1])
    dec_target_padding_mask = create_padding_mask(target)
    decoder_mask = torch.minimum(dec_target_padding_mask, look_ahead_mask)
  
    return encoder_mask, decoder_mask


#### Defining losses and other metrics 

In [38]:
# data loading
def get_batch():#split):
    # generate a small batch of data of inputs x and targets y
    # data = train_data if split == 'train' else val_data
    batch = random.choice(list(dataloader))
    x, y = batch
    x, y = x.to(device), y.to(device)
    return x, y


# @torch.no_grad() this line says to pytorch to prevent backprop since we will be evaluating not real training
@torch.no_grad()
def estimate_loss():
    out = {}
    # let model be in evaluation phase so layers like normalization, .. change their behaviour at inference time
    model.eval()
    for split in ['train']: #, 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch() #split)
            target_input = Y[:, :-1]
            target_real = Y[:, 1:]
            encoder_mask, decoder_mask = create_masks(X, target_input)
            logits, loss = model(X, encoder_mask, target_input, target_real, decoder_mask)
                                
            losses[k] = loss.item()
        out[split] = losses.mean()
    # back to training phase
    model.train()
    return out

  
    
    
 

In [39]:
# learning_rate = CustomSchedule(float(d_model))
# optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)



20.0041 M parameters


In [40]:
# loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
# def loss_function(real, pred):
#     mask = tf.math.logical_not(tf.math.equal(real, 0))
#     loss_ = loss_object(real, pred)

#     mask = tf.cast(mask, dtype=loss_.dtype)
#     loss_ *= mask

#     return tf.reduce_sum(loss_)/tf.reduce_sum(mask)


In [41]:
# train_loss = tf.keras.metrics.Mean(name='train_loss')

#### Transformer

#### Checkpoints

In [42]:
!pwd

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
/kaggle/working


In [53]:
# checkpoint_path = "checkpoints"

# ckpt = tf.train.Checkpoint(transformer=transformer, optimizer=optimizer)

# ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# if ckpt_manager.latest_checkpoint:
#     ckpt.restore(ckpt_manager.latest_checkpoint)
#     print ('Latest checkpoint restored!!')

checkpoint_path = "/kaggle/working/models/model.pt"

def save(epoch, model, optimizer, loss):
  checkpoint = {
      'epoch': epoch,
      'model_state_dict': model.state_dict(),
      'optimizer_state_dict': optimizer.state_dict(),
      'loss': loss,
  }
  torch.save(checkpoint, checkpoint_path)

def load():
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  model = Transformer().to(device)  # create an instance of your model
  optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
  
  checkpoint = torch.load(checkpoint_path, map_location=device)
  model.load_state_dict(checkpoint['model_state_dict'])
  optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
  epoch = checkpoint['epoch']
  loss = checkpoint['loss']

  model.to(device)
  return model, optimizer, epoch, loss

#### Training steps

In [56]:
def train_step(input, target):
    target_input = target[:, :-1]
    target_real = target[:, 1:]

    encoder_mask, decoder_mask = create_masks(input, target_input)
    
    
    logits, loss = model(
        input,
        encoder_mask,
        target_input,
        target_real,
        decoder_mask,
    )
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

In [57]:
for epoch in range(5):
    start = time.time()

    # train_loss.reset_states()
    for (batch, (inputs, outputs)) in enumerate(dataloader):
        
        # (inputs, outputs) here represent batch of examples
        # inputs: Batch_size * input_sequence_length
        # outputs: Batch_size * output_sequence_length
        
        # every once in a while evaluate the loss on train and val sets
        if batch % eval_interval == 0 or batch == max_iters - 1:
            losses = estimate_loss()
            print(f"step {batch}: train loss {losses['train']:.4f}") #, val loss {losses['val']:.4f}")
        
        
        train_step(inputs, outputs)
     
    # save model, optimizer, losses['train']
    save(epoch + 1, model, optimizer, losses['train'])
    
    # print ('Epoch {} Loss {:.4f}'.format(epoch + 1, train_loss.result()))

    print ('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))


step 0: train loss 5.5495
step 500: train loss 5.3065
step 1000: train loss 5.1505
step 1500: train loss 5.0089
Time taken for 1 epoch: 942.272579908371 secs

step 0: train loss 4.9867
step 500: train loss 4.8857
step 1000: train loss 4.7981
step 1500: train loss 4.7110
Time taken for 1 epoch: 943.8305606842041 secs

step 0: train loss 4.6989
step 500: train loss 4.5986
step 1000: train loss 4.5342
step 1500: train loss 4.5063
Time taken for 1 epoch: 940.857351064682 secs

step 0: train loss 4.4796
step 500: train loss 4.4115
step 1000: train loss 4.3793
step 1500: train loss 4.3147
Time taken for 1 epoch: 938.5788416862488 secs

step 0: train loss 4.3240
step 500: train loss 4.2882
step 1000: train loss 4.2649
step 1500: train loss 4.2034
Time taken for 1 epoch: 937.0985174179077 secs



### Inference

#### Predicting one word at a time at the decoder and appending it to the output; then taking the complete sequence as an input to the decoder and repeating until maxlen or stop keyword appears

In [58]:
decoder_input = tokenizer.encode('[CLS]', add_special_tokens=False)  # will be list of 1 
# convert it to a tensor and add new dimension to be like batchsize of 1 and sequence
output = torch.tensor(decoder_input, device=device).unsqueeze(0)


In [59]:
# turn of grad while evaluaring model
@torch.no_grad()
def evaluate(input_document):
  
    # let model be in evaluation phase so layers like normalization, .. change their behaviour at inference time
    model.eval()
    
    # TODO:: must add add_special_tokens=False later after this session
    input_document = torch.tensor(tokenizer.encode(input_document, truncation="longest_first", padding='max_length', max_length=encoder_maxlen), device=device)
    
    # make it of size (1, input_document_size) to match transformers convention of (batch, input_documents_size)
    encoder_input = input_document.unsqueeze(0)

    
    decoder_input = tokenizer.encode("[CLS]", add_special_tokens=False)  # will be list of 1 
    
    # make it of size (1, input_document_size) to match transformers convention of (batch, input_documents_size)
    output = torch.tensor(decoder_input, device=device).unsqueeze(0)
    
    for i in range(decoder_maxlen):
        encoder_mask, decoder_mask = create_masks(encoder_input, output)
        
        logits, _ = model(encoder_input, encoder_mask, output, None, decoder_mask)
        # in this case : logits size (B,T,vocab_size)
        
        # predictions, attention_weights = transformer(
        #     encoder_input, 
        #     output,
        #     False,
        #     enc_padding_mask,
        #     combined_mask,
        #     dec_padding_mask
        # )

        logits = logits[: ,-1:, :]
        predicted_id = torch.argmax(logits, dim=-1)

        if predicted_id == tokenizer.sep_token_id:
            return output

        output = torch.cat((output, predicted_id), dim=-1)

    return output


In [60]:
def summarize(input_document):
    # not considering attention weights for now, can be used to plot attention heatmaps in the future
    summarized = evaluate(input_document=input_document)
    return tokenizer.decode(summarized.view(-1))

In [66]:
for i in range(2):
  # generate a random number between low and high
  index = random.uniform(0, len(document))//1
  print("index: ",index)
  print("Document: \n",document[index])
  print("Real Summary: \n", summary[index])
  print("Model Summary: \n", summarize(document[index]))
  print("--------------------------------------\n")


index:  31580.0
Document: 
 CNN  For years the message was simple Use condoms to prevent HIV But if you are at high risk of contracting the virus health experts want you to consider an additional strategy  taking a pill every day to reduce your chance of being infected New guidelines published by the Centers for Disease Control and Prevention say preexposure prophylaxis or PrEP should be taken daily by people who are at high risk for contracting HIV The recommendation is based on several large national and international studies which were done in varying atrisk populations such as gay and bisexual men heterosexual couples where one person is HIVpositive the other is not and injection drug users The studies all showed that this drug can help reduce infection rates by more than 90 when taken daily While a vaccine or cure may one day end the HIV epidemic PrEP is a powerful tool that has the potential to alter the course of the US HIV epidemic today DrJonathan Mermin director of the CDC's 

In [None]:
# test load 
model, optimizer, epoch, loss = load()

In [None]:
print(loss, epoch)