<a href="https://colab.research.google.com/github/capGoblin/Transformer-from-Scratch-Text_Summarizer/blob/main/Transformer_from_Scratch_for_Text_Summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Copyright 2019 The TensorFlow Authors.

#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# https://github.com/tensorflow/docs/blob/master/site/en/tutorials/text/transformer.ipynb

In [None]:
# Modifications Copyright (C) 2020 Rohan Jagtap

In [None]:
# cd /content/drive/My Drive/Colab Notebooks/summarizer_transformer/

In [None]:
!pip install kaggle



In [None]:
from google.colab import files

uploaded = files.upload()


Saving kaggle.json to kaggle.json


In [None]:
!mkdir ~/.kaggle
!mv kaggle.json ~/.kaggle/

In [None]:
!chmod 600 ~/.kaggle/kaggle.json


In [None]:
!kaggle datasets download -d shashichander009/inshorts-news-data

Downloading inshorts-news-data.zip to /content
  0% 0.00/12.6M [00:00<?, ?B/s] 72% 9.00M/12.6M [00:00<00:00, 85.0MB/s]
100% 12.6M/12.6M [00:00<00:00, 101MB/s] 


In [None]:
!unzip /content/inshorts-news-data.zip

Archive:  /content/inshorts-news-data.zip
  inflating: Inshorts Cleaned Data.xlsx  


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import time
import re
import pickle

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, LayerNormalization, Layer
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers.schedules import LearningRateSchedule
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.train import Checkpoint, CheckpointManager
from tensorflow.keras.metrics import Mean

### Loading Data

In [None]:
data = pd.read_excel('/content/Inshorts Cleaned Data.xlsx', sheet_name='Sheet1', usecols=[0,1])

In [None]:
# data.drop(['Source ', 'Time ', 'Publish Date'], axis=1, inplace=True)

In [None]:
data.head()

Unnamed: 0,Headline,Short
0,4 ex-bank officials booked for cheating bank o...,The CBI on Saturday booked four former officia...
1,Supreme Court to go paperless in 6 months: CJI,Chief Justice JS Khehar has said the Supreme C...
2,"At least 3 killed, 30 injured in blast in Sylh...","At least three people were killed, including a..."
3,Why has Reliance been barred from trading in f...,Mukesh Ambani-led Reliance Industries (RIL) wa...
4,Was stopped from entering my own studio at Tim...,TV news anchor Arnab Goswami has said he was t...


In [None]:
data.shape

(55104, 2)

In [None]:
news = data['Short']
summary = data['Headline']

In [None]:
news[30], summary[30]

('According to the Guinness World Records, the most generations alive in a single family have been seven.  The difference between the oldest and the youngest person in the family was about 109 years, when Augusta Bunge&#39;s great-great-great-great grandson was born on January 21, 1989. The family belonged to the United States of America.',
 'The most generations alive in a single family have been 7')

### Preprocessing

In [None]:
# for decoder sequence
# summary = summary.apply(lambda x: '<START> ' + x + ' <END>')
# summary.head()


def add_tokens(x):
    """Wrap a headline string in the '<START>' and '<END>' marker tokens.

    Args:
        x: the headline text (a ``str``).

    Returns:
        The text prefixed with '<START> ' and suffixed with ' <END>'.
    """
    return ' '.join(['<START>', x, '<END>'])

# Build from the raw 'Headline' column rather than from `summary` itself, so that
# re-running this cell cannot stack a second pair of <START>/<END> markers.
summary = data['Headline'].apply(add_tokens)
print("Summary after adding tokens:")
print(summary)

Summary after adding tokens:
0        <START> 4 ex-bank officials booked for cheatin...
1        <START> Supreme Court to go paperless in 6 mon...
2        <START> At least 3 killed, 30 injured in blast...
3        <START> Why has Reliance been barred from trad...
4        <START> Was stopped from entering my own studi...
                               ...                        
55099    <START> Sensex loses 400 points to hit 52-week...
55100    <START> China to inject $91 bn into the money ...
55101    <START> Ghulam Ali set to make acting debut in...
55102    <START> IS acknowledges death of Jihadi John: ...
55103    <START> Cairn to seek $600 mn from India in da...
Name: Headline, Length: 55104, dtype: object


#### Tokenizing the texts into integer tokens

In [None]:
# Custom filter set for the headlines: it contains neither '<' nor '>', so the
# '<START>' / '<END>' markers are not stripped during tokenization.
filters = '!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n'
oov_token = '<unk>'

# Source articles: default filtering; unseen words map to the OOV token.
news_tokenizer = Tokenizer(oov_token=oov_token)
news_tokenizer.fit_on_texts(news)
inputs = news_tokenizer.texts_to_sequences(news)

# Headlines: custom filtering (see above), same OOV token.
summary_tokenizer = Tokenizer(filters=filters, oov_token=oov_token)
summary_tokenizer.fit_on_texts(summary)
targets = summary_tokenizer.texts_to_sequences(summary)

In [None]:
summary_tokenizer.texts_to_sequences(["This is a test"])

[[184, 22, 12, 71]]

In [None]:
summary_tokenizer.sequences_to_texts([[2, 22, 12, 71]])

['<start> is a test']

In [None]:
encoder_vocab_size = len(news_tokenizer.word_index) + 1
decoder_vocab_size = len(summary_tokenizer.word_index) + 1

# vocab_size
encoder_vocab_size, decoder_vocab_size

(76362, 29661)

#### Obtaining insights on lengths for defining maxlen

In [None]:
# NOTE: len() of a raw string counts characters, so both series hold character
# lengths of the texts (the summaries already include their marker tokens),
# not token counts.
news_lengths = pd.Series([len(n) for n, _ in zip(news, summary)])
summary_lengths = pd.Series([len(s) for _, s in zip(news, summary)])


In [None]:
news_lengths.describe()

count    55104.000000
mean       368.003049
std         26.235510
min        280.000000
25%        350.000000
50%        369.000000
75%        387.000000
max        469.000000
dtype: float64

In [None]:
summary_lengths.describe()

count    55104.000000
mean        65.620282
std          7.267463
min         22.000000
25%         61.000000
50%         65.000000
75%         71.000000
max         98.000000
dtype: float64

In [None]:
# maxlen
# Round figures chosen just above the 75th percentile of the length statistics
# displayed above, so that the long tail does not force a much larger padding size.
# NOTE(review): those statistics are built with len() on the raw strings (character
# counts), while these limits are applied to token sequences by pad_sequences below
# -- confirm the values are intended.
encoder_maxlen = 400
decoder_maxlen = 75

#### Padding/Truncating sequences for identical sequence lengths

In [None]:
inputs = pad_sequences(inputs, maxlen=encoder_maxlen, padding='post', truncating='post')
targets = pad_sequences(targets, maxlen=decoder_maxlen, padding='post', truncating='post')

### Creating the `tf.data` dataset pipeline

In [None]:
inputs = tf.cast(inputs, dtype=tf.int32)
targets = tf.cast(targets, dtype=tf.int32)

In [None]:
BUFFER_SIZE = 20000
BATCH_SIZE = 64

In [None]:
ds = tf.data.Dataset.from_tensor_slices((inputs, targets)).shuffle(BUFFER_SIZE).batch(BATCH_SIZE)

In [None]:
# Peek at the first three batches. The loop names deliberately differ from the
# global `inputs` / `targets` tensors so that running this cell does not
# overwrite them, and only the shapes are printed to keep the output short.
for batch_index, (batch_inputs, batch_targets) in enumerate(ds.take(3)):
    print(batch_index, batch_inputs.shape, batch_targets.shape)

0
(<tf.Tensor: shape=(64, 400), dtype=int32, numpy=
array([[  164,   380,   178, ...,     0,     0,     0],
       [    2,   347,    82, ...,     0,     0,     0],
       [ 2051,     6,    52, ...,     0,     0,     0],
       ...,
       [  170, 19239,  9864, ...,     0,     0,     0],
       [  370,   841,   372, ...,     0,     0,     0],
       [  520,  2961,  3926, ...,     0,     0,     0]], dtype=int32)>, <tf.Tensor: shape=(64, 75), dtype=int32, numpy=
array([[    2,   386,     4, ...,     0,     0,     0],
       [    2,    82,   781, ...,     0,     0,     0],
       [    2,  2855,  1055, ...,     0,     0,     0],
       ...,
       [    2, 10631,  8464, ...,     0,     0,     0],
       [    2,   360,  2679, ...,     0,     0,     0],
       [    2,  2265,   168, ...,     0,     0,     0]], dtype=int32)>)
1
(<tf.Tensor: shape=(64, 400), dtype=int32, numpy=
array([[11450,    13,   468, ...,     0,     0,     0],
       [ 9063,    13,    34, ...,     0,     0,     0],
       [

In [None]:
print("Dataset created:")
print("Number of batches in the dataset:", tf.data.experimental.cardinality(ds))
print("Batch size:", BATCH_SIZE)

Dataset created:
Number of batches in the dataset: tf.Tensor(861, shape=(), dtype=int64)
Batch size: 64


### Positional encoding: gives the model a notion of word order, since (unlike an RNN) the Transformer does not process tokens sequentially

In [None]:
def get_angles(position, i, d_model):
    """Compute the positional-encoding angles ``position / 10000^(2 * (i // 2) / d_model)``.

    Args:
        position: position index (scalar or array).
        i: feature index (scalar or integer array); consecutive pairs (2k, 2k + 1)
            share the same rate because of the ``i // 2``.
        d_model: model width used to normalise the exponent.

    Returns:
        ``position`` multiplied by the per-feature angle rate (NumPy broadcasting applies).
    """
    exponent = (2 * (i // 2)) / np.float32(d_model)
    return position * (1 / np.power(10000, exponent))

In [None]:
# def positional_encoding(position, d_model):
#     angle_rads = get_angles(
#         np.arange(position)[:, np.newaxis],
#         np.arange(d_model)[np.newaxis, :],
#         d_model
#     )

#     # apply sin to even indices in the array; 2i
#     angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])

#     # apply cos to odd indices in the array; 2i+1
#     angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

#     pos_encoding = angle_rads[np.newaxis, ...]

#     return tf.cast(pos_encoding, dtype=tf.float32)


def positional_encoding(position, d_model):
    """Build the sinusoidal positional-encoding table.

    Args:
        position: number of positions (rows) to generate.
        d_model: number of features (columns) per position.

    Returns:
        A float32 tensor of shape (1, position, d_model); even feature indices
        hold sines and odd feature indices hold cosines of the position angles.
    """
    positions = np.arange(position)[:, np.newaxis]   # (position, 1)
    dims = np.arange(d_model)[np.newaxis, :]         # (1, d_model)

    # angle(pos, i) = pos / 10000^(2 * (i // 2) / d_model)
    angle_rads = positions * (1 / np.power(10000, (2 * (dims // 2)) / np.float32(d_model)))

    # Sine on even feature indices, cosine on odd ones (written back in place).
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

    # Leading axis of size 1 so the table broadcasts over a batch dimension.
    return tf.cast(angle_rads[np.newaxis, ...], dtype=tf.float32)

### Masking

- Padding mask: hides the zero-padded ("pad") positions of a sequence so they are ignored by attention
- Look-ahead mask: prevents future words from contributing to the prediction of the current word in decoder self-attention

In [None]:
# def create_padding_mask(seq):
#     seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
#     return seq[:, tf.newaxis, tf.newaxis, :]

In [None]:
# def create_look_ahead_mask(size):
#     mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
#     return mask

In [None]:
def create_padding_mask(seq):
    """Mark the padding positions (token id 0) of ``seq``.

    Returns:
        A float32 tensor that is 1.0 where ``seq == 0`` and 0.0 elsewhere, with two
        singleton axes inserted at positions 1 and 2 -- i.e. (batch, 1, 1, seq_len)
        for a 2-D (batch, seq_len) input.
    """
    is_pad = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return is_pad[:, tf.newaxis, tf.newaxis, ...]

def create_look_ahead_mask(size):
    """Build a (size, size) mask with 1.0 strictly above the diagonal.

    ``band_part(..., -1, 0)`` keeps the lower triangle including the diagonal;
    subtracting it from 1 leaves ones only at the positions (row, col) with col > row.
    """
    lower_triangle = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return 1 - lower_triangle

### Building the Model

#### Scaled Dot Product

In [None]:
def scaled_dot_product_attention(q, k, v, mask):
    """Compute softmax(q . k^T / sqrt(d_k)) . v.

    Args:
        q, k, v: query, key and value tensors; ``k`` is transposed on its last two
            axes for the product with ``q``.
        mask: tensor added to the scaled logits after multiplication by -1e9
            (so 1 marks a position to suppress), or None for no masking.

    Returns:
        A tuple ``(values, attention_weights)`` where ``attention_weights`` is the
        softmax over the last axis and ``values`` is its product with ``v``.
    """
    depth = tf.cast(tf.shape(k)[-1], tf.float32)
    logits = tf.matmul(q, k, transpose_b=True) / tf.math.sqrt(depth)

    if mask is not None:
        # Large negative logits become ~0 after the softmax.
        logits = logits + (mask * -1e9)

    attention_weights = tf.nn.softmax(logits, axis=-1)
    return tf.matmul(attention_weights, v), attention_weights

#### Multi-Headed Attention

In [None]:
class MultiHeadAttention(Layer):
    """Multi-head scaled dot-product attention.

    Queries, keys and values are each projected by their own ``Dense(d_model)``
    layer, split into ``num_heads`` heads of width ``d_model // num_heads``,
    attended with ``scaled_dot_product_attention`` and merged back through a
    final ``Dense(d_model)`` layer.
    """

    def __init__(self, d_model, num_heads):
        """
        Args:
            d_model: width of the projections; must be divisible by ``num_heads``.
            num_heads: number of attention heads.

        Raises:
            ValueError: if ``d_model`` is not a multiple of ``num_heads``.
        """
        super(MultiHeadAttention, self).__init__()

        # Without this check a non-divisible d_model is silently floored into
        # head_dim, and the reshapes in split_heads()/call() then either fail or
        # spill features into the sequence axis.
        if d_model % num_heads != 0:
            raise ValueError(
                'd_model ({}) must be divisible by num_heads ({})'.format(d_model, num_heads)
            )

        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // self.num_heads

        self.wq = Dense(d_model)
        self.wk = Dense(d_model)
        self.wv = Dense(d_model)

        self.linear_dense = Dense(d_model)

    def split_heads(self, x, batch_size):
        """Reshape ``x`` to (batch_size, -1, num_heads, head_dim) and move the head axis to position 1."""
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.head_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, q, k, v, mask):
        """Attend ``q`` over ``k`` / ``v``.

        Args:
            q, k, v: input tensors; presumably (batch, seq_len, features) -- the
                batch size is read from axis 0 of ``q``.
            mask: forwarded unchanged to ``scaled_dot_product_attention``.

        Returns:
            A tuple ``(output, attention_weights)``; ``output`` has last dimension
            ``d_model`` and ``attention_weights`` are the per-head softmax weights.
        """
        batch_size = tf.shape(q)[0]

        q = self.wq(q)
        k = self.wk(k)
        v = self.wv(v)

        q = self.split_heads(q, batch_size)
        k = self.split_heads(k, batch_size)
        v = self.split_heads(v, batch_size)

        values, attention_weights = scaled_dot_product_attention(q, k, v, mask)

        # Undo split_heads: put the head axis next to head_dim, then merge the two.
        values = tf.transpose(values, perm=[0, 2, 1, 3])
        concat_values = tf.reshape(values, (batch_size, -1, self.d_model))

        output = self.linear_dense(concat_values)

        return output, attention_weights

### Feed Forward Network

In [None]:
def point_wise_feed_forward_network(d_model, hidden):
    """Return a two-layer network: Dense(hidden, relu) followed by Dense(d_model)."""
    ffn = tf.keras.Sequential()
    ffn.add(tf.keras.layers.Dense(hidden, activation='relu'))
    ffn.add(tf.keras.layers.Dense(d_model))
    return ffn

In [None]:
class PointwiseFeedForward(Layer):
    """Feed-forward block: Dense(hidden, relu) -> Dropout -> Dense(d_model)."""

    def __init__(self, d_model, hidden, drop_prob=0.1):
        """
        Args:
            d_model: output width of the block.
            hidden: width of the inner ReLU layer.
            drop_prob: dropout rate applied between the two Dense layers.
        """
        super(PointwiseFeedForward, self).__init__()
        self.linear1 = Dense(hidden, activation='relu')
        self.linear2 = Dense(d_model)
        self.dropout = Dropout(rate=drop_prob)

    def call(self, x, training=None):
        """Apply the block to ``x``.

        Args:
            x: input tensor.
            training: optional training flag handed to the dropout layer, matching
                how every other Dropout in this file is called. Callers that omit
                it keep working: None leaves the decision to Keras.
        """
        x = self.linear1(x)
        x = self.dropout(x, training=training)
        x = self.linear2(x)
        return x

#### Fundamental Unit of Transformer encoder

In [None]:
class EncoderLayer(Layer):
    """One encoder block: self-attention and feed-forward sub-layers.

    Each sub-layer output goes through dropout, is added to the sub-layer input
    (residual connection) and is then layer-normalised.
    """

    def __init__(self, d_model, num_heads, hidden, rate=0.1):
        super(EncoderLayer, self).__init__()

        # Self-attention sub-layer.
        self.attention = MultiHeadAttention(d_model, num_heads)
        self.norm1 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate=rate)

        # Feed-forward sub-layer.
        self.ffn = PointwiseFeedForward(d_model, hidden, drop_prob=rate)
        self.norm2 = LayerNormalization(epsilon=1e-6)
        self.dropout2 = Dropout(rate=rate)

    def call(self, x, training, mask):
        """Run the block on ``x``; ``mask`` is passed to the self-attention."""
        attn_out, _ = self.attention(x, x, x, mask)
        attn_out = self.dropout1(attn_out, training=training)
        x = self.norm1(x + attn_out)

        ffn_out = self.ffn(x)
        ffn_out = self.dropout2(ffn_out, training=training)
        return self.norm2(x + ffn_out)


#### Fundamental Unit of Transformer decoder

In [None]:
# class DecoderLayer(Layer):
#     def __init__(self, d_model, num_heads, hidden, rate=0.1):
#         super(DecoderLayer, self).__init__()

#         self.mha1 = MultiHeadAttention(d_model, num_heads)
#         self.mha2 = MultiHeadAttention(d_model, num_heads)

#         # self.ffn = point_wise_feed_forward_network(d_model, hidden)
#         self.ffn = PointwiseFeedForward(d_model, hidden, drop_prob=rate)

#         self.layernorm1 = LayerNormalization(epsilon=1e-6)
#         self.layernorm2 = LayerNormalization(epsilon=1e-6)
#         self.layernorm3 = LayerNormalization(epsilon=1e-6)

#         self.dropout1 = Dropout(rate)
#         self.dropout2 = Dropout(rate)
#         self.dropout3 = Dropout(rate)


#     def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
#         attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)
#         attn1 = self.dropout1(attn1, training=training)
#         out1 = self.layernorm1(attn1 + x)

#         attn2, attn_weights_block2 = self.mha2(out1, enc_output, enc_output,  padding_mask)
#         attn2 = self.dropout2(attn2, training=training)
#         out2 = self.layernorm2(attn2 + out1)

#         ffn_output = self.ffn(out2)
#         ffn_output = self.dropout3(ffn_output, training=training)
#         out3 = self.layernorm3(ffn_output + out2)

#         return out3, attn_weights_block1, attn_weights_block2
class DecoderLayer(Layer):
    """One decoder block: masked self-attention, encoder-decoder attention, feed-forward.

    Each sub-layer output goes through dropout, is added to the sub-layer input
    (residual connection) and is then layer-normalised.
    """

    def __init__(self, d_model, num_heads, hidden, rate=0.1):
        super(DecoderLayer, self).__init__()

        self.attention1 = MultiHeadAttention(d_model, num_heads)  # self-attention
        self.attention2 = MultiHeadAttention(d_model, num_heads)  # attention over enc_output

        self.ffn = PointwiseFeedForward(d_model, hidden, drop_prob=rate)

        self.norm1 = LayerNormalization(epsilon=1e-6)
        self.norm2 = LayerNormalization(epsilon=1e-6)
        self.norm3 = LayerNormalization(epsilon=1e-6)

        self.dropout1 = Dropout(rate=rate)
        self.dropout2 = Dropout(rate=rate)
        self.dropout3 = Dropout(rate=rate)

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        """Run the block.

        Returns:
            ``(output, self_attention_weights, encoder_decoder_attention_weights)``.
        """
        # 1) Self-attention over the decoder input, restricted by look_ahead_mask.
        self_attn, attention_weights1 = self.attention1(x, x, x, look_ahead_mask)
        self_attn = self.dropout1(self_attn, training=training)
        self_attn = self.norm1(self_attn + x)

        # 2) Queries from the decoder, keys/values from the encoder output.
        cross_attn, attention_weights2 = self.attention2(self_attn, enc_output, enc_output, padding_mask)
        cross_attn = self.dropout2(cross_attn, training=training)
        cross_attn = self.norm2(cross_attn + self_attn)

        # 3) Position-wise feed-forward.
        ffn_out = self.ffn(cross_attn)
        ffn_out = self.dropout3(ffn_out, training=training)
        ffn_out = self.norm3(ffn_out + cross_attn)

        return ffn_out, attention_weights1, attention_weights2


#### Encoder consisting of multiple EncoderLayer(s)

In [None]:
class Encoder(Layer):
    """Token embedding + positional encoding followed by ``num_layers`` EncoderLayer blocks."""

    def __init__(self,  d_model, num_layers, num_heads, hidden, input_vocab_size, max_pos_encoding, rate=0.1):
        super(Encoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = Embedding(input_vocab_size, d_model)
        # Table of max_pos_encoding positions; call() slices it to the input length.
        self.pos_encoding = positional_encoding(max_pos_encoding, self.d_model)

        self.enc_layers = [EncoderLayer(d_model, num_heads, hidden, rate) for _ in range(num_layers)]

    def call(self, x, training, mask):
        """Encode token ids ``x``; ``mask`` is passed to every encoder layer."""
        seq_len = tf.shape(x)[1]

        # Embeddings are scaled by sqrt(d_model) before the positions are added.
        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        for enc_layer in self.enc_layers:
            x = enc_layer(x, training, mask)

        return x


#### Decoder consisting of multiple DecoderLayer(s)

In [None]:
class Decoder(Layer):
    """Token embedding + positional encoding + dropout followed by ``num_layers`` DecoderLayer blocks."""

    def __init__(self, d_model, num_layers,  num_heads, hidden, target_vocab_size, max_pos_encoding, rate=0.1):
        super(Decoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = Embedding(target_vocab_size, d_model)
        # Table of max_pos_encoding positions; call() slices it to the input length.
        self.pos_encoding = positional_encoding(max_pos_encoding, d_model)

        self.dec_layers = [DecoderLayer(d_model, num_heads, hidden, rate) for _ in range(num_layers)]
        self.dropout = Dropout(rate)

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        """Decode token ids ``x`` against ``enc_output``.

        Returns:
            ``(output, attention_weights)`` where ``attention_weights`` maps
            'decoder_layer{n}_block1' / 'decoder_layer{n}_block2' (n starting at 1)
            to the two attention-weight tensors of each layer.
        """
        seq_len = tf.shape(x)[1]

        # Embeddings are scaled by sqrt(d_model) before the positions are added.
        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        attention_weights = {}
        for layer_no, dec_layer in enumerate(self.dec_layers, start=1):
            x, block1, block2 = dec_layer(x, enc_output, training, look_ahead_mask, padding_mask)

            attention_weights['decoder_layer{}_block1'.format(layer_no)] = block1
            attention_weights['decoder_layer{}_block2'.format(layer_no)] = block2

        return x, attention_weights


#### Finally, the Transformer

In [None]:
class Transformer(Model):
    """Encoder-decoder model with a final Dense projection onto the target vocabulary."""

    def __init__(self, d_model, num_layers, num_heads, hidden, input_vocab_size, target_vocab_size, max_pos_input, max_pos_target, rate=0.1):
        super(Transformer, self).__init__()

        self.encoder = Encoder(d_model, num_layers, num_heads, hidden, input_vocab_size, max_pos_input, rate)
        self.decoder = Decoder(d_model, num_layers, num_heads, hidden, target_vocab_size, max_pos_target, rate)

        # One logit per target-vocabulary entry.
        self.final_layer = tf.keras.layers.Dense(target_vocab_size)

    def call(self, inp, tar, training, enc_padding_mask, look_ahead_mask, dec_padding_mask):
        """Run encoder and decoder.

        Returns:
            ``(logits, attention_weights)``: the output of the final Dense layer and
            the attention-weight dictionary produced by the decoder.
        """
        encoded = self.encoder(inp, training, enc_padding_mask)
        decoded, attention_weights = self.decoder(tar, encoded, training, look_ahead_mask, dec_padding_mask)
        return self.final_layer(decoded), attention_weights


### Training

In [None]:
# hyper-params
num_layers = 4  # number of stacked EncoderLayer / DecoderLayer blocks
d_model = 128  # embedding / model width
hidden = 512  # inner width of the point-wise feed-forward block
num_heads = 8  # attention heads per MultiHeadAttention (head width = d_model // num_heads)
EPOCHS = 5  # number of passes over the dataset in the training loop

#### Adam optimizer with custom learning rate scheduling

In [None]:
class CustomSchedule(LearningRateSchedule):
    """Warm-up learning-rate schedule from the original Transformer paper.

    lrate = d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)
    """

    def __init__(self, d_model, warmup_steps=4000):
        """
        Args:
            d_model: model width; the rate is scaled by d_model^-0.5.
            warmup_steps: number of steps over which the rate increases linearly.
        """
        super().__init__()

        # Raw constructor arguments, kept so get_config() can return them.
        self._init_d_model = d_model
        self._init_warmup_steps = warmup_steps

        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        """Return the learning rate for optimizer step ``step``."""
        step = tf.cast(step, dtype=tf.float32)
        arg1 = tf.math.rsqrt(step)                    # decay branch: step^-0.5
        arg2 = step * (self.warmup_steps ** -1.5)     # warm-up branch: linear in step

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

    def get_config(self):
        """Return the constructor arguments so the schedule can be serialized."""
        return {'d_model': self._init_d_model, 'warmup_steps': self._init_warmup_steps}


#### Defining losses and other metrics

In [None]:
learning_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

In [None]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

In [None]:
def loss_function(real, pred):
    """Cross-entropy averaged over the non-padding target positions.

    Args:
        real: target token ids; id 0 is treated as padding and excluded.
        pred: model outputs handed to the module-level ``loss_object``.

    Returns:
        Sum of the per-position losses at non-padding positions divided by the
        number of such positions; 0 when there are no non-padding positions.
    """
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    # divide_no_nan returns 0 instead of NaN if every target position is padding.
    return tf.math.divide_no_nan(tf.reduce_sum(loss_), tf.reduce_sum(mask))


In [None]:
train_loss = tf.keras.metrics.Mean(name='train_loss')

#### Transformer

In [None]:
# The positional-encoding tables only need one row per sequence position, so
# they are sized by the padded sequence lengths rather than by the vocabulary
# sizes (which would allocate tens of thousands of rows that are never read).
transformer = Transformer(
    d_model,
    num_layers,
    num_heads,
    hidden,
    encoder_vocab_size,
    decoder_vocab_size,
    max_pos_input=encoder_maxlen,
    max_pos_target=decoder_maxlen,
)

#### Masks

In [None]:
def create_masks(inputs, targets):
    """Build the three masks passed to the Transformer.

    Returns:
        ``(enc_padding_mask, combined_mask, dec_padding_mask)``:
        the two padding masks are both computed from ``inputs``; ``combined_mask``
        is the element-wise maximum of the padding mask of ``targets`` and the
        look-ahead mask for the target length.
    """
    # Both masks hide the padded positions of the source sequence.
    enc_padding_mask = create_padding_mask(inputs)
    dec_padding_mask = create_padding_mask(inputs)

    # A target position is masked if it is padding or lies in the future.
    target_len = tf.shape(targets)[1]
    combined_mask = tf.maximum(
        create_padding_mask(targets),
        create_look_ahead_mask(target_len),
    )

    return enc_padding_mask, combined_mask, dec_padding_mask

#### Checkpoints

In [None]:
checkpoint_path = "checkpoints"

# Track the model together with the optimizer, keeping at most five checkpoints.
ckpt = tf.train.Checkpoint(transformer=transformer, optimizer=optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# Resume from the newest checkpoint under `checkpoint_path` when one exists.
latest_checkpoint = ckpt_manager.latest_checkpoint
if latest_checkpoint:
    ckpt.restore(latest_checkpoint)
    print('Latest checkpoint restored!')


Latest checkpoint restored!


#### Training steps

In [None]:
@tf.function
def train_step(inp, tar):
    """Run one optimisation step on a batch and record its loss in ``train_loss``.

    Args:
        inp: batch of source token ids.
        tar: batch of target token ids.
    """
    # Teacher forcing: the decoder is fed the target without its last token and
    # is trained to predict the target shifted left by one position.
    tar_inp, tar_real = tar[:, :-1], tar[:, 1:]

    masks = create_masks(inp, tar_inp)  # (enc_padding, combined, dec_padding)

    with tf.GradientTape() as tape:
        predictions, _ = transformer(inp, tar_inp, True, *masks)
        loss = loss_function(tar_real, predictions)

    variables = transformer.trainable_variables
    optimizer.apply_gradients(zip(tape.gradient(loss, variables), variables))

    train_loss(loss)

In [None]:
# Progress is printed for three batches per epoch (first, middle, last):
# 55k samples / 64 per batch ~ 858 batches, and 858 / 2 = 429.
LOG_EVERY_N_BATCHES = 429
# A checkpoint is written every this many epochs.
SAVE_EVERY_N_EPOCHS = 5

for epoch in range(EPOCHS):
    start = time.time()

    # The running mean is restarted so each epoch reports its own average loss.
    train_loss.reset_states()

    for (batch, (inp, tar)) in enumerate(ds):
        train_step(inp, tar)

        if batch % LOG_EVERY_N_BATCHES == 0:
            print ('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, batch, train_loss.result()))

    if (epoch + 1) % SAVE_EVERY_N_EPOCHS == 0:
        ckpt_save_path = ckpt_manager.save()
        print ('Saving checkpoint for epoch {} at {}'.format(epoch+1, ckpt_save_path))

    print ('Epoch {} Loss {:.4f}'.format(epoch + 1, train_loss.result()))

    print ('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))

(64, 400, 128)
(64, 400, 128)
(64, 400, 128)
(64, 400, 128)
(64, 74, 128)
(64, 74, 128)
(64, 74, 128)
(64, 74, 128)
(64, 74, 128)
(64, 74, 128)
(64, 74, 128)
(64, 74, 128)
(64, 400, 128)
(64, 400, 128)
(64, 400, 128)
(64, 400, 128)
(64, 74, 128)
(64, 74, 128)
(64, 74, 128)
(64, 74, 128)
(64, 74, 128)
(64, 74, 128)
(64, 74, 128)
(64, 74, 128)
Epoch 1 Batch 0 Loss 8.5961
Epoch 1 Batch 429 Loss 5.3961
Epoch 1 Batch 858 Loss 4.3236
Epoch 1 Loss 4.3198
Time taken for 1 epoch: 370.92382431030273 secs

Epoch 2 Batch 0 Loss 2.5660
Epoch 2 Batch 429 Loss 2.2479
Epoch 2 Batch 858 Loss 2.0138
Epoch 2 Loss 2.0131
Time taken for 1 epoch: 331.52879095077515 secs

Epoch 3 Batch 0 Loss 1.4927
Epoch 3 Batch 429 Loss 1.4243
Epoch 3 Batch 858 Loss 1.3569
Epoch 3 Loss 1.3566
Time taken for 1 epoch: 331.2192060947418 secs

Epoch 4 Batch 0 Loss 1.1613
Epoch 4 Batch 429 Loss 1.0922
Epoch 4 Batch 858 Loss 1.0670
Epoch 4 Loss 1.0665
Time taken for 1 epoch: 331.4067430496216 secs

Epoch 5 Batch 0 Loss 0.9217
Ep

### Inference

#### The decoder predicts one word at a time; each predicted word is appended to the output, and the whole output sequence is fed back into the decoder until `decoder_maxlen` is reached or the `<end>` token is predicted

In [None]:
def evaluate(input_news):
    """Greedily decode a summary for one news text.

    Args:
        input_news: raw article text.

    Returns:
        ``(token_ids, attention_weights)``: the generated summary token ids as a
        1-D tensor starting with the '<start>' id (the '<end>' id is never
        appended), and the attention weights of the last decoding step.
    """
    # Tokenize and pad/truncate the article to encoder_maxlen.
    token_ids = news_tokenizer.texts_to_sequences([input_news])
    token_ids = pad_sequences(token_ids, maxlen=encoder_maxlen, padding='post', truncating='post')
    encoder_input = tf.expand_dims(token_ids[0], 0)

    # The decoder input starts as a batch of one sequence holding only '<start>'.
    output = tf.expand_dims([summary_tokenizer.word_index['<start>']], 0)

    for _ in range(decoder_maxlen):
        masks = create_masks(encoder_input, output)  # (enc_padding, combined, dec_padding)
        predictions, attention_weights = transformer(encoder_input, output, False, *masks)

        # Only the prediction for the last position is new; take its arg-max.
        last_step = predictions[:, -1:, :]
        predicted_id = tf.cast(tf.argmax(last_step, axis=-1), tf.int32)

        if predicted_id == summary_tokenizer.word_index['<end>']:
            break

        output = tf.concat([output, predicted_id], axis=-1)

    return tf.squeeze(output, axis=0), attention_weights


In [None]:
def summarize(input_news):
    """Return the generated summary of ``input_news`` as text.

    The attention weights returned by ``evaluate`` are ignored here; they could
    be used to plot attention heatmaps.
    """
    token_ids, _ = evaluate(input_news=input_news)
    without_start = token_ids.numpy()[1:]  # drop the leading <START> token
    # sequences_to_texts expects a batch; there is exactly one sequence.
    return summary_tokenizer.sequences_to_texts([without_start])[0]

In [None]:


def checkinbulk(randomnumber):
    """Print the model's summary next to the article and its reference headline.

    Args:
        randomnumber: index into the global ``news`` / ``summary`` series.
    """
    # This line shows the model output, so it is labelled as the prediction.
    print('Predicted summary:', summarize(news[randomnumber]))

    print('News: ', news[randomnumber])
    # Strip the '<START> ' prefix (8 characters) and the ' <END>' suffix (6 characters).
    print('Actual summary: ', summary[randomnumber][len('<START> '):-len(' <END>')])



In [None]:
import random

# randrange excludes its upper bound, so the index always refers to an existing
# row; randint(0, 55104) is inclusive and could return 55104, one past the end.
random_number = random.randrange(len(news))

checkinbulk(random_number)

(1, 400, 128)
(1, 400, 128)
(1, 400, 128)
(1, 400, 128)
(1, 1, 128)
(1, 1, 128)
(1, 1, 128)
(1, 1, 128)
(1, 1, 128)
(1, 1, 128)
(1, 1, 128)
(1, 1, 128)
(1, 400, 128)
(1, 400, 128)
(1, 400, 128)
(1, 400, 128)
(1, 2, 128)
(1, 2, 128)
(1, 2, 128)
(1, 2, 128)
(1, 2, 128)
(1, 2, 128)
(1, 2, 128)
(1, 2, 128)
(1, 400, 128)
(1, 400, 128)
(1, 400, 128)
(1, 400, 128)
(1, 3, 128)
(1, 3, 128)
(1, 3, 128)
(1, 3, 128)
(1, 3, 128)
(1, 3, 128)
(1, 3, 128)
(1, 3, 128)
(1, 400, 128)
(1, 400, 128)
(1, 400, 128)
(1, 400, 128)
(1, 4, 128)
(1, 4, 128)
(1, 4, 128)
(1, 4, 128)
(1, 4, 128)
(1, 4, 128)
(1, 4, 128)
(1, 4, 128)
(1, 400, 128)
(1, 400, 128)
(1, 400, 128)
(1, 400, 128)
(1, 5, 128)
(1, 5, 128)
(1, 5, 128)
(1, 5, 128)
(1, 5, 128)
(1, 5, 128)
(1, 5, 128)
(1, 5, 128)
(1, 400, 128)
(1, 400, 128)
(1, 400, 128)
(1, 400, 128)
(1, 6, 128)
(1, 6, 128)
(1, 6, 128)
(1, 6, 128)
(1, 6, 128)
(1, 6, 128)
(1, 6, 128)
(1, 6, 128)
(1, 400, 128)
(1, 400, 128)
(1, 400, 128)
(1, 400, 128)
(1, 7, 128)
(1, 7, 128)
(1, 7, 1

In [None]:
summarize(
    "A historic achievement has been made in the realm of space exploration. Astronomers have detected the presence of an Earth-like planet orbiting a distant star within the habitable zone. This exciting discovery raises the possibility of finding extraterrestrial life and provides valuable insights into the existence of other habitable worlds beyond our own. Scientists are now planning detailed observations and future missions to explore this intriguing exoplanet further. The discovery marks a significant milestone in our quest to unravel the mysteries of the universe and understand our place in the cosmos"
)

(1, 400, 128)
(1, 400, 128)
(1, 400, 128)
(1, 400, 128)
(1, 1, 128)
(1, 1, 128)
(1, 1, 128)
(1, 1, 128)
(1, 1, 128)
(1, 1, 128)
(1, 1, 128)
(1, 1, 128)
(1, 400, 128)
(1, 400, 128)
(1, 400, 128)
(1, 400, 128)
(1, 2, 128)
(1, 2, 128)
(1, 2, 128)
(1, 2, 128)
(1, 2, 128)
(1, 2, 128)
(1, 2, 128)
(1, 2, 128)
(1, 400, 128)
(1, 400, 128)
(1, 400, 128)
(1, 400, 128)
(1, 3, 128)
(1, 3, 128)
(1, 3, 128)
(1, 3, 128)
(1, 3, 128)
(1, 3, 128)
(1, 3, 128)
(1, 3, 128)
(1, 400, 128)
(1, 400, 128)
(1, 400, 128)
(1, 400, 128)
(1, 4, 128)
(1, 4, 128)
(1, 4, 128)
(1, 4, 128)
(1, 4, 128)
(1, 4, 128)
(1, 4, 128)
(1, 4, 128)
(1, 400, 128)
(1, 400, 128)
(1, 400, 128)
(1, 400, 128)
(1, 5, 128)
(1, 5, 128)
(1, 5, 128)
(1, 5, 128)
(1, 5, 128)
(1, 5, 128)
(1, 5, 128)
(1, 5, 128)
(1, 400, 128)
(1, 400, 128)
(1, 400, 128)
(1, 400, 128)
(1, 6, 128)
(1, 6, 128)
(1, 6, 128)
(1, 6, 128)
(1, 6, 128)
(1, 6, 128)
(1, 6, 128)
(1, 6, 128)
(1, 400, 128)
(1, 400, 128)
(1, 400, 128)
(1, 400, 128)
(1, 7, 128)
(1, 7, 128)
(1, 7, 1

'newly made new space detected for earth like planet'

In [None]:
transformer.summary()

Model: "transformer_15"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder_15 (Encoder)        multiple                  10567424  
                                                                 
 decoder_15 (Decoder)        multiple                  4854912   
                                                                 
 dense_4911 (Dense)          multiple                  3826269   
                                                                 
Total params: 19,248,605
Trainable params: 19,248,605
Non-trainable params: 0
_________________________________________________________________
