<a href="https://colab.research.google.com/github/aliakbarbadri/natural-language-inference/blob/master/snli.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import tensorflow as tf
import numpy as np
import unicodedata
import re
import matplotlib.pyplot as plt
import pandas as pd
import os
import imageio

# Load data

In [3]:
MODE = 'train'
BATCH_SIZE = 64
EMBEDDING_SIZE = 256
RNN_SIZE = 512
NUM_EPOCHS = 15
ATTENTION_FUNC = 'concat'

In [34]:
source = pd.read_pickle(r'https://github.com/aliakbarbadri/natural-language-inference/blob/master/premises_train.pickle?raw=true')
target = pd.read_pickle(r'https://github.com/aliakbarbadri/natural-language-inference/blob/master/hypotheses_train.pickle?raw=true')

In [5]:
print(source[0])
print(target[0])

A person on a horse jumps over a broken down airplane .
A person is outdoors , on a horse .


In [6]:
src = source[:5000]
trg = target[:5000]

In [7]:
def unicode_to_ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

def normalize_string(s):
    s = unicode_to_ascii(s)
    s = re.sub(r'([!.?])', r' \1', s)
    s = re.sub(r'[^a-zA-Z.!?]+', r' ', s)
    s = re.sub(r'\s+', r' ', s)
    return s

In [8]:
raw_data_src = [normalize_string(data) for data in src]
raw_data_trg_in = ['<start> ' + normalize_string(data) for data in trg]
raw_data_trg_out = [normalize_string(data) + ' <end>' for data in trg]

In [9]:
src_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
src_tokenizer.fit_on_texts(raw_data_src)
data_src = src_tokenizer.texts_to_sequences(raw_data_src)
data_src = tf.keras.preprocessing.sequence.pad_sequences(data_src,
                                                        padding='post')
print(data_src[:2])

[[   1   52    6    1  229  209   70    1 1316   40  545    2    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0]
 [  57  134    5  825   15   66    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0]]


In [10]:
trg_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
trg_tokenizer.fit_on_texts(raw_data_trg_in)
trg_tokenizer.fit_on_texts(raw_data_trg_out)
data_trg_in = trg_tokenizer.texts_to_sequences(raw_data_trg_in)
data_trg_in = tf.keras.preprocessing.sequence.pad_sequences(data_trg_in,
                                                           padding='post')
print(data_trg_in[:2])

data_trg_out = trg_tokenizer.texts_to_sequences(raw_data_trg_out)
data_trg_out = tf.keras.preprocessing.sequence.pad_sequences(data_trg_out,
                                                            padding='post')
print(data_trg_out[:2])

[[  3   1  19   6  39  11   1 148   2   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  3  15   7  49 378   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]]
[[  1  19   6  39  11   1 148   2   4   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [ 15   7  49 378   4   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]]


In [11]:
dataset = tf.data.Dataset.from_tensor_slices(
    (data_src, data_trg_in, data_trg_out))
dataset = dataset.shuffle(len(raw_data_src)).batch(
    BATCH_SIZE, drop_remainder=True)

In [12]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_size, lstm_size):
        super(Encoder, self).__init__()
        self.lstm_size = lstm_size
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)
        self.lstm = tf.keras.layers.LSTM(
            lstm_size, return_sequences=True, return_state=True)

    def call(self, sequence, states):
        embed = self.embedding(sequence)
        output, state_h, state_c = self.lstm(embed, initial_state=states)

        return output, state_h, state_c

    def init_states(self, batch_size):
        return (tf.zeros([batch_size, self.lstm_size]),
                tf.zeros([batch_size, self.lstm_size]))


src_vocab_size = len(src_tokenizer.word_index) + 1

encoder = Encoder(src_vocab_size, EMBEDDING_SIZE, RNN_SIZE)
initial_state = encoder.init_states(1)

In [13]:
test_encoder_output = encoder(tf.constant(
    [[1, 23, 4, 5, 0, 0]]), initial_state)
print(test_encoder_output[0].shape)

(1, 6, 512)


In [14]:
class LuongAttention(tf.keras.Model):
  def __init__(self, rnn_size, attention_func):
    super(LuongAttention, self).__init__()
    self.attention_func = attention_func
    if attention_func not in ['dot', 'general', 'concat']:
      raise ValueError(
        'Unknown attention score function! Must be either dot, general or concat.')
    if attention_func == 'general':
    # General score function
      self.wa = tf.keras.layers.Dense(rnn_size)
    elif attention_func == 'concat':
      # Concat score function
      self.wa = tf.keras.layers.Dense(rnn_size, activation='tanh')
      self.va = tf.keras.layers.Dense(1)
  def call(self, decoder_output, encoder_output):
    if self.attention_func == 'dot':
      # Dot score function: decoder_output (dot) encoder_output
      # decoder_output has shape: (batch_size, 1, rnn_size)
      # encoder_output has shape: (batch_size, max_len, rnn_size)
      # => score has shape: (batch_size, 1, max_len)
      score = tf.matmul(decoder_output, encoder_output, transpose_b=True)
    elif self.attention_func == 'general':
      # General score function: decoder_output (dot) (Wa (dot) encoder_output)
      # decoder_output has shape: (batch_size, 1, rnn_size)
      # encoder_output has shape: (batch_size, max_len, rnn_size)
      # => score has shape: (batch_size, 1, max_len)
      score = tf.matmul(decoder_output, self.wa(encoder_output), transpose_b=True)
    elif self.attention_func == 'concat':
      # Concat score function: va (dot) tanh(Wa (dot) concat(decoder_output + encoder_output))
      # Decoder output must be broadcasted to encoder output's shape first
      decoder_output = tf.tile(
      decoder_output, [1, encoder_output.shape[1], 1])
      # Concat => Wa => va
      # (batch_size, max_len, 2 * rnn_size) => (batch_size, max_len, rnn_size) => (batch_size, max_len, 1)
      score = self.va(
    self.wa(tf.concat((decoder_output, encoder_output), axis=-1)))
    # Transpose score vector to have the same shape as other two above
    # (batch_size, max_len, 1) => (batch_size, 1, max_len)
    score = tf.transpose(score, [0, 2, 1])
    # alignment a_t = softmax(score)
    alignment = tf.nn.softmax(score, axis=2)
    # context vector c_t is the weighted average sum of encoder output
    context = tf.matmul(alignment, encoder_output)
    return context, alignment

In [15]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_size, rnn_size, attention_func):
    super(Decoder, self).__init__()
    self.attention = LuongAttention(rnn_size, attention_func)
    self.rnn_size = rnn_size
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)
    self.lstm = tf.keras.layers.LSTM(
    rnn_size, return_sequences=True, return_state=True)
    self.wc = tf.keras.layers.Dense(rnn_size, activation='tanh')
    self.ws = tf.keras.layers.Dense(vocab_size)
  def call(self, sequence, state, encoder_output):
    # Remember that the input to the decoder
    # is now a batch of one-word sequences,
    # which means that its shape is (batch_size, 1)
    embed = self.embedding(sequence)
    # Therefore, the lstm_out has shape (batch_size, 1, rnn_size)
    lstm_out, state_h, state_c = self.lstm(embed, initial_state=state)
    # Use self.attention to compute the context and alignment vectors
    # context vector's shape: (batch_size, 1, rnn_size)
    # alignment vector's shape: (batch_size, 1, source_length)
    context, alignment = self.attention(lstm_out, encoder_output)
    # Combine the context vector and the LSTM output
    # Before combined, both have shape of (batch_size, 1, rnn_size),
    # so let's squeeze the axis 1 first
    # After combined, it will have shape of (batch_size, 2 * rnn_size)
    lstm_out = tf.concat(
                [tf.squeeze(context, 1), tf.squeeze(lstm_out, 1)], 1)
    # lstm_out now has shape (batch_size, rnn_size)
    lstm_out = self.wc(lstm_out)
    # Finally, it is converted back to vocabulary space: (batch_size, vocab_size)
    logits = self.ws(lstm_out)
    return logits, state_h, state_c, alignment


# trg_vocab_size = len(trg_tokenizer.word_index) + 1
# decoder = Decoder(trg_vocab_size, EMBEDDING_SIZE, LSTM_SIZE)
# de_initial_state = test_encoder_output[1:]

trg_vocab_size = len(trg_tokenizer.word_index) + 1
decoder = Decoder(trg_vocab_size, EMBEDDING_SIZE, RNN_SIZE, ATTENTION_FUNC)

In [43]:
# These lines can be used for debugging purpose
# Or can be seen as a way to build the models

# initial_state = encoder.init_states(1)
# encoder_outputs = encoder(tf.constant([[1]]), initial_state)
# decoder_outputs = decoder(tf.constant(
#     [[1]]), encoder_outputs[1:], encoder_outputs[0])

In [16]:
def loss_func(targets, logits):
  crossentropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

  mask = tf.math.logical_not(tf.math.equal(targets, 0))
  mask = tf.cast(mask, dtype=tf.int64)
  loss = crossentropy(targets, logits, sample_weight=mask)
  return loss

optimizer = tf.keras.optimizers.Adam(clipnorm=5.0)

In [23]:
accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

In [24]:
@tf.function
def train_step(source_seq, target_seq_in, target_seq_out, en_initial_states):
  loss = 0
  acc = 0
  with tf.GradientTape() as tape:
    en_outputs = encoder(source_seq, en_initial_states)
    en_states = en_outputs[1:]
    de_state_h, de_state_c = en_states
    
    # We need to create a loop to iterate through the target sequences
    for i in range(target_seq_out.shape[1]):
      # Input to the decoder must have shape of (batch_size, length)
      # so we need to expand one dimension
      decoder_in = tf.expand_dims(target_seq_in[:, i], 1)
      logit, de_state_h, de_state_c, _ = decoder(
      decoder_in, (de_state_h, de_state_c), en_outputs[0])

      # The loss is now accumulated through the whole batch
      loss += loss_func(target_seq_out[:, i], logit)
      accuracy.update_state(target_seq_out[:, i], logit)
      # acc += acc_func(target_seq_out[:, i], logit)
  # print("acc",)
  # print(acc / target_seq_out.shape[1])
  variables = encoder.trainable_variables + decoder.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))
  return loss / target_seq_out.shape[1], accuracy.result()

In [18]:

if not os.path.exists('checkpoints_luong/encoder'):
  os.makedirs('checkpoints_luong/encoder')
if not os.path.exists('checkpoints_luong/decoder'):
  os.makedirs('checkpoints_luong/decoder')


# Uncomment these lines for inference mode
encoder_checkpoint = tf.train.latest_checkpoint('checkpoints_luong/encoder')
decoder_checkpoint = tf.train.latest_checkpoint('checkpoints_luong/decoder')

if encoder_checkpoint is not None and decoder_checkpoint is not None:
  encoder.load_weights(encoder_checkpoint)
  decoder.load_weights(decoder_checkpoint)

In [77]:
# train_dataset, test_dataset = train_test_split(dataset,test_size=0.2)
# dataset.take(-1)

<TakeDataset shapes: ((64, 52), (64, 35), (64, 35)), types: (tf.int32, tf.int32, tf.int32)>

In [26]:
! rm -rf ./checkpoints_luong/encoder
! rm -rf ./checkpoints_luong/decoder
! mkdir ./checkpoints_luong/encoder
! mkdir ./checkpoints_luong/decoder

In [27]:
if MODE == 'train':
  for e in range(NUM_EPOCHS):
    en_initial_states = encoder.init_states(BATCH_SIZE)
    encoder.save_weights('checkpoints_luong/encoder/encoder_{}.h5'.format(e + 1))
    decoder.save_weights('checkpoints_luong/decoder/decoder_{}.h5'.format(e + 1))
    
    for batch, (source_seq, target_seq_in, target_seq_out) in enumerate(dataset.take(-1)):
      loss, acc = train_step(source_seq, target_seq_in,target_seq_out, en_initial_states)
      
      if batch % 100 == 0:
        print('Epoch {}, Batch {}, Loss {:.4f} , Acc {:.4f}'.format(e + 1, batch, loss.numpy(), acc.numpy()))

Epoch 1, Batch 0, Loss 1.9721 , Acc 0.0000
Epoch 2, Batch 0, Loss 0.9815 , Acc 0.0568
Epoch 3, Batch 0, Loss 0.8765 , Acc 0.0697
Epoch 4, Batch 0, Loss 0.7569 , Acc 0.0765
Epoch 5, Batch 0, Loss 0.8037 , Acc 0.0810
Epoch 6, Batch 0, Loss 0.8641 , Acc 0.0844
Epoch 7, Batch 0, Loss 0.6659 , Acc 0.0872
Epoch 8, Batch 0, Loss 0.6744 , Acc 0.0896
Epoch 9, Batch 0, Loss 0.5517 , Acc 0.0917
Epoch 10, Batch 0, Loss 0.5254 , Acc 0.0937
Epoch 11, Batch 0, Loss 0.5420 , Acc 0.0957
Epoch 12, Batch 0, Loss 0.4876 , Acc 0.0976
Epoch 13, Batch 0, Loss 0.4798 , Acc 0.0997
Epoch 14, Batch 0, Loss 0.4232 , Acc 0.1017
Epoch 15, Batch 0, Loss 0.4046 , Acc 0.1037


In [29]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [30]:
! cp -r checkpoints_luong "/content/drive/My Drive/nn/"

In [31]:
def predict(test_source_text=None):
  if test_source_text is None:
    test_source_text = raw_data_en[np.random.choice(len(raw_data_en))]
    print(test_source_text)
  test_source_seq = src_tokenizer.texts_to_sequences([test_source_text])
  # print(test_source_seq)
  
  en_initial_states = encoder.init_states(1)
  en_outputs = encoder(tf.constant(test_source_seq), en_initial_states)
  
  de_input = tf.constant([[trg_tokenizer.word_index['<start>']]])
  de_state_h, de_state_c = en_outputs[1:]
  out_words = []
  alignments = []
  
  while True:
    de_output, de_state_h, de_state_c, alignment = decoder(de_input, (de_state_h, de_state_c), en_outputs[0])
    de_input = tf.expand_dims(tf.argmax(de_output, -1), 0)
    out_words.append(trg_tokenizer.index_word[de_input.numpy()[0][0]])
    alignments.append(alignment.numpy())
    
    if out_words[-1] == '<end>' or len(out_words) >= 20:
      break
  return np.array(alignments), test_source_text.split(' '), out_words

In [35]:
test_src_sents = source[-5:]
test_trg_sents = target[-5:]

In [40]:
len(test_src_sents)

5

In [36]:
! rm -rf heatmap/
! mkdir heatmap
filenames = []
for i, test_sent in enumerate(test_src_sents):
  
  test_sequence = normalize_string(test_sent)
  alignments, source, prediction = predict(test_sequence)
  print("input:",test_sent)
  print("actual:",test_trg_sents[i])
  print("predicted:",' '.join(prediction)[:-6])
  print("---------")
  attention = np.squeeze(alignments, (1, 2))
  fig = plt.figure(figsize=(10, 10))
  fig.show()
  ax = fig.add_subplot(1, 1, 1)
  ax.matshow(attention, cmap='jet')
  ax.set_xticklabels([''] + source, rotation=90)
  ax.set_yticklabels([''] + prediction)
  filenames.append('heatmap/test_{}.png'.format(i))
  plt.savefig('heatmap/test_{}.png'.format(i))
  plt.close()

with imageio.get_writer('translation_heatmaps.gif', mode='I', duration=2) as writer:
  for filename in filenames:
    image = imageio.imread(filename)
    writer.append_data(image)

input: A group of four kids stand in front of a statue of a large animal .
actual: four kids standing
predicted: a group of people are playing soccer .
---------
input: a kid doing tricks on a skateboard on a bridge
actual: a kid is skateboarding
predicted: a dog is outside .
---------
input: A dog with a blue collar plays ball outside .
actual: a dog is outside
predicted: a dog is in the water .
---------
input: Four dirty and barefooted children .
actual: four children have dirty feet .
predicted: children are sitting
---------
input: A man is surfing in a bodysuit in beautiful blue water .
actual: On the beautiful blue water there is a man in a bodysuit surfing .
predicted: a man is wearing a shirt .
---------


In [None]:
# ! zip -r checkpoints_luong.zip checkpoints_luong
# ! zip -r heatmap.zip heatmap
! cp -r heatmap "/content/drive/My Drive/nn/"

# BLEU

In [37]:
! pip install -q nltk

In [44]:
inp = pd.read_pickle(r'https://github.com/aliakbarbadri/natural-language-inference/blob/master/premises_train.pickle?raw=true')
outp = pd.read_pickle(r'https://github.com/aliakbarbadri/natural-language-inference/blob/master/hypotheses_train.pickle?raw=true')

In [45]:
X_test = inp[-1000:]
y_test = outp[-1000:]

In [47]:
len(X_test),len(y_test)

(1000, 1000)

In [60]:
list_of_references = []
list_of_hypotheses = []

for i, test_sent in enumerate(X_test):
  
  test_sequence = normalize_string(test_sent)
  alignments, source, prediction = predict(test_sequence)
  list_of_references.append([y_test[i].split()[:-1]])
  list_of_hypotheses.append(prediction[:-1])
  # print("input:",test_sent)
  # print("actual:",test_trg_sents[i])
  # print("predicted:",' '.join(prediction)[:-6])
  # print("---------")

In [62]:
list_of_references[0]

[['A', 'group', 'of', 'four', 'are', 'sitting', 'on', 'the', 'sidewalk']]

In [63]:
list_of_hypotheses[0]

['people', 'are', 'on', 'a', 'sidewalk']

In [61]:
import nltk

nltk.bleu_score.corpus_bleu(list_of_references, list_of_hypotheses)

0.035916743120916805