# Setup

In [None]:
import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split
import pandas as pd

import unicodedata
import re
import numpy as np
import os
import io
import time

In [1]:
# Download the SemEval-2020 Task 4 (Commonsense Validation and Explanation) repository.
# The cells below read its CSV files from /content/SemEval2020-Task4-Commonsense-Validation-and-Explanation/.
!git clone https://github.com/wangcunxiang/SemEval2020-Task4-Commonsense-Validation-and-Explanation.git

Cloning into 'SemEval2020-Task4-Commonsense-Validation-and-Explanation'...
remote: Enumerating objects: 88, done.[K
remote: Counting objects: 100% (88/88), done.[K
remote: Compressing objects: 100% (66/66), done.[K
remote: Total 88 (delta 36), reused 64 (delta 19), pack-reused 0[K
Unpacking objects: 100% (88/88), done.


In [None]:
#--read dataset
# Root of the cloned dataset repository. Change this single constant to run the
# notebook somewhere other than Colab's /content directory.
DATA_ROOT = '/content/SemEval2020-Task4-Commonsense-Validation-and-Explanation/ALL data'

# The answer files are read with explicit column names: an id plus three reference reasons.
ANSWER_COLUMNS = ['id', 'r1', 'r2', 'r3']


def _read_subtask_c(split_dir, data_file, answers_file):
    """Load the statements and the reference reasons of one subtask C split.

    Args:
        split_dir: Directory of the split below DATA_ROOT.
        data_file: CSV file holding the nonsensical statements.
        answers_file: CSV file holding the three reference reasons per statement.

    Returns:
        A (statements, answers) pair of DataFrames, both indexed by 'id'.
    """
    split_path = os.path.join(DATA_ROOT, split_dir)
    statements = pd.read_csv(os.path.join(split_path, data_file), index_col='id')
    answers = pd.read_csv(os.path.join(split_path, answers_file),
                          names=ANSWER_COLUMNS, index_col='id')
    return statements, answers


# The training directory name is spelled with two spaces ("Training  Data"); keep it as is.
input_train, target_train = _read_subtask_c('Training  Data', 'subtaskC_data_all.csv', 'subtaskC_answers_all.csv')
input_dev, target_dev = _read_subtask_c('Dev Data', 'subtaskC_dev_data.csv', 'subtaskC_gold_answers.csv')
input_test, target_test = _read_subtask_c('Test Data', 'subtaskC_test_data.csv', 'subtaskC_gold_answers.csv')
input_train.head(2)

Unnamed: 0_level_0,FalseSent
id,Unnamed: 1_level_1
0,He poured orange juice on his cereal.
1,He drinks apple.


In [None]:
def merge_dataset (inp_col, tar1_col, tar2_col, tar3_col, mode):
    '''
    Merge nonsensical sentences (false sent) with Referential Reasons (r1, r2, and r3) separately

    sent1    r1
    sent1    r2
    sent1    r3

    Args:
        inp_col: Sequence (list or pandas Series) of nonsensical statements.
        tar1_col, tar2_col, tar3_col: Sequences with the three reference reasons,
            aligned with inp_col by position. They are not read when mode == "test".
        mode: "test" pairs every statement once with the placeholder "<START>";
            any other value repeats every statement three times, once per reason.

    Returns:
        A DataFrame with the columns 'statment' and 'reasons'.

    Raises:
        ValueError: If a reason column has fewer entries than inp_col.
    '''
    # Rows are matched by position, never by index label, so the function also
    # works for Series whose index is not 0..N-1 (e.g. indexed by 'id').
    statements = list(inp_col)

    if mode == "test":
        data = statements
        reasons = ["<START>"] * len(statements)
    else:
        reason_cols = [list(tar1_col), list(tar2_col), list(tar3_col)]
        if any(len(col) < len(statements) for col in reason_cols):
            raise ValueError("every reason column needs at least as many rows as inp_col")

        data = []
        reasons = []
        for statement, r1, r2, r3 in zip(statements, *reason_cols):
            data.extend([statement, statement, statement])
            reasons.extend([r1, r2, r3])

    return pd.DataFrame({'statment': data, 'reasons': reasons},
                        columns=['statment', 'reasons'])

In [None]:
# Training split: every statement is paired with each of its three reasons.
dataset_train = merge_dataset(
    inp_col=input_train['FalseSent'],
    tar1_col=target_train['r1'],
    tar2_col=target_train['r2'],
    tar3_col=target_train['r3'],
    mode="train",
)

# Dev split: gets a fresh 0..N-1 index first, then is merged like the training data
# (mode "train" keeps the gold reasons).
input_dev, target_dev = input_dev.reset_index(drop=True), target_dev.reset_index(drop=True)
dataset_dev = merge_dataset(
    inp_col=input_dev['FalseSent'],
    tar1_col=target_dev['r1'],
    tar2_col=target_dev['r2'],
    tar3_col=target_dev['r3'],
    mode="train",
)

# Test split: mode "test" pairs each statement with the "<START>" placeholder
# instead of the gold reasons.
input_test, target_test = input_test.reset_index(drop=True), target_test.reset_index(drop=True)
dataset_test = merge_dataset(
    inp_col=input_test['FalseSent'],
    tar1_col=target_test['r1'],
    tar2_col=target_test['r2'],
    tar3_col=target_test['r3'],
    mode="test",
)


dataset_train.head(3)

Unnamed: 0,statment,reasons
0,He poured orange juice on his cereal.,Orange juice doesn't taste good on cereal.
1,He poured orange juice on his cereal.,Orange juice is poured in a glass.
2,He poured orange juice on his cereal.,Orange juice does not taste good on cereal.


In [None]:
#---merge all columns into one column; to build vocabulary dictionary ----
# Every statement and reason of the train, dev and test splits is stacked into a
# single Series. pd.concat replaces Series.append, which no longer exists in
# pandas >= 2.0; ignore_index=True gives the same 0..N-1 index as the former
# .reset_index(drop=True).
one_col = pd.concat(
    [
        input_train['FalseSent'], target_train['r1'], target_train['r2'], target_train['r3'],
        input_dev['FalseSent'], target_dev['r1'], target_dev['r2'], target_dev['r3'],
        input_test['FalseSent'], target_test['r1'], target_test['r2'], target_test['r3'],
    ],
    ignore_index=True,
)
one_col

0                    He poured orange juice on his cereal.
1                                         He drinks apple.
2                             Jeff ran 100,000 miles today
3                                       I sting a mosquito
4                                   A giraffe is a person.
                               ...                        
47983                        The sun only shines sunlight.
47984    Ice hockey is like an financial institution wi...
47985    Throwing water in the freezer would just make ...
47986               Sand is neither imbibable nor a fluid.
47987    2 inches of distance is not enough for a perso...
Length: 47988, dtype: object

In [None]:
def preprocess_sentence(w):
  """Normalise one sentence and wrap it in <start> / <end> markers.

  The text is lower-cased and stripped, each of ? . ! , ¿ is surrounded by
  spaces (eg: "he is a boy." => "he is a boy ."), and every run of spaces
  and/or double quotes is collapsed into a single space.

  Args:
    w: The raw sentence (a str).

  Returns:
    The normalised sentence as '<start> ... <end>', so that the model knows
    when to start and stop predicting.
  """
  lowered = w.lower().strip()

  # separate punctuation marks from the neighbouring words
  spaced = re.sub(r"([?.!,¿])", r" \1 ", lowered)

  # the character class matches both the space and the double quote
  collapsed = re.sub(r'[" "]+', " ", spaced).strip()

  return '<start> ' + collapsed + ' <end>'

In [None]:
sentence = "He poured orange juice on his cereal."
print(preprocess_sentence(sentence))


<start> he poured orange juice on his cereal . <end>


In [None]:
# Run preprocess_sentence over the statement and reason column of every split.
inp_lang_train = dataset_train['statment'].map(preprocess_sentence)
targ_lang_train = dataset_train['reasons'].map(preprocess_sentence)

inp_lang_val = dataset_dev['statment'].map(preprocess_sentence)
targ_lang_val = dataset_dev['reasons'].map(preprocess_sentence)

inp_lang_test = dataset_test['statment'].map(preprocess_sentence)
targ_lang_test = dataset_test['reasons'].map(preprocess_sentence)

# Show one preprocessed (statement, reason) pair.
print(inp_lang_train[1], targ_lang_train[1] )

<start> he poured orange juice on his cereal . <end> <start> orange juice is poured in a glass . <end>


In [None]:
# Preprocessed copy of every sentence; the shared tokenizer is fitted on it below.
one_col_pre = one_col.map(preprocess_sentence)

In [None]:
def tokenize(lang):
  """Fit a new Keras tokenizer on `lang` and encode `lang` with it.

  Args:
    lang: The texts to fit on and to encode.

  Returns:
    A (tensor, tokenizer) pair: the id sequences of `lang` padded at the end
    ('post') to a common length, and the fitted tokenizer.
  """
  fitted_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
  fitted_tokenizer.fit_on_texts(lang)

  id_sequences = fitted_tokenizer.texts_to_sequences(lang)
  padded = tf.keras.preprocessing.sequence.pad_sequences(id_sequences, padding='post')

  return padded, fitted_tokenizer

In [None]:
#---tokenizer----
# A single tokenizer is shared by statements and reasons; it is fitted on every
# preprocessed sentence collected in one_col_pre.
lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
lang_tokenizer.fit_on_texts(one_col_pre)


def _to_padded_ids(sentences):
  """Encode sentences with lang_tokenizer and pad them at the end to a common length."""
  return tf.keras.preprocessing.sequence.pad_sequences(
      lang_tokenizer.texts_to_sequences(sentences), padding='post')


#train
input_tensor = _to_padded_ids(inp_lang_train)
target_tensor = _to_padded_ids(targ_lang_train)

#Val
input_tensor_val = _to_padded_ids(inp_lang_val)
target_tensor_val = _to_padded_ids(targ_lang_val)

In [None]:
target_tensor_val.shape

(2991, 38)

In [None]:
inp_lang_train[0]

'<start> he poured orange juice on his cereal . <end>'

In [None]:
input_tensor[0]

array([   1,    9,  513, 1569,  616,   11,   16, 1570,    4,    2,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0], dtype=int32)

In [None]:
input_tensor.shape

(30000, 27)

# **Parameters**

In [None]:
# Try experimenting with the size of that dataset
# Number of training examples = rows of input_tensor (the shape tuple itself is not a count).
num_examples = input_tensor.shape[0] #30000

# Padded sequence lengths (second dimension) of the target and input tensors
max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]

In [None]:
# The training tensors are used as they are; validation data comes from the dev split.
input_tensor_train, target_tensor_train = input_tensor, target_tensor

# Row counts: train inputs, train targets, validation inputs, validation targets.
print(*(len(t) for t in (input_tensor_train, target_tensor_train, input_tensor_val, target_tensor_val)))


30000 30000 2991 2991


In [None]:
target_tensor_train.shape

(30000, 82)

In [None]:
def convert(lang, tensor):
  """Print an 'id ----> word' line for every non-zero id in `tensor`.

  Args:
    lang: Object exposing an `index_word` mapping from id to word.
    tensor: Iterable of word ids; zeros are skipped.
  """
  for t in tensor:
    if t == 0:
      continue
    print(f'{t} ----> {lang.index_word[t]}') #id --> word

print("Target Language; index to word mapping")
convert(lang_tokenizer, target_tensor_train[0])

Target Language; index to word mapping
1 ----> <start>
734 ----> orange
605 ----> juice
71 ----> doesn't
395 ----> taste
88 ----> good
17 ----> on
2029 ----> cereal
3 ----> .
2 ----> <end>


### Create a tf.data dataset

In [None]:
len(lang_tokenizer.word_index)

13580

In [None]:
# Training hyper-parameters
BATCH_SIZE = 64
BUFFER_SIZE = len(input_tensor_train)  # the shuffle buffer covers the whole training set
steps_per_epoch = BUFFER_SIZE // BATCH_SIZE
embedding_dim = 256
units = 1024

# Input and target share one tokenizer, so both vocabulary sizes are the same:
# the number of known words plus one.
vocab_inp_size = vocab_tar_size = len(lang_tokenizer.word_index) + 1

#input_tensor_train -->X
#target_tensor_train -->Y
# Shuffled (X, Y) batches; drop_remainder=True discards the last incomplete batch.
dataset = (
    tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

In [None]:
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 27]), TensorShape([64, 82]))

## Encoder and decoder

In [None]:
class Encoder(tf.keras.Model):
  """Encoder made of an embedding layer followed by a GRU."""

  def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
    """Create the layers.

    Args:
      vocab_size: Size of the embedding table.
      embedding_dim: Width of the word embeddings.
      enc_units: Number of GRU units.
      batch_sz: Batch size used by initialize_hidden_state().
    """
    super().__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    # return_sequences / return_state: the GRU yields its per-step outputs and its final state.
    self.gru = tf.keras.layers.GRU(
        self.enc_units,
        return_sequences=True,
        return_state=True,
        recurrent_initializer='glorot_uniform')

  def call(self, x, hidden):
    """Embed the ids in `x` and run the GRU starting from `hidden`.

    Returns:
      (output, state): the GRU outputs for every step and the final GRU state.
    """
    embedded = self.embedding(x)
    step_outputs, final_state = self.gru(embedded, initial_state=hidden)
    return step_outputs, final_state

  def initialize_hidden_state(self):
    """Return an all-zero initial state of shape (batch_sz, enc_units)."""
    state_shape = (self.batch_sz, self.enc_units)
    return tf.zeros(state_shape)

In [None]:
# Build the encoder that is trained and checkpointed further below.
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

# sample input: run one example batch through the encoder to check the output shapes
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print('Encoder output shape: (batch size, sequence length, units)', sample_output.shape)
print('Encoder Hidden state shape: (batch size, units)', sample_hidden.shape)

Encoder output shape: (batch size, sequence length, units) (64, 27, 1024)
Encoder Hidden state shape: (batch size, units) (64, 1024)




```
With attention, the decoder also uses the input context.
It does not weight the complete input context equally, though:
the most recent GRU hidden state is used to select the parts
of the input context that the decoder should pay attention to.
```



In [None]:
class BahdanauAttention(tf.keras.layers.Layer):
  """Additive attention: score = V(tanh(W1(query) + W2(values)))."""

  def __init__(self, units):
    """Create the three dense layers; `units` is the width of W1 and W2."""
    super().__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, query, values):
    """Compute the context vector and the attention weights.

    Args:
      query: Hidden state, shape (batch_size, hidden size).
      values: Sequence to attend over, shape (batch_size, max_len, hidden size).

    Returns:
      (context_vector, attention_weights) with shapes (batch_size, hidden_size)
      and (batch_size, max_length, 1).
    """
    # (batch_size, 1, hidden size): the added time axis lets the query
    # broadcast against every time step of `values`.
    query_with_time_axis = tf.expand_dims(query, 1)

    # (batch_size, max_length, units) before self.V, (batch_size, max_length, 1) after it
    projected = self.W1(query_with_time_axis) + self.W2(values)
    score = self.V(tf.nn.tanh(projected))

    # normalise the scores over the time axis
    attention_weights = tf.nn.softmax(score, axis=1)

    # weighted sum of the values over the time axis -> (batch_size, hidden_size)
    context_vector = tf.reduce_sum(attention_weights * values, axis=1)

    return context_vector, attention_weights

In [None]:
# Shape check only: a throw-away attention layer with 10 units, applied to the
# sample encoder state (query) and the sample encoder outputs (values).
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print("Attention result shape: (batch size, units)", attention_result.shape)
print("Attention weights shape: (batch_size, sequence_length, 1)", attention_weights.shape)

Attention result shape: (batch size, units) (64, 1024)
Attention weights shape: (batch_size, sequence_length, 1) (64, 27, 1)


In [None]:
class Decoder(tf.keras.Model):
  """Decoder with embedding, Bahdanau attention, a GRU and a dense output layer."""

  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
    """Create the layers.

    Args:
      vocab_size: Size of the embedding table and of the output layer.
      embedding_dim: Width of the word embeddings.
      dec_units: Number of GRU units; also the width of the attention layer.
      batch_sz: Stored as `self.batch_sz`; not read by the methods of this class.
    """
    super().__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(
        self.dec_units,
        return_sequences=True,
        return_state=True,
        recurrent_initializer='glorot_uniform')
    self.fc = tf.keras.layers.Dense(vocab_size)

    # used for attention
    self.attention = BahdanauAttention(self.dec_units)

  def call(self, x, hidden, enc_output):
    """Run one decoding step.

    Args:
      x: Ids of the current input tokens, one per batch row.
      hidden: State used as the attention query.
      enc_output: Encoder outputs, shape (batch_size, max_length, hidden_size).

    Returns:
      (x, state, attention_weights): output of the dense layer with shape
      (batch_size, vocab), the new GRU state, and the attention weights.
    """
    context_vector, attention_weights = self.attention(hidden, enc_output)

    # (batch_size, 1, embedding_dim)
    embedded = self.embedding(x)

    # (batch_size, 1, embedding_dim + hidden_size)
    gru_input = tf.concat([tf.expand_dims(context_vector, 1), embedded], axis=-1)

    # NOTE(review): `hidden` only feeds the attention query; the GRU is called
    # without initial_state. Confirm this is intended before changing it,
    # since existing checkpoints were trained with this behaviour.
    output, state = self.gru(gru_input)

    # (batch_size * 1, hidden_size)
    flat_output = tf.reshape(output, (-1, output.shape[2]))

    # (batch_size, vocab)
    logits = self.fc(flat_output)

    return logits, state, attention_weights

In [None]:
# Build the decoder that is trained and checkpointed further below.
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

# Shape check with a dummy (BATCH_SIZE, 1) decoder input.
# NOTE(review): tf.random.uniform is called without a dtype here, so the dummy input
# is presumably floating point rather than integer token ids - fine for a shape
# check only; confirm.
sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      sample_hidden, sample_output)

print('Decoder output shape: (batch_size, vocab size)', sample_decoder_output.shape)

Decoder output shape: (batch_size, vocab size) (64, 13581)


## Optimizer and loss function




In [None]:
optimizer = tf.keras.optimizers.Adam()
# reduction='none' keeps one loss value per example so that padding can be masked out below.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')


def loss_function(real, pred):
  """Cross-entropy of one decoding step with padded targets zeroed out.

  Args:
    real: Target ids of the step; id 0 marks padding.
    pred: Logits predicted for the step.

  Returns:
    The mean over all entries of the masked per-example loss. Masked entries
    count as zeros in that mean; they are not excluded from the denominator.
  """
  per_example = loss_object(real, pred)
  not_padding = tf.cast(tf.math.not_equal(real, 0), dtype=per_example.dtype)
  return tf.reduce_mean(per_example * not_padding)

## Checkpoints

In [None]:
mkdir training_checkpoints

In [None]:
ls

[0m[01;34msample_data[0m/
[01;34mSemEval2020-Task4-Commonsense-Validation-and-Explanation[0m/
[01;34mtraining_checkpoints[0m/


In [None]:
# Directory created by the `mkdir training_checkpoints` cell above (absolute Colab path).
checkpoint_dir = '/content/training_checkpoints/' # relative alternative: ./
# Prefix handed to checkpoint.save() in the training loop.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
# The checkpoint tracks the optimizer together with both models.
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

## Training


In [None]:
@tf.function
def train_step(inp, targ, enc_hidden):
  """Run one optimisation step on a batch, using teacher forcing.

  Args:
    inp: Batch of padded input ids.
    targ: Batch of padded target ids; column 0 is the start token.
    enc_hidden: Initial hidden state for the encoder.

  Returns:
    The summed step losses divided by the target length.
  """
  loss = 0

  with tf.GradientTape() as tape:
    enc_output, enc_hidden = encoder(inp, enc_hidden)

    dec_hidden = enc_hidden

    # One <start> id per row of the batch actually passed in. Sizing it from
    # `targ` (not from the global BATCH_SIZE) keeps the step valid for any batch size.
    start_id = lang_tokenizer.word_index['<start>']
    dec_input = tf.fill([tf.shape(targ)[0], 1], start_id)

    # Teacher forcing - feeding the target as the next input
    for t in range(1, targ.shape[1]):
      # passing enc_output to the decoder
      predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

      loss += loss_function(targ[:, t], predictions)

      # using teacher forcing: the true token becomes the next decoder input
      dec_input = tf.expand_dims(targ[:, t], 1)

  batch_loss = (loss / int(targ.shape[1]))

  variables = encoder.trainable_variables + decoder.trainable_variables

  # gradients are taken of the summed loss, not of the length-normalised batch_loss
  gradients = tape.gradient(loss, variables)

  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

In [None]:
EPOCHS = 55

for epoch in range(EPOCHS):
  start = time.time()

  # every epoch starts from a fresh all-zero encoder state
  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0

  for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss

    if batch % 100 == 0:
      print(f'Epoch {epoch+1} Batch {batch} Loss {batch_loss.numpy():.4f}')
  # saving (checkpoint) the model every 2 epochs, and always after the last epoch:
  # with an odd EPOCHS the final weights would otherwise never be written, and
  # restoring the "latest" checkpoint would bring back the previous epoch.
  is_last_epoch = (epoch + 1 == EPOCHS)
  if (epoch + 1) % 2 == 0 or is_last_epoch:
    checkpoint.save(file_prefix=checkpoint_prefix)

  print(f'Epoch {epoch+1} Loss {total_loss/steps_per_epoch:.4f}')
  print(f'Time taken for 1 epoch {time.time()-start:.2f} sec\n')

Epoch 1 Batch 0 Loss 1.0952
Epoch 1 Batch 100 Loss 0.6636
Epoch 1 Batch 200 Loss 0.6229
Epoch 1 Batch 300 Loss 0.5761
Epoch 1 Batch 400 Loss 0.6568
Epoch 1 Loss 0.6434
Time taken for 1 epoch 350.14 sec

Epoch 2 Batch 0 Loss 0.5048
Epoch 2 Batch 100 Loss 0.5712
Epoch 2 Batch 200 Loss 0.5939
Epoch 2 Batch 300 Loss 0.5628
Epoch 2 Batch 400 Loss 0.5379
Epoch 2 Loss 0.5425
Time taken for 1 epoch 235.66 sec

Epoch 3 Batch 0 Loss 0.5819
Epoch 3 Batch 100 Loss 0.5023
Epoch 3 Batch 200 Loss 0.4504
Epoch 3 Batch 300 Loss 0.4736
Epoch 3 Batch 400 Loss 0.5203
Epoch 3 Loss 0.4978
Time taken for 1 epoch 234.26 sec

Epoch 4 Batch 0 Loss 0.4812
Epoch 4 Batch 100 Loss 0.4271
Epoch 4 Batch 200 Loss 0.4186
Epoch 4 Batch 300 Loss 0.4456
Epoch 4 Batch 400 Loss 0.4147
Epoch 4 Loss 0.4466
Time taken for 1 epoch 235.36 sec

Epoch 5 Batch 0 Loss 0.4006
Epoch 5 Batch 100 Loss 0.3907
Epoch 5 Batch 200 Loss 0.3826
Epoch 5 Batch 300 Loss 0.3849
Epoch 5 Batch 400 Loss 0.3661
Epoch 5 Loss 0.3959
Time taken for 1 epo

## Generation

In [None]:
def evaluate(sentence):
  """Greedily decode an explanation for one input sentence.

  Args:
    sentence: The raw input sentence; it is passed through preprocess_sentence.

  Returns:
    (result, sentence, attention_plot): the generated words joined by spaces
    (including '<end>' when it was produced), the preprocessed input sentence,
    and a (max_length_targ, max_length_inp) array with the attention weights
    of every decoding step.
  """
  attention_plot = np.zeros((max_length_targ, max_length_inp))

  sentence = preprocess_sentence(sentence)

  # Words unknown to the tokenizer are skipped instead of raising KeyError.
  word_index = lang_tokenizer.word_index
  inputs = [word_index[w] for w in sentence.split(' ') if w in word_index] #inp_lang
  inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                         maxlen=max_length_inp,
                                                         padding='post')
  inputs = tf.convert_to_tensor(inputs)

  result = ''

  hidden = [tf.zeros((1, units))]
  enc_out, enc_hidden = encoder(inputs, hidden)

  dec_hidden = enc_hidden
  dec_input = tf.expand_dims([lang_tokenizer.word_index['<start>']], 0) #-----targ_lang

  for t in range(max_length_targ):
    predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                         dec_hidden,
                                                         enc_out)

    # storing the attention weights to plot later on
    attention_weights = tf.reshape(attention_weights, (-1, ))
    attention_plot[t] = attention_weights.numpy()

    predicted_id = tf.argmax(predictions[0]).numpy()

    # The output layer also has a slot for id 0, which is used for padding and
    # has no entry in index_word; stop decoding instead of raising KeyError.
    predicted_word = lang_tokenizer.index_word.get(predicted_id) #---targ_lang
    if predicted_word is None:
      return result, sentence, attention_plot

    result += predicted_word + ' '

    if predicted_word == '<end>': #----targ_lang
      return result, sentence, attention_plot

    # the predicted ID is fed back into the model
    dec_input = tf.expand_dims([predicted_id], 0)

  return result, sentence, attention_plot

In [None]:
# function for plotting the attention weights
def plot_attention(attention, sentence, predicted_sentence):
  """Show the attention matrix with input words on x and predicted words on y.

  Args:
    attention: 2-D array of attention weights.
    sentence: List of input words, used as x tick labels.
    predicted_sentence: List of predicted words, used as y tick labels.
  """
  figure = plt.figure(figsize=(10, 10))
  axes = figure.add_subplot(1, 1, 1)
  axes.matshow(attention, cmap='viridis')

  label_font = {'fontsize': 14}

  # each label list starts with an empty string, followed by the words
  axes.set_xticklabels([''] + sentence, fontdict=label_font, rotation=90)
  axes.set_yticklabels([''] + predicted_sentence, fontdict=label_font)

  # one major tick per unit on both axes
  axes.xaxis.set_major_locator(ticker.MultipleLocator(1))
  axes.yaxis.set_major_locator(ticker.MultipleLocator(1))

  plt.show()

In [None]:
def translate(sentence):
  """Generate an explanation for `sentence`, print it and return it.

  Args:
    sentence: The raw input sentence.

  Returns:
    The predicted text as returned by evaluate().
  """
  prediction, processed, _ = evaluate(sentence)

  print('Input:', processed)
  print('Predicted translation:', prediction)
  return prediction
  # Attention plotting is disabled. To re-enable it, keep the third value
  # returned by evaluate() as `attention_plot` and, before returning, run:
  #attention_plot = attention_plot[:len(prediction.split(' ')),:len(processed.split(' '))]
  #plot_attention(attention_plot, processed.split(' '), prediction.split(' '))

## Restore the latest checkpoint and test

In [None]:
# restoring the latest checkpoint in checkpoint_dir
latest_checkpoint_path = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint_path is None:
  # Without this check restore(None) would do nothing and generation would
  # run with whatever weights happen to be in memory.
  raise FileNotFoundError(f'No checkpoint found in {checkpoint_dir}')
checkpoint.restore(latest_checkpoint_path)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f99beb913d0>

In [None]:
#----generation
# translate() is applied to every test statement; it also prints each
# input / prediction pair.
output = [input_test['FalseSent'].map(translate)]

# write the predictions to a CSV file
# NOTE(review): `output` is a list holding one Series, so the frame presumably has a
# single row with one column per test sentence - confirm this layout is intended.
df = pd.DataFrame(output)
df.to_csv('seq2seq55.csv')

In [None]:
translate('sugar is used to make coffee sour')



Input: <start> sugar is used to make coffee sour <end>
Predicted translation: sugar is sweet . <end> 


"\nfor i in range(len(input_test['FalseSent'])):\n    translate(input_test['FalseSent'][i])"

In [None]:
translate('The desert has sand that you can drink.')

Input: <start> the desert has sand that you can drink . <end>
Predicted translation: sand is not a solid and cannot be eaten . <end> 


In [None]:
#one 17
translate(u'The window looks out her') 

Input: <start> the window looks out her <end>
Predicted translation: windows are not alive <end> 


'windows are not alive <end> '