In [None]:
import tensorflow as tf
import tensorflow.contrib.eager as tfe
from collections import OrderedDict
from nltk.tokenize import word_tokenize, RegexpTokenizer
import time

In [None]:
# Switch TensorFlow to eager execution for the rest of the notebook.
tf.enable_eager_execution()
# Regex tokenizer that keeps runs of word characters only (the pattern is r'\w+').
tokenizer = RegexpTokenizer(r'\w+')

In [None]:
#files required for NLI Task
# untokenized_* are the raw inputs and tokenized_* the outputs written by
# convert_to_tokens; the *_dev names are presumably the development split.
untokenized_premises = './data/part1/premises.txt'
untokenized_hypothesis = './data/part1/hypothesis.txt'
tokenized_premises = './data/part1/tokenized_premises.txt'
tokenized_hypothesis = './data/part1/tokenized_hypothesis.txt'
untokenized_premises_dev = './data/part1/premises_dev.txt'
untokenized_hypothesis_dev = './data/part1/hypothesis_dev.txt'
tokenized_premises_dev = './data/part1/tokenized_premises_dev.txt'
tokenized_hypothesis_dev = './data/part1/tokenized_hypothesis_dev.txt'

In [None]:
#files required for CP task
# Raw source/target files and the tokenized versions written by convert_to_tokens_CP.
untokenized_source_file_CP = './data/part2/en.txt'
untokenized_target_file_CP = './data/part2/pt.txt'
tokenized_source_file_CP = './data/part2/source.txt'
tokenized_target_file_CP = './data/part2/target.txt'

In [None]:
#files required for NMT task
# Raw English/German files and the tokenized versions written by convert_to_tokens.
untokenized_source_file_NMT = './data/part3/english.txt'
untokenized_target_file_NMT = './data/part3/german.txt'
tokenized_source_file_NMT = './data/part3/source.txt'
tokenized_target_file_NMT = './data/part3/target.txt'

In [None]:
#Tokenizing the input files
def convert_to_tokens(input_file, output_file):
    """Tokenize ``input_file`` line by line and write the result to ``output_file``.

    Every line is first reduced to its word-character runs by the module-level
    ``tokenizer``, then stripped, lower-cased and re-tokenized with
    ``word_tokenize``. The tokens are written space-separated, one sentence per
    line. The zero-based line index is printed every 100000 lines as progress.
    """
    with open(input_file) as reader, open(output_file, 'w') as writer:
        line_no = 0
        for raw_line in reader:
            word_only = ' '.join(tokenizer.tokenize(raw_line))
            tokens = word_tokenize(word_only.strip().lower())
            writer.write(' '.join(tokens) + '\n')
            if line_no % 100000 == 0:
                print(line_no)
            line_no += 1

In [None]:
#separating the punctuations - used in the CP task
import string
def process_string(s):
    """Return ``s`` with each punctuation character isolated as its own token.

    Every character found in ``string.punctuation`` is surrounded by spaces,
    then all whitespace runs are collapsed to single spaces and the ends are
    trimmed.
    """
    punctuation = set(string.punctuation)
    pieces = [' ' + ch + ' ' if ch in punctuation else ch for ch in s]
    return ' '.join(''.join(pieces).split())

In [None]:
def convert_to_tokens_CP(input_file, output_file):
    """Write ``input_file`` to ``output_file`` with punctuation split into tokens.

    Each line is passed through ``process_string`` and written back followed by
    a newline. The zero-based line index is printed every 100000 lines.
    """
    with open(input_file) as reader, open(output_file, 'w') as writer:
        line_no = 0
        for raw_line in reader:
            writer.write(process_string(raw_line) + '\n')
            if line_no % 100000 == 0:
                print(line_no)
            line_no += 1

In [None]:
#NLI Task
# Tokenize premises and hypotheses, for both the training files and the *_dev files.
convert_to_tokens(untokenized_premises, tokenized_premises)
convert_to_tokens(untokenized_premises_dev, tokenized_premises_dev)
convert_to_tokens(untokenized_hypothesis, tokenized_hypothesis)
convert_to_tokens(untokenized_hypothesis_dev, tokenized_hypothesis_dev)

In [None]:
#CP Task
# Uses the punctuation-splitting tokenizer (convert_to_tokens_CP), which keeps
# punctuation as separate tokens and does not lower-case.
convert_to_tokens_CP(untokenized_source_file_CP, tokenized_source_file_CP)
convert_to_tokens_CP(untokenized_target_file_CP, tokenized_target_file_CP)

In [None]:
#NMT
# Both the source and the target side go through the same convert_to_tokens pipeline.
convert_to_tokens(untokenized_source_file_NMT, tokenized_source_file_NMT)
convert_to_tokens(untokenized_target_file_NMT, tokenized_target_file_NMT)

In [None]:
#Count words and return words in a sorted order
def count_words(counter, sentences_file, prefix_len=2):
    """Accumulate word frequencies from ``sentences_file`` into ``counter``.

    Args:
        counter: dict mapping word -> count; it is updated in place.
        sentences_file: path to a text file holding one whitespace-tokenized
            sentence per line.
        prefix_len: number of leading characters dropped from every line before
            it is split. Defaults to 2, which is the original behaviour.
            NOTE(review): 2 matches the "<label> " prefix that combine_files_NLI
            strips from the NLI files; for files without such a prefix pass 0,
            otherwise the first two characters of each sentence are lost.

    Returns:
        The same ``counter`` object that was passed in.
    """
    # Context manager so the file handle is closed deterministically
    # (the original iterated over a bare open() and never closed it).
    with open(sentences_file) as sentences:
        for sentence in sentences:
            for word in sentence[prefix_len:].split():
                counter[word] = counter.get(word, 0) + 1
    return counter

In [None]:
#Build the Vocab
def build_vocab(word_counts, max_size=30000):
    """Build an ordered word -> index vocabulary.

    The special tokens PAD, EOS, UNK and GO take indexes 0-3. The words from
    ``word_counts`` follow in the order given, with contiguous indexes starting
    at 4, until the vocabulary holds ``max_size`` entries.

    Args:
        word_counts: iterable of ``(word, frequency)`` pairs, expected to be
            sorted by descending frequency by the caller.
        max_size: maximum number of entries, special tokens included.

    Returns:
        An ``OrderedDict`` whose iteration order equals the index order.
    """
    vocab = OrderedDict()
    for token in ('PAD', 'EOS', 'UNK', 'GO'):
        vocab[token] = len(vocab)
    for word, _freq in word_counts:
        if len(vocab) >= max_size:
            break
        # A corpus word that equals a special token must not overwrite its reserved id.
        if word in vocab:
            continue
        # len(vocab) is the next free index (the original used len(vocab) + 1 and skipped 4).
        vocab[word] = len(vocab)
    return vocab

In [None]:
#English vocabulary
# Word counts are pooled over the NLI premises/hypotheses, both CP files and the
# NMT source file, so one English vocab is built from all of them.
# `counter` is mutated in place, so `english_counter` is the same dict object.
# NOTE(review): count_words drops the first two characters of every line; that is
# also applied here to the CP and NMT files - confirm those lines really carry a
# two-character prefix, otherwise the first word of each sentence is miscounted.
counter = dict()
counter = count_words(counter, tokenized_premises)
counter = count_words(counter, tokenized_hypothesis)
counter = count_words(counter, tokenized_source_file_CP)
counter = count_words(counter, tokenized_target_file_CP)
english_counter = count_words(counter, tokenized_source_file_NMT)
# Most frequent words first, which is the order build_vocab consumes them in.
english_word_counts = sorted(english_counter.items(), key=lambda pair:pair[1], reverse=True)
english_vocab = build_vocab(english_word_counts)

In [None]:
# Show the ten most frequent (word, count) pairs as a sanity check.
english_word_counts[:10]

In [None]:
# German vocabulary: counted from the NMT target file only.
# NOTE(review): count_words drops the first two characters of every line - confirm
# the NMT target lines carry a two-character prefix.
german_counter = dict()
german_counter = count_words(german_counter, tokenized_target_file_NMT)
german_word_counts = sorted(german_counter.items(), key=lambda pair:pair[1], reverse=True)
german_vocab = build_vocab(german_word_counts)

In [None]:
# Show the ten most frequent (word, count) pairs as a sanity check.
german_word_counts[:10]

In [None]:
# Vocabulary files: written by write_vocab_file, read back by index_table_from_file.
english_vocab_file = './data/english_vocab.txt'
german_vocab_file = './data/german_vocab.txt'

In [None]:
def write_vocab_file(vocab_file, vocab):
    """Write the entries of ``vocab`` to ``vocab_file``, one per line.

    Only what iterating ``vocab`` yields is written (the keys, for a dict), in
    iteration order; any index values stored in the mapping are not written.
    """
    with open(vocab_file, 'w') as out:
        out.writelines(f'{entry}\n' for entry in vocab)

In [None]:
# Persist both vocabularies so the lookup tables can be built from the files.
write_vocab_file(english_vocab_file, english_vocab)
write_vocab_file(german_vocab_file, german_vocab)

In [None]:
#embedder used in this project
class Embedding(tf.keras.Model):
    """Trainable lookup table that maps word indexes to ``d``-dimensional vectors."""

    def __init__(self, V, d):
        """Create a ``[V, d]`` table initialised uniformly between -1.0 and 1.0."""
        super().__init__()
        initial_table = tf.random_uniform(shape=[V, d], minval=-1.0, maxval=1.0)
        self.W = tfe.Variable(initial_table)

    def call(self, word_indexes):
        """Return the float32 embedding rows selected by ``word_indexes``."""
        looked_up = tf.nn.embedding_lookup(self.W, word_indexes)
        return tf.cast(looked_up, tf.float32)

In [None]:
#load the english and german vocabs from the file
from tensorflow.python.ops import lookup_ops
# The line number in a vocab file is the word index; build_vocab puts the special
# tokens first, so PAD=0, EOS=1, UNK=2, GO=3.
# Out-of-vocabulary words must map to UNK. The original default_value=0 mapped them
# to PAD, which made unknown words indistinguishable from the zero padding that
# padded_batch adds.
UNK_ID = 2
english_vocab_table = lookup_ops.index_table_from_file(english_vocab_file, default_value=UNK_ID)
german_vocab_table = lookup_ops.index_table_from_file(german_vocab_file, default_value=UNK_ID)

In [None]:
#all the file names
# Per task: the combined training file, its shuffled copy, a small "check" subset
# and a validation file (the last three are written by shuffle_sentence_file).
sentences_file_NLI = './data/part1/training.txt'
shuffled_sentences_file_NLI = './data/part1/shuffled_training.txt'
check_sentences_file_NLI = './data/part1/check_training.txt'
valid_file_NLI = './data/part1/valid.txt'
sentences_file_CP = './data/part2/training.txt'
valid_file_CP = './data/part2/valid.txt'
shuffled_sentences_file_CP = './data/part2/shuffled_training.txt'
check_sentences_file_CP = './data/part2/check_training.txt'
sentences_file_NMT = './data/part3/training.txt'
valid_file_NMT = './data/part3/valid.txt'
shuffled_sentences_file_NMT = './data/part3/shuffled_training.txt'
check_sentences_file_NMT = './data/part3/check_training.txt'

In [None]:
#combine source and target files for the NLI task
def combine_files_NLI(sentences_file, premises, hypothesis):
    """Merge a premise file and a hypothesis file into one comma-separated file.

    Output line ``i`` is ``label,premise,hypothesis`` where the label is the
    first character of premise line ``i`` and both sentences are the respective
    lines with their first two characters removed and whitespace trimmed.

    Raises:
        IndexError: if the hypothesis file has fewer lines than the premise file.
    """
    with open(premises) as premise_fh, open(hypothesis) as hypothesis_fh, \
            open(sentences_file, 'w') as out:
        premise_lines = premise_fh.readlines()
        hypothesis_lines = hypothesis_fh.readlines()
        for idx, premise_line in enumerate(premise_lines):
            hypothesis_line = hypothesis_lines[idx]
            fields = [
                premise_line[0].strip(),       # label
                premise_line[2:].strip(),      # premise text
                hypothesis_line[2:].strip(),   # hypothesis text
            ]
            out.write(','.join(fields) + '\n')
# Training pairs go to sentences_file_NLI, the *_dev pairs to valid_file_NLI.
combine_files_NLI(sentences_file_NLI, tokenized_premises, tokenized_hypothesis)
combine_files_NLI(valid_file_NLI, tokenized_premises_dev, tokenized_hypothesis_dev)

In [None]:
#combine the source and target files for the CP and NMT tasks
def combine_files_CP_NMT(sentences_file, source_file, target_file):
    """Merge parallel source/target files into one tab-separated file.

    Output lines are ``source<TAB>target`` with both sides whitespace-trimmed.
    A pair is skipped when either side is empty after trimming.

    Raises:
        IndexError: if the target file has fewer lines than the source file.
    """
    with open(source_file) as source_fh, open(target_file) as target_fh, \
            open(sentences_file, 'w') as out:
        source_lines = source_fh.readlines()
        target_lines = target_fh.readlines()
        for idx, source_line in enumerate(source_lines):
            source_text = source_line.strip()
            target_text = target_lines[idx].strip()
            if source_text and target_text:
                out.write(source_text + '\t' + target_text + '\n')
# Build the tab-separated training files for the CP task and for the NMT task.
combine_files_CP_NMT(sentences_file_CP, tokenized_source_file_CP, tokenized_target_file_CP)
combine_files_CP_NMT(sentences_file_NMT, tokenized_source_file_NMT, tokenized_target_file_NMT)

In [None]:
#creating the check, and valid dataset - small datasets for demo purposes/fast training
training_samples = 256
valid_samples = 128
import os
def shuffle_sentence_file(file_name, shuffled_file, check_file, valid_file):
    """Write a shuffled copy of ``file_name`` plus small check/validation subsets.

    Pure-Python replacement for the original ``gshuf`` / ``head`` / ``tail``
    shell pipeline: it needs no external tools, does not pass paths through a
    shell, and raises on I/O errors instead of failing silently.

    Args:
        file_name: input file with one example per line (left unchanged).
        shuffled_file: receives all lines of ``file_name`` in random order.
        check_file: receives the first ``training_samples`` lines of ``file_name``.
        valid_file: receives the last ``valid_samples`` lines of ``file_name``.

    NOTE(review): as in the original, the check and validation subsets are cut
    from the *unshuffled* input, so the validation lines are also part of the
    training file - confirm that this is intended.
    """
    import random  # local import keeps this cell self-contained

    with open(file_name) as source:
        lines = source.readlines()

    # Make sure the final line ends with a newline so shuffling cannot glue two
    # examples together.
    shuffled = [line if line.endswith('\n') else line + '\n' for line in lines]
    random.shuffle(shuffled)
    with open(shuffled_file, 'w') as out:
        out.writelines(shuffled)

    # Equivalent of `head -<training_samples>`.
    with open(check_file, 'w') as out:
        out.writelines(lines[:training_samples])

    # Equivalent of `tail -<valid_samples>`; lines[-0:] would be the whole file,
    # so zero is handled explicitly.
    with open(valid_file, 'w') as out:
        out.writelines(lines[-valid_samples:] if valid_samples > 0 else [])
# NOTE(review): valid_file_NLI was already written from the *_dev files by
# combine_files_NLI; the first call below overwrites it with the tail of the NLI
# training file - confirm that this is intended.
shuffle_sentence_file(sentences_file_NLI, shuffled_sentences_file_NLI, check_sentences_file_NLI, valid_file_NLI)
shuffle_sentence_file(sentences_file_CP, shuffled_sentences_file_CP, check_sentences_file_CP, valid_file_CP)
shuffle_sentence_file(sentences_file_NMT, shuffled_sentences_file_NMT, check_sentences_file_NMT, valid_file_NMT)

In [None]:
#create the datasets

#(label, sent1, len1, sent2, len2)
def create_dataset_NLI(sentences_file, vocab_table, batch_size):
    """Build the padded, batched NLI dataset from a ``label,premise,hypothesis`` file.

    Each element is ``(label, premise_ids, premise_len, hypothesis_ids,
    hypothesis_len)``. The label is parsed as a number and cast to int64; both
    sentences are split on spaces and mapped to ids through ``vocab_table``.
    The id sequences are padded per batch.
    """
    def parse_line(sentence):
        fields = tf.string_split([sentence], ',').values
        label = tf.cast(tf.string_to_number(fields[0]), tf.int64)
        premise_ids = vocab_table.lookup(tf.string_split([fields[1]]).values)
        hypothesis_ids = vocab_table.lookup(tf.string_split([fields[2]]).values)
        return (label,
                premise_ids, tf.size(premise_ids),
                hypothesis_ids, tf.size(hypothesis_ids))

    examples = tf.data.TextLineDataset(sentences_file).map(parse_line)
    return examples.padded_batch(batch_size=batch_size,
                                 padded_shapes=([], [None], [], [None], []))

#(sent1, len1, sent2, sent2_shifted, len2)
def create_dataset_CP(sentences_file, vocab_table, batch_size):
    """Build the padded, batched CP dataset from a ``source<TAB>target`` file.

    Lines whose target side has more than 100 tokens are dropped. Each element
    is ``(source_ids, source_len, target_ids, shifted_target_ids, target_len)``;
    all id sequences are int32 and padded per batch. ``shifted_target_ids`` is
    the target without its first token and with ``'EOS'`` appended, i.e. the
    next-token label for every target position.

    Only ``vocab_table`` is used for lookups; the original filter step reached
    for the global ``english_vocab_table`` instead of the parameter.
    """
    def split_pair(sentence):
        # Returns the whitespace-split tokens of the source and the target side.
        fields = tf.string_split([sentence], '\t').values
        return tf.string_split([fields[0]]).values, tf.string_split([fields[1]]).values

    def target_is_short_enough(sentence):
        _, target_words = split_pair(sentence)
        return tf.size(target_words) <= 100

    def parse_line(sentence):
        source_words, target_words = split_pair(sentence)
        source_ids = tf.cast(vocab_table.lookup(source_words), dtype=tf.int32)
        target_ids = tf.cast(vocab_table.lookup(target_words), dtype=tf.int32)
        shifted_words = tf.concat((target_words[1:], ['EOS']), axis=0)
        shifted_ids = tf.cast(vocab_table.lookup(shifted_words), dtype=tf.int32)
        return (source_ids, tf.size(source_ids),
                target_ids, shifted_ids, tf.size(target_ids))

    dataset = tf.data.TextLineDataset(sentences_file)
    dataset = dataset.filter(target_is_short_enough)
    dataset = dataset.map(parse_line)
    dataset = dataset.padded_batch(batch_size=batch_size,
                                   padded_shapes=([None], [], [None], [None], []))
    return dataset

#(sent1, len1, sent2, sent2_shifted, len2)
def create_dataset_NMT(sentences_file, english_vocab_table, german_vocab_table, batch_size):
    """Build the padded, batched NMT dataset from a ``source<TAB>target`` file.

    Lines whose target side has more than 100 tokens are dropped. Each element
    is ``(source_ids, source_len, target_ids, shifted_target_ids, target_len)``.
    Source tokens are looked up in ``english_vocab_table`` and target tokens in
    ``german_vocab_table``. ``shifted_target_ids`` is the target without its
    first token and with ``'EOS'`` appended.
    """
    def split_pair(sentence):
        fields = tf.string_split([sentence], '\t').values
        return tf.string_split([fields[0]]).values, tf.string_split([fields[1]]).values

    def target_is_short_enough(sentence):
        # A lookup returns one id per token, so the token count is the same size
        # the original measured on the looked-up ids.
        _, target_words = split_pair(sentence)
        return tf.size(target_words) <= 100

    def parse_line(sentence):
        source_words, target_words = split_pair(sentence)
        source_ids = english_vocab_table.lookup(source_words)
        target_ids = german_vocab_table.lookup(target_words)
        shifted_ids = german_vocab_table.lookup(
            tf.concat((target_words[1:], ['EOS']), axis=0))
        return (source_ids, tf.size(source_ids),
                target_ids, shifted_ids, tf.size(target_ids))

    examples = (tf.data.TextLineDataset(sentences_file)
                .filter(target_is_short_enough)
                .map(parse_line))
    return examples.padded_batch(batch_size=batch_size,
                                 padded_shapes=([None], [], [None], [None], []))

In [None]:
# Batch size for the NLI datasets; the CP and NMT dataset cells pass a literal 32 instead.
batch_size = 32

In [None]:
#dataset for the nli part
dataset_NLI = create_dataset_NLI(sentences_file_NLI, english_vocab_table, batch_size)
valid_dataset_NLI = create_dataset_NLI(valid_file_NLI, english_vocab_table, batch_size)
# create_dataset_NLI has already batched the data, so these shuffles reorder whole
# batches (buffer of 10000 batches), not individual examples.
dataset_NLI = dataset_NLI.shuffle(buffer_size=10000)
valid_dataset_NLI = valid_dataset_NLI.shuffle(buffer_size=10000)

In [None]:
#static rnn
class StaticRNN(tf.keras.Model):
    """Single recurrent layer unrolled with ``tf.nn.static_rnn``."""

    def __init__(self, h, cell):
        """Create the cell with ``h`` units.

        ``cell`` selects the type: ``'lstm'`` -> BasicLSTMCell, ``'gru'`` ->
        GRUCell, anything else -> BasicRNNCell.
        """
        super().__init__()
        cell_types = {
            'lstm': tf.nn.rnn_cell.BasicLSTMCell,
            'gru': tf.nn.rnn_cell.GRUCell,
        }
        cell_cls = cell_types.get(cell, tf.nn.rnn_cell.BasicRNNCell)
        self.cell = cell_cls(num_units=h)

    def call(self, state, word_vectors, num_words):
        """Run the cell over ``word_vectors`` starting from ``state``.

        ``word_vectors`` is unstacked along axis 1 into the per-step inputs and
        ``num_words`` is forwarded as ``sequence_length``.

        Returns:
            ``(outputs, final_state)`` exactly as produced by ``tf.nn.static_rnn``.
        """
        inputs_per_step = tf.unstack(word_vectors, axis=1)
        outputs, final_state = tf.nn.static_rnn(
            cell=self.cell,
            inputs=inputs_per_step,
            initial_state=state,
            sequence_length=num_words,
            dtype=tf.float32,
        )
        return outputs, final_state

In [None]:
#same encoder has been used for all the three parts
class Encoder(tf.keras.Model):
    """Embedding layer followed by a StaticRNN."""

    def __init__(self, V, d, h, cell):
        """``V``: vocabulary size, ``d``: embedding size, ``h``: RNN units, ``cell``: cell type."""
        super().__init__()
        self.word_embedding = Embedding(V, d)
        self.rnn = StaticRNN(h, cell)

    def call(self, word_vector, word_length):
        """Encode a batch of id sequences.

        Returns:
            A tuple ``(last_outputs, final_state, word_embedding)``:
            ``last_outputs`` holds, for every batch row, the RNN output at step
            ``word_length[row] - 1``; ``final_state`` is the RNN's final state;
            ``word_embedding`` is this encoder's embedding layer.
        """
        embedded = self.word_embedding(word_vector)
        # No initial state is supplied (None is passed through to the RNN).
        step_outputs, final_state = self.rnn(None, embedded, word_length)
        batch = int(tf.size(word_length))
        # Pick each row's output at the last real (non-padded) step.
        last_outputs = [
            step_outputs[int(word_length[row]) - 1][row] for row in range(batch)
        ]
        encoded = tf.convert_to_tensor(last_outputs, dtype=tf.float32)
        return encoded, final_state, self.word_embedding

In [None]:
# Decoder for NLI - Task (A multilayer perceptron)
n_hidden_1 = 512 # 1st layer number of features
# 4 * 512: loss_fun_nli concatenates [premise, hypothesis, |p - h|, p * h] and the
# NLI encoder is created with 512 RNN units.
n_input = 2048
n_classes = 3
keep_prob = 0.7 #drop out of 0.3 in the hidden layer

# Store layers weight & bias - xavier initialisation has been used.
# Every variable gets its own name; the original created both weight matrices
# under the same name "W".
weights = {
'h1': tf.get_variable("h1_W", shape=[n_input, n_hidden_1], initializer=tf.contrib.layers.xavier_initializer()),
'out': tf.get_variable("out_W", shape=[n_hidden_1, n_classes], initializer=tf.contrib.layers.xavier_initializer())
}

biases = {
'b1': tf.get_variable("b1", shape=[n_hidden_1], initializer=tf.contrib.layers.xavier_initializer()),
'out': tf.get_variable("out", shape=[n_classes], initializer=tf.contrib.layers.xavier_initializer())
}

def multilayer_perceptron(x, weights, biases):
    """One-hidden-layer classifier head; returns unnormalised class scores.

    ``weights`` needs the keys ``'h1'`` and ``'out'``, ``biases`` the keys
    ``'b1'`` and ``'out'``. Dropout uses the module-level ``keep_prob``.

    NOTE(review): dropout is applied on every call, so it is also active when
    this is reached through compute_ppl_nli during validation.
    """
    # Hidden layer: affine transform, ReLU, then dropout.
    hidden = tf.nn.relu(tf.add(tf.matmul(x, weights['h1']), biases['b1']))
    hidden = tf.nn.dropout(hidden, keep_prob)

    # Linear output layer.
    return tf.matmul(hidden, weights['out']) + biases['out']

In [None]:
#Decoder for CP - Task, old embeddings has been used for this word vector
class Decoder_CP(tf.keras.Model):
    """RNN decoder for the CP task; it embeds its inputs with a caller-supplied layer."""

    def __init__(self, vocab_size, h, cell):
        super().__init__()
        self.rnn = StaticRNN(h, cell)
        self.output_layer = tf.keras.layers.Dense(units=vocab_size)

    def call(self, embeddings, encoder_final_state, word_vector, word_length):
        """Return per-step vocabulary logits.

        ``embeddings`` is called on ``word_vector`` to embed the decoder inputs;
        the RNN starts from ``encoder_final_state``. The per-step outputs are
        stacked along axis 1 before the dense projection.
        """
        embedded_inputs = embeddings(word_vector)
        step_outputs, _ = self.rnn(encoder_final_state, embedded_inputs, word_length)
        stacked = tf.stack(step_outputs, axis=1)
        return self.output_layer(stacked)

In [None]:
#decoder model for the NMT task - notice that a new word embedding has been created for german language
class Decoder_NMT(tf.keras.Model):
    """RNN decoder for the NMT task with its own target-side embedding table."""

    def __init__(self, vocab_size, d, h, cell):
        super().__init__()
        self.rnn = StaticRNN(h, cell)
        self.word_embedding = Embedding(vocab_size, d)
        self.output_layer = tf.keras.layers.Dense(units=vocab_size)

    def call(self, english_embeddings, encoder_final_state, word_vector, word_length):
        """Return per-step vocabulary logits.

        ``english_embeddings`` is accepted but not used: the inputs are embedded
        with this decoder's own ``word_embedding``. The RNN starts from
        ``encoder_final_state`` and its per-step outputs are stacked along
        axis 1 before the dense projection.
        """
        embedded_inputs = self.word_embedding(word_vector)
        step_outputs, _ = self.rnn(encoder_final_state, embedded_inputs, word_length)
        stacked = tf.stack(step_outputs, axis=1)
        return self.output_layer(stacked)

In [None]:
#loss function for each of the parts
def loss_fun_nli(encoder, datum):
    """Cross-entropy loss of the NLI classifier on one batch.

    ``datum`` is ``(label, premise_ids, premise_len, hypothesis_ids,
    hypothesis_len)``. Both sentences are encoded with the same ``encoder``;
    the classifier input is ``[p, h, |p - h|, p * h]`` concatenated on axis 1.

    Returns:
        The summed per-example loss divided by the total number of premise
        words in the batch (not by the batch size).
    """
    premise_vec, _, _ = encoder(datum[1], datum[2])
    hypothesis_vec, _, _ = encoder(datum[3], datum[4])
    features = tf.concat(
        [
            premise_vec,
            hypothesis_vec,
            tf.abs(tf.subtract(premise_vec, hypothesis_vec)),
            tf.multiply(premise_vec, hypothesis_vec),
        ],
        axis=1,
    )
    logits = multilayer_perceptron(features, weights, biases)
    per_example = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=datum[0], logits=logits)
    premise_word_total = int(tf.reduce_sum(datum[2]))
    return tf.reduce_sum(per_example) / premise_word_total

def loss_fun_cp(encoder, decoder, datum):
    """Masked per-token cross-entropy for the CP task on one batch.

    ``datum`` is ``(source_ids, source_len, target_ids, shifted_target_ids,
    target_len)``. The decoder reuses the encoder's embedding layer. Positions
    beyond each target length are masked out.

    Returns:
        The summed masked loss divided by the total number of target tokens.
    """
    _, final_state, shared_embeddings = encoder(datum[0], datum[1])
    target_lengths = datum[4]
    logits = decoder(shared_embeddings, final_state, datum[2], target_lengths)
    token_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=datum[3], logits=logits)
    valid_positions = tf.sequence_mask(target_lengths, dtype=tf.float32)
    masked_loss = token_loss * valid_positions
    return tf.reduce_sum(masked_loss) / int(tf.reduce_sum(target_lengths))

def loss_fun_nmt(encoder, decoder, datum):
    """Masked per-token cross-entropy for the NMT task on one batch.

    ``datum`` is ``(source_ids, source_len, target_ids, shifted_target_ids,
    target_len)``. Positions beyond each target length are masked out.

    Returns:
        The summed masked loss divided by the total number of target tokens.
    """
    _, encoder_final_state, encoder_embeddings = encoder(datum[0], datum[1])
    # Decoder_NMT.call takes (english_embeddings, encoder_final_state, word_vector,
    # word_length). The original call left out the first argument, which shifted
    # every argument by one and failed with a missing-argument TypeError.
    logits = decoder(encoder_embeddings, encoder_final_state, datum[2], datum[4])
    mask = tf.sequence_mask(datum[4], dtype=tf.float32)
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=datum[3]) * mask
    return (tf.reduce_sum(loss) / int(tf.reduce_sum(datum[4])))

In [None]:
#used for clipping gradients
def clip_gradients(grads_and_vars, clip_ratio):
    """Clip gradients by their global norm.

    Args:
        grads_and_vars: iterable of ``(gradient, variable)`` pairs.
        clip_ratio: value passed to ``tf.clip_by_global_norm`` as the clip norm.

    Returns:
        An iterator of ``(clipped_gradient, variable)`` pairs in the same order.
    """
    grads, params = zip(*grads_and_vars)
    clipped_grads, _global_norm = tf.clip_by_global_norm(grads, clip_ratio)
    return zip(clipped_grads, params)

In [None]:
#used for printing the log with the timestamp
logging = tf.logging
logging.set_verbosity(logging.INFO)
def log_msg(msg):
    """Log ``msg`` at INFO level, prefixed with the current ``time.ctime()`` stamp."""
    timestamp = time.ctime()
    logging.info(f'{timestamp}: {msg}')

In [None]:
#computing the perplexity given the models, and the dataset
def compute_ppl_nli(model, dataset):
    """Return ``exp`` of the word-weighted mean NLI loss over ``dataset``.

    Each batch loss from ``loss_fun_nli`` is already divided by that batch's
    premise word count, so it is multiplied back by the count, summed over all
    batches and divided by the total word count.

    Args:
        model: the encoder passed to ``loss_fun_nli``.
        dataset: iterable of NLI batches as produced by ``create_dataset_NLI``.
    """
    total_loss = 0.
    total_words = 0
    for batch_num, datum in enumerate(dataset):
        num_words = int(tf.reduce_sum(datum[2]))
        avg_loss = loss_fun_nli(model, datum)
        # Accumulate across batches; the original assigned with `=`, so only the
        # last batch contributed to the numerator.
        total_loss += float(avg_loss) * num_words
        total_words += num_words
        if batch_num % 50 == 0:
            log_msg(f'ppl Done batch: {batch_num}')
    loss = total_loss / total_words
    return np.exp(loss)

def compute_ppl_cp(encoder, decoder, valid_dataset):
    """Return the per-token perplexity of the CP model over ``valid_dataset``.

    Each batch loss from ``loss_fun_cp`` is a per-token mean; it is multiplied
    back by the batch's target token count, summed over all batches, divided by
    the total token count and exponentiated.
    """
    total_loss = 0.
    total_words = 0
    for batch_num, datum in enumerate(valid_dataset):
        num_words = int(tf.reduce_sum(datum[4]))
        avg_loss = loss_fun_cp(encoder, decoder, datum)
        # Accumulate across batches; the original assigned with `=`, so only the
        # last batch contributed to the numerator.
        total_loss += float(avg_loss) * num_words
        total_words += num_words
        if batch_num % 50 == 0:
            log_msg(f'ppl Done batch: {batch_num}')
    loss = total_loss / total_words
    return np.exp(loss)

def compute_ppl_nmt(encoder, decoder, valid_dataset):
    """Return the per-token perplexity of the NMT model over ``valid_dataset``.

    Each batch loss from ``loss_fun_nmt`` is a per-token mean; it is multiplied
    back by the batch's target token count, summed over all batches, divided by
    the total token count and exponentiated.
    """
    total_loss = 0.
    total_words = 0
    for batch_num, datum in enumerate(valid_dataset):
        num_words = int(tf.reduce_sum(datum[4]))
        avg_loss = loss_fun_nmt(encoder, decoder, datum)
        # Accumulate across batches; the original assigned with `=`, so only the
        # last batch contributed to the numerator.
        total_loss += float(avg_loss) * num_words
        total_words += num_words
        if batch_num % 50 == 0:
            log_msg(f'ppl Done batch: {batch_num}')
    loss = total_loss / total_words
    return np.exp(loss)

In [None]:
#define the encoder for the NLI part; the decoder is actually the MLP
import numpy as np
opt = tf.train.AdamOptimizer(learning_rate=0.002)
# Encoder(V, d, h, cell): embedding size 256, 512 GRU units.
encoder_nli = Encoder(english_vocab_table.size(), 256, 512, 'gru')

In [None]:
#define the loss value & gradient functions from the original loss functions
# Each wrapper is called with the same arguments as its loss function; the training
# loops unpack its result as (loss_value, gradients) and hand `gradients` to
# clip_gradients, which expects (gradient, variable) pairs.
loss_and_grads_fun_nli = tfe.implicit_value_and_gradients(loss_fun_nli)
loss_and_grads_fun_cp = tfe.implicit_value_and_gradients(loss_fun_cp)
loss_and_grads_fun_nmt = tfe.implicit_value_and_gradients(loss_fun_nmt)

In [None]:
#store the model at this checkpoint whenever the perplexity improves
import os
checkpoint_dir = './encoder_nli'
# Only the optimizer, the encoder and the global step are tracked; the MLP
# weights/biases dicts are not part of this checkpoint.
root = tfe.Checkpoint(optimizer=opt, model=encoder_nli, optimizer_step=tf.train.get_or_create_global_step())
checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt')

In [None]:
#Training the NLI Model
NUM_EPOCHS = 1
STATS_STEPS = 50   # log the running average loss every STATS_STEPS steps
EVAL_STEPS = 500   # evaluate on the validation set every EVAL_STEPS steps

# Baseline validation score; a checkpoint is written only when it improves.
valid_ppl = compute_ppl_nli(encoder_nli, valid_dataset_NLI)
print(f'Start :Valid ppl: {valid_ppl}')

for epoch_num in range(NUM_EPOCHS):
    batch_loss = []
    # dataset_NLI already ends in a shuffle stage, which reshuffles on every new
    # iteration, so it is iterated directly. The original re-wrapped it in another
    # shuffle each epoch (and on each re-run of this cell).
    for step_num, datum in enumerate(dataset_NLI, start=1):
        loss_value, gradients = loss_and_grads_fun_nli(encoder_nli, datum)
        batch_loss.append(float(loss_value))

        if step_num % STATS_STEPS == 0:
            # Average over the steps since the last report (the original averaged
            # only the latest loss_value, so batch_loss was never used).
            print(f'Epoch: {epoch_num} Step: {step_num} Avg Loss: {np.average(batch_loss)}')
            batch_loss = []
        opt.apply_gradients(clip_gradients(gradients, 5.0), global_step=tf.train.get_or_create_global_step())

        if step_num % EVAL_STEPS == 0:
            ppl = compute_ppl_nli(encoder_nli, valid_dataset_NLI)

            #Save model!
            if ppl < valid_ppl:
                save_path = root.save(checkpoint_prefix)
                print(f'Epoch: {epoch_num} Step: {step_num} ppl improved: {ppl} old: {valid_ppl} Model saved: {save_path}')
                valid_ppl = ppl
            else:
                print(f'Epoch: {epoch_num} Step: {step_num} ppl worse: {ppl} old: {valid_ppl}')

    print(f'Epoch{epoch_num} Done!')

In [None]:
#get the training and validation dataset for the CP part, and shuffle them
# Both source and target are looked up in the English vocab table.
dataset_CP = create_dataset_CP(sentences_file_CP, english_vocab_table, 32)
valid_dataset_CP = create_dataset_CP(valid_file_CP, english_vocab_table, 32)
# The datasets are already batched, so these shuffles reorder whole batches.
dataset_CP = dataset_CP.shuffle(buffer_size=10000)
valid_dataset_CP = valid_dataset_CP.shuffle(buffer_size=10000)

In [None]:
#Models for the Task - 2 - CP
# A fresh optimizer replaces the one used for NLI.
opt = tf.train.AdamOptimizer(learning_rate=0.002)
# Same constructor arguments as encoder_nli, so the NLI checkpoint can be restored into it.
encoder_cp = Encoder(english_vocab_table.size(), 256, 512, 'gru')
decoder_cp = Decoder_CP(english_vocab_table.size(), 512, 'gru')

In [None]:
#load back the trained model after the nli part
checkpoint_dir = './encoder_nli'
# The checkpoint tracks the optimizer, the encoder and the global step; decoder_cp
# is not part of it.
root = tfe.Checkpoint(optimizer=opt, model=encoder_cp, optimizer_step=tf.train.get_or_create_global_step())
# NOTE(review): the NLI loop saves only when validation perplexity improves, so
# this directory may hold no checkpoint - confirm that case is handled as intended.
root.restore(tf.train.latest_checkpoint(checkpoint_dir))

In [None]:
#define the new checkpoint to store the trained model after the cp part
import os
checkpoint_dir = './encoder_cp'
# Only the encoder (plus optimizer and global step) is tracked; decoder_cp is not saved.
root = tfe.Checkpoint(optimizer=opt, model=encoder_cp, optimizer_step=tf.train.get_or_create_global_step())
checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt')

In [None]:
#train the model for the CP part
NUM_EPOCHS = 1
STATS_STEPS = 10   # log the running average loss every STATS_STEPS steps
EVAL_STEPS = 100   # evaluate on the validation set every EVAL_STEPS steps

# Baseline validation perplexity; a checkpoint is written only when it improves.
valid_ppl = compute_ppl_cp(encoder_cp, decoder_cp, valid_dataset_CP)
log_msg(f'Start :Valid ppl: {valid_ppl}')

for epoch_num in range(NUM_EPOCHS):
    batch_loss = []
    # dataset_CP already ends in a shuffle stage, which reshuffles on every new
    # iteration, so it is iterated directly. The original re-wrapped it in another
    # shuffle each epoch (and on each re-run of this cell).
    for step_num, datum in enumerate(dataset_CP, start=1):
        loss_value, gradients = loss_and_grads_fun_cp(encoder_cp, decoder_cp, datum)
        batch_loss.append(float(loss_value))

        if step_num % STATS_STEPS == 0:
            # Average over the steps since the last report (the original averaged
            # only the latest loss_value, so batch_loss was never used).
            log_msg(f'Epoch: {epoch_num} Step: {step_num} Avg Loss: {np.average(batch_loss)}')
            batch_loss = []
        opt.apply_gradients(clip_gradients(gradients, 5.0), global_step=tf.train.get_or_create_global_step())

        if step_num % EVAL_STEPS == 0:
            ppl = compute_ppl_cp(encoder_cp, decoder_cp, valid_dataset_CP)

            #Save model!
            if ppl < valid_ppl:
                save_path = root.save(checkpoint_prefix)
                log_msg(f'Epoch: {epoch_num} Step: {step_num} ppl improved: {ppl} old: {valid_ppl} Model saved: {save_path}')
                valid_ppl = ppl
            else:
                log_msg(f'Epoch: {epoch_num} Step: {step_num} ppl worse: {ppl} old: {valid_ppl}')

    log_msg(f'Epoch{epoch_num} Done!')

In [None]:
#get the NMT training and validation sets
dataset_NMT = create_dataset_NMT(sentences_file_NMT, english_vocab_table, german_vocab_table, 32)
valid_dataset_NMT = create_dataset_NMT(valid_file_NMT, english_vocab_table, german_vocab_table, 32)
# The datasets are already batched, so these shuffles reorder whole batches.
dataset_NMT = dataset_NMT.shuffle(buffer_size=10000)
valid_dataset_NMT = valid_dataset_NMT.shuffle(buffer_size=10000)

In [None]:
#Model for the Task - NMT
# A fresh optimizer replaces the one used for CP.
opt = tf.train.AdamOptimizer(learning_rate=0.002)
# Same constructor arguments as encoder_cp, so the CP checkpoint can be restored into it.
encoder_nmt = Encoder(english_vocab_table.size(), 256, 512, 'gru')
# The decoder is sized by the German vocab and owns a 256-dimensional embedding table.
decoder_nmt = Decoder_NMT(german_vocab_table.size(), 256, 512, 'gru')

In [None]:
#load back the encoder trained after the cp part
checkpoint_dir = './encoder_cp'
root = tfe.Checkpoint(optimizer=opt, model=encoder_nmt, optimizer_step=tf.train.get_or_create_global_step())
# NOTE(review): the CP loop saves only when validation perplexity improves, so
# this directory may hold no checkpoint - confirm that case is handled as intended.
root.restore(tf.train.latest_checkpoint(checkpoint_dir))

In [None]:
#define the new checkpoint to store the trained model after the nmt part
import os
checkpoint_dir = './encoder_nmt'
# Only the encoder (plus optimizer and global step) is tracked; decoder_nmt is not saved.
root = tfe.Checkpoint(optimizer=opt, model=encoder_nmt, optimizer_step=tf.train.get_or_create_global_step())
checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt')

In [None]:
#train the model for the NMT part
NUM_EPOCHS = 1
STATS_STEPS = 50   # log the running average loss every STATS_STEPS steps
EVAL_STEPS = 500   # evaluate on the validation set every EVAL_STEPS steps

# Baseline validation perplexity; a checkpoint is written only when it improves.
valid_ppl = compute_ppl_nmt(encoder_nmt, decoder_nmt, valid_dataset_NMT)
log_msg(f'Start :Valid ppl: {valid_ppl}')

for epoch_num in range(NUM_EPOCHS):
    batch_loss = []
    # dataset_NMT already ends in a shuffle stage, which reshuffles on every new
    # iteration, so it is iterated directly. The original re-wrapped it in another
    # shuffle each epoch (and on each re-run of this cell).
    for step_num, datum in enumerate(dataset_NMT, start=1):
        loss_value, gradients = loss_and_grads_fun_nmt(encoder_nmt, decoder_nmt, datum)
        batch_loss.append(float(loss_value))

        if step_num % STATS_STEPS == 0:
            # Average over the steps since the last report (the original averaged
            # only the latest loss_value, so batch_loss was never used).
            log_msg(f'Epoch: {epoch_num} Step: {step_num} Avg Loss: {np.average(batch_loss)}')
            batch_loss = []
        opt.apply_gradients(clip_gradients(gradients, 5.0), global_step=tf.train.get_or_create_global_step())

        if step_num % EVAL_STEPS == 0:
            ppl = compute_ppl_nmt(encoder_nmt, decoder_nmt, valid_dataset_NMT)

            #Save model!
            if ppl < valid_ppl:
                save_path = root.save(checkpoint_prefix)
                log_msg(f'Epoch: {epoch_num} Step: {step_num} ppl improved: {ppl} old: {valid_ppl} Model saved: {save_path}')
                valid_ppl = ppl
            else:
                log_msg(f'Epoch: {epoch_num} Step: {step_num} ppl worse: {ppl} old: {valid_ppl}')

    log_msg(f'Epoch{epoch_num} Done!')

In [None]:
#loading the final trained model
# Restores the latest checkpoint in ./encoder_nmt into encoder_nmt, replacing the
# weights left in memory by the last training step.
checkpoint_dir = './encoder_nmt'
root = tfe.Checkpoint(optimizer=opt, model=encoder_nmt, optimizer_step=tf.train.get_or_create_global_step())
root.restore(tf.train.latest_checkpoint(checkpoint_dir))

In [None]:
#get the final word embeddings of the encoder
# This is the Embedding layer object itself; calling it on indexes returns the vectors.
word_embeddings = encoder_nmt.word_embedding

In [None]:
#get the embeddings of all the words in the vocab
# Looks up every index 0 .. vocab_size-1 and converts the result to a NumPy array.
all_embeddings = word_embeddings([i for i in range(int(english_vocab_table.size()))]).numpy()

In [None]:
#make an inverse map - index2word - so that words can be written back to a file
# export() is indexed as (keys, values) here; the keys are used as byte strings
# further down (they are .decode()d before being written).
keys = english_vocab_table.export()[0].numpy()
values = english_vocab_table.export()[1].numpy()
index2word = {v: k for k, v in zip(keys, values)}
word2index = {k: v for k, v in zip(keys, values)}

In [None]:
#using the sklearn cosine similarity, return the distances and indices of the closest k points from any fixed point
from sklearn.neighbors import NearestNeighbors
def cosine_knn(corpus_vector, queries_vector, k=6):
    """Find the ``k`` nearest corpus vectors for every query under cosine distance.

    Args:
        corpus_vector: vectors the brute-force index is fitted on.
        queries_vector: vectors to find neighbours for.
        k: number of neighbours returned per query.

    Returns:
        ``(distances, indices)`` as returned by ``NearestNeighbors.kneighbors``.
    """
    index = NearestNeighbors(n_neighbors=k, algorithm='brute', metric='cosine')
    index.fit(corpus_vector)
    return index.kneighbors(queries_vector)

In [None]:
#use only the indices
# The corpus is queried against itself with the default k=6; create_closest_word_file
# later drops the first neighbour of every row.
closest_indices = cosine_knn(all_embeddings, all_embeddings)[1]

In [None]:
#create the file with 5 closest neighbours of each word in the vocab
def create_closest_word_file(closest_indices, english_vocab_table, index2word):
    """Write every vocabulary word with its nearest neighbours to 'closest_neighbours.txt'.

    One line per index ``0 .. english_vocab_table.size() - 1`` in the form
    ``word, neighbour1, neighbour2, ...``. The first entry of each row of
    ``closest_indices`` is skipped. ``index2word`` maps an index to a UTF-8
    encoded byte string.
    """
    with open('closest_neighbours.txt', 'w+') as out:
        vocab_size = int(english_vocab_table.size())
        for word_id in range(vocab_size):
            word = str(index2word[word_id].decode("utf-8"))
            neighbour_ids = list(closest_indices[word_id][1:])
            neighbours = [str(index2word[n].decode("utf-8")) for n in neighbour_ids]
            out.write(word + ', ' + ', '.join(neighbours) + '\n')
            

In [None]:
# Writes closest_neighbours.txt into the current working directory.
create_closest_word_file(closest_indices, english_vocab_table, index2word)