# CS224N Assignment 4 (using TF 2.0)

Good tutorials on TF: https://riptutorial.com/tensorflow/example/29069/how-to-use-tf-gather-nd

colab timeout:

https://stackoverflow.com/questions/57113226/how-to-prevent-google-colab-from-disconnecting/57114793

https://www.reddit.com/r/datascience/comments/bkrzah/google_colab_how_to_avoid_timeoutdisconnect_issues/

In [0]:
# !pip install tensorflow-gpu #no need to do this anymore as it's available by default in colab

In [0]:
%tensorflow_version 2.x
import tensorflow as tf
print(tf.__version__) #use TF vesion 2.0

TensorFlow 2.x selected.
2.0.0


## 1. UTILITY FUNCTIONS

In [0]:
#utils.py

"""
CS224N 2018-19: Homework 4
utils.py: Utility Functions
Pencheng Yin <pcyin@cs.cmu.edu>
Sahil Chopra <schopra8@stanford.edu>
Implemented in TF 2.0 by Amit Patel
"""

import math
from typing import List

import numpy as np
import tensorflow as tf



def pad_sents(sents, pad_token):
    """ Right-pad every sentence of a batch to the length of the longest one.
    @param sents (list[list[str]]): batch of sentences, each one a list of words
    @param pad_token (str): token appended to fill up the shorter sentences
    @returns sents_padded (list[list[str]]): new list holding a padded copy of each
        sentence; all copies share the same length and the inputs are not modified.
    """
    # Length of the longest sentence in the batch (0 when the batch is empty).
    longest = max((len(sent) for sent in sents), default=0)

    sents_padded = []
    for sent in sents:
        row = sent[:]  # shallow copy, so the caller's sentence is left untouched
        shortfall = longest - len(row)
        if shortfall > 0:
            row.extend([pad_token] * shortfall)
        sents_padded.append(row)

    return sents_padded



def read_corpus(file_path, source):
    """ Read a corpus file in which each sentence is delineated by a newline.
    @param file_path (str): path to file containing corpus
    @param source (str): "tgt" or "src" indicating whether text
        is of the source language or target language
    @returns data (list[list[str]]): one list of tokens per line of the file. Each line
        is stripped and split on single spaces, so a blank line yields [''].
        Target sentences are additionally wrapped in '<s>' ... '</s>'.
    """
    data = []
    # The context manager closes the file handle even if reading fails part-way.
    with open(file_path) as corpus_file:
        for line in corpus_file:
            sent = line.strip().split(' ')
            # only append <s> and </s> to the target sentence
            if source == 'tgt':
                sent = ['<s>'] + sent + ['</s>']
            data.append(sent)

    return data


def batch_iter(data, batch_size, shuffle=False):
    """ Generate mini-batches of (source sentences, target sentences).

    Within each batch the pairs are ordered by source-sentence length, longest first.
    @param data (list of (src_sent, tgt_sent)): sentence pairs to split into batches
    @param batch_size (int): maximum number of pairs per batch
    @param shuffle (boolean): if set, visit the pairs in a random order
    @yields (src_sents, tgt_sents): two parallel lists for one batch
    """
    total = len(data)
    n_batches = math.ceil(total / batch_size)

    order = list(range(total))
    if shuffle:
        np.random.shuffle(order)

    for batch_no in range(n_batches):
        lo = batch_no * batch_size
        hi = lo + batch_size
        batch = [data[pos] for pos in order[lo:hi]]

        # Stable in-place sort, longest source sentence first.
        batch.sort(key=lambda pair: len(pair[0]), reverse=True)

        yield [pair[0] for pair in batch], [pair[1] for pair in batch]



## 2. VOCABULARY
This section generates the word2id and id2word dictionaries for the source and target languages.

In [0]:
#vocab.py
"""
CS224N 2018-19: Homework 4
vocab.py: Vocabulary Generation
Pencheng Yin <pcyin@cs.cmu.edu>
Sahil Chopra <schopra8@stanford.edu>
Amit Patel: Changed to TF 2.0

Usage:
    vocab.py --train-src=<file> --train-tgt=<file> [options] VOCAB_FILE

Options:
    -h --help                  Show this screen.
    --train-src=<file>         File of training source sentences
    --train-tgt=<file>         File of training target sentences
    --size=<int>               vocab size [default: 50000]
    --freq-cutoff=<int>        frequency cutoff [default: 2]
"""

from collections import Counter
from docopt import docopt
from itertools import chain
import json
from typing import List


class VocabEntry(object):
    """ Vocabulary Entry, i.e. structure containing either
    src or tgt language terms.

    Holds two mirrored dictionaries: `word2id` (word -> index) and
    `id2word` (index -> word).
    """
    def __init__(self, word2id=None):
        """ Init VocabEntry Instance.
        @param word2id (dict): dictionary mapping words 2 indices. When omitted (or
            empty) a fresh vocabulary holding only the four special tokens is created.
        """
        if word2id:
            self.word2id = word2id
        else:
            self.word2id = dict()
            self.word2id['<pad>'] = 0   # Pad Token
            self.word2id['<s>'] = 1 # Start Token
            self.word2id['</s>'] = 2    # End Token
            self.word2id['<unk>'] = 3   # Unknown Token
        self.unk_id = self.word2id['<unk>']
        self.id2word = {v: k for k, v in self.word2id.items()}

    def __getitem__(self, word):
        """ Retrieve word's index. Return the index for the unk
        token if the word is out of vocabulary.
        @param word (str): word to look up.
        @returns index (int): index of word
        """
        return self.word2id.get(word, self.unk_id)

    def __contains__(self, word):
        """ Check if word is captured by VocabEntry.
        @param word (str): word to look up
        @returns contains (bool): whether word is contained
        """
        return word in self.word2id

    def __setitem__(self, key, value):
        """ Raise error, if one tries to edit the VocabEntry.
        """
        raise ValueError('vocabulary is readonly')

    def __len__(self):
        """ Compute number of words in VocabEntry.
        @returns len (int): number of words in VocabEntry
        """
        return len(self.word2id)

    def __repr__(self):
        """ Representation of VocabEntry to be used
        when printing the object.
        """
        return 'Vocabulary[size=%d]' % len(self)

    def id2word(self, wid):
        """ Return mapping of index to word.
        @param wid (int): word index
        @returns word (str): word corresponding to index

        NOTE: __init__ stores a dict under the same attribute name, so on an
        instance `entry.id2word` resolves to that dict and this method is
        shadowed. Look words up with `entry.id2word[wid]` instead.
        """
        return self.id2word[wid]

    def add(self, word):
        """ Add word to VocabEntry, if it is previously unseen.
        @param word (str): word to add to VocabEntry
        @return index (int): index that the word has been assigned
        """
        if word not in self:
            # The next free index equals the current vocabulary size.
            wid = self.word2id[word] = len(self)
            self.id2word[wid] = word
            return wid
        else:
            return self[word]

    def words2indices(self, sents):
        """ Convert list of words or list of sentences of words
        into list or list of list of indices.
        @param sents (list[str] or list[list[str]]): sentence(s) in words
        @return word_ids (list[int] or list[list[int]]): sentence(s) in indices;
            an empty input yields an empty list.
        """
        # Nothing to inspect or convert; avoids indexing into an empty list below.
        if len(sents) == 0:
            return []
        # The first element decides whether this is a batch or a single sentence.
        if isinstance(sents[0], list):
            return [[self[w] for w in s] for s in sents]
        else:
            return [self[w] for w in sents]

    def indices2words(self, word_ids):
        """ Convert list of indices into words.
        @param word_ids (list[int]): list of word ids
        @return sents (list[str]): list of words
        """
        return [self.id2word[w_id] for w_id in word_ids]

    def to_input_tensor(self, sents, device):
        """ Convert list of sentences (words) into tensor with necessary padding for
        shorter sentences.

        @param sents (List[List[str]]): list of sentences (words)
        @param device: device on which to load the tensor, i.e. CPU or GPU

        @returns sents_var: tensor of (batch_size, max_sentence_length)
        """
        word_ids = self.words2indices(sents)
        sents_t = pad_sents(word_ids, self.word2id['<pad>'])

        with tf.device(device):
            sents_var = tf.constant(sents_t, dtype=tf.int64) #there's a bug in TF2.0 RC0 that prevents tf.int32 tensor to be placed on GPU (but tf.int16/64 or tf.float32 is fine)
        #print(sents_var.device)

        #sents_var = tf.linalg.matrix_transpose(sents_var) #Don't do this for TF as we want batch to be the first dimension (unlike Pytorch)
        return sents_var

    @staticmethod
    def from_corpus(corpus, size, freq_cutoff=2):
        """ Given a corpus construct a Vocab Entry.
        @param corpus (list[list[str]]): corpus of text produced by read_corpus function
        @param size (int): maximum number of corpus words added to the vocabulary
            (the four special tokens come on top of this)
        @param freq_cutoff (int): if word occurs n < freq_cutoff times, drop the word
        @returns vocab_entry (VocabEntry): VocabEntry instance produced from provided corpus
        """
        vocab_entry = VocabEntry()
        word_freq = Counter(chain(*corpus))
        valid_words = [w for w, v in word_freq.items() if v >= freq_cutoff]
        print('number of word types: {}, number of word types w/ frequency >= {}: {}'
              .format(len(word_freq), freq_cutoff, len(valid_words)))
        # Most frequent words first; ties keep their first-seen order (stable sort).
        top_k_words = sorted(valid_words, key=lambda w: word_freq[w], reverse=True)[:size]
        for word in top_k_words:
            vocab_entry.add(word)
        return vocab_entry


class Vocab(object):
    """ Vocab encapsulating src and target languages.
    """
    def __init__(self, src_vocab, tgt_vocab):
        """ Init Vocab.
        @param src_vocab (VocabEntry): VocabEntry for source language
        @param tgt_vocab (VocabEntry): VocabEntry for target language
        """
        self.src = src_vocab
        self.tgt = tgt_vocab

    @staticmethod
    def build(src_sents, tgt_sents, vocab_size, freq_cutoff) -> 'Vocab':
        """ Build Vocabulary.
        @param src_sents (list[list[str]]): Source sentences provided by read_corpus() function
        @param tgt_sents (list[list[str]]): Target sentences provided by read_corpus() function (will have beginning and end of sentence tokens)
        @param vocab_size (int): Size of vocabulary for both source and target languages
        @param freq_cutoff (int): if word occurs n < freq_cutoff times, drop the word.
        @returns Vocab object holding the freshly built source and target vocabularies
        """
        assert len(src_sents) == len(tgt_sents)

        print('initialize source vocabulary ..')
        src = VocabEntry.from_corpus(src_sents, vocab_size, freq_cutoff)

        print('initialize target vocabulary ..')
        tgt = VocabEntry.from_corpus(tgt_sents, vocab_size, freq_cutoff)

        return Vocab(src, tgt)

    def save(self, file_path):
        """ Save Vocab to file as JSON dump.
        @param file_path (str): file path to vocab file
        """
        payload = dict(src_word2id=self.src.word2id, tgt_word2id=self.tgt.word2id)
        # Closing the handle here guarantees the JSON is flushed to disk on return.
        with open(file_path, 'w') as vocab_file:
            json.dump(payload, vocab_file, indent=2)

    @staticmethod
    def load(file_path):
        """ Load vocabulary from JSON dump.
        @param file_path (str): file path to vocab file
        @returns Vocab object loaded from JSON dump
        """
        with open(file_path, 'r') as vocab_file:
            entry = json.load(vocab_file)
        src_word2id = entry['src_word2id']
        tgt_word2id = entry['tgt_word2id']

        return Vocab(VocabEntry(src_word2id), VocabEntry(tgt_word2id))

    def __repr__(self):
        """ Representation of Vocab to be used
        when printing the object.
        """
        return 'Vocab(source %d words, target %d words)' % (len(self.src), len(self.tgt))



def main_vocabulory(train_src, train_tgt, size=50000, freq_cutoff=2, VOCAB_FILE=None, create=False, readin=True):
    '''
    Main function for managing the vocabulary (i.e. create and/or read in):
    @param train_src (str): File of training source sentences
    @param train_tgt (str): File of training target sentences
    @param size (int): vocab size [default: 50000]
    @param freq_cutoff (int): frequency cutoff [default: 2]
    @param VOCAB_FILE (str): File the vocabulary is stored in / read from
    @param create (bool): A flag to create the vocabulary and save it to VOCAB_FILE
    @param readin (bool): A flag to read in the already created vocabulary file
    @returns Vocab loaded from VOCAB_FILE when `readin` is set, otherwise None
    '''

    if create:
        print('read in source sentences: %s' % train_src)
        print('read in target sentences: %s' % train_tgt)

        src_sents = read_corpus(train_src, source='src')
        tgt_sents = read_corpus(train_tgt, source='tgt') #(will have beginning and end of sentence tokens)
        print('\nExample of source and target sentences: \nSource: {} \nTarget: {}\n'.format(src_sents[0], tgt_sents[0]))

        vocab = Vocab.build(src_sents, tgt_sents, int(size), int(freq_cutoff))
        print('generated vocabulary, source %d words, target %d words' % (len(vocab.src), len(vocab.tgt)))

        vocab.save(VOCAB_FILE)
        print('vocabulary saved to %s' % VOCAB_FILE)

    if readin:
        # Always reload from disk, even right after creating, so callers get
        # exactly what was persisted.
        return Vocab.load(VOCAB_FILE)

    return None

In [0]:
#Create/readin the Vocabulary file
from google.colab import drive
drive.mount('/content/gdrive')

import os
os.chdir('/content/gdrive/My Drive/Colab Notebooks/Deep_Learning/CS224N_2019/a4')
!pwd
!ls
print()
train_src = './a4/en_es_data/train.es'
train_tgt = './a4/en_es_data/train.en'
VOCAB_FILE = './VOCAB_FILE.json'
VOCABULARY = None #global variable to store the vocabulary
VOCABULARY = main_vocabulory(train_src, train_tgt, VOCAB_FILE=VOCAB_FILE, create=False, readin=True)
print(VOCABULARY)
#VOCABULARY.src.id2word[1] #/tgt.word2id/id2word #for the source vocab dictionary
# print(VOCABULARY.src.id2word(1))
!ls

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive
/content/gdrive/My Drive/Colab Notebooks/Deep_Learning/CS224N_2019/a4
a4	A4_Tensorflow.ipynb	    __MACOSX   __pycache__  VOCAB_FILE.json
a4.pdf	Keras_RNN_Playground.ipynb  model.png  saved_model

Vocab(source 50004 words, target 50002 words)
a4	A4_Tensorflow.ipynb	    __MACOSX   __pycache__  VOCAB_FILE.json
a4.pdf	Keras_RNN_Playground.ipynb  model.png  saved_model


## VOCAB Playground

In [0]:
#Vocabulary playground
# Scratch cell: pushes the first four target sentences through the vocabulary
# and prints the padded id tensor plus its padding mask.
print(VOCABULARY)
print(len(VOCABULARY.tgt))
# Only the first four target sentences are kept, to keep the printout small.
tgt_corpus = read_corpus(train_tgt, 'tgt')[0:4]
print(tgt_corpus)
# Convert words to ids, pad to a common length and place the tensor on GPU:0.
target_padded = VOCABULARY.tgt.to_input_tensor(tgt_corpus, 'GPU:0')
# The second tensor printed is the same batch with its last column dropped.
print('target padded', target_padded, target_padded[:,:-1])
# Float mask: 1.0 where the id differs from the <pad> id, 0.0 on padding.
target_masks = tf.cast(target_padded != VOCABULARY.tgt['<pad>'], tf.float32)
print(target_masks)

# The string below is a disabled experiment kept for reference (device placement per dtype).
'''
#There's a bug in TF2.0 RC0 that prevents tf.int32 tensor to be placed on GPU (but tf.int16/64 or tf.float32 is fine)
dtype = tf.int64
var1 = np.array([[1,2,3],[11,22,33]])
with tf.device("GPU:0"):
    var1 = tf.constant(var1, dtype=dtype)
print(var1.device)

var1 = np.array([[1,2,3],[11,22,33]])
with tf.device("CPU:0"):
    var1 = tf.constant(var1, dtype=dtype)
print(var1.device)
'''

Vocab(source 50004 words, target 50002 words)
50002
[['<s>', 'Thank', 'you', 'so', 'much,', 'Chris.', 'And', "it's", 'truly', 'a', 'great', 'honor', 'to', 'have', 'the', 'opportunity', 'to', 'come', 'to', 'this', 'stage', 'twice;', "I'm", 'extremely', 'grateful.', '</s>'], ['<s>', 'I', 'have', 'been', 'blown', 'away', 'by', 'this', 'conference,', 'and', 'I', 'want', 'to', 'thank', 'all', 'of', 'you', 'for', 'the', 'many', 'nice', 'comments', 'about', 'what', 'I', 'had', 'to', 'say', 'the', 'other', 'night.', '</s>'], ['<s>', 'And', 'I', 'say', 'that', 'sincerely,', 'partly', 'because', '(Mock', 'sob)', 'I', 'need', 'that.', '', 'Put', 'yourselves', 'in', 'my', 'position.', '</s>'], ['<s>', 'I', 'flew', 'on', 'Air', 'Force', 'Two', 'for', 'eight', 'years.', '</s>']]
target padded tf.Tensor(
[[    1   183    13    39  1587  6916    15    38  1094     7   199  2917
      5    21     4   796     5   149     5    16  1262 45903    77   926
  16104     2     0     0     0     0     0     0]


'\n#There\'s a bug in TF2.0 RC0 that prevents tf.int32 tensor to be placed on GPU (but tf.int16/64 or tf.float32 is fine)\ndtype = tf.int64\nvar1 = np.array([[1,2,3],[11,22,33]])\nwith tf.device("GPU:0"):\n    var1 = tf.constant(var1, dtype=dtype)\nprint(var1.device)\n\nvar1 = np.array([[1,2,3],[11,22,33]])\nwith tf.device("CPU:0"):\n    var1 = tf.constant(var1, dtype=dtype)\nprint(var1.device)\n'

In [0]:
# Playground
# Scratch cell exploring sequence padding and the difference between TF constants and variables.
#1. Pad sentences (pad an uneven list of lists into a 2d matrix)
x = [[1,2,3], [1,21], [1], [11,22,1,3]]
# padding='post' appends the fill value (0) after each sequence instead of before it.
x_padded = tf.keras.preprocessing.sequence.pad_sequences(x, padding='post', value=0.0)
print(x_padded)

#Constants and Variables
a = tf.constant([1,2,3])
a2 = tf.constant(a)
b = tf.ones((3,1))

# A Variable initialised from the constant's values.
c = tf.Variable(initial_value=a)
print(a, a2, b, c)
print(c[2])
print()

print(a[0], a2[0], b[0], c[0])
#a[0] = 11 #error: a constant tensor does not support item assignment
#b[0] = 11 #error: a constant tensor does not support item assignment
#c[0] = 11 #error: a Variable does not support item assignment either, but it has an assign() method to update values
c[0].assign(11)
print(c)
print()

# Good reads on packing padded sentences for an RNN
# 1. PyTorch: https://suzyahyah.github.io/pytorch/2019/07/01/DataLoader-Pad-Pack-Sequence.html
'''
        The whole sequence is:
        pad
        embed
        pack_padded
        – [rnn] -->
        pad_packed
        eval
'''
# 2. TF: https://danijar.com/variable-sequence-lengths-in-tensorflow/

# Pack padding is not needed in TF 2.0: the Embedding layer can emit a mask that mask-aware layers downstream consume.

[[ 1  2  3  0]
 [ 1 21  0  0]
 [ 1  0  0  0]
 [11 22  1  3]]
tf.Tensor([1 2 3], shape=(3,), dtype=int32) tf.Tensor([1 2 3], shape=(3,), dtype=int32) tf.Tensor(
[[1.]
 [1.]
 [1.]], shape=(3, 1), dtype=float32) <tf.Variable 'Variable:0' shape=(3,) dtype=int32, numpy=array([1, 2, 3], dtype=int32)>
tf.Tensor(3, shape=(), dtype=int32)

tf.Tensor(1, shape=(), dtype=int32) tf.Tensor(1, shape=(), dtype=int32) tf.Tensor([1.], shape=(1,), dtype=float32) tf.Tensor(1, shape=(), dtype=int32)
<tf.Variable 'Variable:0' shape=(3,) dtype=int32, numpy=array([11,  2,  3], dtype=int32)>



'\n        The whole sequence is:\n        pad\n        embed\n        pack_padded\n        – [rnn] -->\n        pad_packed\n        eval\n'

## 3. MODEL

In [0]:
import tensorflow as tf
import tensorflow.keras as K

In [0]:
#Playground
# Fix TensorFlow's global random seed to 0.
tf.random.set_seed(0)

'''
#Embeddings
print('1. EMBEDDINGS')
tf.random.set_seed(1)
embd = ModelEmbeddings(3, VOCABULARY)
x = tf.constant([[1,0,3,0]], dtype=tf.int32)
# x = tf.constant([1], dtype=tf.int32)
print(embd.source(x))
print(embd.source.variables)

in_ = K.layers.Input(shape=(4,))
y = embd.source(in_)
x = tf.constant([[1,0,3,0]], dtype=tf.int32)
model = K.Model(inputs=in_, outputs=y)
print(y)
print(model(x))
print()


#TF Basics
a = tf.Variable([1,2])
print(a)
a = tf.cast(a, dtype=tf.float32)
print(a)

a = tf.Variable([[1.0,2,3], [11,22,33]])
print(a)
b = tf.nn.softmax(a, axis=-1)
print(b)
c = tf.nn.log_softmax(a, axis=-1)
print(c)

print('zzz')
a = tf.ones((5,4,2))
b = tf.split(a, a.shape[1], 1)
print(b)
print(a[:,1,:])


device_dict = {'cpu': "CPU:0", 'gpu': "GPU:0"}
with tf.device(device_dict['gpu']):
    #Embeddings layer
    model_embeddings = K.layers.Embedding(4, 3)
    print(model_embeddings.trainable_variables)
    x = tf.constant([[1,0,3,0]], dtype=tf.int64)
    print(model_embeddings(x))
    print(model_embeddings.trainable_variables)
    print(model_embeddings.trainable_variables[0].device)
    print(x.device)


# TF Boolean mask

# This doesn't work
a = tf.Variable([[1,2],[11,22],[1,3]])
b = tf.Variable([[1,0],[1,1],[1,0]])
print(a>2)
print(tf.boolean_mask(a,a>2))
# a[b].assign(1)
# a

#This works
a = tf.Variable([[1,2],[11,22],[1,3]])
b = tf.Variable([[1,0],[1,1],[1,0]])
b = tf.cast(b, tf.bool)
print('A: ', a>2)
print('B: ', tf.where(a>2, 1, 0))
print('C: ', tf.where(b, a, 0))
# a[b].assign(1)
# a

#Numpy Boolean mask
a = np.array([[1,2],[11,22],[1,3]])
b = np.array([[1,0],[1,1],[1,0]], dtype=np.bool) #boolean mask
a[b] = 10
print(a)
a[a==2] = 11
print(a)

# Reshape vs Transpose (In most cases, use transpose and not reshape, esp when using matrix multiplication!)
a = tf.random.uniform([2,3])
print(a,'\n')
print(tf.transpose(a, [1,0]),'\n')
print(tf.reshape(a, [3,2]),'\n')

#tf.gather
b,t,v = 2,3,2
params = tf.random.uniform([b,t,v])
# params = tf.transpose(params)
print(params, '\n')
print()
#print(params[:,[1,2]]) #doesn't work
indices = [0,1,1,1,0,0]
gthr = tf.gather(params, indices, axis=0)
print(gthr, '\n')
# indices = tf.constant([[1],[0]])
#gthr = tf.gather_nd(params, indices)
#print(gthr, '\n')


params = tf.random.uniform([b,t,v])
# indices = [1,0] #tf.constant([[1],[2]])
indices = [[0,1,0], [1,1,1]]
indices = tf.constant([[1],[0],[1],[0],[1],[0]])
print(params, '\n')
print(indices, '\n')
gthr = tf.gather(params, indices, axis=2)
print(gthr, '\n')
gthr = tf.gather_nd(params, indices)
print(gthr, '\n')
params = tf.reshape(params, [-1,v])
print(params, '\n')
params = tf.reshape(params, [b,t,v])
print(params, '\n')

indices = [[0, 0], [1, 1]]
params = [['a', 'b'], ['c', 'd']]
#output = ['a', 'd']
gthr = tf.gather_nd(params, indices)
print(gthr, '\n')


params = tf.constant([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
# indices = tf.constant([[0,0], [2,1]])
indices = [0,1]
print(tf.gather(params, indices, axis=0))
'''


'''
#TF vs Pytroch gather function comparison

import time
import numpy as np
import tensorflow as tf
from numba import jit
import torch

tf.gather_nd([[1,2,3]], [[1]]) #do this first as tf.gather_nd() takes a lot of time the first time its called

# b, t, v = 2, 4, 5
b, t, v = 64, 300, 20000
params = np.arange(b*t*v).reshape([b,t,v])
indices = np.random.randint(0,v, [b,t])

def func_pythn(indices, params):
    #Using for Python loop
    def get_indices(indices):
        #indices = [[[0,0,0]],[[1,1,0]]]
        indices = [[[i,j,indices[i,j]] for j in range(indices.shape[1])] for i in range(indices.shape[0])] #this is slow
        return indices
    indices = get_indices(indices)
    indices = tf.Variable(indices)
    selected = tf.gather_nd(params, indices)
    return selected

@jit #(nopython=True, parallel=True) # about 3x faster
def func_pythn_numba(indices, params):
    #using python for loop but then optimized using Numba
    def get_indices(indices):
        #indices = [[[0,0,0]],[[1,1,0]]]
        indices = [[[i,j,indices[i,j]] for j in range(indices.shape[1])] for i in range(indices.shape[0])] #this is slow
        return indices
    indices = get_indices(indices)
    indices = tf.Variable(indices)
    selected = tf.gather_nd(params, indices)
    return selected

def func_tf_with_reshape(indices, params):
    #purely TF with reshape
    (b,t,v) = params.shape #indices: (b,t)
    params = tf.reshape(params, [-1, v])
    indices = tf.reshape(indices, [-1,1])
    indices = tf.stack([tf.range(b*t, dtype=tf.int64),indices[:,0]],axis=-1)
    selected = tf.gather_nd(params, indices)
    selected = tf.reshape(selected, [b,t])
    return selected

def func_tf(indices, params):
    #purely TF without reshape
    (b,t,v) = params.shape #indices: (b,t)
    B,T = tf.meshgrid(tf.range(b), tf.range(t), indexing='ij')
    indices = tf.stack([B,T,indices], axis=-1) #(b,t,3)
    selected = tf.gather_nd(params, indices)
    return selected



def func_trch(indices, params):
    #purely Pytroch
    #params = torch.tensor(params)
    #indices = torch.tensor(indices)
    indices = torch.unsqueeze(indices, dim=-1)
    selected = torch.gather(params, 2, indices)
    return selected


pp = tf.Variable(params, dtype=tf.float32)
strt = time.time()
func_pythn(indices, pp)
print('func_pythn w/ TF: ', time.time()-strt)

pp = tf.Variable(params, dtype=tf.float32)
strt = time.time()
func_pythn_numba(indices, pp)
print('func_pythn_numba w/ TF: ', time.time()-strt)

pp = tf.Variable(params, dtype=tf.float32)
strt = time.time()
func_pythn_numba(indices, pp)
print('func_pythn_numba w/ TF: ', time.time()-strt)


pp = tf.Variable(params, dtype=tf.float32)
strt = time.time()
func_tf_with_reshape(indices, pp)
print('func_tf_with_reshape: ', time.time()-strt)


pp = tf.Variable(params, dtype=tf.float32)
strt = time.time()
func_tf(indices, pp)
print('func_tf: ', time.time()-strt)


pp = torch.tensor(params, dtype=torch.float32)
ii = torch.tensor(indices)
strt = time.time()
func_trch(ii, pp)
print('func_trch: ', time.time()-strt)

print('\nTF Gradient')
with tf.GradientTape() as t:
    params_tf = tf.Variable(params, dtype=tf.float32)
    y_tf = func_tf(indices, params_tf)
    y = tf.reduce_sum(y_tf)
# print(y_tf, '\n')
print(y, '\n')
g_tf = t.gradient(y, params_tf)
# print(g_tf, '\n')

print('\nPytorch Gradient')
params_trch = torch.tensor(params, dtype=torch.float32)
params_trch.requires_grad = True
indices_trch = torch.tensor(indices)
y_trch = func_trch(indices_trch, params_trch)
y = torch.sum(y_trch)
y.backward()
# print(y_trch, '\n')
print(y, '\n')
g_trch = params_trch.grad
# print(g_trch, '\n')

print('TF vs Pytroch Gradient Comparison')
print(np.all(g_tf.numpy() == g_trch.numpy()))


#TF Gradient Tape
a = tf.Variable(1.0)
b = tf.Variable(2.0)
b2 = tf.Variable(3.0)
a2 = tf.stop_gradient(a)
with tf.GradientTape() as t:
    c = 2*a2 + b + a
grad = t.gradient(c, a)
print(grad)
'''

"\n#TF vs Pytroch gather function comparison\n\nimport time\nimport numpy as np\nimport tensorflow as tf\nfrom numba import jit\nimport torch\n\ntf.gather_nd([[1,2,3]], [[1]]) #do this first as tf.gather_nd() takes a lot of time the first time its called\n\n# b, t, v = 2, 4, 5\nb, t, v = 64, 300, 20000\nparams = np.arange(b*t*v).reshape([b,t,v])\nindices = np.random.randint(0,v, [b,t])\n\ndef func_pythn(indices, params):\n    #Using for Python loop\n    def get_indices(indices):\n        #indices = [[[0,0,0]],[[1,1,0]]]\n        indices = [[[i,j,indices[i,j]] for j in range(indices.shape[1])] for i in range(indices.shape[0])] #this is slow\n        return indices\n    indices = get_indices(indices)\n    indices = tf.Variable(indices)\n    selected = tf.gather_nd(params, indices)\n    return selected\n\n@jit #(nopython=True, parallel=True) # about 3x faster\ndef func_pythn_numba(indices, params):\n    #using python for loop but then optimized using Numba\n    def get_indices(indices):\n

In [0]:
#model_embeddings.py

"""
CS224N 2018-19: Homework 4
model_embeddings.py: Embeddings for the NMT model
Pencheng Yin <pcyin@cs.cmu.edu>
Sahil Chopra <schopra8@stanford.edu>
Anand Dhoot <anandd@stanford.edu>
Implemented in TF 2.0 by Amit Patel

"""
#Embeddings Layer
class ModelEmbeddings(tf.keras.layers.Layer):
    """
    Keras layer that bundles the word-embedding tables of the source and the
    target language.
    """
    def __init__(self, embed_size, vocab):
        """
        Create one Embedding layer per language.

        @param embed_size (int): Embedding size (dimensionality)
        @param vocab (Vocab): Vocabulary object containing src and tgt languages
                              See vocab.py for documentation.
        """
        super().__init__()
        self.embed_size = embed_size

        # Pad-token ids of both vocabularies. They are only looked up here, not
        # used further; the original notes say both are 0, which is the index
        # that mask_zero=True treats as padding.
        src_pad_idx = vocab.src['<pad>']
        tgt_pad_idx = vocab.tgt['<pad>']

        # The vocabularies already contain '<pad>', so the table sizes are the
        # plain vocabulary lengths (no +1).
        n_src_words = len(vocab.src)
        self.source = K.layers.Embedding(n_src_words, embed_size, mask_zero=True)

        # Masking is not strictly needed on the target side because the decoder
        # steps through time with an explicit loop.
        n_tgt_words = len(vocab.tgt)
        self.target = K.layers.Embedding(n_tgt_words, embed_size, mask_zero=True)

    def build(self, input_shape):
        """
        Build both embedding tables for `input_shape` and flag this layer as built.
        """
        for table in (self.source, self.target):
            table.build(input_shape)
        self.built = True

    # Disabled experiment kept for reference:
    # def __call__(self, X):
    #     in_shape = X.shape
    #     self.build(in_shape)
    #     return self.call(X)

In [0]:
#nmt_model.py

"""
CS224N 2018-19: Homework 4
nmt_model.py: NMT Model
Pencheng Yin <pcyin@cs.cmu.edu>
Sahil Chopra <schopra8@stanford.edu>
Amit Patel: Implemented in TF 2.0
"""
from collections import namedtuple
import sys
from typing import List, Tuple, Dict, Set, Union

Hypothesis = namedtuple('Hypothesis', ['value', 'score'])

#NMT Model
class NMT(tf.keras.Model):
    """ Simple Neural Machine Translation Model:
        - Bidrectional LSTM Encoder
        - Unidirection LSTM Decoder
        - Global Attention Model (Luong, et al. 2015)
    """
    def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2, device='gpu', training=True):
        """ Set up every layer of the NMT model.

        @param embed_size (int): Embedding size (dimensionality)
        @param hidden_size (int): Hidden size (dimensionality) of the encoder/decoder LSTMs
        @param vocab (Vocab): Vocabulary object containing src and tgt languages
        @param dropout_rate (float): Dropout probability, applied to the combined attention output
        @param device (string): 'gpu' or 'cpu' (any other value raises KeyError)
        @param training (bool): forwarded as the `training` flag of the dropout layer
        """
        super(NMT, self).__init__()

        def bias_free_dense(units, in_features):
            # Linear projection without bias. The input shape is optional
            # (Keras infers it when the layer is built) but is kept for clarity.
            return K.layers.Dense(units, input_shape=(in_features,), use_bias=False)

        # Plain configuration values
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab
        self.training = training

        # Translate the short device name into a TF device string
        self.tf_device = {'cpu': "CPU:0", 'gpu': "GPU:0"}[device]

        # Source/target embedding tables
        self.model_embeddings = ModelEmbeddings(embed_size, vocab)

        # Encoder: bidirectional LSTM returning the full sequence plus final states
        forward_lstm = K.layers.LSTM(hidden_size, return_sequences=True, return_state=True)
        backward_lstm = K.layers.LSTM(hidden_size, return_sequences=True, return_state=True, go_backwards=True)
        self.encoder = K.layers.Bidirectional(layer=forward_lstm, backward_layer=backward_lstm)

        # Decoder: single LSTM cell, unrolled manually in decode() (fails with implementation=2)
        self.decoder = K.layers.LSTMCell(hidden_size, implementation=1)

        # W_{h}: final encoder hidden state (2h) -> initial decoder hidden state (h)
        self.h_projection = bias_free_dense(hidden_size, 2*hidden_size)
        # W_{c}: final encoder cell state (2h) -> initial decoder cell state (h)
        self.c_projection = bias_free_dense(hidden_size, 2*hidden_size)
        # W_{attProj}: encoder hidden states (2h) -> attention space (h)
        self.att_projection = bias_free_dense(hidden_size, 2*hidden_size)
        # W_{u}: [attention output; decoder hidden] (3h) -> combined output (h)
        self.combined_output_projection = bias_free_dense(hidden_size, 3*hidden_size)
        # W_{vocab}: combined output (h) -> target-vocabulary logits
        self.target_vocab_projection = bias_free_dense(len(vocab.tgt), hidden_size)

        # Dropout applied to the combined output
        self.dropout = K.layers.Dropout(dropout_rate)


    def call(self, source, target):
        """ Score a mini-batch: log-likelihood of each gold target sentence given its source.

        @param source (List[List[str]]): list of source sentence tokens
        @param target (List[List[str]]): list of target sentence tokens, wrapped by `<s>` and `</s>`

        @returns scores (Tensor): tensor of shape (b, ) holding, for each example in the batch,
                                    the summed log-probability of its gold target words
                                    (padding positions contribute zero). Here b = batch size.
        """
        source_lengths = [len(sent) for sent in source]

        # Token lists -> padded id tensors; (b, src_len) and (b, tgt_len) per the original annotations.
        # NOTE(review): `self.device` is not assigned in the visible __init__ (which sets
        # `self.tf_device`) — confirm it is defined elsewhere on the class.
        source_padded = self.vocab.src.to_input_tensor(source, device=self.device)
        target_padded = self.vocab.tgt.to_input_tensor(target, device=self.device)

        # Forward pass: encode -> attention mask -> decode -> vocabulary log-probabilities
        enc_hiddens, dec_init_state = self.encode(source_padded, source_lengths)  # (b, src_len, 2h), ((b, h), (b, h))
        enc_masks = self.generate_sent_masks(enc_hiddens, source_lengths)  # (b, src_len)
        combined_outputs, _ = self.decode(enc_hiddens, enc_masks, dec_init_state, target_padded)  # (b, tgt_len-1, h)
        vocab_logits = self.target_vocab_projection(combined_outputs)
        log_probs = tf.nn.log_softmax(vocab_logits, axis=-1)  # (b, tgt_len-1, Vt)

        # Step t predicts word t+1, so the gold words and their mask both drop the first column.
        gold_ids = target_padded[:, 1:]  # (b, tgt_len-1)
        pad_id = self.vocab.tgt['<pad>']
        not_pad = tf.cast(target_padded != pad_id, tf.float32)[:, 1:]  # (b, tgt_len-1)

        # Build (sentence, step, word) index triples to pick each gold word's log-probability.
        n_sents, n_steps, _ = log_probs.shape
        sent_idx, step_idx = tf.meshgrid(tf.range(n_sents, dtype=tf.int64),
                                         tf.range(n_steps, dtype=tf.int64),
                                         indexing='ij')
        gather_idx = tf.stack([sent_idx, step_idx, gold_ids], axis=-1)  # (b, tgt_len-1, 3)
        gold_log_probs = tf.gather_nd(log_probs, gather_idx) * not_pad  # (b, tgt_len-1)

        # Sum over time: one log-likelihood per sentence.
        return tf.reduce_sum(gold_log_probs, axis=1)



    def encode(self, source_padded, source_lengths):
        """ Run the bidirectional encoder and derive the decoder's initial state.

        @param source_padded (Tensor): padded source sentences, shape (b, src_len), where
                                        b = batch size, src_len = maximum source sentence length.
        @param source_lengths (List[int]): actual sentence lengths. Unused here: padding is handled
                                        through the embedding layer's Keras mask rather than
                                        pack/pad sequence utilities.
        @returns enc_hiddens (Tensor): encoder hidden states, shape (b, src_len, h*2), with the
                                        rows at padded positions zeroed out.
        @returns dec_init_state (tuple(Tensor, Tensor)): decoder's initial hidden state and cell,
                                        each of shape (b, h).
        """
        # Embed the source ids; the embedding layer attaches the padding mask to its output.
        src_embedded = self.model_embeddings.source(source_padded)  # (b, src_len, e)

        # Full sequence output plus the final (hidden, cell) of each direction.
        (enc_hiddens,
         fwd_hidden, fwd_cell,
         bck_hidden, bck_cell) = self.encoder(src_embedded, initial_state=None)

        # Zero the outputs at padded time steps: (b, src_len, 1) broadcasts over the feature axis.
        keep = tf.cast(src_embedded._keras_mask, tf.float32)  # (b, src_len)
        enc_hiddens = tf.expand_dims(keep, axis=-1) * enc_hiddens  # (b, src_len, 2h)

        # Project the concatenated final states (b, 2h) down to the decoder size (b, h).
        init_hidden = self.h_projection(tf.concat([fwd_hidden, bck_hidden], axis=1))
        init_cell = self.c_projection(tf.concat([fwd_cell, bck_cell], axis=1))

        return enc_hiddens, (init_hidden, init_cell)


    def decode(self, enc_hiddens, enc_masks, dec_init_state, target_padded):
        """Compute combined output vectors for a batch by unrolling the decoder over time.

        @param enc_hiddens (Tensor): Hidden states (b, src_len, h*2), where
                                     b = batch size, src_len = maximum source sentence length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks (b, src_len), where
                                     b = batch size, src_len = maximum source sentence length.
        @param dec_init_state (tuple(Tensor, Tensor)): Initial state and cell for decoder
        @param target_padded (Tensor): Gold-standard padded target sentences (b, tgt_len), where
                                       tgt_len = maximum target sentence length, b = batch size.

        @returns combined_outputs (Tensor): combined output tensor (b, tgt_len-1, h); the last
                                        target column is dropped before decoding.
        @returns dec_state (list[Tensor, Tensor]): decoder (hidden, cell) after the last step,
                                        each of shape (b, h).
        """
        # Drop the last column: the final token of the longest sentence is never fed as input.
        # Shorter sentences still end in pad tokens, which are masked step by step below.
        target_padded = target_padded[:, :-1]  # (b, tgt_len-1)

        # Previous combined output o_{t-1}, initialised to zeros on the configured device.
        batch_size = enc_hiddens.shape[0]
        with tf.device(self.tf_device):
            o_prev = tf.zeros((batch_size, self.hidden_size), dtype=tf.float32)

        # W_{attProj} is applied over the last axis: (b, src_len, 2h) -> (b, src_len, h)
        enc_hiddens_proj = self.att_projection(enc_hiddens)

        # Embed the target ids; the embedding layer attaches the padding mask to its output.
        Y = self.model_embeddings.target(target_padded)  # (b, tgt_len-1, e)
        dec_masks = tf.cast(Y._keras_mask, tf.float32)  # (b, tgt_len-1)

        dec_state = dec_init_state
        combined_outputs = []
        for t in range(Y.shape[1]):
            # keep = 1 for real tokens, 0 for padding; carry is its complement.
            keep = tf.stop_gradient(dec_masks[:, t:t+1])  # (b, 1)
            carry = 1 - keep

            # The state entering this step is what a padded row must carry forward.
            # At t == 0 this is dec_init_state (not zeros), so a masked first step
            # cannot wipe the encoder-derived initial state.
            prev_hidden, prev_cell = dec_state

            Ybar_t = tf.concat([Y[:, t, :], o_prev], axis=1)  # (b, e + h)
            (new_hidden, new_cell), o_t, _ = self.step(Ybar_t, dec_state, enc_hiddens,
                                                       enc_hiddens_proj, enc_masks)

            # Padded rows repeat the previous output without passing gradient through it.
            o_t = keep * o_t + carry * tf.stop_gradient(o_prev)
            # Padded rows keep their previous state. A fresh list is built instead of
            # mutating the object returned by step().
            dec_state = [keep * new_hidden + carry * prev_hidden,
                         keep * new_cell + carry * prev_cell]

            o_prev = o_t  # (b, h)
            combined_outputs.append(o_t)

        # List of (b, h) tensors -> (b, tgt_len-1, h)
        combined_outputs = tf.stack(combined_outputs, axis=1)

        return combined_outputs, dec_state


    def step(self, Ybar_t, dec_state, enc_hiddens, enc_hiddens_proj, enc_masks=None):
        """ Run one decoder time step: LSTM cell update followed by attention.

        @param Ybar_t (Tensor): Concatenation [Y_t o_prev], shape (b, e + h); the decoder input,
                                where b = batch size, e = embedding size, h = hidden size.
        @param dec_state (tuple(Tensor, Tensor)): previous decoder (hidden, cell), both (b, h).
        @param enc_hiddens (Tensor): encoder hidden states, shape (b, src_len, h * 2).
        @param enc_hiddens_proj (Tensor): encoder hidden states projected from (h * 2) to h,
                                    shape (b, src_len, h).
        @param enc_masks (Tensor): boolean sentence masks, shape (b, src_len); True entries are
                                    excluded from attention. None disables masking.

        @returns dec_state: new decoder (hidden, cell) as returned by the LSTM cell, both (b, h).
        @returns combined_output (Tensor): combined output at this time step, shape (b, h).
        @returns e_t (Tensor): attention scores, shape (b, src_len), with masked positions at -inf.
                                Returned only so the implementation can be sanity checked.
        """
        # Advance the LSTM cell; only the new state is needed (its output equals the hidden state).
        _, dec_state = self.decoder(Ybar_t, dec_state)
        dec_hidden, _ = dec_state  # (b, h)

        # Attention scores: (b, 1, h) @ (b, h, src_len) -> (b, 1, src_len) -> (b, src_len).
        # The projection is transposed, not reshaped, so the per-position vectors stay intact.
        query = tf.expand_dims(dec_hidden, axis=1)
        keys_t = tf.transpose(enc_hiddens_proj, perm=[0, 2, 1])
        e_t = tf.squeeze(tf.linalg.matmul(query, keys_t), axis=1)

        # Masked (padding) positions get -inf so softmax assigns them zero weight.
        if enc_masks is not None:
            e_t = tf.where(enc_masks, -float('inf'), e_t)

        # Attention distribution over source positions.
        alpha_t = K.activations.softmax(e_t, axis=-1)  # (b, src_len)

        # Weighted sum of encoder states: (b, 2h, src_len) @ (b, src_len, 1) -> (b, 2h)
        values_t = tf.transpose(enc_hiddens, perm=[0, 2, 1])
        weights = tf.expand_dims(alpha_t, axis=2)
        a_t = tf.squeeze(tf.linalg.matmul(values_t, weights), axis=2)

        # Combine attention output with the decoder hidden state, project, squash, drop out.
        u_t = tf.concat([a_t, dec_hidden], axis=1)  # (b, 3h)
        v_t = self.combined_output_projection(u_t)  # (b, h)
        combined_output = self.dropout(tf.nn.tanh(v_t), training=self.training)  # (b, h)

        return dec_state, combined_output, e_t


    def generate_sent_masks(self, enc_hiddens, source_lengths):
        """ Generate sentence masks for encoder hidden states.

        @param enc_hiddens (Tensor): encodings of shape (b, src_len, 2*h), where b = batch size,
                                     src_len = max source length, h = hidden size.
        @param source_lengths (List[int]): List of actual lengths for each of the sentences in the batch.

        @returns enc_masks (Tensor): boolean tensor of shape (b, src_len) that is True at padding
                                    positions (index >= the sentence's length) and False elsewhere.
        """
        max_src_len = enc_hiddens.shape[1]

        with tf.device(self.tf_device):
            # sequence_mask is True for positions inside each sentence; padding is its complement.
            # One vectorised op replaces a per-sentence Variable slice-assign loop and a
            # round trip through numpy.
            inside_sentence = tf.sequence_mask(source_lengths, maxlen=max_src_len)  # (b, src_len)
            enc_masks = tf.logical_not(inside_sentence)

        return enc_masks  # (b, src_len)


    def beam_search(self, src_sent, beam_size, max_decoding_time_step=50):
        #code provided with the assignment (converted to TF from Pytorch)
        """ Given a single source sentence, perform beam search, yielding translations in the target language.
        @param src_sent (List[str]): a single source sentence (words)
        @param beam_size (int): beam size
        @param max_decoding_time_step (int): maximum number of time steps to unroll the decoding RNN
        @returns hypotheses (List[Hypothesis]): a list of hypothesis, each hypothesis has two fields:
                value: List[str]: the decoded target sentence, represented as a list of words
                score: float: the log-likelihood of the target sentence
        """
        src_sents_var = self.vocab.src.to_input_tensor([src_sent], self.device) #(1,sen_len)

        src_encodings, dec_init_vec = self.encode(src_sents_var, [len(src_sent)]) #src_encoddings = (1,sen_len,h)
        src_encodings_att_linear = self.att_projection(src_encodings) #(1,sen_len,h)

        h_tm1 = dec_init_vec #tuple of 1xh
        with tf.device(self.device):
            att_tm1 = tf.zeros([1, self.hidden_size])

        eos_id = self.vocab.tgt['</s>']

        hypotheses = [['<s>']]
        with tf.device(self.device):
            hyp_scores = tf.zeros(len(hypotheses), dtype=tf.float32)
        completed_hypotheses = []

        t = 0
        while len(completed_hypotheses) < beam_size and t < max_decoding_time_step:
            t += 1
            hyp_num = len(hypotheses)

            exp_src_encodings = tf.broadcast_to(src_encodings, shape=[hyp_num] + src_encodings.shape[1:]) #(hyp_num,sen_len,2h)
            exp_src_encodings_att_linear = tf.broadcast_to(src_encodings_att_linear, [hyp_num] + src_encodings_att_linear.shape[1:]) #(hyp_num, sen_len, h)

            y_tm1 = tf.constant([self.vocab.tgt[hyp[-1]] for hyp in hypotheses], dtype=tf.float32)
            y_t_embed = self.model_embeddings.target(y_tm1)

            x = tf.concat([y_t_embed, att_tm1], axis=-1)

            (h_t, cell_t), att_t, _  = self.step(x, h_tm1,
                                                      exp_src_encodings, exp_src_encodings_att_linear, enc_masks=None)

            # log probabilities over target words
            log_p_t = tf.nn.log_softmax(self.target_vocab_projection(att_t), axis=-1)

            live_hyp_num = beam_size - len(completed_hypotheses)
            #contiuating_hyp_scores = (hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(-1)
            contiuating_hyp_scores = tf.reshape(tf.broadcast_to(tf.expand_dims(hyp_scores, 1), log_p_t.shape) + 
                                                 log_p_t, [-1])

            #top_cand_hyp_scores, top_cand_hyp_pos = torch.topk(contiuating_hyp_scores, k=live_hyp_num)
            top_cand_hyp_pos = tf.argsort(contiuating_hyp_scores, axis=-1, direction='DESCENDING')[0:live_hyp_num].numpy()
            top_cand_hyp_scores = contiuating_hyp_scores.numpy()[top_cand_hyp_pos]

            prev_hyp_ids = top_cand_hyp_pos // len(self.vocab.tgt)
            hyp_word_ids = top_cand_hyp_pos % len(self.vocab.tgt)

            new_hypotheses = []
            live_hyp_ids = []
            new_hyp_scores = []

            for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip(prev_hyp_ids, hyp_word_ids, top_cand_hyp_scores):
                prev_hyp_id = prev_hyp_id.item()
                hyp_word_id = hyp_word_id.item()
                cand_new_hyp_score = cand_new_hyp_score.item()

                hyp_word = self.vocab.tgt.id2word[hyp_word_id]
                new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_word]
                if hyp_word == '</s>':
                    completed_hypotheses.append(Hypothesis(value=new_hyp_sent[1:-1],
                                                           score=cand_new_hyp_score))
                else:
                    new_hypotheses.append(new_hyp_sent)
                    live_hyp_ids.append(prev_hyp_id)
                    new_hyp_scores.append(cand_new_hyp_score)

            if len(completed_hypotheses) == beam_size:
                break

            h_tm1 = (tf.constant(h_t.numpy()[live_hyp_ids]), tf.constant(cell_t.numpy()[live_hyp_ids]))
            att_tm1 = tf.constant(att_t.numpy()[live_hyp_ids])

            hypotheses = new_hypotheses
            with tf.device(self.device):
                hyp_scores = tf.constant(new_hyp_scores, dtype=tf.float32).numpy()

        if len(completed_hypotheses) == 0:
            completed_hypotheses.append(Hypothesis(value=hypotheses[0][1:],
                                                   score=hyp_scores[0].item()))

        completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True)

        return completed_hypotheses


    def uniformly_initialize_layers(self, uniform_init):
        """ Build every sub-layer and overwrite its variables with uniform noise.
        Added by Amit for the TF port.

        Each entry of `self.layers` is built with a hard-coded dummy input shape
        (looked up by the layer's position), after which every variable the layer
        owns is assigned a fresh sample from U(-uniform_init, uniform_init).
        @param uniform_init (float): half-width of the uniform sampling interval
        """
        embed_dim = self.model_embeddings.embed_size
        hid_dim = self.hidden_size

        # Dummy input shapes, indexed by the layer's position in self.layers.
        build_shapes = [
            (1, 1),
            (1, 1, embed_dim),
            (1, 1, (embed_dim + hid_dim)),
            (1, 2 * hid_dim),
            (1, 2 * hid_dim),
            (1, 2 * hid_dim),
            (1, 3 * hid_dim),
            (1, hid_dim),
            None,
        ]

        for position, layer in enumerate(self.layers):
            # Building creates the layer's variables so they can be overwritten below.
            layer.build(build_shapes[position])
            for variable in layer.variables:
                fresh_values = tf.random.uniform(variable.shape, -uniform_init, uniform_init)
                variable.assign(fresh_values)


    @property
    def device(self):
        """ Device on which this model places its tensors.
        @returns the value stored in `self.tf_device`
        """
        placement = self.tf_device
        return placement


    @staticmethod
    def load(path, vocab=VOCABULARY):
        """ Rebuild a model from the files written by `save`.
        @param path (str): path prefix of the saved model; the hyper-parameters are read
                           from `path + '_model_hyper_params.json'`, the weights from `path`
        @param vocab: vocabulary handed to the NMT constructor (defaults to VOCABULARY)
        @returns model (NMT): model with the stored hyper-parameters and weights
        """
        hyper_params_file = path + '_model_hyper_params.json'

        # Step 1: recreate the architecture from the stored constructor arguments.
        print('restore model hyper parameters', file=sys.stderr)
        with open(hyper_params_file, 'r') as handle:
            stored = json.load(handle)
        restored = NMT(vocab=vocab, **stored['args'])
        # Running the initializer builds the layers; the values are replaced in step 2.
        restored.uniformly_initialize_layers(0.1)

        # Step 2: overwrite the freshly initialized variables with the saved weights.
        print('restore model weights', file=sys.stderr)
        restored.load_weights(path)

        return restored

    def save(self, path):
        """ Persist the model weights together with the hyper-parameters needed to rebuild it.
        @param path (str): path prefix; the weights are written through `save_weights(path)`
                           and the hyper-parameters to `path + '_model_hyper_params.json'`
        """
        # 1. Weights
        print('save model parameters to [%s]' % path, file=sys.stderr)
        self.save_weights(path)

        # 2. Constructor arguments, stored as JSON under the 'args' key
        constructor_args = {
            'embed_size': self.model_embeddings.embed_size,
            'hidden_size': self.hidden_size,
            'dropout_rate': self.dropout_rate,
        }
        with open(path + '_model_hyper_params.json', 'w') as out_file:
            json.dump({'args': constructor_args}, out_file)

    #Amit ToDo
    def no_beam_search(self, src_sent, max_decoding_time_step=50):
        #Done by Amit
        """ Given a single source sentence, decode greedily (no beam): at every time step the
        most probable target word is emitted and fed back into the decoder, until '</s>' is
        predicted or the step budget is exhausted.
        @param src_sent (List[str]): a single source sentence (words)
        @param max_decoding_time_step (int): maximum number of time steps to unroll the decoding RNN
        @returns hypotheses (List[Hypothesis]): a one-element list; the hypothesis has two fields:
                value: List[str]: the decoded target sentence, represented as a list of words
                                  ('</s>' is never included)
                score: float: the log-likelihood of the target sentence, i.e. the sum of the
                              log-probabilities of every chosen word (including '</s>' when reached)
        """
        # NOTE: this flag is left switched off after the call returns.
        self.training = False
        src_sents_var = self.vocab.src.to_input_tensor([src_sent], self.device) #(1,src_len)

        src_encodings, dec_init_vec = self.encode(src_sents_var, [len(src_sent)]) #(1,src_len,2h), ((1,h),(1,h))
        src_encodings_prj = self.att_projection(src_encodings) #(1,src_len,h)

        h_tm1 = dec_init_vec #tuple of 1xh
        with tf.device(self.device):
            o_tm1 = tf.zeros([1, self.hidden_size])
        bos = tf.constant([self.vocab.tgt['<s>']])
        # First decoder input: embedding of '<s>' concatenated with a zero combined output.
        y_t = tf.concat([self.model_embeddings.target(bos), o_tm1], axis=-1) #(1,embed_dim+h)
        eos_id = self.vocab.tgt['</s>']

        res_tgt = []
        score = 0.0
        for _step in range(max_decoding_time_step):
            # self.step() runs the decoder cell and the attention, returning the new
            # decoder state and the combined output o_t.
            h_t, o_t, _ = self.step(y_t, h_tm1, src_encodings, src_encodings_prj)

            log_p_t = tf.nn.log_softmax(self.target_vocab_projection(o_t), axis=-1) #(1,vocab)
            # Only the best word is needed, so argmax replaces a full sort of the vocabulary.
            idx_t = int(tf.argmax(log_p_t[0], axis=-1).numpy())
            score += float(log_p_t[0, idx_t].numpy())
            if idx_t == eos_id:
                break
            res_tgt.append(self.vocab.tgt.id2word[idx_t])

            # Feed the chosen word and the current combined output into the next step.
            h_tm1 = h_t
            y_t = tf.concat([self.model_embeddings.target(tf.constant([idx_t])), o_t], axis=-1) #(1,embed_dim+h)

        return [Hypothesis(value=res_tgt, score=score)]

    #Amit ToDo
    def amit_beam_search(self, src_sent, beam_size=3, max_decoding_time_step=50):
        #Done by Amit (Note: this is much slower than the version provided with the assignment i.e. beam_search() but they both yield the same results)
        """ Given a single source sentence, perform beam search, yielding translations in the target language.

        Every live hypothesis is expanded one decoder step at a time with its `beam_size` most
        probable next words; hypotheses that already end in '</s>' are carried over unchanged.
        The `beam_size` best candidates survive each step, and the search stops as soon as the
        best one ends in '</s>' or the step budget is exhausted.
        @param src_sent (List[str]): a single source sentence (words)
        @param beam_size (int): beam size for beam search
        @param max_decoding_time_step (int): maximum number of time steps to unroll the decoding RNN
        @returns hypotheses (List[Hypothesis]): a one-element list holding the best hypothesis, with two fields:
                value: List[str]: the decoded target sentence, represented as a list of words
                                  ('</s>' is never included)
                score: float: the log-likelihood of the target sentence
        """
        # NOTE: this flag is left switched off after the call returns.
        self.training = False
        src_sents_var = self.vocab.src.to_input_tensor([src_sent], self.device) #(1,src_len)

        src_encodings, dec_init_vec = self.encode(src_sents_var, [len(src_sent)]) #(1,src_len,2h), ((1,h),(1,h))
        src_encodings_prj = self.att_projection(src_encodings) #(1,src_len,h)

        h_tm1 = dec_init_vec #tuple of 1xh
        with tf.device(self.device):
            o_tm1 = tf.zeros([1, self.hidden_size])
        bos = tf.constant([self.vocab.tgt['<s>']])
        y_t = tf.concat([self.model_embeddings.target(bos), o_tm1], axis=-1) #(1,embed_dim+h)
        eos_tag = '</s>'

        # A hypothesis carries its accumulated log-probability, the decoder state and
        # input for its next step, and the words produced so far.
        hypotheses = [{'prob': 0, 'h_tm1': h_tm1, 'y_t': y_t, 'tgt_output': []}]
        for _step in range(max_decoding_time_step):
            candidates = [] #at most beam_size^2 entries
            for hyp in hypotheses:
                if hyp['tgt_output'] and hyp['tgt_output'][-1] == eos_tag:
                    # Finished hypotheses keep competing but are not expanded further.
                    candidates.append(hyp)
                    continue
                h_t, o_t, _ = self.step(hyp['y_t'], hyp['h_tm1'], src_encodings, src_encodings_prj)

                p_t = tf.nn.log_softmax(self.target_vocab_projection(o_t), axis=-1) #(1,vocab)
                p_t_idx_sorted = tf.argsort(p_t, -1, 'DESCENDING')

                for k in range(beam_size):
                    idx_t = p_t_idx_sorted[0][k].numpy()
                    next_input = tf.concat([self.model_embeddings.target(tf.constant([idx_t])), o_t],
                                           axis=-1) #(1,embed_dim+h)
                    candidates.append({
                        'prob': hyp['prob'] + p_t[0][idx_t].numpy(),
                        'h_tm1': h_t,
                        'y_t': next_input,
                        'tgt_output': hyp['tgt_output'] + [self.vocab.tgt.id2word[idx_t]],
                    })

            candidates.sort(key=lambda cand: cand['prob'], reverse=True)
            hypotheses = candidates[0:beam_size]
            best = hypotheses[0]
            if best['tgt_output'][-1] == eos_tag:
                res_tgt = Hypothesis(value=best['tgt_output'][0:-1], score=float(best['prob'])) #ignore eos_tag
                break
        else:
            # Runs only when the loop was NOT left through `break`, so a result that finished
            # exactly on the last step is no longer overwritten with one that still contains
            # eos_tag; it also covers a step budget of zero.
            best = hypotheses[0]
            res_tgt = Hypothesis(value=best['tgt_output'], score=float(best['prob'])) #no eos_tag

        return [res_tgt]


In [0]:
def dummy_decode(test_source_file, test_target_file, model_save_path='./saved_model/model'):
    """ Smoke-test the three decoders on the first sentence of a test corpus.

    Restores the model stored at `model_save_path`, reads both corpora and, for the
    first source sentence only, prints the output of `no_beam_search`,
    `amit_beam_search` (beam 10) and `beam_search` (beam 10), followed by the
    reference target sentence.
    @param test_source_file (str): path to the source-language corpus
    @param test_target_file (str): path to the target-language corpus
    @param model_save_path (str): path prefix handed to `NMT.load`
    """
    print('load the previously best saved model', file=sys.stderr)
    model = NMT.load(model_save_path)

    print("load test source sentences from [{}]".format(test_source_file))
    test_data_src = read_corpus(test_source_file, source='src')
    print("load test target sentences from [{}]".format(test_target_file))
    test_data_tgt = read_corpus(test_target_file, source='tgt')
    print(len(test_data_src), len(test_data_tgt))

    # Decoders to compare, in the order their results are printed.
    decoders = (
        model.no_beam_search,
        lambda words: model.amit_beam_search(words, beam_size=10),
        lambda words: model.beam_search(words, beam_size=10),
    )
    for idx, sent in enumerate(test_data_src[:1]):
        with tf.device(model.device):
            for decode in decoders:
                print(decode(sent))
            print(test_data_tgt[idx])

#dummy_decode('./a4/en_es_data/dev.es', './a4/en_es_data/dev.en')
#print()

## 4. SANITY CHECK

In [0]:
#Pytorch Model for Debug Only and initialize the weights for the LSTMCell and Bidirectional LSTM in Tensorflow (as it is randomly initialized)
#Amit added this

#----------
# CONSTANTS
#----------
# Sizes of the tiny model and batch used by the sanity checks in this cell.
# Number of sentence pairs per batch.
BATCH_SIZE = 5
# Embedding width handed to the model constructor.
EMBED_SIZE = 3
# LSTM hidden-state width handed to the model constructor.
HIDDEN_SIZE = 3
# Dropout probability referenced by reinitialize_layers.
DROPOUT_RATE = 0.0
# Number of stacked LSTM layers in the reference encoder.
NUM_LAYERS = 1

import torch
class Pytorch_NMT(torch.nn.Module):
    """ Minimal PyTorch reference of the NMT encoder and decoder cell, for debugging only.

    Its LSTM / LSTMCell parameters are read elsewhere in this notebook to initialize the
    corresponding TensorFlow layers, so that both frameworks run on identical weights.
    """

    def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2, device='cpu:0'):
        """
        @param embed_size (int): embedding dimensionality
        @param hidden_size (int): LSTM hidden size
        @param vocab: object exposing `src` and `tgt` vocabularies that support len()
        @param dropout_rate (float): accepted for signature parity; not used by this class
        @param device (str): stored on the instance as `self.device`
        """
        super().__init__()
        # NOTE: the creation order below determines which random values each module
        # receives for a given seed, so it must not be changed.
        self.embed_src = torch.nn.Embedding(len(vocab.src), embed_size, padding_idx=0)
        self.embed_tgt = torch.nn.Embedding(len(vocab.tgt), embed_size, padding_idx=0)
        self.encoder = torch.nn.LSTM(embed_size, hidden_size, num_layers=1, batch_first=True, bidirectional=True)
        self.decoder = torch.nn.LSTMCell(embed_size+hidden_size, hidden_size)
        self.h_projection = torch.nn.Linear(2*hidden_size, hidden_size, bias=False)
        self.vocab = vocab
        self.device = device

    def encode(self, source_padded, source_lengths):
        """ Run the bidirectional encoder over a padded, batch-first batch of word ids.
        @param source_padded (Tensor): word ids, shape (b, src_len)
        @param source_lengths (List[int]): true length of every sentence in the batch
        @returns out (Tensor): encoder hidden states, shape (b, max_len, 2*hidden_size);
                               positions past a sentence's length are zero
        @returns dec_init_state (tuple(Tensor, Tensor)): projected final hidden and cell
                               state, each of shape (b, hidden_size)
        """
        embed = self.embed_src(source_padded)
        pack_source_padded = torch.nn.utils.rnn.pack_padded_sequence(
            embed, source_lengths, batch_first=True, enforce_sorted=False)

        # Size the zero initial state from the actual batch and the encoder's own
        # configuration rather than from module-level constants, so any batch size works.
        num_directions = 2 if self.encoder.bidirectional else 1
        state_shape = (self.encoder.num_layers * num_directions,
                       source_padded.shape[0],
                       self.encoder.hidden_size)
        s_h = embed.new_zeros(state_shape)
        s_c = embed.new_zeros(state_shape)

        out, (last_hidden, last_cell) = self.encoder(pack_source_padded, (s_h, s_c))
        out = torch.nn.utils.rnn.pad_packed_sequence(out, batch_first=True)[0]

        # Index 0 is the forward direction, index 1 the backward one.
        h0_dec = self.h_projection(torch.cat([last_hidden[0], last_hidden[1]], dim=1)) #b x hidden_size
        # NOTE: the cell state goes through h_projection as well; this model has no
        # separate cell projection.
        c0_dec = self.h_projection(torch.cat([last_cell[0], last_cell[1]], dim=1)) #b x hidden_size

        return out, (h0_dec, c0_dec)


def reinitialize_layers(model, dropout_rate=None):
    """ Reinitialize the Layer Weights for Sanity Checks.

    Linear layers get weight 0.3 and (when present) bias 0.1, Embedding layers get
    weight 0.15, and Dropout layers have their drop probability overwritten.
    @param model (torch.nn.Module): model whose sub-modules are reinitialized in place
    @param dropout_rate (float or None): probability assigned to every Dropout layer;
                                         None means the module-level DROPOUT_RATE
    """
    def init_weights(m):
        if isinstance(m, torch.nn.Linear):
            m.weight.data.fill_(0.3)
            if m.bias is not None:
                m.bias.data.fill_(0.1)
        elif isinstance(m, torch.nn.Embedding):
            m.weight.data.fill_(0.15)
        elif isinstance(m, torch.nn.Dropout):
            # Constructing a new Dropout object here would leave the model untouched,
            # so the existing layer's probability is updated in place instead.
            m.p = DROPOUT_RATE if dropout_rate is None else dropout_rate
    with torch.no_grad():
        model.apply(init_weights)


def question_1d_sanity_check(model, src_sents, tgt_sents, vocab):
    """ Sanity check for question 1d (encode), run against the PyTorch reference model.
        Reinitializes the model, encodes the batch and compares the encoder hidden
        states and both halves of the decoder init state with the tensors stored on disk.
        An AssertionError is raised on the first mismatch.
    """
    print("Running Sanity Check for Question 1d: Encode")
    print ("-"*80)

    # Configure for testing
    reinitialize_layers(model)
    source_lengths = [len(sentence) for sentence in src_sents]
    # to_input_tensor returns an object with .numpy(); convert it for the PyTorch model.
    source_padded = torch.tensor(
        model.vocab.src.to_input_tensor(src_sents, device=model.device).numpy())

    # Expected outputs
    enc_hiddens_target = torch.load('./a4/sanity_check_en_es_data/enc_hiddens.pkl')
    dec_init_state_target = torch.load('./a4/sanity_check_en_es_data/dec_init_state.pkl')

    # Actual outputs
    with torch.no_grad():
        enc_hiddens_pred, dec_init_state_pred = model.encode(source_padded, source_lengths)

    # (failure message template, success message, expected tensor, actual tensor)
    comparisons = (
        ("enc_hiddens is incorrect: it should be:\n {} but is:\n{}",
         "enc_hiddens Sanity Checks Passed!",
         enc_hiddens_target, enc_hiddens_pred),
        ("dec_init_state[0] is incorrect: it should be:\n {} but is:\n{}",
         "dec_init_state[0] Sanity Checks Passed!",
         dec_init_state_target[0], dec_init_state_pred[0]),
        ("dec_init_state[1] is incorrect: it should be:\n {} but is:\n{}",
         "dec_init_state[1] Sanity Checks Passed!",
         dec_init_state_target[1], dec_init_state_pred[1]),
    )
    for failure_template, success_message, expected, actual in comparisons:
        assert np.allclose(expected.numpy(), actual.numpy()), failure_template.format(expected, actual)
        print(success_message)

    print ("-"*80)
    print("All Sanity Checks Passed for Question 1d: Encode!")
    print ("-"*80)


def generate_outputs(model, source, target, vocab):
    """ Run the PyTorch reference encoder over one batch.
        Nothing is returned or saved; mask generation and decoding are not performed here.
    """
    print ("-"*80)
    print("Generating Comparison Outputs")
    reinitialize_layers(model)

    # Sentence lengths before padding
    source_lengths = list(map(len, source))

    def as_torch(sentences, side_vocab):
        # to_input_tensor returns an object with .numpy(); wrap it for PyTorch.
        return torch.tensor(side_vocab.to_input_tensor(sentences, device=model.device).numpy())

    source_padded = as_torch(source, model.vocab.src)
    target_padded = as_torch(target, model.vocab.tgt)

    # Forward pass through the encoder only
    enc_hiddens, dec_init_state = model.encode(source_padded, source_lengths)


def main_sanity_check():
    """ Entry point for the PyTorch-side sanity check.
        Seeds the random number generators, takes the first shuffled batch of the
        sanity-check corpus, builds the PyTorch reference model and runs the 1d check on it.
        @returns model_pytorch (Pytorch_NMT): the reference model that was checked
    """
    # Seed the random number generators
    seed = 1234
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed * 13 // 7)

    # Training data & vocabulary
    corpus_src = read_corpus('./a4/sanity_check_en_es_data/train_sanity_check.es', 'src')
    corpus_tgt = read_corpus('./a4/sanity_check_en_es_data/train_sanity_check.en', 'tgt')
    sentence_pairs = list(zip(corpus_src, corpus_tgt))

    # Only the first batch is needed; the loop variables keep their values after the break.
    for src_sents, tgt_sents in batch_iter(sentence_pairs, batch_size=BATCH_SIZE, shuffle=True):
        break
    vocab = Vocab.load('./a4/sanity_check_en_es_data/vocab_sanity_check.json')

    # Reference model; only the encoder check (1d) is run for it.
    model_pytorch = Pytorch_NMT(EMBED_SIZE, HIDDEN_SIZE, vocab)
    question_1d_sanity_check(model_pytorch, src_sents, tgt_sents, vocab)

    return model_pytorch

model_pytorch = main_sanity_check() #used to initialize the weights for the LSTMCell and BidirectionalLSTM in Tensorflow (as it is randomly initialized)

Running Sanity Check for Question 1d: Encode
--------------------------------------------------------------------------------
enc_hiddens Sanity Checks Passed!
dec_init_state[0] Sanity Checks Passed!
dec_init_state[1] Sanity Checks Passed!
--------------------------------------------------------------------------------
All Sanity Checks Passed for Question 1d: Encode!
--------------------------------------------------------------------------------


In [0]:
#sanity_check.py

"""
CS224N 2018-19: Homework 4
sanity_check.py: sanity checks for assignment 4
Sahil Chopra <schopra8@stanford.edu>
Michael Hahn <>
Amit modified it for TF

Usage:
    sanity_check.py 1d
    sanity_check.py 1e
    sanity_check.py 1f

"""
import numpy as np
import tensorflow as tf
import torch #to load the target tensors


#----------
# CONSTANTS
#----------
# Sizes of the tiny model and batch used by the sanity checks in this cell.
# Number of sentence pairs per batch.
BATCH_SIZE = 5
# Embedding width handed to the model constructor.
EMBED_SIZE = 3
# LSTM hidden-state width handed to the model constructor.
HIDDEN_SIZE = 3
# Dropout probability handed to the model constructor.
DROPOUT_RATE = 0.0

def save_tf_vars(vars, path, use_list):
    """ Save value(s) to `path` with tf.saved_model.save.
        The values are assumed to be numpy arrays or Python native numeric types,
        possibly inside a list.
        @param vars: a single value, or a sequence of values when `use_list` is True
        @param path (str): destination of the saved model
        @param use_list (bool): True wraps every element in its own tf.Variable and stores
                                them as the `listed` attribute of a tf.train.Checkpoint;
                                False wraps `vars` as a whole in one tf.Variable
    """
    if not use_list:
        tf.saved_model.save(tf.Variable(vars), path)
        return

    container = tf.train.Checkpoint()
    container.listed = [tf.Variable(item) for item in vars]
    tf.saved_model.save(container, path)

def load_tf_vars(path, use_list):
    """ Load what `save_tf_vars` wrote to `path`.
        @param path (str): location of the saved model
        @param use_list (bool): True returns the stored `listed` attribute as a Python list;
                                False returns the loaded object itself
    """
    loaded = tf.saved_model.load(path)
    return list(loaded.listed) if use_list else loaded


def reinitialize_layers(model):
    """ Reinitialize the Layer Weights for Sanity Checks.

    Every layer of `model` is built with a hard-coded dummy input shape (looked up by
    the layer's position) and then set to known values:
      * Dense: weight kernels 0.3, biases 0.1
      * ModelEmbeddings: source and target embedding matrices 0.15
      * Dropout: rate set to DROPOUT_RATE
      * Bidirectional / LSTMCell: weights copied from the module-level PyTorch reference
        model `model_pytorch` (its encoder and decoder respectively)
    Other layer types are left untouched.
    @param model: TF model exposing `layers`
    """
    def init_dense(layer):
        for variable in layer.variables:
            # Rank-1 variables are biases; everything else is a weight kernel.
            fill_value = 0.1 if len(variable.shape) == 1 else 0.3
            variable.assign(tf.fill(variable.shape, fill_value))

    def init_embedding(embedding, input_shape):
        embedding.build(input_shape)
        table = embedding.variables[0] #has only 1 variable i.e. embeddings matrix
        table.assign(tf.fill(table.shape, 0.15))

    def lstm_values(state, suffix):
        # PyTorch stores the input and recurrent weights transposed relative to the TF
        # variables and keeps two bias vectors, which are summed into the single TF bias.
        return [
            np.transpose(state['weight_ih' + suffix].numpy()),
            np.transpose(state['weight_hh' + suffix].numpy()),
            (state['bias_ih' + suffix] + state['bias_hh' + suffix]).numpy(),
        ]

    e, h = EMBED_SIZE, HIDDEN_SIZE
    input_dims = [(1,1), (1,1,e), (1,1,(e+h)), (1,2*h), (1,2*h), (1,2*h), (1,3*h), (1,h), None]
    for l_num, l in enumerate(model.layers):
        if isinstance(l, tf.keras.layers.Dense):
            l.build(input_dims[l_num])
            init_dense(l)
        elif isinstance(l, ModelEmbeddings):
            init_embedding(l.source, input_dims[l_num])
            init_embedding(l.target, input_dims[l_num])
        elif isinstance(l, tf.keras.layers.Dropout):
            # Rebinding the loop variable to a new Dropout would not touch the model,
            # so the rate of the existing layer is updated in place.
            l.rate = DROPOUT_RATE
        elif isinstance(l, tf.keras.layers.Bidirectional):
            l.build(input_dims[l_num])
            py_trch = model_pytorch.encoder.state_dict()
            # Variables 0-2 belong to the forward LSTM, 3-5 to the backward one.
            values = lstm_values(py_trch, '_l0') + lstm_values(py_trch, '_l0_reverse')
            for variable, value in zip(l.variables, values):
                variable.assign(value)
        elif isinstance(l, tf.keras.layers.LSTMCell):
            l.build(input_dims[l_num])
            py_trch = model_pytorch.decoder.state_dict()
            for variable, value in zip(l.variables, lstm_values(py_trch, '')):
                variable.assign(value)


def generate_outputs(model, source, target, vocab):
    """ Generate outputs.
        Reinitializes the model, runs encode / mask generation / decode on one batch and
        saves the resulting tensors under ./a4/sanity_check_en_es_data/Amit_output/.
    """
    print ("-"*80)
    print("Generating Comparison Outputs")
    reinitialize_layers(model)

    # Compute sentence lengths
    source_lengths = [len(s) for s in source]

    # Convert list of lists into tensors
    source_padded = model.vocab.src.to_input_tensor(source, device=model.device)
    target_padded = model.vocab.tgt.to_input_tensor(target, device=model.device)

    # Run the model forward
    enc_hiddens, dec_init_state = model.encode(source_padded, source_lengths)
    enc_masks = model.generate_sent_masks(enc_hiddens, source_lengths)
    # decode() returns a pair (it is unpacked the same way in question_1e_sanity_check);
    # only the first element, the combined outputs, is saved.
    combined_outputs, _ = model.decode(enc_hiddens, enc_masks, dec_init_state, target_padded)

    # Save Tensors to disk
    save_tf_vars(enc_hiddens, './a4/sanity_check_en_es_data/Amit_output/enc_hiddens', use_list=False)
    save_tf_vars(dec_init_state, './a4/sanity_check_en_es_data/Amit_output/dec_init_state', use_list=True)
    save_tf_vars(enc_masks, './a4/sanity_check_en_es_data/Amit_output/enc_masks', use_list=False)
    save_tf_vars(combined_outputs, './a4/sanity_check_en_es_data/Amit_output/combined_outputs', use_list=False)

    # Load Tensors from disk
    # print(load_tf_vars('./a4/sanity_check_en_es_data/Amit_output/enc_hiddens', use_list=False))
    # print(load_tf_vars('./a4/sanity_check_en_es_data/Amit_output/dec_init_state', use_list=True))
    # print(load_tf_vars('./a4/sanity_check_en_es_data/Amit_output/enc_masks', use_list=False))
    # print(load_tf_vars('./a4/sanity_check_en_es_data/Amit_output/combined_outputs', use_list=False))


def question_1d_sanity_check(model, src_sents, tgt_sents, vocab):
    """ Sanity check for question 1d (encode), run against the TF model.
        Reinitializes the model, encodes the batch and compares the encoder hidden
        states and both halves of the decoder init state with the reference tensors
        stored on disk. An AssertionError is raised on the first mismatch.
    """
    print("Running Sanity Check for Question 1d: Encode")
    print ("-"*80)

    # Configure for testing
    reinitialize_layers(model)
    source_lengths = [len(sentence) for sentence in src_sents]
    source_padded = model.vocab.src.to_input_tensor(src_sents, device=model.device)

    # Expected outputs (stored as PyTorch tensors)
    enc_hiddens_target = torch.load('./a4/sanity_check_en_es_data/enc_hiddens.pkl')
    dec_init_state_target = torch.load('./a4/sanity_check_en_es_data/dec_init_state.pkl')

    # Actual outputs
    enc_hiddens_pred, dec_init_state_pred = model.encode(source_padded, source_lengths)

    # (failure message template, success message, expected tensor, actual tensor)
    comparisons = (
        ("enc_hiddens is incorrect: it should be:\n {} but is:\n{}",
         "enc_hiddens Sanity Checks Passed!",
         enc_hiddens_target, enc_hiddens_pred),
        ("dec_init_state[0] is incorrect: it should be:\n {} but is:\n{}",
         "dec_init_state[0] Sanity Checks Passed!",
         dec_init_state_target[0], dec_init_state_pred[0]),
        ("dec_init_state[1] is incorrect: it should be:\n {} but is:\n{}",
         "dec_init_state[1] Sanity Checks Passed!",
         dec_init_state_target[1], dec_init_state_pred[1]),
    )
    for failure_template, success_message, expected, actual in comparisons:
        assert np.allclose(expected.numpy(), actual.numpy()), failure_template.format(expected, actual)
        print(success_message)

    print ("-"*80)
    print("All Sanity Checks Passed for Question 1d: Encode!")
    print ("-"*80)


def question_1e_sanity_check(model, src_sents, tgt_sents, vocab):
    """ Sanity check for question 1e.
        Compares student output to that of model with dummy data.

        `model.step` is temporarily replaced by a stub that replays the recorded
        per-step decoder state and combined output from disk, so the comparison only
        depends on what `model.decode` does around its calls to `step`. The original
        `step` is restored even when `decode` raises.
    """
    print ("-"*80)
    print("Running Sanity Check for Question 1e: Decode")
    print ("-"*80)

    # Load Inputs (stored as PyTorch tensors, converted to TF constants)
    dec_init_state = torch.load('./a4/sanity_check_en_es_data/dec_init_state.pkl')
    dec_init_state = (tf.constant(dec_init_state[0].numpy()), tf.constant(dec_init_state[1].numpy()))
    enc_hiddens = torch.load('./a4/sanity_check_en_es_data/enc_hiddens.pkl')
    enc_hiddens = tf.constant(enc_hiddens.numpy())
    enc_masks = torch.load('./a4/sanity_check_en_es_data/enc_masks.pkl')
    enc_masks = tf.constant(enc_masks.numpy(), dtype=tf.bool)
    target_padded = torch.load('./a4/sanity_check_en_es_data/target_padded.pkl')
    target_padded = tf.constant(np.transpose(target_padded.numpy())) #so it's batch first

    # Load Outputs (the first two axes are swapped to match the prediction's layout)
    combined_outputs_target = torch.load('./a4/sanity_check_en_es_data/combined_outputs.pkl').permute(1,0,2)

    # Configure for Testing
    reinitialize_layers(model)
    step_index = 0
    def stepFunction(Ybar_t, dec_state, enc_hiddens, enc_hiddens_proj, enc_masks):
        # Ignores its arguments and returns the values recorded for this time step.
        nonlocal step_index
        dec_state = torch.load('./a4/sanity_check_en_es_data/step_dec_state_{}.pkl'.format(step_index))
        dec_state = [tf.constant(dec_state[0].numpy()), tf.constant(dec_state[1].numpy())]
        o_t = torch.load('./a4/sanity_check_en_es_data/step_o_t_{}.pkl'.format(step_index))
        o_t = tf.constant(o_t.numpy())
        step_index += 1
        return dec_state, o_t, None

    # Run Tests with the stub in place; always put the real step back afterwards so a
    # failure inside decode() cannot leave the model patched for later checks.
    original_step = model.step
    model.step = stepFunction
    try:
        combined_outputs_pred, _ = model.decode(enc_hiddens, enc_masks, dec_init_state, target_padded)
    finally:
        model.step = original_step

    assert(np.allclose(combined_outputs_pred.numpy(), combined_outputs_target.numpy(), rtol=1e-1)), "combined_outputs is incorrect: it should be:\n {} but is:\n{}".format(combined_outputs_target, combined_outputs_pred)
    print("combined_outputs Sanity Checks Passed!")
    print ("-"*80)
    print("All Sanity Checks Passed for Question 1e: Decode!")
    print ("-"*80)

def question_1f_sanity_check(model, src_sents, tgt_sents, vocab):
    """ Sanity check for question 1f (step).
        Reinitializes the model, runs a single `model.step` on recorded inputs and
        compares the decoder state, the combined output and e_t with the reference
        tensors stored on disk. An AssertionError is raised on the first mismatch.
    """
    print ("-"*80)
    print("Running Sanity Check for Question 1f: Step")
    print ("-"*80)
    reinitialize_layers(model)

    def to_tf(torch_tensor, **kwargs):
        # The reference data is stored as PyTorch tensors; wrap it as a TF constant.
        return tf.constant(torch_tensor.numpy(), **kwargs)

    # Inputs
    Ybar_t = to_tf(torch.load('./a4/sanity_check_en_es_data/Ybar_t.pkl'))
    stored_state = torch.load('./a4/sanity_check_en_es_data/dec_init_state.pkl')
    dec_init_state = (to_tf(stored_state[0]), to_tf(stored_state[1]))
    enc_hiddens = to_tf(torch.load('./a4/sanity_check_en_es_data/enc_hiddens.pkl'))
    enc_masks = to_tf(torch.load('./a4/sanity_check_en_es_data/enc_masks.pkl'), dtype=tf.bool)
    enc_hiddens_proj = to_tf(torch.load('./a4/sanity_check_en_es_data/enc_hiddens_proj.pkl'))

    # Expected outputs
    dec_state_target = torch.load('./a4/sanity_check_en_es_data/dec_state.pkl')
    o_t_target = torch.load('./a4/sanity_check_en_es_data/o_t.pkl')
    e_t_target = torch.load('./a4/sanity_check_en_es_data/e_t.pkl')

    # Actual outputs
    dec_state_pred, o_t_pred, e_t_pred = model.step(Ybar_t, dec_init_state, enc_hiddens, enc_hiddens_proj, enc_masks)

    # (failure message template, success message, expected tensor, actual tensor)
    comparisons = (
        ("decoder_state[0] is incorrect: it should be:\n {} but is:\n{}",
         "dec_state[0] Sanity Checks Passed!",
         dec_state_target[0], dec_state_pred[0]),
        ("decoder_state[1] is incorrect: it should be:\n {} but is:\n{}",
         "dec_state[1] Sanity Checks Passed!",
         dec_state_target[1], dec_state_pred[1]),
        ("combined_output is incorrect: it should be:\n {} but is:\n{}",
         "combined_output  Sanity Checks Passed!",
         o_t_target, o_t_pred),
        ("e_t is incorrect: it should be:\n {} but is:\n{}",
         "e_t Sanity Checks Passed!",
         e_t_target, e_t_pred),
    )
    for failure_template, success_message, expected, actual in comparisons:
        assert np.allclose(expected.numpy(), actual.numpy()), failure_template.format(expected, actual)
        print(success_message)

    print ("-"*80)
    print("All Sanity Checks Passed for Question 1f: Step!")
    print ("-"*80)


def main_sanity_check():
    """ Driver for the sanity checks: seeds the RNGs, loads the sanity-check
    corpus and vocabulary, builds an NMT model and runs the 1d / 1e / 1f checks
    on a single batch.
    """
    # Fixed seeds so that the run is repeatable
    seed = 1234
    tf.random.set_seed(seed)
    np.random.seed(seed * 13 // 7)

    # Parallel source (.es) / target (.en) sentences of the sanity-check corpus
    corpus_src = read_corpus('./a4/sanity_check_en_es_data/train_sanity_check.es', 'src')
    corpus_tgt = read_corpus('./a4/sanity_check_en_es_data/train_sanity_check.en', 'tgt')
    sentence_pairs = list(zip(corpus_src, corpus_tgt))

    # Only the first (shuffled) batch is needed; the loop variables keep its value
    for src_sents, tgt_sents in batch_iter(sentence_pairs, batch_size=BATCH_SIZE, shuffle=True):
        break
    vocab = Vocab.load('./a4/sanity_check_en_es_data/vocab_sanity_check.json') 

    # Model under test
    model = NMT(embed_size=EMBED_SIZE, hidden_size=HIDDEN_SIZE, 
                dropout_rate=DROPOUT_RATE, vocab=vocab, device='cpu', training=True)

    # Run the checks in the order 1d, 1e, 1f
    question_1d_sanity_check(model, src_sents, tgt_sents, vocab)
    question_1e_sanity_check(model, src_sents, tgt_sents, vocab)
    question_1f_sanity_check(model, src_sents, tgt_sents, vocab)
    #generate_outputs(model, src_sents, tgt_sents, vocab)

# Run the sanity checks when this cell is executed.
main_sanity_check()

Running Sanity Check for Question 1d: Encode
--------------------------------------------------------------------------------
enc_hiddens Sanity Checks Passed!
dec_init_state[0] Sanity Checks Passed!
dec_init_state[1] Sanity Checks Passed!
--------------------------------------------------------------------------------
All Sanity Checks Passed for Question 1d: Encode!
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Running Sanity Check for Question 1e: Decode
--------------------------------------------------------------------------------
combined_outputs Sanity Checks Passed!
--------------------------------------------------------------------------------
All Sanity Checks Passed for Question 1e: Decode!
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Running S

## 5. RUN

In [0]:
#run.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
CS224N 2018-19: Homework 4
run.py: Run Script for Simple NMT Model
Pencheng Yin <pcyin@cs.cmu.edu>
Sahil Chopra <schopra8@stanford.edu>
Updated by Amit for TF 2.0

Usage:
    run.py train --train-src=<file> --train-tgt=<file> --dev-src=<file> --dev-tgt=<file> --vocab=<file> [options]
    run.py decode [options] MODEL_PATH TEST_SOURCE_FILE OUTPUT_FILE
    run.py decode [options] MODEL_PATH TEST_SOURCE_FILE TEST_TARGET_FILE OUTPUT_FILE

Options:
    --train-src=<file>                      train source file
    --train-tgt=<file>                      train target file
    --dev-src=<file>                        dev source file
    --dev-tgt=<file>                        dev target file
    --vocab=<file>                          vocab file
    --seed=<int>                            seed [default: 0]
    --batch-size=<int>                      batch size [default: 32]
    --embed-size=<int>                      embedding size [default: 256]
    --hidden-size=<int>                     hidden size [default: 256]
    --clip-grad=<float>                     gradient clipping [default: 5.0]
    --log-every=<int>                       log every [default: 10]
    --max-epoch=<int>                       max epoch [default: 30]
    --input-feed                            use input feeding
    --patience=<int>                        wait for how many iterations to decay learning rate [default: 5]
    --max-num-trial=<int>                   terminate training after how many trials [default: 5]
    --lr-decay=<float>                      learning rate decay [default: 0.5]
    --beam-size=<int>                       beam size [default: 5]
    --sample-size=<int>                     sample size [default: 5]
    --lr=<float>                            learning rate [default: 0.001]
    --uniform-init=<float>                  uniformly initialize all parameters [default: 0.1]
    --save-to=<file>                        model save path [default: model.bin]
    --valid-niter=<int>                     perform validation after how many iterations [default: 2000]
    --dropout=<float>                       dropout [default: 0.3]
    --max-decoding-time-step=<int>          maximum number of decoding time steps [default: 70]
"""
import math
import sys
import json
import time
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu, SmoothingFunction
import numpy as np
from typing import List, Tuple, Dict, Set, Union
from tqdm import tqdm


def evaluate_ppl(model, dev_data, batch_size=32):
    """ Evaluate perplexity on dev sentences
    @param model (NMT): NMT Model
    @param dev_data (list of (src_sent, tgt_sent)): list of tuples containing source and target sentence;
                                                     must contain at least one target word, since the
                                                     summed loss is divided by the target word count
    @param batch_size (int): batch size
    @returns ppl: perplexity on dev sentences, i.e. exp(summed loss / number of target words)
    """
    # Remember the caller's mode so that evaluating a model that is already in
    # inference mode does not silently switch it back to training mode.
    was_training = getattr(model, 'training', True)
    model.training = False

    cum_loss = 0.
    cum_tgt_words = 0.

    try:
        for src_sents, tgt_sents in batch_iter(dev_data, batch_size):
            # the model's outputs are summed and negated to get the batch loss
            loss = -tf.reduce_sum(model(src_sents, tgt_sents))

            cum_loss += loss
            tgt_word_num_to_predict = sum(len(s[1:]) for s in tgt_sents)  # omitting leading `<s>` i.e. 'start sentence' tag
            cum_tgt_words += tgt_word_num_to_predict
    finally:
        # restore the mode even if the forward pass raised
        model.training = was_training

    ppl = np.exp(cum_loss / cum_tgt_words)

    return ppl


def compute_corpus_level_bleu_score(references, hypotheses):
    """ Corpus-level BLEU of the decoded hypotheses against the gold references.
    @param references (List[List[str]]): gold-standard target sentences, one per hypothesis
    @param hypotheses (List[Hypothesis]): decoding results; the `value` attribute of each is scored
    @returns bleu_score (float): corpus-level BLEU score as returned by `corpus_bleu`
    """
    # The first reference decides whether all references carry sentence
    # boundary tokens; if so the first and last token of each are dropped.
    has_boundary_tokens = references[0][0] == '<s>'
    if has_boundary_tokens:
        references = [ref[1:-1] for ref in references]

    # corpus_bleu takes a list of reference lists per hypothesis; there is exactly one each
    reference_sets = [[ref] for ref in references]
    hypothesis_tokens = [hyp.value for hyp in hypotheses]
    return corpus_bleu(reference_sets, hypothesis_tokens)


def _save_optimizer_config(optimizer, model_save_path):
    """ Write the optimizer's `get_config()` dict as JSON to `<model_save_path>_optim.json`. """
    config = optimizer.get_config()
    # numpy scalars are not JSON serializable: convert them to native Python types
    config = {k: (v.item() if isinstance(v, np.generic) else v) for (k, v) in config.items()}
    with open(model_save_path + '_optim.json', 'w') as f:
        json.dump(config, f)


def _load_optimizer(model_save_path):
    """ Build an Adam optimizer from the JSON config stored in `<model_save_path>_optim.json`. """
    with open(model_save_path + '_optim.json', 'r') as f:
        config = json.load(f)
    # `from_config` is a classmethod that RETURNS a new optimizer; it does not
    # modify an existing instance, so the returned object has to be used.
    return tf.keras.optimizers.Adam.from_config(config)


def train(train_src, train_tgt, dev_src, dev_tgt, train_batch_size=32, clip_grad=5.0, valid_niter=2000, log_every=50, 
          model_save_path='./saved_model/model', uniform_init=0.1, lr=1.e-3, lr_decay=0.5, max_epoch=30, patience_lr_decay=5, max_num_trial=5, continue_training=False):
    """ Train the NMT Model.
    @param train_src (string): train source file
    @param train_tgt (string): train target file
    @param dev_src (string): dev source file
    @param dev_tgt (string): dev target file
    @param train_batch_size (int): batch size [default: 32]
    @param clip_grad (float): global-norm gradient clipping threshold [default: 5.0]
    @param valid_niter (int): perform validation after how many iterations [default: 2000]
    @param log_every (int): log every [default: 50]
    @param model_save_path (string): model save path [default: ./saved_model/model];
                                     the optimizer config is stored next to it as `<model_save_path>_optim.json`
    @param uniform_init (float): uniformly initialize all parameters [default: 0.1]
    @param lr (float): learning rate [default: 0.001]; when `continue_training` is True the
                       learning rate stored in the saved optimizer config is used instead
    @param lr_decay (float): learning rate decay [default: 0.5]
    @param max_epoch (int): max epoch [default: 30]; only checked right after a validation run
    @param patience_lr_decay (int): wait for how many iterations to decay learning rate [default: 5]
    @param max_num_trial (int): terminate training after how many trials [default: 5]
    @param continue_training (bool): whether to start a new training process or continue training from the previously saved model [default=False]
    @returns None: the function returns on early stop or once `max_epoch` is reached
    """    
    train_data_src = read_corpus(train_src, source='src')
    train_data_tgt = read_corpus(train_tgt, source='tgt')

    dev_data_src = read_corpus(dev_src, source='src')
    dev_data_tgt = read_corpus(dev_tgt, source='tgt')

    train_data = list(zip(train_data_src, train_data_tgt))
    dev_data = list(zip(dev_data_src, dev_data_tgt))

    if continue_training:
        #load the best saved model
        print('load the previously best saved model', file=sys.stderr)
        model = NMT.load(model_save_path)
        # rebuild the optimizer from its saved config
        print('restore parameters of the optimizer', file=sys.stderr)
        optimizer = _load_optimizer(model_save_path)
    else:
        optimizer = tf.keras.optimizers.Adam(learning_rate=lr) #create the optimizer object
        model = NMT(200, 100, VOCABULARY) #create a new model to train from scratch
        if np.abs(uniform_init) > 0.:
            print('uniformly initialize parameters [-%f, +%f]' % (uniform_init, uniform_init), file=sys.stderr)
            model.uniformly_initialize_layers(uniform_init)

    model.training = True

    num_trial = 0
    train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0
    cum_examples = report_examples = epoch = valid_num = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()
    print('Begin Maximum Likelihood Training:')
    print('use device: %s' % model.device, file=sys.stderr)

    while True:
        epoch += 1

        for src_sents, tgt_sents in batch_iter(train_data, batch_size=train_batch_size, shuffle=True):
            train_iter += 1

            batch_size = len(src_sents)

            with tf.device(model.device):
                with tf.GradientTape() as t:
                    example_losses = -model(src_sents, tgt_sents) # (batch_size,)
                    batch_loss = tf.reduce_sum(example_losses)
                    loss = batch_loss / batch_size

                grads = t.gradient(loss, model.trainable_variables)

                # clip gradient
                #grads = [tf.clip_by_norm(g, clip_grad) for g in grads] #this makes more sense to me
                grads, _ = tf.clip_by_global_norm(grads, clip_grad) #tf recommends this however

                optimizer.apply_gradients(zip(grads, model.trainable_variables))

            # the statistics below use the summed (not the averaged) batch loss
            batch_losses_val = batch_loss.numpy()
            report_loss += batch_losses_val
            cum_loss += batch_losses_val

            tgt_words_num_to_predict = sum(len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
            report_tgt_words += tgt_words_num_to_predict
            cum_tgt_words += tgt_words_num_to_predict
            report_examples += batch_size
            cum_examples += batch_size

            if train_iter % log_every == 0:
                print('epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f ' \
                      'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' % (epoch, train_iter,
                                                                                         report_loss / report_examples,
                                                                                         math.exp(report_loss / report_tgt_words),
                                                                                         cum_examples,
                                                                                         report_tgt_words / (time.time() - train_time),
                                                                                         time.time() - begin_time), file=sys.stderr)

                train_time = time.time()
                report_loss = report_tgt_words = report_examples = 0.

            # perform validation
            if train_iter % valid_niter == 0:
                print('epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d' % (epoch, train_iter,
                                                                                         cum_loss / cum_examples,
                                                                                         np.exp(cum_loss / cum_tgt_words),
                                                                                         cum_examples), file=sys.stderr)

                cum_loss = cum_examples = cum_tgt_words = 0.
                valid_num += 1

                print('begin validation ...', file=sys.stderr)

                # compute dev. ppl; lower perplexity is better, so its negation is the score
                dev_ppl = evaluate_ppl(model, dev_data, batch_size=128)   # dev batch size can be a bit larger
                valid_metric = -dev_ppl

                print('validation: iter %d, dev. ppl %f' % (train_iter, dev_ppl), file=sys.stderr)

                is_better = len(hist_valid_scores) == 0 or valid_metric > max(hist_valid_scores)
                hist_valid_scores.append(valid_metric)

                if is_better:
                    patience = 0
                    # save the model
                    print('save currently the best model to [%s]' % model_save_path, file=sys.stderr)
                    model.save(model_save_path)

                    # save the optimizer's config
                    print('save parameters of the optimizer', file=sys.stderr)
                    _save_optimizer_config(optimizer, model_save_path)

                elif patience < patience_lr_decay:
                    patience += 1
                    print('hit patience %d' % patience, file=sys.stderr)

                    if patience == patience_lr_decay:
                        num_trial += 1
                        print('hit #%d trial' % num_trial, file=sys.stderr)
                        if num_trial == max_num_trial:
                            print('early stop!', file=sys.stderr)
                            return

                        # decay lr, and restore from previously best checkpoint
                        lr = optimizer.learning_rate * lr_decay
                        print('load previously best model and decay learning rate to %f' % lr, file=sys.stderr)

                        # load model; put it back into training mode like the initial load does
                        model = NMT.load(model_save_path)
                        model.training = True
                        # rebuild the optimizer from its saved config
                        print('restore parameters of the optimizer', file=sys.stderr)
                        optimizer = _load_optimizer(model_save_path)

                        # set new lr (the saved config still holds the undecayed value)
                        optimizer.learning_rate.assign(lr)

                        # reset patience
                        patience = 0

                if epoch == max_epoch:
                    print('reached maximum number of epochs!', file=sys.stderr)
                    return


def decode(test_source_file, test_target_file=None, model=None, model_save_path='./saved_model/model', 
           beam_size=3, max_decoding_time_step=50, output_file=None, val_size=500):
    """ Performs decoding on a test set, and save the best-scoring decoding results.
    If the target gold-standard sentences are given, the function also computes
    corpus-level BLEU score.
    @param test_source_file (string): path to the source-language test file
    @param test_target_file (string): optional path to the gold-standard target file; enables the BLEU report
    @param model (NMT model): the trained NMT model; loaded from `model_save_path` when None
    @param model_save_path (string): saved model path
    @param beam_size (int): scope of the beam search (forwarded to `beam_search`)
    @param max_decoding_time_step (int): as the name implies (forwarded to `beam_search`)
    @param output_file (string): optional path to save the top hypothesis of each sentence, one per line
    @param val_size (int): only the first `val_size` sentences are decoded and scored [default: 500]
    @returns None
    """
    print("load test source sentences from [{}]".format(test_source_file))
    test_data_src = read_corpus(test_source_file, source='src')
    if test_target_file:
        print("load test target sentences from [{}]".format(test_target_file))
        test_data_tgt = read_corpus(test_target_file, source='tgt')

    if model is None:
        #load the best saved model
        print("load model from {}".format(model_save_path), file=sys.stderr)
        model = NMT.load(model_save_path)

    with tf.device(model.device):
        # forward val_size so that the number of hypotheses matches the number of
        # references scored below (beam_search would otherwise use its own default)
        hypotheses = beam_search(model, test_data_src, beam_size, max_decoding_time_step, val_size=val_size)

    if test_target_file:
        top_hypotheses = [hyps[0] for hyps in hypotheses]
        bleu_score = compute_corpus_level_bleu_score(test_data_tgt[0:val_size], top_hypotheses[0:val_size])
        print('Corpus BLEU: {}'.format(bleu_score * 100))

    if output_file:
        with open(output_file, 'w') as f:
            # hypotheses are in source order; write the best one for each sentence
            for hyps in hypotheses:
                top_hyp = hyps[0]
                hyp_sent = ' '.join(top_hyp.value)
                f.write(hyp_sent + '\n')


def beam_search(model, test_data_src, beam_size, max_decoding_time_step, val_size=500):
    """ Construct hypotheses for a list of src-language sentences.
    NOTE: decoding currently goes through `model.no_beam_search`, so `beam_size` and
    `max_decoding_time_step` are accepted but NOT used; the call to `model.beam_search`
    is kept below as a comment.
    @param model (NMT): NMT Model
    @param test_data_src (List[List[str]]): List of sentences (words) in source language, from test set.
    @param beam_size (int): beam_size (# of hypotheses to hold for a translation at every step); currently unused
    @param max_decoding_time_step (int): maximum sentence length that Beam search can produce; currently unused
    @param val_size (int): only the first `val_size` source sentences are decoded [default: 500]
    @returns hypotheses (List[List[Hypothesis]]): the result of `model.no_beam_search` for every decoded
        source sentence (presumably a list of Hypothesis objects, best first -- verify against NMT).
    """
    # Remember the caller's mode so that decoding with a model that is already in
    # inference mode does not silently switch it to training mode afterwards.
    was_training = getattr(model, 'training', True)
    model.training = False 

    hypotheses = []
    try:
        for src_sent in tqdm(test_data_src[0:val_size], desc='Decoding', file=sys.stdout):
            # example_hyps = model.beam_search(src_sent, beam_size, max_decoding_time_step) #do for each sentence
            example_hyps = model.no_beam_search(src_sent)
            hypotheses.append(example_hyps)
    finally:
        # restore the mode even if decoding raised
        model.training = was_training

    return hypotheses

def test_save_load_model(dev_src, dev_tgt):
    '''
    Save/load round trip: a freshly initialized model and its reloaded copy
    must produce the same outputs on the first 5 dev sentence pairs.
    Prints whether the two sets of outputs are numerically close.
    '''
    model = NMT(50,50,VOCABULARY)
    src_sample = read_corpus(dev_src, source='src')[0:5]
    tgt_sample = read_corpus(dev_tgt, source='tgt')[0:5]
    sample_pairs = list(zip(src_sample, tgt_sample))
    model.uniformly_initialize_layers(0.5)

    # outputs of the model before it is written to disk
    print('Initial Results:')
    model.training = False
    outputs_before = model(src_sample, tgt_sample)
    evaluate_ppl(model, sample_pairs)

    print('save the model')
    checkpoint_path = './saved_model/test_save_load_model_delete'
    model.save(checkpoint_path)
    time.sleep(2)

    # outputs of the model restored from disk
    print('load model')
    model = NMT.load(checkpoint_path)
    time.sleep(2)
    model.training = False
    outputs_after = model(src_sample, tgt_sample)
    evaluate_ppl(model, sample_pairs)

    outputs_match = np.allclose(outputs_before.numpy(), outputs_after.numpy())
    print('Same results? {}'.format(outputs_match))



def main_run():
    """ Notebook entry point: seeds the RNGs and decodes the dev set,
    reporting BLEU against the dev targets. The save/load test and the
    training run are kept as commented-out alternatives.
    """

    # fixed seeds so that runs are repeatable
    seed = 1234
    tf.random.set_seed(seed)
    np.random.seed(seed * 13 // 7)

    # corpus locations (the train files are only needed by the commented-out train call)
    train_src, train_tgt = './a4/en_es_data/train.es', './a4/en_es_data/train.en'
    dev_src, dev_tgt = './a4/en_es_data/dev.es', './a4/en_es_data/dev.en'

    #test_save_load_model(dev_src, dev_tgt)

    #train(train_src, train_tgt, dev_src, dev_tgt, continue_training=True)
    decode(dev_src, test_target_file=dev_tgt, beam_size=10)

# Run the entry point when this cell is executed.
main_run()

'''
500 training data Bleu score
    beam_search: 36.0 (took 2 mins to run) beam_search=3
    amit_beam_search: 36.0 (took 6 mins to run) beam_search=3
    beam_search: 37.1 (took 3.5 mins to run) beam_search=10
    amit_beam_search: 37.1 (took 39 mins to run) beam_search=10
    no beam: 33.97

500 validation data Bleu score
    beam_search: 26.0 (took 3 mins to run) beam_search=10
    no beam search: 25 (took 1.5 mins)
'''

load test source sentences from [./a4/en_es_data/dev.es]
load test target sentences from [./a4/en_es_data/dev.en]


load model from ./saved_model/model
restore model hyper parameters


Decoding:   0%|          | 0/500 [00:00<?, ?it/s]

restore model weights


Decoding: 100%|██████████| 500/500 [01:26<00:00,  5.87it/s]
Corpus BLEU: 24.79346637453408


'research\n500 training data Bleu score\n    beam_search: 36.0 (took 2 mins to run) beam_search=3\n    amit_beam_search: 36.0 (took 6 mins to run) beam_search=3\n    beam_search: 37.1 (took 3.5 mins to run) beam_search=10\n    amit_beam_search: 37.1 (took 39 mins to run) beam_search=10\n    no beam: 33.97\n\n500 validation data Bleu score\n    beam_search: 26.0 (took 3 mins to run) beam_search=10\n'

## PLAYGROUND

In [0]:
import tensorflow as tf
import torch
import numpy as np

In [0]:
# Scratch cell: trying out the PyTorch tensor ops that appear in the
# commented-out expression at the bottom of the cell.
# 1-D tensor of 4 zeros
a = torch.zeros(4)
# insert a size-1 axis at position 1 -> shape (4, 1)
a = a.unsqueeze(1)
# reshape to 2 rows; -1 lets torch infer the other dimension -> shape (2, 2)
a = a.view(2,-1)
# bare expression: not the last statement of the cell, so nothing is displayed
a
# 0-d tensor holding 0.1
a = torch.tensor(0.1)
# .item() extracts the Python number; being the last expression, its value is
# what the cell displays (0.1 shows rounding error, presumably from float32 storage)
a.item()
#(hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(-1)

0.10000000149011612