### Download Data

In [1]:
%env CUDA_VISIBLE_DEVICES=0

env: CUDA_VISIBLE_DEVICES=0


In [2]:
import nltk
from collections import Counter
from tqdm import tqdm_notebook
import numpy as np
import tensorflow as tf
from tensorflow.contrib import seq2seq
from tensorflow.contrib.rnn import DropoutWrapper
import random

In [3]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/bishal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Fixed number of symbol ids per encoded word: encodeSentence2 pads/truncates
# to this length and the id placeholders below are declared with this width.
MAX_SEQ_LEN = 20
# Number of examples per minibatch. The graph below also bakes this value in
# (SOS column, encoder zero state, inference start tokens).
BATCH_SIZE = 64

In [5]:
class Lang:
    """Symbol-level vocabulary that maps symbols to integer ids and back.

    Ids 0-3 are reserved for the special tokens ``<PAD>``, ``<SOS>``,
    ``<EOS>`` and ``<UNK>``. The ``vocab_size`` most common symbols of
    ``counter`` receive ids 4, 5, ... in descending frequency order.

    Strings are encoded one character at a time after lower-casing and
    stripping surrounding whitespace.
    """

    def __init__(self, counter, vocab_size):
        """Build the vocabulary.

        Args:
            counter: ``collections.Counter`` of symbol frequencies.
            vocab_size: number of most frequent symbols to keep.
        """
        self.pad = "<PAD>"
        self.sos = "<SOS>"
        self.eos = "<EOS>"
        self.unk = "<UNK>"

        self.ipad = 0
        self.isos = 1
        self.ieos = 2
        self.iunk = 3

        self.word2id = {
            self.pad: self.ipad,
            self.sos: self.isos,
            self.eos: self.ieos,
            self.unk: self.iunk,
        }
        self.id2word = {idx: tok for tok, idx in self.word2id.items()}

        # Regular symbols start right after the four reserved ids.
        for curr_id, (w, _count) in enumerate(counter.most_common(vocab_size), start=4):
            self.word2id[w] = curr_id
            self.id2word[curr_id] = w

    def _encode(self, s):
        """Return the id of every character of ``s`` (lower-cased, stripped)."""
        return [self.word2id.get(ch, self.iunk) for ch in s.lower().strip()]

    def _pad_or_truncate(self, ids, max_len):
        """Append EOS, pad with PAD and cut to exactly ``max_len`` ids.

        Note: when ``ids`` already has ``max_len`` or more entries the EOS
        marker is cut off by the truncation.
        """
        return (ids + [self.ieos] + [self.ipad] * max_len)[:max_len]

    def encodeSentence(self, s, max_len=-1):
        """Encode ``s`` as a list of ids.

        Args:
            s: input string.
            max_len: ``-1`` (default) returns the raw ids with no EOS and no
                padding; otherwise the result is EOS-terminated, padded and
                truncated to exactly ``max_len`` ids.

        Returns:
            list of int ids; unknown symbols map to the ``<UNK>`` id.
        """
        ids = self._encode(s)
        if max_len == -1:
            return ids
        return self._pad_or_truncate(ids, max_len)

    def encodeSentence2(self, s, max_len=-1):
        """Encode ``s`` and also report its effective length.

        Args:
            s: input string.
            max_len: target length; ``-1`` (default) means "no padding".

        Returns:
            ``(length, ids)``. With a ``max_len``, ``ids`` is built like
            ``encodeSentence(s, max_len)`` and ``length`` is the number of
            symbols plus one for EOS, capped at ``max_len``. With
            ``max_len == -1`` the raw ids and their count are returned
            (previously this reported a length of ``-1``).
        """
        ids = self._encode(s)
        if max_len == -1:
            return len(ids), ids
        return min(max_len, len(ids) + 1), self._pad_or_truncate(ids, max_len)

    def decodeSentence(self, id_seq):
        """Turn a sequence of ids back into a string.

        Decoding stops at the first ``<EOS>`` id (or at the end of the
        sequence if there is none). ``<UNK>`` is rendered as ``UNK``.

        Args:
            id_seq: any iterable of integer ids (list, tuple, NumPy array).
                It is copied to a list first, so a NumPy array is no longer
                subject to element-wise ``+`` when EOS handling is applied.

        Returns:
            The decoded string.
        """
        ids = [int(x) for x in id_seq]
        if self.ieos in ids:
            ids = ids[:ids.index(self.ieos)]
        s = ''.join(self.id2word[x] for x in ids)
        return s.replace(self.unk, "UNK")

In [6]:
# Hard-coded example count: used as the progress-bar total while reading the
# data file and to size the train/validation split further down.
# NOTE(review): presumably the number of lines in the data file — confirm.
N = 30823

In [7]:
# Read the tab-separated (English, Hindi) word pairs and count how often each
# character occurs on either side; the counters feed the Lang vocabularies.
hi_counter = Counter()
hi_sentences = []
en_counter = Counter()
en_sentences = []

# The file contains Devanagari text, so decode it explicitly as UTF-8 instead
# of relying on the platform's default encoding.
with open("data/Hindi - Word Transliteration Pairs 1.txt", encoding="utf-8") as f:
    for line in tqdm_notebook(f, total=N, desc="Reading file:"):
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        en, hi = line.split("\t")
        hi_sentences.append(hi)
        en_sentences.append(en)

# Counting only needs the in-memory lists, so the file is already closed here.
for line in tqdm_notebook(hi_sentences, desc="Processing inputs:"):
    hi_counter.update(line.strip())  # Counter.update on a str counts characters
for line in tqdm_notebook(en_sentences, desc="Processing inputs:"):
    en_counter.update(line.strip())

HBox(children=(IntProgress(value=0, description='Reading file:', max=30823, style=ProgressStyle(description_wi…




HBox(children=(IntProgress(value=0, description='Processing inputs:', max=30823, style=ProgressStyle(descripti…




HBox(children=(IntProgress(value=0, description='Processing inputs:', max=30823, style=ProgressStyle(descripti…




In [8]:
hi_counter.most_common(10)

[('ा', 21123),
 ('र', 9205),
 ('े', 8100),
 ('न', 7225),
 ('ी', 6546),
 ('ल', 6434),
 ('ं', 5748),
 ('म', 5707),
 ('ि', 5602),
 ('त', 5571)]

In [9]:
print(len(hi_counter))

66


In [10]:
en_counter.most_common(10)

[('a', 57220),
 ('n', 15015),
 ('i', 14015),
 ('h', 13805),
 ('e', 12264),
 ('r', 9262),
 ('u', 8539),
 ('t', 7181),
 ('o', 6691),
 ('k', 6498)]

In [11]:
print(len(en_counter))

27


In [12]:
# One vocabulary per script. vocab_size equals the number of distinct symbols
# counted, so every symbol seen in the data gets its own id.
en_lang = Lang(en_counter, len(en_counter))
hi_lang = Lang(hi_counter, len(hi_counter))

In [13]:
(en_lang.encodeSentence("Shukriya"))

[15, 7, 10, 13, 9, 6, 20, 4]

In [14]:
en_lang.decodeSentence(en_lang.encodeSentence("Shukriya", 10))

'shukriya'

In [15]:
(hi_lang.encodeSentence("शुक्रिया", 10))

[35, 19, 15, 22, 5, 12, 21, 4, 2, 0]

In [16]:
hi_lang.decodeSentence((hi_lang.encodeSentence("शुक्रिया", 10)))

'शुक्रिया'

In [17]:
# Vocabulary sizes (special tokens included): VE for the English side,
# VH for the Hindi side. They set the row counts of the embedding tables.
VE = len(en_lang.word2id)
VH = len(hi_lang.word2id)

### Load Pre-trained word vectors (word2vec, glove, fasttext etc.)

### The Seq2Seq architecture

#### Debugging Tip: Always keep track of tensor dimensions!

#### Word Embedding Matrix

In [18]:
# Embedding tables with one 300-dimensional row per vocabulary id.
# No initializer is passed, so tf.get_variable's default initialization is
# used; no pre-trained vectors are loaded in the cells shown here.
en_word_emb_matrix = tf.get_variable("en_word_emb_matrix", (VE, 300), dtype=tf.float32)
hi_word_emb_matrix = tf.get_variable("hi_word_emb_matrix", (VH, 300), dtype=tf.float32)

Instructions for updating:
Colocations handled automatically by placer.


In [19]:
# Dropout keep probability, fed per run: 0.8 in the training loop and 1.0 in
# the evaluation/translation helpers below.
keep_prob = tf.placeholder(tf.float32)

In [20]:
# Source side: padded symbol ids of shape (batch, MAX_SEQ_LEN) and the
# per-example length (as returned by Lang.encodeSentence2) of shape (batch,).
input_ids = tf.placeholder(tf.int32, (None, MAX_SEQ_LEN))
input_lens = tf.placeholder(tf.int32, (None, ))

In [21]:
# Target side: padded symbol ids of shape (batch, MAX_SEQ_LEN), without the
# leading SOS (it is prepended in the graph), plus per-example lengths.
ph_target_ids = tf.placeholder(tf.int32, (None, MAX_SEQ_LEN))
target_lens = tf.placeholder(tf.int32, (None, ))

#### Tensorflow Graphs

In [22]:
# Add SOS or GO symbol
# Prepends a column of <SOS> ids, so target_ids has MAX_SEQ_LEN + 1 columns.
# NOTE(review): the SOS column has exactly BATCH_SIZE rows, so ph_target_ids
# must always be fed with BATCH_SIZE rows even though its placeholder shape
# says None.
target_ids = tf.concat([tf.fill([BATCH_SIZE,1], hi_lang.isos), ph_target_ids], -1)

In [23]:
# Look up embeddings for the encoder input and for the decoder's training
# input. target_ids[:, :-1] drops the last column, leaving <SOS> followed by
# the first MAX_SEQ_LEN - 1 target ids (the targets shifted right by one).
input_emb = tf.nn.embedding_lookup(en_word_emb_matrix, input_ids)
target_emb = tf.nn.embedding_lookup(hi_word_emb_matrix, target_ids[:, :-1])

In [24]:
input_emb.shape

TensorShape([Dimension(None), Dimension(20), Dimension(300)])

#### Encoder

##### RNN Units

In [25]:
# Encoder recurrent unit: a 128-unit GRU whose outputs pass through dropout
# controlled by the keep_prob placeholder.
encoder_cell = tf.nn.rnn_cell.GRUCell(128)
encoder_cell = DropoutWrapper(encoder_cell, output_keep_prob=keep_prob)

Instructions for updating:
This class is equivalent as tf.keras.layers.GRUCell, and will be replaced by that in Tensorflow 2.0.


In [26]:
# Run the encoder over the embedded source sequence; input_lens tells
# dynamic_rnn where each example ends. enc_state (the final GRU state) is
# later used as the decoder's initial state.
# NOTE(review): the zero initial state is built for exactly BATCH_SIZE rows,
# which pins the batch dimension of enc_outputs/enc_state to BATCH_SIZE.
enc_outputs, enc_state = tf.nn.dynamic_rnn(
    encoder_cell, input_emb, sequence_length=input_lens, initial_state=encoder_cell.zero_state(BATCH_SIZE, dtype=tf.float32)
)

Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [27]:
enc_outputs.shape

TensorShape([Dimension(64), Dimension(20), Dimension(128)])

In [28]:
enc_state.shape

TensorShape([Dimension(64), Dimension(128)])

#### Decoder

In [29]:
# Decoder recurrent unit: same configuration as the encoder (128-unit GRU
# with output dropout driven by keep_prob). This one cell object is shared by
# the training decoder and the inference decoder below.
decoder_cell = tf.nn.rnn_cell.GRUCell(128)
decoder_cell = DropoutWrapper(decoder_cell, output_keep_prob=keep_prob)

In [30]:
# Dense layer mapping each decoder output to one logit per Hindi vocabulary
# id (len(hi_lang.word2id) is the same value as VH).
output_projection = tf.layers.Dense(len(hi_lang.word2id))

#### Decoder Training Helper

In [31]:
# Training decoder: TrainingHelper feeds the ground-truth embeddings
# (target_emb) as decoder inputs, starting from the encoder's final state.
helper = seq2seq.TrainingHelper(target_emb, target_lens)
decoder = seq2seq.BasicDecoder(decoder_cell, helper, enc_state, output_projection)
# dynamic_decode returns (final_outputs, final_state, final_sequence_lengths);
# the state is not needed here.
outputs, _, outputs_lens = seq2seq.dynamic_decode(decoder, maximum_iterations=MAX_SEQ_LEN, 
                                                  impute_finished=False, swap_memory=True)
# Longest decoded length in the batch; used below to size the loss mask and
# to slice the targets to the same number of time steps.
output_max_len = tf.reduce_max(outputs_lens)

#### And Decoder Inference Helper

In [32]:
# Inference decoder: greedy decoding that starts every row with <SOS>, feeds
# back the embedding of its own previous prediction and stops at <EOS> (or
# after MAX_SEQ_LEN steps).
# NOTE: this reuses the same DropoutWrapper-wrapped decoder_cell as training;
# dropout is only disabled because callers feed keep_prob = 1.0 at inference.
# NOTE(review): start tokens are created for exactly BATCH_SIZE rows.
infer_helper = seq2seq.GreedyEmbeddingHelper(hi_word_emb_matrix, tf.fill([BATCH_SIZE, ], hi_lang.isos), hi_lang.ieos)
infer_decoder = seq2seq.BasicDecoder(decoder_cell, infer_helper, enc_state, output_projection)
# infer_output is the (outputs, state, lengths) tuple; callers read
# infer_output[0].sample_id for the predicted ids.
infer_output = seq2seq.dynamic_decode(infer_decoder, maximum_iterations=MAX_SEQ_LEN, swap_memory=True)

#### Loss and Optimizers

In [33]:
# 1.0 for real target positions (index < target_lens), 0.0 for padding, with
# as many columns as the longest decoded sequence in the batch.
masks = tf.sequence_mask(target_lens, output_max_len, dtype=tf.float32, name='masks')

# Loss function - weighted softmax cross entropy
# outputs[0] holds the projected decoder logits; the labels are target_ids
# without the leading <SOS>, cut to the same number of time steps.
cost = tf.contrib.seq2seq.sequence_loss(
    outputs[0],
    target_ids[:, 1:(output_max_len + 1)],
    masks)

# Optimizer
# Adam with a learning rate of 1e-4.
optimizer = tf.train.AdamOptimizer(0.0001)

# Gradient Clipping
# (disabled alternative to optimizer.minimize: clip each gradient to [-1, 1])
# gradients = optimizer.compute_gradients(cost)
# capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
# train_op = optimizer.apply_gradients(capped_gradients)

In [34]:
# Single op that computes gradients of the loss and applies the Adam update
# (no gradient clipping).
train_op = optimizer.minimize(cost)

In [35]:
# Must be created after the optimizer so that Adam's slot variables are
# included in the initializer.
init = tf.global_variables_initializer()

#### Tensorflow Sessions

In [36]:
# Let TensorFlow grow its GPU memory usage on demand instead of reserving
# all of it up front.
sess_config = tf.ConfigProto()
sess_config.gpu_options.allow_growth = True

In [37]:
# Open the session used by every later cell and initialise all variables.
# Re-running this cell resets the model to freshly initialised weights.
sess = tf.InteractiveSession(config=sess_config)
sess.run(init)

#### Minibatch Training

In [38]:
# Seed Python's RNG so the shuffle (and therefore the train/validation split)
# below is reproducible.
random.seed(41)

In [39]:
# Pair each English word with its Hindi counterpart: list of (en, hi) tuples.
parallel = list(zip(en_sentences, hi_sentences))

In [40]:
# Shuffle in place before splitting. This cell is not idempotent: running it
# again without re-seeding and rebuilding `parallel` yields a different order.
random.shuffle(parallel)

In [41]:
parallel[1000]

('hazaarii', 'हज़ारी')

In [42]:
# 95% / 5% train/validation split. The sizes are derived from the number of
# pairs actually loaded rather than the hard-coded N, so they always match
# the slices taken from `parallel`.
train_n = int(0.95 * len(parallel))
valid_n = len(parallel) - train_n

In [43]:
# First train_n shuffled pairs are used for training, the rest for validation.
train_pairs = parallel[:train_n]
valid_pairs = parallel[train_n:]

In [47]:
def small_test():
    """Greedy-decode the validation set and report the mean sentence BLEU.

    Each validation word is encoded, decoded with the inference graph
    (dropout disabled via keep_prob = 1.0) and scored against its reference
    with NLTK's sentence-level BLEU. Both hypothesis and reference are plain
    strings, so n-grams are counted over characters.

    The graph only accepts batches of exactly BATCH_SIZE rows, so the final
    partial batch is padded with zero-length rows whose predictions are
    ignored; every validation pair is therefore scored.

    Returns:
        Mean BLEU over the validation pairs as a float (NaN if there are
        none). The score is also printed.
    """
    smoothing = nltk.translate.bleu_score.SmoothingFunction().method7
    all_bleu = []

    for m in range(0, valid_n, BATCH_SIZE):
        n = min(m + BATCH_SIZE, valid_n)

        # Rows beyond (n - m) stay all-zero with length 0: pure padding.
        input_batch = np.zeros((BATCH_SIZE, MAX_SEQ_LEN), dtype=np.int32)
        input_lens_batch = np.zeros((BATCH_SIZE,), dtype=np.int32)
        for i in range(m, n):
            seq_len, ids = en_lang.encodeSentence2(valid_pairs[i][0], MAX_SEQ_LEN)
            input_batch[i - m, :] = ids
            input_lens_batch[i - m] = seq_len

        feed_dict = {
            input_ids: input_batch,
            input_lens: input_lens_batch,
            keep_prob: 1.0,
        }
        pred_batch = sess.run(infer_output[0].sample_id, feed_dict=feed_dict)

        # Score only the rows that hold real examples.
        for k in range(n - m):
            pred_s = hi_lang.decodeSentence(list(pred_batch[k]))
            ref = valid_pairs[m + k][1]
            try:
                _bx = nltk.translate.bleu_score.sentence_bleu(
                    [ref],
                    pred_s,
                    weights=[1/4]*4,
                    smoothing_function=smoothing)
            except ZeroDivisionError:
                # Degenerate hypothesis/reference; count it as a complete miss.
                _bx = 0
            all_bleu.append(_bx)

    # Avoid np.mean([]) (NaN plus a RuntimeWarning) when nothing was scored.
    mean_bleu = float(np.mean(all_bleu)) if all_bleu else float("nan")
    print(f"\n\nBLEU Score: {mean_bleu}\n")
    return mean_bleu

In [None]:
# Training hyper-parameters (previously inline literals).
NUM_EPOCHS = 20
TRAIN_KEEP_PROB = 0.8        # dropout keep probability while training
EVAL_EVERY_N_BATCHES = 100   # run small_test() every this many batches

for _e in range(NUM_EPOCHS):
    for m in range(0, train_n, BATCH_SIZE):
        n = m + BATCH_SIZE
        if n > train_n:
            # The graph is built for exactly BATCH_SIZE rows, so the trailing
            # partial batch is skipped.
            print("\nEpoch Complete...")
            break

        # Encode the source words of this batch.
        input_batch = np.zeros((BATCH_SIZE, MAX_SEQ_LEN), dtype=np.int32)
        input_lens_batch = np.zeros((BATCH_SIZE,), dtype=np.int32)
        for i in range(m, n):
            b, a = en_lang.encodeSentence2(train_pairs[i][0], MAX_SEQ_LEN)
            input_batch[i - m, :] = a
            input_lens_batch[i - m] = b

        # Encode the target words of this batch.
        target_batch = np.zeros((BATCH_SIZE, MAX_SEQ_LEN), dtype=np.int32)
        target_lens_batch = np.zeros((BATCH_SIZE,), dtype=np.int32)
        for i in range(m, n):
            b, a = hi_lang.encodeSentence2(train_pairs[i][1], MAX_SEQ_LEN)
            target_batch[i - m, :] = a
            target_lens_batch[i - m] = b

        feed_dict = {
            input_ids: input_batch,
            input_lens: input_lens_batch,
            ph_target_ids: target_batch,
            target_lens: target_lens_batch,
            keep_prob: TRAIN_KEEP_PROB,
        }
        # Fetch the update op and the loss in ONE run call: a second
        # sess.run(cost) would redo the whole forward pass with a fresh
        # dropout mask and report a loss unrelated to the step just taken.
        _, batch_loss = sess.run([train_op, cost], feed_dict=feed_dict)
        print(f"Epoch: {_e} >> Status: {n}/{train_n} >> Loss: {batch_loss}", end="\r")
        if (1 + n//BATCH_SIZE) % EVAL_EVERY_N_BATCHES == 0:
            small_test()

Epoch: 0 >> Status: 6336/29281 >> Loss: 0.9172216057777405

BLEU Score: 0.4724988605397558

Epoch: 0 >> Status: 12736/29281 >> Loss: 0.9279507398605347

BLEU Score: 0.48221586559038104

Epoch: 0 >> Status: 19136/29281 >> Loss: 0.9116573929786682

BLEU Score: 0.48318973653939395

Epoch: 0 >> Status: 25536/29281 >> Loss: 0.8010704517364502

BLEU Score: 0.49410303340931244

Epoch: 0 >> Status: 29248/29281 >> Loss: 0.8818286657333374
Epoch Complete...
Epoch: 1 >> Status: 6336/29281 >> Loss: 0.8470025658607483

BLEU Score: 0.4965431799164603

Epoch: 1 >> Status: 12736/29281 >> Loss: 0.8599395155906677

BLEU Score: 0.5058346841913631

Epoch: 1 >> Status: 19136/29281 >> Loss: 0.8254892230033875

BLEU Score: 0.5091217964978457

Epoch: 1 >> Status: 25536/29281 >> Loss: 0.7479752898216248

BLEU Score: 0.5149384027416605

Epoch: 1 >> Status: 29248/29281 >> Loss: 0.8322769403457642
Epoch Complete...
Epoch: 2 >> Status: 6336/29281 >> Loss: 0.7693257331848145

BLEU Score: 0.5230378639120795

Epoch: 

#### Performance Evaluation using BLEU scores

### Let's see some real transliteration examples now!

In [49]:
def translate(s):
    """Transliterate a single English-script word with the trained model.

    The inference graph needs BATCH_SIZE rows, so the encoded word is placed
    in row 0 of an otherwise all-zero batch (remaining rows have length 0)
    and only the first prediction is decoded.

    Args:
        s: input word; encoded with en_lang and padded/truncated to
            MAX_SEQ_LEN ids.

    Returns:
        The greedy-decoded Hindi string for ``s``.
    """
    length, ids = en_lang.encodeSentence2(s, MAX_SEQ_LEN)

    batch_ids = np.zeros((BATCH_SIZE, MAX_SEQ_LEN), dtype=np.int32)
    batch_lens = np.zeros((BATCH_SIZE,), dtype=np.int32)
    batch_ids[0, :] = ids
    batch_lens[0] = length

    # keep_prob = 1.0 switches dropout off for inference.
    sample_ids = sess.run(
        infer_output[0].sample_id,
        feed_dict={
            input_ids: batch_ids,
            input_lens: batch_lens,
            keep_prob: 1.0,
        },
    )
    return hi_lang.decodeSentence(list(sample_ids[0]))

In [63]:
translate("gambhir")

'समा'

### Bonus Resources
Last but not least, learn PyTorch as well.