### Download Data

In [1]:
# !mkdir data
# !wget "http://www.cfilt.iitb.ac.in/iitb_parallel/iitb_corpus_download/parallel.tgz" -P data
# !tar -xf data/parallel.tgz -C data/

In [2]:
%env CUDA_VISIBLE_DEVICES=0

env: CUDA_VISIBLE_DEVICES=0


In [8]:
import nltk
from collections import Counter
from tqdm import tqdm_notebook
import numpy as np
import tensorflow as tf
from tensorflow.contrib import seq2seq
from tensorflow.contrib.rnn import DropoutWrapper
import random

In [4]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/bishal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
VE = 10000
VH = 10000

In [6]:
MAX_SEQ_LEN = 50
BATCH_SIZE = 64

In [6]:
class Lang:
    def __init__(self, counter, vocab_size):
        self.word2id = {}
        self.id2word = {}
        self.pad = "<PAD>"
        self.sos = "<SOS>"
        self.eos = "<EOS>"
        self.unk = "<UNK>"
        
        self.ipad = 0
        self.isos = 1
        self.ieos = 2
        self.iunk = 3
        
        self.word2id[self.pad] = 0
        self.word2id[self.sos] = 1
        self.word2id[self.eos] = 2
        self.word2id[self.unk] = 3
        
        self.id2word[0] = self.pad
        self.id2word[1] = self.sos
        self.id2word[2] = self.eos
        self.id2word[3] = self.unk
        
        curr_id = 4
        for w, c in counter.most_common(vocab_size):
            self.word2id[w] = curr_id
            self.id2word[curr_id] = w
            curr_id += 1
            
    def encodeSentence(self, s, max_len=-1):
        wseq = nltk.tokenize.word_tokenize(s.lower().strip())
        if max_len == -1:
            return [self.word2id[w] if w in self.word2id else self.iunk for w in wseq]
        else:
            return ([self.word2id[w] if w in self.word2id else self.iunk for w in wseq] + [self.ieos] + [self.ipad]*max_len)[:max_len]
        
    def encodeSentence2(self, s, max_len=-1):
        wseq = nltk.tokenize.word_tokenize(s.lower().strip()) 
        return min(max_len, len(wseq)), \
            ([self.word2id[w] if w in self.word2id else self.iunk for w in wseq] + \
                [self.ieos] + [self.ipad]*max_len)[:max_len]
    
    def decodeSentence(self, id_seq):
        id_seq = np.array(id_seq + [self.ieos])
        j = np.argmax(id_seq==self.ieos)
        s = ' '.join([self.id2word[x] for x in id_seq[:j]])
        s = s.replace(self.unk, "UNK")
        return s

In [7]:
N = 200000
# N = 1561840

In [8]:
hi_counter = Counter()
hi_sentences=[]
with open("data/parallel/IITB.en-hi.hi") as f:
    for line in tqdm_notebook(f, total=1561840, desc="Reading file:"):
        hi_sentences.append(line.strip())
    for line in tqdm_notebook(hi_sentences[:N], desc="Processing inputs:"):
        # hi_sentences.append(line.strip())
        for w in nltk.tokenize.word_tokenize(line.strip()):
            hi_counter[w] += 1

HBox(children=(IntProgress(value=0, description='Reading file:', max=1561840, style=ProgressStyle(description_…




HBox(children=(IntProgress(value=0, description='Processing inputs:', max=200000, style=ProgressStyle(descript…




In [9]:
hi_counter.most_common(10)

[('.', 36555),
 ('(', 35172),
 (')', 35134),
 ('के', 25558),
 ('%', 24045),
 ('है', 23951),
 ('_', 22906),
 ('में', 19818),
 ('को', 17586),
 ('करें', 17529)]

In [10]:
print(len(hi_counter))

37175


In [11]:
en_counter = Counter()
en_sentences=[]
with open("data/parallel/IITB.en-hi.en") as f:
    for line in tqdm_notebook(f, total=1561840, desc="Reading file:"):
        en_sentences.append(line.strip())
    for line in tqdm_notebook(en_sentences[:N], desc="Processing inputs:"):
        # en_sentences.append(line.strip())
        for w in nltk.tokenize.word_tokenize(line.strip()):
            en_counter[w] += 1

HBox(children=(IntProgress(value=0, description='Reading file:', max=1561840, style=ProgressStyle(description_…




HBox(children=(IntProgress(value=0, description='Processing inputs:', max=200000, style=ProgressStyle(descript…




In [12]:
en_counter.most_common(10)

[('the', 34438),
 ('.', 30741),
 ('%', 25035),
 ('to', 24884),
 ('_', 24232),
 (':', 22777),
 ('s', 15453),
 ('a', 12644),
 ('not', 12292),
 ('of', 12162)]

In [13]:
print(len(en_counter))

28916


In [14]:
en_lang = Lang(en_counter, VE)
hi_lang = Lang(hi_counter, VH)

In [15]:
(en_lang.encodeSentence("How are you?"))

[1017, 59, 24, 38]

In [16]:
en_lang.decodeSentence(en_lang.encodeSentence("How are you?", 10))

'how are you ?'

In [17]:
(hi_lang.encodeSentence("आप कैसे है?", 10))

[33, 1221, 9, 52, 2, 0, 0, 0, 0, 0]

In [18]:
hi_lang.decodeSentence((hi_lang.encodeSentence("आप कैसे है?", 10)))

'आप कैसे है ?'

### Load Pre-trained word vectors (word2vec, glove, fasttext etc.)

### The Seq2Seq architecture

#### Debugging Tip: Always keep track of tensor dimensions!

#### Word Embedding Matrix

In [19]:
en_word_emb_matrix = tf.get_variable("en_word_emb_matrix", (len(en_lang.word2id), 300), dtype=tf.float32)
hi_word_emb_matrix = tf.get_variable("hi_word_emb_matrix", (len(hi_lang.word2id), 300), dtype=tf.float32)

Instructions for updating:
Colocations handled automatically by placer.


In [20]:
keep_prob = tf.placeholder(tf.float32)

In [21]:
input_ids = tf.placeholder(tf.int32, (None, MAX_SEQ_LEN))
input_lens = tf.placeholder(tf.int32, (None, ))

In [22]:
ph_target_ids = tf.placeholder(tf.int32, (None, MAX_SEQ_LEN))
target_lens = tf.placeholder(tf.int32, (None, ))

#### Add SOS or GO symbol

#### Tensorflow Graphs

In [23]:
target_ids = tf.concat([tf.fill([BATCH_SIZE,1], hi_lang.isos), ph_target_ids], -1)

In [24]:
input_emb = tf.nn.embedding_lookup(en_word_emb_matrix, input_ids)
target_emb = tf.nn.embedding_lookup(hi_word_emb_matrix, target_ids[:, :-1])

In [25]:
input_emb.shape

TensorShape([Dimension(None), Dimension(50), Dimension(300)])

#### Encoder

##### RNN Units

In [26]:
encoder_cell = tf.nn.rnn_cell.GRUCell(128)
encoder_cell = DropoutWrapper(encoder_cell, output_keep_prob=keep_prob)

Instructions for updating:
This class is equivalent as tf.keras.layers.GRUCell, and will be replaced by that in Tensorflow 2.0.


In [27]:
enc_outputs, enc_state = tf.nn.dynamic_rnn(
    encoder_cell, input_emb, sequence_length=input_lens, initial_state=encoder_cell.zero_state(BATCH_SIZE, dtype=tf.float32)
)

Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Use tf.cast instead.


In [28]:
enc_outputs.shape

TensorShape([Dimension(64), Dimension(50), Dimension(128)])

In [29]:
enc_state.shape

TensorShape([Dimension(64), Dimension(128)])

#### Decoder

In [30]:
decoder_cell = tf.nn.rnn_cell.GRUCell(128)

In [31]:
decoder_cell = DropoutWrapper(decoder_cell, output_keep_prob=keep_prob)

In [32]:
output_projection = tf.layers.Dense(len(hi_lang.word2id))

#### Decoder Training Helper

In [33]:
helper = seq2seq.TrainingHelper(target_emb, target_lens)

In [34]:
decoder = seq2seq.BasicDecoder(decoder_cell, helper, enc_state, output_projection)

In [35]:
outputs, _, outputs_lens = seq2seq.dynamic_decode(decoder, maximum_iterations=MAX_SEQ_LEN, impute_finished=False, swap_memory=True)

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [36]:
output_max_len = tf.reduce_max(outputs_lens)

#### And Decoder Inference Helper

In [37]:
enc_state

<tf.Tensor 'rnn/while/Exit_3:0' shape=(64, 128) dtype=float32>

In [38]:
# Using the decoder_cell without dropout here.
infer_helper = seq2seq.GreedyEmbeddingHelper(hi_word_emb_matrix, tf.fill([BATCH_SIZE, ], hi_lang.isos), hi_lang.ieos)

In [39]:
infer_decoder = seq2seq.BasicDecoder(decoder_cell, infer_helper, enc_state, output_projection)

In [40]:
infer_output = seq2seq.dynamic_decode(infer_decoder, maximum_iterations=MAX_SEQ_LEN, swap_memory=True)

#### Loss and Optimizers

In [41]:
masks = tf.sequence_mask(target_lens, output_max_len, dtype=tf.float32, name='masks')

In [42]:
# Loss function - weighted softmax cross entropy
cost = tf.contrib.seq2seq.sequence_loss(
    outputs[0],
    target_ids[:, 1:(output_max_len + 1)],
    masks)

# Optimizer
optimizer = tf.train.AdamOptimizer(0.0001)

# Gradient Clipping
# gradients = optimizer.compute_gradients(cost)
# capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
# train_op = optimizer.apply_gradients(capped_gradients)

In [43]:
train_op = optimizer.minimize(cost)

#### Tensorflow Sessions

In [44]:
sess_config = tf.ConfigProto()
sess_config.gpu_options.allow_growth = True

In [45]:
init = tf.global_variables_initializer()

In [46]:
sess = tf.InteractiveSession(config=sess_config)

In [47]:
sess.run(init)

#### Minibatch Training

In [48]:
random.seed(41)

In [49]:
parallel = list(zip(en_sentences, hi_sentences))

In [50]:
random.shuffle(parallel)

In [51]:
parallel[0]

('British Pound Sterling', 'ब्रिटिश पोंड स्टर्लिंगName')

In [52]:
# train_n = int(0.95*N)
# valid_n = N - train_n
valid_n = 1000
train_n = N - valid_n
# valid_n = N - train_n

In [53]:
train_pairs = parallel[:train_n]
valid_pairs = parallel[train_n:]

In [59]:
def small_test():
    all_bleu = []
    smoothing = nltk.translate.bleu_score.SmoothingFunction().method7
    for m in range(0, valid_n, BATCH_SIZE):
        # print(f"Status: {m}/{N}", end='\r')
        n = m + BATCH_SIZE
        if n > N:
            # print("Epoch Complete...")
            break

        input_batch = np.zeros((BATCH_SIZE, MAX_SEQ_LEN), dtype=np.int32)
        input_lens_batch = np.zeros((BATCH_SIZE,), dtype=np.int32)
        for i in range(m, n):
            b,a = en_lang.encodeSentence2(valid_pairs[i][0], MAX_SEQ_LEN)
            input_batch[i-m,:] = a
            input_lens_batch[i-m] = b

    #     target_batch = np.zeros((BATCH_SIZE, MAX_SEQ_LEN), dtype=np.int32)
    #     target_lens_batch = np.zeros((BATCH_SIZE,), dtype=np.int32)
    #     for i in range(m, n):
    #         b,a = hi_lang.encodeSentence2(valid_pairs[i][1], MAX_SEQ_LEN)
    #         target_batch[i-m,:] = a
    #         target_lens_batch[i-m] = b

        feed_dict={
            input_ids: input_batch,
            input_lens: input_lens_batch,
            #target_ids: target_batch,
            #target_lens: target_lens_batch,
            keep_prob: 1.0
        }
        pred_batch = sess.run(infer_output[0].sample_id, feed_dict=feed_dict)
        for k, pred_ in enumerate(pred_batch):
            pred_s = hi_lang.decodeSentence(list(pred_))
            ref = valid_pairs[m+k][1]
            _bx = nltk.translate.bleu_score.sentence_bleu(
                [nltk.tokenize.word_tokenize(ref)],
                nltk.tokenize.word_tokenize(pred_s),
                weights=(1/3,1/3,1/3),
                smoothing_function=smoothing)
            all_bleu.append(_bx)

    print(f"\n\nBLEU Score: {np.mean(all_bleu)}\n")

In [55]:
for _e in range(10):
    for m in range(0, train_n, BATCH_SIZE):
        n = m + BATCH_SIZE
        if n > train_n:
            print("\nEpoch Complete...")
            break

        input_batch = np.zeros((BATCH_SIZE, MAX_SEQ_LEN), dtype=np.int32)
        input_lens_batch = np.zeros((BATCH_SIZE,), dtype=np.int32)
        for i in range(m, n):
            b,a = en_lang.encodeSentence2(train_pairs[i][0], MAX_SEQ_LEN)
            input_batch[i-m,:] = a
            input_lens_batch[i-m] = b

        target_batch = np.zeros((BATCH_SIZE, MAX_SEQ_LEN), dtype=np.int32)
        target_lens_batch = np.zeros((BATCH_SIZE,), dtype=np.int32)
        for i in range(m, n):
            b,a = hi_lang.encodeSentence2(train_pairs[i][1], MAX_SEQ_LEN)
            target_batch[i-m,:] = a
            target_lens_batch[i-m] = b

        feed_dict={
            input_ids: input_batch,
            input_lens: input_lens_batch,
            ph_target_ids: target_batch,
            target_lens: target_lens_batch,
            keep_prob: 0.8
        }
        sess.run(train_op, feed_dict=feed_dict)
        batch_loss = sess.run(cost, feed_dict=feed_dict)
        print(f"Epoch: {_e} >> Status: {n}/{train_n} >> Loss: {batch_loss}", end="\r")
        if (1 + n//BATCH_SIZE) % 100 == 0:
            small_test()

Status: 6336/199000 Loss: 7.4355077743530275

BLEU Score: 0.0

Status: 12736/199000 Loss: 5.6901521682739265

BLEU Score: 0.0

Status: 19136/199000 Loss: 5.2087216377258335

BLEU Score: 0.0

Status: 25536/199000 Loss: 5.2628655433654785

BLEU Score: 0.0

Status: 31936/199000 Loss: 4.9873642921447755

BLEU Score: 0.0

Status: 38336/199000 Loss: 5.0791559219360355

BLEU Score: 0.0

Status: 44736/199000 Loss: 4.7777185440063485

BLEU Score: 0.0

Status: 51136/199000 Loss: 4.9013814926147465

BLEU Score: 0.0

Status: 57536/199000 Loss: 5.1752510070800785

BLEU Score: 0.0

Status: 63936/199000 Loss: 5.0818676948547365

BLEU Score: 0.0

Status: 70336/199000 Loss: 5.1289081573486335

BLEU Score: 0.0

Status: 76736/199000 Loss: 4.9690222740173345

BLEU Score: 0.0

Status: 83136/199000 Loss: 4.8674006462097175

BLEU Score: 0.0

Status: 89536/199000 Loss: 5.0300726890563965

BLEU Score: 0.0

Status: 95936/199000 Loss: 4.8781161308288575

BLEU Score: 0.0

Status: 102336/199000 Loss: 4.97684335708

Status: 70336/199000 Loss: 4.2576661109924323

BLEU Score: 0.04365472316659568

Status: 76736/199000 Loss: 4.1121530532836915

BLEU Score: 0.04291873508898363

Status: 83136/199000 Loss: 3.9832925796508795

BLEU Score: 0.03569969387381602

Status: 89536/199000 Loss: 4.1267757415771485

BLEU Score: 0.04013472255860014

Status: 95936/199000 Loss: 4.0549044609069825

BLEU Score: 0.044543682686717864

Status: 102336/199000 Loss: 4.1538600921630866

BLEU Score: 0.03858414060376789

Status: 108736/199000 Loss: 4.2430262565612793

BLEU Score: 0.03636673210966177

Status: 115136/199000 Loss: 4.1926059722900392

BLEU Score: 0.03151326999504446

Status: 121536/199000 Loss: 4.3281850814819344

BLEU Score: 0.04965283974008179

Status: 127936/199000 Loss: 4.2653760910034185

BLEU Score: 0.042909629082618306

Status: 134336/199000 Loss: 4.0166168212890625

BLEU Score: 0.04194838987485026

Status: 140736/199000 Loss: 4.0873599052429265

BLEU Score: 0.0413248521793542

Status: 147136/199000 Loss: 4.25

Status: 147136/199000 Loss: 3.9467592239379883

BLEU Score: 0.07637246289450834

Status: 153536/199000 Loss: 3.6941242218017585

BLEU Score: 0.07578117755415989

Status: 159936/199000 Loss: 3.9460370540618896

BLEU Score: 0.07325970330722577

Status: 166336/199000 Loss: 3.6321368217468263

BLEU Score: 0.07349705639983835

Status: 172736/199000 Loss: 3.7790086269378663

BLEU Score: 0.07698153677829254

Status: 179136/199000 Loss: 3.7270412445068365

BLEU Score: 0.07593955473255308

Status: 185536/199000 Loss: 3.7541911602020264

BLEU Score: 0.07297186939235496

Status: 191936/199000 Loss: 3.8142096996307373

BLEU Score: 0.07592772760197977

Status: 198336/199000 Loss: 3.7607498168945312

BLEU Score: 0.07704379341765603

Status: 198976/199000 Loss: 3.5271446704864552
Epoch Complete...


In [60]:
small_test()



BLEU Score: 0.08219526058944544



In [68]:
hi_lang.id2word[9758]

'शुभारंभ'

#### Performance Evaluation using BLEU scores

### Let's see some real translation examples now!

In [61]:
def translate(s):
    input_batch = np.zeros((BATCH_SIZE, MAX_SEQ_LEN), dtype=np.int32)
    input_lens_batch = np.zeros((BATCH_SIZE,), dtype=np.int32)
    b,a = en_lang.encodeSentence2(s, MAX_SEQ_LEN)
    input_batch[0, :] = a
    input_lens_batch[0] = b
    
    feed_dict={
        input_ids: input_batch,
        input_lens: input_lens_batch,
        #target_ids: target_batch,
        #target_lens: target_lens_batch,
        keep_prob: 1.0
    }
    pred_batch = sess.run(infer_output[0].sample_id, feed_dict=feed_dict)
    pred_ = pred_batch[0]
    pred_s = hi_lang.decodeSentence(list(pred_))
    # ref = valid_pairs[m+k][1]
    return pred_s

In [66]:
translate("Please fill in the form.")

'UNK के लिए UNK UNK के लिए UNK UNK . UNK . UNK . UNK . UNK . UNK . UNK . UNK . UNK . UNK . UNK . UNK . UNK . UNK . UNK . UNK . UNK . UNK . UNK . UNK . UNK .'

### Bonus Resources
Last but not the least, learn PyTorch also.