### Importing Libraries

In [None]:
import re
import nltk
import pandas as pd
import os
from tqdm import tqdm
from bs4 import BeautifulSoup
import numpy as np
import seaborn as sns
import math
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
import tensorflow.keras as keras
import tensorflow as tf
from sklearn.model_selection import train_test_split

### Installing bpemb

In [None]:
!pip install bpemb

Collecting bpemb
  Downloading bpemb-0.3.3-py3-none-any.whl (19 kB)
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 4.3 MB/s 
Installing collected packages: sentencepiece, bpemb
Successfully installed bpemb-0.3.3 sentencepiece-0.1.96


### Reading the Preprocessed csv file

In [None]:
# Load the preprocessed training pairs from Google Drive (the path assumes Drive is
# already mounted at /content/drive). The preview in the next cell shows two leftover
# index columns ("Unnamed: 0.1", "Unnamed: 0") next to input / output / decoder_output.
input_output =pd.read_csv('/content/drive/MyDrive/converted_for_training_1M.csv')

In [None]:
# Preview the raw rows: input and output are wrapped in <start> ... <end>, while
# decoder_output only carries a trailing <end>.
input_output.head()

Unnamed: 0.1,Unnamed: 0,input,output,decoder_output
0,0,<start> website definitely <end>,<start> read at least the 2nd to last paragrap...,read at least the 2nd to last paragraph in the...
1,1,<start> website definitely read <end>,<start> at least the 2nd to last paragraph in ...,at least the 2nd to last paragraph in the pdf ...
2,2,<start> website definitely read at <end>,<start> least the 2nd to last paragraph in the...,least the 2nd to last paragraph in the pdf <end>
3,3,<start> website definitely read at least <end>,<start> the 2nd to last paragraph in the pdf <...,the 2nd to last paragraph in the pdf <end>
4,4,<start> website definitely read at least the <...,<start> 2nd to last paragraph in the pdf <end>,2nd to last paragraph in the pdf <end>


### Removing the start and end tags

In [None]:
def remove_start_end_input(str):
  """Return the text with its first and last space-separated tokens removed.

  Used to strip the leading ``<start>`` and trailing ``<end>`` markers. A text
  with fewer than three tokens comes back as an empty string.
  """
  # NOTE: the parameter name shadows the built-in ``str``; it is kept for callers.
  tokens = str.split(' ')
  inner_tokens = tokens[1:-1]
  return ' '.join(inner_tokens)

def remove_start_output(str):
  """Return the text with its last space-separated token removed.

  Despite the name, nothing is removed from the front: only the final token
  (the trailing ``<end>`` marker of the ``decoder_output`` column) is dropped.
  """
  # str.split always yields at least one item, so the starred unpacking is safe.
  *kept_tokens, _ = str.split(' ')
  return ' '.join(kept_tokens)

In [None]:
# Strip the textual marker tokens; the BPE encoder adds its own boundary ids further down.
# NOTE: this cell overwrites the columns in place and is not idempotent -- running it a
# second time removes another first/last token from every row.
input_output['input'] = input_output['input'].transform(remove_start_end_input)  # drop first and last token
input_output['output'] = input_output['output'].transform(remove_start_end_input)  # drop first and last token
input_output['decoder_output'] = input_output['decoder_output'].transform(remove_start_output)  # drop last token only

In [None]:
# Confirm the <start>/<end> markers are gone; output and decoder_output now hold the same text.
input_output.head()

Unnamed: 0.1,Unnamed: 0,input,output,decoder_output
0,0,website definitely,read at least the 2nd to last paragraph in the...,read at least the 2nd to last paragraph in the...
1,1,website definitely read,at least the 2nd to last paragraph in the pdf,at least the 2nd to last paragraph in the pdf
2,2,website definitely read at,least the 2nd to last paragraph in the pdf,least the 2nd to last paragraph in the pdf
3,3,website definitely read at least,the 2nd to last paragraph in the pdf,the 2nd to last paragraph in the pdf
4,4,website definitely read at least the,2nd to last paragraph in the pdf,2nd to last paragraph in the pdf


In [None]:
from bpemb import BPEmb

In [None]:
# Load the pretrained English BPEmb model with a 50,000-subword vocabulary. On first use
# it downloads the subword model and the embedding archive (see the log below).
bpemb_en = BPEmb(lang="en",vs=50000)

downloading https://nlp.h-its.org/bpemb/en/en.wiki.bpe.vs50000.model


100%|██████████| 1100587/1100587 [00:01<00:00, 711358.47B/s]


downloading https://nlp.h-its.org/bpemb/en/en.wiki.bpe.vs50000.d100.w2v.bin.tar.gz


100%|██████████| 18972246/18972246 [00:03<00:00, 5652398.27B/s]


In [None]:
# Embedding matrix shape: one row per subword id, one column per embedding dimension.
bpemb_en.vectors.shape

(50000, 100)

### Creating the Encoder input and Decoder input and output

In [None]:
# Pull the three text columns out as NumPy arrays. `input` feeds the encoder; `output`
# and `decoder_output` hold the same continuation text at this point (see the preview
# above) and become the decoder's input and its target respectively.
encoder_input = np.array(input_output['input'])
decoder_input = np.array(input_output['output'])
decoder_target = np.array(input_output['decoder_output'])

In [None]:
# Split the three aligned arrays in a single call so every row keeps its encoder input,
# decoder input and decoder target together; 30% of the rows are held out for testing.
# random_state is fixed so the split -- and therefore the test perplexity reported at the
# end of the notebook -- is the same on every Restart & Run All.
# NOTE(review): consecutive CSV rows look like growing prefixes of the same sentence (see
# the preview above), so a row-level shuffle may put near-duplicates on both sides of the
# split -- confirm whether a sentence-level split is needed.
(encoder_input_train, encoder_input_test,
 decoder_input_train, decoder_input_test,
 decoder_target_train, decoder_target_test) = train_test_split(
    encoder_input, decoder_input, decoder_target,
    test_size=0.3, random_state=42)

In [None]:
# Convert every text to a sequence of BPE subword ids. The same names are rebound, so the
# raw strings are replaced by id sequences and this cell must not be run twice.
# Encoder and decoder inputs get both boundary ids (1 ... 2 in the samples printed below);
# decoder targets get only the closing id, which makes each target its decoder input
# shifted left by one position.
encoder_input_train = bpemb_en.encode_ids_with_bos_eos(encoder_input_train)
encoder_input_test = bpemb_en.encode_ids_with_bos_eos(encoder_input_test)
decoder_input_train = bpemb_en.encode_ids_with_bos_eos(decoder_input_train)
decoder_input_test = bpemb_en.encode_ids_with_bos_eos(decoder_input_test)
decoder_target_train = bpemb_en.encode_ids_with_eos(decoder_target_train)
decoder_target_test = bpemb_en.encode_ids_with_eos(decoder_target_test)

In [None]:
# Sanity check: the three arrays must have matching train sizes and matching test sizes.
print(encoder_input_train.shape, encoder_input_test.shape)
print(decoder_input_train.shape, decoder_input_test.shape)
print(decoder_target_train.shape, decoder_target_test.shape)

(700000,) (300000,)
(700000,) (300000,)
(700000,) (300000,)


In [None]:
# Show the first encoded example of each array. Inputs start with id 1 and end with id 2;
# targets carry the same ids as the matching decoder input minus the leading 1.
print(encoder_input_train[0])
print(encoder_input_test[0])
print(decoder_input_train[0])
print(decoder_input_test[0])
print(decoder_target_train[0])
print(decoder_target_test[0])

[1, 4, 4538, 2]
[1, 774, 26, 6663, 6663, 1964, 35933, 2]
[1, 32, 774, 15725, 71, 146, 963, 2]
[1, 663, 515, 2780, 39617, 49934, 20915, 1645, 32478, 3393, 3589, 774, 24, 2]
[32, 774, 15725, 71, 146, 963, 2]
[663, 515, 2780, 39617, 49934, 20915, 1645, 32478, 3393, 3589, 774, 24, 2]


In [None]:
def max_length(t):
    """Return the length of the longest sequence in ``t``.

    Raises ``ValueError`` when ``t`` is empty, because ``max`` receives no items.
    """
    return max(map(len, t))

# Padded widths are taken from the TRAINING split only: the longest encoder input and the
# longest decoder input.
max_length_in = max_length(encoder_input_train)
max_length_out = max_length(decoder_input_train)

# Right-pad (padding="post") every training sequence to a fixed width. Targets use the
# decoder-input width so input and target line up position by position.
encoder_input_train = keras.preprocessing.sequence.pad_sequences(encoder_input_train, maxlen=max_length_in, padding="post")
decoder_input_train = keras.preprocessing.sequence.pad_sequences(decoder_input_train, maxlen=max_length_out, padding="post")
decoder_target_train = keras.preprocessing.sequence.pad_sequences(decoder_target_train, maxlen=max_length_out, padding="post")

# The test split reuses the training widths.
# NOTE(review): a test sequence longer than the training maximum would be cut to maxlen;
# pad_sequences presumably truncates from the front by default -- confirm, since that
# would drop the leading boundary id.
encoder_input_test = keras.preprocessing.sequence.pad_sequences(encoder_input_test, maxlen=max_length_in, padding="post")
decoder_input_test = keras.preprocessing.sequence.pad_sequences(decoder_input_test, maxlen=max_length_out, padding="post")
decoder_target_test = keras.preprocessing.sequence.pad_sequences(decoder_target_test, maxlen=max_length_out, padding="post")

In [None]:
# Report the padded widths (encoder, decoder) and set the training hyperparameters.
print(max_length_in, max_length_out)
gru_dim = 256  # units per encoder GRU direction; the decoder GRU is built with gru_dim*2
batch_size = 128  # mini-batch size passed to model.fit

149 146


### Creating the Encoder - Decoder Model

In [None]:
from tensorflow.keras.layers import Input, Embedding, GRU, Bidirectional, Dense, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.callbacks import TerminateOnNaN
import datetime

In [None]:
# Encoder layers.
# NOTE: predict_trail (inference cell) addresses the layers of the saved model by numeric
# position, so adding, removing or reordering layers in this cell requires updating the
# indices used there.

# Fixed-width encoder input of subword ids.
input_encoder = Input(shape = (max_length_in,))
# Embedding initialised from the pretrained BPEmb vectors and frozen (trainable=False).
embedding_encoder = Embedding(bpemb_en.vectors.shape[0],100,embeddings_initializer = keras.initializers.Constant(bpemb_en.vectors), input_length=max_length_in,trainable=False)
# Bidirectional GRU that returns the full output sequence plus one final state per direction.
gru_en = GRU(units = gru_dim, return_sequences = True, return_state = True)
gru_bi_encoder = Bidirectional(gru_en)

# Decoder layers.
# Variable-length input (shape=(None,)) so the same layers can be stepped token by token.
input_decoder = Input(shape=(None,))
# Separate embedding layer, also initialised from the BPEmb vectors and frozen.
embedding_decoder = Embedding(bpemb_en.vectors.shape[0],100,embeddings_initializer = keras.initializers.Constant(bpemb_en.vectors), input_length=max_length_out,trainable=False)
# gru_dim*2 units so the state size matches the concatenated forward+backward encoder state.
gru_de = GRU(units = gru_dim*2, return_sequences = True, return_state = True)


# Wire the layers into a graph.
# Encoder flow: ids -> embeddings -> bidirectional GRU -> concatenated final states.
input_en = input_encoder
embedding_en = embedding_encoder(input_en)
# encoder_out (the per-step outputs) is not used again in this cell; only the states are.
encoder_out, forward_state, backward_state = gru_bi_encoder(embedding_en)
state_h = keras.layers.Concatenate()([forward_state, backward_state])

# Decoder flow: ids -> embeddings -> GRU seeded with the encoder state -> dropout ->
# dense(relu) -> dropout -> softmax over the whole subword vocabulary.
input_de = input_decoder
embedding_de = embedding_decoder(input_de)
decoder_out, _ = gru_de(embedding_de, initial_state=state_h)
dropout1 = Dropout(0.2)(decoder_out)
decoder_dense_1 = keras.layers.Dense(128, activation="relu")(dropout1)
dropout2 = Dropout(0.2)(decoder_dense_1)
decoder_dense_output = keras.layers.Dense(bpemb_en.vectors.shape[0], activation="softmax")(dropout2)



In [None]:
# ModelCheckpoint: save the model whenever val_perplexity reaches a new minimum
# (save_best_only=True, mode='min'); the epoch number and the metric value are formatted
# into the file name.
filepath="/content/drive/MyDrive/Best_Model_L1_revised/weights-{epoch:02d}-{val_perplexity:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath=filepath, monitor='val_perplexity',  verbose=1, save_best_only=True, mode='min')

# EarlyStopping: stop once val_perplexity has not improved for 5 consecutive epochs.
# NOTE: this callback is created here but is not included in the callbacks list passed
# to model.fit in the training cell, so it currently has no effect.
earlystop = EarlyStopping(monitor='val_perplexity', patience=5, verbose=15,mode='min')

# TensorBoard: write logs (histograms every epoch, plus the graph) to a timestamped directory.
# NOTE(review): write_grads looks like a TF1-era option -- confirm it has any effect in
# the TensorFlow version used here.
log_dir="/content/drive/MyDrive/logs_revised/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = TensorBoard(log_dir=log_dir,histogram_freq=1, write_graph=True,write_grads=True)

# TerminateOnNaN: abort training as soon as the loss becomes NaN.
TerminateWhenLossNaN = TerminateOnNaN()



In [None]:
def perplexity(y_true, y_pred):
    """Keras metric: exponential of the mean sparse categorical cross-entropy.

    The mean runs over every element of the per-token loss tensor; no padding
    mask is applied here, so all positions of ``y_true`` contribute equally.

    Args:
        y_true: integer class ids.
        y_pred: predicted class probabilities.

    Returns:
        A scalar tensor, ``exp(mean(cross_entropy))``.
    """
    K = keras.backend
    token_losses = K.sparse_categorical_crossentropy(y_true, y_pred)
    return K.exp(K.mean(token_losses))

In [None]:
# Define the model that uses the Encoder and the Decoder
# Training model: maps [encoder ids, decoder ids] to a softmax distribution over the
# subword vocabulary at every decoder position.
model = keras.models.Model([input_encoder, input_decoder], decoder_dense_output)


# Sparse cross-entropy lets the targets stay as integer ids; perplexity is tracked as a
# metric so the checkpoint callback can monitor val_perplexity.
model.compile(optimizer='adam', loss="sparse_categorical_crossentropy", metrics=[perplexity])
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 149)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 149, 100)     5000000     ['input_1[0][0]']                
                                                                                                  
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 bidirectional (Bidirectional)  [(None, 149, 512),   549888      ['embedding[0][0]']              
                                 (None, 256),                                                 

In [None]:
# Train for up to 100 epochs on the first GPU. validation_split=0.2 makes Keras hold out
# 20% of the training arrays, which produces the val_perplexity the checkpoint monitors.
# NOTE(review): `earlystop` is defined in the callbacks cell but is not in the list
# below, so nothing stops training on a validation plateau -- confirm that is intended.
epochs = 100
with tf.device('/device:GPU:0'):
  model.fit([encoder_input_train, decoder_input_train], decoder_target_train,
                 batch_size=batch_size,
                 epochs=epochs,
                 validation_split=0.2
                 ,callbacks = [checkpoint,TerminateWhenLossNaN,tensorboard_callback])

Epoch 1/100
Epoch 1: val_perplexity improved from inf to 1.28693, saving model to /content/drive/MyDrive/Best_Model_L1_revised/weights-01-1.2869.hdf5
Epoch 2/100
Epoch 2: val_perplexity improved from 1.28693 to 1.24087, saving model to /content/drive/MyDrive/Best_Model_L1_revised/weights-02-1.2409.hdf5
Epoch 3/100
Epoch 3: val_perplexity improved from 1.24087 to 1.21848, saving model to /content/drive/MyDrive/Best_Model_L1_revised/weights-03-1.2185.hdf5
Epoch 4/100
Epoch 4: val_perplexity improved from 1.21848 to 1.20749, saving model to /content/drive/MyDrive/Best_Model_L1_revised/weights-04-1.2075.hdf5
Epoch 5/100
Epoch 5: val_perplexity improved from 1.20749 to 1.20011, saving model to /content/drive/MyDrive/Best_Model_L1_revised/weights-05-1.2001.hdf5
Epoch 6/100
Epoch 6: val_perplexity improved from 1.20011 to 1.19606, saving model to /content/drive/MyDrive/Best_Model_L1_revised/weights-06-1.1961.hdf5
Epoch 7/100
Epoch 7: val_perplexity improved from 1.19606 to 1.19229, saving mod

### Loading the best found model

In [None]:
# Reload a checkpoint written by the ModelCheckpoint callback (the file name encodes epoch
# 42 and val_perplexity 1.1737). The custom `perplexity` metric has to be supplied through
# custom_objects so Keras can rebuild the compiled model.
new_model = tf.keras.models.load_model('/content/drive/MyDrive/Best_Model_L1_revised/weights-42-1.1737.hdf5',custom_objects = {'perplexity':perplexity})

# Print the architecture to check it matches the model defined above.
new_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 149)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 149, 100)     5000000     ['input_1[0][0]']                
                                                                                                  
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 bidirectional (Bidirectional)  [(None, 149, 512),   549888      ['embedding[0][0]']              
                                 (None, 256),                                                 

### Inference Function

In [None]:
def predict_trail(input_sentence):
  """Greedily decode a continuation of ``input_sentence`` with ``new_model``.

  The sentence is BPE-encoded with boundary ids, zero-padded to the encoder
  width and passed through the encoder once. The decoder is then stepped one
  token at a time, feeding back the arg-max token and the GRU state, until it
  emits the end-of-sequence id or the padding id, or the step limit is hit.

  Layers are addressed by their position in ``new_model.layers``; the two
  Dropout layers are skipped because this is inference.

  Args:
    input_sentence: the raw text prompt.

  Returns:
    list[int]: the generated subword ids. The terminating id is NOT included,
    so decoding the result no longer ends in a stray unknown-token glyph.
  """
  # Ids as they appear in the encoded samples printed earlier: 1 opens a sequence,
  # 2 closes it, and 0 is the value pad_sequences fills with.
  bos_id, eos_id, pad_id = 1, 2, 0
  encoder_width = 149  # width of the encoder Input in the saved model
  max_steps = 152      # hard cap on the number of generated tokens

  prompt_ids = bpemb_en.encode_ids_with_bos_eos(input_sentence)
  encoder_ids = keras.preprocessing.sequence.pad_sequences([prompt_ids], maxlen=encoder_width, padding="post")

  # Encoder pass: embedding -> bidirectional GRU -> concatenated final states.
  embedding_en = new_model.layers[1](encoder_ids)
  _, state_f, state_b = new_model.layers[3](embedding_en)
  decoder_state = new_model.layers[5]([state_f, state_b])

  decoder_token = np.array([[bos_id]])
  result = []
  for _ in range(max_steps):
    # One decoder step: embedding -> GRU (carrying the state) -> dense -> softmax.
    embedding_de = new_model.layers[4](decoder_token)
    output, decoder_state = new_model.layers[6](embedding_de, initial_state=decoder_state)
    hidden = new_model.layers[8](output)
    probabilities = new_model.layers[10](hidden)
    index = int(np.argmax(probabilities[0][0]))
    # Test for the terminator BEFORE appending so it never leaks into the result.
    if index in (eos_id, pad_id):
      break
    result.append(index)
    decoder_token = np.array([[index]])
  return result

### Checking the output given by the decoder

In [None]:
# Prompts used to spot-check the decoder, including a few misspelled ones.
texts = [
    'here is a',
    'have have a',
    'pleaseeeee review',
    'please call me',
    'thanks for the',
    'Let me know if yu',
    'this sounds',
    'is this call going to',
    'can you get',
    'is it okay',
    'it should',
    'call if there\'s',
    'gave her a',
    'i will let',
    'i will lettt',
    'may i get a copy of all the',
    'how is our trade',
    'looks like a',
    'i am fine with the changes',
    'please be sure this'
]

# Pair each prompt with its decoded completion, then show all rows as a table.
output = [
    (text, bpemb_en.decode_ids(predict_trail(text)))
    for text in texts
]
output_df = pd.DataFrame(output, columns=["input", "output"])
output_df.head(len(output))

Unnamed: 0,input,output
0,here is a,lot of the org for the org of the org game ⁇
1,have have a,great weekend ⁇
2,pleaseeeee review,⁇
3,please call me,if you have any questions ⁇
4,thanks for the,help ⁇
5,Let me know if yu,know you need to get a nomination form ⁇
6,this sounds,good ⁇
7,is this call going to,the org and name and name and i shall be glad ...
8,can you get,"a copy of the org spreadsheet, i am missing th..."
9,is it okay,⁇


### The final perplexity score on test data points 

In [None]:
# Evaluate the reloaded checkpoint on the held-out test arrays. Index 1 of the returned
# scores / metrics_names is the compiled `perplexity` metric (index 0 is the loss).
# NOTE: the metric averages over every target position, zero-padded ones included.
scores = new_model.evaluate([encoder_input_test, decoder_input_test], decoder_target_test)
print("%s: %.2f" % (new_model.metrics_names[1], scores[1]))

perplexity: 1.16
