## import the libraries

In [223]:
import tensorflow as tf
import numpy as np
from nltk.tokenize import RegexpTokenizer
from itertools import islice
import warnings
warnings.filterwarnings('ignore')

## import the data 

In [224]:
#import the data from 
#chat1 stores the chat of 1st person
#chat2 stores the chat of 2nd person

chat1=[]
chat2=[]

def yield_alt(f, option='odd'):
    
        if option == 'odd':
            return islice(f, 0, None, 2)
        return islice(f, 1, None, 2)
 
with open('/home/hadoop-user1/chat_dataset1.txt') as f:
    for line in yield_alt(f):      
        chat1.append(line)

with open('/home/hadoop-user1/chat_dataset1.txt') as f:
    for line in yield_alt(f, 'even'):
        chat2.append(line)

## showing some samples

In [225]:
chat1,chat2

(['hi\n',
  'hi there how are you,\n',
  'i am  doing fine\n',
  'hi my name is expectations\n',
  'hello apeksha my best friend\n'],
 ['hello\n',
  'i am fine how do you do\n',
  'what is your name\n',
  'you are indeed expectations\n',
  'apeksha is expectations\n'])

## tokenizes the list of strings and finds the vocab size

In [226]:
#ques stores the tokenized list of chat1
#ans stores tokenized list of chat2
#words store total number of words
#vocab_stores total number of unique words

tokenizer = RegexpTokenizer(r'\w+')

ques,ans=[],[]
words=[]

for i in chat1:
    
        tr=map(str.lower,tokenizer.tokenize(i))
        ques.append(tr)
        words.extend(tr)
        
for i in chat2:
    
        tr=map(str.lower,tokenizer.tokenize(i)) 
        ans.append(tr)
        words.extend(tr)

## vocabulary size

In [227]:
vocab_size=len((set(words)))
vocab_size

21

## finds dictionary of words (id to word and word to id)

In [228]:
id_to_word=dict(enumerate(set(words)))

word_to_id={k:v for v,k in id_to_word.items()}
id_to_word,word_to_id

({0: 'apeksha',
  1: 'do',
  2: 'what',
  3: 'friend',
  4: 'your',
  5: 'name',
  6: 'i',
  7: 'my',
  8: 'doing',
  9: 'there',
  10: 'am',
  11: 'expectations',
  12: 'hello',
  13: 'how',
  14: 'hi',
  15: 'are',
  16: 'indeed',
  17: 'you',
  18: 'fine',
  19: 'is',
  20: 'best'},
 {'am': 10,
  'apeksha': 0,
  'are': 15,
  'best': 20,
  'do': 1,
  'doing': 8,
  'expectations': 11,
  'fine': 18,
  'friend': 3,
  'hello': 12,
  'hi': 14,
  'how': 13,
  'i': 6,
  'indeed': 16,
  'is': 19,
  'my': 7,
  'name': 5,
  'there': 9,
  'what': 2,
  'you': 17,
  'your': 4})

## data stores tokenized question and answer as a whole

In [229]:
data=[]

a=[data.append(i) for i in ques]
a=[data.append(i) for i in ans]

## Adding an empty string to the dictionary

In [230]:
# Adds empty string corresponding the index added for padding in the later part part of code

id_to_word[vocab_size]=''

## converting words to indices of ques and ans

In [231]:
# train and target stores the  indices corresponding to words of ques and ans

train,target=[],[]

for i in range(len(ques)):
    
    train.append([word_to_id[x] for x in ques[i]])
    target.append([word_to_id[x] for x in ans[i]])
    

## padding to make all sequence of equal length

In [232]:
# max_length calculates length of longest sequence and add 100 as a padding to make all sequence of equal length

max_length = max([len(i) for i in data])

for i in range(len(train)):
    
    train[i] =[j for j in train[i]] + [vocab_size] * (max_length - len(train[i]))

    target[i] =[j for j in target[i]] + [vocab_size] * (max_length - len(target[i]))
   

In [233]:
train = np.array(train)

target = np.array(target)

In [234]:
train,target

(array([[14, 21, 21, 21, 21, 21, 21],
        [14,  9, 13, 15, 17, 21, 21],
        [ 6, 10,  8, 18, 21, 21, 21],
        [14,  7,  5, 19, 11, 21, 21],
        [12,  0,  7, 20,  3, 21, 21]]), array([[12, 21, 21, 21, 21, 21, 21],
        [ 6, 10, 18, 13,  1, 17,  1],
        [ 2, 19,  4,  5, 21, 21, 21],
        [17, 15, 16, 11, 21, 21, 21],
        [ 0, 19, 11, 21, 21, 21, 21]]))

## embedding vector

In [235]:
# one hot encoding is used as an embedding vector

vec = np.zeros((vocab_size+1, vocab_size+1))

for k in id_to_word.keys():
    vec[k][k] = 1

# constant tensor for word embedding(one hot)

embed = tf.constant(vec, dtype=tf.float32)

## test sample

In [236]:
test_ques = np.array([[14, 13, 15, 17, 21, 21, 21]])
[id_to_word[i] for i in test_ques[0]]

['hi', 'how', 'are', 'you', '', '', '']

## initializing variables

In [237]:
input_size = vocab_size+1
output_size = vocab_size+1
iterations = 5001
hidden_layer = 60
learning_rate = 0.001


## initializing random weights 

In [238]:
Wxhe=tf.Variable(tf.random_normal(([input_size,hidden_layer]),0,0.1),dtype=tf.float32)

Whhe=tf.Variable(tf.random_normal(([hidden_layer,hidden_layer]),0,0.1),dtype=tf.float32)

Whye=tf.Variable(tf.random_normal(([hidden_layer,output_size]),0,0.1),dtype=tf.float32)


In [239]:
Wxhd=tf.Variable(tf.random_normal(([input_size,hidden_layer]),0,0.1),dtype=tf.float32)

Whhd=tf.Variable(tf.random_normal(([hidden_layer,hidden_layer]),0,0.1),dtype=tf.float32)

Whyd=tf.Variable(tf.random_normal(([hidden_layer,output_size]),0,0.1),dtype=tf.float32)


## function to train the RNN

In [240]:
def train_nn(): # execution of training starts with this function
    
    with tf.variable_scope("train_nn"):
        
        #placeholder for input, output and hidden state and previous outputs of encoder and decoder
        
        x=tf.placeholder(shape=[None,max_length], dtype=tf.int32) 
        y=tf.placeholder(shape=[None,None],dtype=tf.int32)
        
        hinit = tf.placeholder(shape=[None, hidden_layer], dtype=tf.float32)
        xinit=tf.placeholder(shape=[None,vocab_size+1],dtype=tf.float32)
        prob=tf.placeholder(shape=[None,vocab_size+1],dtype=tf.float32)
        

        #embedding target as one hot
        Y_embed = tf.nn.embedding_lookup(embed,y)

        # encoder_output is the context vector and encoder state is the hidden state of lat time space
        encoder_output,encoder_state=encoder(x,hinit,xinit)

        #decoder_output is the output of decoder(as an output of softmax layer)
        decoder_output=decoder(x,encoder_output,encoder_state,prob)
        decoder_output = tf.transpose(decoder_output, [1, 0, 2])
        
        #takes the argmax of output to find the indices of the corresponding word 
        decoder_output_arg=tf.argmax(decoder_output,axis=2)

        #loss takes the mean of all the samples loss (loss function used is softmax_cross_entropy_with_logits)
        loss=tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=decoder_output,labels=Y_embed))
        
        optimizer=tf.train.AdamOptimizer(0.001).minimize(loss)
        
        with tf.Session() as session:

            session.run(tf.global_variables_initializer()) #initializes the variables
            session.run(tf.local_variables_initializer())
            
            #iterating the loop
            for i in range(iterations):
                
                #runs the loss and optimizer function by feeding train, target and hidden state to their placeholders 
                #l is the loss at each iteration
                
                l,_=session.run([loss,optimizer],
                feed_dict={x:train, y:target,hinit:np.zeros((5,hidden_layer)),xinit:np.zeros((5,vocab_size+1))
                           ,prob:np.zeros((5,vocab_size+1))})
                
                
                if i%1000==0:
                    print "loss for",i,"iteration:",l
                
                #predict outputs the decoders output after each weight tuning at each iteration
                
                
                predict = session.run(decoder_output_arg,
                {x:train, y:target,hinit:np.zeros((5,hidden_layer)),xinit:np.zeros((5,vocab_size+1)),
                 prob:np.zeros((5,vocab_size+1))})
                
                if(i%5000==0):
                    
                    for j in range(5):#print predicted result of four sample for simplicity

                        print("chat1 : ", [[id_to_word[w] for w in sent] for sent in train][j])
                        print("chat2 : ", [[id_to_word[w] for w in sent] for sent in predict][j])
                        
                        
                        
            print"---------Training over------------"
            
            print "---testing the trained model with test data----"
            
            out_test = session.run(decoder_output_arg, {x:test_ques, 
                hinit: np.zeros((1, hidden_layer)),xinit:np.zeros((1,vocab_size+1)),
                prob:np.zeros((1,vocab_size+1))})
            
            print("chat1 : ", [[id_to_word[w] for w in sent] for sent in test_ques])
            print("chat_bot     : ", [[id_to_word[w] for w in sent] for sent in out_test])
        

## encoder function to find the context vector and hidden state of input sequence

In [241]:
def encoder(X,hinit,xinit):#takes input sequence and initial hidden state and initial output 
    
    with tf.variable_scope("encoder"):
        
        #embedding for train to one hot
        X_embed = tf.nn.embedding_lookup(embed,X)
        X_embed = tf.transpose(X_embed, [1, 0, 2])

        #list of hidden state and output
        con=[hinit,xinit]
        
        #scans encoder's helper function RNN_enc 
        output= tf.scan(RNN_enc, X_embed, initializer=con)
  
        
        h_t=output[0]
        y_t=output[1]

        #takes last hidden state and output
        h_t=h_t[-1]
        y_t=y_t[-1]


        arg=tf.argmax(y_t,axis=1)
        arg_one_hot=tf.nn.embedding_lookup(embed,(tf.cast(arg, tf.int32)))
        
        #returns one hot vector of encoder's output and hidden state
        return arg_one_hot,h_t


## encoder's helper function

In [242]:
def RNN_enc(h, x_t):#helper function of encoder (takes hidden states,output and input sequence)
    
    with tf.variable_scope('RNN_enc'):
        
            #reshapes input sequence
            x_t = tf.reshape(x_t, [-1, input_size])
            x_t=tf.cast(x_t, tf.float32)
            
            #reshapes hidden state
            hprev = tf.reshape(h[0], [-1, hidden_layer])

            #h_t calculates the wighted sum of inputs and hidden  state of previous layes 
            #and then pass it to the activation function
            
            h_t=tf.tanh(tf.matmul(hprev,Whhe) + tf.matmul(x_t,Wxhe))
            h_t = tf.reshape(h_t, [-1, hidden_layer])

            #out is the output from the softmax layer
            out=tf.nn.softmax(tf.matmul(h_t,Whye))


            x_and_h=[h_t,out]        

            #reurns list of hidden state and ouput of encoder
            return x_and_h

## decoder function

In [243]:
def decoder(X,xprev,s,prob):
    #takes input sequence  
    #last hidden state from encoder 
    #initial input to decoder as a context vector 
    
    with tf.variable_scope("decoder"):
        
        #embedding for train to one hot
        X_embed = tf.nn.embedding_lookup(embed,X)
        X_embed = tf.transpose(X_embed, [1, 0, 2])

        #list of hidden state,output of previous state
        con=[s,xprev,prob]

        #scans decoder's helper function RNN_enc 
        output= tf.scan(RNN_dec, X_embed, initializer=con)

        h_dec=output[0]

        out_prob=output[2]
        
        #returns output from decoder
        return out_prob


## decoder's helper function

In [244]:
def RNN_dec(h, x_t):
     #takes hidden state of previous time state
     #ouput of previous state
     #input sequence just for the sake of working of scan function to iterate
        
    
        with tf.variable_scope('RNN_dec'):
            
            hprev=h[0]
            yprev=h[1]
            
            #reshape
            hprev = tf.reshape(h[0], [-1, hidden_layer])

            yprev = tf.reshape(h[1], [-1, input_size])

            #h_t calculates the wighted sum of inputs and hidden  state of previous layes 
            #and then pass it to the activation function
            
            h_t=tf.tanh(tf.matmul(yprev,Wxhd )+ tf.matmul(hprev,Whhd))
            
            h_t = tf.reshape(h_t, [-1, hidden_layer])

            #out is the output from the softmax layer
            out=tf.nn.softmax(tf.matmul(h_t,Whyd))

            arg=tf.argmax(out,axis=1)
            arg_one_hot=tf.nn.embedding_lookup(embed,arg)

            x_and_h=[h_t,arg_one_hot,out] 
            
            #reurns list of hidden state and ouput of decoder
            return x_and_h

## function to start the execution of training

In [245]:
#start of execution of training
train_nn()

loss for 0 iteration: 3.09008
('chat1 : ', ['hi', '', '', '', '', '', ''])
('chat2 : ', ['there', 'apeksha', 'what', 'how', 'my', '', ''])
('chat1 : ', ['hi', 'there', 'how', 'are', 'you', '', ''])
('chat2 : ', ['do', 'you', 'what', 'fine', 'expectations', 'friend', 'your'])
('chat1 : ', ['i', 'am', 'doing', 'fine', '', '', ''])
('chat2 : ', ['', 'apeksha', '', 'apeksha', 'are', 'am', 'i'])
('chat1 : ', ['hi', 'my', 'name', 'is', 'expectations', '', ''])
('chat2 : ', ['', 'apeksha', 'indeed', 'what', 'you', 'apeksha', 'you'])
('chat1 : ', ['hello', 'apeksha', 'my', 'best', 'friend', '', ''])
('chat2 : ', ['do', 'i', 'what', 'fine', 'are', 'name', 'doing'])
loss for 1000 iteration: 2.19537
loss for 2000 iteration: 2.16642
loss for 3000 iteration: 2.16632
loss for 4000 iteration: 2.16628
loss for 5000 iteration: 2.16626
('chat1 : ', ['hi', '', '', '', '', '', ''])
('chat2 : ', ['hello', '', '', '', '', '', ''])
('chat1 : ', ['hi', 'there', 'how', 'are', 'you', '', ''])
('chat2 : ', ['i',