In [1]:
import numpy as np
import tensorflow as tf
import pickle

from tensorflow.keras import layers , activations , models , preprocessing

In [2]:
from keras.preprocessing.text import Tokenizer
from tensorflow.keras import preprocessing , utils
import os, sys
import yaml
import gensim
path = r"C:\Users\asmit\Downloads\Chatbot\fyp\chat\data"
files_list = os.listdir( path )

questions = list()
answers = list()

for filepath in files_list:
    stream = open( path + os.sep + filepath , 'rb')
    
    docs = yaml.safe_load(stream)
    conversations = docs['conversations']
    for con in conversations:
        if len( con ) > 2 :
            questions.append(con[0])
            replies = con[ 1 : ]
            ans = ''
            for rep in replies:
                ans += ' ' + rep
            answers.append( ans )
        elif len( con )> 1:
            questions.append(con[0])
            answers.append(con[1])

answers_with_tags = list()
for i in range( len( answers ) ):
    if type( answers[i] ) == str:
        answers_with_tags.append( answers[i] )
    else:
        questions.pop( i )

answers = list()
for i in range( len( answers_with_tags ) ) :
    answers.append( '<START> ' + answers_with_tags[i] + ' <END>' )



Using TensorFlow backend.


In [3]:
tokenizer = preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(  questions + answers )

In [4]:
VOCAB_SIZE = len( tokenizer.word_index )+1
print( 'VOCAB SIZE : {}'.format( VOCAB_SIZE ))
tokenizer.word_index

VOCAB SIZE : 2416


{'end': 1,
 'start': 2,
 'you': 3,
 'i': 4,
 'a': 5,
 'the': 6,
 'is': 7,
 'to': 8,
 'what': 9,
 'are': 10,
 'of': 11,
 'do': 12,
 'and': 13,
 'not': 14,
 'me': 15,
 'that': 16,
 'have': 17,
 'it': 18,
 'in': 19,
 'can': 20,
 'like': 21,
 'am': 22,
 "i'm": 23,
 'my': 24,
 'tell': 25,
 'be': 26,
 'how': 27,
 'as': 28,
 'get': 29,
 'about': 30,
 "don't": 31,
 'your': 32,
 'for': 33,
 'with': 34,
 'when': 35,
 'so': 36,
 'know': 37,
 'an': 38,
 'no': 39,
 'on': 40,
 'feel': 41,
 'or': 42,
 'but': 43,
 'joke': 44,
 'think': 45,
 'who': 46,
 'if': 47,
 'all': 48,
 'computer': 49,
 'why': 50,
 'one': 51,
 'by': 52,
 'human': 53,
 'very': 54,
 'cross': 55,
 'will': 56,
 'much': 57,
 'just': 58,
 'at': 59,
 'software': 60,
 'really': 61,
 'any': 62,
 'was': 63,
 'good': 64,
 'yes': 65,
 'which': 66,
 'he': 67,
 'from': 68,
 'say': 69,
 'robot': 70,
 'now': 71,
 'too': 72,
 'more': 73,
 'time': 74,
 'some': 75,
 'nice': 76,
 'love': 77,
 'we': 78,
 'been': 79,
 'should': 80,
 'chat': 81,
 'coul

In [5]:
from gensim.models import Word2Vec
import re
import string
vocab = []
for word in tokenizer.word_index:
    vocab.append( word )

def tokenize( sentences ):
    tokens_list = []
    vocabulary = []
    for sentence in sentences:
        sentence = str(sentence).lower()
        sentence = re.sub( '[^a-zA-Z]', ' ', sentence )
        
        result = sentence.translate(str.maketrans("","", string.punctuation))
        #print(result)
        tokens = result.split()
        vocabulary += tokens
        tokens_list.append( tokens )
    return tokens_list , vocabulary

p = tokenize( questions + answers )


In [6]:
# encoder_input_data
tokenized_questions = tokenizer.texts_to_sequences( questions ) #Transform each question in a sequence of integers.
maxlen_questions = max( [ len(x) for x in tokenized_questions ] ) # question with maximum length i.e is 50
padded_questions = preprocessing.sequence.pad_sequences( tokenized_questions , maxlen=maxlen_questions , padding='post' ) #ensure that all sequences in a list have the same length
encoder_input_data = np.array( padded_questions ) #create array
print( encoder_input_data.shape , maxlen_questions )

(854, 50) 50


In [7]:
# decoder_input_data
tokenized_answers = tokenizer.texts_to_sequences( answers ) #Transform each text in texts in a sequence of integers.
maxlen_answers = max( [ len(x) for x in tokenized_answers ] ) # answer with maximum length i.e is 74
padded_answers = preprocessing.sequence.pad_sequences( tokenized_answers , maxlen=maxlen_answers , padding='post' )  #ensure that all sequences in a list have the same length
decoder_input_data = np.array( padded_answers ) #create array
print( decoder_input_data.shape , maxlen_answers )

(854, 74) 74


In [8]:
# decoder_output_data: Tokenize the answers. Remove the first element from all the tokenized_answers. 
#This is the <START> element which we added earlier.
tokenized_answers = tokenizer.texts_to_sequences( answers ) #Transform each text in texts in a sequence of integers.
for i in range(len(tokenized_answers)) :
    tokenized_answers[i] = tokenized_answers[i][1:] #remove first element form all the tokenized_answers.
    
padded_answers = preprocessing.sequence.pad_sequences( tokenized_answers , maxlen=maxlen_answers , padding='post' )  #ensure that all sequences in a list have the same length
onehot_answers = utils.to_categorical( padded_answers , VOCAB_SIZE ) #Converts a class vector (integers) to binary class matrix.
decoder_output_data = np.array( onehot_answers ) #create array of onehot_answers

print( decoder_output_data.shape )

(854, 74, 2416)


In [10]:
import tensorflow as tf 
new_model = tf.keras.models.load_model('finalmodel.h5')

In [11]:
new_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 200)    483200      input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 200)    483200      input_2[0][0]                    
______________________________________________________________________________________________

In [14]:

encoder_new_model = tf.keras.models.load_model('encoder_new_model.h5',compile=False)
decoder_new_model = tf.keras.models.load_model('decoder_new_model.h5',)




In [13]:
encoder_new_model.summary()

NameError: name 'encoder_new_model' is not defined

In [19]:
decoder_new_model.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, None, 200)    483200      input_4[0][0]                    
__________________________________________________________________________________________________
input_5 (InputLayer)            [(None, 200)]        0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            [(None, 200)]        0                                            
____________________________________________________________________________________________

In [20]:
def str_to_tokens( sentence : str ):
    words = sentence.lower().split()
    words = sentence.translate(str.maketrans("","", string.punctuation)).split()
    
    tokens_list = list()
    for word in words:
        tokens_list.append( tokenizer.word_index[ word ] ) 
    return preprocessing.sequence.pad_sequences( [tokens_list] , maxlen=maxlen_questions , padding='post')

In [None]:


for _ in range(100):
    states_values = encoder_new_model.predict( str_to_tokens( input( 'Enter question : ' ) ) )
    empty_target_seq = np.zeros( ( 1 , 1 ) )
    empty_target_seq[0, 0] = tokenizer.word_index['start']
    stop_condition = False
    decoded_translation ='BOT : '+ ''
    while not stop_condition :
        dec_outputs , h , c = decoder_new_model.predict([ empty_target_seq ] + states_values )
        sampled_word_index = np.argmax( dec_outputs[0, -1, :] )
        sampled_word = None
        for word , index in tokenizer.word_index.items() :
            if sampled_word_index == index :
                decoded_translation += ' {}'.format( word )
                sampled_word = word
        
        if sampled_word == 'end' or len(decoded_translation.split()) > maxlen_answers:
            stop_condition = True
            
        empty_target_seq = np.zeros( ( 1 , 1 ) )  
        empty_target_seq[ 0 , 0 ] = sampled_word_index
        states_values = [ h , c ] 

    print( decoded_translation )

Enter question : do you watch movie
BOT :  you are liar end
Enter question : no i am not a liar
BOT :  morning a man as a human end
Enter question : what is ai
BOT :  million in science about by good ball by personal end
Enter question : are you mad
BOT :  of though not much as a software end
Enter question : what do you eat
BOT :  i like not really python a buying bot end
Enter question : are you human
BOT :  i like studied end
Enter question : do you have boyfriend
BOT :  i like alien in refused going you're much to i'm course i don't i like end
Enter question : what do you mean
BOT :  worry end
Enter question : hy
BOT :  asking much end
Enter question : hello
BOT :  can't be are you end
Enter question : you mad bro
BOT :  i can fill robert a tales of humans to worldwide i do it isn't it i i think they of well are seem end
Enter question : fuck you
BOT :  i don't stock actually end
