<a href="https://colab.research.google.com/github/bugalia786/QnAChatBot/blob/main/QnA_CHAT_BOT1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# QnA_CHAT_BOT 

In [None]:
## About The Dataset
# This notebook uses the bAbI dataset from Facebook Research.
# Full Details: https://research.fb.com/downloads/babi/
# - Jason Weston, Antoine Bordes, Sumit Chopra, Tomas Mikolov, Alexander M. Rush,
#   "Towards AI-Complete Question Answering: A Set of Prerequisite Toy Tasks",
#   http://arxiv.org/abs/1502.05698

In [None]:
# Mount Google Drive at /content/drive; the data-loading cell reads its pickled
# train/test files from /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#importing libraries
import pickle
import numpy as np

# Read the pickled training and test splits from the mounted Drive folder.
# NOTE: pickle.load can execute arbitrary code, so only load files you trust.
def _unpickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


train_data = _unpickle("/content/drive/MyDrive/train_qa.txt")
test_data = _unpickle("/content/drive/MyDrive/test_qa.txt")

In [None]:
# Each sample in the dataset is a (story, question, answer) tuple:
# 1. The story: the tokenised input text the model reasons over
# 2. The question: a tokenised yes/no question about the story
# 3. The answer: the string 'yes' or 'no'
# The first training sample is displayed below as an example.
train_data[0]

(['Mary',
  'moved',
  'to',
  'the',
  'bathroom',
  '.',
  'Sandra',
  'journeyed',
  'to',
  'the',
  'bedroom',
  '.'],
 ['Is', 'Sandra', 'in', 'the', 'hallway', '?'],
 'no')

In [None]:
# Gather every distinct token that appears in any story or question of either
# split; the set starts out with both possible answers so they are included too.
all_data = test_data + train_data
vocab = {'no', 'yes'}
for story, question, answer in all_data:
    vocab.update(story, question)

In [None]:
# Number of distinct tokens collected in the vocabulary
len(vocab)

37

In [None]:
# Index 0 is reserved for the padding value used by Keras's pad_sequences,
# so one slot more than the number of vocabulary words is needed.
vocab_len = 1 + len(vocab)
# Token counts of the longest story and the longest question over both splits.
max_story_len = max(len(sample[0]) for sample in all_data)
max_question_len = max(len(sample[1]) for sample in all_data)

In [None]:
# Vocabulary size including the extra padding slot (same value as vocab_len);
# this is the name the embedding and output layers use.
vocab_size = len(vocab) + 1

In [None]:
#importing libraries for padding and tokenization of data
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

In [None]:
# Integer-encode the vocabulary. filters=[] means no characters are filtered
# out, so tokens such as '.' and '?' get their own index.
# The words are fed in sorted order: iterating a set of strings directly yields
# an order that can change between interpreter sessions, which would make
# tokenizer.word_index (and therefore the meaning of every model output index)
# differ from the mapping in use when the saved weights were trained.
# NOTE(review): weights trained with an earlier, unsorted mapping will not line
# up with this one; retrain and re-save the weights after applying this change.
tokenizer = Tokenizer(filters=[])
tokenizer.fit_on_texts(sorted(vocab))

In [None]:
# Split the training samples into three parallel lists: stories, questions
# and answers (position i in each list belongs to the same sample).
train_story_text = []
train_question_text = []
train_answers = []

for story, question, answer in train_data:
    train_story_text.append(story)
    train_question_text.append(question)
    # Previously the answers list was declared but never filled.
    train_answers.append(answer)

In [None]:
# Convert every training story into its sequence of integer word indices
train_story_seq = tokenizer.texts_to_sequences(train_story_text)

In [None]:
# Number of training stories
len(train_story_text)

10000

In [None]:
def vectorize_stories(data, word_index=tokenizer.word_index, max_story_len=max_story_len,max_question_len=max_question_len):
    '''
    Turn (story, question, answer) samples into padded integer arrays.

    INPUT:

    data: iterable of (story, query, answer) tuples; story and query are lists
          of word strings and answer is a single word string
    word_index: dict mapping word -> integer index; defaults to the tokenizer's
          word_index as it was when this function was defined
    max_story_len: maxlen passed to pad_sequences for the stories
    max_question_len: maxlen passed to pad_sequences for the questions

    OUTPUT:

    A tuple (X, Xq, Y):
      X:  pad_sequences output for the index-encoded stories
      Xq: pad_sequences output for the index-encoded questions
      Y:  array of shape (n_samples, len(word_index) + 1) holding a 1 at each
          sample's answer index and 0 everywhere else

    RAISES:

    KeyError: if a story, question or answer word is missing from word_index;
          the message names the offending word.
    '''

    def lookup(word):
        # Every word is lowercased before the lookup. This now includes the
        # answer, which used to be looked up verbatim and so failed for
        # inputs such as 'Yes'.
        key = word.lower()
        if key not in word_index:
            raise KeyError("word %r is not in the vocabulary" % (word,))
        return word_index[key]

    # X = STORIES, Xq = QUERY/QUESTION, answer_ids = index of each CORRECT ANSWER
    X = []
    Xq = []
    answer_ids = []

    for story, query, answer in data:
        X.append([lookup(word) for word in story])
        Xq.append([lookup(word) for word in query])
        answer_ids.append(lookup(answer))

    # One-hot encode the answers. Allocating the matrix up front keeps Y
    # two-dimensional even when data contains no samples.
    Y = np.zeros((len(answer_ids), len(word_index) + 1))
    for row, answer_id in enumerate(answer_ids):
        Y[row, answer_id] = 1

    return (pad_sequences(X, maxlen=max_story_len),
            pad_sequences(Xq, maxlen=max_question_len),
            Y)

In [None]:
# Padded story indices, padded question indices and one-hot answers for the training split
inputs_train, queries_train, answers_train = vectorize_stories(train_data)

In [None]:
# Padded story indices, padded question indices and one-hot answers for the test split
inputs_test, queries_test, answers_test = vectorize_stories(test_data)

In [None]:
#CREATING THE MODEL

In [None]:
#Importing the lib to create the model 
from keras.models import Sequential, Model
from keras.layers.embeddings import Embedding
from keras.layers import Input, Activation, Dense, Permute, Dropout
from keras.layers import add, dot, concatenate
from keras.layers import LSTM

In [None]:
# Symbolic model inputs: one padded story (max_story_len word indices) and one
# padded question (max_question_len word indices) per sample
input_sequence = Input((max_story_len,))
question = Input((max_question_len,))

In [None]:
# Story encoder "m": maps each word index of the story to a 64-dimensional
# embedding vector, followed by dropout.
# Output shape: (samples, story_maxlen, embedding_dim)
input_encoder_m = Sequential([
    Embedding(input_dim=vocab_size, output_dim=64),
    Dropout(0.3),
])

In [None]:
# Story encoder "c": embeds each story word into a vector of length
# max_question_len (query_maxlen), followed by dropout.
# Output shape: (samples, story_maxlen, query_maxlen)
input_encoder_c = Sequential([
    Embedding(input_dim=vocab_size, output_dim=max_question_len),
    Dropout(0.3),
])

In [None]:
# Question encoder: maps each word index of the question to a 64-dimensional
# embedding vector, followed by dropout.
# Output shape: (samples, query_maxlen, embedding_dim)
question_encoder = Sequential([
    Embedding(input_dim=vocab_size, output_dim=64, input_length=max_question_len),
    Dropout(0.3),
])

In [None]:
# encode input sequence and questions (which are indices)
# to sequences of dense vectors
# The story goes through both story encoders (m and c); the question goes
# through its own encoder.
input_encoded_m = input_encoder_m(input_sequence)
input_encoded_c = input_encoder_c(input_sequence)
question_encoded = question_encoder(question)

In [None]:
# Dot product between every story-word embedding and every question-word
# embedding (contracting the embedding axis of both), then softmax.
# shape: `(samples, story_maxlen, query_maxlen)`
match = dot([input_encoded_m, question_encoded], axes=(2, 2))
match = Activation('softmax')(match)

In [None]:
# Add the match tensor to the second story encoding (both are
# (samples, story_maxlen, query_maxlen)), then swap the last two axes so the
# result is (samples, query_maxlen, story_maxlen).
response = add([match, input_encoded_c]) 
response = Permute((2, 1))(response)  

In [None]:
# Join the response with the encoded question
# NOTE(review): presumably concatenated along the last axis (Keras default) — confirm
answer = concatenate([response, question_encoded])

In [None]:
# Reduce the combined sequence with an LSTM of 32 units
answer = LSTM(32)(answer)

In [None]:
# Regularise with dropout, then project to one score per vocabulary index
answer = Dropout(0.5)(answer)
answer = Dense(vocab_size)(answer)

In [None]:
# Apply softmax to the Dense scores, then assemble and compile the two-input
# model (story + question -> answer) with categorical cross-entropy loss.
answer = Activation('softmax')(answer)
model = Model([input_sequence, question], answer)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy',metrics=['accuracy'])

In [None]:
# Print the layer-by-layer overview of the assembled model
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 156)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 6)]          0                                            
__________________________________________________________________________________________________
sequential (Sequential)         (None, None, 64)     2432        input_1[0][0]                    
__________________________________________________________________________________________________
sequential_2 (Sequential)       (None, 6, 64)        2432        input_2[0][0]                    
______________________________________________________________________________________________

In [None]:
#history = model.fit([inputs_train, queries_train], answers_train,batch_size=32,epochs=120,validation_data=([inputs_test, queries_test], answers_test))

In [None]:
# Weights file that is loaded in the testing section. Saving is commented out
# here, so the file must already exist at this relative path.
filename = 'chatbot_120_epochs.h5'
#model.save(filename)

In [None]:
# #Visualizing the results
# import matplotlib.pyplot as plt
# # %matplotlib inline
# print(history.history.keys())
# # summarize history for accuracy
# plt.plot(history.history['accuracy'])
# plt.plot(history.history['val_accuracy'])
# plt.title('model accuracy')
# plt.ylabel('accuracy')
# plt.xlabel('epoch')
# plt.legend(['train', 'test'], loc='upper left')
# plt.show()

# Testing the model

In [None]:
# Restore the saved weights, then run the model on the whole test split;
# pred_results[i] holds the output scores for test sample i.
model.load_weights(filename)
pred_results = model.predict(([inputs_test, queries_test]))

In [None]:
# Show the first test story as one space-separated sentence string
story = ' '.join(test_data[0][0])
print(story)

Mary got the milk there . John moved to the bedroom .


In [None]:
# Show the first test question as one space-separated sentence string
query = ' '.join(test_data[0][1])
print(query)

Is John in the kitchen ?


In [None]:
# Ground-truth answer of the first test sample, for comparison with the prediction
print("True Test Answer from Data is:",test_data[0][2])

True Test Answer from Data is: no


In [None]:
# Generate prediction from model: take the highest-scoring output index of the
# first test sample and translate it back into its word.
val_max = int(np.argmax(pred_results[0]))

# Reverse lookup table instead of scanning word_index. .get() yields None when
# no word has this index (index 0 is the padding slot), rather than leaving `k`
# undefined or holding a stale value from an earlier cell run.
index_to_word = {index: word for word, index in tokenizer.word_index.items()}
k = index_to_word.get(val_max)

print("Predicted answer is: ", k)
print("Probability of certainty was: ", pred_results[0][val_max])

Predicted answer is:  john
Probability of certainty was:  0.999992


In [None]:
# Hand-written story and question used to try the model on new input.
# Tokens (including punctuation) are separated by spaces so that a plain
# str.split() yields one token per word.
my_story = "John left the kitchen . Sandra dropped the football in the garden ."
# my_story.split()
my_question = "Is the football in the garden ?"

In [None]:
# Vectorise the hand-written story/question. The 'yes' in the answer slot is
# only a placeholder required by vectorize_stories; it is not used for prediction.
mydata = [(my_story.split(),my_question.split(),'yes')]
# The vectorised story gets its own name so that my_story stays a string and
# this cell can be re-run without failing on my_story.split().
my_story_vec, my_ques, my_ans = vectorize_stories(mydata)
pred_results = model.predict([my_story_vec, my_ques])
val_max = int(np.argmax(pred_results[0]))

# Reverse lookup table instead of scanning word_index. .get() yields None when
# no word has this index (index 0 is the padding slot), rather than leaving `k`
# undefined or holding a stale value from an earlier cell run.
index_to_word = {index: word for word, index in tokenizer.word_index.items()}
k = index_to_word.get(val_max)

print("Predicted answer is: ", k)
print("Probability of certainty was: ", pred_results[0][val_max])

Predicted answer is:  took
Probability of certainty was:  0.68173075


In [None]:
# !pip install fastapi nest-asyncio pyngrok uvicorn

In [None]:
# from fastapi import FastAPI
# from fastapi.middleware.cors import CORSMiddleware

# app = FastAPI()

# app.add_middleware(
#     CORSMiddleware,
#     allow_origins=['*'],
#     allow_credentials=True,
#     allow_methods=['*'],
#     allow_headers=['*'],
# )

# @app.get('/')
# async def root():
#     return {'hello': 'world'}

In [None]:
!pip install fastapi
from unicodedata import category
from fastapi import FastAPI



In [None]:
# FastAPI application object that the routes are registered on
app = FastAPI()


@app.get("/")
def read_root():
    """Root endpoint: responds with a fixed greeting payload."""
    greeting = {"Hello": "World"}
    return greeting



In [None]:
@app.post('/answer')
def answer(story : str,question: str):
    """Predict the answer to `question` given `story`.

    Both arguments are whitespace-separated token strings (punctuation must be
    its own token, e.g. "Is John in the kitchen ?").

    Returns a dict with the predicted word under 'Predicted answer is: ' and
    its model score as a float under 'Probability of certainty was:'. The
    predicted word is None when the top-scoring index has no word (index 0 is
    the padding slot).

    Raises fastapi.HTTPException (status 400) when the story or question
    contains a word that is not in the vocabulary.

    NOTE: this function reuses the module-level name `answer`, which earlier
    held the model's output tensor; the model is already built by then.
    """
    from fastapi import HTTPException  # local import keeps this cell self-contained

    # The 'yes' in the answer slot is only a placeholder required by
    # vectorize_stories; it does not influence the prediction.
    sample = [(story.split(), question.split(), 'yes')]
    try:
        story_vec, question_vec, _ = vectorize_stories(sample)
    except KeyError as err:
        # Out-of-vocabulary input is a client error, not a server failure.
        raise HTTPException(
            status_code=400,
            detail="Unknown word in story or question: {}".format(err),
        )

    scores = model.predict([story_vec, question_vec])[0]
    best_index = int(np.argmax(scores))

    # Reverse lookup instead of scanning word_index, so a missing index gives
    # None rather than an unbound variable.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    predicted = index_to_word.get(best_index)

    return {'Predicted answer is: ': predicted,
            'Probability of certainty was:': float(scores[best_index])}


In [None]:
# Full score vector of the most recent prediction (one entry per output index)
pred_results[0]

array([2.6554528e-07, 2.7135815e-07, 3.0879411e-07, 2.8571756e-07,
       2.9834882e-07, 3.0204853e-07, 3.1825903e-01, 2.5781628e-07,
       2.6778449e-07, 2.7699431e-07, 3.3015203e-07, 2.2425674e-07,
       3.0668340e-07, 2.9293841e-07, 2.1323149e-07, 2.5143507e-07,
       2.5090688e-07, 3.2931962e-07, 3.1349813e-07, 6.8173075e-01,
       3.0027988e-07, 2.9106013e-07, 3.0506911e-07, 2.5092794e-07,
       2.4694745e-07, 3.3313833e-07, 3.4973414e-07, 2.2572350e-07,
       2.2105615e-07, 2.4083639e-07, 2.6148251e-07, 3.0220181e-07,
       3.0403311e-07, 2.9097353e-07, 2.4309293e-07, 3.4899477e-07,
       3.2914377e-07, 2.5384125e-07], dtype=float32)

In [None]:
!pip install colabcode



In [None]:
# ColabCode helper used to serve the FastAPI app from Colab on port 10000.
# NOTE(review): code=False presumably skips starting the code-server UI — confirm in the colabcode docs
from colabcode import ColabCode
server = ColabCode(port=10000, code=False)

In [None]:
# Start serving `app`; the public tunnel URL is printed in the cell output
server.run_app(app=app)

Public URL: NgrokTunnel: "https://a8f4-35-229-200-190.ngrok.io" -> "http://localhost:10000"


In [None]:
# import nest_asyncio
# from pyngrok import ngrok
# import uvicorn

# ngrok_tunnel = ngrok.connect(8000)
# print('Public URL:', ngrok_tunnel.public_url)
# nest_asyncio.apply()
# uvicorn.run(app, port=8000)

In [None]:
# THE END!!