# <center> Introduction and Discovery of Natural Language Processing  </center>
<center><strong> Application : Chatbot for answering patient’s simple questions</strong></center>

## Part I
#### Loading libraries, modules

In [1]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.optimizers import SGD
import random
import nltk
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
import json
import pickle
import nltk
import pandas as pd
nltk.download('punkt', quiet = True)
nltk.download('wordnet', quiet = True)

True

In [2]:
intents = pd.read_json('intents_dataset.json')

In [4]:
intents.head()

Unnamed: 0,intents
0,"{'tag': 'greeting', 'patterns': ['Hi there', '..."
1,"{'tag': 'goodbye', 'patterns': ['Bye', 'See yo..."
2,"{'tag': 'thanks', 'patterns': ['Thanks', 'Than..."
3,"{'tag': 'noanswer', 'patterns': ['', '??'], 'r..."
4,"{'tag': 'options', 'patterns': ['How you could..."


## Part II : Data Processing

In [5]:
words=[]
classes = []
documents = []
ignore_letters = ['!', '?', ',', '.']

for intent in intents['intents']:
    for pattern in intent['patterns']:
        
        word = nltk.word_tokenize(pattern) #tokenize each word
        words.extend(word)
        
        documents.append((word, intent['tag'])) #add documents in the corpus
        
        if intent['tag'] not in classes:
            classes.append(intent['tag']) # add to our classes list

#### Lemmaztize and lower each word and remove duplicates

In [6]:
words = [lemmatizer.lemmatize(w.lower()) for w in words if w not in ignore_letters]
words = sorted(list(set(words)))

classes = sorted(list(set(classes)))
# documents = combination between patterns and intents

print (len(documents), "documents")

# classes = intents

print (len(classes), "classes", classes)
# words = all words, vocabulary

print (len(words), "unique lemmatized words", words)

pickle.dump(words,open('words.pkl','wb'))

pickle.dump(classes,open('classes.pkl','wb'))

78 documents
14 classes ['about', 'adverse_drug', 'blood_pressure', 'blood_pressure_search', 'goodbye', 'greeting', 'help', 'hospital_search', 'name', 'noanswer', 'options', 'pharmacy_search', 'search_pharmacy_by_name', 'thanks']
118 unique lemmatized words ["'s", 'a', 'adverse', 'aid', 'all', 'anyone', 'are', 'awesome', 'be', 'behavior', 'blood', 'by', 'bye', 'call', 'can', 'causing', 'center', 'chatting', 'check', 'community', 'costco', 'could', 'creek', 'cv', 'data', 'day', 'detail', 'do', 'drug', 'entry', 'farmacia', 'find', 'for', 'give', 'good', 'goodbye', 'hand', 'have', 'health', 'hello', 'help', 'helpful', 'helping', 'hey', 'hi', 'history', 'hola', 'hospital', 'how', 'i', 'id', 'is', 'kaiser', 'later', 'list', 'load', 'locate', 'log', 'looking', 'lookup', 'management', 'me', 'medical', 'module', 'mowry', "n't", 'name', 'nearby', 'need', 'next', 'nice', 'of', 'offered', 'omnicare', 'open', 'patient', 'pharmacy', 'plaza', 'please', 'pressure', 'provide', 'reaction', 'related', '

## Part III : Training and Testing Data

In [None]:
# create the training data
training = list()
trainings = []
# create empty array for the output

output_empty = [0] * len(classes)
# training set, bag of words for every sentence
for doc in documents:
    # initializing bag of words
    bag = []
    # list of tokenized words for the pattern
    word_patterns = doc[0]
    
    # lemmatize each word - create base word, in attempt to represent related words
    word_patterns = [lemmatizer.lemmatize(word.lower()) for word in word_patterns]
    
    # create the bag of words array with 1, if word is found in current pattern
    for word in words:
        bag.append(1) if word in word_patterns else bag.append(0)
        # output is a '0' for each tag and '1' for current tag (for each pattern)
        output_row = list(output_empty)
        output_row[classes.index(doc[1])] = 1
        trainings.append([bag, output_row])
        # shuffle the features and make numpy array
        random.shuffle(trainings)
        training = np.array(trainings)
        # create training and testing lists. X - patterns, Y - intents
        
        train_x = list(training[:,0])
        train_y = list(training[:,1])
print("Training data is created")

  training = np.array(trainings)


In [None]:
import matplotlib.pyplot as plt
plt.hist(train_x)
plt.show()

## Part IV :  Training the Model

In [None]:
# deep neural networds model
model = Sequential()
model.add(Dense(128, input_shape=(len(train_x[0]),), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(len(train_y[0]), activation='softmax'))
# Compiling model. SGD with Nesterov accelerated gradient gives good results for this model
sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])

#Training and saving the model 

hist = model.fit(np.array(train_x), np.array(train_y), epochs=200, batch_size=5, verbose=1)
model.save('chatbot_model.h5', hist)

print("model is created")

## Part V : Interacting With the Chatbot

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
import pickle
import numpy as np
from keras.models import load_model
model = load_model('chatbot_model.h5')
import json
import random
intents = json.loads(open('intents_dataset.json').read())
words = pickle.load(open('words.pkl','rb'))
classes = pickle.load(open('classes.pkl','rb'))

#### Clean up Sentence Function

In [None]:
def clean_up_sentence(sentence):
    # tokenize the pattern - splitting words into array
    sentence_words = nltk.word_tokenize(sentence)
  
    # stemming every word - reducing to base form

    sentence_words = [lemmatizer.lemmatize(word.lower()) for word in sentence_words]
    return sentence_words
    #return bag of words array: 0 or 1 for words that exist in sentence

#### Bag of Words Function

In [None]:
def bag_of_words(sentence, words, show_details=True):
    # tokenizing patterns
    sentence_words = clean_up_sentence(sentence)
    # bag of words - vocabulary matrix
    bag = [0]*len(words)  
    for s in sentence_words:
        for i,word in enumerate(words):
            if word == s:
                # assign 1 if current word is in the vocabulary position
                bag[i] = 1
            if show_details:
                print ("found in bag: %s" % word)
    return(np.array(bag))

#### Prection function

In [None]:
def predict_class(sentence):
    # filter below  threshold predictions
    p = bag_of_words(sentence, words,show_details=False)

    res = model.predict(np.array([p]))[0]
    ERROR_THRESHOLD = 0.25

    results = [[i,r] for i,r in enumerate(res) if r>ERROR_THRESHOLD]

    # sorting strength probability

    results.sort(key=lambda x: x[1], reverse=True)

    return_list = []

    for r in results:

        return_list.append({"intent": classes[r[0]], "probability": str(r[1])})
    return return_list

#### Getting responses function

In [None]:
def getResponse(ints, intents_json):

    tag = ints[0]['intent']

    list_of_intents = intents_json['intents']

    for i in list_of_intents:

        if(i['tag']== tag):

            result = random.choice(i['responses'])

            break

    return result

#### Launch Chat Function

In [None]:
def chatting():
    print("Welcome to Talk-To-Me : \n")
    
    while(True):
        message = input()
        if message == 'quit':
            break
        elif message != '':
            ints = predict_class(message)
            result = getResponse(ints, intents)
            print(result)

In [34]:
chatting()

Welcome to Talk-To-Me : 

Hello !
Good to see you again
God
Not sure I understand
no
Sorry, can't understand you
fuck you
Any time!
You're fedding me up
I can guide you through Adverse drug reaction list, Blood pressure tracking, Hospitals and Pharmacies
oh 
Sorry, can't understand you
really
Not sure I understand
quit tt
Please give me more info
quit my home
Sorry, can't understand you
Sorry, can't understand you too
Happy to help!
quit


## Stop Word

In [54]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords', quiet = True)
set(stopwords.words('english'))


# sample sentence
text = """At He determined to drop his litigation with the monastry, and relinguish his claims to the wood-cuting and 
fishery rihgts at once. He was the more ready to do this becuase the rights had become much less valuable, and he had 
indeed the vaguest idea where the wood and river in question were.""".lower()

# set of stop words
stop_words = set(stopwords.words('english')) 

# tokens of words  
word_tokens = word_tokenize(text) 
    
filtered_sentence = [] 
  
for w in word_tokens: 
    if w not in stop_words: 
        filtered_sentence.append(w) 



print("\n\nOriginal Sentence \n\n")
print(" ".join(word_tokens)) 

print("\n\nFiltered Sentence \n\n")
print(" ".join(filtered_sentence))
#word_tokens
#type(stop_words)



Original Sentence 


at he determined to drop his litigation with the monastry , and relinguish his claims to the wood-cuting and fishery rihgts at once . he was the more ready to do this becuase the rights had become much less valuable , and he had indeed the vaguest idea where the wood and river in question were .


Filtered Sentence 


determined drop litigation monastry , relinguish claims wood-cuting fishery rihgts . ready becuase rights become much less valuable , indeed vaguest idea wood river question .


In [55]:
import string
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


## Stemming from nltk.stem import PorterStemmer        # module for stemming
