In [1]:
#https://pykit.org/chatbot-in-python-using-nlp/
#https://github.com/vishal-verma27/Building-a-Simple-Chatbot-in-Python-using-NLTK

In [2]:
import tensorflow as tf
from tensorflow import keras

In [3]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.optimizers import SGD
from keras.models import load_model

In [4]:
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
import json
import pickle
import random

In [5]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): a blanket "ignore" also hides deprecation warnings raised by the
# libraries used below — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [6]:
# Create an object of WordNetLemmatizer

In [7]:
# Lemmatizer instance used by later cells to reduce tokens to their base form.
lemmatizer = WordNetLemmatizer()

In [8]:
# importing the Train Bot corpus file for pre-processing

In [9]:
# Accumulators filled while the intents file is parsed:
#   words     - every token seen in any pattern (the raw vocabulary)
#   classes   - the distinct intent tags
#   documents - (token list, tag) pairs, one per pattern
words, classes, documents = [], [], []

# Tokens that are dropped when the vocabulary is normalised.
ignore_words = ['?', '!']


In [10]:
# Load the intents corpus. The context manager guarantees the file handle is
# closed (the previous open(...).read() left it to the garbage collector), and
# the explicit encoding keeps parsing independent of the OS default code page
# (JSON text is expected to be UTF-8).
with open("Train_Bot.json", encoding="utf-8") as corpus_file:
    data_file = corpus_file.read()

# Parsed corpus; later cells read intents['intents'].
intents = json.loads(data_file)

In [11]:
# Data pre-processing : can refer to the manipulation or dropping of data before it is used in order to ensure or enhance performance

In [12]:
# Pre-process the json data
# tokenization

In [13]:
import nltk
# Download the NLTK data packages needed by the following cells (tokenisation via
# nltk.word_tokenize and lemmatisation via WordNetLemmatizer).
# NOTE(review): recent NLTK releases may additionally require 'punkt_tab' for
# word_tokenize — confirm against the installed version.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mdani\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mdani\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\mdani\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [14]:
# Walk every intent and tokenize each of its example patterns.
for intent in intents['intents']:
    for pattern in intent['patterns']:
        tokens = nltk.word_tokenize(pattern)
        tag = intent['tag']

        # Grow the raw vocabulary with this pattern's tokens.
        words += tokens

        # Keep the (tokens, tag) pair as one document of the corpus.
        documents.append((tokens, tag))

        # Register the tag the first time one of its patterns is seen.
        if tag not in classes:
            classes.append(tag)

In [15]:
# The loop above walks the nested JSON corpus and fills the initially empty
# lists words, documents and classes (created in cell 9).

In [16]:
# Tokenization splits a larger piece of text, such as a sentence, into individual words.
# The cells below print a sample from each of the lists created above.

In [17]:
# Show a two-element slice of the raw token list.
print("This is words list", words[3:5], sep="\n")

This is words list
['you', 'is']


In [18]:
# Show a two-element slice of the (tokens, tag) documents.
print("This is documents list", documents[3:5], sep="\n")

This is documents list
[(['hello'], 'Intro'), (['whats', 'up'], 'Intro')]


In [19]:
# Show a two-element slice of the intent tags.
print("This is classes list", classes[3:5], sep="\n")

This is classes list
['SL', 'NN']


In [20]:
# Lemmatize and lowercase each word, then remove duplicates

In [21]:
# Fresh lemmatizer for the normalisation pass.
lemmatizer = WordNetLemmatizer()

# Lowercase and lemmatize every token that is not listed in ignore_words; the
# set comprehension removes duplicates before the vocabulary is sorted.
words = sorted({
    lemmatizer.lemmatize(token.lower())
    for token in words
    if token not in ignore_words
})

In [22]:
# sort classes

In [23]:
# De-duplicate the intent tags and keep them in sorted order.
unique_tags = set(classes)
classes = sorted(unique_tags)

In [24]:
# documents = combination between patterns and intents

In [25]:
# Report how many (pattern, tag) documents were collected.
print("Document Length = ", f"{len(documents)} documents", sep="\n")

Document Length = 
128 documents


In [26]:
# classes = intents

In [27]:
# Report the number of intent classes followed by the class list itself.
print("Class Length  = ", f"{len(classes)} classes {classes}", sep="\n")

Class Length  = 
8 classes ['Bot', 'Exit', 'Intro', 'NN', 'Olympus', 'Profane', 'SL', 'Ticket']


In [28]:
# words = all words, vocabulary

In [29]:
# Report the vocabulary size followed by the full sorted vocabulary.
print(f"{len(words)} unique lemmatized words = \n\n {words}")

158 unique lemmatized words = 

 ['a', 'able', 'access', 'activation', 'ada', 'adam', 'aifl', 'aiml', 'am', 'an', 'ann', 'anyone', 'are', 'artificial', 'backward', 'bad', 'bagging', 'batch', 'bayes', 'belong', 'best', 'blended', 'bloody', 'boosting', 'bot', 'buddy', 'classification', 'contact', 'create', 'cross', 'cya', 'day', 'deep', 'did', 'diffult', 'do', 'ensemble', 'epoch', 'explain', 'first', 'for', 'forest', 'forward', 'from', 'function', 'good', 'goodbye', 'gradient', 'great', 'hate', 'have', 'hell', 'hello', 'help', 'helped', 'hey', 'hi', 'hidden', 'hour', 'how', 'hyper', 'i', 'imputer', 'in', 'intelligence', 'is', 'jerk', 'joke', 'knn', 'later', 'layer', 'learner', 'learning', 'leaving', 'link', 'listen', 'logistic', 'lot', 'machine', 'me', 'ml', 'my', 'naive', 'name', 'nb', 'net', 'network', 'neural', 'no', 'not', 'of', 'olympus', 'olypus', 'on', 'online', 'operation', 'opertions', 'otimizer', 'parameter', 'piece', 'please', 'pm', 'problem', 'propagation', 'random', 'regress

In [30]:
# The output above shows a total of 128 documents, 8 classes and 158 unique lemmatized words.
# The words and classes are saved in the cells below for later use.

In [31]:
# creating a pickle file to store the Python objects which we will use while predicting 

In [32]:
# Persist the vocabulary and the class labels for reuse at prediction time.
# Each file is written inside a context manager so it is flushed and closed
# as soon as the dump finishes; the previous bare open(...) calls never closed
# their handles explicitly.
with open('words.pkl', 'wb') as words_file:
    pickle.dump(words, words_file)

with open('classes.pkl', 'wb') as classes_file:
    pickle.dump(classes, classes_file)

In [None]:
# Lemmatization
# Lemmatization groups the inflected forms of a word together under a single base form (the lemma).
# For example, the root word or lemmatized word from trouble, troubling, troubled and troubles is trouble.
# Using the same concept, the training dataset contains a total of 158 unique lemmatized words
# (128 is the number of documents, not the number of words).

In [None]:
# Step -3 
# Creating our training data

In [48]:
# Start with no training samples; the bag-of-words loop appends to this list.
training = list()

In [None]:
# creating an empty array for our output

In [50]:
# All-zero label template with one slot per intent class.
output_empty = [0 for _ in range(len(classes))]

In [None]:
# Training set, bag of words for each sentence

In [52]:
# Turn every (tokens, tag) document into a [bag-of-words, one-hot label] sample.
for doc in documents:
    pattern_tokens, tag = doc

    # Normalise the pattern the same way the vocabulary was built (lowercase +
    # lemmatize). A set gives constant-time membership tests in the loop below.
    pattern_lemmas = {lemmatizer.lemmatize(token.lower()) for token in pattern_tokens}

    # Bag of words: one slot per vocabulary entry, 1 if the pattern contains it.
    bag = [1 if vocab_word in pattern_lemmas else 0 for vocab_word in words]

    # One-hot label: copy the all-zero template and flag this document's tag.
    output_row = list(output_empty)
    output_row[classes.index(tag)] = 1

    training.append([bag, output_row])

In [None]:
# Shuffle the training samples and convert them into a NumPy array

In [53]:
# Shuffle in place so the samples no longer follow the order of the intents file.
random.shuffle(training)

# Each sample is [bag, output_row], and the two lists have different lengths
# (vocabulary size vs. number of classes), so the nested structure is ragged.
# NumPy >= 1.24 raises ValueError for ragged input unless dtype=object is
# requested explicitly; with it the result is an (n_samples, 2) object array.
training = np.array(training, dtype=object)

In [None]:
# Split the training array into feature (train_x) and label (train_y) lists

In [56]:
# Column 0 holds the bag-of-words vectors, column 1 the one-hot labels.
feature_column = training[:, 0]
label_column = training[:, 1]

train_x = [bag_vector for bag_vector in feature_column]
train_y = [label_vector for label_vector in label_column]

In [57]:
# Progress marker printed once train_x and train_y have been built.
print("Training Data Created")

Training Data Created


In [None]:
# Creating Neural Network Model

In [61]:
# Feed-forward classifier: a bag-of-words vector goes in, one softmax
# probability per intent class comes out. Two hidden ReLU layers (128 and 64
# units) are each followed by 50% dropout.
n_features = len(train_x[0])
n_classes = len(train_y[0])

model = Sequential([
    Dense(128, input_shape=(n_features,), activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(n_classes, activation='softmax'),
])

In [None]:
# compile model 
# Stochastic Gradient Descent (SGD) with Nesterov accelerated gradient gives good results for this model

In [62]:
# The legacy `decay` keyword is no longer accepted by current Keras optimizers
# (depending on the version it raises an error or is ignored with a warning).
# InverseTimeDecay with decay_steps=1 reproduces the same per-step schedule:
#     lr(step) = 0.01 / (1 + 1e-6 * step)
lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=0.01,
    decay_steps=1,
    decay_rate=1e-6,
)

# SGD with Nesterov momentum, driven by the schedule above.
sgd = SGD(learning_rate=lr_schedule, momentum=0.9, nesterov=True)

In [63]:
# Multi-class setup: categorical cross-entropy loss on the softmax output,
# optimised with the SGD instance configured earlier, tracking accuracy.
model.compile(
    optimizer=sgd,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# fitting and saving the model

In [64]:
# Convert the Python lists to arrays once, then train for 200 epochs with
# mini-batches of 5 samples; `hist` keeps the object returned by fit().
features = np.array(train_x)
labels = np.array(train_y)

hist = model.fit(
    features,
    labels,
    epochs=200,
    batch_size=5,
    verbose=1,
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [None]:
# Save the trained model to an HDF5 file (chatbot.h5) so it can be loaded again in the future

In [68]:
# Write the trained network to chatbot.h5. Model.save's second positional
# parameter is `overwrite`, not a training history, so `hist` must not be passed
# there; it only worked before because the object happened to be truthy.
model.save('chatbot.h5')
print('Model Created Successfully! ')

Model Created Successfully! 
