# EXPLORATORY DATA ANALYSIS 

In [3]:
import pandas as pd
# Libraries needed for NLP
import nltk
from nltk.stem import PorterStemmer  # reduces words to their base (root) form

# nltk.word_tokenize needs the Punkt tokenizer data. Older NLTK releases read it
# from the 'punkt' package, while newer releases look for 'punkt_tab' and raise
# LookupError without it, so fetch both to keep the notebook runnable on either.
nltk.download('punkt')
nltk.download('punkt_tab')

# Single stemmer instance shared by every preprocessing step below.
stemmer = PorterStemmer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# Libraries needed for Tensorflow processing 
import tensorflow as tf
import numpy as np
import random 
import json

In [5]:
# Load the intents.json file from your local device.
# JSON text is UTF-8; pass the encoding explicitly so the file is not decoded
# with the platform default (e.g. cp1252 on Windows), which would corrupt any
# non-ASCII characters in patterns or responses.
with open('intents.json', encoding='utf-8') as json_data:
    intents = json.load(json_data)

In [6]:
# Echo the parsed structure: a dict whose 'intents' key holds a list of entries,
# each with a 'tag', its example 'patterns' and the candidate 'responses'.
intents

{'intents': [{'tag': 'greeting',
   'patterns': ['Hi', 'How are you', 'Is anyone there?', 'Hello', 'Good day'],
   'responses': ['Hello, thanks for visiting',
    'Good to see you again',
    'Hi there, how can I help?'],
   'context_set': ''},
  {'tag': 'goodbye',
   'patterns': ['Bye', 'See you later', 'Goodbye'],
   'responses': ['See you later, thanks for visiting',
    'Have a nice day',
    'Bye! Come back again soon.']},
  {'tag': 'thanks',
   'patterns': ['Thanks', 'Thank you', "That's helpful"],
   'responses': ['Happy to help!', 'Any time!', 'My pleasure']},
  {'tag': 'chatbot',
   'patterns': ['Who built this chatbot?',
    'Tell me about Chatbot',
    'What is this chatbot name?'],
   'responses': ['Hi, I am Chatbot designed by Mayank.',
    'Thanks for asking. I am designed by Mayank Bajaj.',
    'I am a chatbot.']},
  {'tag': 'location',
   'patterns': ['What is your location?',
    'Where are you located?',
    'What is your address?'],
   'responses': ["We are from Worl

# PRE-PROCESSING THE TEXT DATA 

In [7]:
# Accumulators filled by the tokenizing loop in the next cell.
words =[]   # every token found in the patterns; later lower-cased, stemmed and de-duplicated into the vocabulary
classes =[]   # the distinct intent tags (8 for this intents.json, per the summary printed below)
documents =[] # one (token_list, tag) tuple per training pattern
ignore =['?'] # tokens excluded from the vocabulary; add other special characters here if needed

In [14]:
# Start from empty accumulators so that re-running this cell does not append
# every pattern a second time (documents and words would otherwise keep growing
# across executions and skew the training set).
words = []
classes = []
documents = []

# loop through each sentence in the intents patterns
for intent in intents['intents']:
    for pattern in intent['patterns']:
        # tokenize each and every word in the sentence
        w = nltk.word_tokenize(pattern)
        # add the raw tokens to the vocabulary candidates
        words.extend(w)
        # remember which tag this tokenized pattern belongs to
        documents.append((w, intent['tag']))
        # record each tag once, in first-seen order
        if intent['tag'] not in classes:
            classes.append(intent['tag'])

In [15]:
# Build the vocabulary: skip ignored tokens, lower-case and stem the rest, and
# let a set collapse duplicates before sorting alphabetically.
words = sorted({stemmer.stem(w.lower()) for w in words if w not in ignore})

# Tags, de-duplicated and sorted so every class keeps a stable index.
classes = sorted(set(classes))

# Summary of what was collected.
print(len(documents), "documents")
print(len(classes), "classes", classes)
print(len(words), "unique stemmed words", words)

29 documents
8 classes ['about', 'chatbot', 'connect', 'goodbye', 'greeting', 'location', 'movies', 'thanks']
52 unique stemmed words ["'s", 'about', 'account', 'address', 'ani', 'anyon', 'are', 'built', 'bye', 'can', 'chatbot', 'connect', 'day', 'favourit', 'give', 'good', 'goodby', 'hello', 'help', 'hi', 'how', 'i', 'is', 'later', 'link', 'locat', 'me', 'media', 'movi', 'name', 'out', 'reach', 'recommend', 'see', 'social', 'some', 'suggest', 'tell', 'thank', 'that', 'there', 'thi', 'to', 'way', 'we', 'what', 'where', 'which', 'who', 'you', 'your', 'yourself']


#  CREATING AND TRAINING THE MODEL FOR CHATBOT 

In [None]:
# creating training data
training = []   # one [bag_of_words, one_hot_tag] pair per document
output = []     # unused; kept so the cell defines the same names as before
# template row of zeros, one slot per class
output_empty = [0] * len(classes)

# Bag-of-words encoding: for each document, a 0/1 flag per vocabulary word,
# paired with a one-hot row marking the document's tag.
for doc in documents:
    # lower-case and stem the pattern's tokens the same way the vocabulary was
    # built; a set makes the membership test below O(1)
    pattern_words = {stemmer.stem(word.lower()) for word in doc[0]}
    bag = [1 if w in pattern_words else 0 for w in words]

    # output is '1' for the current tag and '0' for every other tag
    output_row = list(output_empty)
    output_row[classes.index(doc[1])] = 1

    training.append([bag, output_row])

# shuffle so the training order does not follow the order of the intents file
random.shuffle(training)

# Split features and labels straight from the Python list. Each pair holds two
# lists of different lengths (vocabulary size vs. number of classes), so
# np.array(training) is a ragged conversion and raises ValueError on
# NumPy >= 1.24 unless dtype=object is given.
train_x = [pair[0] for pair in training]
train_y = [pair[1] for pair in training]

# kept as an array for any later code that indexes `training` the NumPy way
training = np.array(training, dtype=object)

In [30]:
# Feed-forward classifier: bag-of-words vector in, one probability per intent tag out.
model = tf.keras.Sequential()
# Hidden layers use ReLU. A Dense layer without an activation is purely linear,
# and a stack of linear layers collapses into a single linear map, so without a
# non-linearity the extra layers add no modelling capacity.
model.add(tf.keras.layers.Dense(10, activation='relu', input_shape=(len(train_x[0]),)))  # 1st hidden layer, 10 neurons
model.add(tf.keras.layers.Dense(10, activation='relu'))  # 2nd hidden layer, 10 neurons
# softmax turns the final scores into a probability distribution over the tags
# (multi-class classification)
model.add(tf.keras.layers.Dense(len(train_y[0]), activation='softmax'))
model.compile(optimizer=tf.keras.optimizers.Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

In [31]:
# Train on the bag-of-words features and one-hot labels built above.
model.fit(
    x=np.array(train_x),
    y=np.array(train_y),
    batch_size=8,
    epochs=100,
    verbose=1,
)
# NOTE(review): despite the ".pkl" suffix this is Keras' own save format, not a
# pickle; the log below reports assets written under a "model.pkl" directory.
model.save("model.pkl")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
INFO:tensorflow:Assets written to: model.pkl\assets


# MAKING PREDICTIONS USING CHATBOT

In [32]:
import pickle

# Persist the vocabulary and the tag list so inference can rebuild exactly the
# same bag-of-words encoding. The context manager closes the file as soon as
# the dump finishes, so the data is flushed before the next cell reads it back.
with open("training_data", 'wb') as training_file:
    pickle.dump({"words": words, 'classes': classes}, training_file)

In [33]:
# Reload the trained network through the same tf.keras API that built and saved
# it, rather than the standalone `keras` package, so saving and loading always
# go through one Keras implementation.
load_model = tf.keras.models.load_model
model = load_model("model.pkl")

In [34]:
# restoring all the data structures
# NOTE: pickle.load can execute arbitrary code while unpickling; only load a
# "training_data" file that this notebook wrote itself.
with open("training_data", "rb") as training_file:
    data = pickle.load(training_file)
words = data['words']      # sorted stemmed vocabulary
classes = data['classes']  # sorted intent tags

In [35]:
# Re-read the intents so the responses are available at prediction time.
# JSON text is UTF-8; pass the encoding explicitly so the file is not decoded
# with the platform default (e.g. cp1252 on Windows).
with open('intents.json', encoding='utf-8') as json_data:
    intents = json.load(json_data)

In [54]:
def clean_up_sentence(sentence):
    """Tokenize ``sentence`` and return its tokens lower-cased and stemmed.

    Splitting is done with ``nltk.word_tokenize`` and stemming with the
    module-level ``stemmer``; the token order of the sentence is preserved.
    """
    return [stemmer.stem(token.lower()) for token in nltk.word_tokenize(sentence)]

# bag-of-words encoding: one 0/1 flag per vocabulary word, 1 when the word occurs in the sentence
def bow(sentence,words):
    """Encode ``sentence`` as a bag-of-words vector over ``words``.

    The sentence is normalised with ``clean_up_sentence``; position ``i`` of the
    result is 1 when ``words[i]`` is among the normalised tokens and 0 otherwise.

    Returns:
        A NumPy array with ``len(words)`` entries.
    """
    present = set(clean_up_sentence(sentence))
    return np.array([1 if vocab_word in present else 0 for vocab_word in words])

In [56]:
# predictions at or below this probability are discarded by classify()
ERROR_THRESHOLD =0.30
def classify(sentence):
    """Return the intent tags predicted for ``sentence``, strongest first.

    The sentence is encoded with ``bow`` against the module-level ``words`` and
    passed to ``model.predict`` as a batch of one. Only classes whose
    probability exceeds ``ERROR_THRESHOLD`` are kept.

    Returns:
        A list of ``(tag, probability)`` tuples sorted by descending
        probability; empty when no class clears the threshold.
    """
    probabilities = model.predict(np.array([bow(sentence, words)]))[0]
    ranked = sorted(
        ((idx, prob) for idx, prob in enumerate(probabilities) if prob > ERROR_THRESHOLD),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return [(classes[idx], prob) for idx, prob in ranked]

def response(sentence, fallback="Sorry, I didn't understand that."):
    """Print a randomly chosen reply for the best-matching intent of ``sentence``.

    Candidate tags come from ``classify`` (strongest first). The first candidate
    whose tag is found in ``intents['intents']`` wins, and one of that intent's
    ``responses`` is printed.

    Args:
        sentence: Raw user text.
        fallback: Message printed when no candidate can be answered, either
            because nothing cleared the classification threshold or because no
            intent carries a predicted tag. Pass ``None`` to print nothing in
            that case, which was the previous behaviour.

    Returns:
        None; the reply is written to stdout.
    """
    for tag, _probability in classify(sentence):
        for intent in intents['intents']:
            if intent['tag'] == tag:
                # a random response from the matching intent
                print(random.choice(intent['responses']))
                return

    # No usable prediction: tell the user instead of silently returning.
    if fallback is not None:
        print(fallback)
            
    

In [57]:
# Smoke test: this exact phrase is one of the 'location' training patterns.
response('Where are you located?')

You can visit India to meet us


In [58]:
# Smoke test: "Bye" is one of the 'goodbye' training patterns.
response("Bye")

See you later, thanks for visiting
