In [111]:
# using natural language toolkit
import nltk
import json
import os
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
#from nltk.stem.snowball import SnowballStemmer
# word stemmer
stemmer = LancasterStemmer()
#stemmer1 = SnowballStemmer("english")

In [112]:
# import chat-bot intents training file
with open('intents.json') as json_data:
    sample_dataset = json.load(json_data)

In [113]:
# capture unique stemmed words in the training corpus
corpus_words = []
intents = []
training_data = []
record_list =[]
#ignore = ['?']

for record in sample_dataset['intent_examples']:
    
    if record['intent'] not in intents:
        intents.append(record['intent'])
    record_list.append(record)
    
    for word in nltk.word_tokenize(record['text']):
        # ignore a few things
        if word not in ["?", "'s", ","]:
            # stem and lowercase each word
            if word == "'m":
                word = "am"
            stemmed_word = stemmer.stem(word.lower())
            corpus_words.append(stemmed_word)
            #documents.append((stemmed_word,intent_examples['intent']))
            #class_words[intent_examples['intent']].extend([stemmed_word])

for record in record_list:
    training_data.append((nltk.word_tokenize(record["text"]), record["intent"]))

corpus_words = list(set(corpus_words))

# # the number of occurances of the word in training corpus (word frequency)
#print("Corpus words and counts: %s" % corpus_words)
# # all words in each intent-class
#print (len(corpus_words), "unique stemmed words", corpus_words)
#print("Intents : %s" % intents)   
#print(documents)
print (len(training_data), "dataset records")
print (len(intents), "intents", intents)
print (len(corpus_words), "unique stemmed words", corpus_words)

36 dataset records
4 intents ['greet', 'restaurant_search', 'affirm', 'goodbye']
61 unique stemmed words ['mex', 'goodby', 'anywh', 'ind', 'ok', 'cuisin', 'for', 'town', 'me', 'bye', 'som', 'pleas', 'yo', 'i', 'chines', 'how', 'okay', 'am', 'ye', 'right', 'an', 'look', 'hav', 'in', 'yep', 'west', 'cent', 'ar', 'yeah', 'near', 'food', 'indee', 'is', 'howdy', 'good', 'that', 'city', 'to', 'eat', 'of', 'search', 'the', 'plac', 'end', 'ther', 'gre', 'spot', 'hi', 'show', 'nor', 'hey', 'you', 'morn', 'welcom', 'afternoon', 'stop', 'resta', 'day', 'nic', 'a', 'hello']


In [114]:
# create our training data
training_input = []
training_output = []
# create an empty array for our output
output_empty = [0] * len(intents)

# training set, bag of words for each sentence
for record in training_data:
    # initialize our bag of words
    bag = []
    # list of tokenized words for the pattern
    pattern_words = record[0]
    #print(pattern_words)
    # stem each word
    pattern_words = [stemmer.stem(word.lower()) for word in pattern_words]
    # create our bag of words array
    for w in corpus_words:
        bag.append(1) if w in pattern_words else bag.append(0)

    training_input.append(bag)
    # output is a '0' for each tag and '1' for current tag
    output = list(output_empty)
    output[intents.index(record[1])] = 1
    training_output.append(output)

#print ("# words", len(corpus_words))
#print ("# classes", len(intents))
#print(training)
#print(output)


In [115]:
# sample training/output
i = 10
w = training_data[i][0]
print ([stemmer.stem(word.lower()) for word in w])
print (training_input[i])
print (training_output[i])

['i', 'am', 'look', 'for', 'a', 'plac', 'to', 'eat']
[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]
[0, 1, 0, 0]


In [116]:
import numpy as np
import time
import datetime

# sigmoid function
def sigmoid(x):
    output = 1/(1+np.exp(-x))
    return output

# convert output of sigmoid function to its derivative
def sigmoidDerivative(output):
    return output*(1-output)
 
def wordsTokenizer(sentence):
    # tokenize the pattern
    sentence_words = nltk.word_tokenize(sentence)
    # stem each word
    sentence_words = [stemmer.stem(word) for word in sentence_words]
    return sentence_words

# return bag of words array: 0 or 1 for each word in the bag that exists in the sentence
def bagOfWords(sentence, corpus_words, show_details=False):
    # tokenize the pattern
    sentence_words = wordsTokenizer(sentence)
    # bag of words
    bag = [0]*len(corpus_words)  
    for s in sentence_words:
        for i,w in enumerate(corpus_words):
            if w == s: 
                bag[i] = 1
                if show_details:
                    print ("found in bag: %s" % w)

    return(np.array(bag))

def setSynapses(sentence, show_details=False):
    x = bagOfWords(sentence.lower(), corpus_words, show_details)
    if show_details:
        print("sentence:", sentence, "\n bag of words:", x)
    # input layer is our bag of words
    l0 = x
    # matrix multiplication of input and hidden layer
    l1 = sigmoid(np.dot(l0, synapse_0))
    # output layer
    l2 = sigmoid(np.dot(l1, synapse_1))
    return l2

In [117]:
def train(X, y, hidden_neurons=10, alpha=0.1, epochs=50000, dropout=False, dropout_percent=0.5):

    print("Training with %s neurons, alpha:%s, dropout:%s %s" % (hidden_neurons, str(alpha), dropout, dropout_percent if dropout else '') )
    print("Input matrix: %sx%s    Output matrix: %sx%s" % (len(X),len(X[0]),1, len(y[0])) )
    np.random.seed(1)

    last_mean_error = 1
    # randomly initialize our weights with mean 0
    synapse_0 = 2*np.random.random((len(X[0]), hidden_neurons)) - 1
    synapse_1 = 2*np.random.random((hidden_neurons, len(y[0]))) - 1

    prev_synapse_0_weight_update = np.zeros_like(synapse_0)
    prev_synapse_1_weight_update = np.zeros_like(synapse_1)

    synapse_0_direction_count = np.zeros_like(synapse_0)
    synapse_1_direction_count = np.zeros_like(synapse_1)
        
    for j in iter(range(epochs+1)):

        # Feed forward through layers 0, 1, and 2
        layer_0 = X
        layer_1 = sigmoid(np.dot(layer_0, synapse_0))
                
        if(dropout):
            layer_1 *= np.random.binomial([np.ones((len(X),hidden_neurons))],1-dropout_percent)[0] * (1.0/(1-dropout_percent))

        layer_2 = sigmoid(np.dot(layer_1, synapse_1))

        # how much did we miss the target value?
        layer_2_error = y - layer_2
        

        if (j% 10000) == 0 and j > 5000:
            # if this 10k iteration's error is greater than the last iteration, break out
            if np.mean(np.abs(layer_2_error)) < last_mean_error:
                print ("delta after "+str(j)+" iterations:" + str(np.mean(np.abs(layer_2_error))) )
                last_mean_error = np.mean(np.abs(layer_2_error))
            else:
                print ("break:", np.mean(np.abs(layer_2_error)), ">", last_mean_error )
                break
                
        # in what direction is the target value?
        # were we really sure? if so, don't change too much.
        layer_2_delta = layer_2_error * sigmoidDerivative(layer_2)

        # how much did each l1 value contribute to the l2 error (according to the weights)?
        layer_1_error = layer_2_delta.dot(synapse_1.T)

        # in what direction is the target l1?
        # were we really sure? if so, don't change too much.
        layer_1_delta = layer_1_error * sigmoidDerivative(layer_1)
        
        synapse_1_weight_update = (layer_1.T.dot(layer_2_delta))
        synapse_0_weight_update = (layer_0.T.dot(layer_1_delta))
        
        if(j > 0):
            synapse_0_direction_count += np.abs(((synapse_0_weight_update > 0)+0) - ((prev_synapse_0_weight_update > 0) + 0))
            synapse_1_direction_count += np.abs(((synapse_1_weight_update > 0)+0) - ((prev_synapse_1_weight_update > 0) + 0))        
        
        synapse_1 += alpha * synapse_1_weight_update
        synapse_0 += alpha * synapse_0_weight_update
        
        prev_synapse_0_weight_update = synapse_0_weight_update
        prev_synapse_1_weight_update = synapse_1_weight_update

    now = datetime.datetime.now()

    # persist synapses
    synapse = {'synapse0': synapse_0.tolist(), 'synapse1': synapse_1.tolist(),
               'datetime': now.strftime("%Y-%m-%d %H:%M"),
               'words': corpus_words,
               'intents': intents
              }
    synapse_file = "synapses.json"

    with open(synapse_file, 'w') as outfile:
        json.dump(synapse, outfile, indent=4, sort_keys=True)
    print ("saved synapses to:", synapse_file)

In [118]:
X = np.array(training_input)
y = np.array(training_output)
#print(training)
#print(output)

start_time = time.time()

train(X, y, hidden_neurons=20, alpha=0.1, epochs=100000, dropout=False, dropout_percent=0.2)

elapsed_time = time.time() - start_time
print ("processing time:", elapsed_time, "seconds")

Training with 20 neurons, alpha:0.1, dropout:False 
Input matrix: 36x61    Output matrix: 1x4
delta after 10000 iterations:0.00445092739035
delta after 20000 iterations:0.00304771516708
delta after 30000 iterations:0.00244877799078
delta after 40000 iterations:0.00209866734213
delta after 50000 iterations:0.00186283533551
delta after 60000 iterations:0.00169042242308
delta after 70000 iterations:0.00155743602285
delta after 80000 iterations:0.00145090290813
delta after 90000 iterations:0.00136312106247
delta after 100000 iterations:0.00128919457373
saved synapses to: synapses.json
processing time: 10.65522837638855 seconds


In [119]:
# probability threshold
ERROR_THRESHOLD = 0.2
# load our calculated synapse values
synapse_file = 'synapses.json' 
with open(synapse_file) as data_file: 
    synapse = json.load(data_file) 
    synapse_0 = np.asarray(synapse['synapse0']) 
    synapse_1 = np.asarray(synapse['synapse1'])

def intentClassifier(sentence, show_details=False):
    results = setSynapses(sentence, show_details)
    #print(results)
    results = [[i,r] for i,r in enumerate(results) if r>ERROR_THRESHOLD ] 
    results.sort(key=lambda x: x[1], reverse=True) 
    return_results =[[intents[r[0]],r[1]] for r in results]
    #print ("%s \n classification: %s" % (sentence, return_results))
    return return_results

# classify("show me a mexicon place in the center")
# classify("how are you today?")
# classify("talk to you tomorrow, bye")
# classify("search thai cuisine in city")
# classify("get me some lunch")
# print ()
print(intentClassifier("looking for a dinner place near city center?", show_details=True))


found in bag: look
found in bag: for
found in bag: a
found in bag: plac
found in bag: near
found in bag: city
found in bag: cent
sentence: looking for a dinner place near city center? 
 bag of words: [0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1
 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]
[['restaurant_search', 0.96959436876859351]]
