In [173]:
# using natural language toolkit
import nltk
import json
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
#from nltk.stem.snowball import SnowballStemmer
# word stemmer: one Lancaster stemmer instance, used by the cells below both when
# building the corpus tables and when scoring a sentence
stemmer = LancasterStemmer()
#stemmer1 = SnowballStemmer("english")

In [174]:
# import chat-bot intents training file
# The encoding is given explicitly: without it open() decodes with the platform's
# locale default, which is not UTF-8 everywhere, while JSON text is UTF-8.
with open('intents.json', encoding='utf-8') as json_data:
    # later cells read the list stored under training_data['intent_examples']
    training_data = json.load(json_data)

In [175]:
# Build the two lookup tables used by the scoring cells:
#   corpus_words -> {stemmed token: how many times it occurs in the whole corpus}
#   class_words  -> {intent name: [stemmed tokens collected from that intent's examples]}
corpus_words = {}
class_words = {}

examples = training_data['intent_examples']

# give every intent an empty token list before any tokens are collected
for example in examples:
    class_words[example['intent']] = []

for example in examples:
    intent_tokens = class_words[example['intent']]
    for token in nltk.word_tokenize(example['text']):
        # these two tokens are deliberately left out of both tables
        if token in ("?", "'s"):
            continue
        # the tokenizer yields "'m" for the contraction in "I'm"; store it as "am"
        if token == "'m":
            token = "am"
        # lowercase, then stem
        stem = stemmer.stem(token.lower())
        corpus_words[stem] = corpus_words.get(stem, 0) + 1
        intent_tokens.append(stem)

# the number of occurrences of each word in the training corpus (word frequency)
print("Corpus words and counts: %s" % corpus_words)
# all words collected for each intent-class (duplicates kept)
print("Class words: %s" % class_words)

Corpus words and counts: {'hey': 2, 'hello': 2, ',': 1, 'how': 2, 'ar': 1, 'you': 1, 'is': 1, 'yo': 1, 'day': 2, 'good': 3, 'morn': 1, 'afternoon': 1, 'welcom': 1, 'howdy': 1, 'ther': 1, 'hi': 1, 'i': 3, 'am': 3, 'look': 3, 'for': 4, 'a': 4, 'plac': 3, 'to': 1, 'eat': 1, 'in': 4, 'the': 4, 'nor': 1, 'of': 1, 'town': 1, 'show': 2, 'me': 4, 'chines': 1, 'resta': 3, 'ye': 1, 'ok': 2, 'fin': 1, 'yep': 1, 'yeah': 1, 'okay': 1, 'mex': 1, 'cent': 2, 'bye': 2, 'goodby': 1, 'hav': 1, 'nic': 1, 'stop': 1, 'end': 1, 'an': 1, 'ind': 2, 'spot': 1, 'city': 1, 'search': 1, 'near': 2, 'anywh': 1, 'west': 1, 'indee': 1, 'that': 1, 'right': 1, 'gre': 1}
Class words: {'greet': ['hey', 'hello', ',', 'how', 'ar', 'you', 'how', 'is', 'yo', 'day', 'good', 'morn', 'good', 'afternoon', 'welcom', 'howdy', 'hey', 'ther', 'hello', 'hi'], 'restaurant_search': ['i', 'am', 'look', 'for', 'a', 'plac', 'to', 'eat', 'i', 'am', 'look', 'for', 'a', 'plac', 'in', 'the', 'nor', 'of', 'town', 'show', 'me', 'chines', 'resta'

In [176]:
# calculate a score for a given class taking into account word commonality
def calculate_class_score_commonality(sentence, class_name, show_details=True):
    """Score how well `sentence` matches the intent class `class_name`.

    Each token is normalised the same way the training corpus was built: the
    contraction fragment "'m" is rewritten to "am", then the token is lowercased
    and stemmed. Every token whose stem is listed in ``class_words[class_name]``
    adds ``1 / corpus_words[stem]`` to the score, so stems that are rare in the
    corpus weigh more than common ones. A stem that occurs several times in the
    sentence is counted each time.

    Args:
        sentence: text to score.
        class_name: key of ``class_words`` to score against.
        show_details: when true, print every matching stem with its weight.

    Returns:
        The sum of the weights of all matching tokens; the integer ``0`` when
        nothing matches.

    Raises:
        KeyError: if `class_name` is not a key of ``class_words`` and the
            sentence contains at least one token.
    """
    score = 0
    for word in nltk.word_tokenize(sentence):
        # mirror the corpus-building step; without this "I'm ..." queries never
        # match the "am" stored for "I'm ..." training examples
        if word == "'m":
            word = "am"
        word = stemmer.stem(word.lower())
        if word in class_words[class_name]:
            # inverse corpus frequency: common words contribute less
            weight = 1 / corpus_words[word]
            score += weight
            if show_details:
                print ("   match: %s (%s)" % (word, weight))
    return score

In [177]:
# return the class with highest score for sentence
def find_intent(sentence, show_details=True):
    """Pick the intent class whose score for `sentence` is highest.

    Every key of ``class_words`` is scored with
    ``calculate_class_score_commonality``. A class only becomes the winner when
    its score is strictly greater than the best seen so far, so ties keep the
    class that was scored first, and a sentence with no positive score yields
    ``(None, 0)``.

    Args:
        sentence: text to classify.
        show_details: when true (the default, which keeps the previous output),
            print the per-word matches and one "Class ... Score ..." line per
            class; pass False to classify without printing.

    Returns:
        Tuple ``(best_class, best_score)``.
    """
    high_class = None
    high_score = 0
    for class_name in class_words:
        score = calculate_class_score_commonality(sentence, class_name, show_details)
        if show_details:
            print("Class: %s  Score: %s \n" % (class_name, score))
        # strict comparison: the first class reaching the top score is kept
        if score > high_score:
            high_class, high_score = class_name, score
    return high_class, high_score

In [178]:
# classify a sample query; the cell's value is the (best_class, best_score) tuple
# returned by find_intent, shown after the per-class match details it prints
find_intent("look for good chinese restaurant")

   match: good (0.3333333333333333)
Class: greet  Score: 0.3333333333333333 

   match: look (0.3333333333333333)
   match: for (0.25)
   match: chines (1.0)
   match: resta (0.3333333333333333)
Class: restaurant_search  Score: 1.9166666666666665 

Class: affirm  Score: 0 

   match: good (0.3333333333333333)
Class: goodbye  Score: 0.3333333333333333 



('restaurant_search', 1.9166666666666665)