In [98]:
# packages and libraries
import nltk
import json
import random
import numpy as np
import matplotlib.pyplot as plt
import re
import time

from nltk.stem import PorterStemmer
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, BatchNormalization
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# naive bayes
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB

from keras.callbacks import EarlyStopping

stemmer = PorterStemmer()

### Import and load the data file

In [99]:
with open('intents.json', 'r') as f:
    intents = json.load(f)
print(intents)

{'intents': [{'tag': 'greetings', 'patterns': ['Hi', 'Hey', 'Hello', 'Hey there', 'Hello restaurant Taiwan'], 'responses': ['Hello and welcome to restaurant Taiwan', 'Hey there', 'Hi, how can I help you?', 'Welcome to restaurant Taiwan, I am your personal assistant. How can I help you?']}, {'tag': 'openinghours', 'patterns': ['what are the opening hours?', 'When is the restaurant closed?', 'Could you please give me the opening hours of the restaurant?', 'When is the restaurant open?'], 'responses': ['The restaurant is open from Thursday-Sunday from 5pm-10:30pm']}, {'tag': 'payments', 'patterns': ['Can i pay with credit card', 'Cash', 'Google Pay', 'Apple Pay', 'AMEX', 'Debit', 'Paypal', 'what are the payment methods?'], 'responses': ['We accept the following payment methods: Cash, VISA, Mastercard']}, {'tag': 'reservation', 'patterns': ['I want to make a reservation', 'Can you please help me to make a reservation?', 'Would you help me to make a reservation?'], 'responses': ['Sure, for 

### Preprocessing

In [100]:
training_sentences = []
training_sentences_stem = []
training_labels = []
labels = []
documents = []
#responses = []

In [101]:
sentence = "Artificial intelligence is a wide topic, and NLP is one of the subsets."

In [102]:
def tokenize(sentence):
    token = nltk.word_tokenize(sentence)
    return token

#def remove_punctuation_marks(token):
#    output = []
#    output = [k for k in token if k.isalpha()]
#    return output

#def remove_punctuation_marks(word):
#    output = re.sub("[^a-zA-Z']", " ", str(word))
#    return output

def stem(word):
    return stemmer.stem(word.lower())

def bag_of_words(tokenized_sentence, training_sentences, doc):
    
    tok_sens = []
    bag = []
    for w in tokenized_sentence:
        word = stem(w)
        tok_sens.append(word)
        
    for w in training_sentences:
        bag.append(1) if w in tok_sens else bag.append(0)
    #print(bag)
    
    # output is a '0' for each tag and '1' for current tag (for each pattern)
    output_row = list(output_empty)
    output_row[labels.index(doc[1])] = 1
    training.append([bag, output_row])
            
    return bag

In [103]:
#sentence = tokenize(sentence)
#print(sentence)

In [104]:
#sentence = remove_punctuation_marks(sentence)
#print(sentence)

In [105]:
for intent in intents['intents']:
    # add to our labels
    training_label = intent['tag']
    training_labels.append(training_label)
    for pattern in intent['patterns']:
        # tokenize each word
        word = tokenize(pattern)
        training_sentences.extend(word)
        # add document in the corpus
        documents.append((word, training_label))

In [106]:
# print(documents)

In [107]:
#for w in training_sentences:
#    test = remove_punctuation_marks(w)
#    print(test)

In [108]:
ignore_words = ['?', '!', '.', ',']

for w in training_sentences:
    # remove punctuations
    if w not in ignore_words:
        # stem the words 
        word = stem(w)
        training_sentences_stem.append(word)

print(training_sentences_stem)

['hi', 'hey', 'hello', 'hey', 'there', 'hello', 'restaur', 'taiwan', 'what', 'are', 'the', 'open', 'hour', 'when', 'is', 'the', 'restaur', 'close', 'could', 'you', 'pleas', 'give', 'me', 'the', 'open', 'hour', 'of', 'the', 'restaur', 'when', 'is', 'the', 'restaur', 'open', 'can', 'i', 'pay', 'with', 'credit', 'card', 'cash', 'googl', 'pay', 'appl', 'pay', 'amex', 'debit', 'paypal', 'what', 'are', 'the', 'payment', 'method', 'i', 'want', 'to', 'make', 'a', 'reserv', 'can', 'you', 'pleas', 'help', 'me', 'to', 'make', 'a', 'reserv', 'would', 'you', 'help', 'me', 'to', 'make', 'a', 'reserv', 'i', 'would', 'like', 'to', 'make', 'a', 'reserv', 'for', '2', 'person', 'i', 'would', 'like', 'to', 'make', 'a', 'reserv', 'for', '3', 'person', 'i', 'would', 'like', 'to', 'make', 'a', 'reserv', 'for', '4', 'person', 'i', 'would', 'like', 'to', 'make', 'a', 'reserv', 'for', '4', 'person', '1', '2', '3', '4', '5', '6', 'i', 'want', 'food', 'i', 'want', 'to', 'order', 'food', 'asap', 'can', 'you', 'ple

In [109]:
print(training_labels)

['greetings', 'openinghours', 'payments', 'reservation', 'makereservation', 'placeorder', 'order', 'location', 'userthanks', 'goodbye', 'contact', 'soupitems', 'appetizeritems', 'poultryitems', 'riceitems', 'specialitiesitems', 'spicyitems', 'vegetarianitems', 'alcohol', 'delivery', 'menu', 'seats']


In [110]:
# remove duplicates and sort it

training_sentences = sorted(set(training_sentences_stem))
labels = sorted(set(training_labels))


print(training_sentences)
print(labels)

['1', '2', '3', '4', '5', '6', 'a', 'address', 'alcohol', 'amex', 'an', 'and', 'ani', 'appet', 'appl', 'are', 'asap', 'beer', 'big', 'bye', 'can', 'car', 'card', 'cash', 'chines', 'close', 'cocktail', 'contact', 'could', 'credit', 'debit', 'deliv', 'deliveri', 'dish', 'do', 'drink', 'find', 'food', 'for', 'give', 'googl', 'goreng', 'have', 'hello', 'help', 'hey', 'hi', 'home', 'hour', 'how', 'i', 'inform', 'is', 'kip', 'like', 'liquor', 'locat', 'loempia', 'lot', 'make', 'mani', 'map', 'me', 'menu', 'met', 'method', 'more', 'much', 'my', 'nasi', 'of', 'ok', 'open', 'order', 'pay', 'payment', 'paypal', 'person', 'place', 'pleas', 'poultri', 'provid', 'reserv', 'restaur', 'rice', 'seat', 'see', 'so', 'soup', 'special', 'spici', 'spirit', 'taiwan', 'take', 'thank', 'the', 'there', 'to', 'tomatensoep', 'train', 'transport', 'vegetarian', 'want', 'well', 'what', 'when', 'where', 'which', 'wine', 'with', 'would', 'ya', 'you', 'your']
['alcohol', 'appetizeritems', 'contact', 'delivery', 'good

### Create training and test data

In [112]:
# create training data

training = []

# create an empty array for the output
output_empty = [0] * len(labels)
print(output_empty)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [113]:
for doc in documents:
    pattern_sentence = doc[0]
    bag = bag_of_words(pattern_sentence, training_sentences, doc)

# shuffle our features and turn into np.array
# shuffling randomly and converting into numpy array for faster processing
random.shuffle(training)
training = np.array(training)

# create train and test lists. X - patterns, Y - intents
# creating train and test lists
X = list(training[:,0])
y = list(training[:,1])
print("Training data created")

Training data created


  training = np.array(training)


In [114]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### Build the model

### Naive Bayes

In [116]:
NBclassifier = MultinomialNB(alpha=1)

NBclassifier.fit(X_train,y_train)

cf = confusion_matrix(y_test, y_pred)
print(cf)
print(accuracy_score(y_test, y_pred) * 100)


ValueError: y should be a 1d array, got an array of shape (76, 22) instead.