In [1]:
import nltk
import json
import pandas as pd
import numpy as np
import re 
import matplotlib.pyplot as plt
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

### Import and load the data file

In [2]:
with open('intents.json', 'r') as f:
    intents = json.load(f)

In [3]:
# getting all the data to lists
tags = []
patterns = []
labels = []
for intent in intents['intents']:
    for lines in intent['patterns']:
        patterns.append(lines)
        tags.append(intent['tag'])

In [4]:
#converting to dataframe
df = pd.DataFrame({"patterns":patterns,
                     "tags":tags})
df

Unnamed: 0,patterns,tags
0,Hi,greetings
1,Hey,greetings
2,Hello,greetings
3,Hey there,greetings
4,Hello restaurant Taiwan,greetings
...,...,...
193,How big is the restaurant?,seats
194,How many seats,seats
195,seats,seats
196,,unknown


### Test train split

In [5]:
X = df['patterns']
y = df['tags']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
v = dict(zip(list(y), df['tags'].to_list()))

### Preprocessing

In [6]:
def text_preprocessing(text, language):
    
    punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    
    text_alpha_chars = ""
    for char in text:
        if char not in punctuations:
            text_alpha_chars = text_alpha_chars + char
    
     # convert to lower-case
    text_lower = text_alpha_chars.lower()
    
    
    # remove stop words
    stops = set(stopwords.words(language)) 
    text_no_stop_words = ''  
    
    for word in text_lower.split():
        if word not in stops:  
            text_no_stop_words = text_no_stop_words + word + ' '
         
    # do stemming, porterstemmer
    text_stemmer = ' '
    stemmer = PorterStemmer()
    for w in text_no_stop_words.split():
        text_stemmer = text_stemmer + stemmer.stem(w) + ' '
        
    return text_stemmer
 


In [7]:
language = 'english'
text_prep = np.empty

for i in range(X_train.shape[0]):
    X_train.iloc[i] = text_preprocessing(X_train.iloc[i], language)
    
    
for i in range(X_test.shape[0]):
    X_test.iloc[i] = text_preprocessing(X_test.iloc[i], language)


### Convert to bag-of-words

In [8]:
count_vect = CountVectorizer() 
X_train_bag_of_words = count_vect.fit_transform(X_train)
X_test_bag_of_words = count_vect.transform(X_test)


tfidf_transformer = TfidfTransformer()  
tf_transformer = TfidfTransformer(use_idf=True).fit(X_train_bag_of_words)
X_train_tf = tf_transformer.transform(X_train_bag_of_words)
X_test_tf = tf_transformer.transform(X_test_bag_of_words)

### Building the model


#### Machine learning algorithms

#### Logistic regression

In [9]:
lregclassifier = LogisticRegression(C=10)

lregclassifier.fit(X_train_tf, y_train)

y_pred = lregclassifier.predict(X_test_tf)
print(classification_report(y_test, y_pred))

cf = confusion_matrix(y_test, y_pred)
print(cf)
print(accuracy_score(y_test, y_pred) * 100) 

                 precision    recall  f1-score   support

      allergies       1.00      0.20      0.33         5
 appetizeritems       1.00      1.00      1.00         2
       delivery       1.00      1.00      1.00         1
       location       0.00      0.00      0.00         1
makereservation       1.00      1.00      1.00         1
      meatitems       1.00      1.00      1.00         2
    noodleitems       1.00      1.00      1.00         1
   openinghours       1.00      1.00      1.00         1
          order       0.58      1.00      0.73        11
       payments       1.00      0.25      0.40         4
   poultryitems       1.00      1.00      1.00         3
    reservation       1.00      1.00      1.00         1
       showmenu       1.00      1.00      1.00         1
      soupitems       1.00      1.00      1.00         3
     spicyitems       1.00      1.00      1.00         1
vegetarianitems       1.00      1.00      1.00         2

       accuracy              

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
print("score on train: "+ str(lregclassifier.score(X_train_tf, y_train) * 100))
print("score on test: " + str(lregclassifier.score(X_test_tf, y_test) * 100))

score on train: 98.73417721518987
score on test: 80.0


#### Hyperparameter tuning

#### Grid search

In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
paramaters = [
             {'C' : [0.25, 0.5, 0.75, 1, 10, 50, 100], 
              'solver': ['liblinear','lbfgs', 'newton-cg', 'sag','saga'], 
              'class_weight': [None, 'balanced']},
             ]
grid_search = GridSearchCV(estimator = model, 
                           param_grid = paramaters,
                           scoring = 'accuracy',
                           cv = 4,
                           n_jobs = -1,
                           verbose=5)
grid_search = grid_search.fit(X_train_tf, y_train)

best_accuracy = grid_search.best_score_ 
best_parameters = grid_search.best_params_  

print('Best accuracy : ', grid_search.best_score_)
print('Best parameters :', grid_search.best_params_  )



Fitting 4 folds for each of 70 candidates, totalling 280 fits
Best accuracy :  0.7661858974358975
Best parameters : {'C': 50, 'class_weight': None, 'solver': 'liblinear'}


#### Random search

In [12]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint 
from scipy.stats import uniform
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

parameters = [
             {'C' : [0.25, 0.5, 0.75, 1, 10, 50, 100], 
              'solver': ['liblinear','lbfgs', 'newton-cg', 'sag','saga'],
              'class_weight': [None, 'balanced']}
             ]

              

n_iter_search = 20

random_search = RandomizedSearchCV(model, param_distributions=parameters,cv=4,n_iter=n_iter_search,n_jobs = -1,verbose=1)

random_search = random_search.fit(X_train_tf, y_train)

best_accuracy = random_search.best_score_ 
best_parameters = random_search.best_params_  

print('Best accuracy : ', random_search.best_score_)
print('Best parameters :',random_search.best_params_  )

Fitting 4 folds for each of 20 candidates, totalling 80 fits
Best accuracy :  0.7661858974358975
Best parameters : {'solver': 'liblinear', 'class_weight': None, 'C': 50}




#### Naive Bayes

In [13]:
from sklearn.naive_bayes import MultinomialNB

NBclassifier = MultinomialNB(alpha=1)

NBclassifier.fit(X_train_tf, y_train)

y_pred = NBclassifier.predict(X_test_tf)
print(classification_report(y_test, y_pred))

cf = confusion_matrix(y_test, y_pred)
print(cf)
print(accuracy_score(y_test, y_pred) * 100)

                 precision    recall  f1-score   support

      allergies       0.00      0.00      0.00         5
 appetizeritems       0.00      0.00      0.00         2
       delivery       0.00      0.00      0.00         1
       location       0.00      0.00      0.00         1
makereservation       1.00      1.00      1.00         1
      meatitems       0.00      0.00      0.00         2
    noodleitems       0.00      0.00      0.00         1
   openinghours       0.00      0.00      0.00         1
          order       0.29      1.00      0.45        11
       payments       0.00      0.00      0.00         4
   poultryitems       0.00      0.00      0.00         3
    reservation       0.00      0.00      0.00         1
       showmenu       0.00      0.00      0.00         1
      soupitems       0.00      0.00      0.00         3
     spicyitems       1.00      1.00      1.00         1
vegetarianitems       0.00      0.00      0.00         2

       accuracy              

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
print("score on train: "+ str(NBclassifier.score(X_train_tf, y_train) * 100))
print("score on test: " + str(NBclassifier.score(X_test_tf, y_test) * 100))

score on train: 48.734177215189874
score on test: 32.5


#### Hyperparameter tuning

#### Grid search

In [15]:
from sklearn.model_selection import GridSearchCV


model = MultinomialNB()
paramaters = [
             {'alpha' : [0.001, 0.01, 0.1, 1, 10, 100, 1000]}                                       
             ]

grid_search = GridSearchCV(estimator = model, 
                           param_grid = paramaters,
                           scoring = 'accuracy',
                           cv = 4,
                           n_jobs = -1,
                           verbose=5)
grid_search = grid_search.fit(X_train_tf, y_train)

best_accuracy = grid_search.best_score_ 
best_parameters = grid_search.best_params_  

print('Best accuracy : ', grid_search.best_score_)
print('Best parameters :', grid_search.best_params_  )

Fitting 4 folds for each of 7 candidates, totalling 28 fits
Best accuracy :  0.6969551282051282
Best parameters : {'alpha': 0.1}




#### Random search

In [16]:

model = MultinomialNB()
parameters = [
             {'alpha' : [0.001, 0.01, 0.1, 1, 10, 100, 1000]}                                       
             ]

n_iter_search = 20

random_search = RandomizedSearchCV(model, param_distributions=parameters,cv=4,n_iter=n_iter_search,n_jobs = -1,verbose=1)

random_search = random_search.fit(X_train_tf, y_train)

best_accuracy = random_search.best_score_ 
best_parameters = random_search.best_params_  

print('Best accuracy : ', random_search.best_score_)
print('Best parameters :',random_search.best_params_  )

Fitting 4 folds for each of 7 candidates, totalling 28 fits
Best accuracy :  0.6969551282051282
Best parameters : {'alpha': 0.1}




### SVM

In [17]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train_tf, y_train)

y_pred = svc.predict(X_test_tf)
print(classification_report(y_test, y_pred))

cf = confusion_matrix(y_test, y_pred)
print(cf)
print(accuracy_score(y_test, y_pred) * 100)

                 precision    recall  f1-score   support

      allergies       0.00      0.00      0.00         5
 appetizeritems       1.00      0.50      0.67         2
       delivery       1.00      1.00      1.00         1
       location       0.00      0.00      0.00         1
makereservation       1.00      1.00      1.00         1
      meatitems       1.00      1.00      1.00         2
    noodleitems       1.00      1.00      1.00         1
   openinghours       1.00      1.00      1.00         1
          order       0.46      1.00      0.63        11
       payments       1.00      0.25      0.40         4
   poultryitems       1.00      0.67      0.80         3
    reservation       1.00      1.00      1.00         1
       showmenu       0.00      0.00      0.00         1
      soupitems       1.00      0.67      0.80         3
     spicyitems       1.00      1.00      1.00         1
vegetarianitems       1.00      1.00      1.00         2

       accuracy              

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
print("score on train: "+ str(svc.score(X_train_tf, y_train) * 100))
print("score on test: " + str(svc.score(X_test_tf, y_test) * 100))

score on train: 96.20253164556962
score on test: 67.5


#### Hyperparameter tuning

#### Grid search

In [19]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

model = SVC()

paramaters = [ 
        {'kernel': ['linear'], 
         'C': np.linspace(0.01,20,10), 
         'degree': [2,3,4], 
         'class_weight': [None, 'balanced']},
        {'kernel': ['rbf', 'poly', 'sigmoid'], 
         'C': np.linspace(0.01,20,10), 
         'gamma': [0.0001, 0.001, 0.01, 0.1, 0.2], 
         'degree': [2,3,4], 
         'class_weight': [None, 'balanced']},
]

grid_search = GridSearchCV(estimator = model, 
                           param_grid = paramaters,
                           scoring = 'accuracy', 
                           cv = 4,
                           n_jobs = -1,
                           verbose =5)
grid_search = grid_search.fit(X_train_tf, y_train)

best_accuracy = grid_search.best_score_ 
best_parameters = grid_search.best_params_  
print('Best accuracy : ', grid_search.best_score_)
print('Best parameters :', grid_search.best_params_  )

Fitting 4 folds for each of 960 candidates, totalling 3840 fits




Best accuracy :  0.7533653846153846
Best parameters : {'C': 11.115555555555554, 'class_weight': None, 'degree': 2, 'gamma': 0.1, 'kernel': 'rbf'}


#### Random search

In [20]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint 
from scipy.stats import uniform



model = SVC()

parameters = [
    {'kernel': ['linear'], 
     'C': uniform(0.01, 20), 
     'degree': [2,3,4], 
     'class_weight': [None, 'balanced']
    },
    {'kernel': ['rbf', 'poly', 'sigmoid'], 
     'C': uniform(0.01, 20), 
     'gamma': uniform(0.001, 0.2), 
     'degree': [2,3,4], 
     'class_weight': [None, 'balanced']}
]
 
n_iter_search = 20

random_search = RandomizedSearchCV(model, param_distributions=parameters,cv=4,n_iter=n_iter_search,n_jobs = -1,verbose=1)

random_search = random_search.fit(X_train_tf, y_train)

best_accuracy = random_search.best_score_ 
best_parameters = random_search.best_params_  

print('Best accuracy : ', random_search.best_score_)
print('Best parameters :',random_search.best_params_  )

Fitting 4 folds for each of 20 candidates, totalling 80 fits
Best accuracy :  0.7471153846153846
Best parameters : {'C': 5.196139408781772, 'class_weight': None, 'degree': 2, 'kernel': 'linear'}




#### Decision tree

In [21]:
from sklearn.tree import DecisionTreeClassifier

decisiontree = DecisionTreeClassifier(criterion='entropy')
decisiontree.fit(X_train_tf, y_train)

y_pred = decisiontree.predict(X_test_tf)
print(classification_report(y_test, y_pred))

cf = confusion_matrix(y_test, y_pred)
print(cf)
print(accuracy_score(y_test, y_pred) * 100)

                 precision    recall  f1-score   support

      allergies       0.00      0.00      0.00         5
 appetizeritems       1.00      1.00      1.00         2
       delivery       1.00      1.00      1.00         1
       location       0.00      0.00      0.00         1
makereservation       1.00      1.00      1.00         1
      meatitems       1.00      1.00      1.00         2
    noodleitems       0.00      0.00      0.00         1
   openinghours       1.00      1.00      1.00         1
          order       1.00      1.00      1.00        11
       payments       1.00      0.25      0.40         4
   poultryitems       1.00      0.67      0.80         3
    reservation       1.00      1.00      1.00         1
       showmenu       1.00      1.00      1.00         1
      soupitems       1.00      1.00      1.00         3
     spicyitems       1.00      1.00      1.00         1
        unknown       0.00      0.00      0.00         0
vegetarianitems       1.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
print("score on train: "+ str(decisiontree.score(X_train_tf, y_train) * 100))
print("score on test: " + str(decisiontree.score(X_test_tf, y_test) * 100))

score on train: 99.36708860759494
score on test: 70.0


#### hyperparemeter tuning

#### Grid search

In [23]:
model = DecisionTreeClassifier()
parameters = [
             {'criterion' : ['gini', 'entropy'],
              'max_depth':[5, 10, 100, 1000] }                                       
             ]

n_iter_search = 20

grid_search = GridSearchCV(estimator = model, 
                           param_grid = parameters,
                           scoring = 'accuracy', 
                           cv = 4,
                           n_jobs = -1,
                           verbose =5)
grid_search = grid_search.fit(X_train_tf, y_train)

best_accuracy = grid_search.best_score_ 
best_parameters = grid_search.best_params_  
print('Best accuracy : ', grid_search.best_score_)
print('Best parameters :', grid_search.best_params_  )

Fitting 4 folds for each of 8 candidates, totalling 32 fits
Best accuracy :  0.6009615384615384
Best parameters : {'criterion': 'gini', 'max_depth': 1000}




#### Random search

In [24]:
model = DecisionTreeClassifier()
parameters = [
             {'criterion' : ['gini', 'entropy'],
              'max_depth':[5, 10, 100, 1000] }                                       
             ]

n_iter_search = 20

random_search = RandomizedSearchCV(model, param_distributions=parameters,cv=4,n_iter=n_iter_search,n_jobs = -1,verbose=1)

random_search = random_search.fit(X_train_tf, y_train)

best_accuracy = random_search.best_score_ 
best_parameters = random_search.best_params_  

print('Best accuracy : ', random_search.best_score_)
print('Best parameters :',random_search.best_params_  )

Fitting 4 folds for each of 8 candidates, totalling 32 fits
Best accuracy :  0.5947115384615385
Best parameters : {'max_depth': 100, 'criterion': 'gini'}




#### Random forest

In [25]:
from sklearn.ensemble import RandomForestClassifier

randomforest = RandomForestClassifier(n_estimators = 10, criterion = 'entropy')
randomforest.fit(X_train_tf, y_train)

y_pred = randomforest.predict(X_test_tf)
print(classification_report(y_test, y_pred))

cf = confusion_matrix(y_test, y_pred)
print(cf)
print(accuracy_score(y_test, y_pred) * 100)

                 precision    recall  f1-score   support

      allergies       0.00      0.00      0.00         5
 appetizeritems       1.00      0.50      0.67         2
       delivery       1.00      1.00      1.00         1
       location       0.00      0.00      0.00         1
makereservation       1.00      1.00      1.00         1
      meatitems       0.67      1.00      0.80         2
    noodleitems       0.00      0.00      0.00         1
   openinghours       1.00      1.00      1.00         1
          order       1.00      1.00      1.00        11
       payments       0.00      0.00      0.00         4
   poultryitems       0.00      0.00      0.00         3
    reservation       1.00      1.00      1.00         1
       showmenu       1.00      1.00      1.00         1
      soupitems       1.00      0.67      0.80         3
     spicyitems       1.00      1.00      1.00         1
        unknown       0.00      0.00      0.00         0
vegetarianitems       1.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [26]:
print("score on train: "+ str(randomforest.score(X_train_tf, y_train) * 100))
print("score on test: " + str(randomforest.score(X_test_tf, y_test) * 100))

score on train: 98.73417721518987
score on test: 57.49999999999999


#### Hyperparameter tuning

#### Grid search

In [27]:
model = RandomForestClassifier()
parameters = [
             {'criterion' : ['gini', 'entropy'],
              'n_estimators':[10, 100, 1000] }                                       
             ]

n_iter_search = 20

grid_search = GridSearchCV(estimator = model, 
                           param_grid = parameters,
                           scoring = 'accuracy', 
                           cv = 4,
                           n_jobs = -1,
                           verbose =5)
grid_search = grid_search.fit(X_train_tf, y_train)

best_accuracy = grid_search.best_score_ 
best_parameters = grid_search.best_params_  
print('Best accuracy : ', grid_search.best_score_)
print('Best parameters :', grid_search.best_params_  )

Fitting 4 folds for each of 6 candidates, totalling 24 fits




Best accuracy :  0.6389423076923076
Best parameters : {'criterion': 'gini', 'n_estimators': 100}


#### Random search

In [28]:
model = RandomForestClassifier()
parameters = [
             {'criterion' : ['gini', 'entropy'],
              'n_estimators':[10, 100, 1000] }                                       
             ]

n_iter_search = 20

random_search = RandomizedSearchCV(model, param_distributions=parameters,cv=4,n_iter=n_iter_search,n_jobs = -1,verbose=1)

random_search = random_search.fit(X_train_tf, y_train)

best_accuracy = random_search.best_score_ 
best_parameters = random_search.best_params_  

print('Best accuracy : ', random_search.best_score_)
print('Best parameters :',random_search.best_params_  )

Fitting 4 folds for each of 6 candidates, totalling 24 fits




Best accuracy :  0.6325320512820513
Best parameters : {'n_estimators': 1000, 'criterion': 'gini'}


### Evaluate the model

I chose the logistic regression model for evaluation as it has the highest accuracy

#### Trying model

In [29]:
from sklearn.pipeline import Pipeline

text_lregclassifier = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('lregclassifier', LogisticRegression(C=10)),
])

In [30]:
text_lregclassifier.fit(X_train.to_list(), list(y_train))

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('lregclassifier', LogisticRegression(C=10))])

#### Testing model

In [31]:
X_TEST = X_test.to_list()
Y_TEST = list(y_test)

In [32]:
predicted = text_lregclassifier.predict(X_TEST)

In [33]:
c = 0

for doc, category in zip(X_TEST, predicted):
    
    if c == 5:
        break
    
    
    print(doc)
    print(v[category])
    print("-"*55)
    
    c = c + 1

 kippenfilet met divers groenten 
order
-------------------------------------------------------
 soup dish 
soupitems
-------------------------------------------------------
 amex 
order
-------------------------------------------------------
 meat dish 
meatitems
-------------------------------------------------------
 vegetarian dish 
vegetarianitems
-------------------------------------------------------


#### Accuracy

In [34]:
np.mean((predicted == Y_TEST)* 100)

80.0

#### Prediction

In [35]:
new_sentence = ['Where can I find the restaurant?']

In [36]:
predicted = text_lregclassifier.predict(new_sentence)

In [37]:
v[predicted[0]]

'contact'

In [38]:
new_sentence = ['Which dishes are vegetarian?']

In [39]:
predicted = text_lregclassifier.predict(new_sentence)

In [40]:
v[predicted[0]]

'vegetarianitems'

In [41]:
new_sentence = ['Do you have rice dishes?']

In [42]:
predicted = text_lregclassifier.predict(new_sentence)

In [43]:
v[predicted[0]]

'riceitems'

In [44]:
new_sentence = ['Can you give me the menu?']

In [45]:
predicted = text_lregclassifier.predict(new_sentence)

In [46]:
v[predicted[0]]

'showmenu'