In [53]:
import nltk
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import json
from joblib import dump

In [54]:
with open("../data/foods_data.json") as f:
    content = json.load(f)

examples = []
for item in content:
    for i in item["inputs"]:
        examples.append([i.lower(), item["class"].lower()])

examples

[['i would like margherita', 'margherita'],
 ['i would like one margherita', 'margherita'],
 ['i would like two margheritas', 'margherita'],
 ['margherita, please', 'margherita'],
 ['one margherita, please', 'margherita'],
 ['two margheritas, please', 'margherita'],
 ['margherita', 'margherita'],
 ['one margherita', 'margherita'],
 ['two margheritas', 'margherita'],
 ['please deliver me a margherita', 'margherita'],
 ['please deliver me two margheritas', 'margherita'],
 ['i want margherita to be delivered to me', 'margherita'],
 ['i want to order margherita', 'margherita'],
 ['i would like formaggio', 'formaggio'],
 ['i would like one formaggio', 'formaggio'],
 ['i would like two formaggios', 'formaggio'],
 ['formaggio, please', 'formaggio'],
 ['one formaggio, please', 'formaggio'],
 ['two formaggios, please', 'formaggio'],
 ['formaggio', 'formaggio'],
 ['one formaggio', 'formaggio'],
 ['two formaggios', 'formaggio'],
 ['please deliver me a formaggio', 'formaggio'],
 ['please deliver m

In [55]:
df = pd.DataFrame(examples, columns=["inputs", "class"])
df

Unnamed: 0,inputs,class
0,i would like margherita,margherita
1,i would like one margherita,margherita
2,i would like two margheritas,margherita
3,"margherita, please",margherita
4,"one margherita, please",margherita
...,...,...
112,two garlic breads,garlic bread
113,please deliver me a garlic bread,garlic bread
114,please deliver me two garlic breads,garlic bread
115,i want garlic bread to be delivered to me,garlic bread


In [56]:
le = LabelEncoder()
le.fit(df["class"])

LabelEncoder()

In [57]:
df["labels"] = le.transform(df["class"])
df

Unnamed: 0,inputs,class,labels
0,i would like margherita,margherita,5
1,i would like one margherita,margherita,5
2,i would like two margheritas,margherita,5
3,"margherita, please",margherita,5
4,"one margherita, please",margherita,5
...,...,...,...
112,two garlic breads,garlic bread,3
113,please deliver me a garlic bread,garlic bread,3
114,please deliver me two garlic breads,garlic bread,3
115,i want garlic bread to be delivered to me,garlic bread,3


In [58]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["inputs"])

In [59]:
X.shape

(117, 34)

In [60]:
X_train, X_test, y_train, y_test = train_test_split(X, df["labels"], test_size=0.2, random_state=0)

In [61]:
classifier = MultinomialNB(alpha=0.99)
classifier.fit(X_train.toarray(), y_train)

MultinomialNB(alpha=0.99)

In [62]:
preds = []
for row in X_test:
    result = classifier.predict(row.toarray())
    preds.append(result)

preds

[array([5]),
 array([7]),
 array([0]),
 array([0]),
 array([5]),
 array([1]),
 array([6]),
 array([3]),
 array([2]),
 array([0]),
 array([0]),
 array([5]),
 array([4]),
 array([5]),
 array([3]),
 array([7]),
 array([0]),
 array([8]),
 array([7]),
 array([8]),
 array([2]),
 array([3]),
 array([2]),
 array([4])]

In [63]:
acc_score = accuracy_score(y_test, preds)
acc_score

0.9583333333333334

In [64]:
cm = confusion_matrix(y_test, preds)
cm

array([[5, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 3, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 3, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 2, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 3, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 3, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 2]])

In [65]:
model_path = "../models/foods_clf_mnb.joblib"
dump(classifier, model_path)

['../models/foods_clf_mnb.joblib']