In [23]:
import nltk
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import json
from joblib import dump

In [2]:
with open("../data/foods_data.json") as f:
    content = json.load(f)

examples = []
for item in content:
    for i in item["inputs"]:
        examples.append([i, item["class"]])

examples

[['I would like Margherita', 'Margherita'],
 ['I would like one Margherita', 'Margherita'],
 ['I would like two Margheritas', 'Margherita'],
 ['Margherita, please', 'Margherita'],
 ['One Margherita, please', 'Margherita'],
 ['Two Margheritas, please', 'Margherita'],
 ['Margherita', 'Margherita'],
 ['One Margherita', 'Margherita'],
 ['Two Margheritas', 'Margherita'],
 ['Please deliver me a Margherita', 'Margherita'],
 ['Please deliver me two Margheritas', 'Margherita'],
 ['I want Margherita to be delivered to me', 'Margherita'],
 ['I want to order Margherita', 'Margherita'],
 ['I would like Formaggio', 'Formaggio'],
 ['I would like one Formaggio', 'Formaggio'],
 ['I would like two Formaggios', 'Formaggio'],
 ['Formaggio, please', 'Formaggio'],
 ['One Formaggio, please', 'Formaggio'],
 ['Two Formaggios, please', 'Formaggio'],
 ['Formaggio', 'Formaggio'],
 ['One Formaggio', 'Formaggio'],
 ['Two Formaggios', 'Formaggio'],
 ['Please deliver me a Formaggio', 'Formaggio'],
 ['Please deliver m

In [6]:
df = pd.DataFrame(examples, columns=["inputs", "class"])
df

Unnamed: 0,inputs,class
0,I would like Margherita,Margherita
1,I would like one Margherita,Margherita
2,I would like two Margheritas,Margherita
3,"Margherita, please",Margherita
4,"One Margherita, please",Margherita
...,...,...
112,Two Garlic breads,Garlic bread
113,Please deliver me a Garlic bread,Garlic bread
114,Please deliver me two Garlic breads,Garlic bread
115,I want Garlic bread to be delivered to me,Garlic bread


In [7]:
le = LabelEncoder()
le.fit(df["class"])

LabelEncoder()

In [9]:
df["labels"] = le.transform(df["class"])
df

Unnamed: 0,inputs,class,labels
0,I would like Margherita,Margherita,5
1,I would like one Margherita,Margherita,5
2,I would like two Margheritas,Margherita,5
3,"Margherita, please",Margherita,5
4,"One Margherita, please",Margherita,5
...,...,...,...
112,Two Garlic breads,Garlic bread,3
113,Please deliver me a Garlic bread,Garlic bread,3
114,Please deliver me two Garlic breads,Garlic bread,3
115,I want Garlic bread to be delivered to me,Garlic bread,3


In [11]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["inputs"])

<117x34 sparse matrix of type '<class 'numpy.float64'>'
	with 434 stored elements in Compressed Sparse Row format>

In [12]:
X.shape

(117, 34)

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, df["labels"], test_size=0.2, random_state=0)

In [19]:
classifier = GaussianNB()
classifier.fit(X_train.toarray(), y_train)

GaussianNB()

In [20]:
preds = []
for row in X_test:
    result = classifier.predict(row.toarray())
    preds.append(result)

preds

[array([5]),
 array([7]),
 array([0]),
 array([0]),
 array([5]),
 array([1]),
 array([6]),
 array([3]),
 array([2]),
 array([0]),
 array([0]),
 array([5]),
 array([7]),
 array([7]),
 array([3]),
 array([7]),
 array([0]),
 array([8]),
 array([7]),
 array([8]),
 array([2]),
 array([3]),
 array([2]),
 array([4])]

In [21]:
acc_score = accuracy_score(y_test, preds)
acc_score

0.9166666666666666

In [24]:
cm = confusion_matrix(y_test, preds)
cm

array([[5, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 3, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 3, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 3, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 3, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 2]])

In [25]:
model_path = "../models/foods_clf_gnb.joblib"
dump(classifier, model_path)

['../models/foods_clf_gnb.joblib']