In [1]:
import nltk
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import json
from joblib import dump

In [2]:
with open("../data/foods_data.json") as f:
    content = json.load(f)

examples = []
for item in content:
    for i in item["inputs"]:
        examples.append([i.lower(), item["class"].lower()])

examples

[['i would like margherita', 'margherita'],
 ['i would like one margherita', 'margherita'],
 ['i would like two margheritas', 'margherita'],
 ['margherita, please', 'margherita'],
 ['one margherita, please', 'margherita'],
 ['two margheritas, please', 'margherita'],
 ['margherita', 'margherita'],
 ['one margherita', 'margherita'],
 ['two margheritas', 'margherita'],
 ['please deliver me a margherita', 'margherita'],
 ['please deliver me two margheritas', 'margherita'],
 ['i want margherita to be delivered to me', 'margherita'],
 ['i want to order margherita', 'margherita'],
 ['i would like formaggio', 'formaggio'],
 ['i would like one formaggio', 'formaggio'],
 ['i would like two formaggios', 'formaggio'],
 ['formaggio, please', 'formaggio'],
 ['one formaggio, please', 'formaggio'],
 ['two formaggios, please', 'formaggio'],
 ['formaggio', 'formaggio'],
 ['one formaggio', 'formaggio'],
 ['two formaggios', 'formaggio'],
 ['please deliver me a formaggio', 'formaggio'],
 ['please deliver m

In [3]:
df = pd.DataFrame(examples, columns=["inputs", "class"])
df

Unnamed: 0,inputs,class
0,i would like margherita,margherita
1,i would like one margherita,margherita
2,i would like two margheritas,margherita
3,"margherita, please",margherita
4,"one margherita, please",margherita
...,...,...
112,two garlic breads,garlic bread
113,please deliver me a garlic bread,garlic bread
114,please deliver me two garlic breads,garlic bread
115,i want garlic bread to be delivered to me,garlic bread


In [4]:
le = LabelEncoder()
le.fit(df["class"])

LabelEncoder()

In [5]:
df["labels"] = le.transform(df["class"])
df

Unnamed: 0,inputs,class,labels
0,i would like margherita,margherita,5
1,i would like one margherita,margherita,5
2,i would like two margheritas,margherita,5
3,"margherita, please",margherita,5
4,"one margherita, please",margherita,5
...,...,...,...
112,two garlic breads,garlic bread,3
113,please deliver me a garlic bread,garlic bread,3
114,please deliver me two garlic breads,garlic bread,3
115,i want garlic bread to be delivered to me,garlic bread,3


In [6]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["inputs"])

In [7]:
X.shape

(117, 34)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, df["labels"], test_size=0.2, random_state=0)

In [9]:
classifier = MultinomialNB()
classifier.fit(X_train.toarray(), y_train)

MultinomialNB()

In [10]:
preds = []
for row in X_test:
    result = classifier.predict(row.toarray())
    preds.append(result)

preds

[array([5]),
 array([7]),
 array([0]),
 array([0]),
 array([5]),
 array([1]),
 array([6]),
 array([3]),
 array([2]),
 array([0]),
 array([0]),
 array([5]),
 array([4]),
 array([5]),
 array([3]),
 array([7]),
 array([0]),
 array([8]),
 array([7]),
 array([8]),
 array([2]),
 array([3]),
 array([2]),
 array([4])]

In [11]:
acc_score = accuracy_score(y_test, preds)
acc_score

0.9583333333333334

In [12]:
cm = confusion_matrix(y_test, preds)
cm

array([[5, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 3, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 3, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 2, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 3, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 3, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 2]])

In [13]:
model_path = "../models/foods_clf_mnb.joblib"
dump(classifier, model_path)

['../models/foods_clf_mnb.joblib']

In [14]:
probabilities = []

for _, row in df.iterrows():
    vector = vectorizer.transform([row["inputs"]])
    pred_proba = classifier.predict_proba(vector)
    probabilities.append(pred_proba)

probabilities

[array([[0.05608907, 0.08940464, 0.07728915, 0.0845776 , 0.10691859,
         0.31720051, 0.07283334, 0.10199777, 0.09368933]]),
 array([[0.06127445, 0.09360197, 0.09037996, 0.08221375, 0.10814539,
         0.2835236 , 0.07809196, 0.10274119, 0.10002772]]),
 array([[0.06758699, 0.09862697, 0.10009765, 0.09482441, 0.1314467 ,
         0.19362539, 0.08883224, 0.11972126, 0.1052384 ]]),
 array([[0.05736364, 0.08003017, 0.0680706 , 0.06890077, 0.08325913,
         0.41216455, 0.08059261, 0.07479955, 0.07481897]]),
 array([[0.06536756, 0.0883531 , 0.08545646, 0.07048655, 0.08952847,
         0.34535447, 0.08912158, 0.08065917, 0.08567264]]),
 array([[0.07510837, 0.09726956, 0.09932085, 0.0867016 , 0.11704737,
         0.22325919, 0.10569793, 0.10085671, 0.09473843]]),
 array([[0.05926568, 0.0725555 , 0.06471554, 0.06512804, 0.07405495,
         0.4517166 , 0.07292979, 0.06925131, 0.07038259]]),
 array([[0.06947346, 0.08423593, 0.08558643, 0.06896474, 0.08360482,
         0.3609592 , 0.08484

In [18]:
minimal_max_prob = 1
for row in probabilities:
    idx = np.argmax(row)
    max_prob = row[0][idx]
    if minimal_max_prob > max_prob:
        minimal_max_prob = max_prob

minimal_max_prob

0.128541414448388

In [20]:
min_probs = []
for row in probabilities:
    idx = np.argmax(row)
    max_prob = row[0][idx]
    if max_prob < 0.2:
        min_probs.append(max_prob)

min_probs

[0.19362538505934948,
 0.1971300291645623,
 0.128541414448388,
 0.15575935068146265,
 0.172723163105468,
 0.19859942312148132,
 0.18704484264129972,
 0.14180414228605298,
 0.1751932407748272,
 0.1991140676491382,
 0.16330510493369818]