In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
df = pd.read_json('../../data/train/trainText-75-25-bow.json', lines=True)

In [2]:
df.head()

Unnamed: 0,cuisine,id,ingredients,label,numeric_recipe,recipe
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes...",0,romaine lettuce black olives grape tomatoes ga...,romaine lettuce black olives grape tomatoes ga...
1,greek,34471,"[ground pork, finely chopped fresh parsley, on...",0,ground pork finely chopped fresh parsley onion...,ground pork finely chopped fresh parsley onion...
2,greek,4635,"[minced garlic, dried oregano, red wine vinega...",0,minced garlic dried oregano red wine vinegar o...,minced garlic dried oregano red wine vinegar o...
3,greek,5980,"[orange, anise, cinnamon sticks, unflavored ge...",0,orange anise cinnamon sticks unflavored gelati...,orange anise cinnamon sticks unflavored gelati...
4,greek,18031,"[fresh dill, yoghurt, salt, myzithra, large eg...",0,fresh dill yoghurt salt myzithra large eggs ch...,fresh dill yoghurt salt myzithra large eggs ch...


In [3]:
recipes = list(df['recipe'])
labels = list(df['cuisine'])

In [9]:
units = ['cup','cups','lb','oz','tablespoon','tablespoons', 'teaspoon', 'teaspoons', 'clove', 'cloves', 'small', 'large']
adjs = ['range', 'extra', 'corned', 'cooked', 'steamed', 'toasted', 'unseasoned','waxy','smoked','skim', 'shredded','seasoned', 'processed', 'peeled', 'organic', 'minced', 'chopped', 'peeled', 'drained', 'cut', 'ground', 'light', 'medium', 'melted', 'firm', 'neutral','lean', 'skinless', 'sliced', 'free', 'fine', 'granulated', 'packed', 'firmly', 'fresh', 'freshly']
stopwords = units + adjs + ['style', 'and', 'such', 'as', 'or', 'not', 'into', 'other', 'in', 'to']

In [10]:
vectorizer = CountVectorizer(min_df=0.0001, stop_words = stopwords)
vectorizer.fit(recipes)
print('Number of Features: %d'%len(vectorizer.get_feature_names()))

Number of Features: 2058


In [11]:
print(vectorizer.get_feature_names())

['about', 'above', 'abura', 'achiote', 'acid', 'ackee', 'acorn', 'acting', 'active', 'added', 'adobo', 'adzuki', 'agar', 'agave', 'age', 'aged', 'ahi', 'aioli', 'ajwain', 'aka', 'albacore', 'ale', 'aleppo', 'alfalfa', 'alfredo', 'all', 'allspice', 'almond', 'almondmilk', 'almonds', 'also', 'amaretti', 'amaretto', 'amchur', 'american', 'aminos', 'an', 'anaheim', 'ancho', 'anchovies', 'anchovy', 'andouille', 'anejo', 'angel', 'anglaise', 'angostura', 'anise', 'anjou', 'annatto', 'any', 'aonori', 'apple', 'apples', 'applesauce', 'applewood', 'apricot', 'apricots', 'arbol', 'arborio', 'armagnac', 'arrowroot', 'artichok', 'artichoke', 'artichokes', 'artisan', 'arugula', 'asadero', 'asafetida', 'asafoetida', 'asiago', 'asian', 'asin', 'asparagus', 'assorted', 'at', 'atar', 'atta', 'au', 'aubergine', 'avocado', 'avocados', 'açai', 'babi', 'baby', 'back', 'bacon', 'bagels', 'bags', 'baguette', 'baileys', 'baked', 'baking', 'balance', 'balls', 'balsamic', 'balsamico', 'bamboo', 'banana', 'banan

In [12]:
X_train = vectorizer.transform(recipes).toarray()
Y_train = labels

In [13]:
from sklearn.naive_bayes import BernoulliNB
clf = BernoulliNB()
clf.fit(X_train, Y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [22]:
input_recipe = input("What is your recipe? \n> ")
while input_recipe != 'quit':
    input_recipe = input_recipe.lower()
    X_input = vectorizer.transform([input_recipe]).toarray()
    prediction = clf.predict(X_input)[0]
    
    print('\nIdentified Ingredients: \n> %s'%vectorizer.inverse_transform(X_input))
    
    print('\nPredicted Cuisine Type: \n> %s'%prediction)
    print('\n====================================')
    
    input_recipe = input("What is your recipe? \n> ")

What is your recipe? 
> white mushrooms

Identified Ingredients: 
> [array(['mushrooms', 'white'], dtype='<U14')]

Predicted Cuisine Type: 
> french

What is your recipe? 
> caviar

Identified Ingredients: 
> [array(['caviar'], dtype='<U14')]

Predicted Cuisine Type: 
> french

What is your recipe? 
> chicken wings

Identified Ingredients: 
> [array(['chicken', 'wings'], dtype='<U14')]

Predicted Cuisine Type: 
> japanese

What is your recipe? 
> soy sauce

Identified Ingredients: 
> [array(['sauce', 'soy'], dtype='<U14')]

Predicted Cuisine Type: 
> japanese

What is your recipe? 
> soy

Identified Ingredients: 
> [array(['soy'], dtype='<U14')]

Predicted Cuisine Type: 
> japanese

What is your recipe? 
> sauce

Identified Ingredients: 
> [array(['sauce'], dtype='<U14')]

Predicted Cuisine Type: 
> japanese

What is your recipe? 
> tomato sauce

Identified Ingredients: 
> [array(['sauce', 'tomato'], dtype='<U14')]

Predicted Cuisine Type: 
> japanese

What is your recipe? 
> tomato

I

In [15]:
from sklearn.pipeline import Pipeline
NB_model = Pipeline([('vectorizer', vectorizer),('NB', clf)])

In [16]:
import pickle
filename = 'NB_model_bow_v1.sav'
pickle.dump(NB_model, open(filename,'wb'))

In [21]:
NB_model.predict(list(['salmon rice', 'cheese pepperoni mushrooms']))

array(['japanese', 'italian'], dtype='<U12')