In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
# Training split, stored as JSON Lines (one JSON record per line).
TRAIN_DATA_PATH = '../../data/train/trainText-75-25.json'
df = pd.read_json(TRAIN_DATA_PATH, lines=True)

In [2]:
# Tally how many times each normalized ingredient name appears across all
# rows of the 'ingredientsFiltered' column.
ingredients = {}
for ingredient_list in df['ingredientsFiltered']:
    for raw_name in ingredient_list:
        # Spaces become underscores; underscores left at either end are trimmed.
        token = raw_name.replace(' ', '_').strip('_')
        ingredients[token] = ingredients.get(token, 0) + 1

In [3]:
# Spot check: show the occurrence count recorded for one normalized ingredient key.
ingredients['black_olives']

774

In [4]:
# Ingredients seen fewer than MIN_FREQUENCY times are dropped from the vocabulary.
MIN_FREQUENCY = 10
filtered_ingredients = dict(
    (name, count)
    for name, count in ingredients.items()
    if count >= MIN_FREQUENCY
)

In [5]:
# Pull the model inputs ('recipe') and targets ('cuisine') out as plain lists.
recipes = df['recipe'].tolist()
labels = df['cuisine'].tolist()

In [6]:
# Learn the vocabulary from the frequency-filtered ingredient names only,
# so the feature space is fixed before any recipe is vectorized.
vectorizer = CountVectorizer()
vectorizer.fit(filtered_ingredients.keys())
# vocabulary_ maps every learned term to its column index, so its size is the
# feature count. It replaces get_feature_names(), which was removed in
# scikit-learn 1.2 and raises AttributeError there.
print('Number of Features: %d'%len(vectorizer.vocabulary_))

Number of Features: 3092


In [7]:
# List the learned terms in column-index order. Built from vocabulary_ because
# get_feature_names() was removed in scikit-learn 1.2; sorting the terms by
# their index reproduces the same Python list on every version.
print(sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get))

['____oz', '_alfredo_sauce', '_asiago', '_beaten', '_best_food_mayonnais', '_best_food_real_mayonnais', '_butter', '_capicola', '_chicken_breast', '_chicken_broth', '_chicken_tenderloins', '_chinese_black_vinegar_dark_soy_sauce_chicken_stock_peanut_oil_red_onion', '_chop', '_chunky_mild_salsa', '_classic_hummus', '_classico_olive_oil', '_cook_drain', '_cookies', '_crush', '_crushed_finely_grated_ginger_½_sichuan_peppercorns', '_crushed_long_dried_red_chillies', '_dash', '_diced', '_diced_tomatoes', '_drain', '_eggs', '_fiesta_sides_spanish_rice', '_finely', '_garlic_cloves', '_genoa_salami', '_green_chiles', '_half', '_half_lengthways', '_halved_thinly_lengthways_', '_italian_loaf', '_less', '_lightly_beaten', '_manchego', '_meats_fish', '_natural_sweetener', '_parmigiano', '_peel_devein', '_pie_filling', '_prosciutto', '_real_mayonnaise', '_refried_beans', '_refrigerated_crescent_dinner_rolls', '_rins_drain', '_salmon', '_sharp_cheddar', '_shrimp', '_slice', '_soften', '_soppressata',

In [8]:
# Encode every recipe against the fitted vocabulary, then densify the
# resulting count matrix for the classifier.
recipe_counts = vectorizer.transform(recipes)
X_train = recipe_counts.toarray()
Y_train = labels

In [9]:
from sklearn.naive_bayes import BernoulliNB
# fit() returns the estimator itself, so construction and training chain.
clf = BernoulliNB().fit(X_train, Y_train)
# Trailing expression keeps the fitted estimator's repr as the cell output.
clf

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [10]:
# Interactive demo: keep prompting for a recipe and print the vocabulary terms
# found in it plus the predicted cuisine, until the user types exactly 'quit'.
while True:
    input_recipe = input("What is your recipe? \n> ")
    if input_recipe == 'quit':
        # The sentinel is compared before lower-casing, so it is case-sensitive.
        break

    input_recipe = input_recipe.lower()
    X_input = vectorizer.transform([input_recipe]).toarray()
    prediction = clf.predict(X_input)[0]

    # inverse_transform reports which vocabulary terms were matched in the input.
    matched_terms = vectorizer.inverse_transform(X_input)
    print('\nIdentified Ingredients: \n> %s'%matched_terms)

    print('\nPredicted Cuisine Type: \n> %s'%prediction)
    print('\n====================================')

What is your recipe? 
> salmon

Identified Ingredients: 
> [array(['salmon'], dtype='<U72')]

Predicted Cuisine Type: 
> japanese

What is your recipe? 
> quit


In [11]:
from sklearn.pipeline import Pipeline
# Bundle the already-fitted vectorizer and classifier so raw recipe strings
# can be passed straight to predict().
NB_model = Pipeline(steps=[
    ('vectorizer', vectorizer),
    ('NB', clf),
])

In [14]:
import pickle
filename = 'NB_model_v2.sav'
# Persist the fitted pipeline. The context manager guarantees the file is
# flushed and closed even if pickling fails; the previous bare open() call
# left the handle open for the lifetime of the kernel.
with open(filename, 'wb') as model_file:
    pickle.dump(NB_model, model_file)

In [17]:
# Smoke check: the pipeline accepts raw strings and returns one label per input.
NB_model.predict(['hi', 'hi'])

array(['italian', 'italian'], dtype='<U12')