In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
train_df = pd.read_json('../../data/train/trainText-75-25-bow.json', lines=True)

In [2]:
test_df = pd.read_json('../../data/test/testText-75-25-bow.json', lines=True)

In [3]:
train_df.head()

Unnamed: 0,cuisine,id,ingredients,label,numeric_recipe,recipe
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes...",0,romaine lettuce black olives grape tomatoes ga...,romaine lettuce black olives grape tomatoes ga...
1,greek,34471,"[ground pork, finely chopped fresh parsley, on...",0,ground pork finely chopped fresh parsley onion...,ground pork finely chopped fresh parsley onion...
2,greek,4635,"[minced garlic, dried oregano, red wine vinega...",0,minced garlic dried oregano red wine vinegar o...,minced garlic dried oregano red wine vinegar o...
3,greek,5980,"[orange, anise, cinnamon sticks, unflavored ge...",0,orange anise cinnamon sticks unflavored gelati...,orange anise cinnamon sticks unflavored gelati...
4,greek,18031,"[fresh dill, yoghurt, salt, myzithra, large eg...",0,fresh dill yoghurt salt myzithra large eggs ch...,fresh dill yoghurt salt myzithra large eggs ch...


In [4]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')
def stem(text):
    return ' '.join([stemmer.stem(word) for word in text.split(' ')])

In [5]:
file = open("stopwords.txt", "r")
stopwords = file.read().lower().split('\n')
def remove_stopwords(text):
    filtered = ""
    for word in text.split(' '):
        if not word in stopwords:
            filtered = filtered + ' ' + word
    return filtered

In [6]:
train_df['recipe'] = train_df['recipe'].apply(remove_stopwords)
train_df['recipe'] = train_df['recipe'].apply(stem)

In [7]:
train_df.head()

Unnamed: 0,cuisine,id,ingredients,label,numeric_recipe,recipe
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes...",0,romaine lettuce black olives grape tomatoes ga...,romain lettuc black oliv grape tomato garlic ...
1,greek,34471,"[ground pork, finely chopped fresh parsley, on...",0,ground pork finely chopped fresh parsley onion...,pork parsley onion salt vinegar caul fat
2,greek,4635,"[minced garlic, dried oregano, red wine vinega...",0,minced garlic dried oregano red wine vinegar o...,minc garlic dri oregano red wine vinegar oliv...
3,greek,5980,"[orange, anise, cinnamon sticks, unflavored ge...",0,orange anise cinnamon sticks unflavored gelati...,orang anis cinnamon stick unflavor gelatin zi...
4,greek,18031,"[fresh dill, yoghurt, salt, myzithra, large eg...",0,fresh dill yoghurt salt myzithra large eggs ch...,dill yoghurt salt myzithra egg chees feta che...


In [8]:
test_df['recipe'] = test_df['recipe'].apply(remove_stopwords)
test_df['recipe'] = test_df['recipe'].apply(stem)

In [9]:
recipes = list(train_df['recipe'])
labels = list(train_df['cuisine'])

In [10]:
X_test = list(test_df['recipe'])
y_test = list(test_df['cuisine'])

In [71]:
vectorizer = CountVectorizer(ngram_range=(1,2), min_df=0.0012, stop_words='english', binary=True)
vectorizer.fit(recipes)
print('Number of Features: %d'%len(vectorizer.get_feature_names()))

Number of Features: 2798


In [72]:
vectorizer.get_feature_names()

['activ',
 'activ dri',
 'adobo',
 'adobo sauc',
 'agav',
 'agav nectar',
 'alfredo',
 'alfredo sauc',
 'allspic',
 'almond',
 'almond butter',
 'almond extract',
 'almond flour',
 'almond meal',
 'almond milk',
 'amchur',
 'amino',
 'ancho',
 'ancho chile',
 'ancho powder',
 'anchovi',
 'anchovi fillet',
 'andouill',
 'andouill sausag',
 'angel',
 'angel hair',
 'anis',
 'appl',
 'appl cider',
 'apricot',
 'arbol',
 'arborio',
 'arborio rice',
 'arrowroot',
 'artichok',
 'artichok heart',
 'arugula',
 'asafetida',
 'asafoetida',
 'asiago',
 'asian',
 'asian fish',
 'asparagus',
 'avocado',
 'avocado cilantro',
 'avocado lime',
 'avocado tomato',
 'babi',
 'babi arugula',
 'babi bok',
 'babi carrot',
 'babi corn',
 'babi spinach',
 'bacon',
 'bacon onion',
 'bacon slice',
 'bag',
 'baguett',
 'bake',
 'balsam',
 'balsam vinegar',
 'bamboo',
 'bamboo shoot',
 'banana',
 'barbecu',
 'barbecu sauc',
 'base',
 'basil',
 'basil black',
 'basil dri',
 'basil garlic',
 'basil grate',
 'basil 

In [73]:
#import pickle
#filename = 'vectorizer.sav'
#pickle.dump(vectorizer, open(filename,'wb'))

In [74]:
X_train = vectorizer.transform(recipes).toarray()
y_train = labels

In [None]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(X_train, y_train)

In [None]:
from sklearn.pipeline import Pipeline
LogReg_model = Pipeline([('vectorizer', vectorizer),('LR', clf)])

In [None]:
from sklearn import metrics
y_pred = LogReg_model.predict(X_test)
print(metrics.classification_report(y_test, y_pred))

In [70]:
input_recipe = input("Input Recipe: \n> ")
input_recipe = stem(input_recipe.lower())
X_input = vectorizer.transform([input_recipe]).toarray()
prob = clf.predict_proba(X_input)[0]
classes = clf.classes_

print('\nIdentified Ingredients: \n> %s'%vectorizer.inverse_transform(X_input))

print('\nCuisine Probabilities:')
for i in range(len(prob)):
    print('> %s: %s%%'%(classes[i], int(prob[i]*100)))

Input Recipe: 
> Prepare and cook ingredients as below.  – In a bowl, combine the meat with the marinade. Cover the bowl and leave it in the fridge while you are working on other the ingredients. When ready, Add some cooking oil into a wok and cook the meat on medium high to high heat. It takes about 3 to 5 mins to thoroughly cook it. – Prepare the Korean cucumber side dish, fernbrake side dish and bellflower root side dish per the linked recipes. These can be prepared on a different day to save your bibimbap making time. – Separately sauté carrots, zucchini and mushrooms in a frying pan. Season them lightly with fine sea salt. – Mix the bibimbap sauce ingredients in a small bowl. Set aside. – Pan fry the eggs per your preference. (Sunny side up is a popular choice.)  Preheat an oven for 10 minutes at 220 C / 428 F. Once heated, place your stone bowl into the oven. Leave it there for 10 minutes. 10 minutes later, take it out. Drop in some sesame oil (about 1 Tbsp per medium size bowl) 

In [None]:
import pickle
filename = 'LogReg_v5.sav'
pickle.dump(LogReg_model, open(filename,'wb'))