In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
train_df = pd.read_json('../../data/train/trainText-75-25-bow.json', lines=True)

In [2]:
test_df = pd.read_json('../../data/test/testText-75-25-bow.json', lines=True)

In [3]:
train_df.head()

Unnamed: 0,cuisine,id,ingredients,label,numeric_recipe,recipe
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes...",0,romaine lettuce black olives grape tomatoes ga...,romaine lettuce black olives grape tomatoes ga...
1,greek,34471,"[ground pork, finely chopped fresh parsley, on...",0,ground pork finely chopped fresh parsley onion...,ground pork finely chopped fresh parsley onion...
2,greek,4635,"[minced garlic, dried oregano, red wine vinega...",0,minced garlic dried oregano red wine vinegar o...,minced garlic dried oregano red wine vinegar o...
3,greek,5980,"[orange, anise, cinnamon sticks, unflavored ge...",0,orange anise cinnamon sticks unflavored gelati...,orange anise cinnamon sticks unflavored gelati...
4,greek,18031,"[fresh dill, yoghurt, salt, myzithra, large eg...",0,fresh dill yoghurt salt myzithra large eggs ch...,fresh dill yoghurt salt myzithra large eggs ch...


In [4]:
from nltk.stem import WordNetLemmatizer 
  
lemmatizer = WordNetLemmatizer() 
  
def lemmatize(text):
    return ' '.join([lemmatizer.lemmatize(word) for word in text.split(' ')])

In [5]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')
def stem(text):
    return ' '.join([stemmer.stem(word) for word in text.split(' ')])

In [6]:
file = open("stopwords.txt", "r")
stopwords = file.read().lower().split('\n')
def remove_stopwords(text):
    filtered = ""
    for word in text.split(' '):
        if not word in stopwords:
            filtered = filtered + ' ' + word
    return filtered

FileNotFoundError: [Errno 2] No such file or directory: 'stopwords.txt'

In [None]:
train_df['recipe'] = train_df['recipe'].apply(remove_stopwords)
train_df['recipe'] = train_df['recipe'].apply(lemmatize)

In [None]:
train_df.head()

In [25]:
test_df['recipe'] = test_df['recipe'].apply(remove_stopwords)
test_df['recipe'] = test_df['recipe'].apply(lemmatize)

In [26]:
recipes = list(train_df['recipe'])
labels = list(train_df['cuisine'])

In [27]:
X_test = list(test_df['recipe'])
y_test = list(test_df['cuisine'])

In [46]:
vectorizer = CountVectorizer(ngram_range=(1,2), min_df=0.0012, stop_words='english', binary=True)
vectorizer.fit(recipes)
print('Number of Features: %d'%len(vectorizer.get_feature_names()))

Number of Features: 2828


In [47]:
vectorizer.get_feature_names()

['active',
 'active dry',
 'adobo',
 'adobo sauce',
 'agave',
 'agave nectar',
 'alfredo',
 'alfredo sauce',
 'allspice',
 'almond',
 'almond butter',
 'almond extract',
 'almond flour',
 'almond meal',
 'almond milk',
 'amchur',
 'amino',
 'ancho',
 'ancho chile',
 'ancho powder',
 'anchovy',
 'anchovy fillet',
 'andouille',
 'andouille sausage',
 'angel',
 'angel hair',
 'anise',
 'apple',
 'apple cider',
 'apricot',
 'arbol',
 'arborio',
 'arborio rice',
 'arrowroot',
 'artichoke',
 'artichoke heart',
 'arugula',
 'asafetida',
 'asafoetida',
 'asiago',
 'asian',
 'asian fish',
 'asparagus',
 'avocado',
 'avocado cilantro',
 'avocado lime',
 'avocado tomato',
 'baby',
 'baby arugula',
 'baby bok',
 'baby carrot',
 'baby corn',
 'baby spinach',
 'bacon',
 'bacon onion',
 'bacon slice',
 'bag',
 'baguette',
 'baked',
 'balsamic',
 'balsamic vinegar',
 'bamboo',
 'bamboo shoot',
 'banana',
 'barbecue',
 'barbecue sauce',
 'base',
 'basil',
 'basil black',
 'basil dried',
 'basil garlic'

In [48]:
#import pickle
#filename = 'vectorizer.sav'
#pickle.dump(vectorizer, open(filename,'wb'))

In [49]:
X_train = vectorizer.transform(recipes).toarray()
y_train = labels

In [50]:
from sklearn.tree import LogisticRegression
clf = LogisticRegression()
clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [51]:
from sklearn.pipeline import Pipeline
LogReg_model = Pipeline([('vectorizer', vectorizer),('LR', clf)])

In [52]:
from sklearn import metrics
y_pred = LogReg_model.predict(X_test)
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

   brazilian       0.67      0.41      0.51       117
     british       0.63      0.26      0.37       201
cajun_creole       0.81      0.63      0.71       387
     chinese       0.83      0.93      0.88      3294
    filipino       0.71      0.38      0.49       189
      french       0.67      0.48      0.56      2205
       greek       0.91      0.81      0.86      1544
      indian       0.92      0.95      0.94      2119
       irish       0.64      0.38      0.47       167
     italian       0.73      0.99      0.84      4310
    jamaican       0.81      0.57      0.67       132
    japanese       0.88      0.81      0.84      1557
      korean       0.82      0.67      0.74       208
     mexican       0.90      0.94      0.92      3600
    moroccan       0.85      0.63      0.72       206
     russian       0.65      0.28      0.40       123
 southern_us       0.78      0.70      0.74      1080
     spanish       0.86    

In [38]:
input_recipe = input("Input Recipe: \n> ")
input_recipe = stem(input_recipe.lower())
X_input = vectorizer.transform([input_recipe]).toarray()
prob = clf.predict_proba(X_input)[0]
classes = clf.classes_

print('\nIdentified Ingredients: \n> %s'%vectorizer.inverse_transform(X_input))

print('\nCuisine Probabilities:')
for i in range(len(prob)):
    print('> %s: %s%%'%(classes[i], int(prob[i]*100)))

Input Recipe: 
> Step 1     Season the chicken with salt and pepper. In a large, deep skillet, heat 2 tablespoons of the olive oil. Add the chicken and cook over moderately high heat until browned, about 4 minutes per side. Transfer the chicken to a plate and pour off the oil.  Step 2     Add the remaining 2 tablespoons of olive oil to the skillet and add the onion, carrots, celery and garlic. Cook over moderate heat, stirring occasionally, until starting to brown, about 8 minutes. Add the sugar, wine, vinegar, orange juice, capers and almonds and bring to a boil. Return the chicken to the skillet, skin side up. Cover partially and simmer over low heat until the chicken is cooked through, about 35 minutes.  Step 3     Transfer the chicken to a plate. Boil the pan sauce over high heat until thickened, about 3 minutes. Season the sauce with salt and pepper. Return the chicken to the skillet until warmed through. Transfer to a plate, spoon the sauce on top and serve.

Identified Ingredien

In [None]:
import pickle
filename = 'LogReg_v5.sav'
pickle.dump(LogReg_model, open(filename,'wb'))