In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
# Training split, stored as JSON Lines (one record per line).
train_df = pd.read_json(
    '../../data/train/trainText-75-25-bow.json',
    lines=True,
)

In [2]:
# Held-out test split, stored as JSON Lines (one record per line).
test_df = pd.read_json(
    '../../data/test/testText-75-25-bow.json',
    lines=True,
)

In [3]:
# Preview the first five rows of the training frame as loaded from disk.
train_df.head(n=5)

Unnamed: 0,cuisine,id,ingredients,label,numeric_recipe,recipe
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes...",0,romaine lettuce black olives grape tomatoes ga...,romaine lettuce black olives grape tomatoes ga...
1,greek,34471,"[ground pork, finely chopped fresh parsley, on...",0,ground pork finely chopped fresh parsley onion...,ground pork finely chopped fresh parsley onion...
2,greek,4635,"[minced garlic, dried oregano, red wine vinega...",0,minced garlic dried oregano red wine vinegar o...,minced garlic dried oregano red wine vinegar o...
3,greek,5980,"[orange, anise, cinnamon sticks, unflavored ge...",0,orange anise cinnamon sticks unflavored gelati...,orange anise cinnamon sticks unflavored gelati...
4,greek,18031,"[fresh dill, yoghurt, salt, myzithra, large eg...",0,fresh dill yoghurt salt myzithra large eggs ch...,fresh dill yoghurt salt myzithra large eggs ch...


In [4]:
from nltk.stem.snowball import SnowballStemmer
# Shared English Snowball stemmer, reused by every call to stem().
stemmer = SnowballStemmer('english')


def stem(text):
    """Return ``text`` with every space-separated word replaced by its stem.

    The text is split on single spaces, each piece is passed through the
    module-level ``stemmer``, and the results are re-joined with single
    spaces.
    """
    return ' '.join(map(stemmer.stem, text.split(' ')))

In [5]:
# Load the custom stopword list: the file is lower-cased and split on
# newlines, so each line becomes one entry (a trailing newline in the file
# yields an empty-string entry).
# The context manager closes the handle as soon as the contents are read,
# instead of leaving it open for the lifetime of the kernel.
with open("stopwords.txt", "r") as stopwords_file:
    stopwords = stopwords_file.read().lower().split('\n')
def remove_stopwords(text):
    """Return ``text`` with every word found in ``stopwords`` removed.

    Parameters
    ----------
    text : str
        Space-separated words. Matching against ``stopwords`` is exact, so
        the caller is responsible for lower-casing the text if the stopword
        list is lower case.

    Returns
    -------
    str
        The surviving words joined by single spaces, in their original
        order and with no leading separator. Returns an empty string when
        nothing survives.

    Notes
    -----
    Reads the module-level ``stopwords`` collection at call time.
    """
    # str.join builds the result in one pass and avoids the stray leading
    # space that repeated ``filtered + ' ' + word`` concatenation produces.
    return ' '.join(word for word in text.split(' ') if word not in stopwords)

In [6]:
# Clean the training recipes: drop stopwords first, then stem what is left.
train_df['recipe'] = (
    train_df['recipe']
    .apply(remove_stopwords)
    .apply(stem)
)

In [7]:
# Preview the first five training rows after stopword removal and stemming.
train_df.head(n=5)

Unnamed: 0,cuisine,id,ingredients,label,numeric_recipe,recipe
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes...",0,romaine lettuce black olives grape tomatoes ga...,romain lettuc black oliv grape tomato garlic ...
1,greek,34471,"[ground pork, finely chopped fresh parsley, on...",0,ground pork finely chopped fresh parsley onion...,pork parsley onion salt vinegar caul fat
2,greek,4635,"[minced garlic, dried oregano, red wine vinega...",0,minced garlic dried oregano red wine vinegar o...,minc garlic dri oregano red wine vinegar oliv...
3,greek,5980,"[orange, anise, cinnamon sticks, unflavored ge...",0,orange anise cinnamon sticks unflavored gelati...,orang anis cinnamon stick unflavor gelatin zi...
4,greek,18031,"[fresh dill, yoghurt, salt, myzithra, large eg...",0,fresh dill yoghurt salt myzithra large eggs ch...,dill yoghurt salt myzithra egg chees feta che...


In [8]:
# Clean the test recipes with the same two steps applied to the training
# recipes: stopword removal followed by stemming.
test_df['recipe'] = (
    test_df['recipe']
    .apply(remove_stopwords)
    .apply(stem)
)

In [9]:
# Plain Python lists of the training texts and their cuisine labels.
recipes = train_df['recipe'].tolist()
labels = train_df['cuisine'].tolist()

In [10]:
# Plain Python lists of the test texts and their cuisine labels.
X_test = test_df['recipe'].tolist()
y_test = test_df['cuisine'].tolist()

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
# Binary bag-of-words over unigrams and bigrams; min_df drops terms whose
# document frequency falls below the given proportion of recipes.
vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    min_df=0.0012,
    stop_words='english',
    binary=True,
)
vectorizer.fit(recipes)

# Report the size of the learned vocabulary.
feature_count = len(vectorizer.get_feature_names())
print('Number of Features: %d' % feature_count)

Number of Features: 2798


In [12]:
# Display the entire learned vocabulary (unigrams and bigrams); the
# resulting list is long.
vectorizer.get_feature_names()

[u'activ',
 u'activ dri',
 u'adobo',
 u'adobo sauc',
 u'agav',
 u'agav nectar',
 u'alfredo',
 u'alfredo sauc',
 u'allspic',
 u'almond',
 u'almond butter',
 u'almond extract',
 u'almond flour',
 u'almond meal',
 u'almond milk',
 u'amchur',
 u'amino',
 u'ancho',
 u'ancho chile',
 u'ancho powder',
 u'anchovi',
 u'anchovi fillet',
 u'andouill',
 u'andouill sausag',
 u'angel',
 u'angel hair',
 u'anis',
 u'appl',
 u'appl cider',
 u'apricot',
 u'arbol',
 u'arborio',
 u'arborio rice',
 u'arrowroot',
 u'artichok',
 u'artichok heart',
 u'arugula',
 u'asafetida',
 u'asafoetida',
 u'asiago',
 u'asian',
 u'asian fish',
 u'asparagus',
 u'avocado',
 u'avocado cilantro',
 u'avocado lime',
 u'avocado tomato',
 u'babi',
 u'babi arugula',
 u'babi bok',
 u'babi carrot',
 u'babi corn',
 u'babi spinach',
 u'bacon',
 u'bacon onion',
 u'bacon slice',
 u'bag',
 u'baguett',
 u'bake',
 u'balsam',
 u'balsam vinegar',
 u'bamboo',
 u'bamboo shoot',
 u'banana',
 u'barbecu',
 u'barbecu sauc',
 u'base',
 u'basil',
 u'

In [13]:
#import pickle
#filename = 'vectorizer.sav'
#pickle.dump(vectorizer, open(filename,'wb'))

In [14]:
# Training targets are the cuisine labels collected earlier.
y_train = labels
# Encode the training recipes with the fitted vocabulary and convert the
# result to a dense array.
X_train = vectorizer.transform(recipes).toarray()

In [15]:
from sklearn.linear_model import LogisticRegression
# Logistic-regression classifier using the library's default hyperparameters.
clf = LogisticRegression()
# Kept as the final expression so the notebook displays the fitted estimator.
clf.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [16]:
from sklearn.pipeline import Pipeline
# Bundle the already-fitted vectorizer and classifier so that raw
# (preprocessed) recipe strings can be passed straight to predict().
LogReg_model = Pipeline(
    steps=[
        ('vectorizer', vectorizer),
        ('LR', clf),
    ]
)

In [17]:
from sklearn import metrics
# Score the pipeline on the held-out recipes and print per-cuisine metrics.
y_pred = LogReg_model.predict(X_test)
report = metrics.classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

   brazilian       0.65      0.42      0.51       117
     british       0.62      0.28      0.39       201
cajun_creole       0.81      0.63      0.71       387
     chinese       0.83      0.93      0.88      3294
    filipino       0.73      0.37      0.49       189
      french       0.67      0.48      0.56      2205
       greek       0.91      0.81      0.86      1544
      indian       0.92      0.95      0.94      2119
       irish       0.64      0.38      0.48       167
     italian       0.73      0.99      0.84      4310
    jamaican       0.79      0.55      0.65       132
    japanese       0.88      0.80      0.84      1557
      korean       0.82      0.67      0.74       208
     mexican       0.90      0.94      0.92      3600
    moroccan       0.86      0.64      0.73       206
     russian       0.65      0.28      0.40       123
 southern_us       0.77      0.70      0.73      1080
     spanish       0.84    

In [18]:
# Interactive demo: classify a single recipe typed in by the user.
input_recipe = raw_input("Input Recipe: \n> ")
# Apply the same preprocessing used on the training and test recipes
# (stopword removal, then stemming). Leaving stopwords in the input can
# split ingredient pairs into bigrams that the fitted vocabulary never saw.
input_recipe = stem(remove_stopwords(input_recipe.lower()))
X_input = vectorizer.transform([input_recipe]).toarray()
prob = clf.predict_proba(X_input)[0]  # probabilities for the single input row
classes = clf.classes_  # class labels, paired positionally with prob below

print('\nIdentified Ingredients: \n> %s'%vectorizer.inverse_transform(X_input))

print('\nCuisine Probabilities:')
for cuisine, p in zip(classes, prob):
    # int() truncates, so the printed percentages are rounded down.
    print('> %s: %s%%'%(cuisine, int(p*100)))

Input Recipe: 
> salmon

Identified Ingredients: 
> [array([u'salmon'], dtype='<U20')]

Cuisine Probabilities:
> brazilian: 0%
> british: 2%
> cajun_creole: 0%
> chinese: 3%
> filipino: 1%
> french: 17%
> greek: 0%
> indian: 5%
> irish: 2%
> italian: 31%
> jamaican: 0%
> japanese: 14%
> korean: 0%
> mexican: 3%
> moroccan: 0%
> russian: 1%
> southern_us: 2%
> spanish: 3%
> thai: 4%
> vietnamese: 0%


In [None]:
import pickle
# Persist the vectorizer + classifier pipeline to disk.
filename = 'LogReg_v5.sav'
# The context manager flushes and closes the file once the dump finishes
# (or fails), instead of relying on garbage collection to close it.
with open(filename, 'wb') as model_file:
    pickle.dump(LogReg_model, model_file)