In [1]:
import json
import pandas as pd
from collections import Counter 
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.metrics import classification_report as report 
from sklearn.model_selection import train_test_split as split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Load the labelled training recipes into a DataFrame.
with open("train.json", "rb") as t:
    train_json = json.load(t)
raw_train = pd.DataFrame.from_dict(train_json)
# Hold out part of the labelled data so we can test how well our models
# generalize to new data. A fixed seed keeps the split -- and every score
# reported further down -- reproducible across kernel restarts.
train, test = split(raw_train, random_state=42)
# Last expression in the cell, so the preview is actually rendered.
raw_train.head()

The simplest text representation is a "bag of words" model where the order of words in a document is irrelevant. While this doesn't work for all documents, it is a good starting point for most NLP projects. In this project in particular, it is a good assumption as the ingredients are nouns and word dependencies (e.g., parts of speech) do not matter. 

For this first attempt, we'll generate a simple term document matrix where each term is a column in the matrix and each row is a recipe. 

In [3]:
def fit_count_vectorizer(train, test):
    """Build raw-count bag-of-words matrices for the train and test recipes.

    The vectorizer is given an identity tokenizer and ``lowercase=False``, so
    each element of a recipe's ``"ingredients"`` value is used as one term
    exactly as written. The vocabulary is learned from ``train`` only and
    ``test`` is encoded against that same vocabulary.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with an ``"ingredients"`` column and a ``"cuisine"`` column.

    Returns
    -------
    tuple
        ``((train_matrix, train["cuisine"]), (test_matrix, test["cuisine"]))``.
    """
    count_vectorizer = CountVectorizer(tokenizer=lambda doc: doc,
                                       lowercase=False)
    # fit_transform learns the vocabulary and encodes the training recipes in
    # a single pass instead of walking the training data twice.
    train_matrix = count_vectorizer.fit_transform(train["ingredients"])
    test_matrix = count_vectorizer.transform(test["ingredients"])
    return (train_matrix, train["cuisine"]), (test_matrix, test["cuisine"])

# Baseline: raw ingredient counts fed to a default logistic regression,
# scored per cuisine on the held-out split.
train_count, test_count = fit_count_vectorizer(train, test)
lr_model = LogisticRegression()
lr_model.fit(train_count[0], train_count[1])
lr_predictions = lr_model.predict(test_count[0])
lr_report = report(test_count[1], lr_predictions)
print(lr_report)

              precision    recall  f1-score   support

   brazilian       0.74      0.54      0.62       115
     british       0.59      0.38      0.46       194
cajun_creole       0.81      0.69      0.74       371
     chinese       0.80      0.82      0.81       677
    filipino       0.69      0.57      0.63       174
      french       0.62      0.61      0.61       657
       greek       0.82      0.67      0.74       295
      indian       0.84      0.89      0.86       780
       irish       0.71      0.44      0.54       192
     italian       0.77      0.89      0.83      1928
    jamaican       0.79      0.60      0.68       129
    japanese       0.80      0.68      0.73       373
      korean       0.88      0.74      0.80       223
     mexican       0.89      0.91      0.90      1625
    moroccan       0.80      0.77      0.78       192
     russian       0.65      0.37      0.47       116
 southern_us       0.66      0.82      0.73      1063
     spanish       0.66    

The count vectorizer does not control for the frequency of common words. In longer documents or ones that resemble natural language more, it is likely that some words (e.g., "I", "am", "be", "was") will be repeated often. 

One way to control for this and to identify key words in a text (locally frequent, globally infrequent) is Term Frequency, Inverse Document Frequency (TF-IDF) weighting. 

Term frequency is how often a word appears in a particular document. For example, "The red dog jumped over the red wagon" would yield a TF score of: 

{'The': 1,
'dog': 1,
'jumped': 1,
'over': 1,
'red': 2,
'the': 1,
'wagon': 1}

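Inverse Document Frequency (IDF) is the log of the number of documents in the corpus divided by the number of documents in which a particular term appears. (The snippet below adds 1 to the denominator so that a term that appears in no document does not cause a division by zero.)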

In python this would read as:
def idf(word, corpus):
    return math.log(len(corpus) / (1 + sum([1 for _ in corpus if word in _ ])))
    
The final calculation is just tf * idf 
You can read more up on TF-IDF on the relevant Wikipedia Article: https://en.wikipedia.org/wiki/Tf%E2%80%93idf

Sklearn implements a tf-idf vectorizer similar to the CountVectorizer where each cell is weighted by tf-idf rather than just the counts. 

In [4]:
def fit_tfidf_vectorizer(train, test):
    """Build tf-idf weighted bag-of-words matrices for the train and test recipes.

    Uses the same identity tokenizer and ``lowercase=False`` setup as the
    count vectorizer, so each element of a recipe's ``"ingredients"`` value is
    one term. The vocabulary and the idf weights are learned from ``train``
    only; ``test`` is encoded with those fitted values.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with an ``"ingredients"`` column and a ``"cuisine"`` column.

    Returns
    -------
    tuple
        ``((train_matrix, train["cuisine"]), (test_matrix, test["cuisine"]))``.
    """
    tfidf_vectorizer = TfidfVectorizer(tokenizer=lambda doc: doc,
                                       lowercase=False)
    # fit_transform fits and encodes the training recipes in a single pass
    # instead of walking the training data twice.
    train_matrix = tfidf_vectorizer.fit_transform(train["ingredients"])
    test_matrix = tfidf_vectorizer.transform(test["ingredients"])
    return (train_matrix, train["cuisine"]), (test_matrix, test["cuisine"])

# Same default logistic regression, now trained on tf-idf weighted features.
train_tfidf, test_tfidf = fit_tfidf_vectorizer(train, test)
lr_tfidf = LogisticRegression()
lr_tfidf.fit(train_tfidf[0], train_tfidf[1])
lr_predictions_tfidf = lr_tfidf.predict(test_tfidf[0])
lr_report_tfidf = report(test_tfidf[1], lr_predictions_tfidf)
print(lr_report_tfidf)

              precision    recall  f1-score   support

   brazilian       0.81      0.37      0.51       115
     british       0.60      0.29      0.39       194
cajun_creole       0.81      0.65      0.72       371
     chinese       0.79      0.86      0.82       677
    filipino       0.76      0.53      0.62       174
      french       0.57      0.58      0.58       657
       greek       0.86      0.61      0.72       295
      indian       0.83      0.90      0.87       780
       irish       0.72      0.31      0.43       192
     italian       0.73      0.90      0.81      1928
    jamaican       0.84      0.53      0.65       129
    japanese       0.85      0.64      0.73       373
      korean       0.89      0.67      0.77       223
     mexican       0.87      0.91      0.89      1625
    moroccan       0.85      0.72      0.78       192
     russian       0.69      0.27      0.39       116
 southern_us       0.62      0.81      0.70      1063
     spanish       0.69    

It looks like the TF-IDF weighting did not improve our model predictions. This makes sense given that our data set consists of individual terms (ingredients) that are not repeated within a recipe. This means the TF of every ingredient present in a document (recipe) is always 1, so only the IDF part of the weighting varies.

In longer documents, TF-IDF is a good starting point. In particular, if your analysis involves calculating the similarity between documents, TF-IDF is an important preprocessing step, because otherwise common words will swamp the similarity between two different documents.

In addition to TF-IDF weighting, there are other preprocessing steps that are common in text mining and could be beneficial with this dataset. In particular, string lemmatization may reduce some of the dimensionality in the dataset. Lemmatization reduces words to their common root e.g., "am", "are", "is" all map to "be."

Lemmatization is similar to stemming in terms of reducing words to their common root. It has the advantage of being more readable than word stemming but may be slower as a good lemmatizer will rely on Part of Speech tagging. In this dataset, we can treat all of the ingredients as nouns and avoid any part of speech tagging. 

In [5]:
def prep_lemmatizer(recipes):
    """Lemmatize every word of every ingredient, keeping the recipe grouping.

    Each ingredient string is split on whitespace, each word is passed through
    ``WordNetLemmatizer.lemmatize`` (called without a part-of-speech
    argument), and the words are re-joined with single spaces.

    Parameters
    ----------
    recipes : iterable of iterables of str
        One inner iterable of ingredient strings per recipe.

    Returns
    -------
    list of list of str
        One list per recipe and one lemmatized string per ingredient, in the
        original order.
    """
    lemmatizer = WordNetLemmatizer()

    def lemmatize_ingredient(ingredient):
        # Multi-word ingredients are lemmatized word by word.
        return " ".join(lemmatizer.lemmatize(word) for word in ingredient.split())

    return [
        [lemmatize_ingredient(ingredient) for ingredient in recipe]
        for recipe in recipes
    ]

The function above reduces the individual words to their lemmas and then joins them back into the ingredient string they came from. For example, the recipe list: 

[['pepper', 'hot sauce', 'scallions', 'fresh parsley', 'green bell pepper', 'salt', 'wild rice', 'bay leaf', 'chicken', 'celery ribs', 'chopped fresh thyme', 'sauce', 'garlic cloves', 'onions', 'chicken stock', 'ground pork', 'butter oil', 'red bell pepper', 'long grain white rice'], 

['romaine lettuce', 'red wine vinegar', 'lemon juice', 'tomatoes', 'feta cheese', 'salt', 'gaeta olives', 'extra-virgin olive oil', 'oregano', 'mint', 'kirby cucumbers', 'freshly ground pepper'], 

['ground black pepper', 'italian eggplant', 'provolone cheese', 'marinara sauce', 'garlic', 'fresh basil leaves', 'herbs', 'extra-virgin olive oil', 'nonstick spray', 'coarse salt', 'parmagiano reggiano']]

Yields: 
[['pepper', 'hot sauce', 'scallion', 'fresh parsley', 'green bell pepper', 'salt', 'wild rice', 'bay leaf', 'chicken', 'celery rib', 'chopped fresh thyme', 'sauce', 'garlic clove', 'onion', 'chicken stock', 'ground pork', 'butter oil', 'red bell pepper', 'long grain white rice'], 

['romaine lettuce', 'red wine vinegar', 'lemon juice', 'tomato', 'feta cheese', 'salt', 'gaeta olive', 'extra-virgin olive oil', 'oregano', 'mint', 'kirby cucumber', 'freshly ground pepper'], 

['ground black pepper', 'italian eggplant', 'provolone cheese', 'marinara sauce', 'garlic', 'fresh basil leaf', 'herb', 'extra-virgin olive oil', 'nonstick spray', 'coarse salt', 'parmagiano reggiano']]

Notice that plural words (e.g., "scallions", "cucumbers") are reduced to their singular root ("scallion", "cucumber").

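We can run the function on our dataset. Note that it'll run slowly: the work is only linear in the total number of words, but every single word goes through a lemmatizer call inside three levels of nested Python iteration (recipes, ingredients, words).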

In [6]:
def fit_lemma_vectorizer(train, test):
    """Build raw-count matrices from lemmatized ingredients.

    Both frames' ``"ingredients"`` columns are first passed through
    ``prep_lemmatizer``. The vectorizer uses an identity tokenizer and
    ``lowercase=False``, so each lemmatized ingredient string is one term.
    The vocabulary is learned from the lemmatized ``train`` recipes only and
    ``test`` is encoded against that same vocabulary.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with an ``"ingredients"`` column and a ``"cuisine"`` column.

    Returns
    -------
    tuple
        ``((train_matrix, train["cuisine"]), (test_matrix, test["cuisine"]))``.
    """
    count_vectorizer = CountVectorizer(tokenizer=lambda doc: doc,
                                       lowercase=False)
    train_lemmas = prep_lemmatizer(train["ingredients"])
    test_lemmas = prep_lemmatizer(test["ingredients"])
    # fit_transform learns the vocabulary and encodes the training recipes in
    # a single pass instead of walking the training data twice.
    train_matrix = count_vectorizer.fit_transform(train_lemmas)
    test_matrix = count_vectorizer.transform(test_lemmas)
    return (train_matrix, train["cuisine"]), (test_matrix, test["cuisine"])

# Same default logistic regression, now trained on lemmatized ingredient counts.
train_lemma, test_lemma = fit_lemma_vectorizer(train, test)
lr_lemma = LogisticRegression()
lr_lemma.fit(train_lemma[0], train_lemma[1])
lemma_predictions = lr_lemma.predict(test_lemma[0])
lemma_report = report(test_lemma[1], lemma_predictions)
print(lemma_report)

              precision    recall  f1-score   support

   brazilian       0.74      0.54      0.62       115
     british       0.59      0.38      0.46       194
cajun_creole       0.81      0.68      0.74       371
     chinese       0.80      0.81      0.81       677
    filipino       0.68      0.57      0.62       174
      french       0.62      0.60      0.61       657
       greek       0.82      0.67      0.74       295
      indian       0.84      0.89      0.86       780
       irish       0.70      0.43      0.53       192
     italian       0.77      0.89      0.83      1928
    jamaican       0.79      0.60      0.68       129
    japanese       0.80      0.67      0.73       373
      korean       0.88      0.74      0.80       223
     mexican       0.89      0.91      0.90      1625
    moroccan       0.80      0.77      0.78       192
     russian       0.65      0.37      0.47       116
 southern_us       0.66      0.82      0.73      1063
     spanish       0.66    

It looks like the lemmatization made only marginal differences to our model: the per-class scores shown above are almost identical to those of the plain count vectorizer. There are other preprocessing steps that we could pursue that may improve our predictions. For example, terms like "extra-virgin olive oil" probably do not differentiate a recipe any more than "olive oil." We could spend more time on feature engineering to improve our predictions. 

For now, it might make sense to cover some of the algorithm choices that could improve our predictions.

In [7]:
# Tally every (actual, predicted) pair, keep only the pairs where the two
# labels differ, and show the ten most frequent confusions.
lemma_errors = list(zip(test_lemma[1], lemma_predictions))
error_counter = Counter(lemma_errors)
items_list = list(error_counter.items())
true_errors = sorted(
    (pair_count for pair_count in items_list
     if pair_count[0][0] != pair_count[0][1]),
    key=lambda pair_count: pair_count[1],
    reverse=True,
)
print(true_errors[:10])

[(('french', 'italian'), 140), (('italian', 'french'), 70), (('spanish', 'italian'), 64), (('cajun_creole', 'southern_us'), 63), (('french', 'southern_us'), 63), (('italian', 'southern_us'), 57), (('greek', 'italian'), 57), (('southern_us', 'italian'), 51), (('japanese', 'indian'), 46), (('british', 'southern_us'), 44)]


It looks like this classifier performs poorly on the French-Italian pair compared to the other pairs: French recipes predicted as Italian are by far the most common error. This may be helpful in the future when developing an ensemble model to predict the rest of the dataset.

In [8]:
# Lemmatize the ingredients of the full labelled set (not just the train
# split) and print the first recipe as a quick sanity check.
training_lemmas = prep_lemmatizer(raw_train["ingredients"])
print(training_lemmas[0])

['romaine lettuce', 'black olive', 'grape tomato', 'garlic', 'pepper', 'purple onion', 'seasoning', 'garbanzo bean', 'feta cheese crumbles']


In [9]:
# implement a voting classifier
# in order to do this, you'll need to install the mlxtend library
from mlxtend.classifier import EnsembleVoteClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
# Identity tokenizer: each recipe is already a list of lemmatized ingredients.
count_vectorizer = CountVectorizer(tokenizer=lambda doc: doc,
                                   lowercase=False)

# Learn the vocabulary from the full labelled set and encode it in one pass;
# count_vectorizer is left fitted, exactly as with a separate fit + transform.
train_matrix = count_vectorizer.fit_transform(training_lemmas)


In [11]:
# Seed every estimator that can draw random numbers so the cross-validation
# scores are reproducible from one kernel restart to the next.
clf1 = MultinomialNB()
clf2 = LogisticRegression(random_state=42)
clf3 = LinearSVC(random_state=42)
clf4 = RandomForestClassifier(random_state=42)
# Weighted vote over the four base models; logistic regression is weighted 1.5.
eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3, clf4], weights=[1, 1.5, 1, 1])
labels = ["Naive Bayes", "Logistic Regression", "SVM", "RandomForest", "ECLF"]
for clf, label in zip([clf1, clf2, clf3, clf4, eclf], labels):
    # 3-fold cross-validated accuracy on the lemmatized count matrix.
    scores = cross_val_score(clf, train_matrix, raw_train["cuisine"],
                             cv=3,
                             scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]"
          % (scores.mean(), scores.std(), label))


Accuracy: 0.72 (+/- 0.00) [Naive Bayes]
Accuracy: 0.78 (+/- 0.00) [Logistic Regression]
Accuracy: 0.76 (+/- 0.00) [SVM]
Accuracy: 0.66 (+/- 0.00) [RandomForest]
Accuracy: 0.78 (+/- 0.00) [ECLF]


We can try tuning the lexical features a bit more in order to improve our predictions. At first, we tried to preserve the structure of the data so that each multi-word ingredient was kept (e.g., feta cheese crumbles). It might make sense to consolidate everything into one string and test from there. 

In [12]:
def flatten_lemma(ingredient_list):
    """Return the lemmatized words of one recipe as a single flat list.

    Every ingredient string is split on whitespace, so multi-word ingredients
    contribute one entry per word, and each word is passed through
    ``WordNetLemmatizer.lemmatize``. Word order follows the order of the
    ingredients.

    Parameters
    ----------
    ingredient_list : iterable of str
        The ingredient strings of a single recipe.

    Returns
    -------
    list of str
        The lemmatized words.
    """
    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(word)
        for ingredient in ingredient_list
        for word in ingredient.split()
    ]
# Collapse each recipe into a single space-separated string of lemmatized words.
flat_data = [" ".join(flatten_lemma(recipe)) for recipe in raw_train["ingredients"]]
print(flat_data[0])

romaine lettuce black olive grape tomato garlic pepper purple onion seasoning garbanzo bean feta cheese crumbles


We can try to improve our predictions by using a grid search to optimize the best hyper-parameters (e.g., the min number of words to consider, number of tokens, etc.) 

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder 
# Grid-search the tf-idf vectorizer settings of a TfidfVectorizer -> LinearSVC
# pipeline over the flattened recipe strings.
# NOTE(review): the output saved under this cell shows a CountVectorizer +
# MultinomialNB pipeline rather than the one defined here, so it appears to
# come from an earlier version of the cell -- re-run to refresh it.

# Encode the cuisine names as integer class labels.
le = LabelEncoder().fit(raw_train["cuisine"])
encoded_cuisine = le.transform(raw_train["cuisine"]) 
# Keys use the "<step name>__<parameter>" form, so they target the "tf_idf" step.
params = {"tf_idf__min_df": [1, 3, 10], # drop terms found in fewer than this many documents
          "tf_idf__ngram_range": [(1, 1), (1, 2)], # 1-grams only, or 1-grams plus 2-grams
          "tf_idf__max_df": [.99, .85, .5] # drop terms found in more than this share of documents
         }

estimators = [("tf_idf", TfidfVectorizer()), 
              ("SVC", LinearSVC())]
model = Pipeline(estimators)
# No cv or scoring is passed, so GridSearchCV uses its defaults for both.
grid = GridSearchCV(estimator=model, param_grid = params)
grid.fit(flat_data, encoded_cuisine) 

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(steps=[('tf_idf', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('NB', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'tf_idf__min_df': [1, 3, 10], 'tf_idf__ngram_range': [(1, 1), (1, 2)], 'tf_idf__max_df': [0.99, 0.85, 0.5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [19]:
# Parameter combination selected by the grid search.
grid.best_params_

{'tf_idf__max_df': 0.99, 'tf_idf__min_df': 1, 'tf_idf__ngram_range': (1, 1)}

In [31]:
# NOTE(review): grouped_vectorizer / train_matrix2 are not referenced by any
# later cell visible in this notebook (every pipeline below vectorizes
# flat_data itself); they are kept only so the existing names stay defined.
grouped_vectorizer = CountVectorizer()
train_matrix2 = grouped_vectorizer.fit_transform(flat_data)

# Each pipeline owns its vectorizer, so cross_val_score refits the vectorizer
# together with the classifier. Estimators that can draw random numbers are
# seeded so the scores are reproducible.
svc_estimators = [("tf_idf", TfidfVectorizer()),
                  ("SVC", LinearSVC(random_state=42))]
# The Naive Bayes step used to be labeled "SVC"; name it after what it holds.
nb_estimators = [("count_vectorizer", CountVectorizer()),
                 ("NB", MultinomialNB())]
logit_estimators = [("count_vectorizer", CountVectorizer()),
                    ("LR", LogisticRegression(random_state=42))]
rf_estimators = [("count_vectorizer", CountVectorizer()),
                 ("RF", RandomForestClassifier(random_state=42))]

clf1 = Pipeline(nb_estimators)
clf2 = Pipeline(logit_estimators)
clf3 = Pipeline(svc_estimators)
clf4 = Pipeline(rf_estimators)
# Weighted vote: the SVM pipeline is weighted 1.5, the random forest 0.5.
eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3, clf4], weights=[1, 1, 1.5, .5])
labels = ["Naive Bayes", "Logistic Regression", "SVM", "RandomForest", "ECLF"]
for clf, label in zip([clf1, clf2, clf3, clf4, eclf], labels):
    # 3-fold cross-validated accuracy on the flattened recipe strings.
    scores = cross_val_score(clf, flat_data, raw_train["cuisine"],
                             cv=3,
                             scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]"
          % (scores.mean(), scores.std(), label))

Accuracy: 0.72 (+/- 0.00) [Naive Bayes]
Accuracy: 0.78 (+/- 0.00) [Logistic Regression]
Accuracy: 0.79 (+/- 0.00) [SVM]
Accuracy: 0.70 (+/- 0.00) [RandomForest]
Accuracy: 0.79 (+/- 0.00) [ECLF]


This last model looks pretty good. We're getting ~79% accuracy and we haven't seen any major improvements from any of the feature or model tuning. To make a submission, we need to load in the test data and transform the features in the same way as for the training model. From there, we can make a prediction and a submission. 

In [33]:
# Load the unlabelled test recipes and flatten them with the same
# flatten_lemma transformation that produced flat_data.
with open("test.json", "rb") as t:
    test_json = json.load(t)
raw_test = pd.DataFrame.from_dict(test_json)
flat_test = [" ".join(flatten_lemma(recipe)) for recipe in raw_test["ingredients"]]

# Build a fresh ensemble from the four pipelines, fit it on the full labelled
# set, then predict a cuisine for every test recipe.
eclf = EnsembleVoteClassifier(
    clfs=[clf1, clf2, clf3, clf4],
    weights=[1, 1, 1.5, .5],
)
eclf.fit(flat_data, raw_train["cuisine"])
test_predictions = eclf.predict(flat_test)

In [39]:
# Pin the column order explicitly (id first): older pandas versions sort dict
# keys alphabetically, which would write "cuisine" before "id".
out_df = pd.DataFrame({"id": list(raw_test["id"]), "cuisine": test_predictions},
                      columns=["id", "cuisine"])
out_df.to_csv("RecipeOutput.csv", index=False)

This submission scored .79022 when I uploaded it to Kaggle, which would put it at ~400th place out of 1388. While we probably could do better, this score was only ~2% off the winning score. Depending on your application, a 2% improvement in modeling performance can be important or just noise. 