## Yummly recipe data analysis and modeling

In [285]:
import pandas as pd
import numpy as np
import re

In [286]:
pd.set_option('display.max_colwidth', -1)

In [287]:
yummly_df = pd.read_pickle('yummly_df.pkl')

In [288]:
yummly_df.head()

Unnamed: 0,bitter,course,cuisine,ingredients,meaty,piquant,rating,recipe_name,salty,sour,sweet
0,0.666667,[Main Dishes],american,"['dried pasta', 'milk', 'shredded cheddar cheese', 'salt', 'dijon mustard']",0.166667,0.166667,4,Revolutionary Mac & Cheese,0.833333,0.166667,0.166667
1,0.5,[Salads],american,"['tomatoes', 'avocado', 'red onion', 'chopped cilantro', 'lime', 'extra-virgin olive oil', 'salt']",0.166667,0.0,4,Avocado and Tomato Salad,0.166667,0.833333,0.166667
2,,"[Breakfast and Brunch, Breads]",american,"['melted butter', 'biscuit dough', 'fresh mozzarella', 'bacon', 'shredded cheddar cheese']",,,5,Easy Cheesy Bacon Biscuit Pull-Aparts,,,
3,,[Side Dishes],american,"['cauliflower', 'extra-virgin olive oil', 'red pepper flakes', 'salt', 'ground black pepper']",,,5,Roasted Spicy Cauliflower,,,
4,0.833333,,american,"['yukon gold potatoes', 'salt', 'smoked paprika', 'olive oil']",0.166667,0.166667,5,Shakin’ Hash Browns,0.166667,0.666667,0.0


In [289]:
yummly_df['ingredients'][0]

"['dried pasta', 'milk', 'shredded cheddar cheese', 'salt', 'dijon mustard']"

Now I'm going to convert the string representation of my list of ingredients to an actual list.  I found a handy library which will do that for me called ast.

In [290]:
import ast

In [291]:
yummly_df['ingredients'] = yummly_df['ingredients'].apply(lambda x: ast.literal_eval(x))

In [292]:
yummly_df.head()

Unnamed: 0,bitter,course,cuisine,ingredients,meaty,piquant,rating,recipe_name,salty,sour,sweet
0,0.666667,[Main Dishes],american,"[dried pasta, milk, shredded cheddar cheese, salt, dijon mustard]",0.166667,0.166667,4,Revolutionary Mac & Cheese,0.833333,0.166667,0.166667
1,0.5,[Salads],american,"[tomatoes, avocado, red onion, chopped cilantro, lime, extra-virgin olive oil, salt]",0.166667,0.0,4,Avocado and Tomato Salad,0.166667,0.833333,0.166667
2,,"[Breakfast and Brunch, Breads]",american,"[melted butter, biscuit dough, fresh mozzarella, bacon, shredded cheddar cheese]",,,5,Easy Cheesy Bacon Biscuit Pull-Aparts,,,
3,,[Side Dishes],american,"[cauliflower, extra-virgin olive oil, red pepper flakes, salt, ground black pepper]",,,5,Roasted Spicy Cauliflower,,,
4,0.833333,,american,"[yukon gold potatoes, salt, smoked paprika, olive oil]",0.166667,0.166667,5,Shakin’ Hash Browns,0.166667,0.666667,0.0


Next I'd like to clean the recipe_name column and tokenize it.

In [293]:
def clean_recipe_name(df):
    df['recipe_name'] = df['recipe_name'].str.replace(r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " ")
    df['recipe_name'] = df['recipe_name'].str.replace(r"&","")
    df['recipe_name'] = df['recipe_name'].str.lower()
    
    return df

In [294]:
yummly_df2 = clean_recipe_name(yummly_df)

In [295]:
yummly_df2.head(5)

Unnamed: 0,bitter,course,cuisine,ingredients,meaty,piquant,rating,recipe_name,salty,sour,sweet
0,0.666667,[Main Dishes],american,"[dried pasta, milk, shredded cheddar cheese, salt, dijon mustard]",0.166667,0.166667,4,revolutionary mac cheese,0.833333,0.166667,0.166667
1,0.5,[Salads],american,"[tomatoes, avocado, red onion, chopped cilantro, lime, extra-virgin olive oil, salt]",0.166667,0.0,4,avocado and tomato salad,0.166667,0.833333,0.166667
2,,"[Breakfast and Brunch, Breads]",american,"[melted butter, biscuit dough, fresh mozzarella, bacon, shredded cheddar cheese]",,,5,easy cheesy bacon biscuit pull aparts,,,
3,,[Side Dishes],american,"[cauliflower, extra-virgin olive oil, red pepper flakes, salt, ground black pepper]",,,5,roasted spicy cauliflower,,,
4,0.833333,,american,"[yukon gold potatoes, salt, smoked paprika, olive oil]",0.166667,0.166667,5,shakin hash browns,0.166667,0.666667,0.0


In [296]:
cols = yummly_df2.columns
cols

Index(['bitter', 'course', 'cuisine', 'ingredients', 'meaty', 'piquant',
       'rating', 'recipe_name', 'salty', 'sour', 'sweet'],
      dtype='object')

In [314]:
mask = yummly_df2['cuisine']=='japanese'

In [316]:
yummly_df2.loc[mask,['cuisine', 'course', 'ingredients', 'bitter', 'meaty', 'piquant', 'salty', 'sour', 'sweet',
       'rating', 'recipe_name']].tail()

Unnamed: 0,cuisine,course,ingredients,bitter,meaty,piquant,salty,sour,sweet,rating,recipe_name
9988,japanese,,"[pork belly, shoyu, mirin, sake, sugar, scallions, garlic, shallots, ginger, salt]",0.333333,0.833333,0.0,0.833333,0.166667,0.333333,3,japanese chashu pork belly (for ramen)
9989,japanese,[Condiments and Sauces],"[light brown sugar, mirin, reduced sodium soy sauce]",0.833333,0.166667,0.0,0.833333,0.0,0.833333,3,canal house teriyaki sauce
9990,japanese,"[Breakfast and Brunch, Lunch]","[fresh spinach, spinach, onions, garlic cloves, large eggs, salt, black pepper, soy sauce, sugar, olive oil]",0.833333,0.166667,0.0,0.666667,0.833333,0.166667,4,spinach tamagoyaki (spinach packed omelette)
9991,japanese,[Main Dishes],"[pork shoulder, soy sauce, mirin, sake, sugar, garlic, green onions, ginger, shallots]",,,,,,,4,slow braised japanese chashu pork
9992,japanese,[Side Dishes],"[gai lan, cooking oil, fresh ginger, garlic, hot pepper, miso paste, water, toasted sesame oil, soy sauce]",0.5,0.166667,0.166667,0.333333,0.833333,0.166667,5,chinese broccoli with garlicky ginger miso


In [159]:
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')

yummly_df2["tokens_rn"] = yummly_df2["recipe_name"].apply(tokenizer.tokenize)
yummly_df2.head()

Unnamed: 0,bitter,course,cuisine,ingredients,meaty,piquant,rating,recipe_name,salty,sour,sweet,tokens_rn
0,0.666667,[Main Dishes],american,"[dried pasta, milk, shredded cheddar cheese, salt, dijon mustard]",0.166667,0.166667,4,revolutionary mac cheese,0.833333,0.166667,0.166667,"[revolutionary, mac, cheese]"
1,0.5,[Salads],american,"[tomatoes, avocado, red onion, chopped cilantro, lime, extra-virgin olive oil, salt]",0.166667,0.0,4,avocado and tomato salad,0.166667,0.833333,0.166667,"[avocado, and, tomato, salad]"
2,,"[Breakfast and Brunch, Breads]",american,"[melted butter, biscuit dough, fresh mozzarella, bacon, shredded cheddar cheese]",,,5,easy cheesy bacon biscuit pull aparts,,,,"[easy, cheesy, bacon, biscuit, pull, aparts]"
3,,[Side Dishes],american,"[cauliflower, extra-virgin olive oil, red pepper flakes, salt, ground black pepper]",,,5,roasted spicy cauliflower,,,,"[roasted, spicy, cauliflower]"
4,0.833333,,american,"[yukon gold potatoes, salt, smoked paprika, olive oil]",0.166667,0.166667,5,shakin hash browns,0.166667,0.666667,0.0,"[shakin, hash, browns]"


In [165]:
yummly_df2.head(10)

Unnamed: 0,bitter,course,cuisine,ingredients,meaty,piquant,rating,recipe_name,salty,sour,sweet,tokens_rn
0,0.666667,[Main Dishes],american,"[dried pasta, milk, shredded cheddar cheese, salt, dijon mustard]",0.166667,0.166667,4,revolutionary mac cheese,0.833333,0.166667,0.166667,"[revolutionary, mac, cheese]"
1,0.5,[Salads],american,"[tomatoes, avocado, red onion, chopped cilantro, lime, extra-virgin olive oil, salt]",0.166667,0.0,4,avocado and tomato salad,0.166667,0.833333,0.166667,"[avocado, and, tomato, salad]"
2,,"[Breakfast and Brunch, Breads]",american,"[melted butter, biscuit dough, fresh mozzarella, bacon, shredded cheddar cheese]",,,5,easy cheesy bacon biscuit pull aparts,,,,"[easy, cheesy, bacon, biscuit, pull, aparts]"
3,,[Side Dishes],american,"[cauliflower, extra-virgin olive oil, red pepper flakes, salt, ground black pepper]",,,5,roasted spicy cauliflower,,,,"[roasted, spicy, cauliflower]"
4,0.833333,,american,"[yukon gold potatoes, salt, smoked paprika, olive oil]",0.166667,0.166667,5,shakin hash browns,0.166667,0.666667,0.0,"[shakin, hash, browns]"
5,0.833333,[Lunch],american,"[red onion, bread, cheddar cheese, unsalted butter]",0.833333,0.0,5,best grilled cheese sandwich,0.833333,0.166667,0.166667,"[best, grilled, cheese, sandwich]"
6,,[Main Dishes],american,"[sweet paprika, brown sugar, cayenne pepper, salt, freshly ground black pepper, pork baby back ribs, barbecue sauce, vegetable oil spray]",,,5,slow cooker ribs,,,,"[slow, cooker, ribs]"
7,,,american,"[water, lemon, onions, celery seed, allspice, salt, cayenne pepper, crab boil, bay leaves, crawfish, ice water]",,,3,milton zatarain's crawfish boil,,,,"[milton, zatarain, s, crawfish, boil]"
8,0.166667,[Main Dishes],american,"[jalapeno chilies, lemon, dried oregano, olive oil, garlic, boneless skinless chicken breasts]",0.666667,0.833333,4,chicago chicken,0.166667,0.833333,0.0,"[chicago, chicken]"
9,1.0,"[Main Dishes, Lunch]",american,"[ground chicken, avocado, chopped garlic, panko, jalapeno chilies, salt, pepper]",1.0,0.666667,4,chicken avocado burgers,1.0,1.0,1.0,"[chicken, avocado, burgers]"


Now I'd like to attempt to get rid of stopwords or other unimportant words from my ingredients list.

In [162]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [164]:
type(ENGLISH_STOP_WORDS)

frozenset

In [170]:
def tokenize_list(l):
    new_l = [val.split(" ") for val in l]
    flat_l = [item for sublist in new_l for item in sublist]
    return flat_l

In [171]:
temp_l = tokenize_list(yummly_df2['ingredients'][0])

In [172]:
temp_l

['dried',
 'pasta',
 'milk',
 'shredded',
 'cheddar',
 'cheese',
 'salt',
 'dijon',
 'mustard']

In [178]:
yummly_df2['tokens_ingr'] = yummly_df2['ingredients'].apply(tokenize_list)

In [179]:
yummly_df2.head()

Unnamed: 0,bitter,course,cuisine,ingredients,meaty,piquant,rating,recipe_name,salty,sour,sweet,tokens_rn,tokens_ingr
0,0.666667,[Main Dishes],american,"[dried pasta, milk, shredded cheddar cheese, salt, dijon mustard]",0.166667,0.166667,4,revolutionary mac cheese,0.833333,0.166667,0.166667,"[revolutionary, mac, cheese]","[dried, pasta, milk, shredded, cheddar, cheese, salt, dijon, mustard]"
1,0.5,[Salads],american,"[tomatoes, avocado, red onion, chopped cilantro, lime, extra-virgin olive oil, salt]",0.166667,0.0,4,avocado and tomato salad,0.166667,0.833333,0.166667,"[avocado, and, tomato, salad]","[tomatoes, avocado, red, onion, chopped, cilantro, lime, extra-virgin, olive, oil, salt]"
2,,"[Breakfast and Brunch, Breads]",american,"[melted butter, biscuit dough, fresh mozzarella, bacon, shredded cheddar cheese]",,,5,easy cheesy bacon biscuit pull aparts,,,,"[easy, cheesy, bacon, biscuit, pull, aparts]","[melted, butter, biscuit, dough, fresh, mozzarella, bacon, shredded, cheddar, cheese]"
3,,[Side Dishes],american,"[cauliflower, extra-virgin olive oil, red pepper flakes, salt, ground black pepper]",,,5,roasted spicy cauliflower,,,,"[roasted, spicy, cauliflower]","[cauliflower, extra-virgin, olive, oil, red, pepper, flakes, salt, ground, black, pepper]"
4,0.833333,,american,"[yukon gold potatoes, salt, smoked paprika, olive oil]",0.166667,0.166667,5,shakin hash browns,0.166667,0.666667,0.0,"[shakin, hash, browns]","[yukon, gold, potatoes, salt, smoked, paprika, olive, oil]"


Next I'd like to try to throw out stop words from the ingredients list such as 'shredded','extra-virgin' or 'chopped'.  I'm going to try to analyze word frequency in order to achieve this.

In [189]:
yummly_df2['ingredients_string'] = yummly_df2['ingredients'].str.join(' ')

In [191]:
#yummly_df2.head()

In [222]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [223]:
list_corpus = yummly_df2['ingredients_string'].tolist()
list_labels = yummly_df2['cuisine'].tolist()

In [224]:
vectorizer = TfidfVectorizer()
vectorizer.fit(list_corpus)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [225]:
vector = vectorizer.transform(yummly_df2['ingredients_string'])
max_value = vector.max(axis=0).toarray().ravel()
sorted_by_tfidf = max_value.argsort()

In [226]:
feature_names = np.array(vectorizer.get_feature_names())
print("Features with the lowest tfidf:\n{}".format(feature_names[sorted_by_tfidf[:100]]))

Features with the lowest tfidf:
['range' 'huckleberries' 'cornish' 'partridges' 'hens' 'village' 'harvest'
 'dijonnaise' 'unfiltered' 'collect' 'burgundi' 'vineyard' 'premium'
 'jamón' 'pinch' 'perfect' 'vital' 'world' 'nacho' 'pressed' 'gelato'
 'four' 'fresca' 'tradit' 'tasajo' 'calabaza' 'crosswise' 'snaps' 'pilaf'
 'toll' 'amontillado' 'flatout' 'flatbreads' 'chex' 'substitut' 'chees'
 'lea' 'perrins' 'candlenuts' 'brazil' 'barramundi' 'leav' 'lettuc'
 'romain' 'gourmet' 'super' 'natur' 'skippi' 'garlic' 'rins' 'cara' 'mia'
 'pepperocini' 'leafy' 'candies' 'cupcake' 'craisins' 'cheerios' 'mm'
 'valentine' 'du' 'lentilles' 'rosa' 'sheet' 'creations' 'klondike'
 'stale' 'roquefort' 'traditional' 'amaranth' 'trifle' 'heath' 'bowl'
 'pompeian' 'boned' 'fettuccini' 'cook' 'claws' 'drain' 'touch' 'pack'
 'minicub' 'parslei' 'fresno' 'seitan' 'stellette' 'flageolet' 'caper'
 'teardrop' '100' 'crocker' 'blackberry' 'betty' 'crisp' 'ritz' 'eatin'
 'bianca' 'kasuri' 'crabs' 'drumstick']


In [227]:
print("Features with the highest tfidf:\n{}".format(feature_names[sorted_by_tfidf[-100:]]))

Features with the highest tfidf:
['beaten' 'sunflower' 'stilton' 'biscoff' 'mirin' 'carrots' 'creole'
 'tart' 'pimenton' 'seasoning' 'oleo' 'lemonade' 'cornmeal' 'liqueur'
 'millet' 'pectin' 'pizza' 'brisket' 'sofrito' 'whiskey' 'paneer' 'liver'
 'beech' 'mango' 'alum' 'kosher' 'sushi' 'clarified' 'violets'
 'buttermilk' 'melon' 'sea' 'herbs' 'naan' 'pancake' 'papad' 'orange'
 'gizzards' 'yardlong' 'muenster' 'rose' 'couscous' 'champagne'
 'drippings' 'limoncello' 'melted' 'chambord' 'fruit' 'gram' 'roe' 'gumbo'
 'cookies' 'citrus' 'sardines' 'oatmeal' 'crisco' 'plantains' 'acorn'
 'cottage' 'sheepshead' 'atta' 'potatoes' 'boudin' 'half' 'shells'
 'apples' 'goya' 'strawberries' 'lard' 'flavoring' 'yucca' 'liquor'
 'semolina' 'bran' 'pickling' 'konbu' 'dried' 'juice' 'cauliflower'
 'jarlsberg' 'chicory' 'liquid' 'cultured' 'brats' 'coffee' 'bhaji'
 'dates' 'vodka' 'kit' 'ground' 'grits' 'peach' 'duck' 'homemade'
 'mccormick' 'biscuits' 'cabbage' 'taro' 'peanuts' 'pudding']


The following list of words are those with the lowest idf score.  That is, those that appear frequently and are therefore deemed less important.

In [278]:
sorted_by_idf = np.argsort(vectorizer.idf_)
print("Features with the lowest idf:\n{}".format(feature_names[sorted_by_idf[:10]]))

Features with the lowest idf:
['salt' 'oil' 'pepper' 'garlic' 'sugar' 'ground' 'olive' 'butter' 'flour'
 'sauce']


In [240]:
custom_stop_words = []
for word in ENGLISH_STOP_WORDS:
    custom_stop_words.append(word) 
    
custom_stop_words = custom_stop_words + feature_names[sorted_by_idf[:30]].tolist()

Next I'm going to do Bag-of-Words processing.

In [241]:
from sklearn.feature_extraction.text import CountVectorizer

In [242]:
count_vect = CountVectorizer(stop_words=custom_stop_words)

In [243]:
counts = count_vect.fit_transform(yummly_df2["ingredients_string"])  # sparse matrix with columns corresponding to words
words = count_vect.get_feature_names()  # array with words corresponding to columns

In [244]:
counts.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [213]:
#words

Now I'm going to try K-means clustering on my vectorized data set.

In [245]:
from sklearn.cluster import KMeans

In [246]:
number_of_clusters=10
km = KMeans(n_clusters = number_of_clusters)
km.fit(counts)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=10, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [247]:
print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = count_vect.get_feature_names()
for i in range(number_of_clusters):
    top_ten_words = [terms[ind] for ind in order_centroids[i, :10]]
    print("Cluster {}: {}".format(i, ' '.join(top_ten_words)))



Top terms per cluster:
Cluster 0: beans corn tortillas shredded cumin salsa chili cheddar chopped chilies
Cluster 1: dried oregano feta cucumber virgin extra vinegar parsley leaves wine
Cluster 2: vanilla extract large egg purpose chocolate heavy granulated unsalted yolks
Cluster 3: beef paprika parsley broth bell chopped leaves bay sour cloves
Cluster 4: sesame ginger seeds vinegar corn starch toasted brown vegetable scallions
Cluster 5: vinegar large purpose kosher brown bread potatoes unsalted seasoning pineapple
Cluster 6: lime mint leaves coconut rum fish chopped ginger paste chilies
Cluster 7: skinless boneless breasts bell broth ginger corn pineapple paprika coconut
Cluster 8: cumin ginger coriander turmeric cinnamon paprika cayenne chopped cloves chili
Cluster 9: baking purpose soda buttermilk unsalted large vanilla granulated wheat cinnamon


In [248]:
number_of_clusters=25
km = KMeans(n_clusters = number_of_clusters)
km.fit(counts)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=25, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [249]:
print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = count_vect.get_feature_names()
for i in range(number_of_clusters):
    top_ten_words = [terms[ind] for ind in order_centroids[i, :10]]
    print("Cluster {}: {}".format(i, ' '.join(top_ten_words)))


Top terms per cluster:
Cluster 0: paprika smoked sour parsley sweet broth cumin hungarian cayenne bell
Cluster 1: lime chopped fish chilies jalapeno leaves chili cumin ginger chile
Cluster 2: shredded cheddar tortillas beans salsa sour chopped corn taco chilies
Cluster 3: baking purpose soda buttermilk unsalted large vanilla granulated cinnamon heavy
Cluster 4: coconut curry lime paste fish ginger leaves thai brown breasts
Cluster 5: yeast dry active purpose warm bread unsalted large instant cinnamon
Cluster 6: parsley leaf chopped flat virgin extra cloves paprika kosher freshly
Cluster 7: skinless boneless breasts broth paprika bell thighs chopped ginger cloves
Cluster 8: parmesan grated mozzarella shredded basil italian pasta leaves parsley ricotta
Cluster 9: yogurt greek plain cucumber dill feta nonfat kosher cloves cumin
Cluster 10: dried oregano thyme basil parsley cayenne paprika leaves bell kosher
Cluster 11: beef broth parsley allspice sour pork paprika nutmeg bread stock
Clust

K-means clustering shows that certain words show up a lot, but are not informative.  These words are like 'extra','virgin','extract','unsalted'.   I'm going to include these words in my list of stopwords before doing further analysis.

Another observation is that I feel like certain words should be hyphenated, since they always go together.  These words include 'baking-soda','baking-power','sesame-seeds'.  I'm going to do this step first before removing any stop words.

In [262]:
yummly_df3 = clean_recipe_name(yummly_df)

In [270]:
def hyphenate_key_ingredients(df_row_l):
    ingredients_to_hyphenate = ['baking soda','baking powder','sesame seeds','simple syrup',
                                'olive oil','corn starch','garam masala']
    
    new_df_row_l = []
    
    for val in df_row_l:
        #print(val)
        if val in ingredients_to_hyphenate:
            for ingredient in ingredients_to_hyphenate:
                if val in ingredient:
                    print(ingredient)
                    new_val_l = ingredient.split(' ')
                    new_val_s = '-'.join(new_val_l)
                    new_df_row_l.append(new_val_s)
                    break
        else:
            new_df_row_l.append(val)
    return new_df_row_l

In [263]:
yummly_df3['ingredients'] = yummly_df3['ingredients'].apply(hyphenate_key_ingredients)

In [264]:
yummly_df3.head(10)

Unnamed: 0,bitter,course,cuisine,ingredients,meaty,piquant,rating,recipe_name,salty,sour,sweet,tokens_rn,tokens_ingr,ingredients_string
0,0.666667,[Main Dishes],american,"[dried pasta, milk, shredded cheddar cheese, salt, dijon mustard]",0.166667,0.166667,4,revolutionary mac cheese,0.833333,0.166667,0.166667,"[revolutionary, mac, cheese]","[dried, pasta, milk, shredded, cheddar, cheese, salt, dijon, mustard]",dried pasta milk shredded cheddar cheese salt dijon mustard
1,0.5,[Salads],american,"[tomatoes, avocado, red onion, chopped cilantro, lime, extra-virgin olive oil, salt]",0.166667,0.0,4,avocado and tomato salad,0.166667,0.833333,0.166667,"[avocado, and, tomato, salad]","[tomatoes, avocado, red, onion, chopped, cilantro, lime, extra-virgin, olive, oil, salt]",tomatoes avocado red onion chopped cilantro lime extra-virgin olive oil salt
2,,"[Breakfast and Brunch, Breads]",american,"[melted butter, biscuit dough, fresh mozzarella, bacon, shredded cheddar cheese]",,,5,easy cheesy bacon biscuit pull aparts,,,,"[easy, cheesy, bacon, biscuit, pull, aparts]","[melted, butter, biscuit, dough, fresh, mozzarella, bacon, shredded, cheddar, cheese]",melted butter biscuit dough fresh mozzarella bacon shredded cheddar cheese
3,,[Side Dishes],american,"[cauliflower, extra-virgin olive oil, red pepper flakes, salt, ground black pepper]",,,5,roasted spicy cauliflower,,,,"[roasted, spicy, cauliflower]","[cauliflower, extra-virgin, olive, oil, red, pepper, flakes, salt, ground, black, pepper]",cauliflower extra-virgin olive oil red pepper flakes salt ground black pepper
4,0.833333,,american,"[yukon gold potatoes, salt, smoked paprika, olive-oil]",0.166667,0.166667,5,shakin hash browns,0.166667,0.666667,0.0,"[shakin, hash, browns]","[yukon, gold, potatoes, salt, smoked, paprika, olive, oil]",yukon gold potatoes salt smoked paprika olive oil
5,0.833333,[Lunch],american,"[red onion, bread, cheddar cheese, unsalted butter]",0.833333,0.0,5,best grilled cheese sandwich,0.833333,0.166667,0.166667,"[best, grilled, cheese, sandwich]","[red, onion, bread, cheddar, cheese, unsalted, butter]",red onion bread cheddar cheese unsalted butter
6,,[Main Dishes],american,"[sweet paprika, brown sugar, cayenne pepper, salt, freshly ground black pepper, pork baby back ribs, barbecue sauce, vegetable oil spray]",,,5,slow cooker ribs,,,,"[slow, cooker, ribs]","[sweet, paprika, brown, sugar, cayenne, pepper, salt, freshly, ground, black, pepper, pork, baby, back, ribs, barbecue, sauce, vegetable, oil, spray]",sweet paprika brown sugar cayenne pepper salt freshly ground black pepper pork baby back ribs barbecue sauce vegetable oil spray
7,,,american,"[water, lemon, onions, celery seed, allspice, salt, cayenne pepper, crab boil, bay leaves, crawfish, ice water]",,,3,milton zatarain's crawfish boil,,,,"[milton, zatarain, s, crawfish, boil]","[water, lemon, onions, celery, seed, allspice, salt, cayenne, pepper, crab, boil, bay, leaves, crawfish, ice, water]",water lemon onions celery seed allspice salt cayenne pepper crab boil bay leaves crawfish ice water
8,0.166667,[Main Dishes],american,"[jalapeno chilies, lemon, dried oregano, olive-oil, garlic, boneless skinless chicken breasts]",0.666667,0.833333,4,chicago chicken,0.166667,0.833333,0.0,"[chicago, chicken]","[jalapeno, chilies, lemon, dried, oregano, olive, oil, garlic, boneless, skinless, chicken, breasts]",jalapeno chilies lemon dried oregano olive oil garlic boneless skinless chicken breasts
9,1.0,"[Main Dishes, Lunch]",american,"[ground chicken, avocado, chopped garlic, panko, jalapeno chilies, salt, pepper]",1.0,0.666667,4,chicken avocado burgers,1.0,1.0,1.0,"[chicken, avocado, burgers]","[ground, chicken, avocado, chopped, garlic, panko, jalapeno, chilies, salt, pepper]",ground chicken avocado chopped garlic panko jalapeno chilies salt pepper


In [282]:
yummly_df['ingredients'].to_string()

"0        [dried pasta, milk, shredded cheddar cheese, salt, dijon mustard]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       \n1        [tomatoes, avocado, red onion, chopped cilantro, lime, extra-virgin olive oil, salt]                                                                                                                                                                                                                                                                                                       

In [258]:
'olive oil' in 'extra-virgin olive oil'

True

In [266]:
yummly_df3['ingredients'][1]

['tomatoes',
 'avocado',
 'red onion',
 'chopped cilantro',
 'lime',
 'extra-virgin olive oil',
 'salt']

In [271]:
hyphenate_key_ingredients(yummly_df3['ingredients'][1])

tomatoes
avocado
red onion
chopped cilantro
lime
extra-virgin olive oil
salt


['tomatoes',
 'avocado',
 'red onion',
 'chopped cilantro',
 'lime',
 'extra-virgin olive oil',
 'salt']

In [272]:
'extra-virgin olive oil' in 'olive oil'

False

In [273]:
ingredients_to_hyphenate = ['baking soda','baking powder','sesame seeds','simple syrup',
                                'olive oil','corn starch','garam masala']

In [274]:
if ingredients_to_hyphenate in 'extra-virgin olive oil'

SyntaxError: invalid syntax (<ipython-input-274-daa8a43259d0>, line 1)

In [283]:
yummly_df3.shape

(12492, 14)