In [2]:
import pandas as pd
from string import ascii_lowercase
import numpy as np
import re
from collections import Counter, defaultdict
import datetime

import json
from pprint import pprint

In [3]:
import spacy
from spacy import displacy
nlp = spacy.load("en_core_web_sm")


In [4]:
import functools
def conjunction(*conditions):
    """Combine all ``conditions`` with an element-wise logical AND.

    The conditions are folded from left to right with ``np.logical_and``.
    A single condition is returned unchanged; calling with no condition
    raises ``TypeError`` (nothing to reduce).
    """
    def both(left, right):
        return np.logical_and(left, right)

    return functools.reduce(both, conditions)

def disjunction(*conditions):
    """Combine all ``conditions`` with an element-wise logical OR.

    The conditions are folded from left to right with ``np.logical_or``.
    A single condition is returned unchanged; calling with no condition
    raises ``TypeError`` (nothing to reduce).
    """
    def either(left, right):
        return np.logical_or(left, right)

    return functools.reduce(either, conditions)

In [5]:
# Directory layout. root_dir is an absolute path on the author's machine;
# edit it to run the notebook elsewhere.
root_dir='/Users/aliceallafort/Google_Drive/Github/'
# Raw recipe datasets (the Epicurious JSON is read from here).
datasets_dir=root_dir+'RecipeNet/'
# Folder the cleaned ingredient list and the processed frames are read from / saved to.
save_dir=root_dir+'Miamiam/data_save/'

## Load Ingredients

In [6]:
# Made from unique elements - Cleaned in extract_ingredients.ipynb

with open(save_dir+'my_cleaned_ing.txt','r') as f:
    cleaned_ing =f.read().splitlines()
print(len(cleaned_ing))
print(cleaned_ing[:10])

3340
['salt', 'pepper', 'butter', 'garlic', 'sugar', 'flour', 'onion', 'olive oil', 'olive', 'egg']


## Epicurious

#### Load data

In [7]:
epi_json = datasets_dir+'epicurious/full_format_recipes.json'

In [8]:
df=pd.read_json(epi_json)

In [9]:
# Drop rows in which every field is missing, then derive helper columns.
df=df.dropna(how='all')
# Trim leading/trailing whitespace from the recipe titles.
df.title=df.title.apply(lambda t: t.strip())
# Length of the directions, ingredients and categories lists of each recipe.
df['num_dir'] = df.directions.apply(len)
df['num_ing'] = df.ingredients.apply(len)
df['num_cat'] = df.categories.apply(len)

# Lower-case every ingredient line in place.
df.ingredients=df.ingredients.apply(lambda l: [s.lower() for s in l])
# Lower-cased title with commas removed, split on whitespace into words.
df['title_words']=df.title.apply(lambda t: t.lower().replace(',','').split())

#### Load pre-processed

In [5]:
df=pd.read_json(save_dir+'epicurious_ing_cleaned.json')

###### tests

In [97]:
df.columns

Index(['directions', 'fat', 'date', 'categories', 'calories', 'desc',
       'protein', 'rating', 'title', 'ingredients', 'sodium', 'ing_cleaned'],
      dtype='object')

In [98]:
df.head()

Unnamed: 0,directions,fat,date,categories,calories,desc,protein,rating,title,ingredients,sodium,ing_cleaned
0,"[1. Place the stock, lentils, celery, carrot, ...",7.0,2006-09-01 04:00:00+00:00,"[Sandwich, Bean, Fruit, Tomato, turkey, Vegeta...",426.0,,30.0,2.5,"Lentil, Apple, and Turkey Wrap","[4 cups low-sodium vegetable or chicken stock,...",559.0,"[Freshly ground black pepper, Fuji apple, carr..."
1,[Combine first 9 ingredients in heavy medium s...,23.0,2004-08-20 04:00:00+00:00,"[Food Processor, Onion, Pork, Bake, Bastille D...",403.0,This uses the same ingredients found in boudin...,18.0,4.375,Boudin Blanc Terrine with Red Onion Confit,"[1 1/2 cups whipping cream, 2 medium onions, c...",1439.0,"[Bay leaves, Cracked peppercorns, French bread..."
2,[In a large heavy saucepan cook diced fennel a...,7.0,2004-08-20 04:00:00+00:00,"[Soup/Stew, Dairy, Potato, Vegetable, Fennel, ...",165.0,,6.0,3.75,Potato and Fennel Soup Hodge,"[1 fennel bulb (sometimes called anise), stalk...",165.0,"[chicken broth, fennel bulb stalks discarded b..."
3,[Heat oil in heavy large skillet over medium-h...,,2009-03-27 04:00:00+00:00,"[Fish, Olive, Tomato, Sauté, Low Fat, Low Cal,...",,The Sicilian-style tomato sauce has tons of Me...,,5.0,Mahi-Mahi in Tomato Olive Sauce,"[2 tablespoons extra-virgin olive oil, 1 cup c...",,[Country style white bread thick slices toaste...
4,[Preheat oven to 350°F. Lightly grease 8x8x2-i...,32.0,2004-08-20 04:00:00+00:00,"[Cheese, Dairy, Pasta, Vegetable, Side, Bake, ...",547.0,,20.0,3.125,Spinach Noodle Casserole,"[1 12-ounce package frozen spinach soufflé, th...",452.0,"[extra wide egg noodles cooked, frozen spinach..."


#### Clean ingredients

In [10]:
# Word lists consumed by pre_process, which deletes every whole-word match
# from an ingredient line.

# Measurement units; pre_process also removes each entry followed by 's'.
# NOTE: 'gram' is listed twice.
units = ['ounce','oz', 'cup', 'teaspoon', 'tablespoon', 'tbsp','tsp',
         'gram', 'quart','qt', 'lb', 'pound',
         'l','ml','g','gram', 'inch','inches','cm']

# Size / packaging words; pre_process also removes each entry followed by 's'.
descriptions= ['large', 'medium','small','quarter',
              'pkg','package','bottle','stick','cube','can','piece','pinch of',
               'bit', 'goodquality'  ]      

# Preparation words and stock phrases; removed exactly as written (no plural form).
preparations = ['chopped', 'shredded', 'diced', 'minced', 'crushed','peeled',
                'cored','seeded','squeezed','sliced', 'drained', 'grated','packed',
                'wellstirred','divided','added',
                'chilled','thawed','cut into','halved',
                'thinly', 'finely','fine','lightly' , 'coarse','coarsely','freshly','firmly',
               'plus more for seasoning','plus more for drizzling','room temperature','to taste',
               'available at specialty foods shops some supermarkets',
               'special equipment an instantread thermometer']

# Connective words; removed after the preparations, so the multi-word phrases
# above are matched before their individual words are stripped.
connective = ['a','the','or', 'and','plus','to','of','more','with','about','for','well']

# Not sure what to do with 'whole'

In [11]:
def remove_plurals(ing):
    """Return ``ing`` with every plural noun replaced by its lemma.

    The string is parsed with the module-level spaCy pipeline ``nlp``. Tokens
    tagged ``NNS`` are replaced by ``token.lemma_``; every other token is kept
    verbatim, together with the whitespace that followed it.

    The result is rebuilt token by token instead of running ``re.sub`` with
    the token text as the pattern. That way only the tagged token changes:
    the plural is not rewritten where it merely occurs as a substring of a
    longer word, and the token text is never interpreted as a regular
    expression.
    """
    doc = nlp(ing)
    pieces = []
    for token in doc:
        if token.tag_ == 'NNS':
            # Swap the plural for its lemma, keeping the trailing whitespace.
            pieces.append(token.lemma_ + token.whitespace_)
        else:
            pieces.append(token.text_with_ws)
    return ''.join(pieces)


def pre_process(s):
    """Reduce a raw ingredient line to a bare ingredient name.

    Steps, in order:
      1. delete unit and description words (as written, or with a trailing 's');
      2. delete preparation and connective words;
      3. turn hyphens into spaces, drop parenthesised text, drop every
         character that is not an ASCII letter or whitespace, and collapse
         whitespace runs to a single space;
      4. singularise plural nouns with ``remove_plurals``;
      5. strip leading and trailing whitespace.
    """
    # Whole-word patterns built from the module-level word lists.
    word_patterns = [r'\b{}\b|\b{}s\b'.format(word, word) for word in units+descriptions]
    word_patterns += [r'\b{}\b'.format(word) for word in preparations+connective]
    for pattern in word_patterns:
        s = re.sub(pattern, '', s)

    # Punctuation and whitespace normalisation; applied in this exact order.
    normalisations = (
        ('-', ' '),
        (r'\(.*?\)', ''),
        (r'[^A-Za-z\s]', ''),
        (r'\s+', r' '),
    )
    for pattern, replacement in normalisations:
        s = re.sub(pattern, replacement, s)

    return remove_plurals(s).strip()

In [12]:
first_time = datetime.datetime.now()

df['ing_cleaned']=df.ingredients.apply(lambda list_ing: sorted([pre_process(s) for s in list_ing]))        
        
later_time = datetime.datetime.now()
later_time - first_time


datetime.timedelta(seconds=1416, microseconds=790598)

In [13]:
df['ing_cleaned_all']=df.ing_cleaned.apply(lambda l: " ".join(l))

In [51]:
df['recipe_id'] = list(df.index)
df.head()

Unnamed: 0,directions,fat,date,categories,calories,desc,protein,rating,title,ingredients,...,num_cat,title_words,ing_cleaned,ing_cleaned_all,ing_index,ing_assoc,ing_assoc_index,ing_assoc_list,ing_assoc_index_list,recipe_id
0,"[1. Place the stock, lentils, celery, carrot, ...",7.0,2006-09-01 04:00:00,"[Sandwich, Bean, Fruit, Tomato, turkey, Vegeta...",426.0,,30.0,2.5,"Lentil, Apple, and Turkey Wrap","[4 cups low-sodium vegetable or chicken stock,...",...,11,"[lentil, apple, and, turkey, wrap]","[carrot, dried brown lentil, dried french gree...",carrot dried brown lentil dried french green l...,"[23, 5199, 2504, 11, 1230, 13, 2505, 9, 84, 52...","[[carrot], [brown lentil], [green lentil], [ol...","[[45], [2337], [2352], [7, 8, 53, 69, 70], [13...","[apple, bibb lettuce, black pepper, breast, br...","[0, 1, 5, 7, 8, 10, 14, 15, 16, 19, 20, 32, 45...",0
1,[Combine first 9 ingredients in heavy medium s...,23.0,2004-08-20 04:00:00,"[Food Processor, Onion, Pork, Bake, Bastille D...",403.0,This uses the same ingredients found in boudin...,18.0,4.375,Boudin Blanc Terrine with Red Onion Confit,"[1 1/2 cups whipping cream, 2 medium onions, c...",...,11,"[boudin, blanc, terrine, with, red, onion, con...","[all purpose flour, bay leaves, bay leaves, bu...",all purpose flour bay leaves bay leaves butter...,"[6, 85, 85, 15, 462, 10022, 210, 210, 189, 10,...","[[flour], [bay leaves, bay], [bay leaves, bay]...","[[5], [173, 184], [173, 184], [2], [15, 47, 20...","[baguette, bay, bay leaves, bread, broth, butt...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 13, 15, 17, 21,...",1
2,[In a large heavy saucepan cook diced fennel a...,7.0,2004-08-20 04:00:00,"[Soup/Stew, Dairy, Potato, Vegetable, Fennel, ...",165.0,,6.0,3.75,Potato and Fennel Soup Hodge,"[1 fennel bulb (sometimes called anise), stalk...",...,7,"[potato, and, fennel, soup, hodge]","[chicken broth, fennel bulb stalk discarded bu...",chicken broth fennel bulb stalk discarded bulb...,"[81, 10024, 42, 7, 100, 4]","[[chicken, chicken broth, broth], [garnish, fe...","[[15, 47, 201], [66, 256, 773, 2019], [11], [6...","[broth, bulb, butter, chicken, chicken broth, ...","[2, 6, 11, 15, 28, 33, 47, 66, 201, 256, 548, ...",2
3,[Heat oil in heavy large skillet over medium-h...,,2009-03-27 04:00:00,"[Fish, Olive, Tomato, Sauté, Low Fat, Low Cal,...",,The Sicilian-style tomato sauce has tons of Me...,,5.0,Mahi-Mahi in Tomato Olive Sauce,"[2 tablespoons extra-virgin olive oil, 1 cup c...",...,17,"[mahi-mahi, in, tomato, olive, sauce]","[anchovy paste, country style white bread thic...",anchovy paste country style white bread thick ...,"[476, 10025, 20, 11, 173, 10026, 1913, 7, 67, ...","[[paste, anchovy, anchovy paste], [bread, whit...","[[505, 1082, 1581], [51, 377], [62, 74, 138, 1...","[anchovy, anchovy paste, basil, bread, dry, dr...","[3, 6, 7, 8, 16, 43, 50, 51, 53, 54, 62, 69, 7...",3
4,[Preheat oven to 350°F. Lightly grease 8x8x2-i...,32.0,2004-08-20 04:00:00,"[Cheese, Dairy, Pasta, Vegetable, Side, Bake, ...",547.0,,20.0,3.125,Spinach Noodle Casserole,"[1 12-ounce package frozen spinach soufflé, th...",...,11,"[spinach, noodle, casserole]","[extra wide egg noodle cooked, frozen spinach ...",extra wide egg noodle cooked frozen spinach so...,"[10028, 5202, 86, 2177, 346, 38]","[[egg, noodle, egg noodle, wide egg noodle], [...","[[9, 147, 443, 1283], [80, 396], [60, 199], [5...","[cheddar, cheddar cheese, cheese, cream, egg, ...","[9, 12, 13, 35, 37, 42, 60, 80, 147, 199, 257,...",4


In [52]:
#df.to_csv(save_dir+'epicurious_ing_cleaned.csv')
df.to_json(save_dir+'epicurious_ing_cleaned.json')

###### tests

In [10]:
# Print each raw ingredient line of the first recipe next to its cleaned form.
# NOTE(review): clean_replace_units is not defined in the visible part of this
# notebook — presumably an earlier name of pre_process; confirm before re-running.
ingredients = df.ingredients.iloc[0]
for i in ingredients:
    print(i,'--->',clean_replace_units(i))

4 cups low-sodium vegetable or chicken stock ---> low sodium vegetable chicken stock
1 cup dried brown lentils ---> dried brown lentils
1/2 cup dried french green lentils ---> dried french green lentils
2 stalks celery, chopped ---> stalks celery
1 large carrot, peeled and chopped ---> carrot
1 sprig fresh thyme ---> sprig fresh thyme
1 teaspoon kosher salt ---> kosher salt
1 medium tomato, cored, seeded, and diced ---> tomato
1 small fuji apple, cored and diced ---> fuji apple
1 tablespoon freshly squeezed lemon juice ---> lemon juice
2 teaspoons extra-virgin olive oil ---> extra virgin olive oil
freshly ground black pepper to taste ---> ground black pepper
3 sheets whole-wheat lavash, cut in half crosswise, or 6 (12-inch) flour tortillas ---> sheets whole wheat lavash cut in half crosswise flour tortillas
3/4 pound turkey breast, thinly sliced ---> turkey breast
1/2 head bibb lettuce ---> head bibb lettuce


In [12]:
s='Freshly ounce ground black pepper to taste ounces l (andgd)'
s=re.sub(r'\bto taste\b','',s)
s=re.sub(r'\bounce\b|\bounces\b','',s)
s=re.sub(r'\bl\b',r'',s)
s=re.sub(r'\(.*?\)','',s)
s=re.sub(r'\s+',r' ',s)
s

'Freshly ground black pepper '

In [13]:
result = re.findall('fresh','Fresh',re.IGNORECASE)
result

['Fresh']

In [11]:
print(df.ingredients.iloc[0])
print(df.ing_cleaned.iloc[0])

['4 cups low-sodium vegetable or chicken stock', '1 cup dried brown lentils', '1/2 cup dried french green lentils', '2 stalks celery, chopped', '1 large carrot, peeled and chopped', '1 sprig fresh thyme', '1 teaspoon kosher salt', '1 medium tomato, cored, seeded, and diced', '1 small fuji apple, cored and diced', '1 tablespoon freshly squeezed lemon juice', '2 teaspoons extra-virgin olive oil', 'freshly ground black pepper to taste', '3 sheets whole-wheat lavash, cut in half crosswise, or 6 (12-inch) flour tortillas', '3/4 pound turkey breast, thinly sliced', '1/2 head bibb lettuce']
['carrot', 'dried brown lentils', 'dried french green lentils', 'extra virgin olive oil', 'fuji apple', 'ground black pepper', 'head bibb lettuce', 'kosher salt', 'lemon juice', 'low sodium vegetable chicken stock', 'sheets whole wheat lavash cut in half crosswise flour tortillas', 'sprig fresh thyme', 'stalks celery', 'tomato', 'turkey breast']


In [None]:
# NEED TO COME BACK TO THOSE PROBLEM CASES LATER

'half'
'all purpose':'allpurpose'
'egg yolk','egg white' plurals whole eggs
'quality'
'dash','dashes','beaten','crumbled','dusting','toasted','fine'
'scrubbed debearded','lightly beaten'
water, ice
orange peel, orange lemon

colors: red /green bell pepper / 
    
pressed  quartered lengthwise rinsed if desired
broth 
additional
 mixture is available at specialty foods stores some supermarkets 
    combination
    accompaniment
     cut crosswise into thin slices
sliced
quartered lengthwise
pitted
extra virgin olive oil
if fresh dried in same sentence pick dried
crumbled
 cooked according directions
canned

sentence with or


##### Stats

In [18]:
# CATEGORIES

# Flatten the per-recipe category lists into one list in a single pass.
# Summing a Series of lists concatenates them pairwise, re-copying the growing
# result for every recipe.
all_cat=[category for recipe_categories in df.categories for category in recipe_categories]

In [19]:
unique_cat = list(set(all_cat))
len(all_cat),len(unique_cat)

(244585, 674)

In [20]:
c_cat = Counter(all_cat)

In [21]:
c_cat.most_common(50)

[('Bon Appétit', 9355),
 ('Peanut Free', 8390),
 ('Soy Free', 8088),
 ('Tree Nut Free', 7044),
 ('Vegetarian', 6846),
 ('Gourmet', 6648),
 ('Kosher', 6175),
 ('Pescatarian', 6042),
 ('Quick & Easy', 5372),
 ('Wheat/Gluten-Free', 4906),
 ('Bake', 4413),
 ('Summer', 4151),
 ('Dessert', 3573),
 ('Dairy Free', 3206),
 ('Side', 3151),
 ('No Sugar Added', 3132),
 ('Winter', 3099),
 ('Fall', 3015),
 ('Dinner', 2705),
 ('Sugar Conscious', 2466),
 ('Healthy', 2351),
 ('Kidney Friendly', 2313),
 ('Onion', 2238),
 ('Tomato', 2140),
 ('Vegetable', 2087),
 ('Sauté', 2044),
 ('Milk/Cream', 1995),
 ('Fruit', 1958),
 ('Vegan', 1851),
 ('Kid-Friendly', 1791),
 ('Egg', 1768),
 ('Spring', 1715),
 ('Herb', 1681),
 ('Garlic', 1643),
 ('Salad', 1516),
 ('Dairy', 1496),
 ('Thanksgiving', 1458),
 ('Appetizer', 1372),
 ('Lunch', 1359),
 ('Cheese', 1355),
 ('Chicken', 1344),
 ('Roast', 1320),
 ('No-Cook', 1251),
 ('Soup/Stew', 1164),
 ('Cocktail Party', 1154),
 ('Ginger', 1146),
 ('Potato', 1128),
 ('Chill', 11

In [26]:
df[df.title.str.contains('Chicken',case=False)].sort_values('rating',ascending=False)

Unnamed: 0,directions,fat,date,categories,calories,desc,protein,rating,title,ingredients,sodium,num_dir,num_ing,num_cat,title_words,ing_cleaned
20121,[1. Preheat the oven to 400°F. Spray a baking ...,70.0,2014-03-21 04:00:00+00:00,"[Chicken, Poultry, Bake, Passover, Fennel, Sha...",1086.0,Kosher Status: Poultry,78.0,5.0,Crispy Salt and Pepper Chicken with Caramelize...,"[cooking spray, one 3 1/2-pound chicken, cut i...",1323.0,2,11,6,"[crispy, salt, and, pepper, chicken, with, car...","[cooking spray, fennel bulbs trimmed slices, f..."
10630,[Pat chicken dry and sprinkle with pepper and ...,59.0,2006-05-16 20:13:33+00:00,"[Chicken, Tomato, Quick & Easy, Prune, White W...",940.0,The simplicity and speed of this dish belie it...,65.0,5.0,Chicken with Tomatoes and Prunes,"[1 (3-lb) chicken, cut into 8 pieces, 1/4 teas...",1264.0,3,11,8,"[chicken, with, tomatoes, and, prunes]","[black pepper, chicken, cinnamon, dry white wi..."
3782,"[Mix onion, garlic, ginger, gochujang, soy sau...",,2016-03-01 21:26:00+00:00,"[Bon Appétit, Chicken, Dinner, Sesame Oil, Gin...",,Long-grain rice is not starchy enough to hold ...,,5.0,Gochujang-Braised Chicken and Crispy Rice,"[1 small onion, finely chopped, 8 garlic clove...",,5,19,9,"[gochujang-braised, chicken, and, crispy, rice]","[chicken drumsticks patted dry, cooked shortgr..."
9583,"[1. Marinate the chicken: In a medium bowl, st...",11.0,2013-07-02 04:00:00+00:00,"[Chicken, Stir-Fry, Dinner, Soy Sauce, Chile P...",259.0,This highly addictive stir-fried chicken conti...,30.0,5.0,Kung Pao Chicken,"[1 tablespoon soy sauce, 2 teaspoons chinese r...",371.0,4,17,9,"[kung, pao, chicken]","[boneless skinless chicken breasts thighs, chi..."
15322,"[1. The day before, using 4 cloves of the garl...",,2015-02-04 04:00:00+00:00,"[Slow Cooker, Chicken, Citrus, Olive, Poultry,...",,(Djej Emshmel),,5.0,Chicken with Lemons and Olives Emshmel,"[2 to 3 chickens, whole or quartered, with the...",,12,19,6,"[chicken, with, lemons, and, olives, emshmel]","[bowl, casserole cover, chickens whole quarter..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16776,[Rinse the chicken backs and necks well under ...,14.0,2013-03-19 04:00:00+00:00,"[Chicken, Celery, Fennel, Leek, Shallot, Simme...",153.0,Editor's Note: This recipe goes with Daniel Hu...,7.0,0.0,Chicken Stock,"[10 pounds chicken backs and necks, 15 pounds ...",35.0,1,10,7,"[chicken, stock]","[bay leaf, celery, celery root, chicken backs ..."
4912,"[In a sauté pan over moderate heat, toast the ...",,2012-07-30 04:00:00+00:00,"[Chicken, Rice, Fry, Poach, Dinner, Tomatillo,...",,This recipe is part of the Epicurious Online C...,,0.0,Green Pipiân Mole with Chicken,"[1 cup pumpkin seeds, 1 cup vegetable stock or...",,6,17,12,"[green, pipiân, mole, with, chicken]","[canola vegetable oil, clove garlic, cooked wh..."
6624,"[Brown the chicken pieces in the butter, turni...",,2004-08-20 04:00:00+00:00,"[Wine, Chicken, Poultry, Sauté, Hot Pepper, Wh...",,,,0.0,Mexican Chicken Sauté,"[3 1/2 to 4 pound chicken, quartered, 4-6 tabl...",,1,8,8,"[mexican, chicken, sauté]","[butter, chicken quartered, garlic clove, garn..."
560,"[Whisk chile, oil, lime juice, soy sauce, brow...",39.0,2013-04-05 04:00:00+00:00,[],508.0,Although it's reason alone to keep a rotisseri...,25.0,,Asian Chicken and Cabbage Salad,[1 red jalapeño or fresno chile with some seed...,538.0,1,16,0,"[asian, chicken, and, cabbage, salad]","[baby spinach, carrots, dryroasted peanuts, fi..."


In [36]:
# KEYWORDS MATCHING EXACTLY THE WORDS IN THE TITLE
keywords=['Chicken','Eggplant','Zucchini']

def test_keywords(word,list_):
    if word.lower() in list_: return True
    else: return False
    
df_sel = df[df.title_words.apply(lambda l: test_keywords(keywords[0],l))].copy()
len(df_sel)

1220

#### From categories

In [105]:
max_cat=max(df.num_cat)
df[df.num_cat==max(df.num_cat)].categories

19380    [Smoothie, Blender, Berry, Citrus, Fruit, Brea...
Name: categories, dtype: object

In [51]:
def cat_list_order(c,i):
    """Return ``c[i]`` when ``i`` is below ``len(c)``, otherwise an empty string."""
    return c[i] if i < len(c) else ''

In [53]:
df_test=df.copy()
for i in range(max(df_test.num_cat)):
    df_test['category_%i'%i]= df_test.categories.apply(lambda c: cat_list_order(c,i))
df_test

Unnamed: 0,directions,fat,date,categories,calories,desc,protein,rating,title,ingredients,...,category_27,category_28,category_29,category_30,category_31,category_32,category_33,category_34,category_35,category_36
0,"[1. Place the stock, lentils, celery, carrot, ...",7.0,2006-09-01 04:00:00+00:00,"[Sandwich, Bean, Fruit, Tomato, turkey, Vegeta...",426.0,,30.0,2.500,"Lentil, Apple, and Turkey Wrap","[4 cups low-sodium vegetable or chicken stock,...",...,,,,,,,,,,
1,[Combine first 9 ingredients in heavy medium s...,23.0,2004-08-20 04:00:00+00:00,"[Food Processor, Onion, Pork, Bake, Bastille D...",403.0,This uses the same ingredients found in boudin...,18.0,4.375,Boudin Blanc Terrine with Red Onion Confit,"[1 1/2 cups whipping cream, 2 medium onions, c...",...,,,,,,,,,,
2,[In a large heavy saucepan cook diced fennel a...,7.0,2004-08-20 04:00:00+00:00,"[Soup/Stew, Dairy, Potato, Vegetable, Fennel, ...",165.0,,6.0,3.750,Potato and Fennel Soup Hodge,"[1 fennel bulb (sometimes called anise), stalk...",...,,,,,,,,,,
3,[Heat oil in heavy large skillet over medium-h...,,2009-03-27 04:00:00+00:00,"[Fish, Olive, Tomato, Sauté, Low Fat, Low Cal,...",,The Sicilian-style tomato sauce has tons of Me...,,5.000,Mahi-Mahi in Tomato Olive Sauce,"[2 tablespoons extra-virgin olive oil, 1 cup c...",...,,,,,,,,,,
4,[Preheat oven to 350°F. Lightly grease 8x8x2-i...,32.0,2004-08-20 04:00:00+00:00,"[Cheese, Dairy, Pasta, Vegetable, Side, Bake, ...",547.0,,20.0,3.125,Spinach Noodle Casserole,"[1 12-ounce package frozen spinach soufflé, th...",...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20125,[Beat whites in a bowl with an electric mixer ...,2.0,2004-08-20 04:00:00+00:00,"[Mixer, Cheese, Egg, Fry, Cocktail Party, Parm...",28.0,,2.0,3.125,Parmesan Puffs,"[2 large egg whites, 3 oz parmigiano-reggiano,...",...,,,,,,,,,,
20126,[Bring broth to simmer in saucepan.Remove from...,28.0,2008-02-28 22:06:54+00:00,"[Side, Kid-Friendly, High Fiber, Dinner, Parme...",671.0,Cooking the artichokes with the rice infuses t...,22.0,4.375,Artichoke and Parmesan Risotto,"[5 1/2 cups (or more) low-salt chicken broth, ...",...,,,,,,,,,,
20127,"[Using a sharp knife, cut a shallow X in botto...",38.0,2005-10-21 18:21:20+00:00,"[Onion, Poultry, turkey, Vegetable, Bake, Kid-...",563.0,,31.0,4.375,Turkey Cream Puff Pie,"[1 small tomato, 1 small onion, finely chopped...",...,,,,,,,,,,
20128,[Heat 2 tablespoons oil in heavy medium skille...,24.0,2004-08-20 04:00:00+00:00,"[Milk/Cream, Citrus, Dairy, Fish, Garlic, Past...",631.0,"Sharon Hooykaas of Los Alamitos, California, w...",45.0,4.375,Snapper on Angel Hair with Citrus Cream,"[4 tablespoons olive oil, 4 shallots, thinly s...",...,,,,,,,,,,


In [120]:
## CATEGORIES

df_test=df.copy()
for cat in unique_cat:
    df_test['category_'+cat]=0

In [121]:
first_time = datetime.datetime.now()

# Set the 'category_<name>' column to 1 for every category of every recipe.
# .at reads just the 'categories' cell; df_test.loc[ind]['categories'] would
# first materialise the whole (very wide) row as a Series on each iteration.
for ind in df_test.index:
    for c in df_test.at[ind,'categories']:
        new_cat = 'category_'+c
        df_test.loc[ind,new_cat]=1
        
        
later_time = datetime.datetime.now()
later_time - first_time

datetime.timedelta(seconds=70, microseconds=783317)

#### Keyword search

In [6]:
# KEYWORDS MATCHING 
keywords=['Chicken','Eggplant','Zucchini']

In [70]:
# FROM FIND IN TITLE

def test_title(word,title):
    word=word.lower()
    title=title.lower()
    if title.find(word)>=0: return True
    else: return False

print('Recipies that have one of those ingredients')
for kw in keywords:
    df_sel = df[df.title.apply(lambda l: test_title(kw,l))]
    print(kw,len(df_sel))   

Recipies that have one of those ingredients
Chicken 1249
Eggplant 178
Zucchini 171


In [71]:
keywords=['Eggplant','Zucchini','Chicken']
df_sel=df.copy()
for kw in keywords:
    df_sel = df_sel[df_sel.title.apply(lambda l: test_title(kw,l))]
    print(kw,len(df_sel)) 
df_sel

Eggplant 178
Zucchini 4
Chicken 0


Unnamed: 0,directions,fat,date,categories,calories,desc,protein,rating,title,ingredients,sodium,num_dir,num_ing,num_cat,title_words,ing_cleaned,ing_cleaned_all


In [7]:
# FROM MATCHING STRINGS
def match_string(keywords,title,how='any'):
    """Test ``title`` against the regular expressions in ``keywords``.

    Every keyword is used as a case-insensitive ``re.search`` pattern.

    how='any': True as soon as one pattern matches.
    how='all': True when every pattern matches.
    Any other ``how``: no match is counted.

    In every mode an empty ``keywords`` yields True, because the number of
    counted matches (zero) equals ``len(keywords)``.
    """
    hits = (re.search(pattern, title, re.IGNORECASE) is not None for pattern in keywords)
    if how == 'any':
        # Stops at the first hit; without a hit only an empty keyword list passes.
        return any(hits) or len(keywords) == 0
    # Every pattern is searched, even when the mode does not count the hits.
    matched = sum(hits)
    if how != 'all':
        matched = 0
    return matched == len(keywords)
    

In [73]:
# TITLE ANY
keywords=['Chicken','Eggplant','Zucchini']
mask = df.title.apply(lambda t: match_string(keywords,t,'any'))
len(df[mask])

1586

In [74]:
# TITLE ALL
keywords=['Chicken','Eggplant','Zucchini']
mask = df.title.apply(lambda t: match_string(keywords,t,'all'))
len(df[mask])

0

In [19]:
# FROM MATCHING INGREDIENTS
keywords=['Chicken','Eggplant','Zucchini']
mask = df.ing_cleaned_all.apply(lambda t: match_string(keywords,t,'all'))
df[mask]

Unnamed: 0,directions,fat,date,categories,calories,desc,protein,rating,title,ingredients,...,num_ing,num_cat,title_words,ing_cleaned,ing_cleaned_all,ing_index,ing_assoc,ing_assoc_index,ing_assoc_list,ing_assoc_index_list
2983,[Prepare barbecue (high heat). Combine first 5...,,2004-08-20 04:00:00,"[Salad, Chicken, Poultry, Low Fat, Eggplant, A...",,,,5.0,Grilled Chicken Salad Provencal,"[1 tablespoon olive oil, 1 tablespoon canned u...",...,14,11,"[grilled, chicken, salad, provencal]","[additional balsamic vinegar red wine vinegar,...",additional balsamic vinegar red wine vinegar b...,"[13464, 547, 6619, 1756, 740, 13465, 4314, 162...","[[vinegar, wine, wine vinegar, red wine, red, ...","[[17, 74, 85, 95, 145, 166, 180], [17, 74, 85,...","[arugula, balsamic vinegar, basil, bell pepper...","[1, 3, 7, 8, 15, 17, 23, 46, 47, 52, 53, 54, 5..."
12592,[Put oven racks in upper and lower thirds of o...,19.0,2006-08-21 17:14:11,"[Herb, Olive, Onion, Tomato, Vegetable, Side, ...",311.0,The deep flavors in this lush and hearty dish ...,7.0,5.0,Mediterranean Eggplant and Barley Salad,"[1 1/2 pound eggplant, cut into 1/2-inch cubes...",...,21,17,"[mediterranean, eggplant, and, barley, salad]",[accompaniment ricotta salata cut crosswise in...,accompaniment ricotta salata cut crosswise int...,"[24874, 22, 109, 991, 286, 11, 110, 8, 72, 3, ...","[[ricotta, crosswise, accompaniment, ricotta s...","[[245, 339, 915, 1924], [1, 10], [58], [16, 26...","[accompaniment, barley, black olive, black pep...","[0, 1, 3, 4, 6, 7, 8, 10, 14, 15, 16, 19, 21, ..."
14697,[Combine first 7 ingredients in large glass ba...,,2004-08-20 04:00:00,"[Chicken, Poultry, Marinate, Eggplant, Bell Pe...",,"A marinade of balsamic vinegar, olive oil, hon...",,4.375,Grilled Chicken and Vegetables with Wild Rice,"[1 1/2 cups balsamic vinegar, 1/2 cup olive oi...",...,15,10,"[grilled, chicken, and, vegetables, with, wild...","[additional fresh oregano, additional fresh sa...",additional fresh oregano additional fresh sage...,"[27337, 9665, 52, 716, 1211, 93, 24, 29, 27338...","[[oregano, fresh, fresh oregano], [sage, fresh...","[[43, 1067, 2095], [294, 1067, 3070], [17, 166...","[allspice, balsamic vinegar, bell pepper, bone...","[1, 6, 7, 8, 15, 17, 43, 49, 53, 57, 72, 87, 8..."
16541,[1. Grease the slow cooker with cooking spray....,26.0,2013-03-01 04:00:00,"[Slow Cooker, Tomato, Vegetarian, Dinner, Goat...",550.0,"Parmigiano-Reggiano adds salty, nutty richness...",18.0,3.75,Slow-Cooked Ratatouille Over Goat Cheese Polenta,"[cooking spray, 2 large eggplants, peeled and ...",...,21,19,"[slow-cooked, ratatouille, over, goat, cheese,...","[all purpose flour, cooking spray, eggplant, f...",all purpose flour cooking spray eggplant fresh...,"[6, 1079, 286, 136, 846, 166, 33, 9864, 9864, ...","[[flour], [cooking spray], [eggplant], [basil,...","[[5], [163], [433], [54, 197, 1007, 1067, 2269...","[basil, basil leaves, bell pepper, black peppe...","[0, 1, 2, 3, 5, 6, 10, 12, 15, 16, 28, 52, 54,..."
18453,[Heat oil in heavy large pot or Dutch oven ove...,,2004-08-20 04:00:00,"[Cheese, Herb, Vegetable, Low Fat, Quick & Eas...",,Warm pita bread would make a delicious accompa...,,3.125,Couscous à la Greque,"[1 tablespoon olive oil, 1 red onion, chopped,...",...,14,11,"[couscous, à, la, greque]","[canned unsalted chicken broth, couscous, drie...",canned unsalted chicken broth couscous dried o...,"[740, 394, 219, 140, 1177, 72, 1396, 3, 5778, ...","[[chicken, chicken broth, broth], [couscous], ...","[[15, 47, 201], [628], [43, 128], [52, 174], [...","[artichoke, artichoke heart, bell pepper, brot...","[1, 3, 6, 7, 8, 12, 15, 16, 23, 43, 46, 47, 52..."
19589,"[Brush zucchini, eggplants, tomatoes, and pepp...",,2005-07-05 22:27:30,"[Tomato, Appetizer, Sauté, Low Fat, Vegetarian...",,"Head north of Sunset, and you will happen upon...",,3.75,Grilled Vegetable Flatbreads Stuffed with Zucc...,"[2 zucchini, cut into 1/3-inch slices, 2 eggpl...",...,14,19,"[grilled, vegetable, flatbreads, stuffed, with...","[beefsteak heirloom tomato slice, eggplant sli...",beefsteak heirloom tomato slice eggplant slice...,"[33201, 33202, 11, 33203, 50, 69, 33204, 2, 7,...","[[tomato, heirloom tomato], [eggplant], [olive...","[[16, 1914], [433], [7, 8, 53, 69, 70], [2301,...","[bean, bell pepper, broth, chicken, chicken br...","[1, 6, 7, 8, 14, 15, 16, 21, 31, 47, 52, 53, 5..."


In [20]:
print(list(df[mask].title))

['Grilled Chicken Salad Provencal', 'Mediterranean Eggplant and Barley Salad', 'Grilled Chicken and Vegetables with Wild Rice', 'Slow-Cooked Ratatouille Over Goat Cheese Polenta', 'Couscous à la Greque', 'Grilled Vegetable Flatbreads Stuffed with Zucchini, Eggplant, and Tomato']


In [10]:
for ind in df[mask].index[:1]:
    print(df.loc[ind,'title'])
    print('    You will also need: \n    '+'\n    '.join(df.loc[ind,'ing_cleaned'])+'\n')

Grilled Chicken Salad Provencal
    You will also need: 
    additional balsamic vinegar red wine vinegar
    balsamic vinegar red wine vinegar
    boneless skinless chicken breast trimmed
    bunch arugula
    canned unsalted chicken broth
    crookneck squash quartered lengthwise
    curly endive
    garlic clove pressed
    herbes de provence crumbled
    herbes de provence dried herb mixture is available at specialty food store some supermarket combination dried thyme basil savor fennel seed be used
    japanese eggplant quartered lengthwise
    olive oil
    red green bell pepper lengthwise
    zucchini quartered lengthwise



In [21]:
df[mask][['title']].to_dict('records')

[{'title': 'Grilled Chicken Salad Provencal'},
 {'title': 'Mediterranean Eggplant and Barley Salad'},
 {'title': 'Grilled Chicken and Vegetables with Wild Rice'},
 {'title': 'Slow-Cooked Ratatouille Over Goat Cheese Polenta'},
 {'title': 'Couscous à la Greque'},
 {'title': 'Grilled Vegetable Flatbreads Stuffed with Zucchini, Eggplant, and Tomato'}]

#### From cleaned ingredient list

In [15]:
# INGREDIENTS
# Flatten the per-recipe cleaned-ingredient lists into one list in a single
# pass. Summing a Series of lists concatenates them pairwise, re-copying the
# growing result for every recipe.
all_ing_cleaned = [ing for recipe_ings in df.ing_cleaned for ing in recipe_ings]

In [23]:
c_ingr = Counter(all_ing_cleaned)
len(c_ingr.most_common())

('salt', 5693)

In [24]:
unique_ingr = [c[0] for c in c_ingr.most_common()]

len(all_ing_cleaned),len(unique_ingr)

(199030, 33797)

In [128]:
# Frequency count of the entries of all_ing_list.
# NOTE(review): all_ing_list is not defined in the visible part of this notebook —
# presumably the concatenation of the raw df.ingredients lists; confirm before re-running.
c_ingr_all = Counter(all_ing_list)

In [138]:
c_ingr_all.most_common(10)

[('1/2 teaspoon salt', 1266),
 ('1/4 teaspoon salt', 963),
 ('2 tablespoons olive oil', 947),
 ('1 teaspoon salt', 859),
 ('1 tablespoon olive oil', 640),
 ('1/2 cup sugar', 589),
 ('2 large eggs', 568),
 ('2 tablespoons fresh lemon juice', 544),
 ('kosher salt', 536),
 ('1/4 cup olive oil', 470)]

#### Mapping unique ingredients to cleaned list

In [25]:
df_map=pd.DataFrame({'original_unique':unique_ingr,'original_ind':range(len(unique_ingr))})
df_cleaned=pd.DataFrame({'cleaned':cleaned_ing,'cleaned_ind':range(len(cleaned_ing))})


In [26]:
df_cleaned

Unnamed: 0,cleaned,cleaned_ind
0,salt,0
1,pepper,1
2,butter,2
3,garlic,3
4,sugar,4
...,...,...
3335,poblano chilie,3335
3336,crystal hot sauce,3336
3337,watercress leave,3337
3338,emeril essence,3338


In [108]:
#for i in range(len(cleaned_ing)): df_map['C'+str(i)]=0


In [56]:
df_map['assoc']=[ [] for _ in range(len(df_map.index)) ]
df_map['assoc_index']=[ [] for _ in range(len(df_map.index)) ]

first_time = datetime.datetime.now()


def map_ingredient(r,clean,ind_cleaned):
    """Record one cleaned ingredient on every ``df_map`` row it matches.

    Parameters
    ----------
    r : compiled regular expression for the cleaned ingredient.
    clean : the cleaned ingredient string, appended to the row's ``assoc`` list.
    ind_cleaned : index of the cleaned ingredient, appended to the row's
        ``assoc_index`` list.

    The lists stored in the module-level ``df_map`` are mutated in place;
    nothing is returned.
    """
    # Walk the three columns in lockstep rather than doing .loc lookups per
    # row: the list objects yielded here are the ones stored in df_map, so
    # appending to them updates the frame.
    rows = zip(df_map['original_unique'], df_map['assoc'], df_map['assoc_index'])
    for ing, assoc, assoc_index in rows:
        # search() stops at the first match; only the existence of a match matters.
        if r.search(ing):
            assoc.append(clean)
            assoc_index.append(ind_cleaned)


for ind_cleaned in df_cleaned.index:
    clean = df_cleaned.cleaned.loc[ind_cleaned]
    r = re.compile(r'\b{}\b'.format(clean),re.IGNORECASE)
    map_ingredient(r,clean,ind_cleaned)

        
later_time = datetime.datetime.now()
later_time - first_time



datetime.timedelta(seconds=2290, microseconds=514261)

In [57]:
df_map.head()

Unnamed: 0,original_unique,original_ind,assoc,assoc_index
0,salt,0,[salt],[0]
1,sugar,1,[sugar],[4]
2,olive oil,2,"[olive oil, olive, oil]","[7, 8, 53]"
3,garlic clove,3,"[garlic, garlic clove, clove]","[3, 23, 46]"
4,unsalted butter,4,"[butter, unsalted butter]","[2, 28]"


In [58]:
df_map.to_json(save_dir+'mapped_ing.json')

In [67]:
kw='zucchini'
df_map[df_map.original_unique.str.contains(kw)]

Unnamed: 0,original_unique,original_ind,assoc,assoc_index
188,zucchini,188,[zucchini],[134]
574,zucchini trimmed,574,[zucchini],[134]
1597,zucchini lengthwise,1597,[zucchini],[134]
2235,zucchini thick round,2235,[zucchini],[134]
2366,zucchini trimmed cut lengthwise into thick slice,2366,[zucchini],[134]
...,...,...,...,...
32872,trimmed baby zucchini,32872,[zucchini],[134]
32886,zucchini trimmed cut on sharp diagonal into th...,32886,[zucchini],[134]
33065,zucchini lengthwise cut crosswise into thick,33065,"[zucchini, crosswise]","[134, 339]"
33073,zucchini trimmed lengthwise crosswise,33073,"[zucchini, crosswise]","[134, 339]"


In [59]:
# Index labels of the ingredients that matched no cleaned ingredient.
# The labels come from df_map.index and `missing` is later passed to .loc, so
# the cell is read by label (.at) rather than by position (.iloc).
missing = [i for i in df_map.index if len(df_map.at[i,'assoc_index'])==0]
len(missing)

1676

In [62]:
df_map.loc[missing].head(20)

Unnamed: 0,original_unique,original_ind,assoc,assoc_index
5,water,5,[],[]
232,crme frache,232,[],[]
242,parsnip,242,[],[]
243,walnuts toasted,243,[],[]
255,old fashioned oats,255,[],[]
281,oranges,281,[],[]
313,jalapeo,313,[],[]
331,tabasco,331,[],[]
472,lukewarm water,472,[],[]
535,crumbled feta,535,[],[]


In [64]:
remove_plurals('walnuts toasted')

'walnut toasted'

#### Map cleaned and unique to the recipe df

In [93]:
df.head()

Unnamed: 0,directions,fat,date,categories,calories,desc,protein,rating,title,ingredients,sodium,num_dir,num_ing,num_cat,title_words,ing_cleaned,ing_cleaned_all
0,"[1. Place the stock, lentils, celery, carrot, ...",7.0,2006-09-01 04:00:00+00:00,"[Sandwich, Bean, Fruit, Tomato, turkey, Vegeta...",426.0,,30.0,2.5,"Lentil, Apple, and Turkey Wrap","[4 cups low-sodium vegetable or chicken stock,...",559.0,3,15,11,"[lentil, apple, and, turkey, wrap]","[carrot, dried brown lentil, dried french gree...",carrot dried brown lentil dried french green l...
1,[Combine first 9 ingredients in heavy medium s...,23.0,2004-08-20 04:00:00+00:00,"[Food Processor, Onion, Pork, Bake, Bastille D...",403.0,This uses the same ingredients found in boudin...,18.0,4.375,Boudin Blanc Terrine with Red Onion Confit,"[1 1/2 cups whipping cream, 2 medium onions, c...",1439.0,5,28,11,"[boudin, blanc, terrine, with, red, onion, con...","[all purpose flour, bay leaves, bay leaves, bu...",all purpose flour bay leaves bay leaves butter...
2,[In a large heavy saucepan cook diced fennel a...,7.0,2004-08-20 04:00:00+00:00,"[Soup/Stew, Dairy, Potato, Vegetable, Fennel, ...",165.0,,6.0,3.75,Potato and Fennel Soup Hodge,"[1 fennel bulb (sometimes called anise), stalk...",165.0,2,6,7,"[potato, and, fennel, soup, hodge]","[chicken broth, fennel bulb stalk discarded bu...",chicken broth fennel bulb stalk discarded bulb...
3,[Heat oil in heavy large skillet over medium-h...,,2009-03-27 04:00:00+00:00,"[Fish, Olive, Tomato, Sauté, Low Fat, Low Cal,...",,The Sicilian-style tomato sauce has tons of Me...,,5.0,Mahi-Mahi in Tomato Olive Sauce,"[2 tablespoons extra-virgin olive oil, 1 cup c...",,2,10,17,"[mahi-mahi, in, tomato, olive, sauce]","[anchovy paste, country style white bread thic...",anchovy paste country style white bread thick ...
4,[Preheat oven to 350°F. Lightly grease 8x8x2-i...,32.0,2004-08-20 04:00:00+00:00,"[Cheese, Dairy, Pasta, Vegetable, Side, Bake, ...",547.0,,20.0,3.125,Spinach Noodle Casserole,"[1 12-ounce package frozen spinach soufflé, th...",452.0,1,6,11,"[spinach, noodle, casserole]","[extra wide egg noodle cooked, frozen spinach ...",extra wide egg noodle cooked frozen spinach so...


In [145]:
df['ing_index']=''
df['ing_assoc']=''
df['ing_assoc_index']=''

#df = df.drop(columns=['ing_index','ing_assoc','ing_assoc_index'])

In [146]:
first_time = datetime.datetime.now()

# Build the ingredient -> (original_ind, assoc, assoc_index) lookup once.
# Filtering df_map with a boolean mask for every ingredient of every recipe
# rescans the whole mapping table each time and dominated this cell's runtime.
duplicated_names = set(df_map.original_unique[df_map.original_unique.duplicated(keep=False)])
first_rows = df_map.drop_duplicates('original_unique', keep='first')  # same row .iloc[0] used to pick
ing_lookup = dict(zip(
    first_rows.original_unique,
    zip(first_rows.original_ind, first_rows.assoc, first_rows.assoc_index),
))

for i in df.index:
    ing_index = []
    ing_assoc = []
    ing_assoc_index = []
    for ing in df.ing_cleaned.loc[i]:
        # Still report ambiguous mapping rows; the first one wins, as before.
        if ing in duplicated_names: print("not unique entry", ing)
        # An ingredient missing from df_map raises KeyError naming that ingredient.
        original_ind, assoc, assoc_index = ing_lookup[ing]
        ing_index.append(original_ind)
        ing_assoc.append(assoc)
        ing_assoc_index.append(assoc_index)

    # One list per recipe, written into the pre-created columns.
    df.at[i, 'ing_index'] = ing_index
    df.at[i, 'ing_assoc'] = ing_assoc
    df.at[i, 'ing_assoc_index'] = ing_assoc_index

later_time = datetime.datetime.now()
later_time - first_time

datetime.timedelta(seconds=818, microseconds=839997)

In [150]:
df.shape

(20111, 20)

In [147]:
df.tail()

Unnamed: 0,directions,fat,date,categories,calories,desc,protein,rating,title,ingredients,sodium,num_dir,num_ing,num_cat,title_words,ing_cleaned,ing_cleaned_all,ing_index,ing_assoc,ing_assoc_index
20125,[Beat whites in a bowl with an electric mixer ...,2.0,2004-08-20 04:00:00+00:00,"[Mixer, Cheese, Egg, Fry, Cocktail Party, Parm...",28.0,,2.0,3.125,Parmesan Puffs,"[2 large egg whites, 3 oz parmigiano-reggiano,...",64.0,2,3,8,"[parmesan, puffs]","[egg white, parmigiano reggiano, vegetable oil]",egg white parmigiano reggiano vegetable oil,"[62, 175, 12]","[[egg, egg white], [parmigiano], [vegetable oi...","[[9, 148], [1092], [24, 53]]"
20126,[Bring broth to simmer in saucepan.Remove from...,28.0,2008-02-28 22:06:54+00:00,"[Side, Kid-Friendly, High Fiber, Dinner, Parme...",671.0,Cooking the artichokes with the rice infuses t...,22.0,4.375,Artichoke and Parmesan Risotto,"[5 1/2 cups (or more) low-salt chicken broth, ...",583.0,2,8,16,"[artichoke, and, parmesan, risotto]","[arborio rice, baby artichoke trimmed, butter,...",arborio rice baby artichoke trimmed butter dry...,"[410, 9992, 15, 20, 11, 54, 7, 46]","[[rice, arborio rice], [artichoke, baby artich...","[[89, 771], [954, 2757], [2], [62, 74, 138, 16..."
20127,"[Using a sharp knife, cut a shallow X in botto...",38.0,2005-10-21 18:21:20+00:00,"[Onion, Poultry, turkey, Vegetable, Bake, Kid-...",563.0,,31.0,4.375,Turkey Cream Puff Pie,"[1 small tomato, 1 small onion, finely chopped...",652.0,6,17,15,"[turkey, cream, puff, pie]","[all purpose flour, all purpose flour sifted a...",all purpose flour all purpose flour sifted aft...,"[6, 33792, 112, 993, 4441, 1467, 110, 7, 175, ...","[[flour], [flour], [butter, unsalted butter], ...","[[5], [5], [2, 28], [168, 803], [34, 138, 272,..."
20128,[Heat 2 tablespoons oil in heavy medium skille...,24.0,2004-08-20 04:00:00+00:00,"[Milk/Cream, Citrus, Dairy, Fish, Garlic, Past...",631.0,"Sharon Hooykaas of Los Alamitos, California, w...",45.0,4.375,Snapper on Angel Hair with Citrus Cream,"[4 tablespoons olive oil, 4 shallots, thinly s...",517.0,5,18,13,"[snapper, on, angel, hair, with, citrus, cream]","[additional orange peel, all purpose flour, an...",additional orange peel all purpose flour angel...,"[33794, 6, 2175, 387, 64, 16, 75, 3, 33795, 21...","[[orange, orange peel], [flour], [pasta, angel...","[[50, 539], [5], [99, 1010], [1104], [54, 1007..."
20129,[Position rack in bottom third of oven and pre...,10.0,2004-08-20 04:00:00+00:00,"[Pork, Bake, Roast, Christmas, Ham, Winter, Bo...",560.0,"Although labeled fully cooked, the ham will st...",73.0,4.375,Baked Ham with Marmalade-Horseradish Glaze,"[1 18-pound fully cooked bone-in smoked ham, r...",3698.0,3,7,7,"[baked, ham, with, marmalade-horseradish, glaze]","[fresh orange juice, fully cooked bone in smok...",fresh orange juice fully cooked bone in smoked...,"[75, 8475, 56, 488, 33796, 5, 113]","[[orange, orange juice, fresh, fresh orange ju...","[[50, 93, 1067, 3154, 3155], [198, 781, 1590],..."


In [153]:
def flatten_unique(lofl):
    """Flatten a list of lists and return its distinct elements in sorted order.

    :param lofl: iterable of iterables (e.g. one list of names or ids per ingredient)
    :return: sorted list containing every distinct element found in the sub-lists
    """
    distinct = set()
    for sublist in lofl:
        distinct.update(sublist)
    return sorted(distinct)

In [154]:
# Collapse each recipe's per-ingredient lists into one sorted, de-duplicated
# list, stored next to the nested column under the suffix "_list".
for nested_col in ('ing_assoc', 'ing_assoc_index'):
    df[nested_col + '_list'] = df[nested_col].apply(flatten_unique)

In [155]:
df.head()

Unnamed: 0,directions,fat,date,categories,calories,desc,protein,rating,title,ingredients,...,num_ing,num_cat,title_words,ing_cleaned,ing_cleaned_all,ing_index,ing_assoc,ing_assoc_index,ing_assoc_list,ing_assoc_index_list
0,"[1. Place the stock, lentils, celery, carrot, ...",7.0,2006-09-01 04:00:00+00:00,"[Sandwich, Bean, Fruit, Tomato, turkey, Vegeta...",426.0,,30.0,2.5,"Lentil, Apple, and Turkey Wrap","[4 cups low-sodium vegetable or chicken stock,...",...,15,11,"[lentil, apple, and, turkey, wrap]","[carrot, dried brown lentil, dried french gree...",carrot dried brown lentil dried french green l...,"[23, 5199, 2504, 11, 1230, 13, 2505, 9, 84, 52...","[[carrot], [brown lentil], [green lentil], [ol...","[[45], [2337], [2352], [7, 8, 53, 69, 70], [13...","[apple, bibb lettuce, black pepper, breast, br...","[0, 1, 5, 7, 8, 10, 14, 15, 16, 19, 20, 32, 45..."
1,[Combine first 9 ingredients in heavy medium s...,23.0,2004-08-20 04:00:00+00:00,"[Food Processor, Onion, Pork, Bake, Bastille D...",403.0,This uses the same ingredients found in boudin...,18.0,4.375,Boudin Blanc Terrine with Red Onion Confit,"[1 1/2 cups whipping cream, 2 medium onions, c...",...,28,11,"[boudin, blanc, terrine, with, red, onion, con...","[all purpose flour, bay leaves, bay leaves, bu...",all purpose flour bay leaves bay leaves butter...,"[6, 85, 85, 15, 462, 10022, 210, 210, 189, 10,...","[[flour], [bay leaves, bay], [bay leaves, bay]...","[[5], [173, 184], [173, 184], [2], [15, 47, 20...","[baguette, bay, bay leaves, bread, broth, butt...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 13, 15, 17, 21,..."
2,[In a large heavy saucepan cook diced fennel a...,7.0,2004-08-20 04:00:00+00:00,"[Soup/Stew, Dairy, Potato, Vegetable, Fennel, ...",165.0,,6.0,3.75,Potato and Fennel Soup Hodge,"[1 fennel bulb (sometimes called anise), stalk...",...,6,7,"[potato, and, fennel, soup, hodge]","[chicken broth, fennel bulb stalk discarded bu...",chicken broth fennel bulb stalk discarded bulb...,"[81, 10024, 42, 7, 100, 4]","[[chicken, chicken broth, broth], [garnish, fe...","[[15, 47, 201], [66, 256, 773, 2019], [11], [6...","[broth, bulb, butter, chicken, chicken broth, ...","[2, 6, 11, 15, 28, 33, 47, 66, 201, 256, 548, ..."
3,[Heat oil in heavy large skillet over medium-h...,,2009-03-27 04:00:00+00:00,"[Fish, Olive, Tomato, Sauté, Low Fat, Low Cal,...",,The Sicilian-style tomato sauce has tons of Me...,,5.0,Mahi-Mahi in Tomato Olive Sauce,"[2 tablespoons extra-virgin olive oil, 1 cup c...",...,10,17,"[mahi-mahi, in, tomato, olive, sauce]","[anchovy paste, country style white bread thic...",anchovy paste country style white bread thick ...,"[476, 10025, 20, 11, 173, 10026, 1913, 7, 67, ...","[[paste, anchovy, anchovy paste], [bread, whit...","[[505, 1082, 1581], [51, 377], [62, 74, 138, 1...","[anchovy, anchovy paste, basil, bread, dry, dr...","[3, 6, 7, 8, 16, 43, 50, 51, 53, 54, 62, 69, 7..."
4,[Preheat oven to 350°F. Lightly grease 8x8x2-i...,32.0,2004-08-20 04:00:00+00:00,"[Cheese, Dairy, Pasta, Vegetable, Side, Bake, ...",547.0,,20.0,3.125,Spinach Noodle Casserole,"[1 12-ounce package frozen spinach soufflé, th...",...,6,11,"[spinach, noodle, casserole]","[extra wide egg noodle cooked, frozen spinach ...",extra wide egg noodle cooked frozen spinach so...,"[10028, 5202, 86, 2177, 346, 38]","[[egg, noodle, egg noodle, wide egg noodle], [...","[[9, 147, 443, 1283], [80, 396], [60, 199], [5...","[cheddar, cheddar cheese, cheese, cream, egg, ...","[9, 12, 13, 35, 37, 42, 60, 80, 147, 199, 257,..."


In [96]:
# Persist the recipe frame as JSON.
# NOTE(review): this is the same path the "Load pre-processed" cell reads, so running it replaces that cached file.
df.to_json(save_dir+'epicurious_ing_cleaned.json')

Unnamed: 0,original_unique,original_ind,assoc,assoc_index
10028,extra wide egg noodle cooked,10028,"[egg, noodle, egg noodle, wide egg noodle]","[9, 147, 443, 1283]"


In [101]:
# Length of each row's assoc_index list, then show the mapping table with the
# longest lists first.
df_map['num_assoc'] = df_map['assoc_index'].map(len)
df_map.sort_values(by='num_assoc', ascending=False)

Unnamed: 0,original_unique,original_ind,assoc,assoc_index,num_assoc
10509,accompaniment avocado crema queso fresco icebe...,10509,"[pepper, onion, oregano, red pepper, pepper fl...","[1, 6, 43, 44, 100, 112, 116, 128, 145, 151, 1...",22
18854,ingredient info kaffir lime leave are leave ka...,18854,"[milk, lime juice, coconut, unsweetened, lime,...","[11, 75, 79, 98, 179, 226, 317, 599, 602, 617,...",19
33377,accompaniment fresh bean sprouts very onion fr...,33377,"[onion, cilantro, basil, bean, scallion, lime,...","[6, 38, 54, 68, 131, 179, 197, 222, 304, 317, ...",18
10283,in bowl put out pineapple mango watermelon kiw...,10283,"[lemon, orange, basil, syrup, pineapple, straw...","[14, 50, 54, 86, 115, 172, 179, 187, 222, 286,...",18
11247,accompaniment fried tortilla strips california...,11247,"[pepper, onion, oregano, red pepper, pepper fl...","[1, 6, 43, 44, 100, 112, 116, 128, 145, 146, 1...",18
...,...,...,...,...,...
13984,four bamboo skewer,13984,[],[],0
32366,ramp,32366,[],[],0
32364,head frise torn apart,32364,[],[],0
9664,vegenaise,9664,[],[],0


#### Mapping each clean ingredient to all the recipes that use it

In [181]:
# Create the two result columns up front, filled with empty strings; the loop
# in the next cell overwrites every cell with a list.
for new_col in ('recipes_index', 'recipes_titles'):
    df_cleaned[new_col] = ''

#df = df.drop(columns=['ing_index','ing_assoc','ing_assoc_index'])

In [182]:
first_time = datetime.datetime.now()

# Invert the recipe -> ingredient-id lists in a single pass over the recipes,
# instead of rescanning every recipe for every cleaned ingredient.
# Recipes are visited in df.index order, so each bucket keeps that order.
recipes_by_ing = defaultdict(list)
for rec, list_all_ing_assoc, title in zip(df.index, df.ing_assoc_index_list, df.title):
    # set(): a recipe is recorded once per ingredient id even if the id repeats.
    for ing_id in set(list_all_ing_assoc):
        recipes_by_ing[ing_id].append((rec, title))

for i in df_cleaned.index:
    # i is an index *label*, so use .loc; .iloc[i] was only correct while
    # df_cleaned happened to carry a default 0..n-1 index.
    hits = recipes_by_ing.get(df_cleaned.cleaned_ind.loc[i], [])
    df_cleaned.at[i, 'recipes_index'] = [hit[0] for hit in hits]
    df_cleaned.at[i, 'recipes_titles'] = [hit[1] for hit in hits]

later_time = datetime.datetime.now()
later_time - first_time

datetime.timedelta(seconds=2685, microseconds=738461)

In [184]:
df_cleaned.tail()

Unnamed: 0,cleaned,cleaned_ind,recipes_index,recipes_titles
3335,poblano chilie,3335,"[62, 471, 1778, 2262, 3402, 3757, 3769, 4711, ...",[Southwest Corn Bread Stuffing with Corn and G...
3336,crystal hot sauce,3336,[],[]
3337,watercress leave,3337,"[568, 3045, 4046, 8597, 8775, 8818, 10348, 133...","[Lentil Croquettes with Watercress and Kefir, ..."
3338,emeril essence,3338,[],[]
3339,corn flake cereal,3339,[],[]


#### Test

In [None]:
if clean=='salt' and re.findall('low salt|no salt|lowsalt',ing)!=[]: continue
            if clean!='broth' and re.findall('broth',ing)!=[]: continue
            

In [199]:
# For every entry of ref['sugar'], print each of the first 100 cleaned names
# (other than 'sugar' itself) that appears in it with whitespace on both sides.
for i in ref['sugar']:
    for clean in cleaned_filtered_top_3500[:100]:
        r = re.compile(r'.*\s{}\s'.format(clean), re.IGNORECASE)
        if clean != 'sugar' and r.match(i):
            print(clean, '-->', i)

green --> frozen sugar snap green peas
brown sugar --> granulated brown sugar each
cream --> nutmeg ice cream andbourbon burnt sugar sauce as accompaniments
sauce --> nutmeg ice cream andbourbon burnt sugar sauce as accompaniments
brown sugar --> golden brown sugar white sugar
cream --> peach brown sugar ice cream as an accompaniment if desired
brown sugar --> peach brown sugar ice cream as an accompaniment if desired
cream --> peach brown sugar ice cream vanilla ice cream as an accompaniment if desired
brown sugar --> peach brown sugar ice cream vanilla ice cream as an accompaniment if desired
vanilla --> peach brown sugar ice cream vanilla ice cream as an accompaniment if desired
beans --> green beans trimmed sugar snap peas
cream --> peach brown sugar ice cream vanilla cream as an accompaniment
brown sugar --> peach brown sugar ice cream vanilla cream as an accompaniment
vanilla --> peach brown sugar ice cream vanilla cream as an accompaniment
brown sugar --> oven dried brown sugar 

In [None]:
clean_to_remove=[]

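# TODO: filtering rules still to implement
# - drop 'pepper' when the ingredient contains 'bell'
# - drop 'chicken' when the ingredient contains 'broth'
# - drop 'salt' when the ingredient contains 'low salt', 'no salt' or 'lowsalt'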

In [187]:
'broth' in cleaned_filtered_top_3500

True

### Wrapper for app

In [14]:
def match_string(keywords, title, how='any'):
    """Tell whether ``title`` contains the given keywords.

    Each keyword is used as a regular-expression pattern and searched for
    case-insensitively with ``re.search``.

    :param keywords: iterable of patterns to look for
    :param title: text to search in
    :param how: ``'any'`` - at least one keyword must be found;
                ``'all'`` - every keyword must be found
    :return: True or False. With no keywords, ``'any'`` is False and ``'all'``
             is True (same convention as the built-in ``any``/``all``).
             An unrecognised ``how`` returns False.
    """
    hits = (re.search(pattern, title, re.IGNORECASE) is not None for pattern in keywords)
    if how == 'any':
        return any(hits)
    if how == 'all':
        return all(hits)
    # Unknown mode: previously this was False too, except that an empty keyword
    # list accidentally produced True.
    return False

In [10]:
def load_recipes():
    """Read ``epicurious_cuisine.json`` from ``save_dir`` into a DataFrame."""
    recipes_path = save_dir + 'epicurious_cuisine.json'
    return pd.read_json(recipes_path)


df = load_recipes()

In [20]:
cuisine_list = df.cuisine.unique()
print(cuisine_list)

['italian' 'french' 'southern_us' 'mexican' 'korean' 'thai' 'indian'
 'greek' 'british' 'moroccan' 'russian' 'japanese' 'spanish' 'irish'
 'chinese' 'cajun_creole' 'filipino' 'jamaican' 'vietnamese' 'brazilian']


In [30]:
def get_recipes(df,rec_id=None, kw=None, cuis=None):
    """
    Query the recipe dataframe by ingredient keywords or by recipe id.

    :param df: Recipe dataframe; needs an ``ing_cleaned_all`` column for keyword
               search and a ``recipe_id`` column for id lookup
    :param rec_id: Recipe id (int or numeric string) of a single recipe to be
                   displayed on its page. Takes precedence over ``kw`` when both are given.
    :param kw: Whitespace-separated ingredient keywords; a recipe is kept only
               if every keyword is found in its ``ing_cleaned_all`` text
    :param cuis: Optional list of selected cuisines; when non-empty the result
                 is ordered with ``sort_cuis``
    :return: DataFrame holding the selected recipes
    :raises ValueError: if neither ``rec_id`` nor ``kw`` is given
    """
    if rec_id is not None:
        mask = df.recipe_id == int(rec_id)
    elif kw is not None:
        # split() without an argument drops the empty strings that repeated
        # spaces used to produce (an empty pattern matches everything anyway).
        keywords = kw.split()
        mask = df.ing_cleaned_all.apply(lambda text: match_string(keywords, text, 'all'))
    else:
        # Previously this fell through to an UnboundLocalError on `mask`.
        raise ValueError("get_recipes needs either rec_id or kw")

    df_sel = df[mask]
    if cuis is not None and len(cuis) > 0:
        df_sel = sort_cuis(df_sel, cuis)
    return df_sel


# Columns shown when displaying query results in this notebook.
col = ['recipe_id', 'title', 'ing_cleaned', 'ingredients', 'directions','cuisine']

In [130]:
def sort_cuis(df,cuis_list):
    """Reorder ``df`` so that recipes of the selected cuisines come first.

    The frame must have a ``cuisine`` column plus one column named after each
    selected cuisine, which is used as a descending sort key
    (presumably a per-cuisine score - TODO confirm).

    Order of the result:
      1. recipes whose ``cuisine`` equals a selected cuisine, interleaved
         round-robin across ``cuis_list`` (best-ranked of each cuisine first);
      2. every remaining recipe, walking the per-cuisine rankings in parallel.

    :param df: recipe dataframe
    :param cuis_list: list of cuisine names, in priority order
    :return: ``df`` with its rows reordered; each index label appears once.
             ``df`` is returned unchanged when ``cuis_list`` is empty.
    """
    if len(cuis_list) == 0:
        # Nothing to prioritise (max() below would fail on an empty sequence).
        return df

    ind_list_match = {}
    ind_list = {}
    for cui in cuis_list:
        ind_list_match[cui] = list(df[df.cuisine == cui].sort_values(cui, ascending=False).index)
        ind_list[cui] = list(df.sort_values(cui, ascending=False).index)

    ind_sorted = []
    placed = set()  # constant-time membership instead of scanning ind_sorted

    def _place(label):
        # Append a row label unless it has already been emitted.
        if label not in placed:
            placed.add(label)
            ind_sorted.append(label)

    # Pass 1: round-robin over the recipes that belong to a selected cuisine.
    longest = max(len(labels) for labels in ind_list_match.values())
    for rank in range(longest):
        for cui in cuis_list:
            if rank < len(ind_list_match[cui]):  # this cuisine has run out of matches otherwise
                _place(ind_list_match[cui][rank])

    # Pass 2: everything else, following each cuisine's full ranking.
    for rank in range(len(df.index)):
        for cui in cuis_list:
            _place(ind_list[cui][rank])

    return df.loc[ind_sorted]

In [131]:
# Scratch cell from debugging the cuisine sort (appears unused afterwards - TODO confirm and delete).
# NOTE(review): within this part of the notebook df_test is first assigned in the following cell,
# so this line depends on out-of-order execution.
df_test[df_test.cuisine=='mexican']
cuis_list=['mexican','french']
cuis_list.remove('mexican')
d={'a':[3,5,4],'b': [3,6]}


In [138]:
# Smoke test of the app wrapper: keyword search first, then cuisine ordering as a separate step.
ingredients_kw='Salmon mushroom'
cuis_list=['french','italian','japanese']
df_test=get_recipes(df,kw=ingredients_kw)  # cuis left at its default, so get_recipes does no sorting here
df_sorted = sort_cuis(df_test,cuis_list)
df_sorted[col]  # col is the column list defined in the get_recipes cell

no index 1 match for cuisine italian
no index 2 match for cuisine italian
no index 2 match for cuisine japanese
no index 3 match for cuisine italian
no index 3 match for cuisine japanese


Unnamed: 0,recipe_id,title,ing_cleaned,ingredients,directions,cuisine
8108,8108,Salmon with Mushroom Orzo and Red Wine Sauce,"[bay leaves, canned beef broth, canned low sal...","[7 tablespoons olive oil, 1 large onion, slice...",[Heat 2 tablespoons oil in heavy large saucepa...,french
18669,18669,Rosemary-Rubbed Side of Salmon with Roasted Po...,[crimini mushroom quartered if if assorted sal...,[1 1/2 cups loosely packed fresh rosemary leav...,"[Blend rosemary, salt, and pepper in processor...",italian
13676,13676,Miso-Marinated Salmon with Citrus and Shiitakes,"[fresh cilantro leave, fresh lemon juice, fres...","[2 (3-lb) whole salmon fillets with skin, any ...",[Line a large shallow (1-inch-deep) baking pan...,japanese
2313,2313,Poached Wild Salmon With Peas and Morels,"[center cut wild salmon fillets, dry white win...",[2 6-8-ounce center-cut wild salmon fillets (e...,"[Place salmon, skin side down, in a large high...",french
3450,3450,"Wasabi Salmon with Bok Choy, Green Cabbage, an...","[baby bok choy, garlic clove, ginger, green ca...","[1/4 cup mayonnaise, 1 teaspoon wasabi paste (...",[Preheat oven to 450°F. Heat a large rimmed ba...,japanese
8453,8453,Barely Cooked Salmon with Parmesan Polenta and...,"[butter, button mushroom, canola oil, flat lea...","[1 pound button mushrooms, 10 cups water, fine...","[Place the button mushrooms in a pan, cover wi...",french
19016,19016,Salmon Chowder,"[button cremini mushroom, clove garlic, dry wh...","[2 tablespoons extra-virgin olive oil, 1 cup d...",[Heat the olive oil in a large pot over medium...,french
9595,9595,"Salmon ""Bulgogi"" with Bok Choy and Mushrooms","[asian sesame oil, available in asian food sec...","[2 large garlic cloves, peeled, divided, 1/3 c...",[Blend 1 garlic clove and next 7 ingredients i...,chinese
18131,18131,Linguine with Salmon and Mushrooms,"[caper, dry white wine, fresh dill dried dillw...","[1/4 cup olive oil, 1 12-ounce skinless salmon...",[Heat oil in heavy large skillet over medium-h...,russian
8094,8094,Sesame-Crusted Salmon,"[basmati long grain rice serve, bok choy quart...","[1/4 cup sesame seeds, 1 teaspoon dried red pe...",[Mix the sesame seeds and red pepper flakes on...,korean


In [136]:
df.loc[13881]

directions              [Bring a large saucepan of water to a boil, re...
fat                                                                  38.0
date                                           2016-05-02 16:32:13.507000
categories              [HarperCollins, Dinner, Seafood, Fish, Salmon,...
calories                                                            607.0
desc                    Ratatouille doesn’t have to take ages! Just ma...
protein                                                              49.0
rating                                                              3.125
title                   Quick Poached Salmon With Speedy Rat-Atat-A-To...
ingredients             [1 tablespoon coconut oil, 1 small red onion, ...
sodium                                                              205.0
num_dir                                                                 5
num_ing                                                                 8
num_cat                               

## Recipe Box

In [None]:
import random  # needed for the sample below; the notebook's import cell does not bring it in

files = ['recipes_raw_nosource_epi.json', 'recipes_raw_nosource_ar.json', 'recipes_raw_nosource_fn.json']
# epicurious, allrecipes, foodnetwork

# Each recipe is stored as a sorted tuple of its normalised ingredient strings,
# so identical ingredient lists collapse to one set entry.
recipe_box_recipes = set()
for filename in files:
    with open('recipe-box/' + filename, 'r') as f:
        rb_data_part = json.load(f)
    for r in rb_data_part.values():
        if len(r):  # skip empty records
            recipe_box_recipes.add(tuple(sorted(
                x.lower().replace('advertisement', '').strip() for x in r['ingredients']
            )))

# random.sample needs a sequence; passing a set is rejected by current Python versions.
pprint(random.sample(list(recipe_box_recipes), 1))

## 1 Million recipes

In [5]:
# Parse recipe1M/layer1.json straight from the open file handle.
with open(datasets_dir + 'recipe1M/layer1.json', 'r') as f:
    recipe1M_data = json.load(f)

In [6]:
pprint(recipe1M_data[2355])
len(recipe1M_data) ;

{'id': '0095abaed6',
 'ingredients': [{'text': '2 avocados, peeled and cut in chunks'},
                 {'text': '2 large beefsteak tomatoes or 4 medium tomatoes, '
                          'chopped'},
                 {'text': '12 English cucumber, peeled and chopped'},
                 {'text': '3 cloves garlic, peeled and minced'},
                 {'text': '1 -2 tablespoon Braggs liquid aminos (or to taste) '
                          'or 1 -2 tablespoon tamari (or to taste)'}],
 'instructions': [{'text': 'Mix all ingredients and let marinate for a bit.'},
                  {'text': 'Serve over salad, rice, or any other grain of your '
                           'liking.'}],
 'partition': 'train',
 'title': 'Tasty Avocado Salad',
 'url': 'http://www.food.com/recipe/tasty-avocado-salad-12253'}


In [None]:
for url in my_results_list:
        m = re.search('https?://www\.([A-Za-z_0-9.-]+).*', url)
        if m and m.group(1)=='epicurious.com': 
            urls[ind].append(url)

In [14]:
def get_domain(url):
    """Return the host part of an http(s) URL, or '' when none is found.

    :param url: URL string
    :return: the run of letters, digits, ``_``, ``.`` and ``-`` that follows
             ``http://`` or ``https://`` (e.g. ``'www.food.com'``), else ``''``
    """
    found = re.search('https?://([A-Za-z_0-9.-]+).*', url)
    return found.group(1) if found else ''
    
recipe1M_data[2355]['url'],get_domain(recipe1M_data[2355]['url'])

('http://www.food.com/recipe/tasty-avocado-salad-12253', 'www.food.com')

In [15]:
df = pd.DataFrame(recipe1M_data)

In [16]:
df['domain'] = df.url.apply(get_domain)

In [17]:
df.domain.value_counts()

www.food.com               507834
tastykitchen.com            75537
cookpad.com                 61438
cookeatshare.com            60628
www.foodnetwork.com         58066
www.kraftrecipes.com        50850
allrecipes.com              49006
www.epicurious.com          48697
recipeland.com              27332
www.foodandwine.com         18269
cooking.nytimes.com         17453
www.foodgeeks.com           10309
www.cookstr.com              9240
www.myrecipes.com            7133
www.chowhound.com            6361
online-cookbook.com          5763
www.vegetariantimes.com      4791
www.delish.com               4169
www.landolakes.com           2562
www.foodrepublic.com         2341
www.lovefood.com             1940
www.comidakraft.com             1
Name: domain, dtype: int64

In [25]:
# Print the first URL seen for each domain, in value_counts() order.
for dom in df.domain.value_counts().index:
    print(df.loc[df.domain == dom, 'url'].iloc[0])

http://www.food.com/recipe/crunchy-onion-potato-bake-479149
http://tastykitchen.com/recipes/breakfastbrunch/yogurt-parfaits/
https://cookpad.com/us/recipes/150100-kombu-tea-grilled-chicken-thigh
http://cookeatshare.com/recipes/dilly-macaroni-salad-49166
http://www.foodnetwork.com/recipes/gazpacho1.html
http://www.kraftrecipes.com/recipes/salmon-salad-a-la-sportz-59364.aspx
http://allrecipes.com/recipe/harrison-muesli/
http://www.epicurious.com/recipes/food/views/-world-s-best-mac-and-cheese-387747
https://recipeland.com/recipe/v/spinatsuppe-spinach-soup-36398
http://www.foodandwine.com/recipes/cornmeal-crackers-with-pumpkin-seeds
http://cooking.nytimes.com/recipes/10753
http://www.foodgeeks.com/recipes/431
http://www.cookstr.com/recipes/sweet-and-spicy-red-pepper-dip
http://www.myrecipes.com/recipe/apple-currant-bars
http://www.chowhound.com/recipes/pate-a-choux-shells-french-pastry-shells-30867
http://online-cookbook.com/goto/cook/rpage/000A09
http://www.vegetariantimes.com/recipe/tof

In [30]:
# Domains whose recipe URLs are kept.
dom_list = [
    'cooking.nytimes.com',
    'www.chowhound.com',
    'www.epicurious.com',
    'www.foodrepublic.com',
    'www.foodnetwork.com',
    'allrecipes.com',
]

# Gather the URLs domain by domain, so the result is grouped in dom_list order.
urls = []
for dom in dom_list:
    urls.extend(df.url[df.domain == dom].tolist())
print(len(urls))

181924


In [31]:
df_urls  = pd.DataFrame(urls) 
    
# saving the dataframe 
df_urls.to_csv(save_dir+'url_list.csv') 

In [32]:
df_urls.head()

Unnamed: 0,0
0,http://cooking.nytimes.com/recipes/10753
1,http://cooking.nytimes.com/recipes/1017290
2,http://cooking.nytimes.com/recipes/4338
3,http://cooking.nytimes.com/recipes/1015395
4,http://cooking.nytimes.com/recipes/1012955


## Getting urls to Epicurious recipe pages

### google search

In [196]:
query = "epicurious Gochujang-Braised Chicken and Crispy Rice"

In [200]:
my_results_list = []
for i in search(query,        # The query you want to run
                tld = 'com',  # The top level domain
                lang = 'en',  # The language
                num = 2,     # Number of results per page
                start = 0,    # First result to retrieve
                stop = 2,  # Last result to retrieve
                pause = 2.0,  # Lapse between HTTP requests
               ):
    my_results_list.append(i)
    print(i)

https://www.epicurious.com/recipes/food/views/gochujang-braised-chicken-and-crispy-rice
https://www.bonappetit.com/recipe/gochujang-braised-chicken-and-crispy-rice


In [82]:
url = 'https://www.epicurious.com/recipes/food/views/gochujang-braised-chicken-and-crispy-rice'
# Raw string: in a plain string literal `\.` is an invalid escape sequence
# (a warning today, an error in future Python versions). The pattern itself is unchanged.
m = re.search(r'https?://www\.([A-Za-z_0-9.-]+).*', url)
if m:
    # Group 1 is the host without its leading "www."
    print(m.group(1))

epicurious.com


In [95]:
from googlesearch import search

# Compiled once, as a raw string: in a plain string literal `\.` is an invalid
# escape sequence. Group 1 is the host without its leading "www."
epi_url_re = re.compile(r'https?://www\.([A-Za-z_0-9.-]+).*')

urls={}       # recipe index label -> epicurious.com result URLs
missing = []  # index labels for which the search returned no epicurious.com URL
for ind in df.index:
    urls[ind]=[]
    query = "epicurious %s"%df.title.loc[ind]
    # Only the first two results are requested for each title.
    my_results_list = search(query, tld = 'com', lang = 'en',
                             num = 2, start = 0, stop = 2, pause = 2.0)
    for url in my_results_list:
        m = epi_url_re.search(url)
        if m and m.group(1)=='epicurious.com':
            urls[ind].append(url)
    if not urls[ind]: missing.append(ind)
            

In [96]:
missing

[323,
 4980,
 7734,
 8210,
 8354,
 9375,
 9548,
 9858,
 10006,
 11920,
 12238,
 12607,
 13063,
 13130,
 13559,
 13942,
 14032,
 14090,
 14289,
 14293,
 14418,
 14911]

### Get image — no need to actually download the image for now; we just need to grab its link to display it

In [80]:
## Importing Necessary Modules
import requests # to get image from the web
import shutil # to save it locally

## Set up the image URL and filename
image_url = "https://assets.epicurious.com/photos/57191b1213c4b7d74c9d45e6/6:4/w_620%2Ch_413/Quick-Poached-Salmon-(c)-Maja-Smend.jpg"
filename = image_url.split("/")[-1]  # last path segment of the URL

# Open the url image with stream=True so the body is read as a stream.
# The context manager releases the connection on every path (a non-200
# response was previously left open), and the timeout keeps a stalled
# server from hanging the cell.
with requests.get(image_url, stream = True, timeout = 30) as r:
    # Check if the image was retrieved successfully
    if r.status_code == 200:
        # Set decode_content value to True, otherwise the downloaded image file's size will be zero.
        r.raw.decode_content = True

        # Open a local file with wb ( write binary ) permission.
        with open(filename,'wb') as f:
            shutil.copyfileobj(r.raw, f)

        print('Image sucessfully Downloaded: ',filename)
    else:
        print('Image Couldn\'t be retreived')

Image sucessfully Downloaded:  Quick-Poached-Salmon-(c)-Maja-Smend.jpg


In [84]:
pd.Series({'a':3,'b': np.NaN}).isnull()

a    False
b     True
dtype: bool

### Ingredient text preprocessing and TF-IDF + SVC pipeline

In [None]:
from nltk.stem import WordNetLemmatizer

In [None]:
def preprocess_df(df):
    """Return a copy of ``df`` with an ``ing_processed`` text column added.

    The ``id`` column is dropped when present. For every row, the strings in
    ``ingredients`` are lemmatized word by word, stripped of a few preparation
    words and of every non-letter character, joined with spaces and lower-cased.

    :param df: dataframe with an ``ingredients`` column holding lists of strings
    :return: new dataframe; the frame passed in is not modified
    """
    # Build the lemmatizer and the regexes once instead of once per token / per string.
    lemmatizer = WordNetLemmatizer()
    prep_words = re.compile(r'\(.*oz.\)|crushed|crumbles|ground|minced|powder|chopped|sliced')
    non_letters = re.compile("[^a-zA-Z]")

    def process_string(x):
        # x is one recipe's list of ingredient strings.
        cleaned = []
        for p in x:
            p = " ".join(lemmatizer.lemmatize(q) for q in p.split())  # Lemmatization
            # NOTE(review): this runs before lower(), so capitalised forms
            # such as "Chopped" are not removed.
            p = prep_words.sub('', p)
            p = non_letters.sub(" ", p)  # keep only a-z and A-Z
            cleaned.append(p)
        return " ".join(cleaned).lower()

    # drop() returns a new frame; errors='ignore' lets frames without an 'id' column through.
    df = df.drop(columns='id', errors='ignore')
    df['ing_processed'] = df['ingredients'].apply(process_string)

    return df

In [None]:
# Word-level TF-IDF features: unigrams only, English stop words removed, tokens matched by \w+.
# presumably scikit-learn's TfidfVectorizer, where max_df=.57 ignores terms present in more than 57% of documents - TODO confirm
tfidf = TfidfVectorizer(stop_words='english',
                             ngram_range = ( 1 , 1 ),analyzer="word", 
                             max_df = .57 , binary=False , 
                        token_pattern=r'\w+' , sublinear_tf=False)

# NOTE(review): C and gamma look like the result of a hyper-parameter search; record where they came from.
C = 604.5300203551828
gamma = 0.9656489284085462

# RBF-kernel classifier; probability=True is requested at construction.
clf = SVC(C=float(C), gamma=float(gamma), kernel='rbf',probability = True)

In [None]:
# Two-step pipeline: the TF-IDF vectorizer ('tfidf_vect') followed by the classifier ('SVC').
pipe = Pipeline([('tfidf_vect',tfidf),
                 ('SVC',clf)])