In [1]:
import pandas as pd

In [2]:
data_root = "recipebox"

In [3]:
from glob import glob
datasets = glob(data_root + "/recipes*.json")
datasets = datasets[1], datasets[0], datasets[2]

In [4]:
stoppers = (",", ";") # stop reading the string here

measure_words = (
    # from https://en.wikipedia.org/wiki/Cooking_weights_and_measures#United_States_measures
    "scant",
    "drop",
    "smidgen",
    "pinch",
    "dash",
    "head",
    "piece",
    "splash",
    "squeeze",
    # from # from https://en.wikibooks.org/wiki/Cookbook:Units_of_measurement
    # fluid
    "teaspoon","t", "tsp",
    "tablespoon", "T", "tbl", "tbs", "tbsp", 
    "fluid ounce", "fl oz",
    "gill",
    "cup", "c",
    "pint", "p", "pt", "fl pt",
    "quart", "q", "qt", "fl qt",
    "gallon", "g", "gal",
    "ml", "milliliter", "millilitre", "cc",
    "l", "liter", "litre",
    "dl", "deciliter", "decilitre",
    "can",
    "bottle",
    # weight
    "pound", "lb" "#",
    "ounce", "oz",
    "mg", "milligram", "milligramme",
    "g", "gram", "gramme",
    "kg", "kilogram", "kilogramme",
    # length
    "mm", "millimeter", "millimetre",
    "cm", "centimeter", "centimetre",
    "m", "meter", "metre",
    "inch", "in",
    # size
    "large",
    "medium",
    "small",
    "about",
    # quantifiers
    "half",
    "single",
    "couple",
    "dozen",
    "many",
    "each",
    "some",
    "every",
    "pair",
    "additional",
    "approx".
    "more",
    "less",
    # misc
    "bag",
    "box",
    "bunch",
    "handful",
    "slice",
    "end",
    "accompaniment"
)


adjective_endings = ("ly",) # remove any word that ends with this
verb_endings = ("ed",) # e.g. packed, melted, chopped, chilled, divided, trimmed, grated
endings = verb_endings + adjective_endings
endings

to_remove = (
    "advertisement", 
    "of", "a", "the", "an",
    "to", "about", "from", "at", "by", "as", "above", "below", "abs",
    "recipe",
    "favorite", "choice", 
    "fat", "free", "taste", "garnish",
    "baby", "plump", "pure", "simple", "fresh", "torn", "good", "quality", "brand", "new", "dusting",
    "very",
    
)
cardinal_numbers = ("one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", "nineteen", "twenty")

In [5]:
import re

def lower(string):
    return string.lower()

def remove_after_stopper(string):
    # remove after any comma, colon, or period
    return re.sub(r"[,;\.].*", "", string)

def remove_between_paren(string):
    # \( and \) for parentheses characters
    return re.sub(r"\(.*\)", "", string)

def remove_measure_words(string):
    for m in measure_words:
        # greater than 1, don't want to apply to single letters. E.g. c for cups
        if len(m) > 1:
            # \b word boundary, %s the measure word, [a-zA-Z]* any number of following letters, \b word boundary
            string = re.sub(r"\b%s[a-zA-z]*\b" %m, "", string)
        else:
            string = re.sub(r"\b%s\b" %m, "", string)
    return string

def remove_nonalpha(string):
    # first replace dashes with spaces
    string = string.replace("-", " ")
    # ^ not, [a-zA-z] letter, \s space
    return re.sub(r"[^a-zA-Z\s]", "", string)

def remove_endings(string):
    for end in endings:
        string = re.sub(r"\b[\w]+%s\b" %end, "", string)
    return string

def remove_less_than(num):
    def remove_less_than_from_str(string):
        # remove words with less than num letters
        return re.sub(r"\b[\w]{%s}\b" % ",".join(str(i) for i in range(1,num+1)), "", string)
    return remove_less_than_from_str

def remove_dumb_words(string):
    for word in to_remove + cardinal_numbers:
        string = re.sub(r"\b%s\b" % word, "", string)
    return string

def remove_available_at(string):
    """ remove 'available at *** markets' """
    return re.sub(r"available at.*markets", "", string)

def remove_extra_spaces(string):
    return re.sub(r"\s+", " ", string)

def strip(string):
    return string.strip()

import functools
def compose(*functions):
    # we have to reverse to get in the desired order
    functions = reversed(functions)
    return functools.reduce(lambda f, g: lambda x: f(g(x)), functions, lambda x: x)

functions = (
    lower,
    remove_after_stopper, 
    remove_between_paren, 
    remove_measure_words, 
    remove_nonalpha, 
    remove_endings,
    remove_less_than(2),
    remove_dumb_words,
    remove_available_at,
    remove_extra_spaces, 
    strip
)
extract_ingredient = compose(*functions)

In [6]:
re.split("and|or", "tomatoes and potatos")

['tomatoes ', ' potatos']

In [7]:
from tqdm import tqdm_notebook as tqdm
from collections import defaultdict
import numpy as np

ingredients = defaultdict(int)

for ds in tqdm(datasets, desc="Dataset"):
    print("Loading dataset", ds)
    df = pd.read_json(ds)
    all_ingredient_lists = df.loc["ingredients"]
    for ingredient_list in tqdm(all_ingredient_lists, desc="recipes"):
        try:
            if type(ingredient_list) != list:
                continue
            for ingredient_string in ingredient_list:
                
                # SPLIT THE INGREDIENT STRING ON AND AND OR
                parts = re.split(r"\band\b|\bor\b|\bplus\b", ingredient_string)
                if len(parts) > 1:
                    ingredient_list += parts[1:]
                    ingredient_string = parts[0]
                
                ingredient = extract_ingredient(ingredient_string)
                if ingredient: # ignore empty string
                    ingredients[ingredient] += 1
        except Exception as e:
            print(f"Warning! failed on {str(ingredient_list)}")
            print(e)


HBox(children=(IntProgress(value=0, description='Dataset', max=3, style=ProgressStyle(description_width='initi…

Loading dataset recipebox/recipes_raw_nosource_ar.json


HBox(children=(IntProgress(value=0, description='recipes', max=39802, style=ProgressStyle(description_width='i…

Loading dataset recipebox/recipes_raw_nosource_epi.json


HBox(children=(IntProgress(value=0, description='recipes', max=25323, style=ProgressStyle(description_width='i…

Loading dataset recipebox/recipes_raw_nosource_fn.json


HBox(children=(IntProgress(value=0, description='recipes', max=60039, style=ProgressStyle(description_width='i…




In [8]:
def order(my_dict):
    return sorted(my_dict.items(), key=lambda x:x[1], reverse=True)

In [9]:
len(ingredients)

74947

In [10]:
order(ingredients)[:200]

[('salt', 54826),
 ('butter', 37722),
 ('ground black pepper', 29544),
 ('sugar', 27418),
 ('olive oil', 23372),
 ('all purpose flour', 22828),
 ('kosher salt', 22679),
 ('onion', 22056),
 ('eggs', 17918),
 ('water', 17244),
 ('cloves garlic', 14603),
 ('vanilla extract', 13162),
 ('extra virgin olive oil', 13056),
 ('vegetable oil', 12092),
 ('milk', 11713),
 ('lemon juice', 11639),
 ('white sugar', 11355),
 ('pepper', 10548),
 ('ground cinnamon', 7994),
 ('baking powder', 7810),
 ('heavy cream', 7595),
 ('egg', 7554),
 ('garlic', 7469),
 ('brown sugar', 6789),
 ('garlic cloves', 6355),
 ('honey', 6259),
 ('baking soda', 6250),
 ('oil', 6173),
 ('parsley', 6010),
 ('black pepper', 5623),
 ('carrots', 5423),
 ('ginger', 5231),
 ('sour cream', 5198),
 ('more', 4896),
 ('mayonnaise', 4848),
 ('lime juice', 4837),
 ('ground cumin', 4768),
 ('ground pepper', 4704),
 ('onions', 4640),
 ('tomatoes', 4600),
 ('lemon', 4582),
 ('clove garlic', 4550),
 ('cilantro', 4423),
 ('confectioners sugar

In [11]:
names, counts = zip(*order(ingredients))
sorted(names[:200])

['all purpose flour',
 'almond extract',
 'almonds',
 'apple cider vinegar',
 'arugula',
 'avocado',
 'bacon',
 'baking powder',
 'baking soda',
 'balsamic vinegar',
 'basil',
 'basil leaves',
 'bay leaf',
 'bay leaves',
 'beans',
 'bell pepper',
 'bell peppers',
 'black beans',
 'black pepper',
 'blueberries',
 'boiling water',
 'boneless',
 'brandy',
 'bread crumbs',
 'brown sugar',
 'butter',
 'buttermilk',
 'cabbage',
 'capers',
 'carrot',
 'carrots',
 'cayenne',
 'cayenne pepper',
 'celery',
 'cheddar cheese',
 'cherry tomatoes',
 'chicken',
 'chicken broth',
 'chicken stock',
 'chili powder',
 'chives',
 'cider vinegar',
 'cilantro',
 'cilantro leaves',
 'cinnamon',
 'clove garlic',
 'cloves garlic',
 'coarse salt',
 'cocoa powder',
 'coconut',
 'coconut milk',
 'cold butter',
 'cold water',
 'confectioners sugar',
 'cooking spray',
 'cornstarch',
 'cranberries',
 'cream',
 'cream cheese',
 'cucumber',
 'curry powder',
 'cut',
 'dark brown sugar',
 'dijon mustard',
 'dill',
 'dry

In [12]:
from difflib import SequenceMatcher

def similarity(a, b):
    return SequenceMatcher(None, a, b).ratio()


def combine_similar(my_dict, ratio=0.65, combine_reverse=True):
    """
    combines similar keys in the dictionary
    
    assumes the relevant similar parts are at the end of the key
    """
    new_dict = defaultdict(int)
    
    # we sort by the tail ends. puts tail ends next to each other. better similarity this way.
    # e.g. "awesome lemon zest" and "weird lemon zest" are typically far apart.
    # sorting from the ends puts them next to each other
    sorted_keys = sorted(my_dict, key=lambda x: x[::-1].replace(" ", "Ω") if combine_reverse else x.replace(" ", "Ω")) 
    prev = ""
    for k in sorted_keys:
        if similarity(prev, k) > ratio:
            new_dict[prev] = my_dict[prev] + my_dict[k]
        else:
            new_dict[k] = my_dict[k]
            prev = k
    return new_dict

In [13]:
combined_ingredients = combine_similar(ingredients)
len(combined_ingredients)

37558

In [14]:
order(combined_ingredients)[:500]

[('salt', 54826),
 ('butter', 37722),
 ('bulgar', 27419),
 ('olive oil', 23372),
 ('kocher salt', 22680),
 ('onion', 22056),
 ('eggs', 17918),
 ('water', 17244),
 ('ounces garlic', 14604),
 ('vegetable oil', 12093),
 ('milk', 11713),
 ('chocolate sugar', 11357),
 ('pepper', 10548),
 ('sri lanka cinnamon', 7995),
 ('hot pastry cream', 7596),
 ('egg', 7554),
 ('garlic', 7470),
 ('brown sugar', 6791),
 ('garlic cloves', 6358),
 ('honey', 6259),
 ('baking soda', 6251),
 ('oil', 6174),
 ('pot barley', 6011),
 ('carrots', 5423),
 ('tomatoes', 5295),
 ('ginger', 5277),
 ('for cream', 5202),
 ('more', 4897),
 ('mayonnaise', 4850),
 ('chipotle juice', 4838),
 ('ground cumin', 4772),
 ('onions', 4640),
 ('emon', 4584),
 ('brunoise garlic', 4551),
 ('cilantro', 4423),
 ('confectioners sugar', 4307),
 ('oregano', 4249),
 ('paprika', 3940),
 ('thyme', 3930),
 ('bell pepper', 3875),
 ('parmesan cheese', 3859),
 ('gold metallic powder', 3684),
 ('starch', 3607),
 ('chicken stock', 3598),
 ('agar flak

In [15]:
names, counts = zip(*order(combined_ingredients))
sorted(names[:200])

['agar flakes',
 'allspice',
 'almonds',
 'apples',
 'apricot sauce',
 'asian style sea salt',
 'avocado',
 'bacon',
 'baking soda',
 'banana extract',
 'basil',
 'bay powder',
 'bean broth',
 'beans',
 'bell pepper',
 'betel leaves',
 'black peppers',
 'boneless',
 'brandy',
 'brown sugar',
 'brunoise garlic',
 'bulgar',
 'butter',
 'buttermilk',
 'cabbage',
 'cake flour',
 'capers',
 'caramel stick',
 'carrot',
 'carrots',
 'cayenne',
 'celery',
 'cheddar cheese',
 'chicken',
 'chicken stock',
 'chili butter',
 'chili powder',
 'chipotle juice',
 'chives',
 'chocolate sugar',
 'cilantro',
 'cinnamon',
 'coarse salt',
 'cocoa powder',
 'coconut',
 'confectioners sugar',
 'cooking water',
 'cornmeal',
 'cream',
 'cream tartar',
 'cucumber',
 'cumin seeds',
 'cut',
 'dark brown sugar',
 'dark rum',
 'dill',
 'dry sherry',
 'edam cheese',
 'egg',
 'egg whites',
 'eggs',
 'emon',
 'fenugreek seeds',
 'fish sauce',
 'flour',
 'flour tortillas',
 'for cream',
 'fregula',
 'garlic',
 'garlic

In [38]:
# COMBINE SIMILAR AGAIN, but this time with higher ratio 0.9 to get rid of plurals
refiltered = order(combine_similar(dict(order(combined_ingredients)[:500]), ratio=0.9, combine_reverse=False))
refiltered

[('salt', 54826),
 ('butter', 37722),
 ('bulgar', 27419),
 ('onion', 26696),
 ('olive oil', 23372),
 ('kocher salt', 22680),
 ('eggs', 17918),
 ('water', 17244),
 ('ounces garlic', 14604),
 ('vegetable oil', 12093),
 ('milk', 11713),
 ('chocolate sugar', 11357),
 ('pepper', 10548),
 ('garlic clove', 8595),
 ('carrot', 8070),
 ('sri lanka cinnamon', 7995),
 ('hot pastry cream', 7596),
 ('egg', 7554),
 ('garlic', 7470),
 ('brown sugar', 6791),
 ('honey', 6259),
 ('baking soda', 6251),
 ('oil', 6174),
 ('shallot', 6023),
 ('pot barley', 6011),
 ('tomatoes', 5295),
 ('ginger', 5277),
 ('for cream', 5202),
 ('more', 4897),
 ('mayonnaise', 4850),
 ('chipotle juice', 4838),
 ('ground cumin', 4772),
 ('emon', 4584),
 ('brunoise garlic', 4551),
 ('cilantro', 4423),
 ('confectioners sugar', 4307),
 ('oregano', 4249),
 ('scallion', 3977),
 ('paprika', 3940),
 ('thyme', 3930),
 ('bell pepper', 3875),
 ('parmesan cheese', 3859),
 ('gold metallic powder', 3684),
 ('starch', 3607),
 ('chicken stock',

In [39]:
n, c = zip(*refiltered)
n = list(n)
n.remove("more")
n.remove("more for")
n.remove("more for dusting")
sorted(n)

['agar flakes',
 'agave nectar',
 'allspice',
 'almonds',
 'anchovy fillets',
 'anko',
 'any seeds',
 'apple',
 'apple cider',
 'apple pie spice',
 'apple vinegar',
 'apricot sauce',
 'apricots',
 'asiago cheese',
 'asian style sea salt',
 'asparagus',
 'avocado',
 'bacon',
 'bag frozen chestnuts',
 'baguette',
 'baking potatoes',
 'baking soda',
 'banana',
 'banana extract',
 'bananas foster sauce',
 'barbecue sauce',
 'barley wine',
 'basil',
 'basmati rice',
 'bay powder',
 'bean broth',
 'beans',
 'beef',
 'beef broth',
 'beer',
 'beets',
 'bell pepper',
 'betel leaves',
 'bias celery',
 'bittersweet',
 'black olives',
 'black peppers',
 'blackberries',
 'bliss potato',
 'boboli',
 'bok choy',
 'boneless',
 'bourbon',
 'branches rosemary',
 'brandy',
 'broccoli florets',
 'broth like chicken',
 'brown rice',
 'brown sugar',
 'brunoise garlic',
 'brussels sprouts',
 'bulgar',
 'bunch cilantro',
 'buratta cheese',
 'butter',
 'buttermilk',
 'cabbage',
 'cake flour',
 'capers',
 'cara

In [35]:
re.split(r"\band\b|\bor\b|\bplus\b", "oregano and oranges andrew")

['oregano ', ' oranges andrew']

In [18]:
"Ω" < "s"

False