In [1]:
import pandas as pd

In [2]:
data_root = "recipebox"

In [3]:
from glob import glob
datasets = glob(data_root + "/recipes*.json")
datasets = datasets[1], datasets[0], datasets[2]

In [4]:
stoppers = (",", ";") # stop reading the string here

measure_words = (
    # from https://en.wikipedia.org/wiki/Cooking_weights_and_measures#United_States_measures
    "scant",
    "drop",
    "smidgen",
    "pinch",
    "dash",
    "head",
    "piece",
    "splash",
    "squeeze",
    # from # from https://en.wikibooks.org/wiki/Cookbook:Units_of_measurement
    # fluid
    "teaspoon","t", "tsp",
    "tablespoon", "T", "tbl", "tbs", "tbsp", 
    "fluid ounce", "fl oz",
    "gill",
    "cup", "c",
    "pint", "p", "pt", "fl pt",
    "quart", "q", "qt", "fl qt",
    "gallon", "g", "gal",
    "ml", "milliliter", "millilitre", "cc",
    "l", "liter", "litre",
    "dl", "deciliter", "decilitre",
    "can",
    "bottle",
    # weight
    "pound", "lb" "#",
    "ounce", "oz",
    "mg", "milligram", "milligramme",
    "g", "gram", "gramme",
    "kg", "kilogram", "kilogramme",
    # length
    "mm", "millimeter", "millimetre",
    "cm", "centimeter", "centimetre",
    "m", "meter", "metre",
    "inch", "in",
    # size
    "large",
    "medium",
    "small",
    "about",
    # quantifiers
    "half",
    "single",
    "couple",
    "dozen",
    "many",
    "each",
    "some",
    "every",
    "pair",
    "additional",
    "approx"
    # misc
    "bag",
    "box",
    "handful",
    "slice",
    "end",
    "accompaniment"
)


adjective_endings = ("ly",) # remove any word that ends with this
verb_endings = ("ed",) # e.g. packed, melted, chopped, chilled, divided, trimmed, grated
endings = verb_endings + adjective_endings
endings

to_remove = (
    "advertisement", 
    "of", "a", "the", "an",
    "to", "about", "from", "at", "by", "as", "above", "below", "abs",
    "recipe",
    "favorite", "choice", 
    "fat", "free", "taste", "garnish",
    "baby", "plump", "pure", "simple", "fresh", "torn", "good", "quality", "brand", "new"
    "very",
    
)
cardinal_numbers = ("one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", "nineteen", "twenty")

In [5]:
import re

def lower(string):
    return string.lower()

def remove_after_stopper(string):
    # remove after any comma, colon, or period
    return re.sub(r"[,;\.].*", "", string)

def remove_between_paren(string):
    # \( and \) for parentheses characters
    return re.sub(r"\(.*\)", "", string)

def remove_measure_words(string):
    for m in measure_words:
        # greater than 1, don't want to apply to single letters. E.g. c for cups
        if len(m) > 1:
            # \b word boundary, %s the measure word, [a-zA-Z]* any number of following letters, \b word boundary
            string = re.sub(r"\b%s[a-zA-z]*\b" %m, "", string)
        else:
            string = re.sub(r"\b%s\b" %m, "", string)
    return string

def remove_nonalpha(string):
    # first replace dashes with spaces
    string = string.replace("-", " ")
    # ^ not, [a-zA-z] letter, \s space
    return re.sub(r"[^a-zA-Z\s]", "", string)

def remove_endings(string):
    for end in endings:
        string = re.sub(r"\b[\w]+%s\b" %end, "", string)
    return string

def remove_less_than(num):
    def remove_less_than_from_str(string):
        # remove words with less than num letters
        return re.sub(r"\b[\w]{%s}\b" % ",".join(str(i) for i in range(1,num+1)), "", string)
    return remove_less_than_from_str

def remove_dumb_words(string):
    for word in to_remove + cardinal_numbers:
        string = re.sub(r"\b%s\b" % word, "", string)
    return string

def remove_available_at(string):
    """ remove 'available at *** markets' """
    return re.sub(r"available at.*markets", "", string)

def remove_extra_spaces(string):
    return re.sub(r"\s+", " ", string)

def strip(string):
    return string.strip()

import functools
def compose(*functions):
    # we have to reverse to get in the desired order
    functions = reversed(functions)
    return functools.reduce(lambda f, g: lambda x: f(g(x)), functions, lambda x: x)

functions = (
    lower,
    remove_after_stopper, 
    remove_between_paren, 
    remove_measure_words, 
    remove_nonalpha, 
    remove_endings,
    remove_less_than(2),
    remove_dumb_words,
    remove_available_at,
    remove_extra_spaces, 
    strip
)
extract_ingredient = compose(*functions)

In [6]:
re.split("and|or", "tomatoes and potatos")

['tomatoes ', ' potatos']

In [19]:
from tqdm import tqdm_notebook as tqdm
from collections import defaultdict
import numpy as np

ingredients = defaultdict(int)

for ds in tqdm(datasets, desc="Dataset"):
    print("Loading dataset", ds)
    df = pd.read_json(ds)
    all_ingredient_lists = df.loc["ingredients"]
    for ingredient_list in tqdm(all_ingredient_lists, desc="recipes"):
        try:
            if type(ingredient_list) != list:
                continue
            for ingredient_string in ingredient_list:
                
                # SPLIT THE INGREDIENT STRING ON AND AND OR
                parts = re.split(r"\band\b|\bor\b|\bplus\b", ingredient_string)
                if len(parts) > 1:
                    ingredient_list += parts[1:]
                    ingredient_string = parts[0]
                
                ingredient = extract_ingredient(ingredient_string)
                if ingredient: # ignore empty string
                    ingredients[ingredient] += 1
        except Exception as e:
            print(f"Warning! failed on {str(ingredient_list)}")
            print(e)


HBox(children=(IntProgress(value=0, description='Dataset', max=3, style=ProgressStyle(description_width='initi…

Loading dataset recipebox/recipes_raw_nosource_ar.json


HBox(children=(IntProgress(value=0, description='recipes', max=39802, style=ProgressStyle(description_width='i…

Loading dataset recipebox/recipes_raw_nosource_epi.json


KeyboardInterrupt: 

In [20]:
def order(my_dict):
    return sorted(my_dict.items(), key=lambda x:x[1], reverse=True)

In [21]:
len(ingredients)

15065

In [22]:
order(ingredients)[:200]

[('salt', 19393),
 ('butter', 12443),
 ('white sugar', 11128),
 ('all purpose flour', 8783),
 ('onion', 8140),
 ('ground black pepper', 7948),
 ('eggs', 6834),
 ('water', 6825),
 ('milk', 6125),
 ('olive oil', 5703),
 ('vanilla extract', 5648),
 ('cloves garlic', 4721),
 ('vegetable oil', 4089),
 ('brown sugar', 4004),
 ('ground cinnamon', 3877),
 ('pepper', 3218),
 ('baking powder', 3198),
 ('baking soda', 3034),
 ('egg', 3026),
 ('lemon juice', 2761),
 ('more', 2679),
 ('garlic powder', 2195),
 ('parmesan cheese', 2194),
 ('parsley', 2079),
 ('garlic', 1778),
 ('skinless', 1756),
 ('confectioners sugar', 1736),
 ('cheddar cheese', 1732),
 ('clove garlic', 1697),
 ('carrots', 1681),
 ('honey', 1658),
 ('sour cream', 1642),
 ('ground nutmeg', 1622),
 ('walnuts', 1615),
 ('mayonnaise', 1603),
 ('soy sauce', 1575),
 ('oregano', 1561),
 ('green bell pepper', 1510),
 ('potatoes', 1491),
 ('tomatoes', 1475),
 ('basil', 1462),
 ('ground cumin', 1409),
 ('cayenne pepper', 1403),
 ('chicken br

In [23]:
names, counts = zip(*order(ingredients))
sorted(names[:200])

['active dry yeast',
 'all purpose flour',
 'almond extract',
 'almonds',
 'apple',
 'apple cider vinegar',
 'apples',
 'applesauce',
 'avocado',
 'bacon',
 'baking powder',
 'baking soda',
 'balsamic vinegar',
 'banana',
 'bananas',
 'basil',
 'basil leaves',
 'bay leaf',
 'bay leaves',
 'beans',
 'beef broth',
 'bell pepper',
 'black beans',
 'black olives',
 'black pepper',
 'blueberries',
 'boiling water',
 'bread crumbs',
 'bread flour',
 'brown sugar',
 'butter',
 'buttermilk',
 'cabbage',
 'carrot',
 'carrots',
 'cayenne pepper',
 'celery',
 'cheddar cheese',
 'chicken',
 'chicken broth',
 'chicken stock',
 'chili powder',
 'chives',
 'cider vinegar',
 'cilantro',
 'clove garlic',
 'cloves garlic',
 'cocoa powder',
 'coconut',
 'coconut milk',
 'coconut oil',
 'cold water',
 'confectioners sugar',
 'container frozen topping',
 'container sour cream',
 'cooking spray',
 'cornstarch',
 'cranberries',
 'cream',
 'cream cheese',
 'cream tartar',
 'cucumber',
 'curry powder',
 'cut',

In [24]:
from difflib import SequenceMatcher

def similarity(a, b):
    return SequenceMatcher(None, a, b).ratio()


def combine_similar(my_dict, ratio=0.65, combine_reverse=True):
    """
    combines similar keys in the dictionary
    
    assumes the relevant similar parts are at the end of the key
    """
    new_dict = defaultdict(int)
    
    # we sort by the tail ends. puts tail ends next to each other. better similarity this way.
    # e.g. "awesome lemon zest" and "weird lemon zest" are typically far apart.
    # sorting from the ends puts them next to each other
    sorted_keys = sorted(my_dict, key=lambda x: x[::-1].replace(" ", "Ω") if combine_reverse else x.replace(" ", "Ω")) 
    prev = ""
    for k in sorted_keys:
        if similarity(prev, k) > ratio:
            new_dict[prev] = my_dict[prev] + my_dict[k]
        else:
            new_dict[k] = my_dict[k]
            prev = k
    return new_dict

In [25]:
combined_ingredients = combine_similar(ingredients)
len(combined_ingredients)

7807

In [26]:
order(combined_ingredients)[:500]

[('salt', 19993),
 ('butter', 12443),
 ('white sugar', 11129),
 ('gluten multi purpose flour', 8784),
 ('onion', 8140),
 ('ground black pepper', 7953),
 ('eggs', 6834),
 ('water', 6829),
 ('milk', 6125),
 ('anise oil', 5714),
 ('cloves garlic', 4734),
 ('vegetable oil', 4090),
 ('corn sugar', 4006),
 ('cinnamon', 3991),
 ('pepper', 3218),
 ('meringue powder', 3202),
 ('baking soda', 3034),
 ('egg', 3026),
 ('lemon juice', 2765),
 ('bell pepper', 2765),
 ('core', 2681),
 ('carob powder', 2206),
 ('parmesan cheese', 2195),
 ('parsley', 2080),
 ('garlic', 1790),
 ('skinless', 1758),
 ('confectioners sugar', 1742),
 ('tomatoes', 1742),
 ('cheddar cheese', 1733),
 ('carrots', 1681),
 ('honey', 1658),
 ('sour cream', 1645),
 ('ground nutmeg', 1625),
 ('walnuts', 1615),
 ('mayonnaise', 1604),
 ('oregano', 1561),
 ('potatoes', 1492),
 ('basil', 1462),
 ('cilantro', 1424),
 ('ground cumin', 1412),
 ('cayenne pepper', 1404),
 ('paprika', 1373),
 ('chicken broth', 1333),
 ('green onions', 1307),


In [27]:
names, counts = zip(*order(combined_ingredients))
sorted(names[:200])

['almonds',
 'anise oil',
 'apple',
 'apple pie spice',
 'apples',
 'avocado',
 'bacon',
 'baking soda',
 'balsamic vinegar',
 'banana',
 'banana extract',
 'bananas',
 'barberries',
 'basil',
 'bay leaf',
 'bay powder',
 'beans',
 'beef broth',
 'beer',
 'bell pepper',
 'berries',
 'black beans',
 'black olives',
 'black pepper',
 'blackberries',
 'boiling water',
 'broccoli',
 'brownie crust',
 'bulgar',
 'butter',
 'buttermilk',
 'cabbage',
 'carob powder',
 'carrot',
 'carrots',
 'cayenne pepper',
 'celery',
 'champagne vinegar',
 'cheddar cheese',
 'chicken',
 'chicken broth',
 'chicken stock',
 'chili powder',
 'chives',
 'chunks potatoes',
 'cider vinegar',
 'cilantro',
 'cinnamon',
 'cloves garlic',
 'cocoa powder',
 'coconut',
 'cold vinegar',
 'confectioners sugar',
 'container sour cream',
 'core',
 'corn sugar',
 'cornstarch',
 'cream',
 'cream tartar',
 'cucumber',
 'cut',
 'dijon mustard',
 'dill',
 'dole white',
 'dry bread crumbs',
 'dry yeast',
 'eagle milk',
 'edam ch

In [28]:
# COMBINE SIMILAR AGAIN, but this time with higher ratio 0.9 to get rid of plurals

sorted(order(combine_similar(dict(order(combined_ingredients)[:200]), ratio=0.9, combine_reverse=False)))

[('almonds', 804),
 ('anise oil', 5714),
 ('apple', 1285),
 ('apple pie spice', 233),
 ('avocado', 321),
 ('bacon', 1251),
 ('baking soda', 3034),
 ('balsamic vinegar', 621),
 ('banana', 728),
 ('banana extract', 456),
 ('barberries', 575),
 ('basil', 1462),
 ('bay leaf', 434),
 ('bay powder', 524),
 ('beans', 402),
 ('beef broth', 305),
 ('beer', 231),
 ('bell pepper', 2765),
 ('berries', 395),
 ('black beans', 412),
 ('black olives', 392),
 ('black pepper', 961),
 ('blackberries', 571),
 ('boiling water', 521),
 ('broccoli', 251),
 ('brownie crust', 231),
 ('bulgar', 836),
 ('butter', 12443),
 ('buttermilk', 609),
 ('cabbage', 514),
 ('carob powder', 2206),
 ('carrot', 2357),
 ('cayenne pepper', 1404),
 ('celery', 1019),
 ('champagne vinegar', 511),
 ('cheddar cheese', 1733),
 ('chicken', 306),
 ('chicken broth', 1333),
 ('chicken stock', 443),
 ('chili powder', 1071),
 ('chives', 316),
 ('chunks potatoes', 323),
 ('cider vinegar', 309),
 ('cilantro', 1424),
 ('cinnamon', 3991),
 ('c

In [29]:
re.split(r"\band\b|\bor\b|\bplus\b", "oregano and oranges andrew")

['oregano ', ' oranges andrew']

In [30]:
"Ω" < "s"

False