In [1]:
import pandas as pd

In [2]:
data_root = "../recipebox"

In [3]:
from glob import glob
datasets = glob(data_root + "/recipes*.json")
datasets = datasets[1], datasets[0], datasets[2]

In [4]:
stoppers = (",", ";") # stop reading the string here

measure_words = (
    # from https://en.wikipedia.org/wiki/Cooking_weights_and_measures#United_States_measures
    "scant",
    "drop",
    "smidgen",
    "pinch",
    "dash",
    "head",
    "piece",
    "splash",
    "squeeze",
    # from # from https://en.wikibooks.org/wiki/Cookbook:Units_of_measurement
    # fluid
    "teaspoon","t", "tsp",
    "tablespoon", "T", "tbl", "tbs", "tbsp", 
    "fluid ounce", "fl oz",
    "gill",
    "cup", "c",
    "pint", "p", "pt", "fl pt",
    "quart", "q", "qt", "fl qt",
    "gallon", "g", "gal",
    "ml", "milliliter", "millilitre", "cc",
    "l", "liter", "litre",
    "dl", "deciliter", "decilitre",
    "can",
    "bottle",
    # weight
    "pound", "lb" "#",
    "ounce", "oz",
    "mg", "milligram", "milligramme",
    "g", "gram", "gramme",
    "kg", "kilogram", "kilogramme",
    # length
    "mm", "millimeter", "millimetre",
    "cm", "centimeter", "centimetre",
    "m", "meter", "metre",
    "inch", "in",
    # size
    "large",
    "medium",
    "small",
    "about",
    # quantifiers
    "half",
    "single",
    "couple",
    "dozen",
    "many",
    "each",
    "some",
    "every",
    "pair",
    "additional",
    "approx",
    "more",
    "less",
    # misc
    "bag",
    "box",
    "bunch",
    "handful",
    "slice",
    "end",
    "accompaniment"
)


adjective_endings = ("ly",) # remove any word that ends with this
verb_endings = ("ed",) # e.g. packed, melted, chopped, chilled, divided, trimmed, grated
endings = verb_endings + adjective_endings
endings

to_remove = (
    "advertisement", 
    "of", "a", "the", "an",
    "to", "about", "from", "at", "by", "as", "above", "below", "abs",
    "recipe",
    "favorite", "choice", 
    "fat", "free", "taste", "garnish",
    "baby", "plump", "pure", "simple", "fresh", "torn", "good", "quality", "brand", "new", "dusting",
    "very",
    
)
cardinal_numbers = ("one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", "nineteen", "twenty")

In [5]:
import re

def lower(string):
    return string.lower()

def remove_after_stopper(string):
    # remove after any comma, colon, or period
    return re.sub(r"[,;\.].*", "", string)

def remove_between_paren(string):
    # \( and \) for parentheses characters
    return re.sub(r"\(.*\)", "", string)

def remove_measure_words(string):
    for m in measure_words:
        # greater than 1, don't want to apply to single letters. E.g. c for cups
        if len(m) > 1:
            # \b word boundary, %s the measure word, [a-zA-Z]* any number of following letters, \b word boundary
            string = re.sub(r"\b%s[a-zA-z]*\b" %m, "", string)
        else:
            string = re.sub(r"\b%s\b" %m, "", string)
    return string

def remove_nonalpha(string):
    # first replace dashes with spaces
    string = string.replace("-", " ")
    # ^ not, [a-zA-z] letter, \s space
    return re.sub(r"[^a-zA-Z\s]", "", string)

def remove_endings(string):
    for end in endings:
        string = re.sub(r"\b[\w]+%s\b" %end, "", string)
    return string

def remove_less_than(num):
    def remove_less_than_from_str(string):
        # remove words with less than num letters
        return re.sub(r"\b[\w]{%s}\b" % ",".join(str(i) for i in range(1,num+1)), "", string)
    return remove_less_than_from_str

def remove_dumb_words(string):
    for word in to_remove + cardinal_numbers:
        string = re.sub(r"\b%s\b" % word, "", string)
    return string

def remove_available_at(string):
    """ remove 'available at *** markets' """
    return re.sub(r"available at.*markets", "", string)

def remove_extra_spaces(string):
    return re.sub(r"\s+", " ", string)

def strip(string):
    return string.strip()

import functools
def compose(*functions):
    # we have to reverse to get in the desired order
    functions = reversed(functions)
    return functools.reduce(lambda f, g: lambda x: f(g(x)), functions, lambda x: x)

functions = (
    lower,
    remove_after_stopper, 
    remove_between_paren, 
    remove_measure_words, 
    remove_nonalpha, 
    remove_endings,
    remove_less_than(2),
    remove_dumb_words,
    remove_available_at,
    remove_extra_spaces, 
    strip
)
extract_ingredient = compose(*functions)

In [6]:
re.split("and|or", "tomatoes and potatos")

['tomatoes ', ' potatos']

In [10]:
from tqdm import tqdm_notebook as tqdm
from tqdm import trange
from collections import defaultdict
import numpy as np

ingredients = defaultdict(int)
ingredients_to_recipe_ids = defaultdict(set)

for ds in tqdm(datasets, desc="Dataset"):
    print("Loading dataset", ds)
    df = pd.read_json(ds)
    all_ingredient_lists = df.loc["ingredients"]
    for i in trange(len(all_ingredient_lists), desc="recipes"):
        try:
            recipe_id = all_ingredient_lists.index[i]
            ingredient_list = all_ingredient_lists[i]

            if type(ingredient_list) != list:
                continue
            
            
            for ingredient_string in ingredient_list:
                # SPLIT THE INGREDIENT STRING ON AND & OR & PLUS
                parts = re.split(r"\band\b|\bor\b|\bplus\b", ingredient_string)
                if len(parts) > 1:
                    ingredient_list += parts[1:] # add other parts later to continue iteration
                    ingredient_string = parts[0]
                
                ingredient = extract_ingredient(ingredient_string)
                if ingredient: # ignore empty string
                    ingredients[ingredient] += 1
                    ingredients_to_recipe_ids[ingredient].add(recipe_id)        
                    
        except Exception as e:
            print(f"Warning! failed on {str(ingredient_list)}")
            print(e)


HBox(children=(IntProgress(value=0, description='Dataset', max=3, style=ProgressStyle(description_width='initi…

Loading dataset ../recipebox/recipes_raw_nosource_fn.json


recipes: 100%|██████████| 60039/60039 [01:51<00:00, 539.06it/s]


Loading dataset ../recipebox/recipes_raw_nosource_epi.json


recipes: 100%|██████████| 25323/25323 [00:42<00:00, 595.90it/s]


Loading dataset ../recipebox/recipes_raw_nosource_ar.json


recipes: 100%|██████████| 39802/39802 [01:05<00:00, 605.90it/s]







In [15]:
ingredients_to_recipe_ids_df = pd.DataFrame.from_dict(ingredients_to_recipe_ids)

ValueError: arrays must all be same length

In [None]:
def order(my_dict):
    return sorted(my_dict.items(), key=lambda x:x[1], reverse=True)

In [None]:
len(ingredients)

In [None]:
order(ingredients)[:200]

In [None]:
names, counts = zip(*order(ingredients))
sorted(names[:200])

In [None]:
from difflib import SequenceMatcher

def similarity(a, b):
    return SequenceMatcher(None, a, b).ratio()


def combine_similar(my_dict, ratio=0.65, combine_reverse=True):
    """
    combines similar keys in the dictionary
    
    assumes the relevant similar parts are at the end of the key
    """
    new_dict = defaultdict(int)
    
    # we sort by the tail ends. puts tail ends next to each other. better similarity this way.
    # e.g. "awesome lemon zest" and "weird lemon zest" are typically far apart.
    # sorting from the ends puts them next to each other
    sorted_keys = sorted(my_dict, key=lambda x: x[::-1].replace(" ", "Ω") if combine_reverse else x.replace(" ", "Ω")) 
    prev = ""
    for k in sorted_keys:
        if similarity(prev, k) > ratio:
            new_dict[prev] = my_dict[prev] + my_dict[k]
        else:
            new_dict[k] = my_dict[k]
            prev = k
    return new_dict

In [None]:
combined_ingredients = combine_similar(ingredients)
len(combined_ingredients)

In [None]:
order(combined_ingredients)[:500]

In [None]:
names, counts = zip(*order(combined_ingredients))
sorted(names[:200])

In [None]:
# COMBINE SIMILAR AGAIN, but this time with higher ratio 0.9 to get rid of plurals
refiltered = order(combine_similar(dict(order(combined_ingredients)[:500]), ratio=0.9, combine_reverse=False))
refiltered

In [None]:
n, c = zip(*refiltered)
n = list(n)
n.remove("more")
n.remove("more for")
n.remove("more for dusting")
sorted(n)

In [None]:
re.split(r"\band\b|\bor\b|\bplus\b", "oregano and oranges andrew")

In [None]:
"Ω" < "s"