Being a little silly, I've decided to parse the Esquire recipes a different way. Since their ingredient strings already use a pretty common set of units (e.g. "oz."), I'll just use those unit tokens to split each string into an amount and an ingredient name.

In [15]:
import re
import nltk
import json
import copy
from nltk.stem.wordnet import WordNetLemmatizer
lemma = WordNetLemmatizer()

def ingred_processor(infile, ingredient_filter, **kwargs):
    """Run ``ingredient_filter`` over every ingredient of every recipe in a file.

    Args:
        infile: Path to a JSON file whose top level is a sequence of recipe
            objects.
        ingredient_filter: Callable invoked as
            ``ingredient_filter(ingredient, **kwargs)``. It returns the new
            JSON representation for the ingredient, or ``None`` if it decided
            to delete the ingredient.
        **kwargs: Extra keyword arguments forwarded to ``ingredient_filter``.

    Returns:
        A list of deep-copied recipes whose ``"ingredients"`` list holds the
        filter results. Recipes without an ``"ingredients"`` key are left out.
    """
    # JSON text is UTF-8; don't let the platform's locale encoding decide how
    # non-ASCII ingredient or recipe names are decoded.
    with open(infile, 'r', encoding='utf-8') as recipes:
        recipe_data = json.load(recipes)

    processed = []
    for r in recipe_data:
        # Skip anything that doesn't have ingredients
        if "ingredients" not in r:
            continue
        # Copy the whole recipe so the loaded data is never mutated and the
        # original key order is preserved in the output.
        r_clean = copy.deepcopy(r)
        filtered = (ingredient_filter(i, **kwargs) for i in r["ingredients"])
        # A None result is how a filter signals deletion of an ingredient.
        r_clean["ingredients"] = [i for i in filtered if i is not None]
        processed.append(r_clean)
    return processed

In [23]:
def ounce_splitter(i):
    """Split an ``"<amount> oz. <name>"`` ingredient string into a record.

    The amount (everything before the first ``"oz."`` token) may be a mixed
    fraction (``1 1/2``), a proper fraction (``3/4``) or anything ``float()``
    accepts. The whole amount text must parse; something like ``1/2 to 3/4``
    is treated as unparseable rather than being read as its leading fraction.

    Args:
        i: The ingredient, normally a string. Non-string values (for example
            ingredients that were already converted to dicts) pass through.

    Returns:
        ``{"ingred_amount": float, "ingred_unit": "ounce", "ingred_name": str}``
        when the string could be split, otherwise ``i`` unchanged.
    """
    if not isinstance(i, str):
        return i

    tok = i.split()
    try:
        idx = tok.index("oz.")
    except ValueError:
        # No ounce unit in this ingredient; leave it alone.
        return i

    amount_text = " ".join(tok[:idx])
    try:
        # Try for mixed fractions like 1 1/2
        frac = re.fullmatch(r"([0-9]+) ([0-9]+)/([0-9]+)", amount_text)
        if frac:
            amount = float(frac[1]) + float(frac[2]) / float(frac[3])
        else:
            # Try for proper fractions like 3/4
            frac = re.fullmatch(r"([0-9]+)/([0-9]+)", amount_text)
            if frac:
                amount = float(frac[1]) / float(frac[2])
            else:
                # Give up and just convert it to a float
                amount = float(amount_text)
    except (ValueError, ZeroDivisionError):
        # Unparseable amount (or a zero denominator): best effort, keep the
        # original string so a later pass or a human can deal with it.
        return i

    ingredient = " ".join(tok[idx + 1:])
    return {"ingred_amount": amount, "ingred_unit": "ounce", "ingred_name": ingredient}
        
# Pass 1: run the ounce splitter over the raw Esquire recipes and save the
# result as the cleaned data set.
data = ingred_processor("./data/raw/esquire.json", ounce_splitter)

with open('./data/cleaned/esquire.json', 'w') as outfile:
    # Serialise to a string first, then write it out in one go.
    outfile.write(json.dumps(data, indent=4))

In [30]:
def splitter(i, **kwargs):
    """Split an ``"<amount> <unit token> <name>"`` ingredient string.

    Generalised form of the ounce splitting: the unit token to look for and
    the unit name to record are supplied by the caller.

    The amount (everything before the first ``target`` token) may be a mixed
    fraction (``1 1/2``), a proper fraction (``3/4``) or anything ``float()``
    accepts. The whole amount text must parse; something like ``1/2 to 3/4``
    is treated as unparseable rather than being read as its leading fraction.

    Args:
        i: The ingredient, normally a string. Non-string values (for example
            ingredients that were already converted to dicts) pass through.
        **kwargs: Must contain ``target`` (the whitespace-delimited token that
            marks the unit in the string) and ``split`` (the value stored as
            ``"ingred_unit"``).

    Returns:
        ``{"ingred_amount": float, "ingred_unit": kwargs["split"],
        "ingred_name": str}`` when the string could be split, otherwise ``i``
        unchanged. ``i`` is also returned unchanged when ``target`` or
        ``split`` was not supplied.
    """
    if not isinstance(i, str):
        return i
    # Without both settings there is nothing to split on; historically this
    # was a silent no-op, so keep returning the ingredient untouched.
    if "target" not in kwargs or "split" not in kwargs:
        return i

    tok = i.split()
    try:
        idx = tok.index(kwargs["target"])
    except ValueError:
        # The unit token does not occur in this ingredient; leave it alone.
        return i

    amount_text = " ".join(tok[:idx])
    try:
        # Try for mixed fractions like 1 1/2
        frac = re.fullmatch(r"([0-9]+) ([0-9]+)/([0-9]+)", amount_text)
        if frac:
            amount = float(frac[1]) + float(frac[2]) / float(frac[3])
        else:
            # Try for proper fractions like 3/4
            frac = re.fullmatch(r"([0-9]+)/([0-9]+)", amount_text)
            if frac:
                amount = float(frac[1]) / float(frac[2])
            else:
                # Give up and just convert it to a float
                amount = float(amount_text)
    except (ValueError, ZeroDivisionError):
        # Unparseable amount (or a zero denominator): best effort, keep the
        # original string so a later pass or a human can deal with it.
        return i

    ingredient = " ".join(tok[idx + 1:])
    return {"ingred_amount": amount, "ingred_unit": kwargs["split"], "ingred_name": ingredient}
        
# Pass 2: re-read the cleaned file, split the "pinch" ingredients, and write
# the result back over the same cleaned file.
data = ingred_processor(
    "./data/cleaned/esquire.json",
    splitter,
    target="pinch",
    split="pinch",
)
with open('./data/cleaned/esquire.json', 'w') as outfile:
    # Serialise to a string first, then write it out in one go.
    outfile.write(json.dumps(data, indent=4))
