Diffords seems to measure mostly in milliliters, and it does not seem to bother specifying amounts in drops for some things.

In [2]:
import re
import nltk
import json
import copy
from nltk.stem.wordnet import WordNetLemmatizer
lemma = WordNetLemmatizer()

# Walks every recipe in a JSON file and passes each ingredient through a
# filter callable. The filter's return value replaces the ingredient; a
# return value of None means "delete this ingredient".
def ingred_processor(infile, ingredient_filter, **kwargs):
    """Apply ``ingredient_filter`` to each ingredient of each recipe in ``infile``.

    ``infile`` is opened for reading and parsed as JSON; the parsed value is
    iterated as a sequence of recipes. Recipes without an "ingredients" key
    are left out of the result. Every other recipe is deep-copied and its
    "ingredients" list is rebuilt from the filter's non-None return values,
    in their original order.

    Any extra keyword arguments are forwarded unchanged to
    ``ingredient_filter`` on every call.

    Returns the list of processed recipe copies.
    """
    with open(infile, 'r') as recipes:
        recipe_data = json.load(recipes)

    processed = []
    for recipe in recipe_data:
        # Recipes with no ingredient list are not carried over at all.
        if "ingredients" not in recipe.keys():
            continue
        cleaned = copy.deepcopy(recipe)
        filtered = [ingredient_filter(item, **kwargs) for item in recipe['ingredients']]
        # A None result is the filter's way of deleting an ingredient.
        cleaned["ingredients"] = [item for item in filtered if item is not None]
        processed.append(cleaned)
    return processed

In [None]:
# Amount formats recognised in front of the unit keyword. They are applied
# with fullmatch so that unexpected extra words in the amount text are not
# silently thrown away.
_MIXED_FRACTION = re.compile(r"([0-9]+) ([0-9]+)/([0-9]+)")
_PROPER_FRACTION = re.compile(r"([0-9]+)/([0-9]+)")


def _parse_amount(text):
    """Convert amount text such as "1 1/2", "3/4" or "2" to a float.

    Raises ValueError when the text is neither a fraction in one of the
    recognised forms nor something float() accepts, and ZeroDivisionError
    when a fraction has a zero denominator.
    """
    mixed = _MIXED_FRACTION.fullmatch(text)
    if mixed:
        # Mixed fraction like "1 1/2".
        return float(mixed[1]) + float(mixed[2]) / float(mixed[3])
    proper = _PROPER_FRACTION.fullmatch(text)
    if proper:
        # Proper fraction like "3/4".
        return float(proper[1]) / float(proper[2])
    # Give up on fractions and just convert it to a float.
    return float(text)


def splitter(i, **kwargs):
    """Split an ingredient string of the form "<amount> <target> <name>".

    Keyword arguments:
        target: the whitespace-delimited token to look for in the string.
        split:  the value stored as "ingred_unit" in the result.

    Everything before the first occurrence of ``target`` must be a number
    ("2"), a proper fraction ("3/4") or a mixed fraction ("1 1/2");
    everything after it becomes the ingredient name.

    Returns a dict with "ingred_amount" (float), "ingred_unit" and
    "ingred_name" on success. The ingredient is returned unchanged when it
    is not a string, when ``target`` or ``split`` was not supplied, when
    ``target`` does not occur in it, or when the amount text cannot be
    parsed as a whole (for example "1/2 thick"), so that it stays visible
    as an unprocessed string instead of losing words.
    """
    if not isinstance(i, str) or 'target' not in kwargs or 'split' not in kwargs:
        return i

    tok = i.split()
    try:
        # list.index raises ValueError when the target token is absent.
        idx = tok.index(kwargs['target'])
        amount = _parse_amount(" ".join(tok[:idx]))
    except (ValueError, ZeroDivisionError):
        # Best effort: leave anything we cannot parse exactly as it was.
        return i

    return {
        "ingred_amount": amount,
        "ingred_unit": kwargs['split'],
        "ingred_name": " ".join(tok[idx + 1:]),
    }
        
# Turn "<amount> slice <name>" ingredient strings into structured entries,
# then overwrite the same cleaned file with the processed recipes.
data = ingred_processor(
    "./data/cleaned/diffords.json",
    splitter,
    target="slice",
    split="slice",
)
with open('./data/cleaned/diffords.json', 'w') as outfile:
    outfile.write(json.dumps(data, indent=4))


In [58]:
# Collects the ingredients that are still plain strings, i.e. the ones no
# earlier pass has converted yet.
strs = set()
def str_finder(i, **kwargs):
    """Record ``i`` in the module-level ``strs`` set if it is a plain str.

    Always returns None; keyword arguments are accepted and ignored.
    """
    if type(i) is not str:
        return None
    strs.add(i)
    return None

# Run str_finder over the cleaned file purely for its side effect of filling
# `strs`; the processed recipe list it returns is discarded.
_ = ingred_processor("./data/cleaned/diffords.json", str_finder)

# Print each distinct ingredient string that was collected, one per line.
for i in strs:
    print(i)

2 Mint leaves
1 Sugar cube
Brut sparkling wine
1 Lime (fresh)
1/2 Lime (fresh) (chopped)
1 Passion fruit
1 Strawberries (hulled, small and ripe)
8 Raspberries
5 Mint leaves
1 Grapefruit peel/zest
10 Mint leaves
12 Basil leaves
6 Sage leaves
5 Raspberries
12 Mint leaves
4 Cucumber (fresh)
7 Mint leaves
3 Blackberries
3 Green cardamom pods
4 Mint leaves
1 1/2 Passion fruit
2 Strawberries (hulled, small and ripe)
4 Strawberries (hulled, small and ripe)
1 Lime zest (peel)
3 Cucumber (fresh)
7 Strawberries (hulled, small and ripe)
2 Sage leaves
3 Strawberries (hulled, small and ripe)
8 Mint leaves
3 Cherry tomato (fresh)
3 Raspberries
3 Blueberries (fresh)
1 Orange peel
3 Mint leaves
1/2 Banana
Soda (club soda) water
2 Raspberries
3 Basil leaves
6 Mint leaves
5 Green grapes (seedless)
2 red chili pepper (fine sliced)
3 Oregano (fresh)
4 Raspberries
1/2 Lime (fresh)
2 Blackberries
4 Cranberries (fresh)
1 Star anise
Tonic water
2 Fresh dill
1 Lemon peel
1/2 Passion fruit
6 Raspberries
1 Egg (