In [142]:
import numpy as np
import pandas as pd
import string
from string import *
import pymongo
from collections import Counter

In [17]:
mc = pymongo.MongoClient()
db = mc['allrecipes']
recipes_coll = db['recipes']
recipes = list(recipes_coll.find())

In [139]:
### CONSTANTS ###

units = ['pound', 'pounds', 'cup', 'cups', 'tablespoon', 'tablespoons', 'teaspoon', 'teaspoons',
         'clove', 'cloves', 'stalk', 'stalks', 'ounce', 'ounces', 'oz.', 'cubes', 'pint', 'pints',
         'quart', 'quarts', 'dash', 'dashs', 'dashes', 'rib', 'ribs', 'bunch', 'bunches']

phrases = [' - ',', or ', ' for garnish', 'cut ', 'such as', ' like ', 'e.g.', 'with', ' or ', 'see note']

stopwords = ['and', 'into', 'very', 'hot', 'cold', 'fresh', 'large', 'medium', 'small', 'halves', 'torn', 'bulk',
             'optional', 'fatfree', 'lowsodium', 'reducedsodium', 'reducedfat', 'ripe']

suffixes = ['ed','less','ly']

flag_words = ['can or bottle', 'can', 'cans', 'package', 'packages', 'jar', 'jars', 'container', 'containers', 'bag', 'bags',
              'bottle', 'bottles', 'envelope', 'envelopes', 'carton','cartons']
flag_words.sort(key=len)
flag_words.reverse()

### FUNCTIONS ###

def parse_ingredients(ingredients, units=units, flag_words=flag_words):
    '''
    Parses a list of ingredients into a list of dictionaries with the following format: 
        {'quantity': (float),
         'units': (str),
         'ingredient': (str)}
    Also takes argument 'units', a list of accepted units (e.g., ['cups', 'tablespoon']).
    If an ingredident does not specify a unit in this list, the label 'each' will be applied.
    '''
    ing_list = []
    for item in ingredients:
        item_dict = {}
        # Check item for flag words (require special parsing treatment)
        flag = False
        for word in item.split():
            if word in flag_words:
                flag = True
        if item.split()[1][0] == '(':
            flag = True  
        # Parse quantities and units        
        if flag:
            quantity, unit, remainder = _parse_special(item, flag_words)
            item_dict['quantity'] = quantity
            item_dict['units'] = unit if unit[-1] != 's' else unit[:-1]
        else:
            quantity, remainder = _determine_quantity(item) 
            item_dict['quantity'] = quantity
            if remainder.split()[0] in units:
                unit = remainder.split()[0]
                item_dict['units'] = unit if unit[-1] != 's' else unit[:-1]
                remainder = ' '.join(remainder.split()[1:])
            else:
                item_dict['units'] = 'each'
        # Remove preparation instructions from remaining text to isolate ingredient
        item_dict['ingredient'] = _remove_descriptors(remainder)
        # Add item dictionary to list
        ing_list.append(item_dict)
    return ing_list

def _determine_quantity(item):
    quantity = 0
    for i, elem in enumerate(item.split()):
        if elem[0] in string.digits:
            try:
                quantity += float(elem)
            except:
                numer, denom = elem.split('/')
                quantity += float(numer) / float(denom)
        else:
            idx = i
            break
    remainder = ' '.join(item.split()[idx:])
    return quantity, remainder

def _parse_special(item, flag_words):
    # Determine special word
    sp_word = ')'
    for word in flag_words:
        if len(item.split(word)) > 1:
            sp_word = ' ' + word + ' '
            break
    
    # Parse item 
    count_and_size = item.split(sp_word)[0]
    remainder = item.split(sp_word)[1]
    count, rest = _determine_quantity(count_and_size)
    if sp_word == ')':
        size, unit = _determine_quantity(rest[1:])
    else:
        size, unit = _determine_quantity(rest[1:-1])
    quantity = count * size
    return quantity, unit, remainder

def _remove_descriptors(item,
                        phrases=phrases,
                        stopwords=stopwords,
                        suffixes=suffixes):
    # Remove common/unnecessary ending phrases
    for phrase in phrases:
        if len(item.split(phrase)) > 1:
            item = item.split(phrase)[0]
    # Remove punctuation and stopwords
    words = []
    for elem in item.split():
        word = ''.join([letter for letter in elem.lower() if letter in string.ascii_lowercase])
        if word not in stopwords:
            words.append(word)
    # Remove adjectives and adverbs    
    for suffix in suffixes:
        for word in words.copy():
            try:
                if (word[-len(suffix):] == suffix) and word != 'red':
                    words.remove(word)
            except:
                continue    
    # Remove trailing spaces
    result = ' '.join([word for word in words if word])
    # Singularize (when not beans)...also, this code is asinine
    if result[-3:] == 'oes':
        result = result[:-2]
    if len(result) < 5:
        return result
    if result[-5:] == 'beans':
        return result
    else:
        return result if result[-1] != 's' else result[:-1]

In [130]:
recipe = recipes[5]
raw_ing = recipe['ingredients_raw']
raw_ing

['1 pound lean ground beef',
 '3 (15 ounce) cans dark red kidney beans',
 '3 (14.5 ounce) cans Mexican-style stewed tomatoes',
 '2 stalks celery, chopped',
 '1 red bell pepper, chopped',
 '1/4 cup red wine vinegar',
 '2 tablespoons chili powder',
 '1 teaspoon ground cumin',
 '1 teaspoon dried parsley',
 '1 teaspoon dried basil',
 '1 dash Worcestershire sauce',
 '1/2 cup red wine']

In [131]:
parse_ingredients(raw_ing)

[{'quantity': 1.0, 'units': 'pound', 'ingredient': 'lean ground beef'},
 {'quantity': 45.0, 'units': 'ounce', 'ingredient': 'dark red kidney beans'},
 {'quantity': 43.5, 'units': 'ounce', 'ingredient': 'mexicanstyle tomato'},
 {'quantity': 2.0, 'units': 'stalk', 'ingredient': 'celery'},
 {'quantity': 1.0, 'units': 'each', 'ingredient': 'red bell pepper'},
 {'quantity': 0.25, 'units': 'cup', 'ingredient': 'red wine vinegar'},
 {'quantity': 2.0, 'units': 'tablespoon', 'ingredient': 'chili powder'},
 {'quantity': 1.0, 'units': 'teaspoon', 'ingredient': 'ground cumin'},
 {'quantity': 1.0, 'units': 'teaspoon', 'ingredient': 'parsley'},
 {'quantity': 1.0, 'units': 'teaspoon', 'ingredient': 'basil'},
 {'quantity': 1.0, 'units': 'dash', 'ingredient': 'worcestershire sauce'},
 {'quantity': 0.5, 'units': 'cup', 'ingredient': 'red wine'}]

In [140]:
ingredients = Counter()
for recipe in recipes:
    ing_list = recipe['ingredients_raw']
    parsed = parse_ingredients(ing_list)
    ingredients = ingredients | set([item['ingredient'] for item in parsed])

In [141]:
len(ingredients)

1059

In [138]:
sorted(list(ingredients))

['',
 'acini di pepe pasta',
 'acorn squash',
 'adobo sauce from chipotle pepper',
 'adobo seasoning rojo adobo',
 'allpurpose flour',
 'almond',
 'alphabet pasta',
 'amber beer',
 'american cheese',
 'anaheim',
 'anaheim chile pepper',
 'anaheim pepper',
 'ancho chile',
 'ancho chile powder',
 'anchovy fillet',
 'andouille sausage',
 'andouille sausage round',
 'angel hair pasta',
 'apple',
 'apple cider',
 'apple cider vinegar',
 'apple juice',
 'applesauce',
 'arborio rice',
 'asiago',
 'asian chile paste',
 'asian chile pepper sauce',
 'asian fish sauce',
 'asian pear',
 'asian sesame oil',
 'asparagu',
 'avocado',
 'avocado oil',
 'baby carrot',
 'baby corn',
 'baby dutch yellow potato',
 'baby red potato',
 'baby spinach',
 'baby spinach leave',
 'bacon',
 'bacon bit',
 'bacon dripping',
 'bacon grease',
 'bacon grease see note',
 'baking potato',
 'baking powder',
 'ball park brand frank',
 'ball park dog bun',
 'ball park hamburger bun',
 'balsamic vinegar',
 'bamboo shoot',
 '