In [172]:
import numpy as np
import pandas as pd
import string
from string import *
import pymongo
from collections import Counter
import matplotlib.pyplot as plt

In [17]:
# Connect to MongoDB with pymongo's default connection settings (no host/port is given)
mc = pymongo.MongoClient()
# Select the 'allrecipes' database and its 'recipes' collection
db = mc['allrecipes']
recipes_coll = db['recipes']
# Pull every document of the collection into an in-memory list used by the cells below
recipes = list(recipes_coll.find())

In [199]:
def find_ingredient_phrase(phrase):
    '''
    Prints every raw ingredient line that contains `phrase`, each followed by the
    full recipe document it came from.
    Searches the notebook-level `recipes` list; every recipe must carry its raw
    ingredient strings under the 'ingredients_raw' key.
    Raises ValueError for an empty phrase (it would match every line).
    Returns None; matches are reported on stdout only.
    '''
    if not phrase:
        raise ValueError('phrase must be a non-empty string')
    for recipe in recipes:
        for ing in recipe['ingredients_raw']:
            # Plain substring test (replaces the roundabout len(ing.split(phrase)) > 1)
            if phrase in ing:
                print('Ingredient line :\n', ing)
                print('\nEntire recipe :\n', recipe)

In [280]:
### CONSTANTS ###

# Unit words accepted directly after the leading quantity
units = [
    'pound', 'pounds', 'cup', 'cups',
    'tablespoon', 'tablespoons', 'teaspoon', 'teaspoons',
    'clove', 'cloves', 'stalk', 'stalks',
    'ounce', 'ounces', 'oz.', 'oz',
    'cubes', 'pint', 'pints', 'quart', 'quarts',
    'dash', 'dashs', 'dashes',
    'rib', 'ribs', 'bunch', 'bunches',
    'pinch', 'head', 'heads',
]

# Phrases that route an ingredient line to _parse_manual
manual = [
    '2 to 3 pound',
    'finely chopped from 1 can',
    'onion soup, prepared from',
]

# Everything from the first occurrence of one of these onward is cut from the ingredient text
phrases = [
    ' - ', ', or ', ' for garnish', 'cut ', 'such as', ' like ',
    'e.g.', 'with', ' or ', 'see note', 'to taste',
]

# Words dropped from the ingredient name (compared after lowercasing and stripping non-letters)
stopwords = [
    'and', 'into', 'very', 'hot', 'cold', 'warm', 'fresh', 'frozen',
    'large', 'medium', 'small', 'halves', 'torn', 'bulk',
    'optional', 'fatfree', 'lowsodium', 'low', 'sodium',
    'reducedsodium', 'reducedfat', 'ripe', 'lean',
    'extra', 'pure', 'goya', 'whole', 'ground',
]

# Words ending in one of these suffixes are dropped from the ingredient name
suffixes = ['ed', 'less', 'ly']

# Container words that trigger special parsing. Sorted by length and then reversed,
# so longer words are tried before the shorter words they contain.
flag_words = sorted(
    [
        'can or bottle', 'can', 'cans', 'package', 'packages', 'jar', 'jars',
        'container', 'containers', 'bag', 'bags', 'bottle', 'bottles',
        'envelope', 'envelopes', 'carton', 'cartons', 'packet', 'packets',
    ],
    key=len,
)[::-1]

# Conversion factors used by _normalize_ingredient_quantity:
#   conversion_dict[unit][ingredient] -> multiplier applied to the parsed quantity.
#   The 'other' entry of each unit is the fallback for ingredients not listed under it.
# NOTE(review): the factors look like ounce equivalents (cup = 8, pound = 16) - confirm.
conversion_dict = {
    'ounce': {'other': 1},
    'cup': {'other': 8},
    'pint': {'other': 16},
    'quart': {'other': 32},
    'gallon': {'other': 128},
    'fluid ounce': {'other': 1},
    'milliliter': {'other': 0.034},
    'pound': {'other': 16},
    'tablespoon': {'other': 1/2},
    'teaspoon': {'other': 1/6},
    'pinch': {'other': 0.1},
    'dash': {'other': 0.1},
    'bunch': {'green onion': 3,
              'cilantro': 2.8,
              'parsley': 2,
              'other': 3},
    'clove': {'garlic': 0.5, 'other': 0.5},
    'cube': {'chicken bouillon': 0.4,
             'beef bouillon': 0.4,
             'vegetable bouillon': 0.4,
             'other': 0.4},
    'packet': {'other': 1},
    'head': {'other': 20,
             'escarole': 10,
             'garlic clove': 1.5,
             'cabbage': 30,
             'cauliflower': 30,
             'broccoli': 20},
    'rib': {'celery': 2, 'other': 2},
    'stalk': {'celery': 2, 'other': 2},
    'each': {'onion': 8,
             'green bell pepper': 6,
             'potato': 6,
             'carrot': 4,
             'red bell pepper': 6,
             'jalapeno pepper': 0.7,
             'chicken breast': 10,
             'bay leaf': 1,
             'yellow onion': 8,
             'tomato': 6,
             'green onion': 0.5,
             'zucchini': 5,
             'bay leave': 1,
             'red onion': 8,
             'yellow bell pepper': 6,
             'slices bacon': 1,
             'lime': 1.5,
             'head cabbage': 30,
             'sweet onion': 8,
             'habanero pepper': 0.1,
             'sweet potato': 6,
             'eggs': 1,
             'green chile pepper': 1,
             'white onion': 8,
             'other': 1},
}
# parse_ingredients singularizes a unit by dropping one trailing 's', so the plurals
# 'bunches' and 'dashes' reach the lookup as 'bunche' and 'dashe'. Alias both to the
# real entries; without the 'dashe' alias, dashes were silently left unconverted.
conversion_dict['bunche'] = conversion_dict['bunch']
conversion_dict['dashe'] = conversion_dict['dash']

### MAIN FUNCTIONS ###

def parse_ingredients(ingredients, units=units, flag_words=flag_words):
    '''
    Parses a list of raw ingredient strings into a list of dictionaries with the following format:
        {'quantity': (float),
         'units': (str),
         'ingredient': (str),
         'normalized_qty': (float)}
    Arguments:
        ingredients: iterable of raw ingredient lines (e.g., '2 stalks celery, chopped').
        units: list of accepted units (e.g., ['cups', 'tablespoon']). If an ingredient
               does not specify a unit in this list, the label 'each' will be applied.
        flag_words: container words (e.g., 'can', 'package') that trigger special parsing
                    of lines such as '3 (15 ounce) cans kidney beans'.
    Lines containing one of the module-level `manual` phrases are handled by _parse_manual.
    Units are singularized by dropping one trailing 's'. 'quantity' is 0 when a line has
    no leading number. 'normalized_qty' is computed with the module-level `conversion_dict`.
    Blank lines, and lines whose ingredient text is empty after descriptor removal, are skipped.
    '''
    ing_list = []
    for item in ingredients:
        tokens = item.split()
        if not tokens:
            # Blank line: nothing to parse
            continue
        item_dict = {}
        # Check item for flag words/phrases (require special parsing treatment)
        manual_flag = any(man_phrase in item for man_phrase in manual)
        container_words = [word for word in tokens if word in flag_words]
        # A parenthesized size right after the count, e.g. '1 (3 pound) chicken'.
        # The length guard keeps one-word lines such as 'salt' from raising IndexError.
        has_paren_size = len(tokens) > 1 and tokens[1].startswith('(')
        # Parse quantities and units
        if manual_flag:
            quantity, unit, remainder = _parse_manual(item)
        elif container_words or has_paren_size:
            try:
                quantity, unit, remainder = _parse_special(item, flag_words)
                if not unit:
                    raise ValueError('no unit found in: ' + item)
            except Exception:
                # Special units of unspecified size (e.g. '1 can tomatoes').
                # _parse_special reports an unparseable size by raising, and the exception
                # type varies, so any failure lands here. Count the containers themselves;
                # with no container word on the line, count the ingredient as 'each'.
                quantity, rest_tokens = _split_leading_count(tokens)
                if container_words:
                    unit = container_words[-1]
                    if unit in rest_tokens:
                        rest_tokens.remove(unit)
                else:
                    unit = 'each'
                remainder = ' '.join(rest_tokens)
        else:
            quantity, remainder = _determine_quantity(item)
            rest_tokens = remainder.split()
            # Guarded: a line consisting only of a number leaves nothing after the quantity
            if rest_tokens and rest_tokens[0] in units:
                unit = rest_tokens[0]
                remainder = ' '.join(rest_tokens[1:])
            else:
                unit = 'each'
        item_dict['quantity'] = quantity
        item_dict['units'] = unit[:-1] if unit.endswith('s') else unit
        # Remove preparation instructions from remaining text to isolate ingredient
        parsed = _remove_descriptors(remainder)
        if not parsed:
            continue
        item_dict['ingredient'] = parsed
        item_dict['normalized_qty'] = _normalize_ingredient_quantity(item_dict, conversion_dict)
        # Add item dictionary to list
        ing_list.append(item_dict)
    return ing_list


def _split_leading_count(tokens):
    '''
    Sums the leading numeric tokens ('2', '.5', '1/2') of an already-split ingredient line.
    Returns (count, remaining_tokens). Stops quietly at the first token that is not a
    plain number or simple fraction instead of raising.
    '''
    count = 0
    for i, token in enumerate(tokens):
        if token[0] not in string.digits + '.':
            return count, list(tokens[i:])
        numer, slash, denom = token.partition('/')
        try:
            value = float(numer) / float(denom) if slash else float(numer)
        except (ValueError, ZeroDivisionError):
            return count, list(tokens[i:])
        count += value
    return count, []


### HELPER FUNCTIONS ###

def _determine_quantity(item):
    quantity = 0
    for i, elem in enumerate(item.split()):
        if elem[0] in string.digits + '.':
            try:
                quantity += float(elem)
            except:
                numer, denom = elem.split('/')
                quantity += float(numer) / float(denom)
        else:
            idx = i
            break
    remainder = ' '.join(item.split()[idx:])
    return quantity, remainder

def _parse_special(item, flag_words):
    # Determine special word
    sp_word = ')'
    for word in flag_words:
        if len(item.split(word)) > 1:
            sp_word = ' ' + word + ' '
            break
    
    # Parse item 
    count_and_size = item.split(sp_word)[0]
    remainder = item.split(sp_word)[1]
    count, rest = _determine_quantity(count_and_size)
    if sp_word == ')':
        size, unit = _determine_quantity(rest[1:])
    else:
        size, unit = _determine_quantity(rest[1:-1])
    quantity = count * size
    return quantity, unit, remainder

def _parse_manual(item):
    if len(item.split('2 to 3 pound')) > 1:
        quantity = float(item.split()[0]) * 2.5
        unit = 'pound'
        remainder = item.split('2 to 3 pound')[1]
        return quantity, unit, remainder
    if len(item.split('finely chopped from 1 can')) > 1:
        quantity = 1.0
        unit = 'ounce'
        remainder = 'chipotle chile'
        return quantity, unit, remainder
    if len(item.split('onion soup, prepared from')) > 1:
        quantity = 1.5
        unit = 'cup'
        remainder = 'onion soup'
        return quantity, unit, remainder

def _remove_descriptors(item,
                        phrases=phrases,
                        stopwords=stopwords,
                        suffixes=suffixes):
    '''
    Reduces the text following the quantity/unit to a bare ingredient name.
    Steps, in order:
        1. cut the text at the first occurrence of each phrase in `phrases`;
        2. lowercase every word and keep only its letters a-z;
        3. drop words listed in `stopwords`;
        4. drop words ending in one of `suffixes` (the word 'red' is always kept);
        5. crudely singularize the joined result.
    Returns the cleaned name, which may be an empty string.
    '''
    # Keep only the text in front of each trailing phrase
    for phrase in phrases:
        item = item.partition(phrase)[0]
    # Strip everything but lowercase letters, then filter out stopwords
    letters_only = (
        ''.join(ch for ch in token.lower() if ch in string.ascii_lowercase)
        for token in item.split()
    )
    words = [word for word in letters_only if word not in stopwords]
    # Filter out adjectives and adverbs, one suffix at a time
    for suffix in suffixes:
        words = [word for word in words
                 if word == 'red' or word[-len(suffix):] != suffix]
    # Joining only non-empty words also gets rid of stray spaces
    result = ' '.join(word for word in words if word)
    # Singularize: '...oes' loses 'es'. Short results (under 5 characters) and anything
    # ending in 'beans' are left as they are; otherwise a trailing 's' is dropped.
    if result[-3:] == 'oes':
        result = result[:-2]
    if len(result) < 5 or result[-5:] == 'beans' or result[-1] != 's':
        return result
    return result[:-1]
    
def _normalize_ingredient_quantity(ingredient_dict, conversion_dict):
    ing = ingredient_dict['ingredient']
    qty = ingredient_dict['quantity']
    units = ingredient_dict['units']
    if units in conversion_dict.keys():
        conv_factor_dict = conversion_dict[units]
        if ing in conv_factor_dict.keys():
            conv_factor = conv_factor_dict[ing]
        else:
            conv_factor = conv_factor_dict['other']
        return qty * conv_factor
    else:
        return qty

In [282]:
# Use the recipe at index 5 as a worked example and show its raw ingredient lines
recipe = recipes[5]
raw_ing = recipe['ingredients_raw']
raw_ing

['1 pound lean ground beef',
 '3 (15 ounce) cans dark red kidney beans',
 '3 (14.5 ounce) cans Mexican-style stewed tomatoes',
 '2 stalks celery, chopped',
 '1 red bell pepper, chopped',
 '1/4 cup red wine vinegar',
 '2 tablespoons chili powder',
 '1 teaspoon ground cumin',
 '1 teaspoon dried parsley',
 '1 teaspoon dried basil',
 '1 dash Worcestershire sauce',
 '1/2 cup red wine']

In [285]:
# Parse the example recipe's ingredient lines and display the result as a table
pd.DataFrame(parse_ingredients(raw_ing))

Unnamed: 0,ingredient,normalized_qty,quantity,units
0,beef,16.0,1.0,pound
1,dark red kidney beans,45.0,45.0,ounce
2,mexicanstyle tomato,43.5,43.5,ounce
3,celery,4.0,2.0,stalk
4,red bell pepper,6.0,1.0,each
5,red wine vinegar,2.0,0.25,cup
6,chili powder,1.0,2.0,tablespoon
7,cumin,0.166667,1.0,teaspoon
8,parsley,0.166667,1.0,teaspoon
9,basil,0.166667,1.0,teaspoon


In [281]:
# Tally how often each parsed ingredient name occurs across all recipes
ingredients = Counter()
for recipe in recipes:
    parsed = parse_ingredients(recipe['ingredients_raw'])
    ingredients.update(entry['ingredient'] for entry in parsed)
print('Number of unique ingredients :', len(ingredients))

Number of unique ingredients : 948


In [277]:
# List every ingredient name with its occurrence count, most frequent first
ingredients.most_common()

[('onion', 341),
 ('garlic', 257),
 ('tomato', 205),
 ('salt', 196),
 ('cumin', 192),
 ('chili powder', 189),
 ('water', 174),
 ('black pepper', 128),
 ('olive oil', 113),
 ('chicken broth', 106),
 ('beef', 104),
 ('vegetable oil', 97),
 ('cayenne pepper', 91),
 ('green bell pepper', 89),
 ('carrot', 88),
 ('oregano', 86),
 ('kidney beans', 83),
 ('celery', 80),
 ('tomato sauce', 78),
 ('cilantro', 72),
 ('allpurpose flour', 70),
 ('potato', 68),
 ('butter', 67),
 ('curry powder', 64),
 ('chicken breast', 62),
 ('paprika', 59),
 ('red bell pepper', 58),
 ('tomato paste', 56),
 ('black beans', 55),
 ('garlic powder', 52),
 ('red pepper flake', 46),
 ('parsley', 45),
 ('white sugar', 43),
 ('cheddar cheese', 42),
 ('jalapeno pepper', 41),
 ('green onion', 38),
 ('brown sugar', 36),
 ('coconut milk', 35),
 ('basil', 34),
 ('bay leaf', 33),
 ('yellow onion', 32),
 ('pinto beans', 31),
 ('worcestershire sauce', 30),
 ('milk', 30),
 ('chicken stock', 30),
 ('green chile pepper', 29),
 ('kern

In [278]:
# Keep only the ingredient names that were seen more than once
common_ingredients = [name for name, seen in ingredients.items() if seen > 1]
print('Number of common ingredients :', len(common_ingredients))

Number of common ingredients : 421


In [261]:
ing_units = {}     # ingredient name -> Counter of the units it was measured in
unit_ings = {}     # unit -> Counter of the ingredient names measured in it
all_units = set()  # every distinct unit label produced by the parser
for recipe in recipes:
    for entry in parse_ingredients(recipe['ingredients_raw']):
        name, unit = entry['ingredient'], entry['units']
        all_units.add(unit)
        # setdefault creates the Counter on first sight of a key, then counts in place
        ing_units.setdefault(name, Counter())[unit] += 1
        unit_ings.setdefault(unit, Counter())[name] += 1
# Number of distinct units each ingredient appears with
num_units = {name: len(counter) for name, counter in ing_units.items()}

In [279]:
# Ingredients counted under the fallback unit 'each', most frequent first
unit_ings['each'].most_common()

[('onion', 264),
 ('green bell pepper', 72),
 ('potato', 60),
 ('carrot', 58),
 ('red bell pepper', 48),
 ('jalapeno pepper', 37),
 ('chicken breast', 33),
 ('bay leaf', 33),
 ('yellow onion', 27),
 ('tomato', 23),
 ('green onion', 18),
 ('zucchini', 15),
 ('bay leave', 14),
 ('red onion', 13),
 ('yellow bell pepper', 13),
 ('slices bacon', 11),
 ('lime', 11),
 ('head cabbage', 10),
 ('sweet onion', 10),
 ('habanero pepper', 9),
 ('sweet potato', 9),
 ('eggs', 8),
 ('green chile pepper', 7),
 ('white onion', 7),
 ('clove', 7),
 ('garlic clove', 6),
 ('yellow squash', 6),
 ('egg', 6),
 ('avocado', 6),
 ('turnip', 5),
 ('leek', 5),
 ('cinnamon stick', 5),
 ('chicken breast half', 4),
 ('cloves garlic', 4),
 ('roma plum tomato', 4),
 ('parsnip', 4),
 ('pork chop', 4),
 ('mango', 4),
 ('shallot', 4),
 ('egg beaten', 4),
 ('lemon', 4),
 ('sprigs thyme', 3),
 ('ears corn', 3),
 ('celery rib', 3),
 ('russet potato', 3),
 ('anaheim chile pepper', 3),
 ('beef bouillon cube', 3),
 ('baking potat

In [262]:
# Show every distinct unit label collected while parsing
all_units

{'bunch',
 'bunche',
 'clove',
 'cube',
 'cup',
 'dash',
 'each',
 'fluid ounce',
 'head',
 'inch',
 'inch square',
 'inch thick',
 'milliliter',
 'ounce',
 'packet',
 'pinch',
 'pint',
 'pound',
 'quart',
 'rib',
 'stalk',
 'tablespoon',
 'teaspoon'}

In [271]:
# Conversion factors used by _normalize_ingredient_quantity:
#   conversion_dict[unit][ingredient] -> multiplier applied to the parsed quantity.
#   The 'other' entry of each unit is the fallback for ingredients not listed under it.
# NOTE(review): the factors look like ounce equivalents (cup = 8, pound = 16) - confirm.
# NOTE(review): this cell rebinds the conversion_dict already defined in the CONSTANTS
# cell; keep the two tables in sync, or drop one of them.
conversion_dict = {
    'ounce': {'other': 1},
    'cup': {'other': 8},
    'pint': {'other': 16},
    'quart': {'other': 32},
    'gallon': {'other': 128},
    'fluid ounce': {'other': 1},
    'milliliter': {'other': 0.034},
    'pound': {'other': 16},
    'tablespoon': {'other': 1/2},
    'teaspoon': {'other': 1/6},
    'pinch': {'other': 0.1},
    'dash': {'other': 0.1},
    'bunch': {'green onion': 3,
              'cilantro': 2.8,
              'parsley': 2,
              'other': 3},
    'clove': {'garlic': 0.5, 'other': 0.5},
    'cube': {'chicken bouillon': 0.4,
             'beef bouillon': 0.4,
             'vegetable bouillon': 0.4,
             'other': 0.4},
    'packet': {'other': 1},
    'head': {'other': 20,
             'escarole': 10,
             'garlic clove': 1.5,
             'cabbage': 30,
             'cauliflower': 30,
             'broccoli': 20},
    'rib': {'celery': 2, 'other': 2},
    'stalk': {'celery': 2, 'other': 2},
    'each': {'onion': 8,
             'green bell pepper': 6,
             'potato': 6,
             'carrot': 4,
             'red bell pepper': 6,
             'jalapeno pepper': 0.7,
             'chicken breast': 10,
             'bay leaf': 1,
             'yellow onion': 8,
             'tomato': 6,
             'green onion': 0.5,
             'zucchini': 5,
             'bay leave': 1,
             'red onion': 8,
             'yellow bell pepper': 6,
             'slices bacon': 1,
             'lime': 1.5,
             'head cabbage': 30,
             'sweet onion': 8,
             'habanero pepper': 0.1,
             'sweet potato': 6,
             'eggs': 1,
             'green chile pepper': 1,
             'white onion': 8,
             'other': 1},
}
# parse_ingredients singularizes a unit by dropping one trailing 's', so the plurals
# 'bunches' and 'dashes' reach the lookup as 'bunche' and 'dashe'. Alias both to the
# real entries; without the 'dashe' alias, dashes were silently left unconverted.
conversion_dict['bunche'] = conversion_dict['bunch']
conversion_dict['dashe'] = conversion_dict['dash']

In [257]:
# Look up the raw line (and its recipe) behind one of the manually handled phrases
find_ingredient_phrase('onion soup, prepared from')

Ingredient line :
 1 1/2 cups onion soup, prepared from a packet of dry onion soup mix

Entire recipe :
 {'_id': ObjectId('5ca54117c6d0b0083247a998'), 'id': 213564, 'name': "Chris' Chili", 'href': 'https://www.allrecipes.com/recipe/213564/chris-chili/', 'category': {'lvl_1': 'Soups, Stews and Chili', 'lvl_2': 'Chili', 'lvl_3': 'Beef Chili'}, 'rating_info': None, 'submitter_info': {'id': 4563797, 'name': 'ChrisG', 'followers': 4, 'href': 'https://www.allrecipes.com/cook/4563797/'}, 'ingredients': [{'quantity': 1.5, 'units': 'cup', 'ingredient': 'onion soup from a packet of dry onion soup mix'}, {'quantity': 0.25, 'units': 'cup', 'ingredient': 'bacon grease see notes'}, {'quantity': 1.5, 'units': 'pound', 'ingredient': 'beef round steak'}, {'quantity': 1.5, 'units': 'pound', 'ingredient': 'beef sirloin steak'}, {'quantity': 1.0, 'units': 'teaspoon', 'ingredient': 'salt'}, {'quantity': 1.0, 'units': 'tablespoon', 'ingredient': 'chili powder'}, {'quantity': 2.0, 'units': 'teaspoon', 'ingre

In [270]:
# Units recorded for the ingredient name 'egg'
ing_units['egg']

Counter({'each': 6})

In [209]:
# Scratch check: whitespace split of a size string
'10.75 ounce'.split()

['10.75', 'ounce']

In [219]:
# Scratch check: a quantity written with a leading decimal point
_determine_quantity('.75 ounce')

(0.75, 'ounce')

In [227]:
# Scratch check: sized-container parsing on a single ingredient line
s = '1 (28 ounce) can whole tomatoes, chopped'
_parse_special(s, flag_words=flag_words)

(28.0, 'ounce', 'whole tomatoes, chopped')

In [272]:
def _normalize_ingredient_quantity(ingredient_dict, conversion_dict):
    ing = ingredient_dict['ingredient']
    qty = ingredient_dict['quantity']
    units = ingredient_dict['units']
    if units in conversion_dict.keys():
        conv_factor_dict = conversion_dict[units]
        if ing in conv_factor_dict.keys():
            conv_factor = conv_factor_dict[ing]
        else:
            conv_factor = conv_factor_dict['other']
        return qty * conv_factor
    else:
        return qty

In [274]:
# Scratch check: normalize half a teaspoon of an ingredient with no specific factor
ex = {'quantity': 0.5, 'units': 'teaspoon', 'ingredient': 'ground black pepper'}
_normalize_ingredient_quantity(ex, conversion_dict)

0.08333333333333333