In [1]:
# Importing necessary modules - commented-out code is necessary download steps that need to be run before modules are imported
import re
import spacy
import nltk
nltk.download('wordnet')
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from spacy import displacy
#python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")
import pandas as pd
import inflect

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/adenweiser/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Setting up notebook to display multiple outputs in one cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
# Setting list of directions, as would be obtained in web_scraping.ipynb
directions = ['Preheat the oven to 325 degrees F',
              'Heat the olive oil in a medium saute pan',
              'Add the onions, thyme, salt, and pepper and cook over medium-low heat, stirring occasionally, for 8 to 10 minutes, until the onions are translucent but not brown',
              'Off the heat, add the Worcestershire sauce, chicken stock, and tomato paste',
              'Allow to cool slightly',
              'In a large bowl, combine the ground chuck, onion mixture, bread crumbs, and eggs, and mix lightly with a fork',
              "Don't mash or the meat loaf will be dense",
              'Shape the mixture into a rectangular loaf on a sheet pan covered with parchment paper',
              'Spread the ketchup evenly on top',
              'Bake for 1 to 1 1/4 hours, until the internal temperature is 160 degrees F and the meat loaf is cooked through',
              '(A pan of hot water in the oven, under the meat loaf, will keep the top from cracking',
              ') Serve hot']

In [4]:
# Setting list of ingredients, as would be obtained in web_scraping.ipynb
ingredients = ['1 tablespoon good olive oil',
               '3 cups chopped yellow onions (3 onions)',
               '1 teaspoon chopped fresh thyme leaves',
               '2 teaspoons kosher salt',
               '1 teaspoon freshly ground black pepper',
               '3 tablespoons Worcestershire sauce',
               '1/3 cup canned chicken stock or broth',
               '1 tablespoon tomato paste',
               '2 1/2 pounds ground chuck (81 percent lean)',
               '1/2 cup plain dry bread crumbs (recommended: Progresso)',
               '2 extra-large eggs, beaten',
               '1/2 cup ketchup (recommended: Heinz)']

In [5]:
# Creating a list of units commonly used in cooking
units_list = ["ounce", "pinch", "dash", "cup", "gallon", "pint", "milliliter", "liter", "gram", "pound", "fluid ounce", "kilogram",
             "spoon", "quart", "container", "can", "box", "package", "packet", "tablespoon", "teaspoon", "small", "medium", "large", "stalk"]

# Creating a dictionary of unit conversions
unit_conversion = {"oz": "ounce", "c": "cup", "pt": "pint", "gal": "gallon", "ml": "milliliter", "mL": "milliliter",
                  "g": "gram", "lb": "pound", "kg": "kilogram", "kilo": "kilogram", "qt": "quart", "l": "liter", "L": "liter",
                  "tb": "tablespoon", "tbs": "tablespoon", "tbsp": "tablespoon", "tsp": "teaspoon"}

# Creating a list of equipment commonly used in cooking
equipment_list = ["pot", "pan", "sheet pan", "tray", "dish", "pressure cooker", "blender", "toaster", "microwave", "stove", "oven",
                  "range", "burner", "cooktop", "measuring cup", "bowl", "mixing bowl", "spoon", "fork", "knife", "ladle",
                  "spatula", "tongs", "paddle", "strainer", "mixer", "whisk", "toothpick", "foil", "parchment", "paper", "plastic wrap", "wrap",
                  "paper towel", "food processor", "processor", "skillet", "baking dish", "saute pan"]

# Creating a list of units of time
units_of_time_list = ["second", "seconds", "minute", "minutes", "hour", "hours"]

# Creating a list of common temperature levels used in cooking
temp_levels_list = ["low", "-", "medium", "high", "simmer", "boil"] 

# Stripping away stopwords, predetermined units and/or equipment from list of ingredients for direction parsing purposes
inflection_engine = inflect.engine()
new_ingredients = ingredients.copy()
for ingredient_idx, i in enumerate(ingredients):
    new_ingredients[ingredient_idx] = i.replace(',', "")
    new_sentence = new_ingredients[ingredient_idx]
    for word in new_ingredients[ingredient_idx].split(" "):
        for unit in units_list:
            if word == unit or word == inflection_engine.plural(unit):
                new_sentence = new_sentence.replace(word, "")
        for equipment in equipment_list:
            if word == equipment: 
                new_sentence = new_sentence.replace(word, "")
    new_ingredients[ingredient_idx] = new_sentence

In [6]:
# Visualizing the dependency parsed directions, for reference. Can delete this code chunk later.
# direction_dep_df = pd.DataFrame(columns = ['direction_index', 'word', 'pos_coarse', 'pos_fine', 'dependency', 'parent'])

# for i in range(len(directions)):
#     # Creating a Doc object from the directions
#     direction_doc = nlp(directions[i])
#     # Visualize the dependency relations
#     #displacy.render(direction_doc, style = "dep")
#     # Dependency parsing the directions
#     for token in direction_doc:
#         new_row = {'direction_index': i, 'word': token.text, 'pos_coarse': token.pos_, 'pos_fine': token.tag_, 'dependency': token.dep_, 'parent': token.head}
#         direction_dep_df = direction_dep_df.append(new_row, ignore_index = True)

# print(direction_dep_df.to_string())

In [7]:
# Defining a function to determine whether or not a given direction word should be split up (can the word be a verb)
# Taken from https://www.reddit.com/r/LanguageTechnology/comments/egh7jk/how_to_check_if_a_word_can_be_interpreted_as_a/
def possible_verb(surface):
    return 'v' in set(s.pos() for s in wordnet.synsets(surface))

In [8]:
# Generating a search table for the parsed directions
direction_df = pd.DataFrame(columns = ['direction_index', 'raw_text', 'action', 'action_duration', 'action_temperature', 'action_details', 'ingredient_amount', 'ingredient_unit', 'ingredient_name', 'equipment'])
direction_index = 0 # First direction in a recipe
current_direction_length = len(directions)
while direction_index < current_direction_length:
    # Creating a Doc object from the directions
    direction_doc = nlp(directions[direction_index])
    # Pulling out the relevant information from the dependency parsed directions
    raw_text = direction_doc.text
    action = ""
    action_duration = ""
    action_temperature = ""
    action_details = ""
    ingredient_amount = [] # stored as array so that corresponding unit can be retrieved w/ the same idx
    ingredient_unit = []  # stored as array so that corresponding amount can be retrieved w/ the same idx
    ingredient_name = [] # stored as array so that corresponding unit/amount can be retrieved w/ the same idx
    equipment = []
    direction_not_split = True
    duration_index = float('inf')
    for token_index, token in enumerate(direction_doc):
        token_is_action_preceded_by_and = possible_verb(token.text) and direction_doc[token_index-1].text == 'and'
        token_is_root = token.dep_ == 'ROOT'
        token_is_lower_range_of_action_duration = token.dep_ == 'quantmod' and token.head.dep_== 'nummod'
        token_is_higher_range_of_action_duration = token.dep_ == 'nummod' and token.head.text in units_of_time_list
        token_is_action_temperature = token.dep_ == 'nummod' and token.head.text == 'degrees'
        token_is_action_temperature_unit = (token.text == 'F' or re.match('[Ff]ahrenheit', token.text)) or (token.text == 'C' or re.match('[Cc]elsius', token.text))
        token_is_ingredient_amount_and_abbreviated_unit = direction_doc[token_index-1].pos_ == 'NUM' and token.text in unit_conversion.keys() and unit_conversion[token.text] in units_list
        token_is_ingredient_amount_and_unit = direction_doc[token_index-1].pos_ == 'NUM' and (token.text in units_list or inflection_engine.plural(token.text) in units_list)
        if token_is_action_preceded_by_and:
            and_space_str = "and "
            idx_of_and_action = raw_text.index(and_space_str + token.text)
            split_raw_text = raw_text.split(and_space_str, 1)
            first_half_of_raw_text = raw_text[ : idx_of_and_action]
            second_half_of_raw_text = raw_text[ idx_of_and_action + len(and_space_str) :].capitalize()
            # Altering directions array to reflect new sub-directions
            directions[direction_index] = split_raw_text[0] # Replace the original direction string with the first new direction
            directions.insert(direction_index+1, split_raw_text[1].capitalize()) # Insert the second new direction after the first
            current_direction_length += 1
            direction_not_split = False
            break
        elif token_is_root:
            action = token.text
        if token.text == "until":
            duration_index = token_index
        if token_index >= duration_index:
            action_duration += ' ' + token.text
            if token_index == len(direction_doc):
                duration_index = float('inf')
        elif token_is_lower_range_of_action_duration:
            if token.text not in action_duration:
                action_duration += token.text + ' '
        elif token_is_higher_range_of_action_duration:
            if token.text not in action_duration:
                action_duration += token.text + ' ' + token.head.text        
        elif token_is_action_temperature:
            action_temperature = token.text + ' ' + token.head.text
        elif token_is_action_temperature_unit:
            action_temperature += ' ' + token.text
        elif token.text == "over":
            action_temperature += token.text
        elif "over" in action_temperature:
            if token.text in temp_levels_list or token.text == "heat":
                action_temperature += ' ' + token.text
        elif token.text in temp_levels_list:
            action_temperature += token.text
        elif token_is_ingredient_amount_and_abbreviated_unit:
            ingredient_amount.append(direction_doc[token_index-1].text)
            ingredient_unit.append(unit_conversion[token.text])
        elif token_is_ingredient_amount_and_unit:
            ingredient_amount.append(direction_doc[token_index-1].text)
            ingredient_unit.append(token.text)
        plural_version = inflection_engine.plural(token.text)
        for i in new_ingredients:
            ingredient_words = i.split(" ")
            for ingredient_word in ingredient_words:
                if (token.text == ingredient_word or plural_version == ingredient_word) and token.pos_ != 'NUM' and token.text not in ['and', 'a', 'or', 'of', 'into', 'cut', 'chopped']:
                    ingredient_name.append(token.text)
        for e in equipment_list:
            if token.text.lower() == e:
                equipment_string = ''
                equipment_adj_idx = token_index-1
                while equipment_adj_idx > 0 and direction_doc[equipment_adj_idx].head.text == token.text and direction_doc[equipment_adj_idx].dep_ != 'det':
                    equipment_string = direction_doc[equipment_adj_idx].text + ' ' + equipment_string
                    equipment_adj_idx -= 1
                equipment_string += token.text
                equipment.append(equipment_string)
    if direction_not_split:
        new_row = {'direction_index': direction_index, 'raw_text': raw_text, 'action': action, 'action_duration': action_duration, 'action_temperature': action_temperature, 'action_details': action_details, 'ingredient_amount': ingredient_amount, 'ingredient_unit': ingredient_unit, 'ingredient_name': ingredient_name, 'equipment': equipment}
        direction_df = direction_df.append(new_row, ignore_index = True)
        direction_index += 1

# Joining ingredient names that are part of the same word - e.g., ["ground", "beef"] to ["ground beef"]
for ind in direction_df.index:
    ingredient_index = []
    for ing in direction_df['ingredient_name'][ind]:
        for i in range(len(ingredients)):
            if ing in ingredients[i] or ing[:-1] in ingredients[i]:
                ingredient_index.append(i)
    new_ingredient_list = []
    dup = {x for x in ingredient_index if ingredient_index.count(x) > 1}
    for j in range(0, len(direction_df['ingredient_name'][ind])-1):
        if ingredient_index[j] == ingredient_index[j+1]:
            ingredients_joined = direction_df['ingredient_name'][ind][j] + " " + direction_df['ingredient_name'][ind][j+1]
            new_ingredient_list.append(ingredients_joined)
        elif ingredient_index[j] not in dup:
            new_ingredient_list.append(direction_df['ingredient_name'][ind][j])
        if j+1 == len(direction_df['ingredient_name'][ind])-1:
            if ingredient_index[j+1] not in dup:
                new_ingredient_list.append(direction_df['ingredient_name'][ind][j+1])
    if len(direction_df['ingredient_name'][ind]) == 1:
        new_ingredient_list = direction_df['ingredient_name'][ind].copy()
    direction_df['ingredient_name'][ind] = new_ingredient_list.copy()

# If there are ingredient names, but no amounts and units, then inferring the amount and unit from the ingredient list
for ind in direction_df.index:
    if len(direction_df['ingredient_name'][ind]) > 0 and len(direction_df['ingredient_amount'][ind]) == 0 and len(direction_df['ingredient_unit'][ind]) == 0:
        for ing in direction_df['ingredient_name'][ind]:
            for i in ingredients:
                match_index = re.search(ing, i)
                if not match_index:
                    match_index = re.search(ing[:-1], i)
                if match_index:
                    i_extract = i[0:match_index.span()[0]]
            i_extract_list = i_extract.split(" ")
            for i_ex in i_extract_list:
                if re.match('\d', i_ex):
                    direction_df['ingredient_amount'][ind].append(i_ex)
                for unit in units_list:
                    if i_ex == unit or i_ex == inflection_engine.plural(unit):
                        direction_df['ingredient_unit'][ind].append(i_ex)

# If there is an action name but no equipment, then inferring the equipment from the action
for ind in direction_df.index:
    if len(direction_df['equipment'][ind]) == 0:
        if direction_df['action'][ind].lower() == "preheat":
            direction_df['equipment'][ind].append("oven")
        if direction_df['action'][ind].lower() == "bake":
            direction_df['equipment'][ind].append("oven")  
        if direction_df['action'][ind].lower() == "broil":
            direction_df['equipment'][ind].append("oven")            
        if direction_df['action'][ind].lower() == "roast":
            direction_df['equipment'][ind].append("oven")
        if direction_df['action'][ind].lower() == "pulse":
            direction_df['equipment'][ind].append("food processor")
        if direction_df['action'][ind].lower() == "microwave":
            direction_df['equipment'][ind].append("microwave")
        if direction_df['action'][ind].lower() == "freeze":
            direction_df['equipment'][ind].append("freezer")
        if direction_df['action'][ind].lower() == "cut":
            direction_df['equipment'][ind].append("knife")            
        if direction_df['action'][ind].lower() == "chop":
            direction_df['equipment'][ind].append("knife")            
        if direction_df['action'][ind].lower() == "mince":
            direction_df['equipment'][ind].append("knife")
        if direction_df['action'][ind].lower() == "stir":
            direction_df['equipment'][ind].append("spoon")            

direction_df

Unnamed: 0,direction_index,raw_text,action,action_duration,action_temperature,action_details,ingredient_amount,ingredient_unit,ingredient_name,equipment
0,0,Preheat the oven to 325 degrees F,Preheat,,325 degrees F,,[],[],[],[oven]
1,1,Heat the olive oil in a medium saute pan,Heat,,medium,,[1],[tablespoon],[olive oil],[pan]
2,2,"Add the onions, thyme, salt,",Add,,,,"[3, 1, 2]","[cups, teaspoon, teaspoons]","[onions, thyme, salt]",[]
3,3,Pepper,Pepper,,,,[],[],[],[]
4,4,"Cook over medium-low heat, stirring occasional...",Cook,8 to 10 minutes until the onions are transluce...,over medium - low heat,,[3],[cups],[onions],[]
5,5,"Off the heat, add the Worcestershire sauce, ch...",add,,,,"[3, 1/3, 1]","[tablespoons, cup, tablespoon]","[Worcestershire sauce, chicken stock, tomato p...",[]
6,6,Allow to cool slightly,Allow,,,,[],[],[],[]
7,7,"In a large bowl, combine the ground chuck, oni...",combine,,,,"[1, 1/2]","[tablespoon, cup]","[onion bread, crumbs]",[large bowl]
8,8,"Eggs,",Eggs,,,,[],[],[],[]
9,9,Mix lightly with a fork,Mix,,,,[],[],[],[fork]


In [26]:
direction_df["action"].tolist()

['Preheat',
 'Heat',
 'Add',
 'Pepper',
 'Cook',
 'add',
 'Allow',
 'combine',
 'Eggs',
 'Mix',
 'mash',
 'Shape',
 'Spread',
 'cooked',
 'keep',
 'Serve']

In [29]:
parsed_question = nlp("How many cups of chicken")
for token in parsed_question:
    print(token.text + " " + str(token.pos_) + " " + str(token.lemma_))

How SCONJ how
many ADJ many
cups NOUN cup
of ADP of
chicken NOUN chicken


In [46]:
re.search("india(n)?|(south asia(n)?)", "indi".lower())