In [None]:
# Importing necessary modules
import re
import spacy
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from spacy import displacy
nlp = spacy.load("en_core_web_sm")
import pandas as pd
import inflect

In [None]:
# Setting up notebook to display multiple outputs in one cell:
# with ast_node_interactivity set to "all", results are shown for the
# expression statements in a cell rather than only for the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Recipe steps to parse, one sentence per entry (stand-in for the list that
# web_scraping.ipynb would produce).
directions = [
    'Preheat the oven to 450 degrees F',
    'Pulse the carrots, celery and shallots in a food processor until coarsely chopped',
    'Heat 1 tablespoon of the olive oil in a large nonstick skillet over medium high',
    'Add the chopped vegetables and cook, stirring frequently, until light golden and soft, 8 to 10 minutes',
    '(Add a splash of water if the mixture begins to stick)',
    'Add the ground beef and 4 teaspoons of the tomato paste and cook, breaking the mixture up with a wooden spoon, until browned, about 4 minutes',
    'Add 1/2 cup water, the parsley, raisins, 1 tablespoon of the vinegar and 3/4 teaspoon salt',
    'Bring to a simmer and cook until most of the water is absorbed and the mixture gets saucy, about 1 minute',
    'Let cool slightly',
    'Meanwhile, toss the pepper halves with the remaining 1/2 tablespoon oil in a microwave-safe bowl',
    'Cover with plastic wrap and microwave until the peppers are pliable, 10 to 12 minutes',
    'Carefully uncover the bowl and pour out any liquid that has accumulated',
    'Stir the couscous into the beef mixture',
    'Whisk together the remaining 2 tablespoons tomato paste, 1 tablespoon vinegar and 3/4 cup water in the bottom of a large baking dish',
    'Carefully transfer the peppers to the baking dish cut-side up and fill each pepper with the couscous mixture',
    'Sprinkle with the cheese',
    'Cover with foil and bake until the peppers are tender and the stuffing is hot, 20 to 25 minutes',
    'Serve warm or at room temperature drizzled with the tomato cooking liquid',
]

In [None]:
# Setting list of ingredients, as would be obtained in ingredient_parsing.ipynb
# Singular names of measurement units and packaging words.
units_list = [
    "ounce", "pinch", "dash", "cup", "gallon", "pint", "milliliter",
    "liter", "gram", "pound", "fluid ounce", "kilogram", "spoon", "quart",
    "container", "can", "box", "package", "packet", "tablespoon", "teaspoon",
]

# Names of kitchen equipment that may be mentioned in a direction.
equipment_list = [
    "pot", "pan", "sheet pan", "tray", "dish", "pressure cooker", "blender",
    "toaster", "microwave", "stove", "oven", "range", "burner", "cooktop",
    "measuring cup", "cup", "bowl", "mixing bowl", "spoon", "fork", "knife",
    "ladle", "spatula", "tongs", "paddle", "strainer", "mixer", "whisk",
    "toothpick", "foil", "parchment", "paper", "plastic wrap", "paper towel",
    "food processor", "skillet", "baking dish",
]


# Raw ingredient lines, one ingredient per entry. Every entry must end with a
# comma: adjacent string literals are silently concatenated by Python, which
# previously merged the shallot and olive oil lines into a single ingredient.
ingredients = [
    '2 medium carrots, cut into chunks',
    '2 stalks celery, cut into chunks',
    '1 large shallot, cut into chunks',
    '1 1/2 tablespoons olive oil',
    '1/2 pound lean ground beef',
    '2 tablespoons plus 4 teaspoons tomato paste',
    '1/3 cup chopped fresh parsley, dill or a combination',
    '1/3 cup golden raisins',
    '2 tablespoons red wine vinegar',
    'Kosher salt',
    '4 red, yellow, orange or green bell peppers or a mix of colors, halved lengthwise and seeded',
    '1/2 cup whole wheat couscous',
    '3/4 cup grated asiago cheese',
]

# Strip commas plus any predetermined unit (singular or plural) and equipment
# words from each ingredient so that only descriptive/ingredient words are
# left for direction parsing. Stopwords are NOT removed here.
inflection_engine = inflect.engine()

# Every whole word that should be dropped; pluralise each unit once up front
# instead of once per (word, unit) pair.
removable_words = (
    set(units_list)
    | {inflection_engine.plural(unit) for unit in units_list}
    | set(equipment_list)
)

for ingredient_idx, ingredient_text in enumerate(ingredients):
    words = ingredient_text.replace(',', "").split(" ")
    # Filter whole words rather than using str.replace on the sentence:
    # replace() would also delete the unit text wherever it occurs inside
    # another word and leaves doubled spaces behind.
    ingredients[ingredient_idx] = " ".join(
        word for word in words if word not in removable_words
    )

ingredients

In [None]:
# Build a token-level table of the dependency parse of every direction.
# Rows are collected in a list and turned into a DataFrame once:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call.
dep_columns = ['direction_index', 'word', 'pos_coarse', 'pos_fine', 'dependency', 'parent']
dep_rows = []

for i, direction in enumerate(directions):
    # Creating a Doc object from the direction
    direction_doc = nlp(direction)
    # Visualize the dependency relations
    #displacy.render(direction_doc, style = "dep")
    # Dependency parsing the direction: one row per token
    for token in direction_doc:
        dep_rows.append({'direction_index': i, 'word': token.text, 'pos_coarse': token.pos_, 'pos_fine': token.tag_, 'dependency': token.dep_, 'parent': token.head})

direction_dep_df = pd.DataFrame(dep_rows, columns = dep_columns)

print(direction_dep_df.to_string())

In [None]:
# Adapted from https://www.reddit.com/r/LanguageTechnology/comments/egh7jk/how_to_check_if_a_word_can_be_interpreted_as_a/
# Used to decide whether or not a given direction should be split up.
def possible_verb(surface):
    """Return True if WordNet has at least one verb ('v') synset for `surface`."""
    return any(synset.pos() == 'v' for synset in wordnet.synsets(surface))

print(possible_verb("4"))

In [None]:
# Parse every direction into a structured row (action, duration, temperature,
# ingredient amounts/units/names, equipment). A direction containing
# "... and <verb> ..." is split in place inside `directions` into two
# sub-directions, and each part is then parsed on its own.
inflection_engine = inflect.engine()

unit_conversion = {"oz": "ounce", "c": "cup", "pt": "pint", "gal": "gallon", "ml": "milliliter", "mL": "milliliter",
                  "g": "gram", "lb": "pound", "kg": "kilogram", "kilo": "kilogram", "qt": "quart", "l": "liter", "L": "liter",
                  "tb": "tablespoon", "tbs": "tablespoon", "tbsp": "tablespoon", "tsp": "teaspoon"}
units_of_time_list = ["second", "seconds", "minute", "minutes", "hour", "hours"]
temp_levels_list = ["low", "medium", "high", "simmer", "boil"]

direction_columns = ['direction_index', 'raw_text', 'action', 'action_duration', 'action_temperature', 'action_details', 'ingredient_amount', 'ingredient_unit', 'ingredient_name', 'equipment']

# Spellings accepted as a written-out unit: the singular names plus their
# plurals. (The previous check pluralised the *token* and looked it up in the
# singular-only units_list, so "teaspoons", "cups", ... never matched.)
unit_names = set(units_list) | {inflection_engine.plural(unit) for unit in units_list}

# Rows are collected here and converted to a DataFrame once at the end;
# DataFrame.append was removed in pandas 2.0.
direction_records = []
direction_index = 0
current_direction_length = len(directions)
while direction_index < current_direction_length:
    # Creating a Doc object from the directions
    direction_doc = nlp(directions[direction_index])
    # Pulling out the relevant information from the dependency parsed directions
    raw_text = direction_doc.text
    action = ""
    action_duration = ""
    action_temperature = ""
    action_details = ""
    ingredient_amount = [] # stored as array so that corresponding unit can be retrieved w/ the same idx
    ingredient_unit = []  # stored as array so that corresponding amount can be retrieved w/ the same idx
    ingredient_name = set()
    equipment = []
    direction_not_split = True
    for token_index, token in enumerate(direction_doc):
        # The first token has no predecessor; indexing with -1 would silently
        # wrap around to the last token of the direction.
        previous_token = direction_doc[token_index - 1] if token_index > 0 else None
        previous_is_number = previous_token is not None and previous_token.pos_ == 'NUM'

        # Cheap text comparison first, WordNet lookup only when needed
        token_is_action_preceded_by_and = previous_token is not None and previous_token.text == 'and' and possible_verb(token.text)
        token_is_root = token.dep_ == 'ROOT'
        token_is_lower_range_of_action_duration = token.dep_ == 'quantmod' and token.head.dep_== 'nummod'
        token_is_higher_range_of_action_duration = token.dep_ == 'nummod' and token.head.text in units_of_time_list
        token_is_action_temperature = token.dep_ == 'nummod' and token.head.text == 'degrees'
        token_is_action_temperature_unit = (token.text == 'F' or re.match('[Ff]ahrenheit', token.text)) or (token.text == 'C' or re.match('[Cc]elsius', token.text))
        token_is_ingredient_amount_and_abbreviated_unit = previous_is_number and token.text in unit_conversion and unit_conversion[token.text] in units_list
        token_is_ingredient_amount_and_unit = previous_is_number and token.text in unit_names

        if token_is_action_preceded_by_and:
            # Split exactly at the "and" that precedes this verb, using the
            # tokens' character offsets into raw_text. Splitting on the first
            # "and " in the sentence (the old behaviour) cut
            # "Add the ground beef and 4 teaspoons ... and cook, ..." after
            # "beef" and produced a bogus "4 teaspoons ..." sub-direction.
            first_half_of_raw_text = raw_text[:previous_token.idx].rstrip()
            second_half_of_raw_text = raw_text[token.idx:]
            # Upper-case only the first letter: str.capitalize() would also
            # lower-case the rest (e.g. the "F" in "450 degrees F").
            second_half_of_raw_text = second_half_of_raw_text[:1].upper() + second_half_of_raw_text[1:]
            # alter directions array to reflect new sub-directions
            directions[direction_index] = first_half_of_raw_text # Replace the original direction string with the first new direction
            directions.insert(direction_index + 1, second_half_of_raw_text) # Insert the second new direction after the first
            current_direction_length += 1
            direction_not_split = False
            # Re-parse the shortened direction on the next pass of the while loop
            break
        elif token_is_root:
            action = token.text
        elif token_is_lower_range_of_action_duration:
            action_duration += token.text + ' '
        elif token_is_higher_range_of_action_duration:
            action_duration += token.text + ' ' + token.head.text
        elif token_is_action_temperature:
            action_temperature = token.text + ' ' + token.head.text
        elif token_is_action_temperature_unit:
            action_temperature += ' ' + token.text
        elif token_is_ingredient_amount_and_abbreviated_unit:
            ingredient_amount.append(previous_token.text)
            ingredient_unit.append(unit_conversion[token.text])
        elif token_is_ingredient_amount_and_unit:
            ingredient_amount.append(previous_token.text)
            ingredient_unit.append(token.text)
        else:
            # Ingredient mention: the token (or its plural) occurs as a
            # substring of one of the cleaned ingredient strings.
            if token.pos_ != 'NUM':
                plural_version = inflection_engine.plural(token.text)
                if any(token.text in ingredient or plural_version in ingredient for ingredient in ingredients):
                    ingredient_name.add(token.text)
            # Equipment mention: also collect the directly preceding modifiers
            # attached to it (e.g. "large nonstick skillet"), stopping at a
            # determiner or at a token that does not depend on the equipment.
            if token.text in equipment_list:
                equipment_words = [token.text]
                equipment_adj_idx = token_index - 1
                while equipment_adj_idx > 0 and direction_doc[equipment_adj_idx].head.text == token.text and direction_doc[equipment_adj_idx].dep_ != 'det':
                    equipment_words.insert(0, direction_doc[equipment_adj_idx].text)
                    equipment_adj_idx -= 1
                equipment.append(' '.join(equipment_words))

    if direction_not_split:
        direction_records.append({'direction_index': direction_index, 'raw_text': raw_text, 'action': action, 'action_duration': action_duration, 'action_temperature': action_temperature, 'action_details': action_details, 'ingredient_amount': ingredient_amount, 'ingredient_unit': ingredient_unit, 'ingredient_name': ingredient_name, 'equipment': equipment})
        direction_index += 1

direction_df = pd.DataFrame(direction_records, columns = direction_columns)

direction_df