# Parse Ingredients

First we need to import all necessary libraries:

In [1]:
import random
import spacy
import psycopg2
from typing import Tuple, Any, Dict
from spacy.matcher import Matcher
from spacy.tokens import Span, DocBin, Doc
from spacy.util import filter_spans, compile_suffix_regex, compile_infix_regex

pg_connection = psycopg2.connect("postgresql://postgres:password@localhost")
pg_cursor = pg_connection.cursor()

nlp = spacy.load("en_core_web_sm")

Doc.set_extension("recipe_id", default=None, force=True)

In [2]:
UNITS: list[str] = [ "g", "gram", "cup", "ml", "kg", "oz", "tablespoon", "teaspoon", "pinch", "dash", "sprig", "clove" ]
VULGAR_FRACTIONS: list[str] = ["¼", "½", "¾", "⅐", "⅑", "⅒", "⅓", "⅔", "⅕", "⅖", "⅗", "⅘", "⅙", "⅚", "⅛", "⅜", "⅝", "⅞"]

DASH_SYMBOLS: list[str] = [ "\u2012", "\u2013", "\u2014", "\u2015", "-" ]
MULTIPLICATION_SYMBOLS: list[str] = [ "x", "*" ]

## Tokenizer Update

Update `suffix_search` regex to catch all `VULGAR_FRACTIONS` and `UNITS` into separate tokens:

In [3]:
def get_after_number_regex(suffixes: list[str]) -> str:
    return f"(?<=[0-9])(?:{ '|'.join(suffixes) })"

suffixes = nlp.Defaults.suffixes + [ get_after_number_regex(UNITS + VULGAR_FRACTIONS) ]
suffix_regex = compile_suffix_regex(suffixes)
nlp.tokenizer.suffix_search = suffix_regex.search

Update `infix_finditer` regex to catch all versions of dash and `VULGAR_FRACTIONS` as infixes:

In [4]:
# NOTE: Adding MULTIPLICATION_SYMBOLS will cause unexpected word split
infixes = nlp.Defaults.infixes + VULGAR_FRACTIONS + DASH_SYMBOLS
infix_regex = compile_infix_regex(infixes)
nlp.tokenizer.infix_finditer = infix_regex.finditer

## Matcher

...

In [5]:
matcher = Matcher(nlp.vocab)

Add `RANGE_MEASURE` which will catch values like `2-3 cups` to convert them into `(2.5, cups)`:

In [6]:
matcher.add("RANGE_MEASURE", [
    ([
        { "LIKE_NUM": True, "OP": "?" },
        { "LOWER": { "IN": VULGAR_FRACTIONS }, "OP": "?" },

        { "LOWER": { "IN": DASH_SYMBOLS } },

        { "LIKE_NUM": True, "OP": "?" },
        { "LOWER": { "IN": VULGAR_FRACTIONS }, "OP": "?" },

        { "LEMMA": { "IN": UNITS } },
    ])
])

Add `MULTI_MEASURE` which will catch values like `2x20oz` to convert them into `(40, oz)`:

In [7]:
matcher.add("MULTI_MEASURE", [
    ([
        { "LIKE_NUM": True, "OP": "?" },
        { "LOWER": { "IN": VULGAR_FRACTIONS }, "OP": "?" },

        { "LOWER": { "IN": MULTIPLICATION_SYMBOLS } },

        { "LIKE_NUM": True, "OP": "?" },
        { "LOWER": { "IN": VULGAR_FRACTIONS }, "OP": "?" },

        { "LEMMA": { "IN": UNITS } },
    ])
])

Add `EXACT_MEASURE` which will catch values like `(35g)` to convert them into `(35, g)`:

In [8]:
matcher.add("EXACT_MEASURE", [
    ([
        { "LOWER": "(" },

        { "LIKE_NUM": True },
        { "LOWER": { "IN": VULGAR_FRACTIONS }, "OP": "?" },

        { "LEMMA": { "IN": UNITS } },

        { "LOWER": ")" },
    ]),
])

Add `MEASURE` which will catch values like `1 kg` to convert them into `(1, kg)`:

In [9]:
matcher.add("MEASURE", [
    ([
        { "LIKE_NUM": True },
        { "LOWER": { "IN": VULGAR_FRACTIONS }, "OP": "?" },

        { "LOWER": "heaping", "OP": "?" },

        { "LEMMA": { "IN": UNITS } },
    ]),
    ([
        { "LOWER": { "IN": VULGAR_FRACTIONS } },

        { "LOWER": "heaping", "OP": "?" },

        { "LEMMA": { "IN": UNITS } },
    ]),
])

Add `UNIT_MEASURE` which will catch values like `a cup` to convert them into `(DEFAULT_AMOUNT, cup)`:

In [10]:
matcher.add("UNIT_MEASURE", [
    ([ { "LEMMA": { "IN": UNITS } } ]),
])

Add `AMOUNT_MEASURE` which will catch values like `2` to convert them into `(2, DEFAULT_UNIT)`:

In [11]:
matcher.add("AMOUNT_MEASURE", [
    ([
        { "LIKE_NUM": True },
        { "LOWER": { "IN": VULGAR_FRACTIONS }, "OP": "?" },
    ]),
    ([
        { "LOWER": { "IN": VULGAR_FRACTIONS } },
    ]),
])

In [12]:
matcher.add("PRODUCT", [
    ([
        { "LOWER": "extra", "OP": "?" },
        { "LOWER": "virgin", "OP": "?" },
        { "LOWER": { "IN": [ "olive", "sesame", "sunflower", "peanut", "vegetable" ] } },
        { "LOWER": "oil" },
    ]),
    ([
        { "LOWER": "chipotle", "OP": "?" },
        { "LOWER": { "IN": [ "chilli", "tabasco" ] } },
        { "LOWER": "sauce" },
    ]),
    ([
        { "LOWER": "sea" },
        { "LOWER": "salt" },
    ]),
    ([
        { "LOWER": "black" },
        { "LOWER": "pepper" },
    ]),
    ([
        { "LOWER": "unsalted" },
        { "LOWER": "butter" },
    ]),
    ([
        { "LOWER": "lemon" },
        { "LOWER": "juice" },
    ]),
    ([
        { "LOWER": "runny" },
        { "LOWER": "honey" },
    ]),
    ([
        { "LOWER": "natural" },
        { "LOWER": "yoghurt" },
    ]),
    ([
        { "LOWER": "red" },
        { "LOWER": "wine" },
        { "LOWER": "vinegar" },
    ]),
    ([
        { "LOWER": "dried" },
        { "LOWER": "chilli" },
        { "LOWER": "flakes" },
    ]),
])

## Parse Single Ingredient Line

In [13]:
def get_ents_from_match(measure: Span) -> list[Span]:
    match measure.label_:
        case "EXACT_MEASURE":
            amount: Span = Span(measure.doc, measure.start + 1, measure.end - 2, "AMOUNT")
            unit: Span = Span(measure.doc, measure.end - 2, measure.end - 1, "UNIT")
            return [ amount, unit ]
        case "RANGE_MEASURE" | "MULTI_MEASURE" | "MEASURE":
            amount: Span = Span(measure.doc, measure.start, measure.end - 1, "AMOUNT")
            unit: Span = Span(measure.doc, measure.end - 1, measure.end, "UNIT")
            return [ amount, unit ]
        case "UNIT_MEASURE":
            unit: Span = Span(measure.doc, measure.start, measure.end, "UNIT")
            return [ unit ]
        case "AMOUNT_MEASURE":
            amount: Span = Span(measure.doc, measure.start, measure.end, "AMOUNT")
            return [ amount ]
        case "PRODUCT":
            product: Span = Span(measure.doc, measure.start, measure.end, "PRODUCT")
            return [ product ]
        case _:
            return []

In [14]:
def process_ingredient_doc(doc: Doc, context) -> Doc:
    doc._.recipe_id = context["id"]

    matches = matcher(doc)
    spans = [Span(doc, start, end, label=match_id) for match_id, start, end in matches]

    non_overlapping_spans = filter_spans(spans)

    # if len(non_overlapping_spans) > 1:
    #     print(f"recipe_id: {str(doc._.recipe_id):<8} spans: {[(sp, sp.label_) for sp in non_overlapping_spans]}")

    measures: list[Span] = []

    for measure in non_overlapping_spans:
        measures += get_ents_from_match(measure)

    doc.ents = measures

    return doc

In [15]:
doc_temp = process_ingredient_doc(nlp("½-1 oz fresh red chilli"), { "id": 10 })
print([token for token in doc_temp])

[½, -, 1, oz, fresh, red, chilli]


### Parse Ingredients from File

Use NLTK library to identify most-popular collocations, it should be helpful to run it against ingredients were there are no amounts or units, because it usually means that something like "salt to taste" is used and it requires somewhat unique process in a future.

In [16]:
from nltk.metrics import BigramAssocMeasures
from nltk.metrics import TrigramAssocMeasures
from nltk.metrics import QuadgramAssocMeasures
from nltk.collocations import BigramCollocationFinder
from nltk.collocations import TrigramCollocationFinder
from nltk.collocations import QuadgramCollocationFinder


def get_bigram_collocations(docs: list[Doc]) -> None:
    bigram_collocation = BigramCollocationFinder.from_documents(docs)
    print("bigrams:", bigram_collocation.nbest(BigramAssocMeasures.likelihood_ratio, 5))

def get_trigram_collocations(docs: list[Doc]) -> None:
    trigram_collocation = TrigramCollocationFinder.from_documents(docs)
    print("trigrams:", trigram_collocation.nbest(TrigramAssocMeasures.likelihood_ratio, 5))

def get_quadgram_collocations(docs: list[Doc]) -> None:
    quadgram_collocation = QuadgramCollocationFinder.from_documents(docs)
    print("quadgrams:", quadgram_collocation.nbest(QuadgramAssocMeasures.likelihood_ratio, 5))

In [17]:
def process_ingredients_from_file(ingredients: Tuple[str, Dict[str, Any]]) -> list[Doc]:

    non_parsed_recipes: set[int] = set()
    doc_count: int = 0
    span_count: int = 0
    empty_count: int = 0

    docs: list[Doc] = []

    for doc, context in nlp.pipe(ingredients, as_tuples=True):
        doc_with_ents = process_ingredient_doc(doc, context)
        docs.append(doc_with_ents)

        if len(doc_with_ents.ents) > 0:
            span_count += 1
        else:
            non_parsed_recipes.add(doc_with_ents._.recipe_id)
            empty_count += 1

        doc_count += 1

    filtered_docs = list( filter(lambda doc: len(doc.ents) == 0, docs) )
    

    get_bigram_collocations( filtered_docs )
    get_trigram_collocations( filtered_docs )
    get_quadgram_collocations( filtered_docs )

    print(f"doc_count = {doc_count} / span_count = {span_count} / empty_count = {empty_count}")
    print(f"non_parsed_recipes = {len(non_parsed_recipes)} of 2454")

    return docs

In [18]:
pg_cursor.execute("SELECT text, jsonb_build_object('recipe_id'::text, recipe_id, 'id'::text, id) FROM recipe_scraper.ingredient GROUP BY id")
ingredients = pg_cursor.fetchall()

docs_to_save = process_ingredients_from_file(ingredients)

bigrams: [(a, bunch), (fresh, mint), (vegetables, ,), (fine, semolina), (flour, ,)]
trigrams: [(a, bunch, of), (fresh, mint, leaves), (vegetables, ,, such), (fine, semolina, ,), (flour, ,, for)]
quadgrams: [(a, bunch, of, fresh), (vegetables, ,, such, as), (fine, semolina, ,, for), (flour, ,, for, dusting), (higher, -, welfare, chipolata)]
doc_count = 28001 / span_count = 27062 / empty_count = 939
non_parsed_recipes = 939 of 2454


### Save Training and Evaluation Docs

In [19]:
def save_docs_into_bin(docs: list[Doc]) -> None:
    # doc_bin = DocBin(docs=docs)
    # doc_bin.to_disk("./train.spacy")

    random.shuffle(docs)
    train_docs = docs[:len(docs) // 2]
    dev_docs = docs[len(docs) // 2:]

    # Create and save a collection of training docs
    train_docbin = DocBin(docs=train_docs)
    train_docbin.to_disk("./docs/train.spacy")
    # Create and save a collection of evaluation docs
    dev_docbin = DocBin(docs=dev_docs)
    dev_docbin.to_disk("./docs/dev.spacy")

In [20]:
# save_docs_into_bin(docs_to_save)