In [1]:
from pytesseract import pytesseract
import spacy
import cv2
import numpy as np
from spacy.language import Language

Initialise Tesseract

In [2]:
path_to_tesseract = r"/opt/homebrew/opt/tesseract/bin/tesseract"
pytesseract.tesseract_cmd = path_to_tesseract

Import Image

In [7]:
# image_path = r"sample_data/Thali_sweetcorn.png"
image_path = r"sample_data/Tahini-test_kitchen.png"
img = cv2.imread(image_path)

Image Processing

In [8]:
# --- dilation on the green channel ---
dilated_img = cv2.dilate(img[:, :, 1], np.ones((7, 7), np.uint8))
bg_img = cv2.medianBlur(dilated_img, 21)

# --- finding absolute difference to preserve edges ---
diff_img = 255 - cv2.absdiff(img[:, :, 1], bg_img)

# --- normalizing between 0 to 255 ---
norm_img = cv2.normalize(
    diff_img, None, alpha=0, beta=255, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_8UC1
)

# --- Otsu threshold ---
th = cv2.threshold(norm_img, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]

Extract Text

In [64]:
text = pytesseract.image_to_string(th)
text

"Prep time: 25 minutes\nCook time: 40 minutes\n\n1 round white or brown\npita (100g), pocket opened\nup, then roughly torn into\n2-3cm pieces (see p. 23 for\nhomemade pita)\n\n1 tbsp za’atar\n\n75ml olive oil\n\n3 tins of cannellini beans\n(1.2kg), drained (720g)\n\n30g parsley, roughly chopped\n\n30g fresh coriander, roughly\nchopped\n\n30g chives, roughly chopped\n\n1% tsp cumin seeds, toasted\nand roughly crushed with a\npestle and mortar\n\n1 garlic clove, crushed\n\n2% tbsp lemon juice\n\nsait and black pepper\n\nTAHINI SAUCE\n\n80g tahini\n\n1% tbsp lemon juice\n1 garlic clove, crushed\n\nCHILLI OIL\n\n2% tbsp olive oll\n% tsp chilli flakes\nYa tsp paprika\n\nGreen cannellini and tahini\n\nVariations of warm beans served with tahini are popular throughout\nthe Arab world, with dishes such as chickpea fatteh and ful mudammas\nwith tahini at the forefront. Such dishes are typically eaten warm for\nbreakfast, and are a sure way to keep you full until dinner. They're the\ninspiration

Decided that the line break fix will need to happen after topic classification. Some ingredients were getting falsely added together because of the line break removal. 

In [65]:
# import re

# sentence = "They're the\ninspiration for these herby\n\ncannellini beans, which can easily be served\n\nat any mealtime."

# # Find start of all double line breaks in sentence
# double_line_breaks = [m.start() for m in re.finditer("\n\n", text)]
# double_line_breaks.reverse()

# # If character before double line break isn't a full stop and if character after double line break isn't a capital letter replace linebreaks with whitespace
# for i in double_line_breaks:
#     if text[i - 1] != "." and text[i + 2].isupper() == False and text[i + 2].isnumeric() == False:
#         print(f"Replacing linebreaks at index {i}")
#         print(f"Char before: {text[i - 1]}, Char after: {text[i + 2]}")
#         # replace double line break with whitespace at position
#         text = text[:i] + " " + text[i + 2 :]

# text


Replacing linebreaks at index 1901
Char before: o, Char after: a
Replacing linebreaks at index 1372
Char before: s, Char after: i
Replacing linebreaks at index 1008
Char before: d, Char after: a
Replacing linebreaks at index 465
Char before: e, Char after: s


"Prep time: 25 minutes\nCook time: 40 minutes\n\n1 round white or brown\npita (100g), pocket opened\nup, then roughly torn into\n2-3cm pieces (see p. 23 for\nhomemade pita)\n\n1 tbsp za’atar\n\n75ml olive oil\n\n3 tins of cannellini beans\n(1.2kg), drained (720g)\n\n30g parsley, roughly chopped\n\n30g fresh coriander, roughly\nchopped\n\n30g chives, roughly chopped\n\n1% tsp cumin seeds, toasted\nand roughly crushed with a\npestle and mortar\n\n1 garlic clove, crushed\n\n2% tbsp lemon juice sait and black pepper\n\nTAHINI SAUCE\n\n80g tahini\n\n1% tbsp lemon juice\n1 garlic clove, crushed\n\nCHILLI OIL\n\n2% tbsp olive oll\n% tsp chilli flakes\nYa tsp paprika\n\nGreen cannellini and tahini\n\nVariations of warm beans served with tahini are popular throughout\nthe Arab world, with dishes such as chickpea fatteh and ful mudammas\nwith tahini at the forefront. Such dishes are typically eaten warm for\nbreakfast, and are a sure way to keep you full until dinner. They're the\ninspiration fo

Initialise NLP Model

In [10]:
nlp = spacy.load("en_core_web_sm")

Add Sentensizer to recognise line breaks as a sentence

In [11]:
@Language.component("set_custom_boundaries")
def set_custom_boundaries(doc):
    for token in doc[:-1]:
        if token.text == "\n\n":
            doc[token.i].is_sent_start = True
    return doc

In [12]:
nlp.add_pipe("set_custom_boundaries", before="parser")

<function __main__.set_custom_boundaries(doc)>

Process Text with Spacy

In [66]:
doc = nlp(text)

In [76]:
# Find paragraphs by splitting on "whitespace" sentences
paragraphs = []
paragraph = []
for sent in doc.sents:
    if len(sent) <= 1:
        paragraphs.append(paragraph)
        paragraph = []
        continue
    paragraph.append(sent)
paragraphs.append(paragraph)
paragraphs

[[Prep time: 25 minutes
  Cook time: 40 minutes],
 [1 round white or brown
  pita (100g), pocket opened
  up, then roughly torn into
  2-3cm pieces (see p. 23 for
  homemade pita)],
 [1 tbsp za’atar],
 [75ml olive oil],
 [3 tins of cannellini beans
  (1.2kg), drained (720g)],
 [30g parsley, roughly chopped],
 [30g fresh coriander, roughly
  chopped],
 [30g chives, roughly chopped],
 [1% tsp cumin seeds, toasted
  and roughly crushed with a
  pestle and mortar],
 [1 garlic clove, crushed],
 [2% tbsp lemon juice sait and black pepper],
 [TAHINI SAUCE],
 [80g tahini],
 [1% tbsp lemon juice
  1 garlic clove, crushed],
 [CHILLI OIL],
 [2% tbsp olive oll
  % tsp chilli flakes
  Ya tsp paprika],
 [Green cannellini and tahini],
 [Variations of warm beans served with tahini are popular throughout
  the Arab world, with dishes such as chickpea fatteh and ful mudammas
  with tahini at the forefront.,
  Such dishes are typically eaten warm for
  breakfast, and are a sure way to keep you full until

Store text

In [51]:
# Merge paragraphs into a single string
texts = []
for paragraph in paragraphs:
    text = ""
    if len(paragraph) > 1:
        text = " ".join([sent.text for sent in paragraph])
    else:
        text = paragraph[0].text
    
    # remove \n
    text = text.replace("\n", " ")
    texts.append(text)

# write text to file
with open("output.txt", "w") as f:
    for text in texts:
        f.write(text + "\n")

Convert `\n` in the text to a space (no more need for line breaks)

In [47]:
def replace_word(orig_text, replacement):
    matcher = spacy.matcher.Matcher(nlp.vocab)
    matcher.add("\n", [{"ORTH": "\n"}])
    tok = nlp(orig_text)
    text = ''
    buffer_start = 0
    for _, match_start, _ in matcher(tok):
        if match_start > buffer_start:  # If we've skipped over some tokens, let's add those in (with trailing whitespace if available)
            text += tok[buffer_start: match_start].text + tok[match_start - 1].whitespace_
        text += replacement + tok[match_start].whitespace_  # Replace token, with trailing whitespace if available
        buffer_start = match_start + 1
    text += tok[buffer_start:].text
    return text

for token in paragraphs[1][0]:
    if token.text == "\n":
        replace_word(token, " ")

paragraphs[1][0].text

ValueError: [E178] Each pattern should be a list of dicts, but got: {'ORTH': '\n'}. Maybe you accidentally passed a single pattern to Matcher.add instead of a list of patterns? If you only want to add one pattern, make sure to wrap it in a list. For example: `matcher.add('
', [pattern])`

Extract Amount of Serves

In [21]:
def get_serving_sentences(paragraphs: list) -> str:
    for paragraph in paragraphs:
        if len(paragraph) > 1:
            continue
        if "SERVES" in paragraph[0].text:
            return paragraph[0]
    

def get_serving_amount(serving_sentence) -> str:
    for token in serving_sentence:
        if token.pos_ == "NUM":
            return token


serving_sentence = get_serving_sentences(paragraphs)
serving_amount = get_serving_amount(serving_sentence)
serving_amount

4

Extract ingredients

In [22]:
def is_ingredient_sent(paragraph: list) -> bool:
    # Ingredients only have one Span in the paragraph
    if len(paragraph) > 1:
        return False

    ingredient_span = paragraph[0]
    
    # Only Ingredient's begin with a number
    if list(ingredient_span)[0].pos_ == "NUM":
        return True

    # Ingredients often follow the format of: [AMOUNT] of [optional adjetive] [INGREDIENT] eg pinch of salt, ROOT prep ... pobj
    serving_sentence_list = list(ingredient_span)
    if (
        serving_sentence_list[0].dep_ == "ROOT"
        and serving_sentence_list[1].dep_ == "prep"
        and serving_sentence_list[-1].dep_ == "pobj"
    ):
        return True

    # Ingredient sometimes listed with some extra steps but no additional nouns. Eg "salt, to taste"
    check_if_noun = lambda token: token.pos_ == "NOUN"
    if serving_sentence_list[0].pos_ == "NOUN" and not any([check_if_noun(token) for token in serving_sentence_list[1:]]):
        return True

    return False
        
ingredients_indexes = [idx for idx, p in enumerate(paragraphs) if is_ingredient_sent(p)]
ingredients = [paragraphs[idx] for idx in ingredients_indexes]
print(ingredients)
no_ingred_paragraphs = [p for idx, p in enumerate(paragraphs) if idx not in ingredients_indexes]

[[520 g (1 Ib 2 02) tin of
sweetcorn, drained], [3 tbsp vegetable oil], [pinch of asafoetida], [1 heaped tsp black
mustard seeds], [1 tsp ground turmeric], [salt, to taste], [1 tbsp finely chopped
coriander (cilantro)], [1 tbsp roasted peanuts,
crushed], [juice of % lime], [1 green bird's-eye chilli], [3 garlic cloves, roughly
chopped], [5 cm (2 in) ginger root,
roughly chopped]]


In [37]:
no_ingred_paragraphs


[[SERVES 4],
 [For the chilli and ginger paste],
 [GUJARATI VAGHARA MAKKAI
  Spicy Sweetcorn with
  Ginger & Green Chilli],
 [Stir-fried sweetcorn with ginger, green chilli and
  turmeric.,
  Simple quick and delicious, this stir-fry is
  a must to accompany every thali.,
  I’ve used tinned
  sweetcorn for this recipe, which is readily available.],
 [First make the chilli and ginger paste by placing all the
  ingredients into a blender, along with 3 tablespoons of the
  sweetcorn, and blitz to a coarse paste.,
  Set aside.],
 [Heat the oil in a large frying pan (skillet) over a medium heat.,
  Add the asafoetida and mustard seeds, and fry for a few
  seconds until they sputter.],
 [Add the prepared chilli and ginger paste and fry for 1 minute,
  stirring well.,
  Add the turmeric and stir, then add the remaining
  sweetcorn and fry for 2 minutes.,
  Reduce the heat to low,
  season, and add the fresh coriander and crushed peanuts.,
  Cover and cook for 1 more minute.,
  Finish with the