In [1]:
import os

import cv2
import numpy as np
import spacy
from pytesseract import pytesseract
from spacy.language import Language

Initialise Tesseract

In [2]:
# Location of the Tesseract binary. Defaults to the Homebrew path used so far;
# set the TESSERACT_CMD environment variable to point at a different install
# without editing the notebook.
path_to_tesseract = os.environ.get(
    "TESSERACT_CMD", r"/opt/homebrew/opt/tesseract/bin/tesseract"
)
pytesseract.tesseract_cmd = path_to_tesseract

Import Image

In [13]:
image_path = r"sample_data/Thali_sweetcorn.png"
# image_path = r"sample_data/Tahini-test_kitchen.png"
img = cv2.imread(image_path)
# cv2.imread returns None instead of raising when the file is missing or
# cannot be decoded; fail here rather than with an obscure error in a later cell.
if img is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")

Image Processing

In [14]:
# All processing below works on the green channel only.
green_channel = img[:, :, 1]

# Dilate with a 7x7 kernel, then median-blur the result to obtain `bg_img`.
dilation_kernel = np.ones((7, 7), np.uint8)
dilated_img = cv2.dilate(green_channel, dilation_kernel)
bg_img = cv2.medianBlur(dilated_img, 21)

# Absolute difference between the channel and `bg_img` (preserves edges),
# inverted by subtracting from 255.
diff_img = 255 - cv2.absdiff(green_channel, bg_img)

# Min-max normalisation to the 0..255 range, stored as 8-bit single channel.
norm_img = cv2.normalize(
    diff_img,
    None,
    alpha=0,
    beta=255,
    norm_type=cv2.NORM_MINMAX,
    dtype=cv2.CV_8UC1,
)

# Binary threshold with the level chosen by Otsu's method; element [1] of the
# returned pair is the thresholded image.
otsu_flags = cv2.THRESH_BINARY | cv2.THRESH_OTSU
th = cv2.threshold(norm_img, 0, 255, otsu_flags)[1]

Extract Text

In [15]:
# Run OCR on the thresholded image `th`; the recognised text is kept in `text`.
text = pytesseract.image_to_string(th)

Initialise NLP Model

In [16]:
# Load the "en_core_web_sm" pipeline; `nlp` is reused by the cells below.
nlp = spacy.load("en_core_web_sm")

Add a custom sentence-boundary component so that blank lines (double line breaks) start a new sentence

In [17]:
@Language.component("set_custom_boundaries")
def set_custom_boundaries(doc):
    """Mark every double-newline token as the start of a sentence.

    Every token except the last one in ``doc`` is inspected. The flags are
    set on ``doc`` in place, and the same ``doc`` is returned.
    """
    blank_line = "\n\n"
    for tok in doc[:-1]:
        if tok.text != blank_line:
            continue
        tok.is_sent_start = True
    return doc

In [18]:
# Register the component ahead of the parser. The membership check keeps this
# cell re-runnable: add_pipe raises if a component with this name is already
# present in the pipeline.
if "set_custom_boundaries" not in nlp.pipe_names:
    nlp.add_pipe("set_custom_boundaries", before="parser")

<function __main__.set_custom_boundaries(doc)>

Process Text with Spacy

In [19]:
# Run the pipeline (including set_custom_boundaries) over the OCR text.
doc = nlp(text)

In [20]:
# Find paragraphs by splitting on "whitespace" sentences
paragraphs = []
paragraph = []
for sent in doc.sents:
    if len(sent) <= 1:
        # A sentence of at most one token acts as a paragraph separator.
        # Only flush non-empty paragraphs, so that leading or consecutive
        # separators do not leave empty lists behind (later cells index
        # paragraph[0]).
        if paragraph:
            paragraphs.append(paragraph)
        paragraph = []
        continue
    paragraph.append(sent)
# Flush the trailing paragraph, again only if it holds any sentences.
if paragraph:
    paragraphs.append(paragraph)
paragraphs

[[SERVES 4],
 [520 g (1 Ib 2 02) tin of
  sweetcorn, drained],
 [3 tbsp vegetable oil],
 [pinch of asafoetida],
 [1 heaped tsp black
  mustard seeds],
 [1 tsp ground turmeric],
 [salt, to taste],
 [1 tbsp finely chopped
  coriander (cilantro)],
 [1 tbsp roasted peanuts,
  crushed],
 [juice of % lime],
 [For the chilli and ginger paste],
 [1 green bird's-eye chilli],
 [3 garlic cloves, roughly
  chopped],
 [5 cm (2 in) ginger root,
  roughly chopped],
 [GUJARATI VAGHARA MAKKAI
  Spicy Sweetcorn with
  Ginger & Green Chilli],
 [Stir-fried sweetcorn with ginger, green chilli and
  turmeric.,
  Simple quick and delicious, this stir-fry is
  a must to accompany every thali.,
  I’ve used tinned
  sweetcorn for this recipe, which is readily available.],
 [First make the chilli and ginger paste by placing all the
  ingredients into a blender, along with 3 tablespoons of the
  sweetcorn, and blitz to a coarse paste.,
  Set aside.],
 [Heat the oil in a large frying pan (skillet) over a medium he

Extract Number of Servings

In [21]:
def get_serving_sentences(paragraphs: list):
    """Return the sentence that states how many servings the recipe makes.

    Only paragraphs holding exactly one sentence are considered; the first
    such sentence whose text contains "SERVES" is returned.

    Args:
        paragraphs: List of paragraphs, each a list of sentence objects
            exposing a ``.text`` string.

    Returns:
        The matching sentence object, or ``None`` when no paragraph matches.
    """
    for paragraph in paragraphs:
        # Skip multi-sentence paragraphs and also empty ones; indexing
        # paragraph[0] on an empty paragraph would raise IndexError.
        if len(paragraph) != 1:
            continue
        sentence = paragraph[0]
        if "SERVES" in sentence.text:
            return sentence
    return None
    

def get_serving_amount(serving_sentence):
    """Return the first numeric token of the serving sentence.

    Args:
        serving_sentence: Iterable of tokens exposing ``pos_``, or ``None``
            when no serving sentence was found.

    Returns:
        The first token whose ``pos_`` is "NUM", or ``None`` when the
        sentence is missing or contains no such token.
    """
    # get_serving_sentences returns None when nothing matched; iterating over
    # None would raise TypeError.
    if serving_sentence is None:
        return None
    return next((token for token in serving_sentence if token.pos_ == "NUM"), None)


# Locate the "SERVES ..." sentence, then pull the numeric token out of it.
serving_sentence = get_serving_sentences(paragraphs)
serving_amount = get_serving_amount(serving_sentence)
# Bare last expression so the notebook displays the result.
serving_amount

4

Extract ingredients

In [22]:
def is_ingredient_sent(paragraph: list) -> bool:
    """Heuristically decide whether a paragraph is a single ingredient line.

    A paragraph qualifies when it holds exactly one sentence and that
    sentence matches one of these token patterns:

    1. its first token is tagged ``NUM`` (e.g. "3 tbsp vegetable oil");
    2. it reads "[AMOUNT] of ... [INGREDIENT]": first token is the ``ROOT``,
       second is a ``prep`` and the last is a ``pobj``
       (e.g. "pinch of asafoetida");
    3. its first token is a ``NOUN`` and no later token is a ``NOUN``
       (e.g. "salt, to taste").

    Args:
        paragraph: List of sentence objects; each sentence is an iterable of
            tokens exposing ``pos_`` and ``dep_``.

    Returns:
        ``True`` if the paragraph looks like an ingredient, else ``False``.
        Empty paragraphs and empty sentences yield ``False``.
    """
    # Ingredients only have one sentence in the paragraph. Empty paragraphs
    # are rejected too, so paragraph[0] cannot raise IndexError.
    if len(paragraph) != 1:
        return False

    tokens = list(paragraph[0])
    if not tokens:
        return False
    first = tokens[0]

    # Only ingredients begin with a number.
    if first.pos_ == "NUM":
        return True

    # Ingredients often follow the format:
    # [AMOUNT] of [optional adjective] [INGREDIENT], e.g. "pinch of salt"
    # -> ROOT prep ... pobj. Needs at least two tokens to inspect tokens[1].
    if (
        len(tokens) >= 2
        and first.dep_ == "ROOT"
        and tokens[1].dep_ == "prep"
        and tokens[-1].dep_ == "pobj"
    ):
        return True

    # Ingredients are sometimes listed with extra instructions but no further
    # nouns, e.g. "salt, to taste".
    if first.pos_ == "NOUN" and not any(token.pos_ == "NOUN" for token in tokens[1:]):
        return True

    return False
        
# Classify every paragraph once, then split the list on that flag.
ingredient_flags = [is_ingredient_sent(p) for p in paragraphs]
ingredients_indexes = [idx for idx, flag in enumerate(ingredient_flags) if flag]
ingredients = [p for p, flag in zip(paragraphs, ingredient_flags) if flag]
print(ingredients)
# Everything that was not classified as an ingredient, in original order.
no_ingred_paragraphs = [p for p, flag in zip(paragraphs, ingredient_flags) if not flag]

[[520 g (1 Ib 2 02) tin of
sweetcorn, drained], [3 tbsp vegetable oil], [pinch of asafoetida], [1 heaped tsp black
mustard seeds], [1 tsp ground turmeric], [salt, to taste], [1 tbsp finely chopped
coriander (cilantro)], [1 tbsp roasted peanuts,
crushed], [juice of % lime], [1 green bird's-eye chilli], [3 garlic cloves, roughly
chopped], [5 cm (2 in) ginger root,
roughly chopped]]


In [37]:
# Inspect what is left once the ingredient paragraphs have been removed.
no_ingred_paragraphs


[[SERVES 4],
 [For the chilli and ginger paste],
 [GUJARATI VAGHARA MAKKAI
  Spicy Sweetcorn with
  Ginger & Green Chilli],
 [Stir-fried sweetcorn with ginger, green chilli and
  turmeric.,
  Simple quick and delicious, this stir-fry is
  a must to accompany every thali.,
  I’ve used tinned
  sweetcorn for this recipe, which is readily available.],
 [First make the chilli and ginger paste by placing all the
  ingredients into a blender, along with 3 tablespoons of the
  sweetcorn, and blitz to a coarse paste.,
  Set aside.],
 [Heat the oil in a large frying pan (skillet) over a medium heat.,
  Add the asafoetida and mustard seeds, and fry for a few
  seconds until they sputter.],
 [Add the prepared chilli and ginger paste and fry for 1 minute,
  stirring well.,
  Add the turmeric and stir, then add the remaining
  sweetcorn and fry for 2 minutes.,
  Reduce the heat to low,
  season, and add the fresh coriander and crushed peanuts.,
  Cover and cook for 1 more minute.,
  Finish with the