In [20]:
import numpy as np
import cv2
import pandas as pd
import pytesseract
import re

In [21]:
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

In [22]:
img = cv2.imread('../img/costco-flyer1.jpg')

In [23]:
def preprocess(image):
    """
    :param image:
    :return preprocessed image:
    """
    global img
    # Preprocess
    img = cv2.resize(image, (0, 0), fx=2, fy=2, interpolation=cv2.INTER_CUBIC)
    imgGray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)  # Grayscale
    imgBlur = cv2.GaussianBlur(imgGray, (7, 7), 0)
    ret, thresh = cv2.threshold(imgBlur, 0, 255, cv2.THRESH_OTSU + cv2.THRESH_BINARY_INV)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (9, 9))
    connected = cv2.dilate(thresh, kernel, iterations=8)
    return connected

In [24]:
def get_contours(image, imgdilated):
    contours, hierarchy = cv2.findContours(imgdilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
    roi_arr = []
    for cnt in contours:
        area = cv2.contourArea(cnt)
        if area > 8000:
            x, y, w, h = cv2.boundingRect(cnt)
            padding = 0
            cv2.rectangle(image, (x, y + padding), (x + w, y + h + padding), (0, 255, 0), 3)
            roi = image[y:y + h, x:x + w]
            roi_arr.append(roi)
    return roi_arr

In [25]:
def get_products(lst):
    flyer_str = []
    for i in lst:
        i_gray = cv2.cvtColor(i, cv2.COLOR_BGR2GRAY)
        string = pytesseract.image_to_string(i_gray)
        flyer_str.append(string)
    return flyer_str

In [26]:
connected = preprocess(img)
roi_array = get_contours(img, connected)
flyer_str = get_products(roi_array)

In [29]:
def text_extraction():
    without_empty_strings = [string for string in flyer_str if string != ""]
    
    return without_empty_strings

In [30]:
text_extraction()


['Available for delivery at a higher price.\n\n= VIEW ONLINE OFFER\n\n',
 '=a\n\nTORTILLAS\n\nim\n',
 'Valid Oct. 16 to 29, 2023 2 WEEKS\n\nSAVE\n\n$2\n\nMejicano 7-in. or 10-in. regular\nor whole wheat tortillas\n\n3 packs of 12 or 2 packs of 15\n\n1240888, 1240892, 1240894, 1240895\n\n',
 'Valid Oct. 16 to 29, 2023 2 WEEKS\nSAVE\n\n$3\n\nMoulin Rouge crushed\ncayenne pepper or cayenne\npepper powder\n\n1.4 kg or 1.8 kg\n\n1248185, 1248197\n\nAvailable for delivery at a higher price.\n\nVIEW ONLINE OFFER\n\n',
 'Valid Oct. 16 to 29, 2023 2 WEEKS\n\n4 =\n\nHeinz real mayonnaise\n\n4L i\n1180168 |—_——"8\nIn-warehouse $19.99\n\nInstant savings -$4.00 - =\nPRICE $15.99\n\nAvailable for delivery at a higher price.\n\n= VIEW ONLINE OFFER\n\n',
 'Available for delivery at a higher price.\n\nVIEW ONLINE OFFER\n\n',
 'Available for delivery at a higher price.\n\nVIEW ONLINE OFFER\n\n',
 'Valid Oct. 16 to 29, 2023 2 WEEKS\nSAVE\n\n$6\n\nLea & Perrins Worcestershire\n\nsauce\n\neral\n\n1170282\n

In [35]:
" 'Valid Oct. 16 to 29, 2023 2 WEEKS\n\nSAVE\n\n$2\n\nMejicano 7-in. or 10-in. regular\nor whole wheat tortillas\n\n3 packs of 12 or 2 packs of 15\n\n1240888, 1240892, 1240894, 1240895\n\n'".splitlines()

[" 'Valid Oct. 16 to 29, 2023 2 WEEKS",
 '',
 'SAVE',
 '',
 '$2',
 '',
 'Mejicano 7-in. or 10-in. regular',
 'or whole wheat tortillas',
 '',
 '3 packs of 12 or 2 packs of 15',
 '',
 '1240888, 1240892, 1240894, 1240895',
 '',
 "'"]

In [27]:
def text_extraction():
    without_empty_strings = [string for string in flyer_str if string != ""]
    brand, product, serving, sale_price, reg_price, valid_dt = [], [], [], [], [], []
    for prod in without_empty_strings:

        temp_lst = prod.splitlines()
        str_list = list(filter(None, temp_lst))  # remove all empty strings

        if len(str_list) < 2:
            continue

        if str_list[1].endswith(('”', '™', '“')):
            str_list[1] = str_list[1][:-1]

        if str_list[2].endswith(('”', '™', '“', '*')):
            str_list[1] = str_list[1] + " " + str_list[2]
            str_list.pop(2)

        str_list[1] = re.sub("\*", '', str_list[1])

        # unit of measure
        try:
            unit_index = [idx for idx, val in enumerate(str_list) if re.search(r"\d+\s?(ml|-pack|g|L)$", val)][0]
        except:
            unit_index = None

        # Sale Prices
        sale_index = [idx for idx, val in enumerate(str_list) if 'sale' in val.lower()]

        # Regular Prices
        reg_index = [idx for idx, val in enumerate(str_list) if 'regular' in val.lower()]

        # Valid Dates
        valid_index = [idx for idx, val in enumerate(str_list) if 'valid' in val.lower()]
        print(valid_index)

        brand.append(str_list[0])  # brand
        product.append(str_list[1])  # product
        sale_price.append(str_list[sale_index[0]] if sale_index else '')
        reg_price.append(str_list[reg_index[0]] if reg_index else '')
        valid_dt.append(str_list[valid_index[0]] if valid_index else '')
        serving.append(str_list[unit_index] if unit_index else '')

    df = pd.DataFrame(data=list(zip(brand, product, serving, sale_price, reg_price, valid_dt)),
                      columns=['Brand', 'Product', 'Unit', ' Sale Price', 'Regular Price', 'Valid Dates']
                      )
    return df

In [28]:
df = text_extraction()
print(df)

IndexError: list index out of range

In [None]:
df