In [272]:
%pip install -q \
    easyocr \
    python-Levenshtein \
    fuzzywuzzy

Note: you may need to restart the kernel to use updated packages.


In [273]:
import csv
import easyocr
# import Levenshtein

In [274]:
reader = easyocr.Reader(['en']) # this needs to run only once to load the model into memory
# result = reader.readtext('IMG_4549.jpeg')

In [275]:
# receipt_strings_test = reader.readtext('IMG_4549.jpeg')
# receipt_strings_test

In [276]:
# i could select areas of interest in a UI and send only these
# crop_test = reader.readtext('crop-test.png')
# crop_test

In [277]:
# import product list from csv file
# i had this list from previous years of manually entering receipts

products_list = []
with open('product_list.csv', newline='') as f:
    csvreader = csv.reader(f)
    next(csvreader) # skip header row
    for row in csvreader:
        products_list.append(row[0])
# products_list

In [278]:
# # find gluten free products using levenstein distance

# threshold = 0.3

# # Function to find the best match
# def find_best_match(receipt_string, products_list, threshold=threshold):
#     best_match = None
#     best_score = float('-inf')  # Initialize to negative infinity

#     for name in products_list:
#         score = Levenshtein.ratio(receipt_string, name)
#         if score > best_score:
#             best_score = score
#             best_match = name

#     if best_score >= threshold:
#         return best_match, best_score
#     else:
#         return None, None

# # Find the best match for each shorthand name and filter results by threshold

# receipt_matches = []

# for receipt_string in receipt_strings:
#     best_match, best_score = find_best_match(receipt_string[1], products_list, threshold)
#     if best_match is not None:
#         receipt_matches.append((receipt_string, best_match, best_score))

# # Print the results
# for receipt_string, best_match, best_score in receipt_matches:
#     print(f"Best match for '{receipt_string[1]}': {best_match} (score: {best_score})")

In [279]:
# levenstein is not good at all, let's try fuzzywuzzy

from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import re

def find_best_match(shorthand_name, full_product_names, threshold):
    best_match, best_score = process.extractOne(shorthand_name, full_product_names, scorer=fuzz.token_set_ratio)
    if best_score >= threshold:
        # avoid strict substring matches
        if len(shorthand_name)/len(best_match) > 0.4:
            return best_match, best_score
        else:
            return None, None
    else:
        return None, None

def pre_fuzzy(search_query):
    # regex out 1 to 10 digits in the beginning of the string for sku codes in store receipts
    search_query = re.sub(r'^\d{1,10}', '', search_query)
    return search_query

# Find the best match for each shorthand name and filter results by threshold
def find_glutenfree(receipt_strings, products_list, threshold):
    results = []

    for receipt_string in receipt_strings:
        search_query = pre_fuzzy(receipt_string[1])
        if len(search_query) > 2:           
            best_match, best_score = find_best_match(search_query, products_list, threshold)
        else:
            best_match, best_score = None, None
        if best_match is not None:
            results.append((receipt_string, best_match, best_score))
    return results

# Print the results
# glutenfree_matches = find_glutenfree(receipt_strings_test, products_list, threshold)
# for receipt_string, best_match, best_score in glutenfree_matches:
#     print(f"Best match for '{receipt_string[1]}': {best_match} with a score of {best_score}, with a length ratio of {len(receipt_string[1])/len(best_match)}")

In [280]:
# lets grab the prices

# return index of best_match
def get_index(best_match, receipt_strings):
    for index, sub_array in enumerate(receipt_strings):
        if len(sub_array) > 1 and sub_array[1] == best_match:
            return index
    return -1  # Return -1 if no match is found

def find_next_two_values(index, receipt_strings):
    if index == -1 or index + 2 >= len(receipt_strings):
        return []  # Return an empty list if there are not enough elements
    result = []
    for i in range(1, 10):
        if len(receipt_strings[index + i]) > 1:
            result.append(receipt_strings[index + i][1])
    return result

def extract_price(value):
    # Remove spaces and commas from the string
    cleaned_value = value.replace(" ","").replace(",", ".")
    # print(cleaned_value)
    return cleaned_value

def price_search(values):
    # print(values)
    cleaned_values = [extract_price(value) for value in values]
    # print(cleaned_values)
    # keep only values containing a dot
    cleaned_values = [value for value in cleaned_values if '.' in str(value)]
    cleaned_values = [value for value in cleaned_values if value is not None]
    # print(f'clean values: {cleaned_values}')

    if cleaned_values:
        # return cleaned_values[0]
        try:
            # regex out any character that isn't a digit or a dot
            price = re.sub(r'[^\d.]', '', cleaned_values[0])
            price = float(price)
            return price
        except ValueError:
            return None
    else:
        return None
        # return ','.join(values) # for testing

def find_price(receipt_string, receipt_strings):
    target_index = get_index(receipt_string, receipt_strings)
    # print(target_index)
    next_values = find_next_two_values(target_index, receipt_strings)
    # print(f'next values: {next_values}')
    price = price_search(next_values)
    return price

# product = 'SCHR GF HAMB BUN'
product = 'GF  BRIOCHE HAM'
print(f'price of {product} is {find_price(product, receipt_strings)}')

price of GF  BRIOCHE HAM is None


In [281]:
# lets put it all together
def make_table(glutenfree_matches, receipt_strings, filename):
    for receipt_string, best_match, best_score in glutenfree_matches:
        product = receipt_string[1]
        price = find_price(product, receipt_strings)
        # TODO: add to an array that can be exported to a csv
        print(f'{filename} -- "{product}" ({best_match}) @ ${price}')

# make_table(glutenfree_matches)

In [282]:
import os

def list_files(directory, extension):
    return list(f for f in os.listdir(directory) if f.endswith('.' + extension))

for filename in list_files('./receipts', 'jpg'):
    receipt_strings = reader.readtext('./receipts/'+filename)
    # print(f'Processing {filename}')
    make_table(find_glutenfree(receipt_strings, products_list, threshold=80), receipt_strings, filename)

IMG_4560.jpg -- "Doughnut" (Doughnut) @ $5.0
IMG_4561.jpg -- "Eco Pack Cer" (Eco Pack Cert) @ $8.99
IMG_4561.jpg -- "Eco Pack Cer" (Eco Pack Cert) @ $8.99
IMG_4559.jpg -- "22143 PRANA  CASHEVS" (Prana cashews gf) @ $13.95
IMG_4559.jpg -- "20862 NATP WAFFLES GF HOMESTYLE" (NATP WAFFLES GF HOMESTYLE) @ $6.25
IMG_4559.jpg -- ""16171 BOB WHOLE GROUND  FLAXSEED MEA" (Bob Red Mill Whole Ground Flaxseed) @ $7.95
IMG_4559.jpg -- "29805  SNYDER 'S GF  PRETZEL STICKS" (Snyder's GF Pretzel Sticks) @ $7.15
IMG_4559.jpg -- "22418 QUE ORG,   TORTILLAS" (Que Organic Tortillas) @ $6.95
IMG_4559.jpg -- "22705 SCHAR GF  CIABATTA BUNS" (schar gf buns) @ $8.45
IMG_4559.jpg -- "18182 FODY SALSA MEDIUM 450ML" (Fody salsa) @ $6.95
IMG_4559.jpg -- "20844 NATP  FRUIT   JUICE   CORN  FLAKES" (NATP  FRUIT   JUICE   CORN  FLAKES) @ $13.55
IMG_4559.jpg -- "29804  SNYDER'$ GF  MINI  PRETZELS" (Snyder's GF Mini Pretzels) @ $7.15
IMG_4557.jpg -- "17570 ODO HAMBURGER   BUNS" (ODO HAMBURGER   BUNS) @ $5.25
IMG_4557.jpg