In [48]:
%pip install -q \
    easyocr \
    python-Levenshtein \
    fuzzywuzzy

Note: you may need to restart the kernel to use updated packages.


In [12]:
import csv
import easyocr
import Levenshtein

In [14]:
reader = easyocr.Reader(['en']) # this needs to run only once to load the model into memory
# result = reader.readtext('IMG_4549.jpeg')

In [15]:
receipt_strings = reader.readtext('IMG_4549.jpeg')
receipt_strings

[([[np.int32(182), np.int32(174)],
   [np.int32(872), np.int32(174)],
   [np.int32(872), np.int32(319)],
   [np.int32(182), np.int32(319)]],
  '@IIndependent',
  np.float64(0.8128134623300763)),
 ([[np.int32(669), np.int32(265)],
   [np.int32(740), np.int32(265)],
   [np.int32(740), np.int32(293)],
   [np.int32(669), np.int32(293)]],
  'onoa',
  np.float64(0.20261910557746887)),
 ([[np.int32(74), np.int32(336)],
   [np.int32(416), np.int32(336)],
   [np.int32(416), np.int32(378)],
   [np.int32(74), np.int32(378)]],
  "JONSSOH 'S VIG ALHONTE",
  np.float64(0.31761375759226756)),
 ([[np.int32(76), np.int32(369)],
   [np.int32(462), np.int32(369)],
   [np.int32(462), np.int32(414)],
   [np.int32(76), np.int32(414)]],
  '4-401  OTTAWA ST , ALHONTE',
  np.float64(0.2500778113248592)),
 ([[np.int32(74), np.int32(406)],
   [np.int32(195), np.int32(406)],
   [np.int32(195), np.int32(445)],
   [np.int32(74), np.int32(445)]],
  'WeIcone',
  np.float64(0.7270426119236494)),
 ([[np.int32(205), np.

In [4]:
# i could select areas of interest in a UI and send only these
crop_test = reader.readtext('crop-test.png')
crop_test

[([[np.int32(5), np.int32(9)],
   [np.int32(143), np.int32(9)],
   [np.int32(143), np.int32(50)],
   [np.int32(5), np.int32(50)]],
  '81075701137',
  np.float64(0.625027839958588)),
 ([[np.int32(181), np.int32(9)],
   [np.int32(437), np.int32(9)],
   [np.int32(437), np.int32(49)],
   [np.int32(181), np.int32(49)]],
  'SCHR  GF   HAMB  BUN',
  np.float64(0.3145630801259684)),
 ([[np.int32(494), np.int32(12)],
   [np.int32(526), np.int32(12)],
   [np.int32(526), np.int32(42)],
   [np.int32(494), np.int32(42)]],
  'HR',
  np.float64(0.433735365883307)),
 ([[np.int32(571), np.int32(3)],
   [np.int32(643), np.int32(3)],
   [np.int32(643), np.int32(41)],
   [np.int32(571), np.int32(41)]],
  '8 , 49',
  np.float64(0.7198822068324803))]

In [25]:
# import product list from csv file
# i had this list from previous years of manually entering receipts

products_list = []
with open('product_list.csv', newline='') as f:
    csvreader = csv.reader(f)
    for row in csvreader:
        products_list.append(row[0])
products_list

['name',
 '3x KIND Oats & Honey Granola with Toasted Coconut KIND Oats & Honey Granola with Toasted Coconut Bundle 3 x 312 g - 936 g',
 "Annie's Homegrown Gluten-Free Rice Pasta & Cheddar - 170 g",
 "Annie's Homegrown Gluten-Free Rice Pasta White Cheddar Macaroni & Cheese - 170 g",
 'Annies rice pasta and white cheddar',
 'Baby Bum Hand Sanitizer - 2 oz',
 'Baby Bum Hand Sanitizer Sample - 60 mL Sample',
 'Baldstreet all beef frankfurters',
 'Barb baked white cheddar cheese puffs',
 'Barb original cheese puffs',
 'basd Body Wash Invigorating Mint Sample - 1 oz',
 'BioGaia Probiotic Drops Sample Sample',
 'Blue Diamond Nut Thins Crackers Almond - 120 g',
 'Blue nut thin',
 "Bob's Red Mill Cornstarch - 510 g",
 "Bob's Red Mill Gluten Free Old Fashioned Rolled Oats - 907 g",
 "Bob's Red Mill Gluten Free Quick Cooking Rolled Oats - 794 g",
 "Bob's Red Mill Gluten Free Steel Cut Oats - 680 g",
 "Bob's Red Mill Organic Extra Thick Rolled Oats",
 "Bob's Red Mill Organic Extra Thick Rolled Oat

In [52]:
# find gluten free products using levenstein distance

threshold = 0.3

# Function to find the best match
def find_best_match(receipt_string, products_list, threshold=threshold):
    best_match = None
    best_score = float('-inf')  # Initialize to negative infinity

    for name in products_list:
        score = Levenshtein.ratio(receipt_string, name)
        if score > best_score:
            best_score = score
            best_match = name

    if best_score >= threshold:
        return best_match, best_score
    else:
        return None, None

# Find the best match for each shorthand name and filter results by threshold

receipt_matches = []

for receipt_string in receipt_strings:
    best_match, best_score = find_best_match(receipt_string[1], products_list, threshold)
    if best_match is not None:
        receipt_matches.append((receipt_string, best_match, best_score))

# Print the results
for receipt_string, best_match, best_score in receipt_matches:
    print(f"Best match for '{receipt_string[1]}': {best_match} (score: {best_score})")

Best match for '@IIndependent': Jade pearl rice ramen noodles (score: 0.33333333333333337)
Best match for 'onoa': name (score: 0.5)
Best match for 'WeIcone': name (score: 0.36363636363636365)
Best match for 'SCHR GF HAMB BUN': Schar GF hotdog buns (score: 0.33333333333333337)
Best match for '0,970 kg Gross': Gluten Free Oreos (score: 0.32258064516129037)
Best match for '~0.010 kg Tare': Thorn golden flax bread (score: 0.32432432432432434)
Best match for 'TotaL': Potato Starch (score: 0.33333333333333337)
Best match for 'Trans _': Prana cashews gf (score: 0.4347826086956522)
Best match for 'Account:': Schar pizza crust (score: 0.31999999999999995)
Best match for 'Card Type:': Fody pasta sauce (score: 0.3076923076923077)
Best match for 'Card Hunber:': schar gf buns (score: 0.4)
Best match for 'Dateline:': EatWell Flings Buffalo (score: 0.3870967741935484)
Best match for 'Retain this copy for statenent': Retta chicken dogs (score: 0.45833333333333337)
Best match for 'validatjon': Load mul

In [66]:
# levenstein is not good at all, let's try fuzzywuzzy

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

threshold=60

def find_best_match(shorthand_name, full_product_names, threshold=threshold):
    best_match, best_score = process.extractOne(shorthand_name, full_product_names, scorer=fuzz.token_set_ratio)
    if best_score >= threshold:
        # avoid strict substring matches
        if len(shorthand_name)/len(best_match) > 0.4:
            return best_match, best_score
        else:
            return None, None
    else:
        return None, None

# Find the best match for each shorthand name and filter results by threshold
results = []

for receipt_string in receipt_strings:
    best_match, best_score = find_best_match(receipt_string[1], products_list, threshold)
    if best_match is not None:
        results.append((receipt_string, best_match, best_score))

# Print the results
for receipt_string, best_match, best_score in results:
    print(f"Best match for '{receipt_string[1]}': {best_match} with a score of {best_score}, with a length ratio of {len(receipt_string[1])/len(best_match)}")

Best match for 'SCHR GF HAMB BUN': Schar GF hamburger buns with a score of 82, with a length ratio of 0.6956521739130435
Best match for 'GF  BRIOCHE HAM': Brioche Hot Dog and Hamburger Buns with a score of 67, with a length ratio of 0.4411764705882353


In [102]:
# lets grab the prices

# return index of best_match
def get_index(best_match, receipt_strings):
    for index, sub_array in enumerate(receipt_strings):
        if len(sub_array) > 1 and sub_array[1] == best_match:
            return index
    return -1  # Return -1 if no match is found

def find_next_two_values(index, receipt_strings):
    if index == -1 or index + 2 >= len(receipt_strings):
        return []  # Return an empty list if there are not enough elements
    result = []
    for i in range(1, 3):
        if len(receipt_strings[index + i]) > 1:
            result.append(receipt_strings[index + i][1])
    return result

def extract_price(value):
    # Remove spaces and commas from the string
    cleaned_value = value.replace(" ", "").replace(",", ".")
    # print(cleaned_value)
    try:
        price = float(cleaned_value)
        return price
    except ValueError:
        return None

def price_search(values):
    cleaned_values = [extract_price(value) for value in values] 
    cleaned_values = [value for value in cleaned_values if value is not None]
    # print(f'clean values: {cleaned_values}')
    if cleaned_values:
        return cleaned_values[0]
    else:
        return None 

def find_price(receipt_string, receipt_strings):
    target_index = get_index(receipt_string, receipt_strings)
    # print(target_index)
    next_values = find_next_two_values(target_index, receipt_strings)
    # print(f'next values: {next_values}')
    price = price_search(next_values)
    return price

# product = 'SCHR GF HAMB BUN'
product = 'GF  BRIOCHE HAM'
print(f'price of {product} is {find_price(product, receipt_strings)}')

price of GF  BRIOCHE HAM is 7.99


In [107]:
# lets put it all together
for receipt_string, a, b in results:
    product = receipt_string[1]
    price = find_price(product, receipt_strings)
    print(f'{product} -- {price}')

SCHR GF HAMB BUN -- 8.49
GF  BRIOCHE HAM -- 7.99
