In [55]:
%pip install -q \
    easyocr \
    python-Levenshtein \
    fuzzywuzzy \
    asyncio \
    prisma

Note: you may need to restart the kernel to use updated packages.


In [13]:
import csv
import easyocr
# import Levenshtein

In [14]:
reader = easyocr.Reader(['en']) # this needs to run only once to load the model into memory
# result = reader.readtext('IMG_4549.jpeg')

In [15]:
receipt_strings_test = reader.readtext('./test-images/IMG_4549.jpeg')
receipt_strings_test

[([[np.int32(182), np.int32(174)],
   [np.int32(872), np.int32(174)],
   [np.int32(872), np.int32(319)],
   [np.int32(182), np.int32(319)]],
  '@IIndependent',
  np.float64(0.8128134623300763)),
 ([[np.int32(669), np.int32(265)],
   [np.int32(740), np.int32(265)],
   [np.int32(740), np.int32(293)],
   [np.int32(669), np.int32(293)]],
  'onoa',
  np.float64(0.20261910557746887)),
 ([[np.int32(74), np.int32(336)],
   [np.int32(416), np.int32(336)],
   [np.int32(416), np.int32(378)],
   [np.int32(74), np.int32(378)]],
  "JONSSOH 'S VIG ALHONTE",
  np.float64(0.31761375759226756)),
 ([[np.int32(76), np.int32(369)],
   [np.int32(462), np.int32(369)],
   [np.int32(462), np.int32(414)],
   [np.int32(76), np.int32(414)]],
  '4-401  OTTAWA ST , ALHONTE',
  np.float64(0.2500778113248592)),
 ([[np.int32(74), np.int32(406)],
   [np.int32(195), np.int32(406)],
   [np.int32(195), np.int32(445)],
   [np.int32(74), np.int32(445)]],
  'WeIcone',
  np.float64(0.7270426119236494)),
 ([[np.int32(205), np.

In [16]:
# i could select areas of interest in a UI and send only these
# crop_test = reader.readtext('crop-test.png')
# crop_test

In [17]:
# import product list from csv file
# i had this list from previous years of manually entering receipts

products_list = []
with open('product_list.csv', newline='') as f:
    csvreader = csv.reader(f)
    next(csvreader) # skip header row
    for row in csvreader:
        products_list.append(row[0])
# products_list

In [18]:
# # find gluten free products using levenstein distance

# threshold = 0.3

# # Function to find the best match
# def find_best_match(receipt_string, products_list, threshold=threshold):
#     best_match = None
#     best_score = float('-inf')  # Initialize to negative infinity

#     for name in products_list:
#         score = Levenshtein.ratio(receipt_string, name)
#         if score > best_score:
#             best_score = score
#             best_match = name

#     if best_score >= threshold:
#         return best_match, best_score
#     else:
#         return None, None

# # Find the best match for each shorthand name and filter results by threshold

# receipt_matches = []

# for receipt_string in receipt_strings:
#     best_match, best_score = find_best_match(receipt_string[1], products_list, threshold)
#     if best_match is not None:
#         receipt_matches.append((receipt_string, best_match, best_score))

# # Print the results
# for receipt_string, best_match, best_score in receipt_matches:
#     print(f"Best match for '{receipt_string[1]}': {best_match} (score: {best_score})")

In [19]:
# levenstein is not good at all, let's try fuzzywuzzy

from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import re

def find_best_match(shorthand_name, full_product_names, threshold):
    best_match, best_score = process.extractOne(shorthand_name, full_product_names, scorer=fuzz.token_set_ratio)
    if best_score >= threshold:
        # avoid strict substring matches
        if len(shorthand_name)/len(best_match) > 0.4:
            return best_match, best_score
        else:
            return None, None
    else:
        return None, None

def pre_fuzzy(search_query):
    # regex out 1 to 10 digits in the beginning of the string for sku codes in store receipts
    search_query = re.sub(r'^\d{1,10}', '', search_query)
    return search_query

# Find the best match for each shorthand name and filter results by threshold
def find_glutenfree(receipt_strings, products_list, threshold):
    results = []

    for receipt_string in receipt_strings:
        search_query = pre_fuzzy(receipt_string[1])
        # print(search_query)
        # if search_query contains at least one digit or letter
        if any(char.isalnum() for char in search_query):
        # if len(search_query) > 2:           
            best_match, best_score = find_best_match(search_query, products_list, threshold)
        else:
            best_match, best_score = None, None
        if best_match is not None:
            results.append((receipt_string, best_match, best_score))
    return results

# Print the results
# glutenfree_matches = find_glutenfree(receipt_strings_test, products_list, threshold)
# for receipt_string, best_match, best_score in glutenfree_matches:
#     print(f"Best match for '{receipt_string[1]}': {best_match} with a score of {best_score}, with a length ratio of {len(receipt_string[1])/len(best_match)}")

In [56]:
# lets grab the prices

# return index of best_match
def get_index(best_match, receipt_strings):
    for index, sub_array in enumerate(receipt_strings):
        if len(sub_array) > 1 and sub_array[1] == best_match:
            return index
    return -1  # Return -1 if no match is found

def find_next_n_values(index, receipt_strings, n=10):
    # cap n at length of receipt_strings
    if index + n >= len(receipt_strings):
        n = len(receipt_strings) - index
    # Return an empty list if there are not enough elements
    if index == -1 or index + n >= len(receipt_strings):
        return []
    # Return the next n values
    result = []
    for i in range(1, n):
        if len(receipt_strings[index + i]) > 1:
            result.append(receipt_strings[index + i][1])
    return result

def extract_price(value):
    # Remove spaces and commas from the string
    cleaned_value = value.replace(" ","").replace(",", ".")
    # print(cleaned_value)
    return cleaned_value

def price_search(values):
    # print(values)
    cleaned_values = [extract_price(value) for value in values]
    # print(cleaned_values)
    # keep only values containing a dot
    cleaned_values = [value for value in cleaned_values if '.' in str(value)]
    cleaned_values = [value for value in cleaned_values if value is not None]
    # print(f'clean values: {cleaned_values}')

    if cleaned_values:
        # return cleaned_values[0]
        try:
            # regex out any character that isn't a digit or a dot
            price = re.sub(r'[^\d.]', '', cleaned_values[0])
            price = float(price)
            return price
        except ValueError:
            return None
    else:
        return None
        # return ','.join(values) # for testing

def find_price(receipt_string, receipt_strings):
    target_index = get_index(receipt_string, receipt_strings)
    # print(target_index)
    next_values = find_next_n_values(target_index, receipt_strings, n=10)
    # print(f'next values: {next_values}')
    price = price_search(next_values)
    return price

# product = 'SCHR GF HAMB BUN'
# product = 'GF  BRIOCHE HAM'
# print(f'price of {product} is {find_price(product, receipt_strings)}')

In [57]:
# lets put it all together
import pandas as pd

def make_table(glutenfree_matches, receipt_strings, filename):
    results = []
    for receipt_string, best_match, best_score in glutenfree_matches:
        product = receipt_string[1]
        price = find_price(product, receipt_strings)
        results.append((filename, product, best_match, price))
        # print(f'{filename} -- "{product}" ({best_match}) @ ${price}')
        # add to new panda dataframe
    return results

# make_table(glutenfree_matches)

In [58]:
import os

def list_files(directory, extension):
    return list(f for f in os.listdir(directory) if f.endswith('.' + extension))

# new panda df
expenses_df = pd.DataFrame(columns=['filename', 'product', 'best_match', 'price'])
receipts_df = pd.DataFrame(columns=['box', 'text', 'confidence'])

for filename in list_files('./receipts', 'jpg'):
    receipt_strings = reader.readtext('./receipts/'+filename)
    print(f'processing {filename}...')
    if len(receipt_strings) > 0:
        receipt_strings_df = pd.DataFrame(receipt_strings, columns=['box', 'text', 'confidence'])
        receipts_df = pd.concat([receipts_df, receipt_strings_df])

    receipt_data = make_table(find_glutenfree(receipt_strings, products_list, threshold=80), receipt_strings, filename)
    if len(receipt_data) > 0:
        receipt_df = pd.DataFrame(receipt_data, columns=['filename', 'product', 'best_match', 'price'])
        expenses_df = pd.concat([expenses_df, receipt_df])

processing img_4661.jpg...


  receipts_df = pd.concat([receipts_df, receipt_strings_df])


processing img_4649.jpg...
processing img_4648.jpg...
processing img_4660.jpg...
processing img_4658.jpg...
processing img_4659.jpg...
processing img_4588.jpg...
processing img_4577.jpg...
processing img_4601.jpg...


  expenses_df = pd.concat([expenses_df, receipt_df])


processing img_4615.jpg...
processing img_4629.jpg...
processing img_4628.jpg...
processing img_4614.jpg...
processing img_4600.jpg...
processing img_4576.jpg...
processing img_4589.jpg...
processing IMG_4560.jpg...
processing img_4574.jpg...
processing img_4616.jpg...
processing img_4602.jpg...
processing img_4603.jpg...
processing img_4617.jpg...
processing img_4575.jpg...
processing IMG_4561.jpg...
processing IMG_4559.jpg...
processing img_4571.jpg...
processing img_4613.jpg...
processing img_4607.jpg...
processing img_4612.jpg...
processing img_4570.jpg...
processing IMG_4558.jpg...
processing img_4599.jpg...
processing img_4572.jpg...
processing img_4638.jpg...
processing img_4604.jpg...
processing img_4610.jpg...
processing img_4611.jpg...
processing img_4605.jpg...
processing img_4639.jpg...
processing img_4573.jpg...
processing img_4598.jpg...
processing img_4595.jpg...
processing img_4581.jpg...
processing IMG_4556.jpg...
processing img_4620.jpg...
processing img_4634.jpg...
p

In [59]:
expenses_df.to_csv('expenses.csv', index=False)
expenses_df

Unnamed: 0,filename,product,best_match,price
0,img_4601.jpg,Mac & Cheese,Mac & Cheese,512.99
1,img_4601.jpg,Bacon Mac & Cheese,Mac & Cheese,54.25
0,img_4576.jpg,19554 KINN ENGLISH MUFFIN TAPIOCA,Kinn english muffin tapioca,6.35
1,img_4576.jpg,22143 PRANA CASHEWS,Prana cashews gf,13.95
2,img_4576.jpg,18184 FODY TOMATO BASIL SAUCE,Fody Premium Tomato Basil Sauce - 547 mL,9.95
...,...,...,...,...
5,img_4644.jpg,22143 PRANA CASHEWS,Prana cashews gf,13.95
6,img_4644.jpg,28289 ENJOY SEMI-SWEET MEGA CHOC CH,Enjoy Life Mega Chocolate Chunks Semi-Sweet - ...,8.45
7,img_4644.jpg,20236 MAISON ORP OLIVE OIL DELICATE,Maison Orphee Olive Oil Extra Virgin Delicate ...,25.95
8,img_4644.jpg,"29114 LUCKY THAI SPRING ROLLS GF"" 24",Lucky Spring Rolls GF,8.65


In [60]:
receipts_df.to_csv('receipts.csv', index=False)
receipts_df

Unnamed: 0,box,text,confidence
0,"[[631, 269], [806, 269], [806, 341], [631, 341]]",2_3,0.040573
1,"[[116, 263], [821, 263], [821, 475], [116, 475]]",I~~fs (pke =,0.124128
2,"[[405, 436], [481, 436], [481, 488], [405, 488]]","3 """,0.019708
3,"[[120, 447], [317, 447], [317, 609], [120, 609]]",Lec|,0.570547
4,"[[656, 493], [900, 493], [900, 565], [656, 565]]",'!,0.086648
...,...,...,...
184,"[[464, 3867], [733, 3867], [733, 3919], [464, ...",70104-2707 RTo0o1,0.528192
185,"[[55, 3892], [501, 3892], [501, 3945], [55, 39...",THANK You FOR SHOPPING AT 'YIG,0.403911
186,"[[53, 3931], [259, 3931], [259, 3971], [53, 39...",STORE MANAGER,0.388632
187,"[[291, 3929], [532, 3929], [532, 3978], [291, ...",SPENCER JONSSON,0.359595
