In [61]:
%pip install -q \
    easyocr \
    python-Levenshtein \
    fuzzywuzzy

Note: you may need to restart the kernel to use updated packages.


In [62]:
import csv
import easyocr
import pandas as pd
import os
import time # not super important, just for timing of how long it takes to run OCR on all files
# import Levenshtein

In [63]:
reader = easyocr.Reader(['en']) # this needs to run only once to load the model into memory
# result = reader.readtext('IMG_4549.jpeg')

In [78]:
# receipt_strings_test = reader.readtext('./test-images/IMG_4549.jpeg')
# receipt_strings_test

In [65]:
# i could select areas of interest in a UI and send only these
# crop_test = reader.readtext('crop-test.png')
# crop_test

In [66]:
# import inputs

referenceitems = pd.read_csv('referenceitems.csv')
referenceitems # these are the items against which the price difference is calculated

queryclassification = pd.read_csv('queryclassification.csv')
queryclassification # these are strings with names of products and their corresponding reference item from previous years manually entering data

# TODO: need to tie the queryclassification to the referenceitems (on import to database)

# quick and dirty, add classification items to referenceitems in so they are searchable too
searchtable = queryclassification
temp_add_referenceitems = referenceitems.copy()
temp_add_referenceitems = temp_add_referenceitems[['name']]
temp_add_referenceitems.columns = ['query']
temp_add_referenceitems['referenceitem'] = temp_add_referenceitems['query']
searchtable = pd.concat([searchtable, temp_add_referenceitems], axis=0)
searchtable = searchtable.drop_duplicates()
searchtable = searchtable.reset_index(drop=True)
searchtable

Unnamed: 0,query,referenceitem
0,3x KIND Oats & Honey Granola with Toasted Coco...,Granola
1,Annie's Homegrown Gluten-Free Rice Pasta & Che...,Mac and cheese
2,Annie's Homegrown Gluten-Free Rice Pasta White...,Mac and cheese
3,Annies rice pasta and white cheddar,Mac and cheese
4,Baldstreet all beef frankfurters,Hot dogs saussages
...,...,...
207,Spring Rolls,Spring Rolls
208,Waffles,Waffles
209,Nuts,Nuts
210,Hot dogs saussages,Hot dogs saussages


In [67]:
# # find gluten free products using levenstein distance

# threshold = 0.3

# # Function to find the best match
# def find_best_match(receipt_string, products_list, threshold=threshold):
#     best_match = None
#     best_score = float('-inf')  # Initialize to negative infinity

#     for name in products_list:
#         score = Levenshtein.ratio(receipt_string, name)
#         if score > best_score:
#             best_score = score
#             best_match = name

#     if best_score >= threshold:
#         return best_match, best_score
#     else:
#         return None, None

# # Find the best match for each shorthand name and filter results by threshold

# receipt_matches = []

# for receipt_string in receipt_strings:
#     best_match, best_score = find_best_match(receipt_string[1], products_list, threshold)
#     if best_match is not None:
#         receipt_matches.append((receipt_string, best_match, best_score))

# # Print the results
# for receipt_string, best_match, best_score in receipt_matches:
#     print(f"Best match for '{receipt_string[1]}': {best_match} (score: {best_score})")

In [68]:
# levenstein is not good at all, let's try fuzzywuzzy

from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import re

def find_best_match(shorthand_name, full_product_names, threshold):
    best_match, best_score = process.extractOne(shorthand_name, full_product_names, scorer=fuzz.token_set_ratio)
    if best_score >= threshold:
        # avoid strict substring matches
        if len(shorthand_name)/len(best_match) > 0.4:
            return best_match, best_score
        else:
            return None, None
    else:
        return None, None

def pre_fuzzy(search_query):
    # regex out 1 to 10 digits in the beginning of the string for sku codes in store receipts
    search_query = re.sub(r'^\d{1,10}', '', search_query)
    return search_query

# Find the best match for each shorthand name and filter results by threshold
def find_glutenfree(receipt_strings, products_list, threshold):
    results = []

    for receipt_string in receipt_strings:
        search_query = pre_fuzzy(receipt_string[1])
        # print(search_query)
        # if search_query contains at least one digit or letter
        if any(char.isalnum() for char in search_query):
        # if len(search_query) > 2:           
            best_match, best_score = find_best_match(search_query, products_list, threshold)
        else:
            best_match, best_score = None, None
        if best_match is not None:
            results.append((receipt_string, best_match, best_score))
    return results

# Print the results
# glutenfree_matches = find_glutenfree(receipt_strings_test, products_list, threshold)
# for receipt_string, best_match, best_score in glutenfree_matches:
#     print(f"Best match for '{receipt_string[1]}': {best_match} with a score of {best_score}, with a length ratio of {len(receipt_string[1])/len(best_match)}")

In [69]:
# lets grab the prices

# return index of best_match
def get_index(best_match, receipt_strings):
    for index, sub_array in enumerate(receipt_strings):
        if len(sub_array) > 1 and sub_array[1] == best_match:
            return index
    return -1  # Return -1 if no match is found

def find_next_n_values(index, receipt_strings, n=10):
    # cap n at length of receipt_strings
    if index + n >= len(receipt_strings):
        n = len(receipt_strings) - index
    # Return an empty list if there are not enough elements
    if index == -1 or index + n >= len(receipt_strings):
        return []
    # Return the next n values
    result = []
    for i in range(1, n):
        if len(receipt_strings[index + i]) > 1:
            result.append(receipt_strings[index + i][1])
    return result

def extract_price(value):
    # Remove spaces and commas from the string
    cleaned_value = value.replace(" ","").replace(",", ".")
    # print(cleaned_value)
    return cleaned_value

def price_search(values):
    # print(values)
    cleaned_values = [extract_price(value) for value in values]
    # print(cleaned_values)
    # keep only values containing a dot
    cleaned_values = [value for value in cleaned_values if '.' in str(value)]
    cleaned_values = [value for value in cleaned_values if value is not None]
    # print(f'clean values: {cleaned_values}')

    if cleaned_values:
        # return cleaned_values[0]
        try:
            # regex out any character that isn't a digit or a dot
            price = re.sub(r'[^\d.]', '', cleaned_values[0])
            price = float(price)
            return price
        except ValueError:
            return None
    else:
        return None
        # return ','.join(values) # for testing

def find_price(receipt_string, receipt_strings):
    target_index = get_index(receipt_string, receipt_strings)
    # print(target_index)
    next_values = find_next_n_values(target_index, receipt_strings, n=10)
    # print(f'next values: {next_values}')
    price = price_search(next_values)
    return price

# product = 'SCHR GF HAMB BUN'
# product = 'GF  BRIOCHE HAM'
# print(f'price of {product} is {find_price(product, receipt_strings)}')

In [70]:
# lets put it all together
import pandas as pd

def make_table(glutenfree_matches, receipt_strings, filename):
    results = []
    for receipt_string, best_match, best_score in glutenfree_matches:
        product = receipt_string[1]
        price = find_price(product, receipt_strings)
        # look up reference item in referenceitems that matches best_match
        referenceitem = searchtable[searchtable['query'] == best_match]
        if not referenceitem.empty:
            referenceitem = referenceitem['referenceitem'].values[0]
        else:
            referenceitem = None
        results.append((filename, product, best_match, referenceitem, price))
        # print(f'{filename} -- "{product}" ({best_match}) @ ${price}')
        # add to new panda dataframe
    return results

# make_table(glutenfree_matches)

In [89]:
# do _ALL THE THINGS_
# the whole thing runs for about 8 minutes on approx 100 receipts on my device

def list_files(directory, extension):
    return list(f for f in os.listdir(directory) if f.endswith('.' + extension))

receipts_df = pd.DataFrame() #[], columns=['box', 'text', 'confidence'])
expenses_df = pd.DataFrame() #[], columns=['filename', 'product', 'best_match', 'referenceitem', 'price'])

all_files = list_files('./receipts', 'jpg')
# for fun, let's time it
track_start_time = time.time()

for i, filename in enumerate(all_files):
    # fancy progress print, great example of difference between \n and \r
    print(f'processing file {i+1} of {len(all_files)} ({filename})', end='\r')
    receipt_strings = reader.readtext('./receipts/'+filename)
    if len(receipt_strings) > 0:
        receipt_strings_df = pd.DataFrame(receipt_strings, columns=['box', 'text', 'confidence'])
        receipt_strings_df['filename'] = filename
        receipts_df = pd.concat([
            receipts_df if not receipts_df.empty else None,
            receipt_strings_df
            ])
        receipt_strings

        receipt_data = make_table(find_glutenfree(receipt_strings, searchtable['query'].tolist(), threshold=80), receipt_strings, filename)
        if len(receipt_data) > 0:
            receipt_df = pd.DataFrame(receipt_data, columns=['filename', 'product', 'best_match', 'referenceitem', 'price'])
            expenses_df = pd.concat([
                expenses_df if not expenses_df.empty else None,
                receipt_df
                ])

# set column names
receipts_df.columns = ['box', 'text', 'confidence', 'filename']
expenses_df.columns = ['filename', 'product', 'best_match', 'referenceitem', 'price']

print(f'\nProcessed {len(all_files)} files in {(time.time() - track_start_time)/60:.2f} minutes')

processing file 7 of 102 (img_4588.jpg)

  expenses_df = pd.concat([


processing file 12 of 102 (img_4628.jpg)

  expenses_df = pd.concat([


processing file 34 of 102 (img_4638.jpg)

  expenses_df = pd.concat([


processing file 52 of 102 (img_4580.jpg)

  expenses_df = pd.concat([


processing file 59 of 102 (img_4623.jpg)

  expenses_df = pd.concat([


processing file 89 of 102 (img_4656.jpg)

  expenses_df = pd.concat([


processing file 94 of 102 (img_4655.jpg)

  expenses_df = pd.concat([


processing file 98 of 102 (img_4650.jpg)

  expenses_df = pd.concat([


Processed 102 files in 8.97 minutes7.jpg)


  expenses_df = pd.concat([


In [88]:
expenses_df

Unnamed: 0,filename,product,best_match,referenceitem,price
0,img_4659.jpg,7JA),Jam,Jam,
0,img_4588.jpg,CHOCOLAT,Chocolate,Chocolate,6.60
0,img_4601.jpg,Price,Rice,Rice,15.00
1,img_4601.jpg,Not So Classic Chocolate,Chocolate,Chocolate,15.00
2,img_4601.jpg,Mac & Cheese,Mac & Cheese,Mac and cheese,512.99
...,...,...,...,...,...
10,img_4644.jpg,15393 AMY BEAN AND RICE BURRITO GF,Rice,Rice,5.25
11,img_4644.jpg,"29114 LUCKY THAI SPRING ROLLS GF"" 24",Lucky Spring Rolls GF,Spring Rolls,8.65
12,img_4644.jpg,28931 LUCKY SPRING ROLLS GF 241G,Lucky Spring Rolls GF,Spring Rolls,9.25
0,img_4650.jpg,Cooki e,Cookies,Cookies,


In [90]:
expenses_df.to_csv('expenses.csv', index=False)
expenses_df

Unnamed: 0,filename,product,best_match,referenceitem,price
0,img_4659.jpg,7JA),Jam,Jam,
0,img_4588.jpg,CHOCOLAT,Chocolate,Chocolate,6.60
0,img_4601.jpg,Price,Rice,Rice,15.00
1,img_4601.jpg,Not So Classic Chocolate,Chocolate,Chocolate,15.00
2,img_4601.jpg,Mac & Cheese,Mac & Cheese,Mac and cheese,512.99
...,...,...,...,...,...
10,img_4644.jpg,15393 AMY BEAN AND RICE BURRITO GF,Rice,Rice,5.25
11,img_4644.jpg,"29114 LUCKY THAI SPRING ROLLS GF"" 24",Lucky Spring Rolls GF,Spring Rolls,8.65
12,img_4644.jpg,28931 LUCKY SPRING ROLLS GF 241G,Lucky Spring Rolls GF,Spring Rolls,9.25
0,img_4650.jpg,Cooki e,Cookies,Cookies,


In [91]:
receipts_df.to_csv('receipts.csv', index=False)
receipts_df

Unnamed: 0,box,text,confidence,filename
0,"[[631, 269], [806, 269], [806, 341], [631, 341]]",2_3,0.040573,img_4661.jpg
1,"[[116, 263], [821, 263], [821, 475], [116, 475]]",I~~fs (pke =,0.124128,img_4661.jpg
2,"[[405, 436], [481, 436], [481, 488], [405, 488]]","3 """,0.019708,img_4661.jpg
3,"[[120, 447], [317, 447], [317, 609], [120, 609]]",Lec|,0.570547,img_4661.jpg
4,"[[656, 493], [900, 493], [900, 565], [656, 565]]",'!,0.086648,img_4661.jpg
...,...,...,...,...
184,"[[464, 3867], [733, 3867], [733, 3919], [464, ...",70104-2707 RTo0o1,0.528192,img_4647.jpg
185,"[[55, 3892], [501, 3892], [501, 3945], [55, 39...",THANK You FOR SHOPPING AT 'YIG,0.403911,img_4647.jpg
186,"[[53, 3931], [259, 3931], [259, 3971], [53, 39...",STORE MANAGER,0.388632,img_4647.jpg
187,"[[291, 3929], [532, 3929], [532, 3978], [291, ...",SPENCER JONSSON,0.359595,img_4647.jpg
