In [99]:
# Install the third-party packages this notebook imports (-q keeps pip's own output quiet).
%pip install -q \
    fuzzywuzzy \
    easyocr \
    pandas \
    requests

# Verbosity switch read by writeToEndpoint: when True, successful API writes are printed as well as failures.
debug = False


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [100]:
import requests

def writeToEndpoint(endpoint, json, base_url='http://localhost:3000/api', timeout=30):
    """POST a JSON payload to one endpoint of the receipts API.

    Args:
        endpoint: Path segment appended to ``base_url`` (e.g. ``'receipt'``).
        json: JSON-serialisable payload sent as the request body.
        base_url: Root URL of the API; defaults to the local dev server.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error, so a stalled server cannot hang the notebook.

    Returns:
        The ``requests.Response`` object, whether or not the write succeeded.
        Failures are printed, not raised, so callers decide how to react.
    """
    url = f'{base_url}/{endpoint}'
    response = requests.post(url, json=json, timeout=timeout)
    # Any 2xx status is a successful write (a create endpoint may answer 201, not only 200).
    if 200 <= response.status_code < 300:
        if debug:
            print(f'Success writing {endpoint}. Response:\n{response.text}')
    else:
        # Failures are always reported, independent of the debug flag.
        print(f'\nError writing {endpoint}. \nRequest:{json}\nResponse:\n{response.text}')
    return response


In [101]:
# seed reference items using api

# import from csv
import pandas as pd

referenceitems_df = pd.read_csv('referenceitems.csv')

# Post every CSV row to the API and remember the id the API assigned to it.
# Ids are collected in a plain list (not written cell by cell) so they never
# pass through a float/NaN column and an empty CSV still yields an 'id' column.
new_referenceitem_ids = []
for index, row in referenceitems_df.iterrows():
    response = writeToEndpoint('referenceItem', {
        "name": row['name'],
        "quantity": row['quantity'],
        "unitOfMeasure": row['unitofmeasure'],
        "price": row['price'],
        "pricePerWeight": row['priceperweight'],
        "referenceUrl": row['referenceurl']
    })
    new_referenceitem_ids.append(int(response.json()['id']))

# Re-index the frame by the API-assigned id so later cells can look items up by it.
referenceitems_df['id'] = pd.Series(new_referenceitem_ids, index=referenceitems_df.index, dtype=int)
referenceitems_df.set_index('id', inplace=True)

# test
# writeToEndpoint('referenceItem', {
#     "name": "Sample item",
#     "quantity": 10,
#     "unitOfMeasure": "g",
#     "price": 100,
#     "pricePerWeight": 10,
#     "referenceUrl": "test.com"
# })

# import from queryclassification.csv to panda dataframe
queryclassification_df = pd.read_csv('queryclassification.csv')
# Use the 0-based row position as an explicit 'id' index.
queryclassification_df.reset_index(drop=True, inplace=True)
queryclassification_df['id'] = queryclassification_df.index
queryclassification_df.set_index('id', inplace=True)

# Copy of the reference items with the id index also exposed as a column,
# so a name -> id dictionary can be derived from it.
temp_referenceItem_map = referenceitems_df.copy()
temp_referenceItem_map['id'] = temp_referenceItem_map.index

# Translate each query's reference item name into the id the API assigned to that item.
queryclassification_df['referenceItemId'] = queryclassification_df['referenceItem'].map(temp_referenceItem_map.set_index('name')['id'].to_dict())
try:
    queryclassification_df['referenceItemId'] = queryclassification_df['referenceItemId'].astype(int)
except (ValueError, TypeError):
    # Names without a matching reference item map to NaN, which cannot be cast
    # to int; show the offending rows. Only conversion errors are caught so
    # interrupts and unrelated failures still surface.
    print('Failed to convert referenceItemId to int')
    print(queryclassification_df[queryclassification_df['referenceItemId'].isnull()])
queryclassification_df.drop(columns=['referenceItem'], inplace=True)
queryclassification_df

Unnamed: 0_level_0,query,referenceItemId
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3x KIND Oats & Honey Granola with Toasted Coco...,14
1,Annie's Homegrown Gluten-Free Rice Pasta & Che...,19
2,Annie's Homegrown Gluten-Free Rice Pasta White...,19
3,Annies rice pasta and white cheddar,19
4,Baldstreet all beef frankfurters,40
...,...,...
169,Mac & Cheese,19
170,Caramel Chocolate Chip Cookie Bar,34
171,Purest White Bread,32
172,Chicken Brast Strips EatWell,35


In [102]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import re
import easyocr
import pandas as pd
import shutil
import numpy as np
import os

# Shared English OCR reader used by ocrImage; created once here rather than on every call.
reader = easyocr.Reader(['en']) # this needs to run only once to load the model into memory

def ocrImage(image_id, image_path):
    """Run OCR on a local image and store every detected string through the API.

    Each detection produced by the module-level ``reader`` is posted to the
    ``receiptText`` endpoint together with its bounding box and the id of the
    image it was read from. The OCR confidence value is discarded.

    Args:
        image_id: Id of the image file record the texts belong to.
        image_path: Path of the image handed to the OCR reader.

    Returns:
        A list holding the decoded JSON body of each API response, in
        detection order, or ``None`` when the OCR finds no text at all.
    """
    detections = reader.readtext(image_path)

    # Nothing recognised: callers test for None.
    if not detections:
        return None

    def post_detection(bounding_box, text):
        # Cast every coordinate to a plain int so the box is sent as a string
        # of integer pairs.
        int_box = [[int(coordinate) for coordinate in corner] for corner in bounding_box]
        payload = {
            'text': str(text),
            'boundingBox': str(int_box),
            'imageFileId': int(image_id)
        }
        return writeToEndpoint('receiptText', payload).json()

    return [post_detection(bounding_box, text) for bounding_box, text, _confidence in detections]

# Fuzzy-matching candidates: queryclassification_df index value -> query string.
choices_dict = queryclassification_df['query'].to_dict()

def parseReceiptTextsForEligibleExpenses(receipt_texts, receipt_id, threshold=80, min_length_ratio=0.4):
    """Fuzzy-match OCR'd receipt lines against known queries and post expenses.

    Every receipt text is cleaned and compared with the module-level
    ``choices_dict`` using ``fuzz.token_set_ratio``. When the best candidate is
    good enough, an ``expense`` record linking the receipt text to that
    candidate's reference item is written through the API.

    Args:
        receipt_texts: Iterable of dicts with at least ``'id'`` and ``'text'``.
        receipt_id: Id of the receipt the created expenses are attached to.
        threshold: Minimum fuzzy score (inclusive) for a match to count.
        min_length_ratio: Minimum ``len(cleaned text) / len(matched query)``;
            shorter texts are rejected because they would only match as a
            small fragment of the query.

    Returns:
        None. Expenses are created as a side effect.
    """
    def clean_text(text):
        """Strip up to 10 leading digits; return '' when the rest is not worth matching."""
        text = re.sub(r'^\d{1,10}', '', text)
        if len(text) <= 2:
            return ''
        if not any(char.isalnum() for char in text):
            return ''
        return text

    for receipt_text in receipt_texts:
        receipt_text_id = receipt_text['id']
        # Clean exactly once: a second pass would strip a further run of
        # digits and could discard a text the first pass had kept.
        clean_receipt_text = clean_text(receipt_text['text'])
        if clean_receipt_text == '':
            continue

        best = process.extractOne(clean_receipt_text, choices_dict, scorer=fuzz.token_set_ratio)
        if best is None:
            # extractOne yields None when there are no candidates to compare against
            continue
        best_match, best_score, best_match_id = best

        if best_score < threshold:
            continue
        # avoid strict substring matches
        if len(clean_receipt_text) / len(best_match) < min_length_ratio:
            continue

        # found a decent match: get the corresponding referenceItem
        reference_item_id = int(queryclassification_df.loc[best_match_id, 'referenceItemId'])
        # create eligible expense
        writeToEndpoint('expense', {
                'receiptTextId': receipt_text_id,
                'referenceItemId': reference_item_id,
                'receiptId': receipt_id,
                'priceEach': 0, # TODO: find price
                'quantity': 1, # TODO: find quantity
            }
        )

def upload_image(image_path, receipt_id, destination_folder="../receipts-app/public/uploads/"):
    """Copy an image into the frontend uploads folder and register it via the API.

    This is a fake upload: the file is only copied on the local file system.

    Args:
        image_path: Path of the image file to copy.
        receipt_id: Id of the receipt the image belongs to.
        destination_folder: Folder the image is copied into. It is also used,
            unchanged, as the prefix of the stored ``url``.

    Returns:
        The response of the ``imageFile`` API call.
    """
    # fake upload, i.e. copy to frontend folder
    shutil.copy(image_path, destination_folder)
    # basename handles the platform's path separator, unlike splitting on "/"
    image_url = destination_folder + os.path.basename(image_path)
    response = writeToEndpoint('imageFile', {
        "url": image_url,
        "receiptId": receipt_id,
        })
    return response
# test
# upload_image('./receipts/IMG_4553.jpg', receipt_id)

# TODO: batch multiple images in array
def main_single_receipt(image_path):
    """Process one receipt image end to end.

    Creates a receipt record, registers the image for it, OCRs the local file
    and, when any text was found, looks for eligible expenses in that text.

    Args:
        image_path: Path of the receipt image on the local file system.

    Returns:
        None in every case.
    """
    new_receipt = writeToEndpoint('receipt', {}).json()
    receipt_id = new_receipt['id']

    image_file = upload_image(image_path, receipt_id).json()
    # for notebooks only, using local file
    receiptTexts = ocrImage(image_file['id'], image_path)

    # '\r' makes the next receipt's line overwrite this one
    print(f'Parsed receipt {receipt_id}: {receiptTexts}', end='\r')

    # ocrImage returns None when the image contained no readable text
    if receiptTexts is not None:
        parseReceiptTextsForEligibleExpenses(receiptTexts, receipt_id)
    return None

# main_single_receipt('./receipts/IMG_4553.jpg')


In [103]:
def list_files(directory, extension):
    """Return the names in ``directory`` that end with ``.extension``, sorted.

    Args:
        directory: Folder to scan (not recursive).
        extension: File extension without the leading dot, e.g. ``'jpg'``.
            Matching is case-sensitive.

    Returns:
        A list of bare entry names (no directory prefix) in sorted order.
        ``os.listdir`` gives no ordering guarantee, so sorting keeps the
        processing order stable from run to run.
    """
    suffix = '.' + extension
    return sorted(name for name in os.listdir(directory) if name.endswith(suffix))

# Names of all .jpg files in ./receipts (bare names, no directory prefix).
all_files = list_files('./receipts', 'jpg')

# Run the full pipeline once per image; each call creates its own receipt record.
for file in all_files:
    main_single_receipt(f'./receipts/{file}')

Parsed receipt 102: [{'id': 10947, 'text': 'uindependent', 'boundingBox': '[[195, 184], [831, 184], [831, 317], [195, 317]]', 'imageFileId': 102, 'createdAt': '2024-08-02T02:45:13.102Z', 'updatedAt': '2024-08-02T02:45:13.102Z'}, {'id': 10948, 'text': 'nden', 'boundingBox': '[[590, 280], [645, 280], [645, 298], [590, 298]]', 'imageFileId': 102, 'createdAt': '2024-08-02T02:45:13.106Z', 'updatedAt': '2024-08-02T02:45:13.106Z'}, {'id': 10949, 'text': 'Groce', 'boundingBox': '[[644, 275], [710, 275], [710, 304], [644, 304]]', 'imageFileId': 102, 'createdAt': '2024-08-02T02:45:13.111Z', 'updatedAt': '2024-08-02T02:45:13.111Z'}, {'id': 10950, 'text': "JONSSON 'S YIG ALHONTE", 'boundingBox': '[[84, 328], [406, 328], [406, 372], [84, 372]]', 'imageFileId': 102, 'createdAt': '2024-08-02T02:45:13.116Z', 'updatedAt': '2024-08-02T02:45:13.116Z'}, {'id': 10951, 'text': '4-401 OttAvA ST , ALMONTE', 'boundingBox': '[[81, 360], [447, 360], [447, 403], [81, 403]]', 'imageFileId': 102, 'createdAt': '2024