In [59]:
# OCR portion of recipEATS. Converts a (Trader Joe's) receipt image into a list of products.
# 
# Inputs: 
# image of a (Trader Joe's) receipt as a .jpg file.
# 
# Outputs: 
# clean list of products from said receipt.

from PIL import Image
from pytesseract import image_to_string
import requests, re, os                         
from nutritionix import Nutritionix
import pandas as pd
from difflib import SequenceMatcher

In [60]:
# load every entry of ./receipts as a PIL image (order follows os.listdir)
receipt_names = os.listdir('./receipts')
ims = [Image.open('./receipts/' + name) for name in receipt_names]

In [87]:
# run OCR on the receipt at index 1 of ims and lowercase the resulting text
receipt_image = ims[1]
txt = image_to_string(receipt_image).lower()

In [88]:
# split the OCR text by line
# keep only the part of each line to the left of the first digit
#   (drops quantity, dollar amount); raw-string pattern avoids the
#   invalid '\d' escape warning on current Python versions
# trim surrounding whitespace, then drop lines that end up empty so
#   blank / whitespace-only fragments are never sent to the search API
item_list = (re.split(r'\d', line, maxsplit=1)[0].strip() for line in txt.split('\n'))
item_list = [item for item in item_list if item]

In [89]:
# read the Nutritionix credentials from the local key file:
# the app id is the second whitespace-separated token of the first line,
# the API key is the second token of the second line
with open('./keys/nxAPIkey.txt') as f:
    nx_app_id, nx_api_key = [f.readline().split()[1] for _ in range(2)]

# Nutritionix client built from those credentials
nx = Nutritionix(app_id=nx_app_id, api_key=nx_api_key)

In [100]:
def match_items(rough_string_list, threshold=0.60):
    '''
    Refine rough OCR strings by matching them against Nutritionix search hits.

    For every rough string, the module-level Nutritionix client ``nx`` is
    queried and each returned hit's ``item_name`` is scored against the rough
    string with ``difflib.SequenceMatcher`` (Nutritionix has its own
    similarity metric, but the original author found it inconsistent).

    The best-scoring name is yielded -- at most once per rough string -- when
    either of these holds:

    * its score against the rough string is above ``threshold``, or
    * the three best-scoring names are all pairwise similar to each other
      above ``threshold`` (requires at least three hits).

    Parameters:
        rough_string_list: iterable of strings produced by the OCR step.
        threshold: minimum SequenceMatcher ratio (0..1) for a match; also used
            for the pairwise comparison of the top three names.

    Yields:
        The ``item_name`` of the accepted hit for each rough string that
        passes one of the two checks. Rough strings whose response has no
        'hits' key, or an empty 'hits' list, are skipped.
    '''

    def similar(a, b):
        # ratio in [0, 1]; 1.0 means the two strings are identical
        return SequenceMatcher(None, a, b).ratio()

    for rough_string in rough_string_list:

        res = nx.search(rough_string).json()

        try:
            hits = res['hits']
        except KeyError:
            continue

        # an empty result set has no best candidate to inspect
        if not hits:
            continue

        # (score, name) pairs, best score first; ties keep the API's order
        scored = sorted(
            ((similar(rough_string, hit['fields']['item_name']),
              hit['fields']['item_name']) for hit in hits),
            key=lambda pair: pair[0],
            reverse=True,
        )
        best_score, best_name = scored[0]

        if best_score > threshold:
            yield best_name
            # already accepted: skip the consensus check so the same name
            # is not yielded a second time
            continue

        # consensus check needs three candidates
        if len(scored) < 3:
            continue

        first, second, third = (name for _, name in scored[:3])
        if (similar(first, second) > threshold
                and similar(first, third) > threshold
                and similar(second, third) > threshold):
            yield best_name