In [164]:
# OCR portion of recipEATS. Converts an image of a (Trader Joe's) receipt into a clean list of products.
# 
# Inputs: 
# image of a (Trader Joe's) receipt as a .jpg file.
# 
# Outputs: 
# clean list of products from said receipt.

from PIL import Image
from pytesseract import image_to_string    # OCR
import requests
import re
import os                                 
from nutritionix import Nutritionix    # wrapper for nutritionix API
import pandas as pd
from difflib import SequenceMatcher

In [59]:
# open each receipt image
# os.listdir returns entries in arbitrary order and also lists hidden files and
# sub-directories (e.g. .DS_Store, .ipynb_checkpoints), which Image.open cannot
# read. Sort for a reproducible order and keep only regular, non-hidden files.
receipt_dir = './receipts'
ims = [
    Image.open(os.path.join(receipt_dir, name))
    for name in sorted(os.listdir(receipt_dir))
    if not name.startswith('.') and os.path.isfile(os.path.join(receipt_dir, name))
]

In [71]:
# Turn the raw OCR text into candidate product strings:
#   1. split the text into lines
#   2. keep only the part of each line to the left of the first digit
#      (drops the quantity / dollar amount)
#   3. drop lines that end up empty or whitespace-only
# NOTE(review): `txt` is not defined in any visible cell -- presumably the
# image_to_string output for a receipt image; confirm it is set before this cell.
# The pattern is a raw string because a bare '\d' is an invalid escape sequence,
# and the result is a list (not a one-shot generator) so it survives re-use.
first_digit = re.compile(r'\d')
item_list = [first_digit.split(line, maxsplit=1)[0] for line in txt.split('\n')]
item_list = [item for item in item_list if item.strip()]

In [78]:
# Load the Nutritionix API credentials from the local key file.
# The first line supplies the app id and the second the API key; in both cases
# the value is taken as the second whitespace-separated token on the line.
with open('./keys/nxAPIkey.txt') as key_file:
    nx_app_id, nx_api_key = (key_file.readline().split()[1] for _ in range(2))

# client used for all Nutritionix searches
nx = Nutritionix(app_id=nx_app_id, api_key=nx_api_key)

In [166]:
def match_items(rough_string_list, threshold = 0.45, search = None):
    '''
    Refine rough OCR strings by matching them against Nutritionix search results.

    Nutritionix has its own 'similarity' metric, but it's inconsistent, so each
    rough string is additionally compared with difflib.SequenceMatcher against
    the item names returned by the search. The best-scoring name is yielded
    only if its ratio is strictly greater than `threshold`; rough strings with
    no hits, or whose best hit scores too low, yield nothing.

    The default threshold is set relatively low (0.45; the difflib docs suggest
    0.6) because this is a secondary matching step on top of Nutritionix's own
    ranking.

    Parameters
    ----------
    rough_string_list : iterable of str
        Rough product strings, e.g. cleaned OCR lines.
    threshold : float
        Minimum (exclusive) SequenceMatcher ratio for a match to be kept.
    search : callable, optional
        Called as search(rough_string); must return an object whose .json()
        gives a dict with a 'hits' list of {'fields': {'item_name': ...}}
        entries. Defaults to the module-level client's `nx.search`.

    Yields
    ------
    str
        The best-matching item name for each rough string that clears the
        threshold, in input order.
    '''

    if search is None:
        search = nx.search    # module-level Nutritionix client

    for rough_string in rough_string_list:

        hits = search(rough_string).json()['hits']

        # A query with no hits has nothing to score; skip it instead of
        # failing on an empty result set.
        if not hits:
            continue

        names = (hit['fields']['item_name'] for hit in hits)
        best_score, best_name = max(
            ((SequenceMatcher(None, rough_string, name).ratio(), name) for name in names),
            key = lambda scored: scored[0],
        )
        if best_score > threshold:
            yield best_name

In [162]:
# materialise the generator so the cell displays the matched product names
list(match_items(rough_string_list = item_list))

['Kid Burrito',
 'Shredded Wheat',
 'Chicken Soup Dumplings',
 'Chewy Granola Bars, Organic Dark Chocolate Chip',
 'Seedless Raisins',
 'Eggs, Large Brown, One dozen',
 'Shortbread, Raspberry',
 'Macaroni & Cheese, Shells',
 'Turkey Meatballs',
 'Pasta, Penne Rigate',
 'Chicken Fried Rice',
 'Cookie Dough, Chocolate Chunk',
 'Whole Wheat British Muffins',
 'Healthy Whole Grain Bread',
 'Mandarin Orange Chicken',
 'Peanut Butter, Creamy No-Stir',
 'Protein Bar, Almond Bliss',
 'Protein Energy Bar, Triple Chocolate',
 'Protein Energy Bar, Triple Chocolate',
 'Macrobar, Mocha Chocolate Chip',
 'Soft Cream Cheese',
 'Movie Theater Butter Popcorn',
 'Honey Lemon Cough Drop',
 'Orange Peach Mango Juice',
 'Eggs, Extra Large Brown Cage Free',
 '1% Lowfat Milk, Gallon',
 'Gluten Free Hummus, Everything',
 'Chive Cream Cheese - 1 tbsp',
 'Earl Grey Tea',
 'Peppermint Herbal Tea',
 'Mac & Cheese',
 'Hash Browns',
 'Banana',
 'Honey Hedgehog Cookies',
 'Chicken Breast, Curry',
 'Sparkling Apple J