In [1]:
import ingredient_preprocessor as ip
import ingredient_indexing as ingredient_indexing
from food_ranker import *
import food_indexing as food_indexing
import food_preprocessor as fp

In [2]:
ingredient_tokenizer = ip.SplitTokenizer()
ingredient_tokenizer.tokenize("This is a test sentences, with a comma...., chicken breasts")


['this is a test sentence', 'with a comma', 'chicken breast']

In [3]:
stopwords = set()
with open('stopwords.txt', 'r', encoding='utf-8') as file:
    for stopword in file:
        stopwords.add(stopword.strip())
f'Stopwords collected {len(stopwords)}'

'Stopwords collected 610'

In [4]:
dataset_path = 'cleanedRecipes.jsonl'
# stopwords = {'and', 'the', 'or', 'is', 'for'}
text_key = 'NER'
doc_augment_dict = {}
food_preprocessor = fp.RegexTokenizer('/w+')
minimum_word_frequency = 5

In [5]:
# preprocessor = RegexTokenizer('\w+', lowercase=True, multiword_expressions=None)
ingredient_index = ingredient_indexing.Indexer.create_index(ingredient_indexing.IndexType.InvertedIndex, dataset_path='cleanedRecipes.jsonl', document_preprocessor=ingredient_tokenizer, stopwords=stopwords, minimum_word_frequency=0, text_key='NER', max_docs=750000)
print(ingredient_index.get_statistics())

0it [00:00, ?it/s]

2231142it [00:13, 170732.47it/s]
 34%|███▎      | 750000/2231142 [00:22<00:43, 34051.13it/s]


Counter()


 34%|███▎      | 750000/2231142 [00:26<00:51, 28803.88it/s]


defaultdict(<class 'collections.Counter'>, {'index_type': 'BasicInvertedIndex', 'unique_token_count': 45465, 'total_token_count': 5483334, 'number_of_documents': 750000, 'mean_document_length': 7.311112})


In [6]:
preprocessor = fp.RegexTokenizer('\w+', lowercase=True, multiword_expressions=None)

food_index = food_indexing.Indexer.create_index(food_indexing.IndexType.InvertedIndex, dataset_path='cleanedRecipes.jsonl', document_preprocessor=preprocessor, stopwords=stopwords, minimum_word_frequency=15, text_key='directions', max_docs=750000)

2231142it [00:12, 180529.55it/s]
 34%|███▎      | 750000/2231142 [00:18<00:36, 40991.86it/s]




 34%|███▎      | 750000/2231142 [00:17<00:35, 42236.32it/s]


In [9]:
food_index.get_term_metadata('chicken')
ingredient_index.get_term_metadata('chicken')

{'term_freq': 649, 'term_total_count': 662}

In [62]:
topQ = ingredient_index.get_postings('king')
ingredient_index.get_postings('king')

[(10800, 1), (14591, 1), (74649, 1)]

In [34]:
for q in topQ[:2]:
    with open('cleanedRecipes.jsonl') as f:
        for line in (f):
            json_record = json.loads(line)
            if q[0] == json_record['recipeID']:
                print(json_record)
                json_record['ingredients'] = json.loads(json_record['ingredients'])
                json_record['NER'] = json.loads(json_record['NER'])
                print(json_record)
                print(json_record['ingredients'][0])
                # print(json_record['directions'])
               
                # print(json_record['title'])

    print(q)

{'recipeID': 10800, 'title': 'Bread Pudding', 'ingredients': '["1 king size loaf bread (stale)", "2 cans evaporated milk", "2 cans water", "1 1/2 c. sugar (more or less to taste)", "4 large eggs", "1/2 tsp. margarine", "1 tsp. vanilla", "3/4 c. raisins (soak in water first)", "nutmeg", "guava paste* (optional)"]', 'directions': 'Soak bread in milk and water. Add sugar and eggs. Mix well. Add vanilla and raisins. Pour into greased 9 x 13-inch pan. Sprinkle nutmeg on top. Pat pieces of margarine over the pudding. Bake at 350\\u00b0.', 'link': 'www.cookbooks.com/Recipe-Details.aspx?id=381317', 'source': 'Gathered', 'NER': '["king", "milk", "water", "sugar", "eggs", "margarine", "vanilla", "raisins", "nutmeg", "guava paste"]', 'website': 'www.cookbooks.com'}
{'recipeID': 10800, 'title': 'Bread Pudding', 'ingredients': ['1 king size loaf bread (stale)', '2 cans evaporated milk', '2 cans water', '1 1/2 c. sugar (more or less to taste)', '4 large eggs', '1/2 tsp. margarine', '1 tsp. vanilla',

In [34]:
len([word for word in ingredient_index.index.keys() if len(word.split(" ")) > 1])

3894

In [17]:
from importlib import reload
import food_ranker
reload(food_ranker)
from food_ranker import *

In [18]:
ranker = Ranker(food_index, ingredient_index, preprocessor, ingredient_tokenizer, stopwords, BM25)

In [30]:
topq = ranker.query(query_ingr='pie, flour, cream, apples, blueberries', query_freetext='sweet and spicy pie', query_NOT='eggs, pecans, nuts, almonds')[:10]

pie, flour, cream, apples, blueberries sweet and spicy pie
['pie', 'flour', 'cream', 'apples', 'blueberries', 'sweet', 'spicy', 'pie']
egg
pecan
nut
almond


In [31]:
topq

[(425925, 25.34135345376678),
 (8257, 25.210549426069004),
 (408763, 24.203660992767674),
 (293926, 23.88519285061779),
 (124486, 23.37641882805338),
 (227756, 23.06351051048317),
 (190623, 23.011949882834326),
 (522343, 22.993799671156218),
 (568638, 22.937364936462494),
 (233308, 22.77059654701253)]

In [32]:
track = 0
for q in topq:
    track = 0
    with open('cleanedRecipes.jsonl') as f:
        for line in f:
            json_record = json.loads(line)
            if q[0] == json_record['recipeID']:
                # print(json_record)
                json_record['ingredients'] = json.loads(json_record['ingredients'])
                json_record['NER'] = json.loads(json_record['NER'])
                print(json_record)
                print(json_record['ingredients'][0])
                print(json_record['NER'])

            track += 1
            if track == 750000:
                break
                # print(json_record['directions'])
               
                # print(json_record['title'])

    print(q)

{'recipeID': 425925, 'title': 'Sugarless Apple Pie', 'ingredients': ['1 (6 oz.) can concentrated apple juice', '2 Tbsp. cornstarch', '1/2 can or 3 oz. water', '1/2 tsp. apple pie spice', "1 pkg. Sweet 'N Low", '2 c. apple slices'], 'directions': "Mix all ingredients until thick, then slice apples. Put in an uncooked pie crust or you may just pour over apples in a pie pan and cook. You can sprinkle Sweet 'N Low over apples and mixture if you want. Bake for 45 minutes at 350\\u00b0. Makes 1 pie.", 'link': 'www.cookbooks.com/Recipe-Details.aspx?id=428253', 'source': 'Gathered', 'NER': ['apple juice', 'cornstarch', 'water', 'apple pie spice', 'N', 'apple slices'], 'website': 'www.cookbooks.com'}
1 (6 oz.) can concentrated apple juice
['apple juice', 'cornstarch', 'water', 'apple pie spice', 'N', 'apple slices']
(425925, 25.34135345376678)
{'recipeID': 8257, 'title': 'Ladyfinger Cheese Cake', 'ingredients': ['1 pkg. ladyfingers', '8 oz. cream cheese', '1/2 c. sugar', '2 c. Cool Whip', '1 ca