In [3]:
import ingredient_preprocessor as ip
import ingredient_indexing as ingredient_indexing
from food_ranker import *
import food_indexing as food_indexing
import food_preprocessor as fp

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
ingredient_tokenizer = ip.SplitTokenizer()
ingredient_tokenizer.tokenize("This is a test sentences, with a comma...., chicken breasts")


['this is a test sentence', 'with a comma', 'chicken breast']

In [5]:
stopwords = set()
with open('stopwords.txt', 'r', encoding='utf-8') as file:
    for stopword in file:
        stopwords.add(stopword.strip())
f'Stopwords collected {len(stopwords)}'

'Stopwords collected 610'

In [6]:
dataset_path = 'cleanedRecipes.jsonl'
# stopwords = {'and', 'the', 'or', 'is', 'for'}
text_key = 'NER'
doc_augment_dict = {}
food_preprocessor = fp.RegexTokenizer('/w+')
minimum_word_frequency = 5

In [12]:
preprocessor = fp.RegexTokenizer('\w+', lowercase=True, multiword_expressions=None)

# TRY LOADING FIRST

In [13]:
ingredient_index = ingredient_indexing.InvertedIndex()
ingredient_index.load('ingredient_index')

food_index = food_indexing.InvertedIndex()
food_index.load('food_index')

# OR CREATE THE INDEXES IDK

In [7]:
# preprocessor = RegexTokenizer('\w+', lowercase=True, multiword_expressions=None)
ingredient_index = ingredient_indexing.Indexer.create_index(ingredient_indexing.IndexType.InvertedIndex, dataset_path='cleanedRecipes.jsonl', document_preprocessor=ingredient_tokenizer, stopwords=stopwords, minimum_word_frequency=0, text_key='NER', max_docs=1000000)
print(ingredient_index.get_statistics())

2231142it [00:10, 213462.82it/s]
 45%|████▍     | 1000000/2231142 [00:26<00:32, 38382.18it/s]


Counter()


 45%|████▍     | 1000000/2231142 [00:31<00:38, 31933.07it/s]


defaultdict(<class 'collections.Counter'>, {'index_type': 'BasicInvertedIndex', 'unique_token_count': 60027, 'total_token_count': 7509897, 'number_of_documents': 1000000, 'mean_document_length': 7.509897})


In [8]:
food_index = food_indexing.Indexer.create_index(food_indexing.IndexType.InvertedIndex, dataset_path='cleanedRecipes.jsonl', document_preprocessor=preprocessor, stopwords=stopwords, minimum_word_frequency=10, text_key='directions', max_docs=1000000)
print(food_index.get_statistics())

2231142it [00:10, 211879.30it/s]
 45%|████▍     | 1000000/2231142 [00:22<00:27, 44482.35it/s]




 45%|████▍     | 1000000/2231142 [00:24<00:30, 40757.73it/s]


defaultdict(<class 'collections.Counter'>, {'index_type': 'BasicInvertedIndex', 'unique_token_count': 12686, 'total_token_count': 49818695, 'number_of_documents': 1000000, 'mean_document_length': 49.818695})


In [10]:
ingredient_index.save('ingredient_index')
food_index.save('food_index')

In [14]:
food_index.get_term_metadata('chicken')
ingredient_index.get_term_metadata('chicken')

{'term_freq': 41926, 'term_total_count': 42853}

In [15]:
topQ = ingredient_index.get_postings('king')
ingredient_index.get_postings('king')

[[10800, 1],
 [14591, 1],
 [74649, 1],
 [251959, 1],
 [267836, 1],
 [330864, 1],
 [359552, 1],
 [436990, 1],
 [465951, 1],
 [530753, 1],
 [624002, 1],
 [657903, 1]]

In [None]:
for q in topQ[:2]:
    with open('cleanedRecipes.jsonl') as f:
        for line in (f):
            json_record = json.loads(line)
            if q[0] == json_record['recipeID']:
                print(json_record)
                json_record['ingredients'] = json.loads(json_record['ingredients'])
                json_record['NER'] = json.loads(json_record['NER'])
                print(json_record)
                print(json_record['ingredients'][0])
                # print(json_record['directions'])
               
                # print(json_record['title'])

    print(q)

In [13]:
len([word for word in ingredient_index.index.keys() if len(word.split(" ")) > 1])

54400

In [16]:
from importlib import reload
import food_ranker
reload(food_ranker)
from food_ranker import *

In [17]:
ranker = Ranker(food_index, ingredient_index, preprocessor, ingredient_tokenizer, stopwords, BM25)

In [18]:
topq = ranker.query(query_ingr='pie, flour, cream, apples, blueberries', query_freetext='sweet and spicy pie', query_NOT='eggs, pecans, nuts, almonds')[:10]

pie, flour, cream, apples, blueberries sweet and spicy pie
['pie', 'flour', 'cream', 'apples', 'blueberries', 'sweet', 'spicy', 'pie']
egg
pecan
nut
almond


In [19]:
topq

[(425925, 26.335159904168286),
 (8257, 25.871304952472194),
 (408763, 24.97018147429639),
 (293926, 24.80657578140738),
 (124486, 24.50560801528211),
 (568638, 23.974394167172946),
 (227756, 23.85410134676416),
 (570966, 23.665431180690955),
 (233308, 23.53992587615707),
 (73794, 23.533914787275663)]

In [20]:
track = 0
for q in topq:
    track = 0
    with open('cleanedRecipes.jsonl') as f:
        for line in f:
            json_record = json.loads(line)
            if q[0] == json_record['recipeID']:
                # print(json_record)
                json_record['ingredients'] = json.loads(json_record['ingredients'])
                json_record['NER'] = json.loads(json_record['NER'])
                print(json_record)
                print(json_record['ingredients'][0])
                print(json_record['NER'])

            track += 1
            if track == 750000:
                break
                # print(json_record['directions'])
               
                # print(json_record['title'])

    print(q)

{'recipeID': 425925, 'title': 'Sugarless Apple Pie', 'ingredients': ['1 (6 oz.) can concentrated apple juice', '2 Tbsp. cornstarch', '1/2 can or 3 oz. water', '1/2 tsp. apple pie spice', "1 pkg. Sweet 'N Low", '2 c. apple slices'], 'directions': "Mix all ingredients until thick, then slice apples. Put in an uncooked pie crust or you may just pour over apples in a pie pan and cook. You can sprinkle Sweet 'N Low over apples and mixture if you want. Bake for 45 minutes at 350\\u00b0. Makes 1 pie.", 'link': 'www.cookbooks.com/Recipe-Details.aspx?id=428253', 'source': 'Gathered', 'NER': ['apple juice', 'cornstarch', 'water', 'apple pie spice', 'N', 'apple slices'], 'website': 'www.cookbooks.com'}
1 (6 oz.) can concentrated apple juice
['apple juice', 'cornstarch', 'water', 'apple pie spice', 'N', 'apple slices']
(425925, 26.335159904168286)
{'recipeID': 8257, 'title': 'Ladyfinger Cheese Cake', 'ingredients': ['1 pkg. ladyfingers', '8 oz. cream cheese', '1/2 c. sugar', '2 c. Cool Whip', '1 c

In [11]:
id_to_recipe = {}  # make doc id to url dict for quick loading
id_to_recipe_path = 'id_to_recipe.json'

with open(dataset_path, 'r') as recipes_read:
    for recipe in recipes_read:
        recipe = json.loads(recipe)
        recipe_id = recipe['recipeID']
        recipe_title = recipe['title']
        recipe_link = recipe['link']
        id_to_recipe[recipe_id] = (recipe_title, recipe_link)

with open(id_to_recipe_path, 'w') as json_out:
    json_data = json.dumps(id_to_recipe, indent=4)
    
    json_out.write(json_data)