## run imports

In [1]:
import json
from importlib import reload

import ingredient_preprocessor as ip
import ingredient_indexing as ingredient_indexing
from food_ranker import *
import food_indexing as food_indexing
import food_preprocessor as fp

  from .autonotebook import tqdm as notebook_tqdm


## build ingredient tokenizer, stopwords, food tokenizer

In [2]:
# Create the ingredient tokenizer and smoke-test it on a sample string.
# Per the recorded output, the text comes back as lower-cased, comma-separated
# phrases with trailing dots and plural "s" removed ("chicken breasts" ->
# "chicken breast"); the exact rules live in ingredient_preprocessor.
ingredient_tokenizer = ip.SplitTokenizer()
ingredient_tokenizer.tokenize("This is a test sentences, with a comma...., chicken breasts")


['this is a test sentence', 'with a comma', 'chicken breast']

In [3]:
# Load the stopword list: one entry per line, surrounding whitespace removed,
# duplicates collapsed by the set.
with open('stopwords.txt', 'r', encoding='utf-8') as stopword_file:
    stopwords = {line.strip() for line in stopword_file}
# Last expression of the cell: echoes how many distinct stopwords were read.
f'Stopwords collected {len(stopwords)}'

'Stopwords collected 610'

In [4]:
# Shared settings used by the index-building and lookup cells.
dataset_path = 'cleanedRecipes.jsonl'
# stopwords = {'and', 'the', 'or', 'is', 'for'}
text_key = 'NER'
doc_augment_dict = {}
# Raw string literal: '\w' inside a normal string is an invalid escape sequence
# (DeprecationWarning/SyntaxWarning on current Python). The pattern value itself
# is unchanged.
preprocessor = fp.RegexTokenizer(r'\w+', lowercase=True, multiword_expressions=None)

# TRY LOADING INVERTED INDEXES FIRST

In [5]:
# Fast path: instantiate both inverted indexes and load them by the same names
# ('ingredient_index', 'food_index') that the save() cell of this notebook
# writes to. Presumably load() reads a previously saved index from disk —
# confirm in ingredient_indexing / food_indexing.
ingredient_index = ingredient_indexing.InvertedIndex()
ingredient_index.load('ingredient_index')

food_index = food_indexing.InvertedIndex()
food_index.load('food_index')

# OR CREATE THEM IF NOT SAVED

In [None]:
# Build the ingredient index from scratch. It reads the 'NER' field of each
# recipe and tokenizes it with the ingredient tokenizer. The dataset location
# comes from the shared `dataset_path` setting instead of a second hard-coded
# copy of the file name.
ingredient_index = ingredient_indexing.Indexer.create_index(
    ingredient_indexing.IndexType.InvertedIndex,
    dataset_path=dataset_path,
    document_preprocessor=ingredient_tokenizer,
    stopwords=stopwords,
    minimum_word_frequency=0,
    text_key='NER',
    max_docs=1000000,
)
print(ingredient_index.get_statistics())

In [None]:
# Build the free-text index from scratch. It reads the 'directions' field of
# each recipe and tokenizes it with the regex preprocessor. The dataset location
# comes from the shared `dataset_path` setting instead of a second hard-coded
# copy of the file name.
food_index = food_indexing.Indexer.create_index(
    food_indexing.IndexType.InvertedIndex,
    dataset_path=dataset_path,
    document_preprocessor=preprocessor,
    stopwords=stopwords,
    minimum_word_frequency=10,
    text_key='directions',
    max_docs=1000000,
)
print(food_index.get_statistics())

In [None]:
# Persist both indexes under the same names that the "load" cell passes to
# load(), so later sessions can skip the index-building cells.
ingredient_index.save('ingredient_index')
food_index.save('food_index')

## run some tests

In [6]:
# Compare the metadata stored for 'chicken' in each index.
# A notebook cell only echoes its final expression, so the food-index result is
# printed explicitly; otherwise it would be computed and silently discarded.
print('food_index:', food_index.get_term_metadata('chicken'))
ingredient_index.get_term_metadata('chicken')

{'term_freq': 41926, 'term_total_count': 42853}

In [7]:
# Look up the postings for the ingredient term 'king' once, keep them in `topQ`
# for the record-lookup cell below, and echo them as the cell output (instead
# of running the identical lookup a second time just to display it).
topQ = ingredient_index.get_postings('king')
topQ

[[10800, 1],
 [14591, 1],
 [74649, 1],
 [251959, 1],
 [267836, 1],
 [330864, 1],
 [359552, 1],
 [436990, 1],
 [465951, 1],
 [530753, 1],
 [624002, 1],
 [657903, 1]]

In [None]:
# Print the full recipe records behind the first two 'king' postings.
# The dataset is scanned a single time for all wanted IDs (rather than being
# re-read from the start for every posting), and the scan stops as soon as
# every wanted record has been found.
sample_postings = topQ[:2]
wanted_ids = {posting[0] for posting in sample_postings}  # posting[0] is matched against 'recipeID'
records_by_id = {}

with open(dataset_path, 'r', encoding='utf-8') as recipes_file:
    for line in recipes_file:
        if not line.strip():
            continue  # tolerate blank lines in the JSONL file
        json_record = json.loads(line)
        recipe_id = json_record['recipeID']
        if recipe_id in wanted_ids and recipe_id not in records_by_id:
            records_by_id[recipe_id] = json_record
            if len(records_by_id) == len(wanted_ids):
                break  # nothing left to look for

# Report in posting order, mirroring the original per-posting output.
for posting in sample_postings:
    json_record = records_by_id.get(posting[0])
    if json_record is not None:
        # Raw record first: 'ingredients' and 'NER' are still JSON-encoded strings here.
        print(json_record)
        # Decode into a copy so the stored raw record is not mutated.
        decoded_record = dict(json_record)
        decoded_record['ingredients'] = json.loads(json_record['ingredients'])
        decoded_record['NER'] = json.loads(json_record['NER'])
        print(decoded_record)
        print(decoded_record['ingredients'][0])
    print(posting)

In [8]:
# Count the index terms that contain at least one space, i.e. multi-word terms.
sum(1 for term in ingredient_index.index.keys() if " " in term)

54400

## build the ranker (this can run again to get edited ranker)

In [9]:
# Re-runnable cell: pick up edits made to food_ranker.py without restarting the kernel.
# `import food_ranker` binds the module name (a star-import alone does not),
# reload() re-executes the module, and the second star-import rebinds the
# refreshed names in this notebook's namespace.
import food_ranker
reload(food_ranker)
from food_ranker import *

In [10]:
# Assemble the ranker from both indexes, both tokenizers, the stopword set and
# the BM25 scorer. `Ranker` and `BM25` are presumably provided by the
# food_ranker star-import — confirm there; arguments are passed positionally.
ranker = Ranker(food_index, ingredient_index, preprocessor, ingredient_tokenizer, stopwords, BM25)

## run some test queries

In [11]:
# Sample query with ingredients, free text and an exclusion list; keep only the
# first ten entries of whatever the ranker returns.
topq = ranker.query(
    query_ingr='pie, flour, cream, apples, blueberries',
    query_freetext='sweet and spicy pie',
    query_NOT='eggs, pecans, nuts, almonds',
)[:10]

In [12]:
# Echo the top-10 results; per the recorded output each entry is an
# (integer ID, float score) pair — presumably (recipe ID, ranking score), confirm in food_ranker.
topq

[(425925, 26.335159904168286),
 (8257, 25.871304952472194),
 (408763, 24.97018147429639),
 (293926, 24.80657578140738),
 (124486, 24.50560801528211),
 (568638, 23.974394167172946),
 (227756, 23.85410134676416),
 (570966, 23.665431180690955),
 (233308, 23.53992587615707),
 (73794, 23.533914787275663)]

# LET'S RUN SOME METRIC TESTS WITH OUR ANNOTATED DATA

In [None]:
# TODO: run metric tests against the annotated data (not implemented yet)

## build a doc-ID-to-doc-info lookup for quick loading in the CLI

In [None]:
# Build a recipeID -> (title, link) lookup from the dataset and write it to a
# JSON file so it can be loaded quickly elsewhere.
# Note: JSON stores the keys as strings and each (title, link) tuple as a
# two-element array, so a consumer reading the file gets str keys and lists.
id_to_recipe = {}  # make doc id to info dict for quick loading
id_to_recipe_path = 'id_to_recipe.json'

# Explicit UTF-8 so that reading does not depend on the platform's default encoding.
with open(dataset_path, 'r', encoding='utf-8') as recipes_read:
    for line in recipes_read:
        if not line.strip():
            continue  # tolerate blank lines in the JSONL file
        recipe = json.loads(line)
        id_to_recipe[recipe['recipeID']] = (recipe['title'], recipe['link'])

with open(id_to_recipe_path, 'w', encoding='utf-8') as json_out:
    # Stream straight to the file instead of building the whole JSON string first.
    json.dump(id_to_recipe, json_out, indent=4)