## run imports

In [1]:
import json

import ingredient_preprocessor as ip
import ingredient_indexing as ingredient_indexing
from food_ranker import *
import food_indexing as food_indexing
import food_preprocessor as fp
from importlib import reload

  from .autonotebook import tqdm as notebook_tqdm


## run constants

In [2]:
# File locations used throughout the notebook (relative to the working directory).
STOPWORDS_PATH = 'stopwords.txt'  # read line by line, one stopword per line
DATASET_PATH = 'cleanedRecipes.jsonl'  # one JSON recipe object per line
INGREDIENT_INDEX_PATH = 'ingredient_index'  # save/load location of the ingredient inverted index
FOOD_INDEX_PATH = 'food_index'  # save/load location of the food inverted index
ID_TO_RECIPE_PATH = 'id_to_recipe.json'  # cached recipe-ID -> recipe-info lookup

## Build the recipe ID → recipe info JSON for quick loading in the CLI (ONLY RUN THIS IF `id_to_recipe.json` DOESN'T EXIST YET IN YOUR PROJECT DIRECTORY)

In [3]:
# Build a recipe-ID -> (title, link, NER list) lookup from the dataset and cache
# it as JSON so it can be loaded quickly later instead of re-reading the dataset.
id_to_recipe = {}

# Read as UTF-8 explicitly (like the stopwords loader) instead of relying on the
# platform's default encoding.
with open(DATASET_PATH, 'r', encoding='utf-8') as recipes_read:
    for line in recipes_read:
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline) in the JSONL file
        recipe = json.loads(line)
        id_to_recipe[recipe['recipeID']] = (recipe['title'], recipe['link'], recipe['NER'])

# NOTE: JSON object keys are always strings and tuples are serialized as arrays,
# so the mapping read back from this file has str keys and list values, which
# differs from the in-memory dict built above if `recipeID` is not a str.
with open(ID_TO_RECIPE_PATH, 'w', encoding='utf-8') as json_out:
    json.dump(id_to_recipe, json_out, indent=4)

## or load it instead

In [4]:
# Load the cached recipe lookup from ID_TO_RECIPE_PATH.
# NOTE: JSON has no int keys or tuples, so if the file was produced by the build
# cell the keys come back as strings and each value as a [title, link, NER] list.
# Read as UTF-8 explicitly (like the stopwords loader) rather than the platform default.
with open(ID_TO_RECIPE_PATH, 'r', encoding='utf-8') as json_file:
    id_to_recipe = json.load(json_file)

## build ingredient tokenizer, stopwords, food tokenizer

In [18]:
# Tokenizer applied to ingredient text (both when indexing and when querying).
ingredient_preprocessor = ip.SplitTokenizer()
# Smoke test. Per the recorded output, the input is split on commas, lowercased,
# stripped of the stray dots, and trailing plurals are singularized
# ("sentences" -> "sentence"), so the odd wording of the test string is intentional.
ingredient_preprocessor.tokenize("This is a test sentences, with a comma...., chicken breasts")

['this is a test sentence', 'with a comma', 'chicken breast']

In [6]:
# Collect the stopword list: one whitespace-stripped entry per line of the file.
with open(STOPWORDS_PATH, 'r', encoding='utf-8') as file:
    stopwords = {line.strip() for line in file}
f'Stopwords collected {len(stopwords)}'

'Stopwords collected 610'

In [17]:
# stopwords = {'and', 'the', 'or', 'is', 'for'}
# Tokenizer for free-text (directions / free-text queries).
# The pattern is a raw string: '\w' inside a plain literal is an invalid escape
# sequence and triggers a warning on modern Python; the string value is unchanged.
food_preprocessor = fp.RegexTokenizer(r'\w+', lowercase=True, multiword_expressions=None)

# TRY LOADING INVERTED INDEXES FIRST

In [10]:
# Restore the ingredient index previously saved at INGREDIENT_INDEX_PATH.
ingredient_index = ingredient_indexing.InvertedIndex()
ingredient_index.load(INGREDIENT_INDEX_PATH)

# Restore the food index previously saved at FOOD_INDEX_PATH.
food_index = food_indexing.InvertedIndex()
food_index.load(FOOD_INDEX_PATH)

# OR CREATE THEM IF NOT SAVED

In [None]:
# Build the ingredient index from each recipe's 'NER' field.
# Uses `ingredient_preprocessor` (defined in the tokenizer cell); the previous
# name `ingredient_tokenizer` is never defined in this notebook and raised a
# NameError on a fresh kernel.
ingredient_index = ingredient_indexing.Indexer.create_index(
    ingredient_indexing.IndexType.InvertedIndex,
    dataset_path=DATASET_PATH,
    document_preprocessor=ingredient_preprocessor,
    stopwords=stopwords,
    minimum_word_frequency=0,
    text_key='NER',
    max_docs=1000000,
)
print(ingredient_index.get_statistics())

In [None]:
# Build the food index from each recipe's 'directions' field.
# Uses `food_preprocessor` (defined in the tokenizer cell); the previous name
# `preprocessor` is never assigned in this notebook and raised a NameError on a
# fresh kernel.
food_index = food_indexing.Indexer.create_index(
    food_indexing.IndexType.InvertedIndex,
    dataset_path=DATASET_PATH,
    document_preprocessor=food_preprocessor,
    stopwords=stopwords,
    minimum_word_frequency=10,
    text_key='directions',
    max_docs=1000000,
)
print(food_index.get_statistics())

In [None]:
# Persist both indexes so later sessions can load them instead of rebuilding.
ingredient_index.save(INGREDIENT_INDEX_PATH)
food_index.save(FOOD_INDEX_PATH)

## run some tests

In [11]:
# Spot-check: metadata stored for the term 'chicken' in the food index.
food_index.get_term_metadata('chicken')

{'term_freq': 77312, 'term_total_count': 168569}

In [12]:
# Spot-check: metadata stored for the term 'chicken' in the ingredient index.
ingredient_index.get_term_metadata('chicken')

{'term_freq': 41926, 'term_total_count': 42853}

In [13]:
# Spot-check: postings for 'king' in the ingredient index. The recorded output
# is a list of two-element lists, presumably [doc_id, term_frequency] (TODO confirm).
ingredient_index.get_postings('king')

[[10800, 1],
 [14591, 1],
 [74649, 1],
 [251959, 1],
 [267836, 1],
 [330864, 1],
 [359552, 1],
 [436990, 1],
 [465951, 1],
 [530753, 1],
 [624002, 1],
 [657903, 1]]

In [14]:
# Count the ingredient-index terms that contain a space, i.e. multi-word terms.
sum(1 for term in ingredient_index.index.keys() if " " in term)

54400

## build the ranker (this can run again to get edited ranker)

In [15]:
# Re-import food_ranker so edits made to the module on disk are picked up
# without restarting the kernel, then refresh the star-imported names.
import food_ranker
reload(food_ranker)
from food_ranker import *

In [20]:
# Assemble the ranker from both indexes, their tokenizers, the stopword set,
# BM25 (presumably the scorer exported by food_ranker; verify) and the recipe lookup.
ranker = Ranker(
    food_index,
    ingredient_index,
    food_preprocessor,
    ingredient_preprocessor,
    stopwords,
    BM25,
    id_to_recipe,
)

## run some test queries

In [21]:
# Example query combining wanted ingredients, free text and excluded ingredients;
# keep only the first 10 results.
topq = ranker.query(
    query_ingr='pie, flour, cream, apples, blueberries',
    query_freetext='sweet and spicy pie',
    query_NOT='eggs, pecans, nuts, almonds',
)[:10]

In [22]:
# Display the top results; the recorded output is a list of (int, float) pairs,
# presumably (recipe ID, relevance score) (TODO confirm against Ranker.query).
topq

[(425925, 26.335159904168286),
 (8257, 25.871304952472194),
 (408763, 24.97018147429639),
 (293926, 24.80657578140738),
 (124486, 24.50560801528211),
 (568638, 23.974394167172946),
 (227756, 23.85410134676416),
 (570966, 23.665431180690955),
 (233308, 23.53992587615707),
 (73794, 23.533914787275663)]

# LET'S RUN SOME METRIC TESTS WITH OUR ANNOTATED DATA

In [None]:
# TODO: evaluate the ranker against the annotated data (metric code goes here)