# Preprocess FOOD Dataset

In [1]:
# Common imports
import json
import os
import re
import json
import operator

In [2]:
# Constants
# When True, ingredients.txt is rebuilt from the raw dataset instead of being read back.
build_ingredient_list = False
# Working directory plus a "food" sub-folder.
base_dir = "/".join([os.getcwd(), "food"])

In [3]:
# Common Variables
# Root node of the ingredient trie: match count, child nodes keyed by word, and the full phrase.
ingredient_map = {"count": 0, "children": {}, "words": ""}
# Three independent, initially empty dictionaries filled by the cells below.
all_recipes, all_ingredients, unknown_ingredients = {}, {}, {}

In [4]:
# Load NLTK: the tokenizer / POS tagger entry points, the WordNet lemmatizer and the WordNet corpus reader.
# NOTE(review): presumably the NLTK data used below (tokenizer, tagger, WordNet) must already be installed - confirm.
import nltk
from nltk.stem import WordNetLemmatizer
# Single shared lemmatizer instance, used by lemmatize_sentence.
wordnet_lemmatizer = WordNetLemmatizer()
# `wn` supplies the part-of-speech constants returned by penn_to_wn.
from nltk.corpus import wordnet as wn

def is_noun(tag):
	"""Return True when `tag` is one of the noun POS tags (NN, NNS, NNP, NNPS)."""
	noun_tags = ('NN', 'NNS', 'NNP', 'NNPS')
	return tag in noun_tags

def is_verb(tag):
	"""Return True when `tag` is one of the verb POS tags (VB, VBD, VBG, VBN, VBP, VBZ)."""
	verb_tags = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')
	return tag in verb_tags

def is_adverb(tag):
	"""Return True when `tag` is one of the adverb POS tags (RB, RBR, RBS)."""
	adverb_tags = ('RB', 'RBR', 'RBS')
	return tag in adverb_tags

def is_adjective(tag):
	"""Return True when `tag` is one of the adjective POS tags (JJ, JJR, JJS)."""
	adjective_tags = ('JJ', 'JJR', 'JJS')
	return tag in adjective_tags
	
def penn_to_wn(tag):
	"""Translate a POS tag into the matching WordNet part-of-speech constant.

	Returns wn.ADJ, wn.NOUN, wn.ADV or wn.VERB depending on which tag group
	`tag` belongs to, or None when it belongs to none of them.
	"""
	# Guard clauses, checked in the order adjective, noun, adverb, verb.
	if is_adjective(tag):
		return wn.ADJ
	if is_noun(tag):
		return wn.NOUN
	if is_adverb(tag):
		return wn.ADV
	if is_verb(tag):
		return wn.VERB
	return None

def lemmatize_sentence(sentence, includePos=False):
	"""Tokenize, POS-tag and lemmatize a sentence.

	Asterisks are removed from `sentence` before tokenizing. Each token is
	lemmatized with the WordNet part of speech derived from its tag when
	penn_to_wn provides one, otherwise with the lemmatizer's default.

	Returns a list of lower-cased lemmas or, when `includePos` is true, a list
	of (token, tag, lower-cased lemma) tuples.
	"""
	tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence.replace("*","")))
	results = []
	for word, tag in tagged_tokens:
		wordnet_pos = penn_to_wn(tag)
		if wordnet_pos is None:
			lemma = wordnet_lemmatizer.lemmatize(word)
		else:
			lemma = wordnet_lemmatizer.lemmatize(word, wordnet_pos)
		lemma = lemma.lower()
		results.append((word, tag, lemma) if includePos else lemma)
	return results

In [5]:
# Set Constants
# Tokens that stop a candidate ingredient phrase: punctuation, units of measure, fractions and filler words.
invalid_tokens = [
	"(", ")", "[", "]", "cup", "¾", "t", "ml", "l", "'",
	"c", "i", "a", "tablespoon", "ounce", "grain", ";", "+",
	"of", "%", ",", ".", "tsp", "tbsp", "pound", "pinch", "g", "oz",
	"tbsps", "tsps", ":", "tin", "the", "de", "pot",
	"tb", "lb", "package", "teaspoon", "s", "½", "ozs",
	"or", "and", "for", "in", "x", "tb", "*", "can", "pkg", "di",
]
# Nutrient codes; the position of a code in this list is significant, so the order must not change.
valid_nutrition_facts = [
	"ENERC_KCAL", "FAT", "FASAT", "CHOCDF", "FIBTG", "PROCNT",
	"CHOLE", "NA", "CA", "MG", "K", "FE", "ZN", "P",
	"VITA_RAE", "VITC", "RIBF", "VITB6A", "VITB12", "VITD", "TOCPHA", "VITK1",
]

In [6]:
# Load the dictionaries

def build_ingredients_dictionary():
	"""Count the ingredient names in the raw dataset and cache them in ingredients.txt.

	Reads base_dir + "/../ingredient_dataset.txt", one ingredient per line.
	Names are stripped and lower-cased before counting. Blank lines are
	skipped: an empty name would otherwise be counted as an ingredient and,
	since it splits into zero words, would end up registered on the root of
	the ingredient trie.

	The result is ordered by decreasing name length; among names of equal
	length the lower count comes first (as long as counts stay below
	1,000,000). It is written to 'ingredients.txt' as "<count> <name>" lines.

	Returns:
		The sorted list of (name, count) tuples.
	"""
	counts = dict()
	with open(base_dir+"/../ingredient_dataset.txt") as recipe_dataset:
		for line in recipe_dataset:
			name = line.strip().lower()
			if not name:
				# Nothing to count on an empty line.
				continue
			counts[name] = counts.get(name, 0) + 1

	# The input file is closed at this point; sorting and writing do not need it.
	all_ingredients_sorted = sorted(counts.items(), key=lambda ing: -len(ing[0])*1000000+ing[1])
	with open('ingredients.txt', 'w') as ingredients_file:
		for name, count in all_ingredients_sorted:
			ingredients_file.write(str(count)+" "+name+"\n")
	return all_ingredients_sorted

if build_ingredient_list:
	# build_ingredients_dictionary() returns a sorted list of (name, count) tuples. Turn it back into a
	# dict (insertion order keeps the sorted order) so that all_ingredients is a name -> count mapping
	# in both branches; the trie-building cell iterates its keys and indexes it by name.
	all_ingredients = dict(build_ingredients_dictionary())
else:
	# Reload the cached list: one "<count> <name>" entry per line.
	with open('ingredients.txt') as ingredients:
		for line in ingredients:
			fields = line.split()
			if len(fields) < 2:
				# Blank line, or a count with no name: there is no ingredient to register.
				continue
			all_ingredients[" ".join(fields[1:])] = int(fields[0])

In [7]:
# Ingredient management
def get_ingredient(line, imap):
	"""Find the node of the ingredient trie `imap` that matches a free-text line.

	The lemmatized tokens of `line` are tried first; when they match nothing,
	the plain (non-lemmatized) tokens are tried instead.

	Returns the matching node, or None when neither attempt matches.
	"""
	match = aux_get_ingredient(lemmatize_sentence(line), imap)
	if match is not None:
		return match
	# Fall back to the raw tokens.
	return aux_get_ingredient(nltk.word_tokenize(line), imap)

def aux_get_ingredient(parts, imap):
	"""Recursively search the trie node `imap` for an ingredient made of words in `parts`.

	Every word of `parts` that is a child of the current node is followed in
	turn, always passing along the complete `parts` list, and the first
	non-None result is returned. When no child yields a match, the current
	node itself is returned if its "count" is positive, otherwise None.
	"""
	children = imap["children"]
	for word in parts:
		if word not in children:
			continue
		found = aux_get_ingredient(parts, children[word])
		if found is not None:
			return found
	return imap if imap["count"] > 0 else None

In [8]:
# Build the ingredient map
# Insert every known ingredient into the trie, one level per word, numbering them in iteration order.
index = 0
for words, count in all_ingredients.items():
	node = ingredient_map
	for part in words.split():
		# Step into the child for this word, creating an empty node the first time it is seen.
		node = node["children"].setdefault(part, {"count": 0, "children": dict(), "words": "", "index": -1})
	# The node reached after the last word represents the whole ingredient.
	node["count"] = count
	node["words"] = words
	node["index"] = index
	index += 1

# Extract facts

In [9]:
# Auxiliary functions
def extract_nutrition_facts(recipe):
	"""Collect the per-serving nutrition values of a recipe.

	For every code in valid_nutrition_facts that is present in
	recipe["totalDaily"], the entry's "quantity" is divided by
	100 * recipe["yield"].

	Returns a list of [position of the code in valid_nutrition_facts, value]
	pairs, in the order of valid_nutrition_facts.
	"""
	servings = recipe["yield"]
	return [
		[position, recipe["totalDaily"][fact]["quantity"]/(100*servings)]
		for position, fact in enumerate(valid_nutrition_facts)
		if fact in recipe["totalDaily"]
	]

def _count_unknown_ngrams(tokenized):
	"""Tally candidate ingredient phrases from an ingredient line that matched nothing.

	`tokenized` is a list of (token, tag, lemma) tuples. From every start
	position, phrases of up to three consecutive lemmas are built; a phrase
	stops growing at a number ("CD"), a "," or "." tag, or a lemma listed in
	invalid_tokens. Each phrase that contains at least one noun is counted
	in unknown_ingredients.
	"""
	for start in range(len(tokenized)):
		words = []
		has_noun = False
		for _token, tag, lemma in tokenized[start:start+3]:
			if tag in ("CD", ",", ".") or lemma in invalid_tokens:
				break
			has_noun = has_noun or is_noun(tag)
			words.append(lemma.lower())
			if has_noun:
				phrase = " ".join(words)
				unknown_ingredients[phrase] = unknown_ingredients.get(phrase, 0) + 1

def extract_ingredient_facts(recipe):
	"""Map the ingredient lines of a recipe to known ingredients and their weight share.

	Every entry of recipe["ingredients"] is looked up in ingredient_map by its
	"text". Matched lines contribute [ingredient index, weight]; unmatched
	lines only feed the unknown_ingredients tally. Weights are then divided by
	the total weight of all lines, matched or not.

	Returns:
		A list of [ingredient index, weight fraction] pairs. When the total
		weight is 0 the weights are returned as they are instead of raising
		ZeroDivisionError.
	"""
	extracted=[]
	total_weight=0
	for ingredient_line in recipe["ingredients"]:
		ingredient = get_ingredient(ingredient_line["text"], ingredient_map)
		total_weight+=ingredient_line["weight"]
		if ingredient is None:
			_count_unknown_ngrams(lemmatize_sentence(ingredient_line["text"], True))
		else:
			# Known ingredient: remember its index together with the raw weight.
			extracted.append([ingredient["index"], ingredient_line["weight"]])
	if total_weight == 0:
		# No weight information at all: there is nothing to normalize against.
		return extracted
	for ext in extracted:
		ext[1] = ext[1]/total_weight
	return extracted

In [10]:
# Read files
# Fail early with an actionable message when the data directory is missing.
if not os.path.isdir(base_dir):
	raise FileNotFoundError("Recipe directory not found: "+base_dir+" (run the notebook from the folder that contains 'food')")

# Sorted so that, when two files contain the same recipe label, the one that is kept
# does not depend on the arbitrary order returned by os.listdir.
for filename in sorted(os.listdir(base_dir)):
	recipe_path = os.path.join(base_dir, filename)
	# Skip sub-directories and hidden entries (e.g. checkpoint folders); they are not recipe files.
	if filename.startswith(".") or not os.path.isfile(recipe_path):
		continue
	with open(recipe_path) as json_data:
		recipes_results = json.load(json_data)
	for hit in recipes_results["hits"]:
		recipe = hit["recipe"]
		label = recipe["label"]
		# Only the first recipe seen for a given label is kept.
		if label not in all_recipes:
			idss_recipe = {"image": recipe["image"],
			"url": recipe["url"],
			"dietLabels": recipe["dietLabels"],
			"healthLabels": recipe["healthLabels"],
			"ingredients": extract_ingredient_facts(recipe),
			"name": label,
			"nutrition": extract_nutrition_facts(recipe)}
			all_recipes[label] = idss_recipe

FileNotFoundError: [Errno 2] No such file or directory: '/home/carlos/Documentos/idss/idss_pw3_food/food'

# Save everything to files

In [11]:
# Sort into a separate name: unknown_ingredients must remain a dict so that this cell,
# and the cell that fills it, can be run again without failing on a list.
unknown_ingredients_sorted = sorted(unknown_ingredients.items(), key=operator.itemgetter(1))
# Unmatched phrases, least frequent first, as "<count> <phrase>" lines.
with open('additional-ingredients.txt', 'w') as ingredients_file:
	for phrase, count in unknown_ingredients_sorted:
		ingredients_file.write(str(count)+" "+phrase+"\n")
# One recipe label per line.
with open('recipes.txt', 'w') as recipes_file:
	recipes_file.write("\n".join(all_recipes.keys()))
# Full recipe records keyed by label.
with open('recipes.json', 'w') as outfile:
	json.dump(all_recipes, outfile, indent=4)