# Required Libraries and Files

In [8]:
import ast
import re
import zipfile

import en_core_web_sm
import gdown
import pandas as pd
import spacy
from tqdm import tqdm

# Google Drive id of the zipped recipe dataset and the local name it is saved under.
DATASET_FILE_ID = '10-NuS-prcLNZjlXSqQd6RBOM44k8epdx'
DATASET_ARCHIVE = 'archive.zip'

# Fetch the archive only when a valid zip is not already present, so re-running
# the notebook does not download the large dataset file again.
# zipfile.is_zipfile() returns False for a missing (or non-zip) file.
if not zipfile.is_zipfile(DATASET_ARCHIVE):
    gdown.download(id=DATASET_FILE_ID, output=DATASET_ARCHIVE, quiet=False) #dataset file

nlp = en_core_web_sm.load() #spacy's English model

Downloading...
From (original): https://drive.google.com/uc?id=10-NuS-prcLNZjlXSqQd6RBOM44k8epdx
From (redirected): https://drive.google.com/uc?id=10-NuS-prcLNZjlXSqQd6RBOM44k8epdx&confirm=t&uuid=db1126fe-4f62-4d15-ba92-71747e206c4d
To: /content/archive.zip
100% 666M/666M [00:05<00:00, 129MB/s]


# Load Dataset

In [9]:
#@title Unzip the dataset file
# Extract into the working directory (the default target of extractall), i.e. next
# to 'archive.zip'. The CSV is later opened with a relative path, so extracting to a
# fixed absolute directory would only work when that directory is the working directory.
with zipfile.ZipFile('archive.zip', 'r') as zip_ref:
    zip_ref.extractall()

In [10]:
#@title Build the dataframe
# Seed for the shuffle below, so the same recipes are selected on every run.
random_state = 42

# Columns that we won't need (each column listed exactly once).
hide_columns = ['site', 'source', 'link']

# We keep only the first 500,000 recipes so the code runs faster,
# feel free to change the parameter to your preference
number_of_recipes = 500000 #@param {type: "number"}

df = (
    pd.read_csv('recipes_data.csv', header=0)
    .drop_duplicates(subset='title')            # eliminate duplicated recipes with the same name
    .sample(frac=1, random_state=random_state)  # randomize the order of the recipes
    .drop(columns=hide_columns)
    .head(int(number_of_recipes))               # int() guards against a non-integer value typed in the form
)


In [11]:
#@title Annotate vegetarian and vegan recipes

# Lower-case keywords whose presence in an ingredient marks it as not vegetarian.
# Matching is by substring, so a keyword also fires inside longer words
# (e.g. 'ham' inside 'graham'); the lists are a heuristic, not an exact classifier.
non_veggie = ['chicken', 'beef', 'pork', 'fish', 'lamb', 'goat', 'meat', 'turkey', 'duck', 'veal', 'bacon', 'ham', 'salmon', 'copa',
              'shrimp', 'crab', 'lobster', 'oyster', 'sardine', 'anchovy', 'trout', 'tuna', 'mutton', 'venison', 'sole', 'jamon',
              'squid', 'octopus', 'clams', 'scallops', 'jelly', 'jell-o', 'jello', 'gelatin', 'rabbit', 'deer', 'quail', 'pepperoni',
              'snail', 'horse', 'buffalo', 'boar', 'guinea pig', 'kangaroo', 'ostrich', 'pigeon', 'turtle', 'frog', 'hamburguer',
              'alligator', 'elk', 'snake', 'sausage', 'pancetta', 'filet', 'chorico', 'ribs', 'chorizo', 'cod', 'meatballs', 'herring']

# Everything that is not vegetarian plus animal-derived products (each keyword listed once).
# 'isinglass' is the correct spelling; the misspelled 'isenglass' is kept for compatibility.
non_vegan = non_veggie + ['milk', 'butter', 'cream', 'cheese', 'yogurt', 'honey', 'whey', 'casein',
                           'beeswax', 'isenglass', 'isinglass', 'carmine', 'shellac', 'albumin', 'cochineal',
                           'collagen', 'ghee', 'lanolin', 'suet', 'rennet', 'lard', 'buttermilk',
                           'squalene', 'taurine', 'egg', 'yolk', 'curd',
                           'paneer', 'lactose', 'quark', 'kefir', 'cottage', 'mascarpone', 'brie']

def _mentions_any(ingredient, keywords):
    """Return True if any of `keywords` occurs in `ingredient`, ignoring case."""
    # The keyword lists are lower-case, so the text is lower-cased before the
    # substring test; otherwise 'Chicken' or 'MILK' would go undetected.
    lowered = ingredient.lower()
    return any(keyword in lowered for keyword in keywords)

def is_vegetarian(ingredient):
    """Return True if `ingredient` (a string) mentions none of the `non_veggie` keywords."""
    return not _mentions_any(ingredient, non_veggie)

def is_vegan(ingredient):
    """Return True if `ingredient` (a string) mentions none of the `non_vegan` keywords."""
    return not _mentions_any(ingredient, non_vegan)

# Number of recipes in the dataframe at annotation time.
total_rows = len(df)

def _every_piece_passes(check):
    """Build a predicate that is True when every ', '-separated piece of a string satisfies `check`."""
    def row_predicate(ingredients_text):
        for piece in ingredients_text.split(', '):
            if not check(piece):
                return False
        return True
    return row_predicate

# Register progress_apply on pandas objects, with a progress-bar label.
tqdm.pandas(desc="Annotating diets: ")

# One boolean column per diet, computed from the raw ingredients text.
df['Vegetarian'] = df['ingredients'].progress_apply(_every_piece_passes(is_vegetarian))
df['Vegan'] = df['ingredients'].progress_apply(_every_piece_passes(is_vegan))

#print the df to visualize how the data looks
#df

Processing rows: 100%|██████████| 500000/500000 [00:48<00:00, 10301.54it/s]
Processing rows: 100%|██████████| 500000/500000 [00:52<00:00, 9555.30it/s] 


# **No Crumbs Recipe Finder**


In [None]:
#@title Main program
# Separator made of 40 dashes, printed between sections of the dialogue for better visibility.
lines = 40 * "-"

def select_category():
    """Show the diet menu and keep asking until the user picks a valid option.

    Returns:
        int: 1 (vegetarian), 2 (vegan) or 3 (no restrictions).
    """
    menu = (
        'Welcome to No Crumbs, your slayest recipe finder!',
        "Please select a category:",
        "[1] Vegetarian",
        "[2] Vegan",
        "[3] No restrictions",
    )
    for menu_line in menu:
        print(menu_line)

    valid_choices = (1, 2, 3)
    while True:
        answer = input("Enter the number corresponding to your choice: ")
        try:
            choice = int(answer)
        except ValueError:
            # the answer is not a whole number at all
            print("Invalid input. Please enter a number.")
            continue
        if choice in valid_choices:
            return choice
        # a number, but not one of the menu entries
        print("Invalid selection. Please enter a number between 1 and 3.")

def get_input_ingredients():
    """Ask the user which ingredients they have.

    Prints the separator line, reads one line of input, lower-cases it and
    splits it on commas. The pieces are returned as typed otherwise, so they
    may still carry surrounding spaces.

    Returns:
        list[str]: the comma-separated pieces of the lower-cased answer.
    """
    print(lines)
    prompt = "What ingredients do you have? (Please use commas to separate them)\n"
    answer = input(prompt)
    return answer.lower().split(",")

def lemmatize(words):
    """Lemmatize each entry of `words` with the loaded `nlp` pipeline.

    Every entry is stripped of surrounding whitespace, run through `nlp`, and
    rebuilt by joining the lemmas of its tokens with single spaces.

    Args:
        words: iterable of strings (single words or short phrases).

    Returns:
        list[str]: one lemmatized string per input entry, in the same order.
    """
    def lemma_of(phrase):
        tokens = nlp(phrase.strip())  # surrounding spaces are dropped before parsing
        return " ".join(token.lemma_ for token in tokens)

    return [lemma_of(word) for word in words]

def filter_recipes(df, ingredients, category):
    """Return up to 10 recipes that match the diet and the requested ingredients.

    Args:
        df: recipes dataframe with a text column 'ingredients' and boolean
            columns 'Vegetarian' and 'Vegan'.
        ingredients: list of search terms. A term starting with "-" is an
            ingredient the recipe must NOT contain; any other term must be
            contained. Surrounding whitespace is ignored and empty terms
            (e.g. from a trailing comma or a bare "-") are skipped.
        category: 1 = vegetarian only, 2 = vegan only, 3 = no restriction.
            Any other value yields an empty result.

    Returns:
        pandas.DataFrame: at most the first 10 matching rows of `df`.
    """
    diet_columns = {1: 'Vegetarian', 2: 'Vegan', 3: None}
    if category not in diet_columns:
        return pd.DataFrame(columns=df.columns)

    # separate the ingredients we want from the ones we don't want
    included_ingredients = []
    excluded_ingredients = []
    for ingredient in ingredients:
        ingredient = ingredient.strip()
        if ingredient.startswith("-"):
            term, bucket = ingredient.lstrip("-").strip(), excluded_ingredients
        else:
            term, bucket = ingredient, included_ingredients
        # An empty pattern matches every recipe: as an exclusion it would
        # discard the whole dataset, so empty terms are ignored.
        if term:
            bucket.append(term)

    recipe_text = df['ingredients']
    keep = pd.Series(True, index=df.index)

    # diet restriction, if any
    diet_column = diet_columns[category]
    if diet_column is not None:
        keep = keep & df[diet_column].astype(bool)

    # Terms are matched literally (re.escape) and case-insensitively; user input
    # such as "c++" or "(" must not be interpreted as a regular expression.
    # na=False treats a missing ingredients value as "does not contain".
    for term in excluded_ingredients:
        keep = keep & ~recipe_text.str.contains(re.escape(term), case=False, na=False)
    for term in included_ingredients:
        keep = keep & recipe_text.str.contains(re.escape(term), case=False, na=False)

    return df[keep].head(10) #to not overwhelm the user we only keep maximum 10 of the results

#Main program
# Interactive loop: pick a diet, enter ingredients, list up to 10 matching
# recipes, show the chosen one, then offer to start again.
while True:
    category = select_category() #the user choses diet preference
    input_ingredients = get_input_ingredients() #the user inputs list of ingredients they want
    # NOTE(review): the lemmatized list is computed but the search below uses the
    # raw input; confirm whether the lemmatized terms were meant to be searched.
    lemmatized_input = lemmatize(input_ingredients) #the list of ingredients is lemmatized
    filtered_recipes = filter_recipes(df, input_ingredients, category) #we search for matching recipes to diet category and containing all ingredients

    if filtered_recipes.empty: #in case no matching recipes are found we ask if they want to try again
        change_category = input("Sorry, no recipes found matching your criteria. Would you like to start again? (yes/no): ").strip().lower()
        print(lines)
        if change_category in ("yes", "y"):
            continue  # restart the loop if the user wants to start again
        break  # exit the loop if the user doesn't want to start again

    print(lines)
    #first we print the results from the filtering
    print("Here are some recipes containing your ingredient(s) (up to 10):")
    #we want the recipe titles shown to be enumerated
    for i, title in enumerate(filtered_recipes['title'], start=1):
        print(f"{i}. {title}")

    print(lines)

    #we let the user select the recipe they are interested in
    selection = input("Select a recipe number to view its ingredients:\n")
    try:
        #the list shown starts at 1, positions in the dataframe start at 0
        selected_recipe_index = int(selection) - 1
    except ValueError:
        selected_recipe_index = None
        print("Invalid input. Please enter a number.") #incorrect type input

    if selected_recipe_index is not None:
        #the number must be one of those shown in the list above
        if 0 <= selected_recipe_index < len(filtered_recipes):
            selected_recipe = filtered_recipes.iloc[selected_recipe_index]
            try:
                # The columns hold list literals stored as text. ast.literal_eval
                # parses literals only, so (unlike eval) text coming from the
                # dataset file can never be executed as code.
                ingredients = ast.literal_eval(selected_recipe['ingredients'])
                directions = ast.literal_eval(selected_recipe['directions'])
            except (ValueError, SyntaxError):
                # the stored text is not a valid literal
                print("Sorry, this recipe could not be displayed.")
            else:
                #we print all the information in a easily readable format
                print('\n***', selected_recipe['title'], '***')
                print()
                print("Ingredients:")
                for ingredient in ingredients:
                    print("-", ingredient.strip())
                print()
                print("Instructions:")
                for direction in directions:
                    print("-", direction.strip())
        else:
            print("Invalid selection.") #again we account for incorrect input number

    print(lines)
    #lastly we give the option to start again or finish the search
    restart = input("Would you like to start again? (yes/no): ").strip().lower()
    if restart not in ("yes", "y"):
        break # Exit the loop if the user doesn't want to start again

print(lines)
print("Thank you for using No Crumbs! :)")

Welcome to No Crumbs, your slayest recipe finder!
Please select a category:
[1] Vegetarian
[2] Vegan
[3] No restrictions
Enter the number corresponding to your choice: 1
----------------------------------------
What ingredients do you have? (Please use commas to separate them)
tomato, -pasta
----------------------------------------
Here are some recipes containing your ingredient(s) (up to 10):
1. Creole Catsup
2. Cherry Tomato-Gruyere Tartlets
3. Black Pepper Honey Roasted Tomato Galette With Mascarpone Cream
4. Baked Lentil Cheese Burgers
5. Walk Away Tomato Sauce 
6. Ancho, Beef, And Bulgur Chili 
7. Cider Baked Beans
8. Firehouse Roasted Peppers and Garlic Salsa
9. Tilapia With Charred Cherry Tomatoes And Balsamic Reduction
10. 34 Morton Street Specials
----------------------------------------
Select a recipe number to view its ingredients:
2

*** Cherry Tomato-Gruyere Tartlets ***

Ingredients:
- 1 12 cups flour
- 1 12 tablespoons unsalted butter
- 18 teaspoon salt
- 5 -7 tablespo

In [None]:
#@title Error testing
#we used this for error testing, to easily access a specific recipe and its ingredients
def print_ingredients(df, recipe_title):
    """Print the 'ingredients' and 'NER' entries of the recipe titled `recipe_title`.

    Each column value is split on commas and printed one piece per line, so
    the two columns can be compared by eye. If no recipe has that exact
    title, a "not found" message is printed instead.

    Args:
        df: recipes dataframe with 'title', 'ingredients' and 'NER' columns.
        recipe_title: exact title of the recipe to look up.
    """
    # first check that the recipe title is correct
    if recipe_title not in df['title'].values:
        print(f"Recipe '{recipe_title}' not found!")
        return

    title_matches = df['title'] == recipe_title
    sections = (
        ('ingredients', f"Ingredients for '{recipe_title}':"),
        ('NER', f"Ingredients in NER for '{recipe_title}':"),
    )
    for column, heading in sections:
        # when several rows share the title, the first one is used
        raw_text = df.loc[title_matches, column].values[0]
        print(heading)
        for piece in raw_text.split(','):
            print(f"- {piece.strip()}")

# Example lookup by exact title; prints a "not found" message when no loaded recipe has this title.
print_ingredients(df, "Polish potato salad recipe")