In [108]:
import json
import random
import re

import pandas as pd
from tqdm import tqdm

In [77]:
# Load the full recipe dataset from the data directory (path is relative to the notebook).
df = pd.read_csv("../data/full_dataset.csv")

In [64]:
# Restrict the frame to the four columns the rest of the notebook reads.
# NOTE: this rebinds `df`, so any other columns loaded from the CSV are no longer reachable through it.
important_columns = ["title", "ingredients", "directions", "NER"]
df = df[important_columns]

In [22]:
# Preview the first rows of the selected columns.
df.head()

Unnamed: 0,title,ingredients,directions,NER
0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...","[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu..."
1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....","[""beef"", ""chicken breasts"", ""cream of mushroom..."
2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C...","[""frozen corn"", ""cream cheese"", ""butter"", ""gar..."
3,Chicken Funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""Boil and debone chicken."", ""Put bite size pi...","[""chicken"", ""chicken gravy"", ""cream of mushroo..."
4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""Combine first four ingredients and press in ...","[""peanut butter"", ""graham cracker crumbs"", ""bu..."


In [171]:
# Array of the raw NER cells; each cell is a string holding a bracketed, quoted list of ingredient names.
all_recipes = df["NER"].values

In [172]:
def get_cleaned_ingredient(ingredient):
    """Return a lower-cased ingredient name with leading clutter removed.

    Two clean-up passes are applied, in this order:

    1. If the text begins with an optional non-word character, an optional
       single word character and then one whitespace character (for example
       "1 ", "* " or a lone leading space), that prefix is cut off once.
    2. Any run of leading punctuation characters from a fixed set is removed.

    Args:
        ingredient: raw ingredient name (str).

    Returns:
        The cleaned name; may be an empty string.
    """
    cleaned = ingredient.lower()

    # re.match only succeeds at position 0, which is the only position
    # at which the prefix is allowed to be removed.
    leading = re.match(r"\W?\w?\s", cleaned)
    if leading is not None:
        cleaned = cleaned[leading.end():]

    # Drop every leading character that belongs to the punctuation set.
    return cleaned.lstrip("()[]{}$><\\'*+,_-./:@#")

In [173]:
ingr_dict = {}
cleaned_recipes = []

# Tally how many times each cleaned ingredient name appears over all recipes.
for recipe in tqdm(all_recipes):

    # Cut the opening '["' and closing '"]', then split on the '", "' separators
    # to obtain the individual ingredient strings.
    ingredients = recipe[2:-2].strip("[]").split('", "')

    for ingredient in ingredients:
        ingredient = get_cleaned_ingredient(ingredient)
        if not ingredient:
            # names that clean down to an empty string are not counted
            continue
        ingr_dict[ingredient] = ingr_dict.get(ingredient, 0) + 1

# Ingredients that occur more than 500 times form the accepted vocabulary.
ingr_list = [name for name, count in ingr_dict.items() if count > 500]
ingr_set = set(ingr_list)

100%|█████████████████████████████████████████████████████████████████████| 2231142/2231142 [00:31<00:00, 71065.53it/s]


In [176]:
rows_dropped = 0
row_idcs = []

# Collect the index labels of every recipe that uses at least one ingredient
# outside the accepted vocabulary (`ingr_set`).
#
# The NER column is iterated directly, paired with the frame's index labels:
#   * df.iloc[row_idx]["NER"] materialises a whole row Series on every
#     iteration, which dominates the runtime on a frame of this size;
#   * the collected values are later handed to df.drop, which interprets them
#     as index *labels*, so labels (not positions) are what must be recorded.
for row_idx, ner_string in tqdm(zip(df.index, df["NER"]), total=len(df)):

    recipe_ingredients = ner_string[2:-2].strip("[]").split('", "')
    recipe_ingredients = {get_cleaned_ingredient(ingredient) for ingredient in recipe_ingredients}

    if not recipe_ingredients.issubset(ingr_set):
        row_idcs.append(row_idx)
        rows_dropped += 1

print("Rows dropped:", rows_dropped)


100%|█████████████████████████████████████████████████████████████████████| 2231142/2231142 [02:14<00:00, 16566.66it/s]

Rows dropped: 933426





In [177]:
# Remove the flagged recipes; DataFrame.drop interprets `row_idcs` as index labels.
new_df = df.drop(row_idcs)
# Preview the surviving rows (the original index labels are kept, so gaps are expected).
new_df[important_columns].head()

Unnamed: 0,title,ingredients,directions,NER
1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....","[""beef"", ""chicken breasts"", ""cream of mushroom..."
2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C...","[""frozen corn"", ""cream cheese"", ""butter"", ""gar..."
4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""Combine first four ingredients and press in ...","[""peanut butter"", ""graham cracker crumbs"", ""bu..."
5,Cheeseburger Potato Soup,"[""6 baking potatoes"", ""1 lb. of extra lean gro...","[""Wash potatoes; prick several times with a fo...","[""baking potatoes"", ""extra lean ground beef"", ..."
6,Rhubarb Coffee Cake,"[""1 1/2 c. sugar"", ""1/2 c. butter"", ""1 egg"", ""...","[""Cream sugar and butter."", ""Add egg and beat ...","[""sugar"", ""butter"", ""egg"", ""buttermilk"", ""flou..."


In [302]:
# Persist the reduced dataset next to the full one; index=False leaves the index labels out of the file.
new_df.to_csv("../data/reduced_dataset.csv", index=False)

In [179]:
# (name, count) pairs for every ingredient seen more than 500 times,
# ordered from most to least frequent.
ingr_with_freqs = sorted(
    ((name, count) for name, count in ingr_dict.items() if count > 500),
    key=lambda pair: pair[1],
    reverse=True,
)
# Staples that every generated pantry starts with.
available_ingredients = ["salt", "sugar", "water", "olive oil", "pepper"]

In [313]:
def get_random_weighted_pantry(items_with_frequencies=ingr_with_freqs, available_ingredients=available_ingredients, sample_size=35):
    """Build a pantry by topping up a starting list with frequency-weighted random picks.

    Args:
        items_with_frequencies: sequence of (item, frequency) pairs; the
            frequency is used as the sampling weight.
        available_ingredients: items the pantry starts with. This list is
            NOT modified; a copy is extended instead.
        sample_size: number of entries the returned pantry should reach.

    Returns:
        A new list that starts with ``available_ingredients`` (in order) followed
        by distinct sampled items until ``sample_size`` entries are reached. If the
        starting list already has ``sample_size`` or more entries, a copy of it is
        returned unchanged.

    Raises:
        ValueError: if ``sample_size`` cannot be reached because there are not
            enough distinct items with a positive weight (this situation would
            otherwise loop forever), or if ``items_with_frequencies`` is empty.
    """
    # Work on a copy. Default argument values are created once, at definition
    # time, so appending to the default list would make every later call see an
    # already-full pantry and return the very same ingredients again.
    pantry_items = list(available_ingredients)

    items, frequencies = zip(*items_with_frequencies)

    # Fail fast when the target size is unreachable instead of sampling forever.
    already_present = set(pantry_items)
    drawable = {item for item, weight in zip(items, frequencies) if weight > 0} - already_present
    if len(pantry_items) + len(drawable) < sample_size:
        raise ValueError(
            f"Cannot build a pantry of {sample_size} items: only "
            f"{len(pantry_items) + len(drawable)} distinct items are available."
        )

    while len(pantry_items) < sample_size:

        # sample a random ingredient and add to pantry if it's not in there yet
        sampled_item = random.choices(items, weights=frequencies, k=1)[0]
        if sampled_item not in already_present:
            already_present.add(sampled_item)
            pantry_items.append(sampled_item)

    return pantry_items


def _parse_json_list(raw_field):
    """Decode a list-valued dataset cell (a JSON array stored as text) into a list of str.

    Falls back to plain slicing/splitting when the cell is not a valid JSON array.
    """
    try:
        parsed = json.loads(raw_field)
    except (TypeError, ValueError):
        parsed = None

    if isinstance(parsed, list):
        return [str(item) for item in parsed]

    # Not a JSON array: strip the outer '["' / '"]' and split on the separators.
    return raw_field[2:-2].strip("[]").split('", "')


def format_recipe(recipe):
    """Render one recipe row as prompt text and extract its cleaned ingredient names.

    Args:
        recipe: a DataFrame row (or any mapping) providing the keys "title",
            "ingredients", "directions" and "NER". The last three are strings
            holding a bracketed list of quoted items.

    Returns:
        (formatted_output, base_ingredients): the multi-line recipe text, and the
        list of ingredient names from "NER" after ``get_cleaned_ingredient``.
    """
    title = recipe["title"]

    # The display columns are decoded as JSON so that escape sequences stored in
    # the text (e.g. "\u00b0" for the degree sign, or escaped quotes) are rendered
    # as real characters instead of being printed literally.
    ingredients = _parse_json_list(recipe["ingredients"])
    directions = _parse_json_list(recipe["directions"])

    # NER is deliberately still split by slicing: the ingredient vocabulary and
    # the pantry are built from names produced this way, so the names must stay
    # byte-identical to be comparable.
    base_ingredients = recipe["NER"][2:-2].strip("[]").split('", "')
    base_ingredients = [get_cleaned_ingredient(ingredient) for ingredient in base_ingredients]

    parts = [f"Title of Recipe: {title}\n", "Ingredients and their Quantities:\n"]
    parts.extend(f"- {ingredient}\n" for ingredient in ingredients)

    parts.append("Directions\n")
    parts.extend(f"{step}. {direction}\n" for step, direction in enumerate(directions, start=1))

    parts.append(f"End of Recipe: {title}.\n")
    return "".join(parts), base_ingredients
    

def get_formatted_recipes(all_recipes, sample_size):
    """Pick ``sample_size`` random recipes from ``all_recipes`` and format each one.

    Args:
        all_recipes: DataFrame of recipes; every row must provide the columns
            that ``format_recipe`` reads.
        sample_size: number of distinct rows to draw.

    Returns:
        A list of ``(formatted_recipe, required_ingredients)`` tuples, one per
        sampled row, or ``None`` (after printing a message) when ``sample_size``
        exceeds the number of rows.
    """
    formatted_outputs = []

    if sample_size > len(all_recipes):
        print("Invalid Sample Size")
        return None

    # get random recipes -- drawn from the frame that was passed in, so the
    # sampled positions and the rows they select always refer to the same data
    random_idcs = random.sample(range(len(all_recipes)), sample_size)
    random_recipes = all_recipes.iloc[random_idcs]

    # format recipes and append to list
    for row_idx in range(len(random_recipes)):
        formatted_outputs.append(format_recipe(random_recipes.iloc[row_idx]))

    return formatted_outputs


def get_intersection(pantry, recipe_ingredients):
    """Return the set of items that occur in both ``pantry`` and ``recipe_ingredients``."""
    owned = set(pantry)
    needed = set(recipe_ingredients)
    return owned & needed


def sort_recipes(formatted_output, pantry, sorting="balanced"):
    """Order recipes by how well ``pantry`` covers their ingredients.

    Args:
        formatted_output: iterable of ``(recipe_text, ingredient_list)`` tuples.
        pantry: iterable of ingredient names that are on hand.
        sorting: ranking strategy --
            "relative": highest share of the recipe's ingredients already in the
                pantry first (prioritises using what you have);
            "absolute": fewest missing ingredients first (prioritises buying as
                little as possible);
            "balanced": highest ``share_owned - 0.5 * share_missing`` first.

    Returns:
        A new list with the same tuples, best match first, or ``None`` when
        ``sorting`` is not one of the three supported values.

    A recipe with an empty ingredient list is scored 0.0 by "relative" and
    "balanced" instead of raising ZeroDivisionError.
    """
    # Build the pantry set once instead of once per recipe.
    pantry_set = set(pantry)

    def counts(recipe_ingredients):
        # (total listed, how many of them are in the pantry)
        total = len(recipe_ingredients)
        owned = len(pantry_set.intersection(recipe_ingredients))
        return total, owned

    def relative_score(entry):
        total, owned = counts(entry[1])
        return owned / total if total else 0.0

    def missing_count(entry):
        total, owned = counts(entry[1])
        return total - owned

    def balanced_score(entry):
        total, owned = counts(entry[1])
        if not total:
            return 0.0

        # Relative proportion of ingredients you have
        share_owned = owned / total

        # Penalize recipes with more missing ingredients, but allow some trade-off
        share_missing = (total - owned) / total

        # NOTE(review): since share_missing == 1 - share_owned, this equals
        # 1.5 * share_owned - 0.5, i.e. it ranks recipes like "relative" does.
        # Tweak the weights/terms if a real trade-off is wanted.
        return share_owned - 0.5 * share_missing

    if sorting == "relative":
        return sorted(formatted_output, key=relative_score, reverse=True)
    if sorting == "absolute":
        return sorted(formatted_output, key=missing_count)
    if sorting == "balanced":
        return sorted(formatted_output, key=balanced_score, reverse=True)

    # Unknown strategy: signalled to the caller with None.
    return None


def get_output_as_string(list_of_recipe_tuples, pantry=None):
    """Compose the question prompt: pantry contents, the candidate recipes, and the ranking request.

    Args:
        list_of_recipe_tuples: iterable of ``(recipe_text, ingredient_list)``
            tuples; only the text is used here.
        pantry: iterable of ingredient names. When omitted, the notebook-level
            variable ``pantry`` is looked up at call time.

    Returns:
        The prompt as a single string.

    Raises:
        NameError: if ``pantry`` is omitted and no notebook-level ``pantry`` exists.
    """
    # Resolve the default at call time. Binding it in the signature would
    # require `pantry` to exist before this function is defined and would keep
    # using that old object even after a new pantry has been generated.
    if pantry is None:
        try:
            pantry = globals()["pantry"]
        except KeyError:
            raise NameError("no pantry was passed and no global 'pantry' is defined") from None

    string_output = "Here are the ingredients you have available in your pantry:\n"
    string_output += "".join(f"{ingredient}, " for ingredient in pantry)
    # drop the trailing ", " before closing the sentence
    string_output = string_output[:-2] + ".\n\n"

    string_output += "Here are the suggested recipes:\n"
    string_output += "".join(f"{whole_recipe}\n\n" for (whole_recipe, _) in list_of_recipe_tuples)

    string_output += "Based on the items in my pantry, how would you rank these recipes? I want to use as many ingredients from my pantry as possible."
    return string_output
    

def get_correct_ranking_as_string(list_of_recipe_tuples, sorting="relative", pantry=None):
    """Compose the reference answer: the recipes in ranked order, each with a justification.

    Args:
        list_of_recipe_tuples: iterable of ``(recipe_text, ingredient_list)`` tuples.
        sorting: ranking strategy passed on to ``sort_recipes``. A justification
            sentence is written for "relative" and "absolute"; for "balanced"
            the sentence is left without a reason.
        pantry: iterable of ingredient names. When omitted, the notebook-level
            variable ``pantry`` is looked up at call time.

    Returns:
        The ranking text as a single string.

    Raises:
        NameError: if ``pantry`` is omitted and no notebook-level ``pantry`` exists.
        ValueError: if ``sort_recipes`` does not support ``sorting``.
    """
    # Resolve the default at call time (see the signature: None means "use the
    # notebook's current pantry"), so the function can be defined before a
    # pantry exists and always sees the latest one.
    if pantry is None:
        try:
            pantry = globals()["pantry"]
        except KeyError:
            raise NameError("no pantry was passed and no global 'pantry' is defined") from None

    # First sort the recipe_tuples -- with the requested strategy, so the order
    # agrees with the justification written below.
    ranked_recipes = sort_recipes(list_of_recipe_tuples, pantry, sorting)
    if ranked_recipes is None:
        raise ValueError(f"Unsupported sorting: {sorting!r}")

    # Now construct the string
    sections = []

    for rank, (whole_recipe, recipe_ingredients) in enumerate(ranked_recipes, start=1):

        intersection = get_intersection(pantry, recipe_ingredients)
        section = f"Rank: {rank}\n"
        section += f"{whole_recipe}\n"
        section += f"Rank {rank} has been chosen for this recipe "

        if sorting == "relative":
            section += f"because you have {len(intersection)} out of {len(recipe_ingredients)} ingredients in your pantry!\n"
        elif sorting == "absolute":
            section += f"because you only need {len(recipe_ingredients) - len(intersection)} ingredients more!\n"

        # If some ingredients are missing, list them. The list itself decides
        # whether the header is printed: a count-based check can be positive
        # while nothing is actually missing (duplicate names in the recipe).
        missing = [ingredient for ingredient in recipe_ingredients if ingredient not in intersection]
        if missing:
            section += "Here are the ingredients you still need:\n"
            section += ", ".join(f"{ingredient}" for ingredient in missing)

        section += "\n\n\n"
        sections.append(section)

    return "".join(sections)

In [311]:
# Generate a pantry with the default settings and format 3 randomly chosen recipes from the reduced frame.
# NOTE(review): no random seed is set in the visible cells, so both results change from run to run.
pantry = get_random_weighted_pantry()
output = get_formatted_recipes(new_df, 3)


In [299]:
# Show the question prompt built from the sampled recipes; print() is used so the newlines render.
print(get_output_as_string(output))

Here are the ingredients you have available in your pantry:
salt, sugar, water, olive oil, pepper, parmesan cheese, thyme, worcestershire sauce, milk, allspice, yeast, cilantro, white wine, pimento, pasta, fresh spinach, sweet butter, flour, cider vinegar, fruit, egg, black peppercorns, paprika, oil, onion, crust, cornstarch, yellow cake mix, red onion, vanilla, brown sugar, cream cheese, celery, white pepper, cinnamon.

Here are the suggested recipes:
Title of Recipe: Peanut Butter Picnic Cake
Ingredients and their Quantities:
- 12 cup butter
- 1 13 cups sugar
- 14 cup smooth peanut butter
- 1 teaspoon vanilla
- 2 eggs
- 2 cups flour
- 2 teaspoons baking powder
- 1 teaspoon salt
- 1 cup milk
- 14 cup butter
- 14 cup peanut butter
- 1 teaspoon vanilla
- 2 12 cups icing sugar
- 3 -4 tablespoons milk
Directions
1. Cream butter and sugar.
2. Add eggs, peanut butter, and vanilla.
3. Beat well.
4. Add dry ingredients alternately with milk, mixing well after each addition.
5. Pour into greas

In [314]:
# Show the reference ranking for the same recipes, using the function's default sorting argument.
print(get_correct_ranking_as_string(output))

Rank: 1
Title of Recipe: Delphine'S Famous Oatmeal Chocolate Chips Cookies
Ingredients and their Quantities:
- 1 c. melted butter or margarine
- 1 c. sugar
- 1 c. brown sugar
- 2 eggs (fresh)
- 1 tsp. vanilla
- 1 3/4 plus c. flour
- 1 tsp. salt
- 1 tsp. baking soda
- 3 c. old-fashioned oats
- chocolate chips
Directions
1. Mix butter, sugars, eggs and vanilla.
2. Mix flour, soda and salt; add to first mixture.
3. Add oats and chocolate chips.
4. Bake at 350\u00b0 for 10 to 14 minutes in very lightly greased pan.
5. Do not overbake.
End of Recipe: Delphine'S Famous Oatmeal Chocolate Chips Cookies.

Rank 1 has been chosen for this recipe because you have 5 out of 10 ingredients in your pantry!
Here are the ingredients you still need:
butter, eggs, baking soda, old-fashioned oats, chocolate chips


Rank: 2
Title of Recipe: Western Stew
Ingredients and their Quantities:
- cabbage
- 1 c. sugar
- 1 tsp. celery seed
- 3/4 c. salad oil
- 1 c. vinegar
- salt and pepper to taste
Directions
1. Shr

In [308]:
# Placeholder question/answer pair used to try out the export format.
question = "HAHAHAHAH???"
answer = "HAHAHAHAHA!!!"

# Serialise with json.dumps rather than pasting the values into a template:
# quotes, backslashes and newlines inside the texts are escaped, so the result
# is always valid JSON.
json_format = json.dumps([{"question": question, "answer": answer}])

In [309]:
# Display the serialised string to check its shape.
json_format

'[{"question": "HAHAHAHAH???", "answer": "HAHAHAHAHA!!!"}]'