In [1]:
import ast

import pandas as pd

In [3]:
recipe_dataset = pd.read_csv("data/full_dataset.csv", index_col=0)
recipe_dataset.head()

Unnamed: 0,title,ingredients,directions,link,source,NER
0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered,"[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu..."
1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",www.cookbooks.com/Recipe-Details.aspx?id=699419,Gathered,"[""beef"", ""chicken breasts"", ""cream of mushroom..."
2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C...",www.cookbooks.com/Recipe-Details.aspx?id=10570,Gathered,"[""frozen corn"", ""cream cheese"", ""butter"", ""gar..."
3,Chicken Funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""Boil and debone chicken."", ""Put bite size pi...",www.cookbooks.com/Recipe-Details.aspx?id=897570,Gathered,"[""chicken"", ""chicken gravy"", ""cream of mushroo..."
4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""Combine first four ingredients and press in ...",www.cookbooks.com/Recipe-Details.aspx?id=659239,Gathered,"[""peanut butter"", ""graham cracker crumbs"", ""bu..."


In [4]:
len(recipe_dataset)

2231142

In [40]:
recipe_dataset.drop(axis=1, labels=["link", "source"], inplace=True)

In [55]:
import re


# Based on categories from https://cookbooks.com/recipes-search.aspx?id=1&tag=grill
def get_category(row):
    # 1. Specific Dish Types (Check Title first)
    specific_categories = {
    "Soup": ["soup", "stew", "chowder"],
    "Grains and Pasta": ["pasta", "spaghetti", "fettuccine", "lasagna","noodle", "noodles", "ramen", "udon", "soba", "mein", "vermicelli", "pho"],
    "Breads": ["pastry", "bread", "loaf", "muffin", "rolls", "muffins", "biscuit", "scone", "shortbread", "cracker", "crackers", "pizza", "calzone"],   
    "Salads": ["salad", "slaw", "vinaigrette"],
    "Drinks": ["drink", "smoothie", "cocktail", "punch", "beverage", "tea", "sangria"],
    "Condiments and sides": ["sandwich", "burger", "slider", "wrap", "panini", 
                                  "sauce", "dressing", "gravy", "pesto", "glaze","relish",
                                  "dip", "spread", "hummus", "guacamole"],   
    }
    # 2. Broad Ingredient
    broad_categories = {
    "Sweet": ["dessert", "caramel", "frosting", "whipped cream", "marshmallow", "marshmallows", "chocolate", "condensed milk","cake", "cupcake", "cheesecake", "gateau", "pie", "tart", "cobbler", "galette", "candy", "fudge", "truffle", "toffee", "brittle", "pralines", "cookie", "biscotti", "macaron", "macaroon", "brownie", "blondie"],
    "Baking": ["bake", "yeast", "vanilla", "shortening", "souffle", "souffles"],    
    "Mains": ["chicken", "ham", "hamburger","hamburgers", "pork", "meat",  "lamb", "bacon", "beef", "turkey", "steak", "sausage", "grilled", "grill", "bbq", "barbecue", "skewers", "fish", "prawns", "prawn", "shrimp", "tuna", "salmon", "crab", "lobster", "scallops", "oyster", "oysters",
              "casserole", "gratin", "hotdish"],
    "Vegan": ["vegan", "tofu", "lentils", "tempeh", "lentil", "chickpea", "seitan"]
    }
    non_vegan = ["cheese", "cream", "milk", "cream cheese", "condensed milk", "chicken", "ham", "pork", "meat",  "lamb", "bacon", "beef", "turkey", "steak", "sausage","fish", "prawns", "prawn", "shrimp", "tuna", "salmon", "crab", "lobster", "scallops", "oyster", "oysters", "egg", "eggs", "butter", "honey", "frosting", "whipped cream", "gelatin", "candy", "marshmallow", 
                 "marshmallows", "hamburger","hamburgers", "cake"]
    
    pattern = re.compile(r'\b(' + '|'.join(map(re.escape, non_vegan)) + r')\b', flags=re.IGNORECASE)
    
    full_data = ""
    if not pattern.search(row["NER"]):
        return "Vegan"
    
    if isinstance(row["title"], str):
        title = row["title"].lower()
        for cat in specific_categories:
            keywords = specific_categories[cat]
            if any(keyword in title for keyword in keywords):
                return cat        
        full_data += row["title"]   
    
    full_data += " "
    full_data = (full_data + " " + row["NER"] + " " + row["ingredients"]).lower()
    
    for broad_cat in broad_categories:
        keywords = broad_categories[broad_cat]
        if any(keyword in full_data for keyword in keywords):
            return broad_cat
    return "Other"

recipe_dataset["category"] = recipe_dataset.apply(get_category, axis=1)

In [56]:
value_counts = recipe_dataset["category"].value_counts()
for category, count in value_counts.items():
    print(f"{category} === {count} === {(count/len(recipe_dataset) * 100):.2f}%")

Sweet === 551923 === 24.74%
Vegan === 424420 === 19.02%
Mains === 386566 === 17.33%
Other === 173941 === 7.80%
Breads === 170246 === 7.63%
Condiments and sides === 150482 === 6.74%
Salads === 100107 === 4.49%
Baking === 99649 === 4.47%
Soup === 84005 === 3.77%
Grains and Pasta === 58802 === 2.64%
Drinks === 31001 === 1.39%


In [54]:
recipe_dataset.loc[recipe_dataset["category"] == "Vegan"][30:40]

Unnamed: 0,title,ingredients,directions,NER,category
148,"Chocolate ""Stuff""","[""1 large box instant chocolate pudding mix"", ...","[""Make pudding as directed on package, then ad...","[""chocolate pudding"", ""vanilla wafers"", ""pecans""]",Vegan
158,Eggplant Spaghetti Sauce,"[""1/2 c. oil (I use olive oil, extra light)"", ...","[""Heat oil in large pot."", ""Add all the vegeta...","[""oil"", ""eggplant"", ""onions"", ""cauliflower"", ""...",Vegan
161,Cranberry Punch,"[""1 qt. cranberry juice"", ""1 qt. ginger ale"", ...","[""Mix and put in punch bowl."", ""Top with sherb...","[""cranberry juice"", ""ginger ale"", ""O"", ""boilin...",Vegan
197,Festive Fruit Salad,"[""1 (20 oz.) can pineapple chunks, drained (re...","[""Combine pineapple, oranges, grapes, strawber...","[""pineapple"", ""mandarin oranges"", ""grapes"", ""m...",Vegan
202,Fruit Medley,"[""3 Tbsp. tapioca"", ""1 c. water"", ""1/2 c. suga...","[""Allow tapioca, 1 cup water and sugar to stan...","[""tapioca"", ""water"", ""sugar"", ""water"", ""orange...",Vegan
209,Leroy'S Heavenly Hash,"[""No. 2 can fruit cocktail"", ""4 bananas, cut u...","[""Mix in a large bowl."", ""Refrigerate and serv...","[""No"", ""bananas"", ""marshmallows"", ""pecan piece...",Vegan
214,Mistletoe Punch,"[""1 (6 oz.) can frozen lemonade concentrate, t...","[""Combine all ingredients, except ginger ale a...","[""frozen lemonade concentrate"", ""orange juice ...",Vegan
224,Spaghetti Salad,"[""1 lb. spaghetti"", ""3 cucumbers, diced"", ""4 t...","[""In a saucepan, cook spaghetti until tender.""...","[""spaghetti"", ""cucumbers"", ""tomatoes"", ""stalks...",Vegan
233,Beer Biscuits,"[""2 c. biscuit mix"", ""1 1/2 Tbsp. sugar"", ""6 o...","[""Dissolve sugar in beer and add to biscuit mi...","[""biscuit mix"", ""sugar"", ""warm beer""]",Vegan
236,Twinkie Dessert,"[""14 Twinkies"", ""1 large box strawberry jello""...","[""Line bottom of 13x9 pan with Twinkies."", ""Mi...","[""Twinkies"", ""strawberry jello"", ""water"", ""fro...",Vegan


In [44]:
results = recipe_dataset[recipe_dataset["title"].str.contains(" Appetizer", case=False, na=False)]
print(results.shape)
results.head()

(4030, 5)


Unnamed: 0,title,ingredients,directions,NER,category
827,Hot Tuna Appetizer,"[""1 (7 oz.) can water-packed tuna, drained and...","[""In medium bowl, combine tuna, anchovies, gre...","[""water"", ""anchovy"", ""green pepper"", ""pimento""...",Mains
846,Party Pizza Appetizers(Makes 90),"[""1 lb. hot sausage, cooked, drained and crumb...","[""Combine all ingredients, except biscuits."", ...","[""hot sausage"", ""onion"", ""sharp cheese"", ""Parm...",Breads
1041,"""Cholives"" Puff Appetizers","[""stuffed olives"", ""1/4 c. soft butter"", ""5 oz...","[""Soften butter."", ""Add cheese and blend well....","[""olives"", ""butter"", ""English cheese spread"", ...",Other
2938,Taco Salad(Cold Appetizer),"[""1 c. mayonnaise"", ""1 c. sour cream"", ""1 pkg....","[""Mix avocados with lemon juice to a spreading...","[""mayonnaise"", ""sour cream"", ""taco"", ""beans"", ...",Salads
3520,Sausage Appetizer,"[""1 lb. ground chuck"", ""1 lb. sausage"", ""1 lb....","[""Brown meat and drain. Add diced Velveeta; st...","[""ground chuck"", ""sausage"", ""Worcestershire sa...",Mains


In [62]:
recipe_sample_balanced = recipe_dataset.groupby("category", group_keys=False).apply(
    lambda x: x.sample(min(len(x), 7000), random_state=42).reset_index(drop=True)
)
recipe_sample_balanced.head()

  recipe_sample_balanced = recipe_dataset.groupby("category", group_keys=False).apply(


Unnamed: 0,title,ingredients,directions,NER,category
0,Butter Baked Rice (Oamc),"[""1 cup long grain rice"", ""1 teaspoon salt"", ""...","[""Measure rice and salt in a bowl and pour on ...","[""long grain rice"", ""salt"", ""butter"", ""garlic""...",Baking
1,Vegan Orange Dream,"[""1 12 cups orange juice, chilled"", ""1 cup lig...","[""In a blender, combine everything except the ...","[""orange juice"", ""light vanilla soymilk"", ""sil...",Baking
2,Baked Lima Beans,"[""1 lb. large dried lima beans, soaked overnig...","[""Soak beans overnight and rinse."", ""Cover wit...","[""beans"", ""brown sugar"", ""dry mustard"", ""onion...",Baking
3,Stromboli,"[""3 c. flour"", ""1 pkg. dry yeast"", ""2 Tbsp. su...","[""Combine 1 1/4 cups flour, yeast, sugar and s...","[""flour"", ""yeast"", ""sugar"", ""cooking oil"", ""sa...",Baking
4,Baked Clams,"[""3 pounds Littleneck clams"", ""1 cup cornmeal""...","[""Preheat oven to 350 degrees F."", ""Place clam...","[""Littleneck clams"", ""cornmeal"", ""salt"", ""brea...",Baking


In [63]:
print(recipe_sample_balanced["category"].value_counts())

category
Baking                  7000
Breads                  7000
Condiments and sides    7000
Drinks                  7000
Grains and Pasta        7000
Mains                   7000
Other                   7000
Salads                  7000
Soup                    7000
Sweet                   7000
Vegan                   7000
Name: count, dtype: int64


In [64]:
recipe_sample_balanced.to_csv("recipe_sample_balanced.csv")