In [1]:
import pandas as pd
import ast
import re
from sqlalchemy import create_engine
from __future__ import annotations

# Read each scraped CSV into its own DataFrame. The target names on the left
# and the paths on the right are paired up by position.
recipes, products, fruit_veg, meat = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../web_scraper/trader_joes_recipes.csv",
        "../web_scraper/trader_joes_products.csv",
        "../web_scraper/traderjoes_fresh-fruits-veggies_products.csv",
        "../web_scraper/traderjoes_meat_products.csv",
    )
)

In [2]:
def _parse_ingredient_list(raw_value):
    """Evaluate a stringified Python literal; pass any non-string through as-is."""
    if not isinstance(raw_value, str):
        return raw_value
    return ast.literal_eval(raw_value)


# Replace the "ingredients" column with its parsed form, cell by cell.
recipes["ingredients"] = recipes["ingredients"].apply(_parse_ingredient_list)

def split_ingredient(text):
    """
    Split strings like '4 tablespoons TJ’s Salted Butter' into:
    quantity = '4 tablespoons', ingredient = "TJ’s Salted Butter"
    """

    match = re.match(r"([\d¼½¾⅓⅔⅛⅜⅝⅞\s\-–/a-zA-Z]+)\s+(.*)", text)
    if match:
        qty = match.group(1).strip()
        name = match.group(2).strip()
    else:
        qty, name = None, text.strip()
    return pd.Series([qty, name])

# One row per entry of each recipe's ingredient list, renumbered from 0.
recipes_exploded = recipes.explode("ingredients", ignore_index=True)

# split_ingredient yields (quantity, name) in that order for every row.
split_parts = recipes_exploded["ingredients"].apply(split_ingredient)
recipes_exploded[["quantity_text", "ingredient_name"]] = split_parts

cookbook_columns = ["title", "category", "ingredient_name", "quantity_text"]
cookbook_df = recipes_exploded[cookbook_columns]

# Distinct ingredient names, shown as a one-column table.
possible_ingredients = cookbook_df["ingredient_name"].unique()
possible_ingredients_df = pd.DataFrame(possible_ingredients, columns=["Ingredient"])
possible_ingredients_df

Unnamed: 0,Ingredient
0,TJ’s Salted Butter
1,TJ’s Fresh Garlic
2,TJ’s Fresh Ginger
3,"TJ’s Jasmine Rice, rinsed"
4,water
...,...
2590,"TJ's Blueberries, fresh or frozen"
2591,"TJ's Organic Basil, cut into strips"
2592,TJ's Ciliegine Mozzarella
2593,TJ's Balsamic Vinaigrette


In [3]:
# Create the SQLite DB. The URL holds a relative path, so cookbook.db is
# resolved against the kernel's current working directory.
engine = create_engine("sqlite:///cookbook.db")

# Unique ingredient and quantity lookup tables. After drop_duplicates the
# index is reset to 0..n-1 and named, so it can serve as the surrogate key.
ingredient_df = cookbook_df[["ingredient_name"]].drop_duplicates().reset_index(drop=True)
ingredient_df.index.name = "ingredient_id"

quantity_df = cookbook_df[["quantity_text"]].drop_duplicates().reset_index(drop=True)
quantity_df.index.name = "quantity_id"

# Recipe table: one row per distinct combination of these six columns
recipe_df = recipes[["title", "category", "url", "image_url", "serves", "time"]].drop_duplicates().reset_index(drop=True)
recipe_df.index.name = "recipe_id"

# Write to database (if_exists="replace" drops and recreates each table)
ingredient_df.to_sql("ingredient", engine, if_exists="replace", index=True, index_label="ingredient_id")
quantity_df.to_sql("quantity", engine, if_exists="replace", index=True, index_label="quantity_id")
recipe_df.to_sql("recipe", engine, if_exists="replace", index=True, index_label="recipe_id")

# Create Cookbook table: swap each text column for the id of its lookup row.
# pandas matches null merge keys to each other, so ingredient lines without a
# parsed quantity keep their row (pointing at the null quantity_text entry).
cookbook_link = (
    cookbook_df.merge(ingredient_df.reset_index(), on="ingredient_name")
               .merge(quantity_df.reset_index(), on="quantity_text")
               # Join on category as well as title. recipe_df is de-duplicated
               # on six columns, so a title can occur in more than one recipe
               # row; a title-only join would pair every ingredient line with
               # every such row and attach it to the wrong recipe_id.
               .merge(recipe_df.reset_index(), on=["title", "category"])
               [["recipe_id", "ingredient_id", "quantity_id"]]
)

cookbook_link.to_sql("recipe_ingredient", engine, if_exists="replace", index=False)

print("✅ Cookbook database built successfully.")

✅ Cookbook database built successfully.


In [4]:
# Option A: Define your first pantry here
pantry_names = [
    "TJ’s Salted Butter",
    "TJ’s Fresh Garlic",
    "TJ’s Jasmine Rice",
    "TJ’s Sea Salt",
    "TJ’s Large Eggs",
    "water",
    "TJ’s Zucchini, sliced into thin planks",
    "TJ’s Crunchy Sesame Sunflower Seeds Pepitas Salsa Macha",
    "TJ’s Authentic Greek Feta In Brine, crumbled",
    "TJ’s Lemon",
    "TJ’s Cauliflower, sliced into thick planks, core intact",
    "TJ’s Hot Honey Mustard Dressing",
    "TJ’s Heirloom Tomatoes, sliced into thick rounds",
    "TJ’s Organic Ranch Dressing",
    "TJ’s Olive Oil",
    "TJ’s Angus Chuck, Brisket, & Sirloin 1/3 lb. Ground Beef Patties",
    "TJ’s Triple Cream Soft Ripened Cambozola® Blue Cheese, sliced into rectangles",
    "TJ’s Brioche Buns",
    "TJ’s Fig Butter",
    "TJ’s Arugula",
    "TJ’s Zucchini, stems removed and sliced lengthwise into thin planks"
]

# Option B: or load from a CSV with a single column 'ingredient_name'
# pantry_names = pd.read_csv("my_pantry.csv")["ingredient_name"].tolist()

# Map names -> ingredient_id using the existing ingredient table.
# The inner merge is an exact string comparison: a pantry name only maps to an
# id when it is spelled identically to an ingredient_name row.
ingredient = pd.read_sql("SELECT ingredient_id, ingredient_name FROM ingredient", engine)
pantry_df = (
    pd.DataFrame({"ingredient_name": pantry_names})
    .merge(ingredient, on="ingredient_name", how="inner")
    [["ingredient_id"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Surface the names the inner merge dropped. Without this, a typo or a missing
# preparation suffix silently removes the item from the pantry and the later
# queries report it as "missing" even though it is on hand.
known_names = set(ingredient["ingredient_name"])
unmatched_names = [
    pantry_name
    for pantry_name in dict.fromkeys(pantry_names)  # de-duplicate, keep order
    if pantry_name not in known_names
]
if unmatched_names:
    print(
        "⚠️ Pantry names with no exact match in the ingredient table (skipped):",
        *unmatched_names,
        sep="\n   - ",
    )

# Optional: add free-text notes (e.g., amounts you actually have)
# pantry_df["on_hand_note"] = None

# Write/replace the pantry table
pantry_df.to_sql("pantry", engine, if_exists="replace", index=False)

print("✅ Pantry table created with", len(pantry_df), "items")

✅ Pantry table created with 20 items


In [5]:
tables = ['ingredient', 'pantry', 'quantity', 'recipe', 'recipe_ingredient']

# Preview every table: an upper-cased banner followed by its first five rows.
for table_name in tables:
    banner = f"\n--- {table_name.upper()} ---"
    print(banner)
    preview = pd.read_sql(f"SELECT * FROM {table_name} LIMIT 5;", engine)
    display(preview)


--- INGREDIENT ---


Unnamed: 0,ingredient_id,ingredient_name
0,0,TJ’s Salted Butter
1,1,TJ’s Fresh Garlic
2,2,TJ’s Fresh Ginger
3,3,"TJ’s Jasmine Rice, rinsed"
4,4,water



--- PANTRY ---


Unnamed: 0,ingredient_id
0,0
1,1
2,21
3,32
4,4



--- QUANTITY ---


Unnamed: 0,quantity_id,quantity_text
0,0,4 tablespoons
1,1,2 tablespoons finely chopped
2,2,1 tablespoon finely chopped
3,3,1 cup
4,4,2 cups hot



--- RECIPE ---


Unnamed: 0,recipe_id,title,category,url,image_url,serves,time
0,0,Aromatic Garlic Ginger Rice,Appetizers & Sides,https://www.traderjoes.com/home/recipes/aromat...,https://www.traderjoes.com/content/dam/trjo/co...,Serves 4,Time 20 mins
1,1,Roasted Pork Tenderloin & Potatoes with Honey ...,Dinner,https://www.traderjoes.com/home/recipes/roaste...,https://www.traderjoes.com/content/dam/trjo/co...,Serves 4,Time 45 mins
2,2,Crispy Garlic-Adorned Bok Choy,Appetizers & Sides,https://www.traderjoes.com/home/recipes/crispy...,https://www.traderjoes.com/content/dam/trjo/co...,Serves 4,Time 15 mins
3,3,Unexpected Potato Leek Galette,Appetizers & Sides,https://www.traderjoes.com/home/recipes/unexpe...,https://www.traderjoes.com/content/dam/trjo/co...,Serves 4-6,Time 2 h 25 mins - 2 h 1 mins
4,4,Inside-Out Apple & Cheddar Hand Pies,Breakfast & Desserts,https://www.traderjoes.com/home/recipes/inside...,https://www.traderjoes.com/content/dam/trjo/co...,Serves 4,Time 25 mins - 30 mins



--- RECIPE_INGREDIENT ---


Unnamed: 0,recipe_id,ingredient_id,quantity_id
0,0,0,0
1,0,1,1
2,0,2,2
3,0,3,3
4,0,4,4


In [9]:
# Taking a look at what ingredients match recipes
#
# coverage = share of a recipe's distinct ingredients that are in the pantry.
# Both counts use DISTINCT: recipe_ingredient is written without removing
# repeated (recipe, ingredient) pairs, so counting matched rows in the
# numerator against distinct ingredients in the denominator would overstate
# coverage and could push it above 1.0.

sql = """
SELECT
  r.recipe_id,
  r.title,
  1.0 * COUNT(DISTINCT p.ingredient_id) / COUNT(DISTINCT ri.ingredient_id) AS coverage
FROM recipe r
JOIN recipe_ingredient ri ON ri.recipe_id = r.recipe_id
LEFT JOIN pantry p ON p.ingredient_id = ri.ingredient_id
GROUP BY r.recipe_id, r.title
ORDER BY coverage DESC, r.title;
"""

# Ties on coverage are broken alphabetically by title (see ORDER BY).
df = pd.read_sql(sql, engine)
df_top_5 = df.head(5)
print(df_top_5)

   recipe_id                                title  coverage
0          9  Grilled & Glazed Cauliflower Steaks  1.000000
1          8    Grilled Zucchini with Salsa Macha  1.000000
2         21            Fig & Blue Cheese Burgers  0.875000
3         10               Fried Ranch-y Tomatoes  0.500000
4          0          Aromatic Garlic Ginger Rice  0.428571


In [15]:
# Defining missing ingredients for the top 5 matching recipes
#
# CTE walkthrough:
#   coverage         - pantry coverage per recipe; DISTINCT on both counts so a
#                      repeated ingredient row cannot lift the ratio above 1.0
#   top5             - the five best-covered recipes, ties broken by title
#   missing          - their ingredient lines that have no pantry match
#   distinct_amounts - one row per (ingredient, quantity text) among those
#   recipe_counts    - how many of the five recipes need each ingredient
#
# The amounts are de-duplicated in a CTE and joined with a real ' | '
# separator. Rewriting the default ',' separator afterwards with REPLACE would
# also rewrite any comma that is part of a quantity text.
#
# NOTE(review): despite the "per recipe" naming, the final SELECT groups by
# ingredient, so the result has one row per missing ingredient.

sql_per_recipe = """
WITH coverage AS (
  SELECT r.recipe_id, r.title,
         1.0 * COUNT(DISTINCT p.ingredient_id)
           / COUNT(DISTINCT ri.ingredient_id) AS coverage
  FROM recipe r
  JOIN recipe_ingredient ri ON ri.recipe_id = r.recipe_id
  LEFT JOIN pantry p ON p.ingredient_id = ri.ingredient_id
  GROUP BY r.recipe_id, r.title
),
top5 AS (
  SELECT recipe_id, title, coverage
  FROM coverage
  ORDER BY coverage DESC, title
  LIMIT 5
),
missing AS (
  SELECT t.recipe_id, t.title, i.ingredient_name, q.quantity_text
  FROM top5 t
  JOIN recipe_ingredient ri ON ri.recipe_id = t.recipe_id
  JOIN ingredient i ON i.ingredient_id = ri.ingredient_id
  JOIN quantity   q ON q.quantity_id   = ri.quantity_id
  LEFT JOIN pantry p ON p.ingredient_id = ri.ingredient_id
  WHERE p.ingredient_id IS NULL
),
distinct_amounts AS (
  SELECT DISTINCT ingredient_name, quantity_text
  FROM missing
),
recipe_counts AS (
  SELECT ingredient_name, COUNT(DISTINCT recipe_id) AS recipes_needing
  FROM missing
  GROUP BY ingredient_name
)
SELECT
  a.ingredient_name,
  GROUP_CONCAT(a.quantity_text, ' | ') AS suggested_amounts,
  c.recipes_needing
FROM distinct_amounts a
JOIN recipe_counts c ON c.ingredient_name = a.ingredient_name
GROUP BY a.ingredient_name, c.recipes_needing
ORDER BY c.recipes_needing DESC, a.ingredient_name;
"""

missing_per_recipe = pd.read_sql(sql_per_recipe, engine)

print("Missing per recipe (top 5):")
missing_per_recipe

Missing per recipe (top 5):


Unnamed: 0,ingredient_name,suggested_amounts,recipes_needing
0,TJ’s 100% Canola Oil,,1
1,TJ’s Black Peppercorns,,1
2,TJ’s Chicken Broth Concentrate packets,2,1
3,TJ’s Fresh Ginger,1 tablespoon finely chopped,1
4,"TJ’s Green Onion, thinly sliced on a bias",1,1
5,TJ’s Japanese Style Panko Breadcrumbs,1 - 2 cups,1
6,"TJ’s Jasmine Rice, rinsed",1 cup,1
7,"TJ’s Red Onion, thinly sliced",1,1


In [17]:
# Making a consolidated grocery list based on the top 5 recipes
#
# CTE walkthrough:
#   coverage    - pantry coverage per recipe; DISTINCT on both counts so a
#                 repeated ingredient row cannot lift the ratio above 1.0
#   top5        - the five best-covered recipes, ties broken by title
#   missing     - their ingredient lines that have no pantry match
#   dedup_qty   - one row per (ingredient, quantity text) among those
#   need_counts - how many of the five recipes need each ingredient
# The final SELECT returns one row per missing ingredient, with its distinct
# quantity texts joined by ' | '.

sql_consolidated = """
WITH coverage AS (
  SELECT r.recipe_id, r.title,
         1.0 * COUNT(DISTINCT p.ingredient_id)
           / COUNT(DISTINCT ri.ingredient_id) AS coverage
  FROM recipe r
  JOIN recipe_ingredient ri ON ri.recipe_id = r.recipe_id
  LEFT JOIN pantry p ON p.ingredient_id = ri.ingredient_id
  GROUP BY r.recipe_id, r.title
),
top5 AS (
  SELECT recipe_id, title, coverage
  FROM coverage
  ORDER BY coverage DESC, title
  LIMIT 5
),
missing AS (
  SELECT t.recipe_id, t.title, i.ingredient_name, q.quantity_text
  FROM top5 t
  JOIN recipe_ingredient ri ON ri.recipe_id = t.recipe_id
  JOIN ingredient i ON i.ingredient_id = ri.ingredient_id
  JOIN quantity   q ON q.quantity_id   = ri.quantity_id
  LEFT JOIN pantry p ON p.ingredient_id = ri.ingredient_id
  WHERE p.ingredient_id IS NULL
),
dedup_qty AS (
  SELECT ingredient_name, quantity_text
  FROM missing
  GROUP BY ingredient_name, quantity_text
),
need_counts AS (
  SELECT ingredient_name, COUNT(DISTINCT recipe_id) AS recipes_needing
  FROM missing
  GROUP BY ingredient_name
)
SELECT d.ingredient_name,
       GROUP_CONCAT(d.quantity_text, ' | ') AS suggested_amounts,
       n.recipes_needing
FROM dedup_qty d
JOIN need_counts n USING (ingredient_name)
GROUP BY d.ingredient_name
ORDER BY n.recipes_needing DESC, d.ingredient_name;
"""

grocery_list = pd.read_sql(sql_consolidated, engine)

print("\nConsolidated grocery list:")
grocery_list


Consolidated grocery list:


Unnamed: 0,ingredient_name,suggested_amounts,recipes_needing
0,TJ’s 100% Canola Oil,,1
1,TJ’s Black Peppercorns,,1
2,TJ’s Chicken Broth Concentrate packets,2,1
3,TJ’s Fresh Ginger,1 tablespoon finely chopped,1
4,"TJ’s Green Onion, thinly sliced on a bias",1,1
5,TJ’s Japanese Style Panko Breadcrumbs,1 - 2 cups,1
6,"TJ’s Jasmine Rice, rinsed",1 cup,1
7,"TJ’s Red Onion, thinly sliced",1,1
