In [1]:
# Import
import pandas as pd
import re
import ast
from sqlalchemy import create_engine, text
import sqlite3
from data.ingredient_normalization import normalize
import os

# Recipe listing read from the web_scraper output, plus a CSV of ingredient
# names (presumably already normalized, going by the file name — confirm).
recipes = pd.read_csv("../web_scraper/trader_joes_recipes.csv")
norm_ingredients = pd.read_csv("../data/normalized_ingredients.csv")

# Product listings: one general file and two category-specific files
# (fresh fruit/vegetables and meat).
products = pd.read_csv("../web_scraper/trader_joes_products.csv")
fruit_veg = pd.read_csv("../web_scraper/traderjoes_fresh-fruits-veggies_products.csv")
meat = pd.read_csv("../web_scraper/traderjoes_meat_products.csv")

In [2]:
# Build the combined Trader Joe's inventory.
# Each source frame is cleaned (rows with any missing value dropped) and tagged
# with its category, then the three frames are stacked into `all_prod`.

# The result of dropna() must be kept: a bare `products.dropna()` returns a new
# frame and leaves `products` untouched, so incomplete rows were never removed.
products = products.dropna().assign(category="speciality")

# The category-specific files call the product column `name` and carry a
# `page` column that is not needed downstream.
produce = (
    fruit_veg
    .rename(columns={"name": "product_name"})
    .drop(columns=["page"])
    .dropna()
    .assign(category="produce")
)
meats = (
    meat
    .rename(columns={"name": "product_name"})
    .drop(columns=["page"])
    .dropna()
    .assign(category="meat")
)

all_prod = pd.concat([products, produce, meats], ignore_index=True)

def parse_price_to_float(p):
    """Coerce a price value into a float.

    Numbers are returned as floats. Anything else is converted to text, has
    '$' and ',' removed and surrounding whitespace stripped, and is then parsed
    (e.g. '$3.99' -> 3.99). Missing values and unparseable text give None.
    """
    if pd.isna(p):
        return None
    if not isinstance(p, (int, float)):
        cleaned = str(p).replace("$", "").replace(",", "").strip()
        try:
            return float(cleaned)
        except ValueError:
            return None
    return float(p)

# Assemble the tj_inventory rows from the combined product listing.
# unit / price / url are optional: each stays None unless all_prod has the column.
inventory_columns = {
    "name": all_prod["product_name"],
    "unit": None,
    "price": None,
    "url": None,
    "category": all_prod["category"],
}
if "unit" in all_prod.columns:
    inventory_columns["unit"] = all_prod["unit"]
if "price" in all_prod.columns:
    inventory_columns["price"] = all_prod["price"].map(parse_price_to_float)
if "url" in all_prod.columns:
    inventory_columns["url"] = all_prod["url"]
tj = pd.DataFrame(inventory_columns)

# Normalized product name; non-string names are passed through unchanged.
tj["norm_name"] = tj["name"].apply(
    lambda product: normalize(product) if isinstance(product, str) else product
)

# Drop repeated (name, unit, price) rows, number the survivors from 1 and
# put the columns in their final order.
tj = (
    tj.drop_duplicates(subset=["name", "unit", "price"])
    .reset_index(drop=True)
    .assign(product_id=lambda frame: frame.index + 1)
    [["product_id", "name", "norm_name", "unit", "price", "url", "category"]]
)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/missclariss/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/missclariss/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/missclariss/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/missclariss/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/missclariss/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/missclariss/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Do

In [3]:
# Parse the `ingredients` column read from the CSV: string cells are evaluated
# as Python literals, any other value is left as it is.
def _parse_ingredient_cell(cell):
    return ast.literal_eval(cell) if isinstance(cell, str) else cell

recipes["ingredients"] = recipes["ingredients"].apply(_parse_ingredient_cell)

def split_ingredient(text):
    """
    Split an ingredient line into a leading quantity and the ingredient name.

    Example: '4 tablespoons TJ’s Salted Butter' becomes
    quantity = '4 tablespoons', ingredient = "TJ’s Salted Butter"

    Parameters
    ----------
    text : str or missing value
        One ingredient line. Non-string input (e.g. the NaN produced when a
        recipe with no ingredients is exploded) is treated as missing.

    Returns
    -------
    pd.Series
        Two elements, [quantity, name]. quantity is None when the line does not
        start with a numeric amount; both are None for non-string input.
    """
    # re.match raises TypeError on non-strings, so handle missing values first.
    if not isinstance(text, str):
        return pd.Series([None, None], dtype=object)

    # Group 1: digits / unicode fractions / spaces / dashes / slashes, optionally
    # followed by one alphabetic word (the unit). Group 2: the rest of the line.
    match = re.match(r"^([\d¼½¾⅓⅔⅛⅜⅝⅞\s\-–/]+[a-zA-Z]*)\s+(.*)", text)
    if match:
        qty = match.group(1).strip()
        name = match.group(2).strip()
    else:
        qty, name = None, text.strip()
    return pd.Series([qty, name])

# Explode recipes so each row = 1 ingredient per recipe
recipes_exploded = recipes.explode("ingredients", ignore_index=True)
recipes_exploded[["quantity_text", "ingredient_name"]] = recipes_exploded["ingredients"].apply(split_ingredient)

# Build cookbook_df as one chain on a fresh frame. Assigning a column to a
# column-slice and then dropping with inplace=True operates on a possible view
# of recipes_exploded and triggers SettingWithCopyWarning.
cookbook_df = (
    recipes_exploded[["title", "category", "ingredient_name", "quantity_text", 'url', 'image_url', 'serves', 'time']]
    .assign(
        # Normalized ingredient name; non-string values pass through unchanged.
        name=lambda d: d["ingredient_name"].apply(
            lambda x: normalize(x) if isinstance(x, str) else x
        )
    )
    .drop(columns=["ingredient_name"])
)

# Distinct normalized ingredient names seen across all recipes.
possible_ingredients = cookbook_df['name'].unique()
usable_ingredients = pd.DataFrame(possible_ingredients, columns=["Ingredient"])

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/missclariss/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/missclariss/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/missclariss/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/missclariss/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/missclariss/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/missclariss/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Do

In [4]:
# SQLAlchemy engine for the SQLite file cookbook.db (path is relative to the
# working directory); SQL statement echoing is turned off.
db_url = "sqlite:///cookbook.db"
engine = create_engine(db_url, echo=False)

# Full database schema as one multi-statement script.
# Every CREATE uses IF NOT EXISTS so the script can be executed again against an
# existing cookbook.db (e.g. when this cell is re-run) without raising
# "table ... already exists".
schema_sql = """
PRAGMA foreign_keys = ON;
--Cookbook
CREATE TABLE IF NOT EXISTS recipe(
  recipe_id INTEGER PRIMARY KEY,
  title TEXT NOT NULL,
  url TEXT,
  image_url TEXT,
  serves TEXT,
  time TEXT,
  category TEXT
);

CREATE TABLE IF NOT EXISTS usable_ingredients(
  ingredient_id INTEGER PRIMARY KEY,
  raw_name TEXT,
  norm_name TEXT NOT NULL
);

CREATE TABLE IF NOT EXISTS quantity(
  quantity_id INTEGER PRIMARY KEY,
  amount_text TEXT NOT NULL
);

CREATE TABLE IF NOT EXISTS cookbook(
  recipe_id INTEGER NOT NULL,
  ingredient_id INTEGER NOT NULL,
  quantity_id INTEGER,
  PRIMARY KEY (recipe_id, ingredient_id, quantity_id),
  FOREIGN KEY (recipe_id) REFERENCES recipe(recipe_id) ON DELETE CASCADE,
  FOREIGN KEY (ingredient_id) REFERENCES usable_ingredients(ingredient_id) ON DELETE CASCADE,
  FOREIGN KEY (quantity_id) REFERENCES quantity(quantity_id)
);

-- Pantry
CREATE TABLE IF NOT EXISTS shelf_life (
  ingredient_id INTEGER PRIMARY KEY,
  shelf_life_days INTEGER NOT NULL,
  FOREIGN KEY (ingredient_id) REFERENCES usable_ingredients(ingredient_id)
);

CREATE TABLE IF NOT EXISTS pantry (
  pantry_id INTEGER PRIMARY KEY,
  ingredient_id INTEGER NOT NULL,
  amount REAL,
  unit TEXT,
  date_purchased TEXT,    -- stored right here
  expiration_date TEXT,   -- stored right here
  FOREIGN KEY (ingredient_id) REFERENCES usable_ingredients(ingredient_id)
);

-- TJs Inventory
CREATE TABLE IF NOT EXISTS tj_inventory (
  product_id   INTEGER PRIMARY KEY,
  name         TEXT NOT NULL,
  norm_name    TEXT,           -- normalized name to match usable_ingredients
  unit         TEXT,           -- package size text from CSV (e.g., '/1 Each', '/16 Oz')
  price        REAL,           -- numeric price (e.g., 3.99)
  url          TEXT,           -- if present in CSV
  category     TEXT            -- 'speciality' | 'produce' | 'meat'
);

-- Many-to-many: which product can satisfy which normalized ingredient
CREATE TABLE IF NOT EXISTS sold_as (
  product_id    INTEGER NOT NULL,
  ingredient_id INTEGER NOT NULL,
  PRIMARY KEY (product_id, ingredient_id),
  FOREIGN KEY (product_id)    REFERENCES tj_inventory(product_id),
  FOREIGN KEY (ingredient_id) REFERENCES usable_ingredients(ingredient_id)
);

CREATE INDEX IF NOT EXISTS idx_tj_normname ON tj_inventory(norm_name);
CREATE INDEX IF NOT EXISTS idx_soldas_ing   ON sold_as(ingredient_id);

-- Inverted Name
CREATE TABLE IF NOT EXISTS ingredient_recipe_inverted_index (
    ingredient_id INTEGER,
    recipe_id     INTEGER
);

-- Recipe Recommended
CREATE TABLE IF NOT EXISTS recipe_recommended (
    id        INTEGER PRIMARY KEY AUTOINCREMENT,
    recipe_id INTEGER,
    date      TEXT,
    recipe    TEXT
);

-- Recipe Selected
CREATE TABLE IF NOT EXISTS recipe_selected (
    sel_id    INTEGER PRIMARY KEY AUTOINCREMENT,
    recipe_id INTEGER,
    sel_ts    TEXT DEFAULT (datetime('now'))
);
"""

# Apply the schema through a raw connection taken from the engine: the cursor's
# executescript() runs the whole multi-statement string in schema_sql in one call.
conn = engine.raw_connection()
try:
    cur = conn.cursor()
    cur.executescript(schema_sql)
    conn.commit()
finally:
    # Close the connection whether or not the script succeeded.
    conn.close()

In [5]:
# List the tables currently defined in cookbook.db
conn = sqlite3.connect("cookbook.db")

table_query = "SELECT name FROM sqlite_master WHERE type='table';"
tables = pd.read_sql(table_query, conn)

print("📋 Tables in your database:")
display(tables)

conn.close()

📋 Tables in your database:


Unnamed: 0,name
0,recipe
1,usable_ingredients
2,quantity
3,cookbook
4,shelf_life
5,pantry
6,tj_inventory
7,sold_as
8,ingredient_recipe_inverted_index
9,recipe_recommended


In [6]:
# Columns that together identify one recipe in the exploded cookbook_df.
recipe_cols = ["title", "category", "url", "image_url", "serves", "time"]

# Recipe table: one row per distinct recipe, numbered from 1.
recipe_df = (
    cookbook_df[recipe_cols]
    .drop_duplicates()
    .reset_index(drop=True)
    .assign(recipe_id=lambda d: d.index + 1)
)

# Usable Ingredients table: one row per distinct lower-cased, stripped name.
ingredient_df = (
    cookbook_df[["name"]]  # ingredient column
    .rename(columns={"name": "raw_name"})
    .assign(norm_name=lambda d: d["raw_name"].str.lower().str.strip())
    # usable_ingredients.norm_name is NOT NULL, so rows without a name cannot be loaded.
    .dropna(subset=["norm_name"])
    .drop_duplicates(subset=["norm_name"])
    .reset_index(drop=True)
    .assign(ingredient_id=lambda d: d.index + 1)
)

# Quantity table: one row per distinct quantity text.
quantity_df = (
    cookbook_df[["quantity_text"]]
    .rename(columns={"quantity_text": "amount_text"})
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
    .assign(quantity_id=lambda d: d.index + 1)
)

# Cookbook table: (recipe, ingredient, quantity) links.
# The ingredient join key is added as a real column BEFORE any merge. Passing a
# Series built from the un-merged cookbook_df as `left_on` is matched by position
# against the already-merged frame, which is only correct while the first merge
# happens to keep every row in its original order.
link_df = (
    cookbook_df
    .assign(norm_name=lambda d: d["name"].str.lower().str.strip())
    .merge(recipe_df, on=recipe_cols)
    .merge(ingredient_df[["norm_name", "ingredient_id"]], on="norm_name")
    # Left join: ingredients without a quantity keep a null quantity_id.
    .merge(quantity_df, left_on="quantity_text", right_on="amount_text", how="left")
    [["recipe_id", "ingredient_id", "quantity_id"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Upload everything to Database (parent tables first, link table last)
recipe_df.to_sql("recipe", engine, if_exists="append", index=False)
ingredient_df.to_sql("usable_ingredients", engine, if_exists="append", index=False)
quantity_df.to_sql("quantity", engine, if_exists="append", index=False)
link_df.to_sql("cookbook", engine, if_exists="append", index=False)

print("✅ Recipes and ingredients loaded into cookbook.db!")

✅ Recipes and ingredients loaded into cookbook.db!


In [7]:
# Append the Trader Joe's inventory rows to the tj_inventory table
# (the frame's index is not written).
tj.to_sql(name="tj_inventory", con=engine, index=False, if_exists="append")

1244

In [23]:
# if os.path.exists("cookbook.db"):
#     os.remove("cookbook.db")
#     print("🧹 old cookbook.db deleted")

🧹 old cookbook.db deleted


In [10]:
# Peek inside the database: first five rows of tj_inventory.
df = pd.read_sql("SELECT * FROM tj_inventory LIMIT 5;", engine)
# Rich display keeps all columns in one table; print() wraps and truncates them.
display(df)

   product_id                                         name  \
0           1                          Herbed Dinner Rolls   
1           2  Pumpkin & Spice Brioche Style Liège Waffles   
2           3         Glazed Pumpkin Pie Spice Donut Holes   
3           4                Pumpkin Spice Mini Sheet Cake   
4           5        Sliced Apple Cinnamon Sourdough Bread   

                                     norm_name       unit  price  \
0                          herbed dinner rolls     /12 Oz   3.49   
1  pumpkin & spice brioche style liège waffles  /11.64 Oz   4.49   
2                pumpkin pie spice donut holes      /6 Oz   3.49   
3                pumpkin spice mini sheet cake     /18 Oz   5.99   
4        sliced apple cinnamon sourdough bread  /17.63 Oz   4.99   

                                                 url    category  
0  https://www.traderjoes.com/home/products/pdp/h...  speciality  
1  https://www.traderjoes.com/home/products/pdp/p...  speciality  
2  https://www.tr