In [1]:
# this folder is used to explore quantities
import re
import pandas as pd
import numpy as np


cookbook = pd.read_csv('../quantity_exploration/parsed_raw_ingredients_all_mini_with_recipe_context.csv')
tj_inventory = pd.read_csv("../data/trader_joes_products_v3.csv")

In [2]:
# grabbing amount and units from cookbook to make
ingre_units= cookbook[['amount', 'unit']].copy()
# dropping NaNs in units for now
ingre_units.dropna()
# ingre_units['unit'].unique()

Unnamed: 0,amount,unit
0,4.00,tablespoon
1,2.00,tablespoon
2,1.00,tablespoon
3,1.00,cup
4,2.00,cup
...,...,...
3696,1.00,package
3697,1.00,package
3698,1.00,bottle
3699,0.25,teaspoon


In [3]:
start_here = [
    "cup",
    "cups",
    "tablespoon",
    "tablespoons",
    "teaspoon",
    "teaspoons",
    "ounce",
    "ounces"]
cup_table_tea_oz = ingre_units[ingre_units['unit'].isin(start_here)].copy()
cup_table_tea_oz

Unnamed: 0,amount,unit
0,4.00,tablespoon
1,2.00,tablespoon
2,1.00,tablespoon
3,1.00,cup
4,2.00,cup
...,...,...
3688,0.50,teaspoon
3693,2.00,tablespoon
3694,0.50,teaspoon
3699,0.25,teaspoon


In [4]:
tj_inventory.head()

Unnamed: 0,category,sub_category,product_name,price,unit,url
0,Fresh Fruits & Veggies,Fruits,Sugar Sweet Melon,$3.49,/1 Each,https://www.traderjoes.com/home/products/pdp/s...
1,Fresh Fruits & Veggies,Fruits,Saturn Peaches,$3.99,/19 Oz,https://www.traderjoes.com/home/products/pdp/s...
2,Fresh Fruits & Veggies,Fruits,Fruitful Medley,$3.99,/16 Oz,https://www.traderjoes.com/home/products/pdp/f...
3,Fresh Fruits & Veggies,Fruits,Seedless Lemons,$2.99,/2 Lb,https://www.traderjoes.com/home/products/pdp/s...
4,Fresh Fruits & Veggies,Fruits,Hass Avocado,$1.29,/1 Each,https://www.traderjoes.com/home/products/pdp/h...


In [5]:
tj_inventory["unit"] = tj_inventory["unit"].str.strip().str.replace("/", "", regex=False)
unit_split = tj_inventory["unit"].str.extract(r"(?P<store_amount>[\d\.]+)\s*(?P<store_unit>[A-Za-z\s]+)")
tj_inventory = pd.concat([tj_inventory, unit_split], axis=1)
tj_inventory.head()

Unnamed: 0,category,sub_category,product_name,price,unit,url,store_amount,store_unit
0,Fresh Fruits & Veggies,Fruits,Sugar Sweet Melon,$3.49,1 Each,https://www.traderjoes.com/home/products/pdp/s...,1,Each
1,Fresh Fruits & Veggies,Fruits,Saturn Peaches,$3.99,19 Oz,https://www.traderjoes.com/home/products/pdp/s...,19,Oz
2,Fresh Fruits & Veggies,Fruits,Fruitful Medley,$3.99,16 Oz,https://www.traderjoes.com/home/products/pdp/f...,16,Oz
3,Fresh Fruits & Veggies,Fruits,Seedless Lemons,$2.99,2 Lb,https://www.traderjoes.com/home/products/pdp/s...,2,Lb
4,Fresh Fruits & Veggies,Fruits,Hass Avocado,$1.29,1 Each,https://www.traderjoes.com/home/products/pdp/h...,1,Each


In [6]:
tj_inventory.columns

Index(['category', 'sub_category', 'product_name', 'price', 'unit', 'url',
       'store_amount', 'store_unit'],
      dtype='object')

In [7]:
tj_units = tj_inventory[['store_amount', 'store_unit']]
# tj_units
tj_units['store_unit'].unique()

array(['Each', 'Oz', 'Lb', 'Fl Oz', 'Doz', 'Pint', 'Qt'], dtype=object)

In [8]:
# focusing just on Oz for now
tj_oz = tj_inventory[tj_inventory["store_unit"].str.lower() == "oz"].copy()
tj_oz.head()

Unnamed: 0,category,sub_category,product_name,price,unit,url,store_amount,store_unit
1,Fresh Fruits & Veggies,Fruits,Saturn Peaches,$3.99,19 Oz,https://www.traderjoes.com/home/products/pdp/s...,19,Oz
2,Fresh Fruits & Veggies,Fruits,Fruitful Medley,$3.99,16 Oz,https://www.traderjoes.com/home/products/pdp/f...,16,Oz
9,Fresh Fruits & Veggies,Fruits,Cherry Tomatoes on the Vine,$4.99,12 Oz,https://www.traderjoes.com/home/products/pdp/c...,12,Oz
15,Fresh Fruits & Veggies,Fruits,Organic Cranberries,$3.29,12 Oz,https://www.traderjoes.com/home/products/pdp/o...,12,Oz
21,Fresh Fruits & Veggies,Fruits,Organic Strawberries 1 Lb,$6.99,16 Oz,https://www.traderjoes.com/home/products/pdp/o...,16,Oz


In [9]:
# first since most of ingre_units and tj_units are cups, tablespoons, and oz we will focus on those to see what this would look like

unit_to_oz = {
    "cup": 8,
    "cups": 8,
    "tablespoon": 0.5,
    "tablespoons": 0.5,
    "teaspoon": 0.1667,
    "teaspoons": 0.1667,
    "ounce": 1,
    "ounces": 1
}

def convert_to_oz(amount, unit):
    """
    Converts a recipe's quantity + unit to ounces.
    """
    if pd.isna(unit) or unit.lower() not in unit_to_oz:
        return None
    try:
        return float(amount) * unit_to_oz[unit]
    except (TypeError, ValueError):
        return None

In [10]:
cup_table_tea_oz["amount_oz"] = cup_table_tea_oz.apply(
    lambda row: convert_to_oz(row["amount"], row["unit"]), axis=1
)

In [11]:
cup_table_tea_oz

Unnamed: 0,amount,unit,amount_oz
0,4.00,tablespoon,2.000000
1,2.00,tablespoon,1.000000
2,1.00,tablespoon,0.500000
3,1.00,cup,8.000000
4,2.00,cup,16.000000
...,...,...,...
3688,0.50,teaspoon,0.083350
3693,2.00,tablespoon,1.000000
3694,0.50,teaspoon,0.083350
3699,0.25,teaspoon,0.041675


In [12]:
all_units = ingre_units['unit'].dropna().str.lower().str.strip().unique()

In [13]:
# now lets explore the other things

all_units = ingre_units['unit'].dropna().str.lower().str.strip().unique()
extra_units = [u for u in all_units if u not in unit_to_oz]

extra_counts = (
    ingre_units[ingre_units['unit'].isin(extra_units)]
    ['unit']
    .value_counts()
    .reset_index()
    .rename(columns={'index': 'unit', 'unit': 'count'})
)
extra_counts

Unnamed: 0,count,count.1
0,package,207
1,bag,144
2,packages,57
3,slices,52
4,can,47
...,...,...
87,oz bottles,1
88,large clove,1
89,picoinch * teaspoon,1
90,batches,1


In [23]:
# this allows us to explore the extra_counts and see if they can match up

def explore_ingredients(unit_name):
    unit_df = cookbook[cookbook['unit'] == unit_name].copy()
    unit_reduced = unit_df[['name', 'amount', 'unit']]
    tj_reduced = tj_inventory[['product_name', 'store_amount', 'store_unit']]
    matches = unit_reduced.merge(tj_reduced, left_on='name', right_on='product_name', how='inner')
    matches['amount'] = pd.to_numeric(matches['amount'], errors='coerce')
    matches['store_amount'] = pd.to_numeric(matches['store_amount'], errors='coerce')
    matches['amount_to_store_amount'] = matches['amount'] * matches['store_amount']
    matches_reduced = matches[['name', 'amount', 'unit','store_amount','amount_to_store_amount', 'store_unit']]
    return matches_reduced

explore_ingredients("pint")

Unnamed: 0,name,amount,unit,store_amount,amount_to_store_amount,store_unit


In [24]:
# does not work for tablespoon, teaspoon, slices, loaf, cloves, pound, stick, sprig, slice, scoops, bar, pint, heads and the rest under 3 count

In [16]:
# first attempt at merging convert_to_oz and explore_ingredients into one function
def units_to_store_units(df, tj_inventory):
    final_df = df.copy()

    first_units = [
        "cup", "cups",
        "tablespoon", "tablespoons",
        "teaspoon", "teaspoons",
        "ounce", "ounces"
    ]

    does_not_work = [
        "slices", "slice",
        "clove", "cloves",
        "handful", "sprig", "sprigs",
        'loaf', 'pound', 'scoops', 'bar', 'pint',
        'heads'
    ]

    unit_to_oz = {
        "cup": 8, "cups": 8,
        "tablespoon": 0.5, "tablespoons": 0.5,
        "teaspoon": 0.1667, "teaspoons": 0.1667,
        "ounce": 1, "ounces": 1
    }

    def convert_to_oz(amount, unit):
        """Converts a recipe's quantity + unit to ounces."""
        return float(amount) * unit_to_oz[unit]


    def explore_ingredients(unit_name):
        """Merges cookbook ingredients with TJ inventory by name."""
        unit_df = df[df["unit"] == unit_name].copy()
        unit_reduced = unit_df[["name", "amount", "unit"]]
        tj_reduced = tj_inventory[["product_name", "store_amount", "store_unit"]]
        matches = unit_reduced.merge(
            tj_reduced,
            left_on="name",
            right_on="product_name",
            how="inner"
        )
        matches["amount"] = pd.to_numeric(matches["amount"], errors="coerce")
        matches["store_amount"] = pd.to_numeric(matches["store_amount"], errors="coerce")
        matches["amount_to_store"] = matches["amount"] * matches["store_amount"]
        return matches[["name", "amount_to_store", "store_unit"]]

    # handle first_units (direct conversion to Oz)
    mask_first = final_df["unit"].isin(first_units)
    final_df.loc[mask_first, "store_amount"] = final_df.loc[mask_first].apply(
        lambda r: convert_to_oz(r["amount"], r["unit"]), axis=1
    )
    final_df.loc[mask_first, "store_unit"] = "Oz"

    # skip does_not_work
    mask_skip = final_df["unit"].isin(does_not_work)
    final_df.loc[mask_skip, ["store_amount", "store_unit"]] = [None, None]

    # everything else
    mask_other = ~(mask_first | mask_skip)
    unique_units = final_df.loc[mask_other, "unit"].unique()

    for unit_name in unique_units:
        matches = explore_ingredients(unit_name)
        if not matches.empty:
            for _, row in matches.iterrows():
                final_df.loc[
                    (final_df["name"] == row["name"]) & (final_df["unit"] == unit_name),
                    ["store_amount", "store_unit"]
                ] = [row["amount_to_store"], row["store_unit"]]

    return final_df

In [25]:
# final attempt. this takes the df, converts first units with convert_to_oz, then where it can convert using explore_ingredients, if it can't do either it brings over 'amount' and 'unit'
def units_to_store_units2(df, tj_inventory):
    final_df = df.copy()

    first_units = [
        "cup", "cups",
        "tablespoon", "tablespoons",
        "teaspoon", "teaspoons",
        "ounce", "ounces"
    ]

    does_not_work = [
        "slices", "slice",
        "clove", "cloves",
        "handful", "sprig", "sprigs",
        "loaf", "pound", "scoops", "bar", "pint",
        "heads"
    ]

    unit_to_oz = {
        "cup": 8, "cups": 8,
        "tablespoon": 0.5, "tablespoons": 0.5,
        "teaspoon": 0.1667, "teaspoons": 0.1667,
        "ounce": 1, "ounces": 1
    }

    def convert_to_oz(amount, unit):
        """Converts a recipe's quantity + unit to ounces."""
        try:
            return float(amount) * unit_to_oz[unit]
        except (TypeError, ValueError, KeyError):
            return None

    def explore_ingredients(unit_name):
        """Merges cookbook ingredients with TJ inventory by name."""
        unit_df = df[df["unit"] == unit_name].copy()
        unit_reduced = unit_df[["name", "amount", "unit"]]
        tj_reduced = tj_inventory[["product_name", "store_amount", "store_unit"]]
        matches = unit_reduced.merge(
            tj_reduced,
            left_on="name",
            right_on="product_name",
            how="inner"
        )
        matches["amount"] = pd.to_numeric(matches["amount"], errors="coerce")
        matches["store_amount"] = pd.to_numeric(matches["store_amount"], errors="coerce")
        matches["amount_to_store"] = matches["amount"] * matches["store_amount"]
        matches.rename(
            columns={
                "amount_to_store": "pantry_amount",
                "store_unit": "pantry_unit"
            },
            inplace=True
        )
        return matches[["name", "pantry_amount", "pantry_unit"]]

    # ---- 1) handle first_units (direct conversion to Oz) ----
    mask_first = final_df["unit"].isin(first_units)
    final_df.loc[mask_first, "pantry_amount"] = final_df.loc[mask_first].apply(
        lambda r: convert_to_oz(r["amount"], r["unit"]), axis=1
    )
    final_df.loc[mask_first, "pantry_unit"] = "Oz"

    # ---- 2) skip does_not_work ----
    mask_skip = final_df["unit"].isin(does_not_work)
    final_df.loc[mask_skip, ["pantry_amount", "pantry_unit"]] = [None, None]

    # ---- 3) everything else ----
    mask_other = ~(mask_first | mask_skip)
    unique_units = final_df.loc[mask_other, "unit"].unique()

    for unit_name in unique_units:
        matches = explore_ingredients(unit_name)
        if not matches.empty:
            for _, row in matches.iterrows():
                final_df.loc[
                    (final_df["name"] == row["name"]) & (final_df["unit"] == unit_name),
                    ["pantry_amount", "pantry_unit"]
                ] = [row["pantry_amount"], row["pantry_unit"]]

    # ---- 4) mark converted_to_store ----
    final_df["converted_to_store"] = (
        final_df["pantry_amount"].notna() & final_df["pantry_unit"].notna()
    )

    # ---- 5) fallback: if not converted, copy original unit + amount ----
    not_converted = ~final_df["converted_to_store"]
    final_df.loc[not_converted, "pantry_amount"] = final_df.loc[not_converted, "amount"]
    final_df.loc[not_converted, "pantry_unit"] = final_df.loc[not_converted, "unit"]

    return final_df

In [30]:
# dropping na since we can't convert if we don't have units and amounts
cookbook2 = cookbook.dropna().copy()

In [31]:
cookbook_revised2 = units_to_store_units2(cookbook2, tj_inventory)

In [32]:
cookbook_revised2

Unnamed: 0,original_text,name,name_confidence,amount,amount_text,unit,amount_confidence,preparation,preparation_confidence,recipe_title,...,main_category_1,likely_sub_category_2,likely_sub_category_2_score,main_category_2,likely_sub_category_3,likely_sub_category_3_score,main_category_3,pantry_amount,pantry_unit,converted_to_store
1,2 tablespoons finely chopped TJ’s Fresh Garlic,Fresh Garlic,0.942706,2.0,2 tablespoons,tablespoon,0.999844,finely chopped,0.961860,Aromatic Garlic Ginger Rice,...,Unknown,Fresh Vegetables,0.047255,Unknown,Veggies,0.007904,Fresh Fruits & Veggies,1.0000,Oz,True
2,1 tablespoon finely chopped TJ’s Fresh Ginger,Fresh Ginger,0.992840,1.0,1 tablespoon,tablespoon,0.999925,finely chopped,0.997657,Aromatic Garlic Ginger Rice,...,Unknown,Fresh Vegetables,0.053698,Unknown,Spices,0.042131,For the Pantry,0.5000,Oz,True
3,"1 cup TJ’s Jasmine Rice, rinsed",Jasmine Rice,0.997608,1.0,1 cup,cup,0.999978,rinsed,0.998351,Aromatic Garlic Ginger Rice,...,Unknown,Fresh,0.077892,Unknown,Fresh Fruits,0.060053,Unknown,8.0000,Oz,True
9,"3 tablespoons TJ’s Avocado Oil, divided",Avocado Oil,0.997538,3.0,3 tablespoons,tablespoon,0.999973,divided,0.999657,Roasted Pork Tenderloin & Potatoes with Honey ...,...,Unknown,Cream and Creamy Cheeses,0.081852,Cheese,Fresh,0.061031,Unknown,1.5000,Oz,True
10,"1 bag TJ’s Teeny Tiny Potatoes, sliced in half...",Teeny Tiny Potatoes,0.994637,1.0,1 bag,bag,0.999593,"sliced in half, lengthwise",0.928255,Roasted Pork Tenderloin & Potatoes with Honey ...,...,From The Freezer,Fresh,0.058672,Unknown,Fresh Vegetables,0.049792,Unknown,1.0000,bag,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3682,1 1/2 cups TJ's Organic Foursome Vegetable Med...,Organic Foursome Vegetable Medley,0.998896,1.5,1 1/2 cups,cup,0.999944,thawed,0.964607,Turkey Pot Pie with Buttermilk Biscuit Crust,...,Unknown,Fresh Vegetables,0.069194,Unknown,Shelf Stable Foods,0.046651,Unknown,12.0000,Oz,True
3685,2 cups chopped leftover cooked TJ's Turkey,leftover cooked Turkey,0.890696,2.0,2 cups,cup,0.999918,chopped,0.985658,Turkey Pot Pie with Buttermilk Biscuit Crust,...,Unknown,Fresh,0.189910,Unknown,Smoked or Processed,0.044721,Unknown,16.0000,Oz,True
3686,2 teaspoons chopped TJ's Fresh Thyme,Fresh Thyme,0.951175,2.0,2 teaspoons,teaspoon,0.999752,chopped,0.958873,Turkey Pot Pie with Buttermilk Biscuit Crust,...,Unknown,Fresh Vegetables,0.087996,Unknown,Veggies,0.060405,Fresh Fruits & Veggies,0.3334,Oz,True
3687,2 teaspoons chopped TJ's Fresh Rosemary,Fresh Rosemary,0.959007,2.0,2 teaspoons,teaspoon,0.999748,chopped,0.974456,Turkey Pot Pie with Buttermilk Biscuit Crust,...,Unknown,Fresh Vegetables,0.074127,Unknown,Spices,0.023347,For the Pantry,0.3334,Oz,True


In [34]:
# cookbook2.to_csv("../quantity_exploration/parsed_ingredients_with_pantry_units.csv", index=False)

In [33]:
# the following was used to explore the first iteration of units_to_store_units
unit_nan_pct = cookbook_revised2['unit'].isna().mean() * 100
store_unit_nan_pct = cookbook_revised2['pantry_unit'].isna().mean() * 100

print(f"NaN % in unit: {unit_nan_pct:.1f}%")
print(f"NaN % in store_unit: {store_unit_nan_pct:.1f}%")
print(f"Increase: {store_unit_nan_pct - unit_nan_pct:.1f}%")

NaN % in unit: 0.0%
NaN % in store_unit: 0.0%
Increase: 0.0%


In [22]:
# Find rows that used to have a unit but now lost store_unit
missing_after = cookbook_revised2[cookbook_revised2['unit'].notna() & cookbook_revised2['pantry_unit'].isna()]
print(f"{len(missing_after)} rows lost store_unit after conversion.")
missing_after

0 rows lost store_unit after conversion.


Unnamed: 0,original_text,name,name_confidence,amount,amount_text,unit,amount_confidence,preparation,preparation_confidence,recipe_title,...,main_category_1,likely_sub_category_2,likely_sub_category_2_score,main_category_2,likely_sub_category_3,likely_sub_category_3_score,main_category_3,pantry_amount,pantry_unit,converted_to_store


In [None]:
cookbook_revised2.to