In [1]:
# This notebook explores how ingredient quantities and units are written.
import re
import pandas as pd
import numpy as np


# Recipe ingredient rows; the 'name' and 'quantity_text' columns are used below.
cookbook = pd.read_csv("../data/cookbook_df.csv")
# Store product catalogue; the 'product_name' and 'unit' columns are used below.
tj_inventory = pd.read_csv("../data/trader_joes_products_v2.csv")

In [2]:
def split_quantity(qty):
    """Separate a quantity string into its leading amount and the text after it.

    The input is lower-cased and trimmed, then matched from its first
    character: a run of digits, vulgar-fraction glyphs, whitespace, ``/`` and
    ``.`` is taken as the amount; the remainder, beginning at the first ASCII
    letter, is taken as the unit.

    Parameters
    ----------
    qty : object
        Raw quantity text, e.g. ``"1½ Tablespoons"``.

    Returns
    -------
    pd.Series
        Two positional values ``[amount, unit]``, both strings when found.
        Both are NaN when ``qty`` is missing, not a string, blank, or does not
        start with an amount. ``unit`` alone is NaN when the amount is not
        followed by text that starts with a letter.
    """
    unusable = pd.isna(qty) or not isinstance(qty, str) or qty.strip() == ""
    if unusable:
        return pd.Series([np.nan, np.nan])

    normalized = qty.lower().strip()

    # Group 1: numeric part (integer, decimal, or fraction); group 2: the rest.
    found = re.match(r"([\d¼½¾⅓⅔⅛⅜⅝⅞\s\/\.]+)\s*([a-zA-Z]+.*)?", normalized)
    if found is None:
        return pd.Series([np.nan, np.nan])

    number_text, trailing_text = found.groups()
    # The optional second group is None when no letter follows the amount.
    unit = trailing_text.strip() if trailing_text else np.nan
    return pd.Series([number_text.strip(), unit])

In [3]:
# Work on a copy of just the ingredient name and its raw quantity text.
ingre = cookbook.loc[:, ['name', 'quantity_text']].copy()
# split_quantity returns two values per row; they become the new columns.
ingre[["amount", "unit"]] = ingre.loc[:, "quantity_text"].apply(split_quantity)

In [10]:
# Keep only rows with no missing value in any column, then renumber them.
# Rows whose amount or unit is NaN are discarded here as well.
ingre = ingre.dropna(axis=0, how="any").reset_index(drop=True)

# Unicode vulgar-fraction glyphs and their ASCII "numerator/denominator" form.
fractions_map = {
    "½": "1/2",
    "¼": "1/4",
    "¾": "3/4",
    "⅓": "1/3",
    "⅔": "2/3",
    "⅛": "1/8",
    "⅜": "3/8",
    "⅝": "5/8",
    "⅞": "7/8"
}

def normalize_fraction(text):
    """Replace Unicode fraction glyphs in ``text`` with ASCII fractions.

    A glyph that directly follows a digit is part of a mixed number, so a
    space is inserted before its replacement: ``"1½"`` becomes ``"1 1/2"``.
    Without the separator the result would read ``"11/2"`` (eleven halves).

    Parameters
    ----------
    text : object
        Amount text such as ``"½"`` or ``"1½"``.

    Returns
    -------
    object
        The rewritten string, or ``text`` unchanged when it is not a string.
    """
    if not isinstance(text, str):
        return text

    for sym, rep in fractions_map.items():
        # Mixed number written without a space: keep the whole part separate.
        text = re.sub(r"(?<=\d)" + sym, " " + rep, text)
        # Any remaining occurrence stands alone or already follows a space.
        text = text.replace(sym, rep)
    return text

# Rewrite every amount so fraction glyphs appear as ASCII "n/d" text.
ingre["amount"] = ingre.loc[:, "amount"].apply(normalize_fraction)


In [12]:
# List the distinct unit strings. The recorded output also contains words that
# are not units (e.g. 'large', 'heaping', 'to', 'of').
ingre.loc[:, 'unit'].unique()

array(['tablespoons', 'tablespoon', 'cup', 'cups', 'bag', 'stick',
       'package', 'teaspoons', 'large', 'block', 'pint', 'loaf', 'cloves',
       'head', 'firm', 'pound', 'box', 'can', 'wedge', 'slices', 'to',
       'tub', 'jar', 'teaspoon', 'heaping', 'bulb', 'bottle', 'cans',
       'of', 'jumbo', 'container', 'carton', 'cheese', 'wheel', 'whole',
       'short', 'cartons', 'tubs', 'tsp', 'quart', 'ounces', 'sleeve',
       'generous', 'scoops', 'ears', 'pounds', 'tube', 'ounce', 'sprigs',
       'small', 'packages', 'log', 'roughly', 'bar', 'microwavable',
       'stalks', 'jars', 'slice', 'oz', 'fully', 'sliced', 'part',
       'parts', 'medium', 'sprig', 'egg', 'a', 'pieces', 'sticks', 'demi',
       'wooden', 'fresh', 'decorative', 'sheets', 'containers',
       'handfuls', 'ripe', 'bags', 'handful', 'tbsp', 'batch', 'or',
       'strips', 'half', 'cage', 'food', 'roll', 'grinds', 'pkg',
       'turkey', 'pkgs', 'tray', 'butternut', 'pouch', 'cube', 'rolls',
       'lb', 'lin

In [18]:
# Work on a copy of just the product name and its raw unit text.
store = tj_inventory.loc[:, ['product_name', 'unit']].copy()

In [19]:
def parse_store_unit(u):
    """Split a store unit string such as ``"/12 Oz"`` into size and unit.

    Surrounding whitespace and one optional leading ``/`` are ignored. The
    size must be a well-formed decimal (``12``, ``1.5``, ``.5``, ``12.``).
    Malformed numerals such as ``"."`` or ``"1.2.3"`` are treated as
    unparseable, so ``float()`` cannot raise and abort a whole ``.apply``.

    Parameters
    ----------
    u : object
        Raw unit text from the product table.

    Returns
    -------
    pd.Series
        Two positional values ``[qty, unit]``: ``qty`` as a float and ``unit``
        lower-cased and trimmed. ``[None, None]`` when ``u`` is not a string or
        does not start with a size followed by letters.
    """
    if not isinstance(u, str):
        return pd.Series([None, None])

    # Extract number and unit (e.g., "12", "oz" from "/12 Oz").
    # The numeric group admits at most one decimal point and at least one
    # digit, so every captured value is valid input for float().
    match = re.match(r"\/?(\d+\.?\d*|\.\d+)\s*([a-zA-Z\s]+)", u.strip())
    if match is None:
        return pd.Series([None, None])

    qty = float(match.group(1))
    unit = match.group(2).strip().lower()
    return pd.Series([qty, unit])

# parse_store_unit returns two values per row; they become the new columns.
store[["store_qty", "store_unit"]] = store.loc[:, "unit"].apply(parse_store_unit)

In [22]:
# List the distinct parsed unit labels found in the store catalogue.
store.loc[:, 'store_unit'].unique()

array(['oz', 'lb', 'fl oz', 'doz', 'pint', 'qt', 'each'], dtype=object)