In [2]:
import json
import re
index_data = []
# Load index.json
with open('../data/raw/index.json') as f:
    index_data = json.load(f)['results']

# Load flavor_bible_full.json
with open('../data/raw/flavor_bible_full.json') as f:
    flavor_bible = json.load(f)

# Normalize function
def normalize(name):
    return re.sub(r'[^a-zA-Z\s]', '', name.lower()).strip()

# Normalize flavor data into dict for lookup
flavor_map = {normalize(item['main']): item for item in flavor_bible}

print(flavor_map['achiote seeds'])

{'main': 'ACHIOTE SEEDS', 'pairing': 'achiote + pork + sour orange'}


In [7]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
from collections import defaultdict

model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(index_data)

clustering = AgglomerativeClustering(n_clusters=None, distance_threshold=1.2)
labels = clustering.fit_predict(embeddings)

# Group by cluster
clusters = defaultdict(list)
for label, name in zip(labels, index_data):
    clusters[label].append(name)

In [8]:
print(clusters[1])

['bread', 'bread, bagels', 'bread, breadsticks, croutons, etc.', 'bread, crusty', 'bread, french', 'bread, olive', 'bread, pita', 'bread, pumpernickel', 'bread, white', 'breads', 'breads and breadsticks', 'nut bread', 'walnut bread']


In [9]:
from collections import Counter

def get_cluster_root(names):
    token_counts = Counter()
    for name in names:
        words = re.sub(r'[^a-zA-Z\s]', '', name.lower()).split()
        token_counts.update(words)
    return token_counts.most_common(1)[0][0]  # e.g. "apple"

In [30]:
def normalize(name):
    return re.sub(r'[^a-zA-Z\s]', '', name.lower()).strip()

def flatten(d):
    for v in d.values():
        if isinstance(v, dict):
            yield from flatten(v)
        elif isinstance(v, list):
            yield from v

def get_canonical_name(names):
    # "Heuristic: most frequent core word (e.g., 'chocolate' from variants)."
    token_counts = Counter()
    for name in names:
        words = normalize(name).split()
        token_counts.update(words)
    return token_counts.most_common(1)[0][0]

def parse_pairings(raw_string):
    if not raw_string or not isinstance(raw_string, str):
        return []
    parts = [p.strip().lower().replace(" ", "_") for p in raw_string.split("+")]
    return [p for p in parts if p]

In [31]:
# def build_ingredient_model(cluster_names, flavor_map):
#     all_names = list(set(normalize(n) for n in cluster_names))
#     core = get_cluster_root(cluster_names)
#     flavor = flavor_map.get(core)
#     return {
#         "id": core.replace(" ", "_"),
#         "name": core.title(),
#         "common_names": all_names,
#         "flavor_profile": flavor.get("flavor_profile") if flavor else [],
#         "pairings": flavor.get("pairings") if flavor else [],
#         "category": flavor.get("category") if flavor else None
#     }
def build_ingredient_model(cluster_data, flavor_map):
    normalized_names = [normalize(n) for n in cluster_data]
    canonical_name = get_canonical_name(normalized_names)

    # Try to get flavor entry using exact match or fallback
    flavor_entry = flavor_map.get(canonical_name) or next(
        (flavor_map.get(name) for name in normalized_names if flavor_map.get(name)), None
    )

    print(flavor_entry)

    return {
        "id": canonical_name.replace(" ", "_"),
        "name": canonical_name.title(),
        "common_names": sorted(set(normalized_names)),
        "flavor_profile": flavor_entry.get("flavor_profile") if flavor_entry else [],
        "pairings": parse_pairings(flavor_entry.get("pairing")) if flavor_entry else [],
        "category": flavor_entry.get("category") if flavor_entry else None,
        "status": "draft"
    }

In [32]:
ingredient_models = []

for cluster in clusters.values():
    model = build_ingredient_model(cluster, flavor_map)
    ingredient_models.append(model)

# Save the structured data
with open("../data/ingredients/ingredients_master_1.json", "w") as f:
    json.dump(ingredient_models, f, indent=2)

{'main': 'ACHIOTE SEEDS', 'pairing': 'achiote + pork + sour orange'}
None
None
{'main': 'BEER', 'pairing': 'stews'}
None
{'main': 'MUSTARD', 'pairing': 'combination of the three is delicious.'}
{'main': 'ALLSPICE', 'pairing': 'allspice + garlic + pork'}
{'main': 'ALMONDS', 'pairing': 'almonds + honey + orange zest + raisins'}
None
None
{'main': 'AMARETTO (sweet almond liqueur)', 'pairing': 'sugar'}
{'main': 'CHILI PASTE', 'pairing': 'sauces'}
{'main': 'ANCHOVIES', 'pairing': 'anchovies + lemon + olive oil + rosemary'}
None
{'main': 'ANGELICA', 'pairing': 'angelica + cream + rhubarb'}
{'main': 'ANISE HYSSOP', 'pairing': 'zucchini'}
None
None
None
{'main': 'APPLES', 'pairing': 'apples + red cabbage + cinnamon'}
None
None
{'main': 'APRICOTS, DRIED', 'pairing': 'dried apricots + dried cherries + ginger + orange + pistachios'}
{'main': 'TURBOT', 'pairing': 'turbot + lemon + miso + mushrooms'}
None
{'main': 'LAVENDER', 'pairing': 'lavender + meat + salt'}
None
{'main': 'ARTICHOKES', 'pairing