In [77]:
import json
import re
index_data = []
# Load index.json
with open('../data/raw/index.json') as f:
    index_data = json.load(f)['results']

# Load flavor_bible_full.json
with open('../data/raw/data-formatted-json.json') as f:
    flavor_bible = json.load(f)

# Normalize function
def normalize(name):
    return re.sub(r'[^a-zA-Z\s]', '', name.lower()).strip()

# Normalize flavor data into dict for lookup
flavor_map = {normalize(item['ingredient']): item for item in flavor_bible['data']}
flavor_map_sample = flavor_map['achiote seeds']
print(flavor_map_sample)

{'ingredient': 'achiote seeds', 'pairings': ['beef', 'chicken', 'chiles', 'citrus', 'fish', 'game birds', 'garlic', 'pork', 'shellfish', 'shrimp']}


In [78]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
from collections import defaultdict

model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(index_data)

clustering = AgglomerativeClustering(n_clusters=None, distance_threshold=1.2)
labels = clustering.fit_predict(embeddings)

# Group by cluster
clusters = defaultdict(list)
for label, name in zip(labels, index_data):
    clusters[label].append(name)

In [79]:
sample = clusters[1]
print(sample)

['bread', 'bread, bagels', 'bread, breadsticks, croutons, etc.', 'bread, crusty', 'bread, french', 'bread, olive', 'bread, pita', 'bread, pumpernickel', 'bread, white', 'breads', 'breads and breadsticks', 'nut bread', 'walnut bread']


In [80]:
from collections import Counter

def get_cluster_root(names):
    token_counts = Counter()
    for name in names:
        words = re.sub(r'[^a-zA-Z\s]', '', name.lower()).split()
        token_counts.update(words)
    return token_counts.most_common(1)[0][0]  # e.g. "apple"

cluster_root_sample=get_cluster_root(sample)
print(cluster_root_sample)


bread


In [81]:
# def normalize(name):
#     return re.sub(r'[^a-zA-Z\s]', '', name.lower()).strip()

print(normalize(cluster_root_sample))

bread


In [82]:
def flatten(d):
    for v in d.values():
        if isinstance(v, dict):
            yield from flatten(v)
        elif isinstance(v, list):
            yield from v

In [83]:
def get_canonical_name(names):
    # "Heuristic: most frequent core word (e.g., 'chocolate' from variants)."
    token_counts = Counter()
    for name in names:
        words = normalize(name).split()
        token_counts.update(words)
    return token_counts.most_common(1)[0][0]

print(get_canonical_name(sample))

bread


In [133]:
# expects string
# def parse_pairings(raw_string):
#     if not raw_string or not isinstance(raw_string, str):
#         return []
#     parts = [p.strip().lower().replace(" ", "_") for p in raw_string.split("+")]
#     return [p for p in parts if p]

#expects array
def parse_pairings(raw_strings):
    return [normalize(n) for n in raw_strings]

print(flavor_map_sample['pairings'])
print(parse_pairings(flavor_map_sample['pairings']))

['beef', 'chicken', 'chiles', 'citrus', 'fish', 'game birds', 'garlic', 'pork', 'shellfish', 'shrimp']
['beef', 'chicken', 'chiles', 'citrus', 'fish', 'game birds', 'garlic', 'pork', 'shellfish', 'shrimp']


In [135]:
def build_ingredient_model_with_clustered_names(cluster_names, flavor_map):
    all_names = list(set(normalize(n) for n in cluster_names))
    core = get_cluster_root(cluster_names)
    flavor = flavor_map.get(core)
    return {
        "id": core.replace(" ", "_"),
        "name": core.title(),
        "flavor_db_name_variants": all_names,
        "status": "draft"
    }

In [134]:
def build_ingredient_model_with_pairings(cluster_data, flavor_map):
    normalized_names = [normalize(n) for n in cluster_data]
    canonical_name = get_canonical_name(normalized_names)

    # Try to get flavor entry using exact match or fallback
    flavor_entry = flavor_map.get(canonical_name) or next(
        (flavor_map.get(name) for name in normalized_names if flavor_map.get(name)), None
    )

    print(flavor_entry)

    # print(flavor_map['achiote seeds'].get("pairings"))

    return {
        "id": canonical_name.replace(" ", "_"),
        "name": canonical_name.title(),
        "flavor_bible_name_variants": sorted(set(normalized_names)),
        "flavor_bible_pairings_ids": parse_pairings(flavor_entry.get("pairings")) if flavor_entry else [],
        "status": "draft"
    }

In [144]:
def write_cluster_file(clusters, flavor_map, func, file_name):
    
    ingredient_models = []
    
    for cluster in clusters.values():
        model = func(cluster, flavor_map) #if model == "build_ingredient_model_with_clustered_names" else build_ingredient_model_with_pairings(clusters, flavor_map) 
        ingredient_models.append(model)
    
    # Save the structured data
    with open(file_name, "w") as f:
        json.dump(ingredient_models, f, indent=2)

# print(flavor_map['achiote seeds'])
# write_cluster_file(clusters, flavor_map, build_ingredient_model_with_clustered_names, "../data/ingredients/draft/ingredients_master.json")
# write_cluster_file(clusters, flavor_map, build_ingredient_model_with_pairings, "../data/ingredients/draft/ingredients_master_1.json")

{'ingredient': 'achiote seeds', 'pairings': ['beef', 'chicken', 'chiles', 'citrus', 'fish', 'game birds', 'garlic', 'pork', 'shellfish', 'shrimp']}
None
None
{'ingredient': 'beer', 'pairings': ['beef', 'cheese, cheddar', 'ham', 'marinades', 'meats', 'onions', 'pork', 'sauces', 'sauerkraut', 'sausages', 'shrimp', 'stews']}
None
{'ingredient': 'mustard', 'pairings': ['apples', 'apples, fruit', 'apples, juice', 'avocados', 'bay leaf', 'beef', 'beets', 'cabbage', 'capers', 'cheeses', 'cheese, soufflé)', 'chicken', 'chile peppers', 'cold cuts', 'coriander', 'crab', 'cream and sour cream', 'cucumbers', 'cumin', 'cured meats', 'curries', 'curry leaves', 'dill', 'egg dishes', 'fennel', 'fenugreek', 'fish', 'fruits', 'garlic', 'gingerbread', 'green beans', 'ham', 'herbs', 'honey', 'lamb', 'leeks', 'lemon, juice', 'mayonnaise', 'meats, cold or hot', 'mint', 'mostarda (mustard fruits)', 'mussels', 'oil, canola', 'olive oil', 'onions', 'oregano', 'paprika', 'parsley', 'pastrami', 'pepper, black', 

In [107]:
def build_master_lookup(master_ingredients):
    lookup = {}
    for item in master_ingredients:
        norm_id = normalize(item['id'])
        lookup[norm_id] = item
        for name in item.get('flavor_bible_name_variants', []):
            lookup[normalize(name)] = item
    return lookup

In [139]:
from difflib import get_close_matches

def merge_ingredient_lists(master_ingredients, food_ingredient_names):
    lookup = build_master_lookup(master_ingredients)
    merged_ingredients = []
    seen_ids = set()

    for ingredient in food_ingredient_names:
        raw_name=ingredient["name"]
        food_name = normalize(raw_name)
        matched_key = None

        # Exact match
        if food_name in lookup:
            matched_key = food_name
        else:
            # Fuzzy match
            matches = get_close_matches(food_name, lookup.keys(), n=1, cutoff=0.85)
            if matches:
                matched_key = matches[0]

        if matched_key:
            merged = dict(lookup[matched_key])
            merged["source"] = list(set(merged.get("source", []) + ["flavor_bible", "foodb"]))
            merged_ingredients.append(merged)
            seen_ids.add(merged["id"])
        else:
            # Create a new entry
            new_id = food_name
            if new_id in seen_ids:
                i = 1
                while f"{new_id}_{i}" in seen_ids:
                    i += 1
                new_id = f"{new_id}_{i}"
            new_entry = {
                "id": new_id,
                "name": raw_name.title(),
                "flavor_db_name_variants": [food_name],
                "source": ["foodb"],
                "status": "draft"
            }
            merged_ingredients.append(new_entry)
            seen_ids.add(new_id)

    return merged_ingredients

In [146]:
def merge_ingredient_lists_with_all_fields(master_ingredients, food_entries):
    lookup = build_master_lookup(master_ingredients)
    merged_ingredients = []
    seen = set()
    id_counter = 1

    for food_entry in food_entries:
        # If Food.json is a list of strings, wrap them as dicts
        if isinstance(food_entry, str):
            food_entry = {"name": food_entry}

        raw_name = food_entry.get("name", "")
        norm_name = normalize(raw_name)
        matched_key = None

        # Exact match
        if norm_name in lookup:
            matched_key = norm_name
        else:
            # Fuzzy match
            matches = get_close_matches(norm_name, lookup.keys(), n=1, cutoff=0.85)
            if matches:
                matched_key = matches[0]

        if matched_key:
            base = dict(lookup[matched_key])
            base["source"] = list(set(base.get("source", []) + ["flavor_bible", "foodb"]))
            base["food_db_name"] = raw_name
            base["flavor_db_name_variants"] = sorted(set(base.get("name_variants", []) + [norm_name]))
            # Merge all additional fields from food_entry
            for k, v in food_entry.items():
                if k != "name":
                    base[k] = v
        else:
            base = {
                "name": raw_name.title(),
                "flavor_db_name_variants": [norm_name],
                "source": ["foodb"],
                "status": "draft",
                "food_db_id": raw_name
            }
            # Include all additional fields
            for k, v in food_entry.items():
                if k != "name":
                    base[k] = v

        # Assign numeric ID
        base["id"] = id_counter
        id_counter += 1
        merged_ingredients.append(base)

    return merged_ingredients

In [147]:
import json

with open("../data/ingredients/draft/ingredients_master_1.json") as f:
    master = json.load(f)

# /Users/alessandravertrees/Development/interactive-cookbook/data/drafts/foodb/foodb_2020_04_07_json
with open("../data/drafts/foodb/foodb_2020_04_07_json/Food.json") as f:
    food = json.load(f)

food_data = food['data']
# print(food['data'])
# merged = merge_ingredient_lists(master, food_data)
merged = merge_ingredient_lists_with_all_fields(master,food_data)

# Save to file
# with open("../data/ingredients/merged/ingredients.json", "w") as f:
#     json.dump(merged, f, indent=2)