In [48]:
import json
import re
import unicodedata
# Load index.json; only its 'results' entry is used.
# JSON is read as UTF-8 explicitly so the result does not depend on the
# platform's default locale encoding.
with open('../data/raw/index.json', encoding='utf-8') as f:
    index_data = json.load(f)['results']

# Load flavor_bible_full.json
with open('../data/raw/flavor_bible_full.json', encoding='utf-8') as f:
    flavor_bible = json.load(f)

# Normalize function
def normalize(name):
    """Reduce a name to a lowercase, ASCII-letters-only lookup key.

    Accented characters are decomposed first so their base letter survives,
    then everything that is not a-z or whitespace is dropped, and runs of
    whitespace are collapsed to single spaces with the ends trimmed.

    Args:
        name: Raw name string, e.g. "Bread, Pita".

    Returns:
        The cleaned key, e.g. "bread pita". This is an empty string when the
        input contains no ASCII-representable letters.
    """
    # NFKD splits "è" into "e" plus a combining accent; the accent is removed
    # by the regex below, so "crème" becomes "creme" instead of "crme".
    decomposed = unicodedata.normalize('NFKD', name)
    letters_only = re.sub(r'[^a-zA-Z\s]', '', decomposed.lower())
    # split()/join closes the gaps left by removed punctuation
    # ("salt & pepper" -> "salt pepper") and trims both ends.
    return ' '.join(letters_only.split())

# Index the flavor entries by their normalized 'main' name for direct lookup.
# When two entries normalize to the same key, the later one replaces the
# earlier one.
# NOTE(review): if the file holds several rows per 'main' (one per pairing),
# only the last row survives here — confirm against the data.
flavor_map = dict((normalize(entry['main']), entry) for entry in flavor_bible)
flavor_map_sample = flavor_map['achiote seeds']
print(flavor_map_sample)

{'main': 'ACHIOTE SEEDS', 'pairing': 'achiote + pork + sour orange'}


In [24]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
from collections import defaultdict

# Turn every index entry into an embedding vector.
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(index_data)

# No fixed number of clusters: the distance threshold decides where merging stops.
clustering = AgglomerativeClustering(n_clusters=None, distance_threshold=1.2)
labels = clustering.fit_predict(embeddings)

# Collect the original names under the cluster label each one received.
clusters = defaultdict(list)
for name, label in zip(index_data, labels):
    clusters[label].append(name)

In [25]:
# Take the cluster labelled 1 as a worked example; later cells reuse `sample`.
sample = clusters[1]
print(sample)

['bread', 'bread, bagels', 'bread, breadsticks, croutons, etc.', 'bread, crusty', 'bread, french', 'bread, olive', 'bread, pita', 'bread, pumpernickel', 'bread, white', 'breads', 'breads and breadsticks', 'nut bread', 'walnut bread']


In [45]:
from collections import Counter

def get_cluster_root(names):
    """Return the word that occurs most often across a cluster's names.

    Each name is lowercased and stripped of everything except ASCII letters
    and whitespace, then split into words. On a tie the word encountered
    first wins. Raises IndexError when no words are found at all.
    """
    cleaned = (re.sub(r'[^a-zA-Z\s]', '', raw.lower()) for raw in names)
    tally = Counter(word for text in cleaned for word in text.split())
    top_word, _occurrences = tally.most_common(1)[0]
    return top_word  # e.g. "bread"

# Try the root-word heuristic on the example cluster.
cluster_root_sample=get_cluster_root(sample)
print(cluster_root_sample)


bread


In [49]:
# Sanity check: run the cluster root through normalize(), which is defined
# alongside the data-loading code earlier in the notebook.

print(normalize(cluster_root_sample))

bread


In [41]:
def flatten(d):
    """Yield the items of every list found among a (nested) dict's values.

    Dict values are descended into recursively; list values have their items
    yielded as-is (items are not inspected further). Values of any other type
    are skipped.
    """
    for value in d.values():
        if isinstance(value, list):
            for element in value:
                yield element
        elif isinstance(value, dict):
            for element in flatten(value):
                yield element

In [47]:
def get_canonical_name(names):
    """Heuristic canonical name: the most frequent word across `names`.

    Every name goes through normalize() and is split on whitespace; the word
    with the highest count is returned (e.g. 'chocolate' from a set of
    chocolate variants). Raises IndexError when no words are found.
    """
    words = (word for name in names for word in normalize(name).split())
    return Counter(words).most_common(1)[0][0]

# Try the canonical-name heuristic on the example cluster.
print(get_canonical_name(sample))

bread


In [53]:
def parse_pairings(raw_string):
    """Split a "a + b + c" pairing string into a list of id-style tokens.

    Each piece between "+" signs is trimmed, lowercased and has its spaces
    replaced by underscores; empty pieces are dropped. Falsy or non-string
    input yields an empty list.
    """
    if not raw_string:
        return []
    if not isinstance(raw_string, str):
        return []

    pairing_ids = []
    for chunk in raw_string.split("+"):
        token = chunk.strip().lower().replace(" ", "_")
        if token:
            pairing_ids.append(token)
    return pairing_ids

# Show the raw pairing string of the sample entry next to its parsed form.
print(flavor_map_sample['pairing'])
print(parse_pairings(flavor_map_sample['pairing']))

achiote + pork + sour orange
['achiote', 'pork', 'sour_orange']


In [37]:
def build_ingredient_model_with_clustered_names(cluster_names, flavor_map):
    """Build a draft ingredient record (without pairings) from one cluster.

    Args:
        cluster_names: Raw name strings that belong to a single cluster.
        flavor_map: Accepted so this builder can be called the same way as
            build_ingredient_model_with_pairings; it is not consulted here.

    Returns:
        dict with "id" and "name" derived from the cluster's root word,
        "name_variants" (sorted, de-duplicated normalized names) and
        "status" set to "draft".
    """
    # Sorted so the JSON written from these records is identical from run to
    # run; iterating a bare set of strings has no stable order between
    # interpreter sessions.
    all_names = sorted({normalize(n) for n in cluster_names})
    core = get_cluster_root(cluster_names)
    return {
        "id": core.replace(" ", "_"),
        "name": core.title(),
        "name_variants": all_names,
        "status": "draft"
    }

In [56]:
def build_ingredient_model_with_pairings(cluster_data, flavor_map):
    """Build a draft ingredient record, including pairing ids, for one cluster.

    The flavor entry is looked up by the cluster's canonical name first. If
    that finds nothing, the first normalized cluster name with a truthy entry
    in `flavor_map` is used instead. Without any entry, "pairing_ids" is an
    empty list.
    """
    normalized_names = [normalize(raw) for raw in cluster_data]
    canonical_name = get_canonical_name(normalized_names)

    flavor_entry = flavor_map.get(canonical_name)
    if not flavor_entry:
        # Fallback: walk the cluster's own names in order, stop at the first hit.
        flavor_entry = None
        for candidate in normalized_names:
            candidate_entry = flavor_map.get(candidate)
            if candidate_entry:
                flavor_entry = candidate_entry
                break

    if flavor_entry:
        pairing_ids = parse_pairings(flavor_entry.get("pairing"))
    else:
        pairing_ids = []

    return {
        "id": canonical_name.replace(" ", "_"),
        "name": canonical_name.title(),
        "name_variants": sorted(set(normalized_names)),
        "pairing_ids": pairing_ids,
        "status": "draft"
    }

In [58]:
def write_cluster_file(clusters, flavor_map, func, file_name):
    """Build one ingredient record per cluster and save them all as JSON.

    `func` is called as func(cluster, flavor_map) for every value of
    `clusters`, in iteration order; the collected results are written to
    `file_name` as an indented JSON array.
    """
    # Every record is built before the file is opened, so a builder that
    # raises leaves any existing file untouched.
    ingredient_models = [func(names, flavor_map) for names in clusters.values()]

    with open(file_name, "w") as out_file:
        json.dump(ingredient_models, out_file, indent=2)

# Write both variants of the ingredient model: ingredients_master.json holds
# the records without pairings, ingredients_master_1.json the records with
# pairing ids (the merge step further down reads the latter).
print(flavor_map['achiote seeds'])
write_cluster_file(clusters, flavor_map, build_ingredient_model_with_clustered_names, "../data/ingredients/ingredients_master.json")
write_cluster_file(clusters, flavor_map, build_ingredient_model_with_pairings, "../data/ingredients/ingredients_master_1.json")

{'main': 'ACHIOTE SEEDS', 'pairing': 'achiote + pork + sour orange'}


In [63]:
def build_master_lookup(master_ingredients):
    """Map every normalized id and alternate spelling to its master record.

    Args:
        master_ingredients: Iterable of ingredient dicts. Each needs an "id";
            "name_variants" and "common_names" are optional lists of
            alternate spellings.

    Returns:
        dict from normalized name to the ingredient dict it belongs to. When
        two records share a key, the one processed later wins.
    """
    lookup = {}
    for item in master_ingredients:
        # ids use "_" where the name had a space; undo that before normalizing,
        # because normalize() deletes underscores and would glue the words
        # together ("sour_orange" -> "sourorange").
        aliases = [item['id'].replace('_', ' ')]
        # The cluster-based builders in this notebook store spellings under
        # "name_variants"; records created by the merge step use "common_names".
        # Index both so neither kind of record is missed.
        aliases.extend(item.get('name_variants') or [])
        aliases.extend(item.get('common_names') or [])
        for alias in aliases:
            key = normalize(alias)
            # A name without any letters normalizes to ""; indexing it would
            # make every other letter-less name resolve to this record.
            if key:
                lookup[key] = item
    return lookup

In [70]:
from difflib import get_close_matches

def _closest_lookup_key(food_name, lookup, candidate_keys, cutoff):
    """Return the lookup key for food_name: exact hit, else best fuzzy hit, else None."""
    if food_name in lookup:
        return food_name
    matches = get_close_matches(food_name, candidate_keys, n=1, cutoff=cutoff)
    return matches[0] if matches else None


def _unused_id(base_id, seen_ids):
    """Return base_id, or base_id plus the first free "_1", "_2", ... suffix."""
    if base_id not in seen_ids:
        return base_id
    suffix = 1
    while f"{base_id}_{suffix}" in seen_ids:
        suffix += 1
    return f"{base_id}_{suffix}"


def merge_ingredient_lists(master_ingredients, food_ingredient_names, cutoff=0.85):
    """Match food entries against the master ingredient list.

    Exactly one record is appended per food entry, in input order:

    * If the normalized food name hits the master lookup (exactly, or via
      difflib fuzzy matching), a copy of that master record is emitted with
      "master" and "food" added to its "source" list.
    * Otherwise a new draft record is created from the food name.

    Master records that no food entry matches are not part of the result.
    NOTE(review): several food entries matching the same master record each
    produce their own copy, so the result can repeat an id — confirm whether
    that is intended.

    Args:
        master_ingredients: Iterable of master ingredient dicts (see
            build_master_lookup).
        food_ingredient_names: Iterable of dicts with a "name" key.
        cutoff: Minimum difflib similarity (0..1) for a fuzzy match. The
            default keeps the previously hard-coded 0.85.

    Returns:
        list of ingredient dicts.

    Raises:
        KeyError: if a food entry has no "name".
    """
    lookup = build_master_lookup(master_ingredients)
    # The key set does not change while matching, so snapshot it once.
    candidate_keys = list(lookup)
    merged_ingredients = []
    seen_ids = set()

    for ingredient in food_ingredient_names:
        raw_name = ingredient["name"]
        food_name = normalize(raw_name)
        matched_key = _closest_lookup_key(food_name, lookup, candidate_keys, cutoff)

        if matched_key:
            # Shallow copy so the master record itself is left unmodified.
            merged = dict(lookup[matched_key])
            # sorted(): a bare set has no stable order between interpreter
            # sessions, which made the saved JSON differ from run to run.
            merged["source"] = sorted(set(merged.get("source", []) + ["master", "food"]))
            merged_ingredients.append(merged)
            seen_ids.add(merged["id"])
        else:
            # No master record is close enough: create a new draft entry.
            new_id = _unused_id(food_name, seen_ids)
            merged_ingredients.append({
                "id": new_id,
                "name": raw_name.title(),
                "common_names": [food_name],
                "source": ["food"],
                "status": "draft"
            })
            seen_ids.add(new_id)

    return merged_ingredients

In [None]:
def merge_ingredient_lists_with_ids(master_ingredients, food_ingredient_names):
    """Run merge_ingredient_lists and tag every record with a "food_db_id".

    The matching and record-creation logic is delegated to
    merge_ingredient_lists instead of being duplicated here, so both entry
    points always apply the same rules.

    Args:
        master_ingredients: Iterable of master ingredient dicts.
        food_ingredient_names: Iterable of dicts with a "name" key.

    Returns:
        list of ingredient dicts, one per food entry and in input order, each
        carrying the originating entry's raw name under "food_db_id".
    """
    # Materialize first so the entries can be walked twice (by the merge and
    # by the tagging loop below), even if a one-shot iterator was passed in.
    food_entries = list(food_ingredient_names)
    merged_ingredients = merge_ingredient_lists(master_ingredients, food_entries)

    # merge_ingredient_lists appends exactly one fresh dict per input entry, in
    # order, so pairing the two sequences by position is safe and the dicts
    # can be extended in place without touching the master records.
    for record, entry in zip(merged_ingredients, food_entries):
        # NOTE(review): this stores the entry's *name*, not an identifier
        # field — confirm that is what "food_db_id" is meant to hold.
        record["food_db_id"] = entry["name"]

    return merged_ingredients


In [71]:
import json

# Master records written earlier by build_ingredient_model_with_pairings.
# All files are opened as UTF-8 explicitly so the result does not depend on
# the platform's default locale encoding.
with open("../data/ingredients/ingredients_master_1.json", encoding="utf-8") as f:
    master = json.load(f)

# Food table from the foodb_2020_04_07_json directory (path is relative to
# this notebook; no machine-specific absolute path is needed).
with open("../data/drafts/foodb/foodb_2020_04_07_json/Food.json", encoding="utf-8") as f:
    food = json.load(f)

# The food records sit under the top-level 'data' key.
food_data = food['data']
merged = merge_ingredient_lists(master, food_data)

# Save to file
with open("../data/ingredients/merged/ingredients.json", "w", encoding="utf-8") as f:
    json.dump(merged, f, indent=2)