In [32]:
import re
# Canonical comparison key for ingredient names
def normalize_name(name):
    """Return a comparison key for ``name``.

    The name is lower-cased and stripped of surrounding whitespace, then
    every character that is not an ASCII letter or digit is removed.
    """
    lowered = name.lower().strip()
    return re.sub(r'[^a-zA-Z0-9]', '', lowered)


def merge_entries(entries):
    """Collapse a group of duplicate ingredient records into one record.

    The entry with the most truthy values is used as the base; ties keep
    their input order because the sort is stable. The base is shallow-copied,
    so the input dicts are not modified. "pairings" and "name_variants" are
    replaced by the sorted union of those values across all entries, and
    "original_ids" lists every entry's "id", most-filled entry first.

    Args:
        entries: Non-empty list of dicts, each containing an "id" key.

    Returns:
        A new dict based on the most complete entry.

    Raises:
        IndexError: If ``entries`` is empty.
        KeyError: If any entry has no "id" key.
    """
    ranked = sorted(
        entries,
        key=lambda e: sum(1 for v in e.values() if v),
        reverse=True,
    )
    merged = ranked[0].copy()

    all_pairings = set()
    all_variants = set()
    for e in ranked:
        # `or ()` also covers a key that is present but holds None (JSON null),
        # which `e.get(key, [])` would pass straight to set.update and crash.
        all_pairings.update(e.get("pairings") or ())
        all_variants.update(e.get("name_variants") or ())

    merged["pairings"] = sorted(all_pairings)
    merged["name_variants"] = sorted(all_variants)
    merged["original_ids"] = [e["id"] for e in ranked]

    return merged
    
def deduplicate_ingredients(ingredient_list):
    """Collapse entries whose names normalize to the same key.

    Entries are grouped by ``normalize_name(entry["name"])``. A group with a
    single entry is passed through unchanged (the same dict object); a group
    with several entries is combined with ``merge_entries``. Output order
    follows the first appearance of each normalized name.

    Args:
        ingredient_list: Iterable of dicts, each containing a "name" key.

    Returns:
        A list with one dict per distinct normalized name.

    Raises:
        KeyError: If an entry has no "name" key.
    """
    # A plain dict with setdefault keeps this function independent of the
    # `defaultdict` import, which lives in a later notebook cell.
    groups = {}
    for entry in ingredient_list:
        groups.setdefault(normalize_name(entry["name"]), []).append(entry)

    return [
        group[0] if len(group) == 1 else merge_entries(group)
        for group in groups.values()
    ]

In [33]:
# Fields to keep from each food.json record; strip_fields drops every other key.
# Keep a trailing comma on every line: two adjacent string literals without a
# comma are silently concatenated into a single (wrong) field name.
fields_to_keep = [
    "id",
    "name",
    "food_db_id",
    "food_db_name",
    "name_scientific",
    "description",
    "food_group",
    "food_subgroup",
    "food_type",
    "category",
    "ncbi_taxonomy_id",
    "itis_id",
    "public_id",
    "flavor_bible_name_variants",
    "flavor_bible_pairings_ids",
    "source",
    "status",
]

In [34]:
def strip_fields(entry):
    """Return a new dict holding only the keys of ``entry`` listed in fields_to_keep.

    Key order follows ``entry``; the input dict is not modified.
    """
    kept = {}
    for key, value in entry.items():
        if key in fields_to_keep:
            kept[key] = value
    return kept

In [35]:
def clean_food_data(food_data):
    """Run strip_fields over every record and return the results as a new list."""
    stripped = []
    for entry in food_data:
        stripped.append(strip_fields(entry))
    return stripped

In [40]:
import json
from collections import defaultdict
# JSON text is UTF-8 by specification; pass the encoding explicitly so the
# read does not depend on the platform's default locale encoding.
with open("../data/ingredients/merged/ingredients.json", encoding="utf-8") as f:
    raw_data = json.load(f)

In [41]:
# Python 3.7+ maintains dict order, but you can use OrderedDict if needed
from collections import OrderedDict

# Desired order of fields in each output record.
# Keep a trailing comma on every line: two adjacent string literals without a
# comma are silently concatenated into a single (wrong) field name.
field_order = [
    "id",
    "name",
    "food_db_id",
    "food_db_name",
    "name_scientific",
    "description",
    "food_group",
    "food_subgroup",
    "food_type",
    "category",
    "ncbi_taxonomy_id",
    "itis_id",
    "public_id",
    "flavor_bible_name_variants",
    "flavor_bible_pairings_ids",
    "source",
    "status",
]

def reorder_fields(entry, field_order):
    """Return a new dict with the keys of ``entry`` arranged as in ``field_order``.

    Only keys present in both ``entry`` and ``field_order`` are kept; keys of
    ``entry`` that ``field_order`` does not list are dropped.
    """
    ordered = {}
    for key in field_order:
        if key in entry:
            ordered[key] = entry.get(key)
    return ordered

def reorder_all(data, field_order):
    """Apply reorder_fields with ``field_order`` to every entry of ``data``; return a new list."""
    reordered_entries = []
    for entry in data:
        reordered_entries.append(reorder_fields(entry, field_order))
    return reordered_entries

In [42]:
# Pipeline: drop keys not listed in fields_to_keep, collapse entries that share
# a normalized name, then emit each record's keys in field_order.
cleaned = clean_food_data(raw_data)
deduped = deduplicate_ingredients(cleaned)
# NOTE(review): merge_entries sets "pairings", "name_variants" and
# "original_ids", none of which are listed in field_order, so reorder_all
# drops them again here — confirm this is intended.
reordered = reorder_all(deduped, field_order)

In [43]:
# Serialize the reordered records as 2-space-indented JSON and write the text out.
with open("../data/ingredients/merged/cleaned_deduped_food.json", "w") as f:
    f.write(json.dumps(reordered, indent=2))