In [None]:
import pandas as pd
# Load both food-group tables. The code below reads the "Grupo" column of the
# large table and the "Alimento" column of the small one.
large_df = pd.read_csv("grupo_alimentos_large.csv")
small_df = pd.read_csv("grupo_alimentos_small.csv")

# Normalise text (lower-case, trimmed) so matching ignores case and padding.
large_df["grupo_large_norm"] = large_df["Grupo"].str.lower().str.strip()

# Candidate names from the small table, longest first, so that the most
# specific name is tried before any shorter name it contains.
# Missing values are dropped before normalising: they would otherwise reach
# sorted() as float NaN and make key=len raise TypeError. Empty names are
# dropped too, because "" is a substring of every text and would match all rows.
small_list = sorted(
    (
        alimento
        for alimento in small_df["Alimento"].dropna().str.lower().str.strip()
        if alimento
    ),
    key=len,
    reverse=True,
)

# Mapping function
def mapear_a_small(texto, candidatos=None):
    """Return the first candidate name that is contained in ``texto``.

    Parameters
    ----------
    texto : str
        Normalised text to search in. Any non-string value (for example the
        NaN pandas produces for a missing cell) yields ``pd.NA``.
    candidatos : iterable of str, optional
        Names to look for, tried in the given order. When omitted, the
        module-level ``small_list`` is used.

    Returns
    -------
    str or pd.NA
        The first candidate that is a substring of ``texto``, or ``pd.NA``
        when nothing matches.
    """
    # A missing cell arrives as float NaN, and `"x" in nan` raises TypeError.
    if not isinstance(texto, str):
        return pd.NA
    if candidatos is None:
        candidatos = small_list
    for small in candidatos:
        # Skip non-string and empty candidates: "" is contained in every text.
        if isinstance(small, str) and small and small in texto:
            return small
    return pd.NA

# Apply the mapping to every normalised group name.
large_df["grupo_small"] = large_df["grupo_large_norm"].apply(mapear_a_small)

# Keep only the rows where a match was found, then write the original group
# name next to its matched name.
mapped = large_df[large_df["grupo_small"].notna()]
mapped.loc[:, ["Grupo", "grupo_small"]].to_csv(
    "grupo_large_to_small.csv", encoding="utf-8-sig", index=False
)


In [None]:
import pandas as pd

# Reload both tables: the large one provides the "Grupo" column, the small one
# the "Alimento" column.
small_df = pd.read_csv("grupo_alimentos_small.csv")
large_df = pd.read_csv("grupo_alimentos_large.csv")

# Add a normalised (lower-case, trimmed) copy of the English group name.
large_df = large_df.assign(
    grp_eng_norm=lambda frame: frame["Grupo"].str.lower().str.strip()
)

# Mapping rules: { Spanish category: [English keywords...] }
# Keywords are lower-case and are matched by map_to_spanish as substrings of
# the normalised group name. Entry order is significant when more than one
# category matches, so do not reorder entries without re-checking the output.
mapping_rules = {
    "Huevos":                ["egg","eggs"],
    "Carne de res":          ["beef","ground beef"],
    "Pollo":                 ["chicken","poultry"],
    "Conejo":                ["rabbit"],
    "Cordero":               ["lamb","goat","game"],
    "Cerdo":                 ["pork","bacon","ham"],
    "Embutidos":             ["cold cuts","sausage","frankfurter"],
    "Merluza":               ["hake","merluza"],
    "Sardina":               ["sardina","sardine"],
    "Atún fresco":           ["fresh tuna","tuna fresh"],
    "Caballa":               ["mackerel"],
    "Salmón":                ["salmon"],
    "Merluza (congelada)":   ["hake, frozen","merluza (frozen)"],
    "Mejillones":            ["mussel","mussels"],
    "Calamares y pulpo":     ["squid","octopus"],
    "Langostinos y camarones":["shrimp","prawn","langostino","camarones"],
    "Calamares/pulpo (congelados)": ["squid, frozen","octopus, frozen"],
    "Langostinos/camarones (congelados)": ["shrimp, frozen","prawn, frozen"],
    "Atún en conserva":      ["tuna, canned","canned tuna"],
    "Mejillones en conserva":["mussels, canned","canned mussels"],
    "Anchoas":               ["anchovy","anchovies"],
    "Leche":                 ["milk","milk, reduced fat","milk, whole","milk, lowfat","milk, nonfat","flavored milk","plant-based milk"],
    "Batidos":               ["milk shake","smoothie","grain drink"],
    "Helado":                ["ice cream","frozen dairy"],
    "Yogurt":                ["yogurt","yoghurt"],
    "Mantequilla":           ["butter","animal fats"],
    "Queso fresco":          ["cottage","ricotta","cream cheese","cheese, fresh"],
    "Queso semiduro":        ["cheddar","edam","semihard cheese","queso semiduro"],
    "Queso maduro":          ["parmesan","aged cheese","hard cheese","queso maduro"],
    "Pan":                   ["bread","roll","bagel","muffin","bun","yeast bread"],
    "Arroz":                 ["rice","fried rice","rice mixed"],
    "Pasta":                 ["pasta","noodle","macaroni","spaghetti","pasta mixed","pasta, noodles","pasta sauces"],
    "Galletas":              ["cookie","cracker","biscuit","sweet roll","pastry"],
    "Cereales":              ["cereal","oatmeal","grits","cereal bar","nutrition bar"],
    "Chocolate (tableta)":   ["chocolate, candy containing chocolate"],
    "Snack de chocolate":    ["snack mix","chocolate snack"],
    "Cacao en polvo":        ["cocoa powder"],
    "Azúcar":                ["sugar","honey","syrup"],
    "Legumbres":             ["bean","pea","lentil","legume"],
    "Aceite de oliva":       ["olive oil"],
    "Aceite vegetal":        ["vegetable oil","salad dressings and vegetable oils"],
    "Margarina":             ["margarine"],
    "Papas":                 ["potato","mashed potatoes","french fries","chips"],
    "Tomate":                ["tomato","tomatoes"],
    "Lechuga":               ["lettuce"],
    "Otros vegetales":       ["vegetable","spinach","broccoli","carrot","pepper","cabbage","onion","coleslaw"],
    "Cítricos":              ["citrus","orange","lemon","lime","grapefruit","tangerine"],
    "Plátano":               ["banana"],
    "Manzana":               ["apple"],
    "Durazno":               ["peach","nectarine"],
    "Aceitunas":             ["olive, pickles","olives"],
    "Frutos secos":          ["nut","seed","peanut butter","almond","walnut","cashew"],
    "Tomate procesado":      ["ketchup","tomato-based","pasta sauce"],
    "Gazpacho":              ["gazpacho"],
    "Fabada":                ["fabada"],
    # NOTE(review): "ketchup" is already a keyword of "Tomate procesado" above.
    "Ketchup":               ["ketchup"],
    "Café":                  ["coffee"],
    "Vino":                  ["wine"],
    # NOTE(review): "wine" is also the keyword of "Vino" on the previous entry.
    "Cerveza":               ["beer","liquor","cocktail","wine"],  # "beer" and "wine" already map; cocktails may need to be dropped
    "Agua":                  ["water","tap water","bottled water","enhanced water"],
    "Jugo":                  ["juice","fruit drink"],
    "Gaseosa":               ["soda","soft drink","carbonated water","flavored water"],
    "Sal":                   ["salt"]
}

# Translate a normalised English group name into its Spanish category.
def map_to_spanish(grp_eng, rules=None):
    """Return the Spanish category whose keyword best matches ``grp_eng``.

    Every keyword of every category is tested as a substring of ``grp_eng``.
    The category owning the longest matching keyword wins, so a specific rule
    such as "hake, frozen" is not shadowed by a shorter one such as "hake"
    that happens to appear earlier. When two matching keywords have the same
    length, the category listed first in ``rules`` is kept.

    Parameters
    ----------
    grp_eng : str
        Normalised (lower-case) English group name. Any non-string value
        (for example NaN from a missing cell) yields ``pd.NA``.
    rules : dict[str, list[str]], optional
        Mapping of Spanish category to English keywords. When omitted, the
        module-level ``mapping_rules`` is used.

    Returns
    -------
    str or pd.NA
        The matched Spanish category, or ``pd.NA`` when no keyword matches.
    """
    # A missing cell arrives as float NaN, and `"x" in nan` raises TypeError.
    if not isinstance(grp_eng, str):
        return pd.NA
    if rules is None:
        rules = mapping_rules

    best_category = pd.NA
    best_length = 0
    for esp, keywords in rules.items():
        for kw in keywords:
            # Strict ">" keeps the earliest category on ties and also rejects
            # empty keywords, which would otherwise match every name.
            if len(kw) > best_length and kw in grp_eng:
                best_category, best_length = esp, len(kw)
    return best_category

# Map every normalised group name to its Spanish category.
large_df["categoria_es"] = large_df["grp_eng_norm"].apply(map_to_spanish)

# Keep only the rows that received a category.
mapped = large_df[large_df["categoria_es"].notna()]

# Final result: original group name next to its Spanish category.
mapped.loc[:, ["Grupo", "categoria_es"]].to_csv(
    "grupo_large_to_small_manual.csv", encoding="utf-8-sig", index=False
)


In [None]:
# Load the consolidated grouping table.
# NOTE(review): this file is not written by any of the cells visible above;
# confirm where it is produced.
df = pd.read_csv("agrupamiento_total.csv", encoding="utf-8-sig")

# Distinct non-missing values of "categoria_es", in order of first appearance.
unique_cats = df["categoria_es"].drop_duplicates().dropna()

In [15]:
# Show the distinct categories as a plain Python list.
print(unique_cats.to_list())

['Leche', 'Yogurt', 'Batidos', 'Helado', 'Queso fresco', 'Huevos', 'Carne de res', 'Embutidos', 'Cerdo', 'Mantequilla', 'Cordero', 'Pollo', 'Arroz', 'Pasta', 'Pan', 'Cereales', 'Legumbres', 'Papas', 'Tomate', 'Otros vegetales', 'Lechuga', 'Aceitunas', 'Manzana', 'Plátano', 'Cítricos', 'Jugo', 'Azúcar', 'Café', 'Gaseosa', 'Agua', 'Cerveza', 'Vino', 'Durazno', 'Frutos secos', 'Galletas']
