In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import joblib

# === Step 1: Load the Trained Models ===
# Name the on-disk artefacts first so the load calls read uniformly.
sbert_model_path = "ingredient_model/sbert_model"
knn_model_path = "ingredient_model/knn_model.pkl"

# Sentence encoder used to embed ingredient text.
sbert = SentenceTransformer(sbert_model_path)
# Classifier applied to those embeddings. NOTE(review): joblib.load unpickles
# the file, so it should only ever point at a trusted artefact.
knn = joblib.load(knn_model_path)

# === Step 2: Load Manual Mapping from labelled_data.xlsx ===
label_df = pd.read_excel("labelled_data.xlsx")

# Use correct column names: "Original Line" and "Selected Match".
# Rows missing either value cannot contribute a mapping, so drop them.
label_df = label_df.dropna(subset=["Original Line", "Selected Match"])

# Build a lowercase lookup: cleaned original line -> cleaned selected match.
# When the same original line appears more than once, the last row wins.
manual_map = {}
for original_line, selected_match in zip(
    label_df["Original Line"], label_df["Selected Match"]
):
    # Excel cells may come back as numbers; cast so .strip() cannot fail.
    key = str(original_line).strip().lower()
    if not key:
        # A whitespace-only line would become the key "", which is a
        # substring of every string and would break substring matching
        # against this map.
        continue
    manual_map[key] = str(selected_match).strip().lower()

# === Step 3: Load the Exploded Ingredient File ===
input_file = "exploded_output_by_video_id.xlsx"
df = pd.read_excel(input_file)

# Fail fast when the sheet lacks the column the rest of the script reads.
has_ingredient_column = "ingredient" in df.columns
if not has_ingredient_column:
    raise ValueError("The input file must contain a column named 'ingredient'.")

# === Step 4: Normalize Each Ingredient Line (manual override + model) ===
def _normalize_text(ingredient_text):
    """Return the normalized label for one cleaned, non-empty ingredient string.

    Resolution order:
      1. Text containing ":" yields "".
      2. An exact hit in ``manual_map`` yields the mapped value.
      3. The first non-empty ``manual_map`` key (in the map's iteration
         order) that occurs as a substring yields its mapped value.
      4. Otherwise the text is embedded with ``sbert`` and classified by ``knn``.
    """
    if ":" in ingredient_text:
        return ""
    if ingredient_text in manual_map:
        return manual_map[ingredient_text]

    # Substring fallback. Empty keys are skipped: "" is contained in every
    # string, so it would otherwise be picked first and hide real matches.
    matched_key = next(
        (k for k in manual_map if k and k in ingredient_text), None
    )
    if matched_key is not None:
        return manual_map[matched_key]

    embedding = sbert.encode([ingredient_text], show_progress_bar=False)
    return knn.predict(embedding)[0]


# Identical ingredient strings resolve to the same label, so each distinct
# string is resolved once and reused; this avoids repeating the substring
# scan and the model call for duplicate rows.
normalized_cache = {}
normalized_col = []

for ingredient in df["ingredient"]:
    ingredient_text = str(ingredient).strip().lower()

    # str(NaN) is "nan", so missing cells must be detected on the raw value.
    if not ingredient_text or pd.isna(ingredient):
        normalized_col.append("")
        continue

    if ingredient_text not in normalized_cache:
        normalized_cache[ingredient_text] = _normalize_text(ingredient_text)
    normalized_col.append(normalized_cache[ingredient_text])

# === Step 5: Save Output ===
output_file = "exploded_normalized_output.xlsx"

# Attach the per-row results as a new column, then write the sheet without
# the DataFrame index.
df["normalized_ingredient"] = normalized_col
df.to_excel(output_file, index=False)

print(
    f"✅ Normalized output (with manual mapping + colon-safe logic) saved as: {output_file}"
)


✅ Normalized output (with manual mapping + colon-safe logic) saved as: exploded_normalized_output.xlsx
