In [None]:
import pandas as pd

# === Configuration ===
INPUT_PATH = "exploded_normalized_output.xlsx"
OUTPUT_PATH = "recombined_ingredients_by_group_id.xlsx"

# Columns the input sheet must provide.
REQUIRED_COLS = frozenset(
    {"video_id", "quantity", "unit", "normalized_ingredient", "group_id"}
)
# Columns concatenated, in this order, into one ingredient line.
LINE_PART_COLS = ("quantity", "unit", "normalized_ingredient")
# Keys that identify one recombined ingredient list.
GROUP_KEYS = ["video_id", "group_id"]


def _format_part(value):
    """Return one cell rendered as stripped text ("" for a missing cell)."""
    if value is None:
        return ""
    if isinstance(value, float):
        if value != value:  # NaN is the only float not equal to itself
            return ""
        # A numeric column containing gaps is held as floats, so a quantity
        # of 2 arrives as 2.0; render whole numbers without the ".0".
        if value.is_integer():
            return str(int(value))
    return str(value).strip()


def check_required_columns(df):
    """Raise ValueError if *df* lacks any column listed in REQUIRED_COLS."""
    missing = REQUIRED_COLS - set(df.columns)
    if missing:
        raise ValueError(f"Missing required columns: {missing}")


def add_ingredient_lines(df):
    """Return a copy of *df* with blanks filled and a "line" column added.

    Every missing cell is replaced by "" (this includes the grouping keys).
    Each "line" is the non-empty quantity / unit / ingredient parts of the
    row joined by single spaces; a row with no parts gets an empty line.
    The input frame is not modified.
    """
    filled = df.fillna("")
    # Walk the three columns in lockstep instead of a row-wise
    # DataFrame.apply, which constructs a Series object for every row.
    filled["line"] = [
        " ".join(filter(None, (_format_part(cell) for cell in cells)))
        for cells in zip(*(filled[col] for col in LINE_PART_COLS))
    ]
    return filled


def group_ingredient_lines(df):
    """Join the "line" values of each (video_id, group_id) pair.

    Returns a frame with columns video_id, group_id and ingredient_list,
    where ingredient_list holds the group's non-blank lines separated by
    newlines. Groups keep the order in which they first appear in *df*.
    """
    return (
        df.groupby(GROUP_KEYS, sort=False)["line"]
        .apply(lambda lines: "\n".join(ln for ln in lines if ln.strip()))
        .reset_index(name="ingredient_list")
    )


# The steps below stay at module level (not inside a function) so that `df`
# and `grouped_df` remain available as globals after the script/cell runs.
if __name__ == "__main__":
    # === Step 1: Load exploded file ===
    df = pd.read_excel(INPUT_PATH)
    check_required_columns(df)

    # === Step 2: Construct cleaned ingredient line ===
    df = add_ingredient_lines(df)

    # === Step 3: Group by (video_id, group_id) ONLY ===
    grouped_df = group_ingredient_lines(df)

    # === Step 4: Save output ===
    grouped_df.to_excel(OUTPUT_PATH, index=False)
    print(f" Recombined file saved: {OUTPUT_PATH}")
