In [None]:
import ast
import re

import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util

In [None]:
# Read the raw recipes table from disk into the working DataFrame
df = pd.read_csv(
    filepath_or_buffer="/home/minhas/cgn-dp-24-1/culinary_compass/data/recipes.csv"
)

In [None]:
# non_veg_ingredients = ["chicken", "beef", "pork", "fish", "shrimp", "lamb", "eggs","egg"]
# df["DietaryCategory"] = df["RecipeIngredientParts"].apply(
#     lambda ingredients: "Non-Vegetarian" if any(item in str(ingredients).lower() for item in non_veg_ingredients) else "Vegetarian"
# )


In [None]:
# Work on the first 3000 recipes only. Take an explicit copy so the column
# assignments made in later cells write to an independent frame instead of a
# slice of the full table (avoids pandas' SettingWithCopyWarning).
df = df.iloc[:3000].copy()

In [None]:
# Define non-vegetarian ingredients
non_veg_ingredients = set([
    # Meat & Poultry
    "chicken", "beef", "pork", "mutton", "lamb", "turkey", "duck", "quail", "goat", "veal",
    "rabbit", "boar", "venison", "bison", "kangaroo", "goose", "pheasant", "pigeon", "elk",

    # Processed Meat Products
    "bacon", "ham", "sausage", "pepperoni", "salami", "chorizo", "pastrami", "prosciutto",
    "mortadella", "hot dog", "jerky", "liverwurst", "blood sausage", "scrapple",

    # Seafood
    "fish", "tuna", "salmon", "trout", "cod", "haddock", "mackerel", "sardine", "anchovy",
    "herring", "catfish", "bass", "snapper", "grouper", "halibut", "swordfish", "mahi mahi",
    "flounder", "eel", "shark", "sturgeon", "tilapia", "tuna steaks", "swordfish steaks",

    # Shellfish
    "shrimp", "prawns", "crab", "lobster", "crawfish", "squid", "octopus", "scallops",
    "mussels", "clams", "oysters", "abalone", "conch",

    # Animal-Based Ingredients
    "eggs", "gelatin", "lard", "suet", "tallow", "bone broth", "fish sauce", "oyster sauce",
    "shrimp paste", "anchovy paste", "worcestershire sauce", "caviar", "roe", "squid ink",

    # Organ Meats (Offal)
    "liver", "kidney", "heart", "brain", "tripe", "sweetbreads", "tongue", "gizzards",

    # Compound spellings that whole-word matching would otherwise miss
    # (plain substring matching used to catch these by accident)
    "hamburger", "crabmeat", "shellfish", "crayfish", "monkfish", "whitefish", "codfish",
])


def _build_non_veg_pattern(keywords):
    """Compile one regex that matches any keyword as a whole word.

    Each keyword is reduced to a singular stem and matched with an optional
    plural ending, so "eggs" also matches "egg" and "anchovy" also matches
    "anchovies". Word boundaries stop keywords from matching inside unrelated
    words (e.g. "ham" in "graham", "eel" in "peel", "lard" in "collard").

    Args:
        keywords: iterable of lower-case keyword strings.

    Returns:
        A compiled ``re.Pattern``. For an empty keyword collection the
        pattern never matches.
    """
    alternatives = []
    for keyword in sorted(keywords):  # sorted only to make the pattern deterministic
        stem = keyword
        # Strip a plural "s", but keep words that merely end in s ("bass", "octopus")
        if stem.endswith("s") and not stem.endswith(("ss", "us")):
            stem = stem[:-1]
        if len(stem) > 1 and stem.endswith("y") and stem[-2] not in "aeiou":
            # consonant + y pluralises to "ies" (anchovy -> anchovies)
            alternatives.append(re.escape(stem[:-1]) + r"(?:y|ies)")
        else:
            alternatives.append(re.escape(stem) + r"(?:e?s)?")
    if not alternatives:
        return re.compile(r"(?!)")  # empty keyword set: match nothing
    return re.compile(r"\b(?:" + "|".join(alternatives) + r")\b")


# Built once at definition time; re-run this cell after editing the keyword set.
_NON_VEG_PATTERN = _build_non_veg_pattern(non_veg_ingredients)


# Function to classify recipes correctly
def classify_recipe(ingredients):
    """Label an ingredient field as "Non-Vegetarian" or "Vegetarian".

    Args:
        ingredients: raw ingredient field; it is converted with ``str()``, so
            any value is accepted (a missing value becomes "nan").

    Returns:
        "Non-Vegetarian" if any keyword from ``non_veg_ingredients`` occurs as
        a whole word (singular or plural) in the lower-cased text, otherwise
        "Vegetarian".

    Known limitation: vegetarian ingredients that contain a keyword as a
    separate word (e.g. "kidney beans", "goat cheese") are still labelled
    "Non-Vegetarian".
    """
    # Quotes, commas and the "c(" wrapper all act as word boundaries, so the
    # field can be searched directly without splitting it into a list first.
    text = str(ingredients).lower()
    if _NON_VEG_PATTERN.search(text):
        return "Non-Vegetarian"
    return "Vegetarian"


# Label every recipe by running the classifier over its ingredient field
df["DietaryCategory"] = df["RecipeIngredientParts"].map(classify_recipe)

In [None]:
# Sanity check: list the distinct labels present in the new column
df["DietaryCategory"].unique()

In [None]:
# Render every column and never truncate cell text when displaying frames
pd.options.display.max_columns = None
# pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [None]:
# df[["RecipeIngredientParts", "DietaryCategory"]]

In [None]:
# Load the pretrained sentence-embedding model used to encode ingredient text
model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

In [None]:
# Step 3: Compute embeddings for ingredients
# Encode all rows in one batched call rather than one model.encode() call per
# row; the library batches the list internally and can show a progress bar.
ingredient_texts = df["RecipeIngredientParts"].astype(str).tolist()
ingredient_embeddings = model.encode(
    ingredient_texts, convert_to_numpy=True, show_progress_bar=True
)
# Store each vector as a plain list of floats (one list per row) so the column
# serialises to CSV as text; the explicit index keeps rows aligned with df.
df["IngredientEmbedding"] = pd.Series(ingredient_embeddings.tolist(), index=df.index)

In [None]:

# Step 4: Persist the frame, embeddings included, to a new CSV (index column omitted)
df.to_csv(
    path_or_buf="/home/minhas/cgn-dp-24-1/culinary_compass/data/recipes_with_embeddings.csv",
    index=False,
)

In [None]:
# Step 5: Reload the recipes table that already carries precomputed embeddings
df = pd.read_csv(
    filepath_or_buffer="/home/minhas/cgn-dp-24-1/culinary_compass/data/recipes_with_embeddings.csv"
)

In [None]:
# The CSV round-trip turns each embedding into the text of a Python list.
# Parse it back with ast.literal_eval, which only accepts literals and so,
# unlike eval(), cannot execute code embedded in the file.
df["IngredientEmbedding"] = df["IngredientEmbedding"].apply(
    lambda text: torch.tensor(ast.literal_eval(text))
)

In [None]:
# Preview the first five rows of the reloaded frame
df.head(n=5)

In [None]:
def recommend_recipes(nutrients, ingredients, diet_preference, top_n=5,
                      nutrient_weight=0.3, ingredient_weight=0.7):
    """Recommend recipes based on user nutrients, ingredients, and dietary preference.

    Reads the notebook-level ``df`` (recipes with "DietaryCategory",
    "IngredientEmbedding" and nutrient columns) and ``model``.

    Args:
        nutrients: mapping with a numeric target for every nutrient column
            listed in ``nutrient_columns`` below.
        ingredients: iterable of ingredient strings; joined with spaces and
            embedded as one query.
        diet_preference: "Any" to rank all recipes, otherwise the exact
            "DietaryCategory" label to keep.
        top_n: number of recipes to return (default 5).
        nutrient_weight: weight of the nutrient similarity in the final score.
        ingredient_weight: weight of the ingredient cosine similarity.

    Returns:
        List of up to ``top_n`` dicts with keys "Name", "Images" and
        "RecipeInstructions", best match first. Empty list when no recipe
        matches ``diet_preference``.

    Raises:
        KeyError: if ``nutrients`` lacks one of the nutrient columns.
    """
    # Filter dataset based on dietary preference
    if diet_preference != "Any":
        filtered_df = df[df["DietaryCategory"] == diet_preference].copy()
    else:
        filtered_df = df.copy()

    # torch.stack() raises on an empty list, so bail out before scoring
    if filtered_df.empty:
        return []

    # Encode input ingredients. The result is moved to the CPU because the
    # model may return it on another device, while the stored embeddings are
    # stacked on the CPU and .numpy() only works for CPU tensors.
    input_ingredient_embedding = model.encode(
        " ".join(ingredients), convert_to_tensor=True
    ).cpu()

    # as_tensor accepts both tensors and plain float lists, so this also works
    # when the embeddings column was never round-tripped through the CSV.
    recipe_embeddings = torch.stack([
        torch.as_tensor(vector, dtype=torch.float32, device="cpu")
        for vector in filtered_df["IngredientEmbedding"].tolist()
    ])

    # Compute cosine similarity; reshape(-1) keeps the result 1-D even when
    # only a single recipe survived the filter (squeeze() would make it 0-d).
    ingredient_similarities = util.pytorch_cos_sim(
        recipe_embeddings, input_ingredient_embedding
    ).reshape(-1).numpy()

    # Nutrient columns compared against the user's targets
    nutrient_columns = ["Calories", "FatContent", "SaturatedFatContent", "CholesterolContent",
                        "SodiumContent", "CarbohydrateContent", "FiberContent", "SugarContent", "ProteinContent"]

    # Missing nutrient values are treated as 0
    nutrient_array = filtered_df[nutrient_columns].fillna(0).to_numpy()

    # Compute Euclidean distance and map it into (0, 1]
    # NOTE(review): columns are compared in raw units (no scaling is applied),
    # so large-valued nutrients dominate the distance — consider standardising.
    input_nutrient_array = np.array([nutrients[col] for col in nutrient_columns]).reshape(1, -1)
    nutrient_distances = np.linalg.norm(nutrient_array - input_nutrient_array, axis=1)
    nutrient_similarities = 1 / (1 + nutrient_distances)

    # Compute final scores as a weighted blend of both similarities
    final_scores = (nutrient_weight * nutrient_similarities) + (ingredient_weight * ingredient_similarities)

    # Rank recipes and return top results
    filtered_df["SimilarityScore"] = final_scores
    top_recipes = filtered_df.sort_values(by="SimilarityScore", ascending=False).head(top_n)

    return top_recipes[["Name", "Images", "RecipeInstructions"]].to_dict(orient="records")


In [None]:
# Step 7: Sample user input for a manual test run
user_nutrients = dict(
    Calories=500,
    FatContent=20,
    SaturatedFatContent=5,
    CholesterolContent=10,
    SodiumContent=500,
    CarbohydrateContent=50,
    FiberContent=10,
    SugarContent=10,
    ProteinContent=30,
)

user_ingredients = [
    "Egg, whole, raw, frozen, salted, pasteurized",
    "Cheese, American, restaurant",
    "Cheese, cotija, solid",
    "Crustaceans, crab, alaska king, raw",
    "Mollusks, clam, mixed species, raw",
    "Seaweed, wakame, raw",
    "Cream cheese, full fat, block",
]

# Dietary filter chosen by the user: "Vegetarian", "Non-Vegetarian", or "Any"
diet_preference = "Any"

# Step 8: Rank recipes for the sample inputs defined above
recommendations = recommend_recipes(
    nutrients=user_nutrients,
    ingredients=user_ingredients,
    diet_preference=diet_preference,
)

# Step 9: Print name, image reference and instructions for each hit,
# followed by a 50-dash separator line
for recipe in recommendations:
    print(f"Name: {recipe['Name']}\nImage: {recipe['Images']}\nInstructions: {recipe['RecipeInstructions']}\n{'-'*50}")

In [None]:
# Inspect the row(s) for one specific recipe by exact name
df.loc[df["Name"] == "Chicken with Lemongrass"]

# End

In [None]:
# Dropdown for the dietary filter; the selected option is stored in diet_preference.
# NOTE(review): `st` is not imported anywhere in the visible notebook, so this
# line raises NameError here — presumably pasted from a Streamlit app; confirm
# and either import streamlit or drop this line from the notebook.
diet_preference = st.sidebar.selectbox("Dietary Preference:", ["Any", "Vegetarian", "Non-Vegetarian"])



non_veg_ingredients = ["chicken", "beef", "pork", "fish", "shrimp", "lamb", "eggs", "egg"]

# One alternation with word boundaries replaces a separate regex search (and a
# separate lower-casing of the text) per keyword per row; re.escape keeps any
# future keyword containing regex metacharacters literal.
non_veg_pattern = r"\b(?:" + "|".join(re.escape(item) for item in non_veg_ingredients) + r")\b"

# astype(str) mirrors str(ingredients): missing values become the text "nan"
ingredient_text = df["RecipeIngredientParts"].astype(str).str.lower()
df["DietaryCategory"] = np.where(
    ingredient_text.str.contains(non_veg_pattern, regex=True),
    "Non-Vegetarian",
    "Vegetarian",
)

# Display a short preview of the updated DataFrame instead of printing every
# row and column (the frame is large and column width is set to unlimited)
print(df[["RecipeIngredientParts", "DietaryCategory"]].head())


def recommend_recipes(nutrients, ingredients, diet_preference, top_n=5,
                      nutrient_weight=0.6, ingredient_weight=0.4):
    """Recommend recipes based on user nutrients, ingredients, and dietary preference.

    Reads the notebook-level ``df`` (recipes with "DietaryCategory",
    "IngredientEmbedding" and nutrient columns) and ``model``.

    Args:
        nutrients: mapping with a numeric target for every nutrient column
            listed in ``nutrient_columns`` below.
        ingredients: iterable of ingredient strings; joined with spaces and
            embedded as one query.
        diet_preference: "Any" to rank all recipes, otherwise the exact
            "DietaryCategory" label to keep.
        top_n: number of recipes to return (default 5).
        nutrient_weight: weight of the nutrient similarity in the final score.
        ingredient_weight: weight of the ingredient cosine similarity.

    Returns:
        List of up to ``top_n`` dicts with keys "Name", "Images" and
        "RecipeInstructions", best match first. Empty list when no recipe
        matches ``diet_preference``.

    Raises:
        KeyError: if ``nutrients`` lacks one of the nutrient columns.
    """
    # Filter dataset based on dietary preference
    if diet_preference != "Any":
        filtered_df = df[df["DietaryCategory"] == diet_preference].copy()
    else:
        filtered_df = df.copy()

    # torch.stack() raises on an empty list, so bail out before scoring
    if filtered_df.empty:
        return []

    # Encode input ingredients. The result is moved to the CPU because the
    # model may return it on another device, while the stored embeddings are
    # stacked on the CPU and .numpy() only works for CPU tensors.
    input_ingredient_embedding = model.encode(
        " ".join(ingredients), convert_to_tensor=True
    ).cpu()

    # as_tensor accepts both tensors and plain float lists
    recipe_embeddings = torch.stack([
        torch.as_tensor(vector, dtype=torch.float32, device="cpu")
        for vector in filtered_df["IngredientEmbedding"].tolist()
    ])

    # Compute cosine similarity; reshape(-1) keeps the result 1-D even when
    # only a single recipe survived the filter (squeeze() would make it 0-d).
    ingredient_similarities = util.pytorch_cos_sim(
        recipe_embeddings, input_ingredient_embedding
    ).reshape(-1).numpy()

    # Nutrient columns compared against the user's targets
    nutrient_columns = ["Calories", "FatContent", "SaturatedFatContent", "CholesterolContent",
                        "SodiumContent", "CarbohydrateContent", "FiberContent", "SugarContent", "ProteinContent"]

    # Missing nutrient values are treated as 0
    nutrient_array = filtered_df[nutrient_columns].fillna(0).to_numpy()

    # Compute Euclidean distance and map it into (0, 1]
    # NOTE(review): columns are compared in raw units (no scaling is applied),
    # so large-valued nutrients dominate the distance — consider standardising.
    input_nutrient_array = np.array([nutrients[col] for col in nutrient_columns]).reshape(1, -1)
    nutrient_distances = np.linalg.norm(nutrient_array - input_nutrient_array, axis=1)
    nutrient_similarities = 1 / (1 + nutrient_distances)

    # Compute final scores as a weighted blend of both similarities
    final_scores = (nutrient_weight * nutrient_similarities) + (ingredient_weight * ingredient_similarities)

    # Rank recipes and return top results
    filtered_df["SimilarityScore"] = final_scores
    top_recipes = filtered_df.sort_values(by="SimilarityScore", ascending=False).head(top_n)

    return top_recipes[["Name", "Images", "RecipeInstructions"]].to_dict(orient="records")


# Re-run the recommender with the inputs and dietary preference currently in scope
recommendations = recommend_recipes(
    nutrients=user_nutrients,
    ingredients=user_ingredients,
    diet_preference=diet_preference,
)



In [None]:
# Define non-vegetarian keywords
non_veg_keywords = set([
    # Meat & Poultry
    "chicken", "beef", "pork", "mutton", "lamb", "turkey", "duck", "quail", "goat", "veal",
    "rabbit", "boar", "venison", "bison", "kangaroo", "goose", "pheasant", "pigeon", "elk",

    # Processed Meat Products
    "bacon", "ham", "sausage", "pepperoni", "salami", "chorizo", "pastrami", "prosciutto",
    "mortadella", "hot dog", "jerky", "liverwurst", "blood sausage", "scrapple",

    # Seafood
    "fish", "tuna", "salmon", "trout", "cod", "haddock", "mackerel", "sardine", "anchovy",
    "herring", "catfish", "bass", "snapper", "grouper", "halibut", "swordfish", "mahi mahi",
    "flounder", "eel", "shark", "sturgeon", "tilapia", "tuna steaks", "swordfish steaks",

    # Shellfish
    "shrimp", "prawns", "crab", "lobster", "crawfish", "squid", "octopus", "scallops",
    "mussels", "clams", "oysters", "abalone", "conch",

    # Animal-Based Ingredients
    "eggs", "gelatin", "lard", "suet", "tallow", "bone broth", "fish sauce", "oyster sauce",
    "shrimp paste", "anchovy paste", "worcestershire sauce", "caviar", "roe", "squid ink",

    # Organ Meats (Offal)
    "liver", "kidney", "heart", "brain", "tripe", "sweetbreads", "tongue", "gizzards",

    # Compound spellings that whole-word matching would otherwise miss
    # (plain substring matching used to catch these by accident)
    "hamburger", "crabmeat", "shellfish", "crayfish", "monkfish", "whitefish", "codfish",
])


def _compile_keyword_pattern(keywords):
    """Compile one regex that matches any keyword as a whole word.

    Each keyword is reduced to a singular stem and matched with an optional
    plural ending ("eggs" also matches "egg", "anchovy" also matches
    "anchovies"). Word boundaries prevent hits inside unrelated words such as
    "graham" (ham), "peel" (eel) or "collard" (lard).

    Args:
        keywords: iterable of lower-case keyword strings.

    Returns:
        A compiled ``re.Pattern``; it never matches when ``keywords`` is empty.
    """
    alternatives = []
    for keyword in sorted(keywords):  # sorted only to make the pattern deterministic
        stem = keyword
        # Strip a plural "s", but keep words that merely end in s ("bass", "octopus")
        if stem.endswith("s") and not stem.endswith(("ss", "us")):
            stem = stem[:-1]
        if len(stem) > 1 and stem.endswith("y") and stem[-2] not in "aeiou":
            # consonant + y pluralises to "ies" (anchovy -> anchovies)
            alternatives.append(re.escape(stem[:-1]) + r"(?:y|ies)")
        else:
            alternatives.append(re.escape(stem) + r"(?:e?s)?")
    if not alternatives:
        return re.compile(r"(?!)")  # empty keyword set: match nothing
    return re.compile(r"\b(?:" + "|".join(alternatives) + r")\b")


# Built once at definition time; re-run this cell after editing the keyword set.
_NON_VEG_KEYWORD_PATTERN = _compile_keyword_pattern(non_veg_keywords)


# Function to classify recipes
def classify_recipe(row):
    """
    Classifies a recipe as 'Vegetarian' or 'Non-Vegetarian' based on:
    - `RecipeIngredientParts`
    - `RecipeCategory`

    Args:
        row: mapping-like record (e.g. a DataFrame row) providing both fields;
            each value is converted with ``str()`` before matching.

    Returns:
        "Non-Vegetarian" if any keyword from ``non_veg_keywords`` occurs as a
        whole word (singular or plural) in either field, else "Vegetarian".
        These labels deliberately match the values that ``recommend_recipes``
        compares "DietaryCategory" against; the earlier "Veg"/"Non-Veg" labels
        would make every dietary filter select no rows.

    Known limitation: vegetarian items that contain a keyword as a separate
    word (e.g. "kidney beans", "goat cheese") are still labelled
    "Non-Vegetarian".
    """
    # Quotes, commas and the "c(" wrapper all act as word boundaries, so each
    # field can be searched directly without cleaning or splitting it first.
    fields = (row["RecipeIngredientParts"], row["RecipeCategory"])
    if any(_NON_VEG_KEYWORD_PATTERN.search(str(field).lower()) for field in fields):
        return "Non-Vegetarian"

    return "Vegetarian"

# Run the row-wise classifier over every recipe and store the label
df["DietaryCategory"] = df.apply(classify_recipe, axis="columns")