In [43]:
import pandas as pd
import string
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import AgglomerativeClustering
from nltk.stem import PorterStemmer
from scipy.spatial.distance import cosine

In [44]:
df = pd.read_csv('pure_vegetarian_recipes_dataset.csv')

In [45]:
stemmer=PorterStemmer()

In [46]:
def preprocess_ingredients(ingredient_str):
    ingredient_str = ingredient_str.lower()
    ingredient_str = ingredient_str.translate(str.maketrans('','',string.punctuation))

    ingredients = [ing.strip() for ing in ingredient_str.split(',')]
    stemmed_ingredients = []
    for ing in ingredients:
        tokens = re.findall(r'\b\w+\b', ing)
        stemmed = ' '.join(stemmer.stem(token) for token in tokens)
        stemmed_ingredients.append(stemmed)

    return stemmed_ingredients

In [47]:
df['Processed_Ingredients'] = df['Ingredients'].apply(preprocess_ingredients)

In [48]:
df['Processed_Text'] = df['Processed_Ingredients'].apply(lambda x: ' '.join(x))

In [49]:
df.head()

Unnamed: 0,Recipe Name,Country,Ingredients,Instructions,Cuisine Type,Dietary Restrictions,Complete Recipe Process,Processed_Ingredients,Processed_Text
0,Spaghetti Carbonara Variation 0,Italy,"Spaghetti, Eggs, Parmesan Cheese, Pancetta, Bl...",Cook spaghetti. Fry pancetta. Mix eggs and che...,Spanish,"Contains dairy, gluten, pork","To prepare Spaghetti Carbonara, start by gathe...",[spaghetti egg parmesan chees pancetta black p...,spaghetti egg parmesan chees pancetta black pe...
1,Borscht Variation 0,Russia,"Beets, Cabbage, Potatoes, Carrots, Onion, Sour...","Boil vegetables, blend, serve with sour cream.",Korean,"Vegetarian (without meat), contains dairy","To prepare Borscht, start by gathering the ing...",[beet cabbag potato carrot onion sour cream],beet cabbag potato carrot onion sour cream
2,Shakshuka Variation 0,Middle East,"Eggs, Tomatoes, Bell Peppers, Onion, Garlic, S...","Cook tomatoes, peppers, and spices, then poach...",Mexican,Vegetarian,"To prepare Shakshuka, start by gathering the i...",[egg tomato bell pepper onion garlic spice],egg tomato bell pepper onion garlic spice
3,Spaghetti Carbonara Variation 1,Italy,"Spaghetti, Eggs, Parmesan Cheese, Pancetta, Bl...",Cook spaghetti. Fry pancetta. Mix eggs and che...,Indian,"Contains dairy, gluten, pork","To prepare Spaghetti Carbonara, start by gathe...",[spaghetti egg parmesan chees pancetta black p...,spaghetti egg parmesan chees pancetta black pe...
4,Borscht Variation 1,Russia,"Beets, Cabbage, Potatoes, Carrots, Onion, Sour...","Boil vegetables, blend, serve with sour cream.",Thai,"Vegetarian (without meat), contains dairy","To prepare Borscht, start by gathering the ing...",[beet cabbag potato carrot onion sour cream],beet cabbag potato carrot onion sour cream


COUNTVERTORIZER + INVERTED INDEX

In [50]:
vectorizer = CountVectorizer()
ingredient_vectors = vectorizer.fit_transform(df['Processed_Text'])
feature_names = vectorizer.get_feature_names_out()

# Show feature names
print("Ingredient Vocabulary:", vectorizer.get_feature_names_out())

# Save vectorized matrix for future use (like clustering or searching)
print("Vectorized Matrix Shape:", ingredient_vectors.shape)

Ingredient Vocabulary: ['basil' 'bean' 'beet' 'bell' 'black' 'broccoli' 'cabbag' 'carrot'
 'cauliflow' 'chees' 'chickpea' 'cilantro' 'corn' 'cream' 'egg' 'flour'
 'fresh' 'garlic' 'ginger' 'mozzarella' 'oil' 'oliv' 'onion' 'option'
 'pancetta' 'parmesan' 'parsley' 'pepper' 'potato' 'rice' 'salt' 'sauc'
 'sour' 'soy' 'spaghetti' 'spice' 'tomato']
Vectorized Matrix Shape: (305, 37)


In [51]:
inverted_index={}
for idx,ingredients in enumerate(df['Processed_Ingredients']):
    for ingredient in ingredients:
        if ingredient not in inverted_index:
            inverted_index[ingredient] = []
        inverted_index[ingredient].append(idx)


In [52]:
clustering_model = AgglomerativeClustering(n_clusters=5)
df['Cluster'] = clustering_model.fit_predict(ingredient_vectors.toarray())

In [53]:
def recommend_recipe(user_ingredients):
    # Preprocess user ingredients
    user_processed = []
    for ing in user_ingredients:
        ing = ing.lower().translate(str.maketrans('', '', string.punctuation))
        tokens = re.findall(r'\b\w+\b', ing)
        stemmed = ' '.join(stemmer.stem(token) for token in tokens)
        user_processed.append(stemmed)

    # Find candidate recipes from inverted index
    candidate_ids = set()
    for ing in user_processed:
        if ing in inverted_index:
            candidate_ids.update(inverted_index[ing])

    # Score candidates by overlap
    best_match = None
    best_score = -1

    for idx in candidate_ids:
        recipe_ings = set(df.loc[idx, 'Processed_Ingredients'])
        overlap = len(set(user_processed) & recipe_ings)
        if overlap > best_score:
            best_score = overlap
            best_match = idx

    # If a good match is found, return it
    if best_match is not None and best_score > 0:
        return {
            "Match Type": "Exact/Partial Match",
            "Recipe Name": df.loc[best_match, 'Recipe Name'],
            "Ingredients": df.loc[best_match, 'Ingredients'],
            "Complete Recipe Process": df.loc[best_match, 'Complete Recipe Process']
        }

    # === Fallback: Use Clustering ===
    user_vector = vectorizer.transform([' '.join(user_processed)]).toarray()[0]
    min_distance = float('inf')
    closest_idx = None

    for idx in range(len(df)):
        recipe_vector = ingredient_vectors[idx].toarray()[0]
        dist = cosine(user_vector, recipe_vector)
        if dist < min_distance:
            min_distance = dist
            closest_idx = idx

    return {
        "Match Type": "Cluster-based Recommendation",
        "Recipe Name": df.loc[closest_idx, 'Recipe Name'],
        "Ingredients": df.loc[closest_idx, 'Ingredients'],
        "Complete Recipe Process": df.loc[closest_idx, 'Complete Recipe Process']
    }


In [54]:
user_ingredients = ["tomato", "onion"]
result = recommend_recipe(user_ingredients)

print("\n✅ Recommended Recipe:")
for key, val in result.items():
    print(f"{key}: {val}")


✅ Recommended Recipe:
Match Type: Cluster-based Recommendation
Recipe Name: Shakshuka Variation 0
Ingredients: Eggs, Tomatoes, Bell Peppers, Onion, Garlic, Spices
Complete Recipe Process: To prepare Shakshuka, start by gathering the ingredients: Eggs, Tomatoes, Bell Peppers, Onion, Garlic, Spices. Follow these steps carefully: 1) Prepare ingredients by washing and chopping as needed. 2) Heat the cooking surface and add base ingredients (e.g., oil, spices, or broth). 3) Cook the main ingredients, ensuring proper seasoning. 4) Let simmer or bake until fully cooked. 5) Serve hot and enjoy!


In [56]:
user_ingredients = ["cheese", "egg", "pasta","cabbag"]

result = recommend_recipe(user_ingredients)
print("\n✅ Recommended Recipe:")
for key, val in result.items():
    print(f"{key}: {val}")


✅ Recommended Recipe:
Match Type: Cluster-based Recommendation
Recipe Name: Spaghetti Carbonara Variation 0
Ingredients: Spaghetti, Eggs, Parmesan Cheese, Pancetta, Black Pepper
Complete Recipe Process: To prepare Spaghetti Carbonara, start by gathering the ingredients: Spaghetti, Eggs, Parmesan Cheese, Pancetta, Black Pepper. Follow these steps carefully: 1) Prepare ingredients by washing and chopping as needed. 2) Heat the cooking surface and add base ingredients (e.g., oil, spices, or broth). 3) Cook the main ingredients, ensuring proper seasoning. 4) Let simmer or bake until fully cooked. 5) Serve hot and enjoy!
