In [26]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
import spacy
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [27]:
import pandas as pd

df = pd.read_csv('recipes1.csv')
df.head()

Unnamed: 0,Title,Total Cook Time,Prep Time,Cook Time,Recipe Servings,Difficulty,Ingredients,Image Link
0,Eggless Banana Pancake Recipe,20 mins,05 mins,15 mins,2.0,Easy,"1 Cup all-purpose flour, 1 tbsp sugar, 1 tsp ...",https://i.ndtvimg.com/i/2015-07/pancake-625_62...
1,Leftover Rice Pancakes Recipe,25 mins,10 mins,15 mins,2.0,Easy,"1 Cup leftover cooked rice, 1 cup all-purpose ...",https://c.ndtvimg.com/2023-07/o4pnb5j8_leftove...
2,Desi Style Masala-Cheese Hot Dog Recipe,20 mins,05 mins,15 mins,2.0,Easy,"4 tsp oil, 2 boiled potatoes, 1 tsp butter, 1...",https://c.ndtvimg.com/2023-04/ekp1vv4g_hot-dog...
3,Healthy Banana Pancakes Recipe,20 mins,10 mins,10 mins,5.0,Easy,"2 Ripe Banana, 2 Eggs, 1 tsp vanilla extract...",https://c.ndtvimg.com/2019-03/dehgn798_oats-pa...
4,3 Ingredient Banana Pancake Recipe,10 mins,05 mins,05 mins,2.0,Easy,"2 Bananas, 2 Eggs, 2 tbsp oats/flour",https://c.ndtvimg.com/2021-04/tg682u28_pancake...


In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1075 entries, 0 to 1074
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Title            1075 non-null   object 
 1   Total Cook Time  1075 non-null   object 
 2   Prep Time        1075 non-null   object 
 3   Cook Time        1066 non-null   object 
 4   Recipe Servings  1028 non-null   float64
 5   Difficulty       1025 non-null   object 
 6   Ingredients      1075 non-null   object 
 7   Image Link       1075 non-null   object 
dtypes: float64(1), object(7)
memory usage: 67.3+ KB


In [29]:
total_recipes = df.shape[0]
print(f"Total number of recipes: {total_recipes}")

Total number of recipes: 1075


In [30]:
for column in df.columns:
    df[column].fillna(df[column].mode()[0], inplace=True)

print(df.isnull().sum())  


Title              0
Total Cook Time    0
Prep Time          0
Cook Time          0
Recipe Servings    0
Difficulty         0
Ingredients        0
Image Link         0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].mode()[0], inplace=True)


In [31]:
df['Recipe Servings'] = df['Recipe Servings'].astype(int)


In [32]:
# Remove any duplicate recipes
df.drop_duplicates(inplace=True)


In [33]:
df['Ingredients'] = df['Ingredients'].apply(lambda x: str(x).lower())
df['Difficulty'] = df['Difficulty'].apply(lambda x: str(x).lower())

In [34]:
df.head()

Unnamed: 0,Title,Total Cook Time,Prep Time,Cook Time,Recipe Servings,Difficulty,Ingredients,Image Link
0,Eggless Banana Pancake Recipe,20 mins,05 mins,15 mins,2,easy,"1 cup all-purpose flour, 1 tbsp sugar, 1 tsp ...",https://i.ndtvimg.com/i/2015-07/pancake-625_62...
1,Leftover Rice Pancakes Recipe,25 mins,10 mins,15 mins,2,easy,"1 cup leftover cooked rice, 1 cup all-purpose ...",https://c.ndtvimg.com/2023-07/o4pnb5j8_leftove...
2,Desi Style Masala-Cheese Hot Dog Recipe,20 mins,05 mins,15 mins,2,easy,"4 tsp oil, 2 boiled potatoes, 1 tsp butter, 1...",https://c.ndtvimg.com/2023-04/ekp1vv4g_hot-dog...
3,Healthy Banana Pancakes Recipe,20 mins,10 mins,10 mins,5,easy,"2 ripe banana, 2 eggs, 1 tsp vanilla extract...",https://c.ndtvimg.com/2019-03/dehgn798_oats-pa...
4,3 Ingredient Banana Pancake Recipe,10 mins,05 mins,05 mins,2,easy,"2 bananas, 2 eggs, 2 tbsp oats/flour",https://c.ndtvimg.com/2021-04/tg682u28_pancake...


In [35]:
import pandas as pd
import re
import spacy

# Load the spaCy English model
nlp = spacy.load('en_core_web_sm')

# Words to be removed from the ingredient lists
remove_words = ['tsp', 'tbsp', 'ml', 'gm', 'cups', 'boiled', 'chopped', 'sliced', 'crushed', 'powder', 'salt', 'butter', 'oil','cup','pinched','mashed','melted','allpurpose','ripe','baking','cooked','leftover','hot','diced','paste','gms','pinch','optional','large','small']

# Stopwords that are not relevant for ingredient lists
stopwords = ['and', 'with', 'in', 'for', 'of', 'to', 'from', 'a', 'the', 'on', 'at', 'by','or']

# Function to clean the ingredient string
def clean_ingredients(ingredient_str):
    # Remove digits
    ingredient_str = re.sub(r'\d+', '', ingredient_str)
    # Remove punctuation
    ingredient_str = re.sub(r'[^\w\s]', '', ingredient_str)
    
    # Split the ingredients into tokens
    tokens = ingredient_str.split()
    # Remove unwanted words
    cleaned_tokens = [token for token in tokens if token not in remove_words and token not in stopwords]
    
    return ' '.join(cleaned_tokens)


# Applying the cleaning function to the 'Ingredients' column
df['Core_Ingredients'] = df['Ingredients'].apply(clean_ingredients)

# Function to extract nouns from a given text using spaCy
def extract_nouns(text):
    doc = nlp(text)
    # Extract nouns and proper nouns
    nouns = {token.text for token in doc if token.pos_ in ['NOUN', 'PROPN']}
    return ' '.join(nouns)

# Replace the Core_Ingredients column with extracted nouns
df['Core_Ingredients'] = df['Core_Ingredients'].apply(extract_nouns)



In [36]:
df.head()

Unnamed: 0,Title,Total Cook Time,Prep Time,Cook Time,Recipe Servings,Difficulty,Ingredients,Image Link,Core_Ingredients
0,Eggless Banana Pancake Recipe,20 mins,05 mins,15 mins,2,easy,"1 cup all-purpose flour, 1 tbsp sugar, 1 tsp ...",https://i.ndtvimg.com/i/2015-07/pancake-625_62...,milk banana sugar flour
1,Leftover Rice Pancakes Recipe,25 mins,10 mins,15 mins,2,easy,"1 cup leftover cooked rice, 1 cup all-purpose ...",https://c.ndtvimg.com/2023-07/o4pnb5j8_leftove...,vanilla sugar buttermilkmilk rice flour soda e...
2,Desi Style Masala-Cheese Hot Dog Recipe,20 mins,05 mins,15 mins,2,easy,"4 tsp oil, 2 boiled potatoes, 1 tsp butter, 1...",https://c.ndtvimg.com/2023-04/ekp1vv4g_hot-dog...,buns chutney taste capsicum mint garlic corian...
3,Healthy Banana Pancakes Recipe,20 mins,10 mins,10 mins,5,easy,"2 ripe banana, 2 eggs, 1 tsp vanilla extract...",https://c.ndtvimg.com/2019-03/dehgn798_oats-pa...,eggs vanilla banana cinnamon oats
4,3 Ingredient Banana Pancake Recipe,10 mins,05 mins,05 mins,2,easy,"2 bananas, 2 eggs, 2 tbsp oats/flour",https://c.ndtvimg.com/2021-04/tg682u28_pancake...,eggs bananas


In [37]:
# Extracting all unique words in the 'Core_Ingredients' column
all_ingredients = []
for ingredients in df['Core_Ingredients']:
    items = [ingredient.strip() for ingredient in ingredients.split()]
    all_ingredients.extend(items)

vocab = sorted(list(set(all_ingredients)))

print("Size of Vocabulary:", len(vocab))

Size of Vocabulary: 1460


In [38]:
def vectorize_ingredients(ingredient_str, vocab):
    
    vector = [0] * len(vocab)
    
    ingredients = [ingredient.strip() for ingredient in ingredient_str.split()]
  
    for ingredient in ingredients:
        if ingredient in vocab:
            vector[vocab.index(ingredient)] = 1
    
    return vector

df['Ingredient_Vector'] = df['Core_Ingredients'].apply(lambda x: vectorize_ingredients(x, vocab))

In [39]:
df.head()

Unnamed: 0,Title,Total Cook Time,Prep Time,Cook Time,Recipe Servings,Difficulty,Ingredients,Image Link,Core_Ingredients,Ingredient_Vector
0,Eggless Banana Pancake Recipe,20 mins,05 mins,15 mins,2,easy,"1 cup all-purpose flour, 1 tbsp sugar, 1 tsp ...",https://i.ndtvimg.com/i/2015-07/pancake-625_62...,milk banana sugar flour,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Leftover Rice Pancakes Recipe,25 mins,10 mins,15 mins,2,easy,"1 cup leftover cooked rice, 1 cup all-purpose ...",https://c.ndtvimg.com/2023-07/o4pnb5j8_leftove...,vanilla sugar buttermilkmilk rice flour soda e...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Desi Style Masala-Cheese Hot Dog Recipe,20 mins,05 mins,15 mins,2,easy,"4 tsp oil, 2 boiled potatoes, 1 tsp butter, 1...",https://c.ndtvimg.com/2023-04/ekp1vv4g_hot-dog...,buns chutney taste capsicum mint garlic corian...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,Healthy Banana Pancakes Recipe,20 mins,10 mins,10 mins,5,easy,"2 ripe banana, 2 eggs, 1 tsp vanilla extract...",https://c.ndtvimg.com/2019-03/dehgn798_oats-pa...,eggs vanilla banana cinnamon oats,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,3 Ingredient Banana Pancake Recipe,10 mins,05 mins,05 mins,2,easy,"2 bananas, 2 eggs, 2 tbsp oats/flour",https://c.ndtvimg.com/2021-04/tg682u28_pancake...,eggs bananas,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [40]:
# Check the tokenization result for a few rows
def check_tokenization(df, vocab):
    for index, row in df.iterrows():
        print(f"Original Ingredients: {row['Ingredients']}")
        print(f"Cleaned Ingredients: {row['Core_Ingredients']}")
        
        # Verify that for each ingredient in 'Core_Ingredients', its corresponding index in the vector is 1
        ingredients = row['Core_Ingredients'].split()
        for ingredient in ingredients:
            if ingredient in vocab:
                vocab_index = vocab.index(ingredient)
                print(f"Ingredient '{ingredient}' found in vocabulary at index {vocab_index}")
                print(f"Vector value at index {vocab_index}: {row['Ingredient_Vector'][vocab_index]}")
            else:
                print(f"Ingredient '{ingredient}' not found in vocabulary")
        print("="*50)

# Call the check function
check_tokenization(df, vocab)


Original Ingredients: 1 cup  all-purpose flour, 1 tbsp sugar, 1 tsp baking powder, a pinch of salt, 1  ripe banana, mashed, 1 cup milk, 2 tbsp melted butter or oil
Cleaned Ingredients: milk banana sugar flour
Ingredient 'milk' found in vocabulary at index 828
Vector value at index 828: 1
Ingredient 'banana' found in vocabulary at index 70
Vector value at index 70: 1
Ingredient 'sugar' found in vocabulary at index 1291
Vector value at index 1291: 1
Ingredient 'flour' found in vocabulary at index 470
Vector value at index 470: 1
Original Ingredients: 1 cup leftover cooked rice, 1 cup all-purpose flour, 1 tbsp granulated sugar, 1 tsp baking powder, 1/2 tsp baking soda, 1/4 tsp salt, 1 cup buttermilk/milk, 1 large egg, 2 tbsp melted butter/oil, 1 tbsp vanilla extract (optional)
Cleaned Ingredients: vanilla sugar buttermilkmilk rice flour soda egg butteroil
Ingredient 'vanilla' found in vocabulary at index 1384
Vector value at index 1384: 1
Ingredient 'sugar' found in vocabulary at index 12

Ingredient 'paneer' found in vocabulary at index 927
Vector value at index 927: 1
Ingredient 'ghee' found in vocabulary at index 512
Vector value at index 512: 1
Ingredient 'multigrain' found in vocabulary at index 861
Vector value at index 861: 1
Ingredient 'bowl' found in vocabulary at index 132
Vector value at index 132: 1
Ingredient 'cheese' found in vocabulary at index 219
Vector value at index 219: 1
Ingredient 'loaf' found in vocabulary at index 759
Vector value at index 759: 1
Ingredient 'masala' found in vocabulary at index 800
Vector value at index 800: 1
Ingredient 'onions' found in vocabulary at index 903
Vector value at index 903: 1
Original Ingredients: 1 bowl   dal, 2-3 tbsp besan, 1 tsp onion, finely chopped, 1/2 tsp green chillies, finely chopped, 1/2 tsp coriander leaves, oil to fry
Cleaned Ingredients: chillies dal bowl onion coriander
Ingredient 'chillies' found in vocabulary at index 240
Vector value at index 240: 1
Ingredient 'dal' found in vocabulary at index 370

Vector value at index 607: 1
Ingredient 'water' found in vocabulary at index 1413
Vector value at index 1413: 1
Ingredient 'flavor' found in vocabulary at index 464
Vector value at index 464: 1
Ingredient 'yogurt' found in vocabulary at index 1443
Vector value at index 1443: 1
Ingredient 'thick' found in vocabulary at index 1341
Vector value at index 1341: 1
Ingredient 'gram' found in vocabulary at index 542
Vector value at index 542: 1
Original Ingredients: 10  chikus (sapota/ sapodilla), 1/2 cup fresh cream, 1 cup sugar, 300 ml milk (full fat), 1/2 cup milk powder
Cleaned Ingredients: cream sugar chikus milk sapota sapodilla
Ingredient 'cream' found in vocabulary at index 334
Vector value at index 334: 1
Ingredient 'sugar' found in vocabulary at index 1291
Vector value at index 1291: 1
Ingredient 'chikus' found in vocabulary at index 233
Vector value at index 233: 1
Ingredient 'milk' found in vocabulary at index 828
Vector value at index 828: 1
Ingredient 'sapota' found in vocabulary

In [41]:
def extract_accuracy(row):
    val1 = row['Ingredients'].split(',')
    val2 = row['Core_Ingredients'].split()
    
    len1 = len(val1)
    len2 = len(val2)
    
    return int((len2/len1)*100)
    
    

In [42]:
df['Accuracy'] = df.apply(extract_accuracy, axis=1)

In [43]:
df.head()

Unnamed: 0,Title,Total Cook Time,Prep Time,Cook Time,Recipe Servings,Difficulty,Ingredients,Image Link,Core_Ingredients,Ingredient_Vector,Accuracy
0,Eggless Banana Pancake Recipe,20 mins,05 mins,15 mins,2,easy,"1 cup all-purpose flour, 1 tbsp sugar, 1 tsp ...",https://i.ndtvimg.com/i/2015-07/pancake-625_62...,milk banana sugar flour,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",50
1,Leftover Rice Pancakes Recipe,25 mins,10 mins,15 mins,2,easy,"1 cup leftover cooked rice, 1 cup all-purpose ...",https://c.ndtvimg.com/2023-07/o4pnb5j8_leftove...,vanilla sugar buttermilkmilk rice flour soda e...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",80
2,Desi Style Masala-Cheese Hot Dog Recipe,20 mins,05 mins,15 mins,2,easy,"4 tsp oil, 2 boiled potatoes, 1 tsp butter, 1...",https://c.ndtvimg.com/2023-04/ekp1vv4g_hot-dog...,buns chutney taste capsicum mint garlic corian...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",83
3,Healthy Banana Pancakes Recipe,20 mins,10 mins,10 mins,5,easy,"2 ripe banana, 2 eggs, 1 tsp vanilla extract...",https://c.ndtvimg.com/2019-03/dehgn798_oats-pa...,eggs vanilla banana cinnamon oats,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",100
4,3 Ingredient Banana Pancake Recipe,10 mins,05 mins,05 mins,2,easy,"2 bananas, 2 eggs, 2 tbsp oats/flour",https://c.ndtvimg.com/2021-04/tg682u28_pancake...,eggs bananas,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",66


In [44]:
df = df.reset_index(drop=True)

In [45]:
import numpy as np
def recommend_recipes(user_vector, df, top_n=5):
    recipe_vectors = np.array(df['Ingredient_Vector'].tolist())
    user_vector = (vectorize_ingredients(user_vector,vocab))
    user_vector = np.array(user_vector).reshape(1, -1)
    similarity_scores = cosine_similarity(user_vector, recipe_vectors)[0]
    
    recipe_similarity = list(enumerate(similarity_scores))
    top_indices = sorted(recipe_similarity, key=lambda x: x[1], reverse=True)[:top_n]
    
    top_recipe_indices = [index for index, score in top_indices]
    
    return top_recipe_indices

In [46]:
user_ingredients = 'flour sugar banana milk'
top_recipes = recommend_recipes(user_ingredients, df, top_n=5)
print('Top 5 most similar recipes are:')
for recipe in top_recipes:
    name = df.loc[recipe, 'Title']
    Ingredients = df.loc[recipe, 'Core_Ingredients']
    print(name, " : " , Ingredients)
    print()

Top 5 most similar recipes are:
Eggless Banana Pancake Recipe  :  milk banana sugar flour

Eggless Pav Recipe  :  yeast sugar milk flour purpose

Nutella Waffles Recipe  :  sugar milk flour purpose nutella

Pancake Lava Of Love Recipe  :  white sugar milk taste flour egg

Dinner Roll Recipe  :  yeast white sugar milk flour water egg lukewarm



In [47]:
import pickle

# Save your vocabulary and any other models if necessary
with open('vocab.pkl', 'wb') as f:
    pickle.dump(vocab, f)

# Optionally save the DataFrame if needed
df.to_csv('recipes.csv', index=False)  # Save your DataFrame as a CSV
