In [241]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer


After loading in pandas and matplot lib, let's make a dataframe from our downloaded csv file:

In [242]:
IndianFoodDF = pd.read_csv('IndianFoodDatasetCSV.csv')

In [243]:
#Take a first look at what we have
IndianFoodDF.head()

Unnamed: 0,Srno,RecipeName,TranslatedRecipeName,Ingredients,TranslatedIngredients,PrepTimeInMins,CookTimeInMins,TotalTimeInMins,Servings,Cuisine,Course,Diet,Instructions,TranslatedInstructions,URL
0,1,Masala Karela Recipe,Masala Karela Recipe,"6 Karela (Bitter Gourd/ Pavakkai) - deseeded,S...","6 Karela (Bitter Gourd/ Pavakkai) - deseeded,S...",15,30,45,6,Indian,Side Dish,Diabetic Friendly,"To begin making the Masala Karela Recipe,de-se...","To begin making the Masala Karela Recipe,de-se...",https://www.archanaskitchen.com/masala-karela-...
1,2,टमाटर पुलियोगरे रेसिपी - Spicy Tomato Rice (Re...,Spicy Tomato Rice (Recipe),"2-1/2 कप चावल - पका ले,3 टमाटर,3 छोटा चमच्च बी...","2-1 / 2 cups rice - cooked, 3 tomatoes, 3 teas...",5,10,15,3,South Indian Recipes,Main Course,Vegetarian,टमाटर पुलियोगरे बनाने के लिए सबसे पहले टमाटर क...,"To make tomato puliogere, first cut the tomato...",http://www.archanaskitchen.com/spicy-tomato-ri...
2,3,Ragi Semiya Upma Recipe - Ragi Millet Vermicel...,Ragi Semiya Upma Recipe - Ragi Millet Vermicel...,"1-1/2 cups Rice Vermicelli Noodles (Thin),1 On...","1-1/2 cups Rice Vermicelli Noodles (Thin),1 On...",20,30,50,4,South Indian Recipes,South Indian Breakfast,High Protein Vegetarian,"To begin making the Ragi Vermicelli Recipe, fi...","To begin making the Ragi Vermicelli Recipe, fi...",http://www.archanaskitchen.com/ragi-vermicelli...
3,4,Gongura Chicken Curry Recipe - Andhra Style Go...,Gongura Chicken Curry Recipe - Andhra Style Go...,"500 grams Chicken,2 Onion - chopped,1 Tomato -...","500 grams Chicken,2 Onion - chopped,1 Tomato -...",15,30,45,4,Andhra,Lunch,Non Vegeterian,To begin making Gongura Chicken Curry Recipe f...,To begin making Gongura Chicken Curry Recipe f...,http://www.archanaskitchen.com/gongura-chicken...
4,5,आंध्रा स्टाइल आलम पचड़ी रेसिपी - Adrak Chutney ...,Andhra Style Alam Pachadi Recipe - Adrak Chutn...,"1 बड़ा चमच्च चना दाल,1 बड़ा चमच्च सफ़ेद उरद दाल,2...","1 tablespoon chana dal, 1 tablespoon white ura...",10,20,30,4,Andhra,South Indian Breakfast,Vegetarian,आंध्रा स्टाइल आलम पचड़ी बनाने के लिए सबसे पहले ...,"To make Andhra Style Alam Pachadi, first heat ...",https://www.archanaskitchen.com/andhra-style-a...


It appears that every row in this df is a unique recipe. How much and what kinds of data do we have? Let's see below:

In [244]:
#find the rows/columns
IndianFoodDF.shape

(6871, 15)

In [245]:
#check out the column names for the 15 columns
IndianFoodDF.columns

Index(['Srno', 'RecipeName', 'TranslatedRecipeName', 'Ingredients',
       'TranslatedIngredients', 'PrepTimeInMins', 'CookTimeInMins',
       'TotalTimeInMins', 'Servings', 'Cuisine', 'Course', 'Diet',
       'Instructions', 'TranslatedInstructions', 'URL'],
      dtype='object')

In [246]:
#Now, let's drop any columns that we don't want.
IndianFoodDF = IndianFoodDF.drop(['Srno','RecipeName','Ingredients', 'Instructions'], axis=1)

In [247]:
#And now rename the columns that are left:
IndianFoodDF.rename(columns={'TranslatedRecipeName':'RecipeName', 'TranslatedIngredients':'Ingredients', 'TranslatedInstructions':'Instructions'}, inplace=True)

In [248]:
#confirm the change
IndianFoodDF.columns

Index(['RecipeName', 'Ingredients', 'PrepTimeInMins', 'CookTimeInMins',
       'TotalTimeInMins', 'Servings', 'Cuisine', 'Course', 'Diet',
       'Instructions', 'URL'],
      dtype='object')

In [249]:
#check to see the datatypes and null counts:
IndianFoodDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6871 entries, 0 to 6870
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   RecipeName       6871 non-null   object
 1   Ingredients      6865 non-null   object
 2   PrepTimeInMins   6871 non-null   int64 
 3   CookTimeInMins   6871 non-null   int64 
 4   TotalTimeInMins  6871 non-null   int64 
 5   Servings         6871 non-null   int64 
 6   Cuisine          6871 non-null   object
 7   Course           6871 non-null   object
 8   Diet             6871 non-null   object
 9   Instructions     6871 non-null   object
 10  URL              6871 non-null   object
dtypes: int64(4), object(7)
memory usage: 590.6+ KB


In [250]:
#There are only 6 recipes that do not have ingredients. Although there is other info for these recipes, I feel like these aren't really usable for us... I'm going to decide to drop them. 
IndianFoodDF = IndianFoodDF.dropna()

In [251]:
#confirm the change
IndianFoodDF.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6865 entries, 0 to 6870
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   RecipeName       6865 non-null   object
 1   Ingredients      6865 non-null   object
 2   PrepTimeInMins   6865 non-null   int64 
 3   CookTimeInMins   6865 non-null   int64 
 4   TotalTimeInMins  6865 non-null   int64 
 5   Servings         6865 non-null   int64 
 6   Cuisine          6865 non-null   object
 7   Course           6865 non-null   object
 8   Diet             6865 non-null   object
 9   Instructions     6865 non-null   object
 10  URL              6865 non-null   object
dtypes: int64(4), object(7)
memory usage: 643.6+ KB


The next few cells we're just going to look around the data to see what we find :) 

In [252]:
#What is represented by Cuisine
print(f'There are {IndianFoodDF.Cuisine.nunique()} different types of Cuisine')
IndianFoodDF.Cuisine.value_counts()

There are 82 different types of Cuisine


Cuisine
Indian                  1157
Continental             1020
North Indian Recipes     936
South Indian Recipes     681
Italian Recipes          235
                        ... 
Jewish                     1
Dessert                    1
Side Dish                  1
Shandong                   1
Lunch                      1
Name: count, Length: 82, dtype: int64

In [253]:
print(f'There are {IndianFoodDF.Course.nunique()} unique values for Course')
IndianFoodDF.Course.value_counts()

There are 20 unique values for Course


Course
Lunch                           1763
Side Dish                        992
Snack                            876
Dinner                           781
Dessert                          659
Appetizer                        637
Main Course                      315
South Indian Breakfast           260
World Breakfast                  260
North Indian Breakfast           122
Indian Breakfast                 101
Vegetarian                        47
One Pot Dish                      33
High Protein Vegetarian            7
Brunch                             4
Vegan                              3
Non Vegeterian                     2
Eggetarian                         1
No Onion No Garlic (Sattvic)       1
Sugar Free Diet                    1
Name: count, dtype: int64

In [254]:
#Let's group our info by country and cuisine, looking just at average cook time and number of recipes:
IFdfByCountry = IndianFoodDF.groupby(['Cuisine', 'Course']).agg({'TotalTimeInMins':'mean', 'RecipeName':'count'}).round(0).reset_index().rename(columns={'RecipeName':'Count', 'TotalTimeInMins':'AverageTotalTime'})

In [255]:
#Which types of food/cuisine are in the top 10 most common in this df?
IFdfByCountry.sort_values('Count', ascending=False)

Unnamed: 0,Cuisine,Course,AverageTotalTime,Count
364,North Indian Recipes,Lunch,55.0,346
191,Indian,Lunch,53.0,255
196,Indian,Snack,50.0,234
437,South Indian Recipes,Lunch,45.0,229
99,Continental,Dessert,127.0,226
...,...,...,...,...
96,Coastal Karnataka,Snack,60.0,1
92,Chinese,World Breakfast,45.0,1
91,Chinese,Vegetarian,15.0,1
90,Chinese,Snack,40.0,1


In [256]:
#What can we see about the places and course type pairings that have the fastest average total time?
IFdfByCountry.sort_values('AverageTotalTime')

Unnamed: 0,Cuisine,Course,AverageTotalTime,Count
31,Asian,Vegetarian,7.0,2
101,Continental,Eggetarian,10.0,1
398,Parsi Recipes,Vegetarian,10.0,1
391,Parsi Recipes,High Protein Vegetarian,10.0,1
199,Indian,Vegetarian,10.0,9
...,...,...,...,...
444,South Karnataka,Main Course,530.0,1
308,Mangalorean,Indian Breakfast,570.0,1
348,Nepalese,Dinner,762.0,2
189,Indian,High Protein Vegetarian,850.0,1


In [257]:
#What if we want to dive more deeply into fast and slow recipes, using all the data?
QuickIFdf = IndianFoodDF[IndianFoodDF['TotalTimeInMins'] <=30]
SlowIFdf = IndianFoodDF[IndianFoodDF['TotalTimeInMins'] > 30]

In [258]:
QuickIFshape = QuickIFdf.shape
SlowIFdfshape = SlowIFdf.shape
print(f'There are {QuickIFshape[0]} recipes that take 30 mins and under, and {SlowIFdfshape[0]} recipes that take more than 30 mins total time.')

There are 2065 recipes that take 30 mins and under, and 4800 recipes that take more than 30 mins total time.


Wow, more than twice as many long recipes! Let's see if there are any similarities or differences between the diet and course info for slow vs quick:

In [259]:
QuickIFdf.Diet.value_counts()

Diet
Vegetarian                      1548
High Protein Vegetarian          171
Eggetarian                        89
Diabetic Friendly                 84
High Protein Non Vegetarian       55
Non Vegeterian                    55
Vegan                             27
No Onion No Garlic (Sattvic)      17
Gluten Free                       13
Sugar Free Diet                    6
Name: count, dtype: int64

In [260]:
SlowIFdf.Diet.value_counts()

Diet
Vegetarian                      3158
High Protein Vegetarian          534
Non Vegeterian                   372
Eggetarian                       255
Diabetic Friendly                176
High Protein Non Vegetarian      170
No Onion No Garlic (Sattvic)      56
Gluten Free                       37
Vegan                             34
Sugar Free Diet                    8
Name: count, dtype: int64

In [261]:
QuickIFdf.Course.value_counts()

Course
Side Dish                       507
Lunch                           395
Snack                           268
Appetizer                       224
Dinner                          174
World Breakfast                 134
Dessert                          97
South Indian Breakfast           76
Main Course                      70
North Indian Breakfast           39
Indian Breakfast                 34
Vegetarian                       34
One Pot Dish                      4
High Protein Vegetarian           3
Vegan                             2
Eggetarian                        1
No Onion No Garlic (Sattvic)      1
Brunch                            1
Sugar Free Diet                   1
Name: count, dtype: int64

In [262]:
SlowIFdf.Course.value_counts()

Course
Lunch                      1368
Snack                       608
Dinner                      607
Dessert                     562
Side Dish                   485
Appetizer                   413
Main Course                 245
South Indian Breakfast      184
World Breakfast             126
North Indian Breakfast       83
Indian Breakfast             67
One Pot Dish                 29
Vegetarian                   13
High Protein Vegetarian       4
Brunch                        3
Non Vegeterian                2
Vegan                         1
Name: count, dtype: int64

Not surprisingly, the top quick dishes are smaller/side dishes and not main courses. 

In [263]:
#What can we see about the fastest foods?
QuickIFdf.groupby(['Cuisine','Course']).agg({'TotalTimeInMins':'mean'}).rename(columns={'TotalTimeInMins':'AverageTotalTime'}).round().sort_values('AverageTotalTime')

Unnamed: 0_level_0,Unnamed: 1_level_0,AverageTotalTime
Cuisine,Course,Unnamed: 2_level_1
North Indian Recipes,Indian Breakfast,5.0
Asian,Vegetarian,7.0
Afghan,Snack,10.0
Chinese,Dessert,10.0
Indian,Vegetarian,10.0
...,...,...
Pakistani,Side Dish,30.0
Kerala Recipes,One Pot Dish,30.0
North Karnataka,Main Course,30.0
Fusion,Indian Breakfast,30.0


In [264]:
#What about the slowest foods?
SlowIFdf.groupby(['Cuisine','Course']).agg({'TotalTimeInMins':'mean'}).rename(columns={'TotalTimeInMins':'AverageTotalTime'}).round().sort_values('AverageTotalTime', ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,AverageTotalTime
Cuisine,Course,Unnamed: 2_level_1
Oriya Recipes,Indian Breakfast,1470.0
Continental,Vegetarian,1440.0
Indian,High Protein Vegetarian,850.0
Nepalese,Dinner,762.0
Bihari,Dinner,580.0
...,...,...
Kerala Recipes,Indian Breakfast,35.0
Konkan,Appetizer,35.0
Maharashtrian Recipes,Appetizer,35.0
Malaysian,Snack,35.0


Wow! Some of those times are crazy long. Is that an error? Let's see what the directions and ingredients are for those

In [265]:
ExtralongIFdf = IndianFoodDF[IndianFoodDF.TotalTimeInMins >= 500]

Let's take a look at what we can do with the DataFrame to make it searchable by ingredient:

In [266]:
#First, build a second dataframe with just the recipe name and the ingredients list.
IngredientsDF = IndianFoodDF[['RecipeName','Ingredients']].copy()

In [267]:
IngredientsDF.head()

Unnamed: 0,RecipeName,Ingredients
0,Masala Karela Recipe,"6 Karela (Bitter Gourd/ Pavakkai) - deseeded,S..."
1,Spicy Tomato Rice (Recipe),"2-1 / 2 cups rice - cooked, 3 tomatoes, 3 teas..."
2,Ragi Semiya Upma Recipe - Ragi Millet Vermicel...,"1-1/2 cups Rice Vermicelli Noodles (Thin),1 On..."
3,Gongura Chicken Curry Recipe - Andhra Style Go...,"500 grams Chicken,2 Onion - chopped,1 Tomato -..."
4,Andhra Style Alam Pachadi Recipe - Adrak Chutn...,"1 tablespoon chana dal, 1 tablespoon white ura..."


In [270]:
def split(string):
    return string.lower().split(', ')

In [271]:
IngredientsDF['Ingredients'] = IngredientsDF['Ingredients'].apply(split)

In [None]:
#define a search function
def search_string(s, search):
    return search in str(s)

In [275]:
#search for any item in the ingredients column
search_item = input(f'Find Recipes with a chosen ingredient! \n What ingredient would you like to search for?')
mask = IngredientsDF.apply(lambda x: x.map(lambda s: search_string(s, search_item)))

# Filter the DataFrame based on the mask
filtered_df = IngredientsDF.loc[mask.any(axis=1)]
filtered_df.head(25)

Unnamed: 0,RecipeName,Ingredients
13,And fish soup recipe - Bengali style fish in t...,[600 grams aar maach (fish) - rohu/ katla fish...
55,Spicy Seafood Stew Casserole With Tomatoes And...,"[250 grams fish fillet - basa,250 grams prawns..."
85,Drunken Noodles Recipe - Drunken Noodles Recipe,"[200 grams rice noodles, 1/2 cup baby corn - c..."
134,And fish soup recipe - Bengali fish curry,"[1 aar maach (fish) - (rohu or katla),1 tomato..."
333,Meen Vevichathu Recipe - Kottayam Style Fish C...,"[2 seer fish - slices,1/2 teaspoon turmeric po..."
340,Thai Som Tum Recipe (Thai Green Papaya Salad),"[5 cherry tomatoes,1 tablespoon dried shrimps ..."
355,Macher Chop Recipe (Bengali Style Fish Croquet...,"[500 grams fish fillet,1 cup onion - chopped,1..."
410,Baked Fish Crisps Recipe (Fish Fry In Oven),"[10 fish - pieces,1 teaspoon sunflower oil - o..."
493,Cantonese Chicken With Mushrooms Recipe,"[250 grams boneless chicken,1 cup shiitake mus..."
516,Broccoli Chilli Noodles With Grilled Salmon Re...,[400 grams salmon fillet - (indian salmon work...
