In [67]:
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

---
# Read in the data
---

In [119]:
# Columns kept from the raw CSV, mapped to the shorter names used in the rest of the notebook.
# The mapping form ties each new name to its source column instead of relying on list position.
COLUMN_RENAMES = {
    'TranslatedRecipeName': 'Recipe',
    'TranslatedIngredients': 'Ingredients',
    'PrepTimeInMins': 'PrepTime',
    'CookTimeInMins': 'CookTime',
    'TotalTimeInMins': 'TotalTime',
    'Servings': 'Servings',
    'Cuisine': 'Cuisine',
    'Course': 'Course',
    'Diet': 'Diet',
    'TranslatedInstructions': 'Instructions',
    'URL': 'URL',
}

# Number of leading rows (after de-duplication and re-indexing) kept for the analysis.
# NOTE(review): the reason for the 6819 cut-off is not recorded in the notebook - confirm.
MAX_RECIPES = 6819

# Keep the untouched file contents under their own name so the cleaning below can be re-run
# without re-reading the CSV.
food_raw = pd.read_csv('IndianFoodDatasetCSV.csv')

# NOTE(review): dropna()/drop_duplicates() run on ALL raw columns, i.e. before the column
# selection, so rows can be dropped (or kept as "distinct") because of columns that are
# discarded afterwards. Order kept as-is to preserve the existing row set - confirm intent.
food = (
    food_raw
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
    .loc[:, list(COLUMN_RENAMES)]
    .rename(columns=COLUMN_RENAMES)
    .iloc[:MAX_RECIPES]
    # Explicit copy: later cells assign into `food`, which must not be a slice of another frame
    # (avoids SettingWithCopyWarning / ambiguous view-vs-copy writes).
    .copy()
)
food.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6819 entries, 0 to 6818
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Recipe        6819 non-null   object
 1   Ingredients   6819 non-null   object
 2   PrepTime      6819 non-null   int64 
 3   CookTime      6819 non-null   int64 
 4   TotalTime     6819 non-null   int64 
 5   Servings      6819 non-null   int64 
 6   Cuisine       6819 non-null   object
 7   Course        6819 non-null   object
 8   Diet          6819 non-null   object
 9   Instructions  6819 non-null   object
 10  URL           6819 non-null   object
dtypes: int64(4), object(7)
memory usage: 586.1+ KB


In [120]:
# Preview the first rows to confirm the column selection and renaming took effect
food.head()

Unnamed: 0,Recipe,Ingredients,PrepTime,CookTime,TotalTime,Servings,Cuisine,Course,Diet,Instructions,URL
0,Masala Karela Recipe,"6 Karela (Bitter Gourd/ Pavakkai) - deseeded,S...",15,30,45,6,Indian,Side Dish,Diabetic Friendly,"To begin making the Masala Karela Recipe,de-se...",https://www.archanaskitchen.com/masala-karela-...
1,Spicy Tomato Rice (Recipe),"2-1 / 2 cups rice - cooked, 3 tomatoes, 3 teas...",5,10,15,3,South Indian Recipes,Main Course,Vegetarian,"To make tomato puliogere, first cut the tomato...",http://www.archanaskitchen.com/spicy-tomato-ri...
2,Ragi Semiya Upma Recipe - Ragi Millet Vermicel...,"1-1/2 cups Rice Vermicelli Noodles (Thin),1 On...",20,30,50,4,South Indian Recipes,South Indian Breakfast,High Protein Vegetarian,"To begin making the Ragi Vermicelli Recipe, fi...",http://www.archanaskitchen.com/ragi-vermicelli...
3,Gongura Chicken Curry Recipe - Andhra Style Go...,"500 grams Chicken,2 Onion - chopped,1 Tomato -...",15,30,45,4,Andhra,Lunch,Non Vegeterian,To begin making Gongura Chicken Curry Recipe f...,http://www.archanaskitchen.com/gongura-chicken...
4,Andhra Style Alam Pachadi Recipe - Adrak Chutn...,"1 tablespoon chana dal, 1 tablespoon white ura...",10,20,30,4,Andhra,South Indian Breakfast,Vegetarian,"To make Andhra Style Alam Pachadi, first heat ...",https://www.archanaskitchen.com/andhra-style-a...


---
# Ingredients Cleaning
---

In [121]:
# Show two raw ingredient strings before cleaning: quantities, units, parenthesised aliases
# and preparation notes are all mixed into a single comma-separated string per recipe
food['Ingredients'].iloc[0], food['Ingredients'].iloc[1]

('6 Karela (Bitter Gourd/ Pavakkai) - deseeded,Salt - to taste,1 Onion - thinly sliced,3 tablespoon Gram flour (besan),2 teaspoons Turmeric powder (Haldi),1 tablespoon Red Chilli powder,2 teaspoons Cumin seeds (Jeera),1 tablespoon Coriander Powder (Dhania),1 tablespoon Amchur (Dry Mango Powder),Sunflower Oil - as required',
 '2-1 / 2 cups rice - cooked, 3 tomatoes, 3 teaspoons BC Belle Bhat powder, salt - as per taste, 1 teaspoon chickpea lentils, 1/2 teaspoon cumin seeds, 1 teaspoon white urad dal, 1/2 Teaspoon mustard, 1 green chilli, 1 dry red chilli, 2 teaspoon cashew - or peanuts, 1-1 / 2 tablespoon oil - 1/2 teaspoon asafoetida')

In [122]:
import re
import spacy

# Load the small English pipeline once; clean_ingredients_combined reuses it for every recipe.
# Only part-of-speech tags and stop-word flags are read from the parsed text, so the
# named-entity recognizer is disabled to skip work whose results are never used.
# NOTE(review): the parser is intentionally left enabled - spaCy's English tag-to-POS mapping
# may draw on the dependency parse, so disabling it could change `token.pos_`.
nlp = spacy.load("en_core_web_sm", disable=["ner"])

# Measurement units removed together with the quantity in front of them. The first seven are
# the original list; the remainder cover spellings that also appear in recipe text
# (e.g. "500 grams Chicken" previously came out as "grams chicken").
_MEASUREMENT_UNITS = (
    'teaspoon', 'tablespoon', 'cup', 'ounce', 'pound', 'g', 'kg',
    'gram', 'tsp', 'tbsp', 'ml', 'litre', 'liter', 'inch',
)

# <number>[.<decimals>] [/ <number>] [<unit>][s], e.g. "2", "1/2 teaspoon", "1 / 2 cups".
# Compiled once at definition time instead of once per ingredient.
_QUANTITY_PATTERN = re.compile(
    r'\b\d+(?:\.\d+)?\s*(?:/\s*\d+(?:\.\d+)?)?\s*(?:'
    + '|'.join(_MEASUREMENT_UNITS)
    + r')?s?\b'
)
_NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z\s]')
_WHITESPACE_PATTERN = re.compile(r'\s+')

# Parts of speech treated as preparation notes rather than ingredient names.
_EXCLUDED_POS = frozenset({'VERB', 'ADJ', 'ADP'})


def _normalize_ingredient(raw_item):
    """Lower-case one comma-separated entry and strip quantities, units and punctuation."""
    text = raw_item.strip().lower()
    text = _QUANTITY_PATTERN.sub('', text)
    text = _NON_LETTER_PATTERN.sub('', text)
    text = _WHITESPACE_PATTERN.sub(' ', text)
    return text.strip()


def clean_ingredients_combined(ingredients, nlp_model=None):
    """Turn a raw comma-separated ingredient string into a list of ingredient names.

    Each comma-separated entry is lower-cased and stripped of quantities, measurement
    units and non-letter characters. The entries are then re-joined and run through a
    spaCy pipeline; verbs, adjectives, adpositions and stop words are dropped, and the
    remaining words are regrouped per original entry.

    Parameters
    ----------
    ingredients : str
        Raw ingredient text, entries separated by commas. ``None`` and float NaN are
        treated as "no ingredients". Any other non-string value raises, as before.
    nlp_model : callable, optional
        Callable that takes a string and returns an iterable of tokens exposing
        ``text``, ``pos_`` and ``is_stop``. Defaults to the notebook-level ``nlp``
        pipeline; pass a different pipeline (or a stub in tests) to override it.

    Returns
    -------
    list of str
        One cleaned name per entry, in input order. Entries that end up empty are omitted.

    Notes
    -----
    The part-of-speech filter depends on the tagger's judgement, so genuine ingredient
    words can be dropped: in this notebook's run "Turmeric powder (Haldi)" becomes
    "powder haldi" and "Red Chilli powder" becomes "chilli powder".
    """
    # Missing values: None, or float NaN (the only float that is not equal to itself).
    if ingredients is None or (isinstance(ingredients, float) and ingredients != ingredients):
        return []

    # Look up the global pipeline only when no override was supplied.
    model = nlp if nlp_model is None else nlp_model

    # Re-join with ", " so each comma survives tokenization as its own token and can be used
    # below to recover the entry boundaries.
    cleaned_text = ', '.join(_normalize_ingredient(part) for part in ingredients.split(','))

    kept_words = [
        token.text.lower()
        for token in model(cleaned_text)
        if token.pos_ not in _EXCLUDED_POS and not token.is_stop
    ]
    kept_words = [word for word in kept_words if word.strip() != '']

    # Regroup the surviving words into one string per original entry.
    grouped = []
    current_words = []
    for word in kept_words:
        if word == ',':
            grouped.append(' '.join(current_words))
            current_words = []
        else:
            current_words.append(word)
    grouped.append(' '.join(current_words))  # words after the last comma

    # Entries whose words were all filtered out are empty here; drop them.
    return [entry.strip() for entry in grouped if entry.strip()]

In [123]:
# Clean each recipe's ingredient string into a list of ingredient names.
# Values that are no longer strings are passed through unchanged, so re-running this cell
# after the column has already been converted to lists is a no-op instead of an AttributeError.
food['Ingredients'] = food['Ingredients'].apply(
    lambda value: clean_ingredients_combined(value) if isinstance(value, str) else value
)
# Same two recipes as previewed before cleaning, for a before/after comparison
food['Ingredients'].iloc[0], food['Ingredients'].iloc[1]

(['karela gourd pavakkai',
  'salt taste',
  'onion thinly',
  'gram flour besan',
  'powder haldi',
  'chilli powder',
  'cumin jeera',
  'coriander powder dhania',
  'amchur mango powder',
  'sunflower oil'],
 ['rice',
  'tomatoes',
  'bc belle bhat powder',
  'salt taste',
  'chickpea lentils',
  'cumin seeds',
  'urad dal',
  'mustard',
  'green chilli',
  'red chilli',
  'cashew peanuts',
  'oil asafoetida'])