In [1]:
import numpy as np
import pandas as pd
import ast
# Import linear_kernel
import nltk
import string
import re
import unidecode
#nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from collections import Counter
import config 
from sklearn.metrics.pairwise import linear_kernel
#Import TfIdfVectorizer (scikit-learn)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


## Loading data and looking at a  sample

In [2]:
df_82k = pd.read_csv ("C:/Users/Atif-Foxfire/Downloads/RAW_recipes.csv/recipes_82k.csv", sep="," )
#df_82k = pd.read_csv ("../data/salad-recipe-data/recipes_82k.csv", sep="," )

In [3]:
df_82k.head(10)

Unnamed: 0,category,cooking_method,cuisine,image,ingredients,prep_time,recipe_name,serves,tags
0,,['Set the racks in the middle and upper thirds...,['American'],https://www.skinnytaste.com/wp-content/uploads...,"['1 tablespoons extra virgin olive oil', '1 cu...",20 minutes,Mediterranean Sea Bass,4 servings,"Dairy Free, Gluten Free, Keto Recipes, Kid Fri..."
1,,['Place the eggs in the air fryer basket and c...,['American'],https://www.skinnytaste.com/wp-content/uploads...,"['4 large eggs', 'Salt (black pepper, everythi...",15 minutes,Air Fryer Hard Boiled Eggs,4 eggs,"Air Fryer Recipes, Dairy Free, Gluten Free, Ke..."
2,,"['Air Fryer directions:', 'Preheat air fryer t...",['American'],https://www.skinnytaste.com/wp-content/uploads...,"['olive oil spray', '4 about 5 ounce each salm...",5 minutes,Air Fryer Basil-Parmesan Salmon,4 servings,"Air Fryer Recipes, Gluten Free, Keto Recipes, ..."
3,,"['Preheat the oven to 400F.', 'Pour 2 tablespo...",['American'],https://www.skinnytaste.com/wp-content/uploads...,['1/2 cup freshly grated Parmesan (not pre-gra...,15 minutes,Everything Parmesan Crisps,4 servings,"Gluten Free, Keto Recipes, Kid Friendly, Low C..."
4,,['Cook potatoes in a large pot of salted water...,['American'],https://www.skinnytaste.com/wp-content/uploads...,['3 1/2 pounds new potatoes (about 10 peeled a...,10 minutes,Potato and Green Bean Salad,12 servings,"Dairy Free, Gluten Free, Kid Friendly, Vegetar..."
5,,"['To Make the soup:', 'Heat 1 tablespoon of oi...",['American'],https://www.skinnytaste.com/wp-content/uploads...,"['2 tablespoons high-heat oil', '1 medium yell...",5 minutes,Green Detox Soup with Toasted Hemp Gremolata,2 servings,"Dairy Free, Gluten Free, Under 30 Minutes, Veg..."
6,,['Preheat the to 350F degrees. Spray a pie dis...,['American'],https://www.skinnytaste.com/wp-content/uploads...,"['cooking spray', '1 3/4 cups diced ham steak ...",15 minutes,Ham and Swiss Crustless Quiche,6 servings,"Gluten Free, Keto Recipes, Kid Friendly, Low C..."
7,,['Combine sour cream and brown sugar and mix w...,['American'],https://www.skinnytaste.com/wp-content/uploads...,"['16 oz strawberries (washed and cut)', '4 oz ...",5 minutes,Strawberries Romanoff,5 Servings,"Gluten Free, Kid Friendly, Low Carb, Under 30 ..."
8,,['Preheat the oven to 450F. Place a silicone l...,['American'],https://www.skinnytaste.com/wp-content/uploads...,['1 cup 5 oz all purpose or white whole wheat ...,15 minutes,Smoked Salmon Breakfast Flatbread,4 servings,"Air Fryer Recipes, Gluten Free, Under 30 Minutes"
9,,[],,https://www.skinnytaste.com/wp-content/uploads...,[],0 minutes,Skinnytaste Air Fryer Cookbook: Get a Free 39,0,Air Fryer Recipes


In [4]:
df_82k.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82245 entries, 0 to 82244
Data columns (total 9 columns):
category          18107 non-null object
cooking_method    82245 non-null object
cuisine           82230 non-null object
image             82245 non-null object
ingredients       82245 non-null object
prep_time         58630 non-null object
recipe_name       82245 non-null object
serves            82244 non-null object
tags              82011 non-null object
dtypes: object(9)
memory usage: 5.6+ MB


### Checking for duplicate recipe and ingrdients

In [5]:
print('No of Unique recipes: ',len(df_82k['recipe_name'].unique().tolist()))
print('No of Unique Ingredients: ', len(df_82k['ingredients'].unique().tolist()))

No of Unique recipes:  55838
No of Unique Ingredients:  61198



### Observations
- Dataset consist of 82245 rows and 9 Columns
- Text data no numerical data
- preptime and serves can be converted into numerical data
- a lot of duplicate recipes which can be removed
- some or the rows have 'ingreidentS' value '[]' whch is null but can't be de

In [6]:
# Removing duplicate recipes
df_82k = df_82k.drop_duplicates(subset=['recipe_name'], keep='first')
df_82k = df_82k[df_82k.ingredients != "[]"]
df_82k.reset_index(drop=True, inplace=True)

### Checking for Null / Missing  Values

In [7]:

type(df_82k['ingredients'])
total = df_82k.isnull().sum().sort_values(ascending=False)
percent = (df_82k.isnull().sum()/df_82k.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head()

Unnamed: 0,Total,Percent
category,47498,0.863239
prep_time,16466,0.299257
tags,178,0.003235
cuisine,8,0.000145
serves,1,1.8e-05


### Observations
- A lot of values are missing from 'category' it can be removed

In [8]:
 # dropping category  column
df_82k.drop('category',1, inplace =True)

## Data Cleaning
### Checking most frequently occuring words in 'ingredients'

In [9]:
vocabulary = nltk.FreqDist()
# This was done once I had already preprocessed the ingredients
for ingredients in df_82k['ingredients']:
    ingredients = ingredients.split()
    vocabulary.update(ingredients)
for word, frequency in vocabulary.most_common(200):
    print(f'{word};{frequency}')

'1;171451
cup;118273
'2;96269
tablespoons;63076
'1/2;60583
teaspoon;59321
and;59274
cups;41954
'1/4;40415
ground;36763
tablespoon;35952
fresh;33730
'3;29706
for;29247
pepper',;28915
or;27776
1/2;27091
'4;26186
oil',;25672
chopped;25267
black;25021
salt',;23817
freshly;22855
chopped',;21589
ounces;20985
olive;20020
sugar',;19997
large;19694
teaspoons;19051
to;18643
into;17573
red;17153
cut;16481
['1;16029
finely;15663
plus;14806
pound;13599
'Kosher;12852
white;12147
of;12062
unsalted;11686
1;11534
butter',;11003
salt;10885
small;10545
'Salt;10476
sliced;10409
garlic,;10405
powder',;10372
['2;10343
flour',;10293
'6;10219
'3/4;9917
grated;9911
sliced',;9788
chicken;9502
onion,;9387
cloves;9308
butter,;9226
'8;9144
juice',;8817
pounds;8699
(about;8693
all-purpose;8651
2;8524
vinegar',;8413
thinly;8233
cream',;8058
medium;8034
peeled;8014
leaves',;7979
vanilla;7962
sauce',;7847
recipe;7743
dried;7330
extra-virgin;7319
'1/3;7224
green;7203
lemon;7079
whole;7038
kosher;7013
water',;6979
diced

### Parsing 'ingredients' column and extractiting Vegetarian recipie and no of servings
the ingredient need to pe parsed as it conatins a lot of commonly occurying words or ingredients such as quantities, measuring units like tabelspoon, cup , gram etc. or common form or state of ingredients used like, chopped, diced, cube, fresh etc. These words can induce a bias in predicting the similarilties between the recipe's ingredients and the input ingredients so parsing ingredients is the most crucial step in building a recipe reccomendation engine.

In [10]:
def ingredient_parser(ingreds):
    '''
    
    '''
    ingred_list = []
    measure_units = ['teaspoon', 't', 'tsp.', 'tsp','tablespoon', 'T', 'tbl.', 'tb', 'tbsp.', 'fluid ounce', 'fl oz', 'gill', 'cup', 'c', 'pint', 'p', 'pt', 'fl pt', 'quart', 'q', 'qt', 'fl qt', 'gallon', 'g', 'gal', 'ml', 'milliliter', 'millilitre', 'cc', 'mL', 'l', 'liter', 'litre', 'L', 'dl', 'deciliter', 'decilitre', 'dL', 'bulb', 'level', 'heaped', 'rounded', 'whole', 'pinch', 'medium', 'slice', 'pound', 'lb', '#', 'ounce', 'oz', 'mg', 'milligram', 'milligramme', 'g', 'gram', 'gramme', 'kg', 'kilogram', 'kilogramme', 'x', 'of', 'mm', 'millimetre', 'millimeter', 'cm', 'centimeter', 'centimetre', 'm', 'meter', 'metre', 'inch', 'in', 'milli', 'centi', 'deci', 'hecto', 'kilo']
    words_to_remove = ['spicy','allspice','sweetened','bittersweet','golden','flavored','use','coloring','intant','sifted','ice','pure','at','follows','like','sir','granulated','powdered','topping','squeezed','pan','dice','cracked','nonstick','bag','semisweet','plum','whipped','smashed','very','zested','drizzling','well','sharp','reserved','bought','box','cored','salad','reduced','mini','by','julienned','everything','about','each','chopped','diced','together','fresh', 'oil', 'a', 'red', 'bunch', 'and', 'clove', 'or', 'leaf', 'chilli', 'large', 'extra', 'sprig',
    'ground', 'handful', 'free', 'small','brushing','de','slated','recipe','serving','minced', 'pepper', 'virgin', 'range', 'from', 'dried', 'sustainable', 'black', 'peeled', 'higher',
                       'welfare', 'seed', 'for', 'finely', 'freshly', 'sea', 'quality', 'white', 'ripe', 'few', 'piece', 'source', 'to',
                       'organic', 'flat', 'smoked', 'ginger', 'sliced', 'green', 'picked', 'the', 'stick', 'plain', 'plus', 'mixed', 'mint',
                       'bay', 'basil', 'your', 'cumin', 'optional', 'fennel', 'serve', 'mustard', 'unsalted', 'baby', 'paprika', 'fat', 'ask',
                       'natural', 'skin', 'roughly', 'into', 'such', 'cut', 'good', 'brown', 'grated', 'trimmed', 'oregano', 'powder', 'yellow',
                       'dusting', 'knob', 'frozen', 'on', 'deseeded', 'low', 'runny', 'balsamic', 'cooked', 'streaky', 'nutmeg', 'sage', 'rasher',
                       'zest', 'pin', 'groundnut', 'breadcrumb','long-grain', 'turmeric', 'halved', 'grating', 'stalk', 'light', 'tinned', 'dry', 'soft', 'rocket',
                       'bone', 'colour', 'washed', 'skinless', 'leftover', 'splash', 'removed', 'dijon', 'thick', 'big', 'hot', 'drained', 'sized',
                       'chestnut', 'watercress', 'fishmonger', 'english', 'dill', 'caper', 'raw', 'worcestershire', 'flake', 'cider', 'cayenne',
                       'tbsp', 'leg', 'pine', 'wild', 'if', 'fine', 'herb', 'almond', 'shoulder', 'cube', 'dressing', 'with', 'chunk', 'spice', 'thumb',
                       'garam', 'new', 'little', 'punnet', 'peppercorn', 'shelled', 'saffron', 'other','chopped', 'salt', 'olive', 'taste', 'can', 'sauce',
                       'water', 'diced', 'package', 'italian', 'shredded', 'divided', 'parsley', 'vinegar', 'all', 'purpose', 'crushed', 'juice', 'more',
                       'coriander', 'bell', 'needed', 'thinly', 'boneless', 'half', 'thyme', 'cubed', 'cinnamon', 'cilantro', 'jar', 'seasoning', 'rosemary',
                       'extract', 'sweet', 'baking', 'beaten', 'heavy', 'seeded', 'tin', 'vanilla', 'uncooked', 'crumb', 'style', 'thin', 'nut', 'coarsely',
                       'spring', 'chili', 'cornstarch', 'strip', 'cardamom', 'rinsed', 'honey', 'cherry', 'root', 'quartered', 'head', 'softened', 'container',
                       'crumbled', 'frying', 'lean', 'cooking', 'roasted', 'warm', 'whipping', 'thawed', 'corn', 'pitted', 'sun', 'kosher', 'bite', 'toasted', 'lasagna',
                       'split', 'melted', 'degree', 'lengthwise', 'romano', 'packed', 'pod', 'anchovy', 'rom', 'prepared', 'juiced', 'fluid', 'floret', 'room', 'active',
                       'seasoned', 'mix', 'deveined', 'lightly', 'anise', 'thai', 'size', 'unsweetened', 'torn', 'wedge', 'sour', 'basmati', 'marinara', 'dark',
                       'temperature', 'garnish', 'bouillon', 'loaf', 'shell', 'reggiano', 'canola', 'parmigiano', 'round', 'canned', 'ghee', 'crust', 'long',
                       'broken', 'ketchup', 'bulk', 'cleaned', 'condensed', 'sherry', 'provolone', 'cold', 'soda', 'cottage', 'spray', 'tamarind', 'pecorino',
                       'shortening', 'part', 'bottle', 'sodium', 'cocoa', 'grain', 'french', 'roast', 'stem', 'link', 'firm', 'asafoetida', 'mild', 'dash', 'boiling','one','two','three','four']
    
    #Check input ingredient is a list or not if not then convert it into a list. We use ast.literal_eval
    if isinstance(ingreds, list):
        ingredients = ingreds
    else:
        try:
            ingredients = ast.literal_eval(ingreds)
        except:
            return  'none'

    # We first get rid of all the punctuation. We make use of str.maketrans. It takes three input 
    # arguments 'x', 'y', 'z'. 'x' and 'y' must be equal-length strings and characters in 'x'
    # are replaced by characters in 'y'. 'z' is a string (string.punctuation here) where each character
    #  in the string is mapped to None. 
    translator = str.maketrans('', '', string.punctuation)
    lemmatizer = WordNetLemmatizer()
    try:
        for i in ingredients:
            i.translate(translator)
            # We split up with hyphens as well as spaces
            items = re.split(' |-|,', i)
            # Get rid of words containing non alphabet letters
            items = [word for word in items if word.isalpha()]
            # Turn everything to lowercase
            items = [word.lower() for word in items]
            # remove accents
            items = [unidecode.unidecode(word) for word in items] 
            # Lemmatize words so we can compare words to measuring words
            items = [lemmatizer.lemmatize(word) for word in items]
            # Gets rid of measuring units/words
            items = [word for word in items if word not in measure_units]
            # Get rid of commonly occuring words
            items = [word for word in items if word not in words_to_remove]
            if items:
                ingred_list.append(' '.join(items)) 
        ingred_list = " ".join(ingred_list)
    except:
            return ingred_list
    return ingred_list

def vegeterian_parser(ingreds):
    '''
    
    '''
    ingred_list =[]
    non_veg = ['chicken','duck','turkey','buffalo','egg', 'eggs', 'beef','ham','bacon','salami','sausage','pepperoni','steak', 'pork','lamb', 'fish', 'shrimp', 'meat', 'flesh','sardine','tuna','salmon','tilapia','cod',
              'snapper','herring','anchovies','haddock','flounder','trout','catfish','pollock','bass','halibut','sowrdfish','pike',
              'mackerel']
    
    
    #Check input ingredient is a list or not if not then convert it into a list. We use ast.literal_eval
    if isinstance(ingreds, list):
        ingredients = ingreds
    else:
        try:
            ingredients = ast.literal_eval(ingreds)
        except:
            return 'none'

    
    translator = str.maketrans('', '', string.punctuation)
    veg = "yes"
    for i in ingredients:
        i.translate(translator)
        # We split up with hyphens as well as spaces
        items = re.split(' |-|,', i)
        # Get rid of words containing non alphabet letters
        items = [word for word in items if word.isalpha()]
        # Turn everything to lowercase
        items = [word.lower() for word in items]
        # remove accents
        items = [unidecode.unidecode(word) for word in items] 
                
        if items:
            ingred_list.append(' '.join(items)) 
    ingred_list = " ".join(ingred_list)
    ingred_list=  list(ingred_list.split(" "))
    if any(x in non_veg for x in ingred_list):
        veg = "no"
    ''''for word in ingred_list:
        print (word)
        if word  in non_veg:
            veg = "no"'''
    return veg

def serving_extractor(servings):
    translator = str.maketrans('', '', string.punctuation)
    try:
        servings = str(servings)
        servings = re.split(" |-", servings)
        no_of_servings = 0
        for i in range (len(servings)):
            servings[i] = servings[i].translate(translator)
            if servings[i].isdigit():
                no_of_servings = servings[i]
                break
    except:
        return 0
    return no_of_servings
    

In [11]:
#Creating a new column of parsed ingredients vegetarian and no_of_servings
df_82k['parsed_ingredients'] = df_82k['ingredients'].apply(lambda x: ingredient_parser(x))

df_82k['vegetarian'] = df_82k['ingredients'].apply(lambda x: vegeterian_parser(x))

df_82k['no_of_servings'] = df_82k['serves'].apply(lambda x: serving_extractor(x))

#df_82k['parsed_cusine'] = df_82k['cuisine'].apply(lambda x: ingredient_parser(x))


In [12]:
# Dropping Cuisine Preptime and serves columns
df_82k.drop(['cuisine','prep_time','serves'],1, inplace =True)

### Looking at new 'parsed_ingredients'

In [13]:
df_82k.sample(10)

Unnamed: 0,cooking_method,image,ingredients,recipe_name,tags,parsed_ingredients,vegetarian,no_of_servings
49144,['Heat extra-virgin olive oil in a soup pot ov...,https://food.fnr.sndimg.com/content/dam/images...,"['2 tablespoons extra-virgin olive oil', '1 ca...",Three Bean Pasta e Fagioli,"Easy Pasta Recipes,Easy,Pasta Recipes,Easy Mai...",carrot rib celery heart onion garlic cannellin...,no,4
41205,['Put flour and sesame seeds on separate sheet...,https://food.fnr.sndimg.com/content/dam/images...,"['1 1/2 tablespoons all purpose flour', '2 tab...",Sesame Chicken with Napa Cabbage and Spinach S...,"Skillet Recipes,American,Asian,Sandwich,Chicke...",flour sesame egg chicken breast flattened slig...,no,4
47017,['Sprinkle 1 tablespoon of the gelatin over th...,https://food.fnr.sndimg.com/content/dam/images...,"['3 tablespoons powdered gelatin', '1 1/4 cups...",Striped Jiggly Fruit Shooters,"Fruit Dessert Recipes,Dessert,Fruit,American,T...",gelatin strawberry banana another pink fruit c...,yes,8
24464,['Preheat oven to 375 degrees F. Grease a 9 by...,https://food.fnr.sndimg.com/content/dam/images...,"['4 tablespoons butter, plus 1 tablespoon for ...",Glazed Sweet Potatoes,"Easy Side Dish Recipes,Easy,Side Dish,Easy Din...",butter greasing yam potato sugar maple syrup,yes,4
32357,['Cook the pasta in boiling salted water until...,https://food.fnr.sndimg.com/content/dam/images...,"['1 (16-ounce) package whole-wheat linguine', ...",Pasta Stir-Fry,"Budget-Friendly,Wok Recipes,Asian,Pasta Recipe...",wheat linguine onion eggplant garlic broccoli ...,no,4
39293,"['Directions', 'Preheat oven to 220 degrees. W...",https://food.fnr.sndimg.com/content/dam/images...,"['12 to 15 pound ham, spiral cut preferably', ...",Rootbeer Ham with Kumquats Recipe,"Ham,Meat,Fruit,Grape Recipes,Main Dish,Gluten ...",ham spiral preferably rootbeer sugar kumquat s...,no,0
29077,"['Preheat the oven to 200 degrees F.', 'Spray ...",https://food.fnr.sndimg.com/content/dam/images...,"['Eight 6-inch whole wheat tortillas', 'Cookin...","Ham, Apple and Cheese Quesadilla","Healthy Dinner,Healthy,Quesadilla,Apple,Fruit,...",eight wheat tortilla oiling tortilla swiss for...,no,4
35234,['Preheat the oven to 350 degrees F. Grease a ...,https://food.fnr.sndimg.com/content/dam/images...,"['1 tablespoon unsalted butter, at room temper...",Prune Plum and Walnut Butter Cake,"Fruit Dessert Recipes,Dessert,Fruit,Baking,Ame...",butter unbleached flour prune sugar brandy lem...,no,8
20376,"['Preheat oven to 450 degrees F.', 'Slice port...",https://food.fnr.sndimg.com/content/dam/images...,"['3 portobello mushroom caps', '2 tablespoons ...",For the Love of Mushrooms Pizza,"Italian Pizza,Italian,Pizza Restaurants,Vegeta...",portobello mushroom cap butter pizza dough pre...,yes,2
35876,['Heat 1 tablespoon of olive oil in casserole ...,https://food.fnr.sndimg.com/content/dam/images...,"['4 tablespoons olive oil', '3 whole chickens ...",Potted Chicken,"Chicken Casserole,Casserole,Chicken,Poultry,Eu...",chicken down onion garlic tomato orange chicke...,no,6



### Checking most frequently occuring words in 'parsed_ingredients'
(These observation are iterated over many steps to further enhanced the list of words to remove as most commonly occuring words will induce a bias in model's calculatin )

In [14]:
vocabulary = nltk.FreqDist()
# This was done once I had already preprocessed the ingredients
for ingredients in df_82k['parsed_ingredients']:
    ingredients = ingredients.split()
    vocabulary.update(ingredients)
for word, frequency in vocabulary.most_common(200):
    print(f'{word};{frequency}')

sugar;26367
butter;23202
garlic;22097
onion;19182
egg;16605
cream;14510
flour;14407
lemon;13350
cheese;11887
tomato;11791
chicken;11281
milk;8749
wine;8407
vegetable;8027
lime;6705
chocolate;6643
orange;6262
potato;5591
bread;4936
rice;4832
stock;4641
chile;4618
bean;4564
carrot;4433
apple;4075
parmesan;4016
celery;4013
scallion;3838
shallot;3737
beef;3649
mushroom;3372
broth;3346
pork;2957
mayonnaise;2940
bacon;2928
soy;2921
syrup;2882
peanut;2834
jalapeno;2722
breast;2618
paste;2582
chive;2581
chip;2575
coconut;2564
sesame;2481
cheddar;2396
yolk;2309
shrimp;2250
fillet;2082
steak;2043
lettuce;1922
strawberry;1901
spinach;1889
tortilla;1786
buttermilk;1778
yogurt;1724
mozzarella;1716
rib;1715
turkey;1682
sausage;1639
cucumber;1618
pea;1599
cake;1571
avocado;1570
pineapple;1563
coarse;1560
raspberry;1526
pecan;1497
walnut;1470
wheat;1425
cranberry;1387
chipotle;1357
stemmed;1357
banana;1348
cabbage;1330
fish;1323
pasta;1314
grape;1310
dough;1288
food;1236
zucchini;1214
chilled;1198
map

### Add specific index column

In [15]:
df_82k['id'] = df_82k.index.values
df_82k.head(5)

Unnamed: 0,cooking_method,image,ingredients,recipe_name,tags,parsed_ingredients,vegetarian,no_of_servings,id
0,['Set the racks in the middle and upper thirds...,https://www.skinnytaste.com/wp-content/uploads...,"['1 tablespoons extra virgin olive oil', '1 cu...",Mediterranean Sea Bass,"Dairy Free, Gluten Free, Keto Recipes, Kid Fri...",onion garlic tomato their wine kalamata chilea...,no,4,0
1,['Place the eggs in the air fryer basket and c...,https://www.skinnytaste.com/wp-content/uploads...,"['4 large eggs', 'Salt (black pepper, everythi...",Air Fryer Hard Boiled Eggs,"Air Fryer Recipes, Dairy Free, Gluten Free, Ke...",egg bagel,no,4,1
2,"['Air Fryer directions:', 'Preheat air fryer t...",https://www.skinnytaste.com/wp-content/uploads...,"['olive oil spray', '4 about 5 ounce each salm...",Air Fryer Basil-Parmesan Salmon,"Air Fryer Recipes, Gluten Free, Keto Recipes, ...",salmon fillet lemon mayonnaise parmesan cheese,no,4,2
3,"['Preheat the oven to 400F.', 'Pour 2 tablespo...",https://www.skinnytaste.com/wp-content/uploads...,['1/2 cup freshly grated Parmesan (not pre-gra...,Everything Parmesan Crisps,"Gluten Free, Keto Recipes, Kid Friendly, Low C...",parmesan pre using hole sesame onion garlic poppy,yes,4,3
4,['Cook potatoes in a large pot of salted water...,https://www.skinnytaste.com/wp-content/uploads...,['3 1/2 pounds new potatoes (about 10 peeled a...,Potato and Green Bean Salad,"Dairy Free, Gluten Free, Kid Friendly, Vegetar...",potato bean bean scallion,yes,12,4


### Add number of ingredients as specific columns

In [16]:
def count_n_ingredients(ingreds):
    ingreds_list = ingreds.split()
    n_ingredients = len(ingreds_list)
    return n_ingredients

In [17]:
df_82k['n_ingredients'] = [count_n_ingredients(x) for x in df_82k['parsed_ingredients']]
df_82k.head(5)

Unnamed: 0,cooking_method,image,ingredients,recipe_name,tags,parsed_ingredients,vegetarian,no_of_servings,id,n_ingredients
0,['Set the racks in the middle and upper thirds...,https://www.skinnytaste.com/wp-content/uploads...,"['1 tablespoons extra virgin olive oil', '1 cu...",Mediterranean Sea Bass,"Dairy Free, Gluten Free, Keto Recipes, Kid Fri...",onion garlic tomato their wine kalamata chilea...,no,4,0,15
1,['Place the eggs in the air fryer basket and c...,https://www.skinnytaste.com/wp-content/uploads...,"['4 large eggs', 'Salt (black pepper, everythi...",Air Fryer Hard Boiled Eggs,"Air Fryer Recipes, Dairy Free, Gluten Free, Ke...",egg bagel,no,4,1,2
2,"['Air Fryer directions:', 'Preheat air fryer t...",https://www.skinnytaste.com/wp-content/uploads...,"['olive oil spray', '4 about 5 ounce each salm...",Air Fryer Basil-Parmesan Salmon,"Air Fryer Recipes, Gluten Free, Keto Recipes, ...",salmon fillet lemon mayonnaise parmesan cheese,no,4,2,6
3,"['Preheat the oven to 400F.', 'Pour 2 tablespo...",https://www.skinnytaste.com/wp-content/uploads...,['1/2 cup freshly grated Parmesan (not pre-gra...,Everything Parmesan Crisps,"Gluten Free, Keto Recipes, Kid Friendly, Low C...",parmesan pre using hole sesame onion garlic poppy,yes,4,3,8
4,['Cook potatoes in a large pot of salted water...,https://www.skinnytaste.com/wp-content/uploads...,['3 1/2 pounds new potatoes (about 10 peeled a...,Potato and Green Bean Salad,"Dairy Free, Gluten Free, Kid Friendly, Vegetar...",potato bean bean scallion,yes,12,4,4


### Saving the Dataset into CSV file

In [None]:
df_82k.to_csv("../data/parsed_recipes.csv", sep=",", index_label = id)

### Save to .pkl

In [None]:
df_82k.to_pickle("../data/parsed_recipes.pkl")

### Load dataframe

In [None]:
df = pd.read_pickle("../data/parsed_recipes.pkl")
df.head(3)

Unnamed: 0,cooking_method,image,ingredients,recipe_name,tags,parsed_ingredients,vegetarian,no_of_servings,id,n_ingredients
0,['Set the racks in the middle and upper thirds...,https://www.skinnytaste.com/wp-content/uploads...,"['1 tablespoons extra virgin olive oil', '1 cu...",Mediterranean Sea Bass,"Dairy Free, Gluten Free, Keto Recipes, Kid Fri...",onion garlic tomato their wine kalamata chilea...,no,4,0,15
1,['Place the eggs in the air fryer basket and c...,https://www.skinnytaste.com/wp-content/uploads...,"['4 large eggs', 'Salt (black pepper, everythi...",Air Fryer Hard Boiled Eggs,"Air Fryer Recipes, Dairy Free, Gluten Free, Ke...",egg bagel,no,4,1,2
2,"['Air Fryer directions:', 'Preheat air fryer t...",https://www.skinnytaste.com/wp-content/uploads...,"['olive oil spray', '4 about 5 ounce each salm...",Air Fryer Basil-Parmesan Salmon,"Air Fryer Recipes, Gluten Free, Keto Recipes, ...",salmon fillet lemon mayonnaise parmesan cheese,no,4,2,6


## Add dataframe to postgresql database

In [19]:
from sqlalchemy import create_engine

In [None]:
engine = create_engine('postgresql://postgres:MySQLPW@localhost:5432/WhatToCook')

In [None]:
df.to_sql('recipes', engine)

## Building Model 
### TfidfVectorizer and Cosine Similarity

### TF-IDF Vectorizer
TF-IDF is short for term frequency–inverse document frequency, is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus.  The tf–idf value increases proportionally to the number of times a word appears in the document and is offset by the number of documents in the corpus that contain the word, which helps to adjust for the fact that some words appear more frequently in general. 

### Cosine Similarity
cosine similarity is a measure of similarity between two sequences of numbers. For defining it, the sequences are viewed as vectors in an inner product space, and the cosine similarity is defined as the cosine of the angle between them, that is, the dot product of the vectors divided by the product of their lengths. It follows that the cosine similarity does not depend on the magnitudes of the vectors, but only on their angle. Cosine similarity can give a useful measure of how similar two documents are likely to be, in terms of their subject matter, and indepently of the length of the documents.

In [None]:
# TF-IDF feature extractor
tfidf = TfidfVectorizer(stop_words='english')
tfidf.fit(df_82k['parsed_ingredients'])
tfidf_recipe = tfidf.transform(df_82k['parsed_ingredients'])

In [None]:
tfidf_recipe.shape

(55023, 8928)

### Example ingredients

In [None]:

input_ingredients = ['Tomato', 'onion', 'potato', 'beef', "rice"]

### Claculating Similarity

In [None]:
# parse the ingredients using my ingredient_parser
try:
    ingredients_parsed = ingredient_parser(input_ingredients)
except:
    ingredients_parsed = ingredient_parser([input_ingredients])
    
# use pretrained  model to encode our input ingredients
input_ingredients_tfidf = tfidf.transform([ingredients_parsed])

# calculate cosine similarity between actual recipe ingreds and input ingreds
cos_sim = map(lambda x: cosine_similarity(input_ingredients_tfidf, x), tfidf_recipe)
scores = list(cos_sim)

In [None]:
print(scores[0])

[[0.07889054]]


### Getting  top 5 recomendations

In [None]:
# getting top 5 recomendations
top_score = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:5]
recommendation_df = pd.DataFrame(columns = ['recipe_name', 'ingredients','cooking_method','cuisine', 'score' ])

count = 0
for i in top_score:
    recommendation_df.at[count, 'recipe_name'] = (df_82k['recipe_name'][i])
    recommendation_df.at[count, 'ingredients'] = (df_82k['ingredients'][i])
    recommendation_df.at[count, 'cooking_method'] = df_82k['cooking_method'][i]
    recommendation_df.at[count, 'no_of_servings'] = df_82k['no_of_servings'][i]
    recommendation_df.at[count, 'score'] = "{:.3f}".format(float(scores[i]))
    count += 1


### Reccomendations

In [None]:
recommendation_df.head()

Unnamed: 0,recipe_name,ingredients,cooking_method,cuisine,score,no_of_servings
0,Porcupine Balls,"['1 pound lean ground beef', '1 1/2 cups uncoo...","['Preheat the oven to 350 degrees F.', 'Put th...",,0.768,4
1,Cuban Beef Stew: Carne con Papas,"['1/4 cup olive oil', '1 large white onion, cu...","['In a heavy skillet, heat oil and saute onion...",,0.712,6
2,Dressed Up Rice,"['1 cup brown rice', '1 cup beef stock', '1 1/...","['In a rice cooker, mix the rice, stock, water...",,0.648,4
3,Portuguese Rice,"['1/4 cup olive oil', '1 1/2 cups finely chopp...",['In a 4-quart medium saucepan heat the oil ov...,,0.645,6
4,Hopkins County Stew Family Size,"['4 cups water', '2 pounds skinless chicken or...",['Heat 4 cups water to boiling in a 5-quart sa...,,0.634,6


## Second Model Using Fuzz

In [None]:
from thefuzz import fuzz
from thefuzz import process

In [None]:
fuzz_ratio =  map(lambda x: fuzz.ratio(ingredients_parsed, x), df_82k['parsed_ingredients'])
#df_82k['ingredients'].apply(lambda x: fuzz.ratio(x))
fuzz_score = list(fuzz_ratio)

In [None]:
# getting top 5 recomendations
fuzz_score_top = sorted(range(len(fuzz_score)), key=lambda i: fuzz_score[i], reverse=True)[:5]
recommendation_dfuzz = pd.DataFrame(columns = ['recipe_name', 'ingredients','cooking_method','cuisine', 'score' ])

count = 0
for i in fuzz_score_top:
    recommendation_dfuzz.at[count, 'recipe_name'] = (df_82k['recipe_name'][i])
    recommendation_dfuzz.at[count, 'ingredients'] = (df_82k['ingredients'][i])
    recommendation_dfuzz.at[count, 'cooking_method'] = df_82k['cooking_method'][i]
    recommendation_dfuzz.at[count, 'no_of_servings'] = df_82k['no_of_servings'][i]
    recommendation_dfuzz.at[count, 'score'] = "{:.3f}".format(float(fuzz_score[i]))
    count += 1


In [None]:
recommendation_dfuzz.head()

Unnamed: 0,recipe_name,ingredients,cooking_method,cuisine,score,no_of_servings
0,Pommes De Terre a la Boulangere: Potatoes a la...,"['3 tablespoons butter', '1 tablespoon olive o...","['Preheat the oven to 300 degrees F.', 'Melt h...",,75.0,6
1,Basic Tomato Sauce,"['Two 28-ounce cans whole peeled tomatoes', '1...",['Add the tomatoes to a blender and puree unti...,,71.0,6
2,Roasted Sweet Potato and Green Onion Salad,['4 medium sweet potatoes (about 1 pound each)...,['Brush the potatoes and green onions with oli...,,70.0,8
3,Tomato-Red Onion Salad,"['3 tablespoons balsamic vinegar', '1 teaspoon...",['Whisk together the vinegar and mustard in a ...,,70.0,4
4,Sunny's Spicy Tomato Dip,"['2 tablespoons olive oil', '2 pints cherry to...",['Heat the oil in a medium pot or pan over med...,,69.0,1


### Observation
In comparison between the twomodels, using 'TfidfVectorizer' and cosine similarity the model score is much better than the 'fuzz' model so we have decided to use it for our reccomendations.