## Loading data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Cuisine lookup table. `names` labels the single data column and the first
# CSV column becomes the index; selecting the column right away leaves
# `cuisines` as a Series instead of a one-column DataFrame.
cuisines = pd.read_csv(
    'dataset/Cuisines.csv',
    index_col=0,
    names=['Cuisine'],
)['Cuisine']

In [3]:
# Show the cuisine lookup Series (index = cuisine id, value = cuisine name).
cuisines

1      Chinese
2      English
3       French
4       German
5        Greek
6       Indian
7      Italian
8     Japanese
9      Mexican
10    Moroccan
11     Spanish
12        Thai
Name: Cuisine, dtype: object

In [4]:
# Read the recipe table from disk.
recipes = pd.read_csv('dataset/recipes.csv')

# Dataset dimensions: every column except the last one (the cuisine,
# i.e. the class label) is counted as an ingredient.
NUM_RECIPES, NUM_INGREDIENTS = len(recipes), len(recipes.columns) - 1

# Preview the first rows.
recipes.head()

Unnamed: 0,'acorn squash',adobo,'african birdseye chile pepper',ale,'aleppo pepper','alfalfa sprouts','alfredo sauce',allspice,almond,'almond butter',...,yeast,'yellow curry paste','yellow food coloring','yellow split pea','yellow squash',yogurt,zaatar,zest,zucchini,cuisine
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Data cleaning

In [5]:
import copy
import re

### Cleaning

In [6]:
# Normalise the column headers: some of them carry literal single quotes
# (e.g. 'acorn squash'). Spaces become underscores first, then every
# remaining non-word character is stripped.
ingredients = [
    re.sub(r'[^\w]', '', column_name.replace(' ', '_'))
    for column_name in recipes.columns
]

# Independent deep copy, so that renaming/dropping columns later on does not
# touch the raw `recipes` frame.
bag_of_ingredients = recipes.copy(deep=True)

In [7]:
# NOTE: this cell is not re-runnable on its own. It trims `ingredients` and
# removes the 'cuisine' column, so a second run would trim one more name and
# fail to find the column. Re-run the previous cell first.

# Apply the cleaned names (the last one is still the class column here).
bag_of_ingredients.columns = ingredients

# Separate the class labels from the feature matrix.
classes = bag_of_ingredients['cuisine'].copy()
bag_of_ingredients = bag_of_ingredients.drop(columns=['cuisine'])

# Keep only the feature names: the last entry is the class (i.e. cuisine).
ingredients = ingredients[:-1]

### 1. Bag of Ingredients

In [8]:
# Preview the cleaned feature matrix (cuisine column already removed).
bag_of_ingredients.head()

Unnamed: 0,acorn_squash,adobo,african_birdseye_chile_pepper,ale,aleppo_pepper,alfalfa_sprouts,alfredo_sauce,allspice,almond,almond_butter,...,yams,yeast,yellow_curry_paste,yellow_food_coloring,yellow_split_pea,yellow_squash,yogurt,zaatar,zest,zucchini
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Persist the cleaned feature matrix to the working directory.
bag_of_ingredients.to_pickle('bag_of_ingredients.pkl')

### 2. TF-IDF features 

In [10]:
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html

from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer

![tfidf](imgs/tfidf.png)

## Use this..

In [11]:
from sklearn.model_selection import train_test_split

# 80/20 split with a fixed seed. The raw `recipes` frame is split (cuisine
# column still included) because the following cells group the rows by
# 'cuisine'; the labels are passed along as a plain list.
X_train, X_test, y_train, y_test = train_test_split(
    recipes,
    classes.tolist(),
    test_size=0.2,
    random_state=444,
)

In [12]:
# Display the whole feature matrix (the rendered output is truncated by pandas).
bag_of_ingredients

Unnamed: 0,acorn_squash,adobo,african_birdseye_chile_pepper,ale,aleppo_pepper,alfalfa_sprouts,alfredo_sauce,allspice,almond,almond_butter,...,yams,yeast,yellow_curry_paste,yellow_food_coloring,yellow_split_pea,yellow_squash,yogurt,zaatar,zest,zucchini
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4231,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4232,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4233,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4234,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Collapse the training recipes into one row per cuisine by summing the
# ingredient columns of all recipes that share a cuisine label.
bag_of_ingredients_cuisine_train = (
    X_train
    .groupby(by='cuisine')
    .sum()
)
bag_of_ingredients_cuisine_train

Unnamed: 0_level_0,'acorn squash',adobo,'african birdseye chile pepper',ale,'aleppo pepper','alfalfa sprouts','alfredo sauce',allspice,almond,'almond butter',...,yams,yeast,'yellow curry paste','yellow food coloring','yellow split pea','yellow squash',yogurt,zaatar,zest,zucchini
cuisine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,1,0,0,4,0,...,0,0,0,1,0,0,0,0,7,2
1,0,0,0,5,0,0,0,2,0,0,...,0,2,0,0,1,0,3,0,0,7
2,0,0,0,0,0,0,0,0,0,0,...,0,2,0,0,0,1,2,0,1,11
3,0,0,0,0,0,0,0,11,0,0,...,0,3,0,0,0,0,1,0,1,1
4,0,1,0,0,1,0,0,10,0,0,...,0,0,0,0,0,1,29,0,1,15
5,0,0,1,0,0,0,0,4,11,0,...,0,0,0,0,1,0,65,0,0,2
6,0,0,0,0,0,0,2,1,0,0,...,0,7,0,0,0,3,0,0,0,11
7,0,0,0,0,0,1,0,3,1,0,...,0,1,0,0,0,0,2,0,0,2
8,0,3,0,0,0,0,0,3,2,0,...,0,1,0,0,0,1,1,0,0,2
9,2,0,1,0,0,1,0,17,34,0,...,1,1,0,0,0,3,10,2,1,25


In [14]:
# Same per-cuisine aggregation as for the training split, on the held-out rows.
bag_of_ingredients_cuisine_test = (
    X_test
    .groupby(by='cuisine')
    .sum()
)
bag_of_ingredients_cuisine_test

Unnamed: 0_level_0,'acorn squash',adobo,'african birdseye chile pepper',ale,'aleppo pepper','alfalfa sprouts','alfredo sauce',allspice,almond,'almond butter',...,yams,yeast,'yellow curry paste','yellow food coloring','yellow split pea','yellow squash',yogurt,zaatar,zest,zucchini
cuisine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
1,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,1,5
3,0,0,0,0,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,7,0,0,2
5,0,0,0,0,0,0,0,0,3,0,...,0,0,0,0,1,0,18,0,0,0
6,1,0,0,0,0,0,1,1,0,0,...,0,1,0,0,0,0,0,0,0,1
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
8,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,4
9,0,0,0,0,0,0,0,0,8,0,...,0,0,0,0,0,0,5,0,0,6


#### Compute the IDF values

In [15]:
# Learn the IDF weights from the per-cuisine count matrix (`fit` returns the
# fitted transformer itself, so construction and fitting can be chained).
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True).fit(
    bag_of_ingredients_cuisine_train
)

# One IDF weight per ingredient, labelled with the cleaned ingredient names.
df_idf = pd.DataFrame({"idf_weights": tfidf_transformer.idf_}, index=ingredients)

# Lowest (least distinctive) weights first.
df_idf.sort_values(by='idf_weights')

Unnamed: 0,idf_weights
zucchini,1.000000
shallot,1.000000
salt,1.000000
lemon_juice,1.000000
cabbage,1.000000
...,...
processed_cheese,3.564949
calvados,3.564949
crystallized_ginger,3.564949
fresh_button_mushroom,3.564949


In [16]:
# The 30 ingredients with the smallest IDF weight.
df_idf.sort_values(by=['idf_weights'],ascending=True).head(30)

Unnamed: 0,idf_weights
zucchini,1.0
shallot,1.0
salt,1.0
lemon_juice,1.0
cabbage,1.0
rice,1.0
garlic,1.0
garlic_powder,1.0
potato,1.0
pork,1.0


Notice that ingredients like ‘garlic’, ‘onion’ and ‘olive oil’ have the lowest IDF values. This is expected, as these ingredients appear in many recipes. The lower the IDF value of an ingredient, the less unique it is to any particular document (here, each document is one cuisine's aggregated training recipes).

#### Compute the TFIDF score for the recipes

Once you have the IDF values, you can compute the tf-idf scores. Let’s compute the tf-idf scores for all the training recipes, aggregated per cuisine.

In [17]:
# tf-idf matrix: each term frequency in the per-cuisine count matrix is
# weighted by the IDF value learned above (one row per cuisine).
tf_idf_vector = tfidf_transformer.transform(bag_of_ingredients_cuisine_train)

#### Print the TF-IDF values of the first recipe

In [18]:
# Row 0 of the tf-idf matrix, i.e. the first document.
first_document_vector = tf_idf_vector[0]

# Turn the sparse row into a dense column labelled with the ingredient names.
df = pd.DataFrame(
    first_document_vector.T.todense(),
    index=ingredients,
    columns=["tfidf"],
)

# Highest scores first.
df.sort_values(by="tfidf", ascending=False)

Unnamed: 0,tfidf
soy_sauce,0.449098
sesame_oil,0.324547
ginger,0.309002
garlic,0.287872
cornstarch,0.254109
...,...
long_grain_and_wild_rice_blend,0.000000
lobster,0.000000
liver,0.000000
chipotle_pepper,0.000000


Notice that only certain ingredients have scores. This is because the recipe doesn't have all the ingredients. 

The more common an ingredient is across recipes, the lower its score; the more unique an ingredient is to our first recipe (e.g. ‘chinese_cabbage’), the higher its score. So it’s working as expected.

In [19]:
# The 20 highest tf-idf scores of the row held in `df`.
df.sort_values(by=["tfidf"],ascending=False).head(20)

Unnamed: 0,tfidf
soy_sauce,0.449098
sesame_oil,0.324547
ginger,0.309002
garlic,0.287872
cornstarch,0.254109
green_onion,0.220347
chicken,0.211462
vegetable_oil,0.193692
sugar,0.18303
rice_vinegar,0.170271


In [20]:
# The 20 lowest tf-idf scores of the row held in `df`.
df.sort_values(by=["tfidf"],ascending=True).head(20)

Unnamed: 0,tfidf
acorn_squash,0.0
kohlrabi,0.0
kielbasa,0.0
kidney_bean,0.0
ketjap_manis,0.0
soba_noodles,0.0
kamaboko,0.0
kale,0.0
kalamata_olive,0.0
kaffir_lime,0.0


## Prediction with tf-idf

To predict the cuisine of a recipe, we look up the tf-idf score of each of its ingredients and sum those scores for each cuisine. The cuisine with the highest total score is the one we choose.

In [21]:
def calculate_recipe_score_per_cusine(ingredients_sample, cuisine_idx, tf_idf_vector):
    """Sum one cuisine's tf-idf weights over the ingredients of a recipe.

    Parameters
    ----------
    ingredients_sample : iterable of str
        Ingredient names of the recipe. Each name must be a label of the
        notebook-level `ingredients` list.
    cuisine_idx : int
        Row of `tf_idf_vector` to score the recipe against.
    tf_idf_vector : sparse matrix
        tf-idf matrix whose rows are selectable by integer and support
        `.T.todense()`; its columns must line up with `ingredients`.

    Returns
    -------
    The summed tf-idf score (the integer 0 for an empty `ingredients_sample`).

    Raises
    ------
    KeyError
        If an ingredient name is not present in `ingredients`.
    """
    # Dense column of tf-idf weights for the requested cuisine row,
    # labelled with the ingredient names.
    cuisine_row = tf_idf_vector[cuisine_idx]
    tfidf_scores = pd.DataFrame(cuisine_row.T.todense(), index=ingredients, columns=["tfidf"])

    total_score = 0
    for ingr in ingredients_sample:
        # Address the cell by row *and* column label. The previous
        # `.loc[ingr][0]` relied on positional integer access into a
        # label-indexed Series, which pandas has deprecated.
        total_score += tfidf_scores.loc[ingr, "tfidf"]

    return total_score

def predict_cuisine_with_fidf(recipe_idx, tf_idf_vector, boi=bag_of_ingredients):
    """Predict the cuisine of one recipe from per-cuisine tf-idf weights.

    For every row (cuisine) of `tf_idf_vector`, the tf-idf weights of the
    recipe's ingredients are summed; the row with the largest sum wins.

    Parameters
    ----------
    recipe_idx : int
        Recipe to classify. It is used positionally on `boi` (`.iloc`) and as
        an index key on the notebook-level `classes` Series.
        NOTE(review): the two lookups agree only when both use the default
        0..n-1 index - confirm if the frames are ever re-indexed.
    tf_idf_vector : sparse matrix
        tf-idf matrix with one row per cuisine.
    boi : pandas.DataFrame, optional
        Bag-of-ingredients matrix to read the recipe from. The default is
        bound when the function is defined.

    Returns
    -------
    tuple
        `(predicted_cuisine_index, real_class)`.
    """
    # Ingredients present in the recipe = columns with a non-zero entry.
    ingredients_sample = [
        name
        for name, count in boi.iloc[recipe_idx].to_dict().items()
        if count != 0
    ]

    # get the recipe's real class
    real_class = classes[recipe_idx]

    # One score per cuisine. The number of cuisines is taken from the matrix
    # itself instead of a hard-coded 12, so the function keeps working when
    # the matrix has a different number of rows.
    cuisine_scores = [
        calculate_recipe_score_per_cusine(ingredients_sample, cuisine_idx, tf_idf_vector)
        for cuisine_idx in range(tf_idf_vector.shape[0])
    ]

    return np.argmax(cuisine_scores), real_class

In [22]:
# Accuracy of the tf-idf classifier over every recipe of the test split.
test_recipe_ids = X_test.index.tolist()

counter = 0
for r_idx in test_recipe_ids:
    predicted_cuisine, actual_cuisine = predict_cuisine_with_fidf(r_idx, tf_idf_vector)
    # Count one hit whenever the prediction matches the true class.
    counter += int(predicted_cuisine == actual_cuisine)

print(f'Acc: {round(counter * 100/ len(test_recipe_ids), 5)}')

Acc: 68.75


## Attack

0      Chinese

1      English

2       French

3       German

4        Greek

5       Indian

6      Italian

7     Japanese

8      Mexican

9    Moroccan

10     Spanish

11        Thai

In [23]:
top_5_ingredients_per_cuisine = []

# Collect the five highest-weighted ingredients of every cuisine.
# The number of cuisines comes from the tf-idf matrix (one row per cuisine)
# rather than a hard-coded 12, and loop-local names are used so the `df` /
# `first_document_vector` shown by earlier cells are not silently overwritten.
for cuisine_idx in range(tf_idf_vector.shape[0]):
    # tf-idf weights of this cuisine as a dense column labelled by ingredient
    cuisine_tfidf = pd.DataFrame(
        tf_idf_vector[cuisine_idx].T.todense(),
        index=ingredients,
        columns=["tfidf"],
    )

    # names of the top 5 ingredients, highest score first
    top_5_ingr = cuisine_tfidf.sort_values(by=["tfidf"], ascending=False).head(5).index.tolist()

    top_5_ingredients_per_cuisine.append(top_5_ingr)

In [24]:
# One list of five ingredient names per cuisine, in tf-idf matrix row order.
top_5_ingredients_per_cuisine

[['soy_sauce', 'sesame_oil', 'ginger', 'garlic', 'cornstarch'],
 ['onion', 'butter', 'potato', 'garlic', 'flour'],
 ['garlic', 'butter', 'wine', 'thyme', 'onion'],
 ['onion', 'sauerkraut', 'pepper', 'salt', 'caraway_seed'],
 ['garlic', 'olive_oil', 'onion', 'oregano', 'feta_cheese'],
 ['onion', 'garlic', 'ginger', 'turmeric', 'masala'],
 ['garlic', 'parmesan_cheese', 'mozzarella', 'olive_oil', 'pasta'],
 ['soy_sauce', 'rice_wine', 'ginger', 'sugar', 'garlic'],
 ['tortilla', 'onion', 'taco_seasoning', 'garlic', 'cumin'],
 ['onion', 'garlic', 'olive_oil', 'cumin', 'cinnamon'],
 ['garlic', 'olive_oil', 'onion', 'saffron', 'tomato'],
 ['fish_sauce', 'coconut_milk_or_cream', 'garlic', 'curry_paste', 'lime']]

## Removing n best ingredients

In [25]:
# NOTE(review): disabled experiment, kept for reference only. It would zero
# out the top-ranked ingredient(s) of each cuisine in a copy of the feature
# matrix, but it reads `X_test_rmv_best`, which is not defined in any visible
# cell, so it cannot simply be uncommented.
# bag_of_ingredients_rmv_best = copy.deepcopy(bag_of_ingredients)

# for cuisine_idx in range(12):
#     for ingr in range(1):
#         bag_of_ingredients_rmv_best.loc[X_test_rmv_best[X_test_rmv_best['cuisine']==cuisine_idx].\
#                                      index.tolist(), top_5_ingredients_per_cuisine[cuisine_idx][ingr]]=0

In [26]:
# NOTE(review): disabled accuracy check for the experiment above. It needs
# `bag_of_ingredients_rmv_best`, which is only created by commented-out code.
# counter = 0
# for r_idx in X_test.index.tolist():
#     res = predict_cuisine_with_fidf(r_idx, tf_idf_vector, boi=bag_of_ingredients_rmv_best)
    
#     if res[0]==res[1]:
#         counter += 1
# print(f'Acc: {round(counter * 100/ len(X_test.index.tolist()), 5)}')

## Ingredient recommendation

In [27]:
import pickle

# Shift the labels by one before scoring.
# NOTE(review): presumably the pickled model was trained on 1-based cuisine
# ids while `y_test` is 0-based - confirm against the training notebook.
y_test_ = [i+1 for i in y_test]

# Keep only the feature columns. This cell overwrites `X_test`, so on a
# second run the 'cuisine' column is already gone; `errors='ignore'` makes
# the cell safe to re-run instead of raising KeyError.
X_test = X_test.drop(columns=['cuisine'], errors='ignore')

# Load from file
# SECURITY: pickle.load can execute arbitrary code contained in the file;
# only load model files that come from a trusted source.
with open('best_lr_model.pkl', 'rb') as file:
    pickle_model = pickle.load(file)

# Score of the loaded model on the held-out recipes.
pickle_model.score(X_test, y_test_)



0.7665094339622641

In [28]:
def predict_ingredient(recipe_idx, cuisine, boi):
    """Print the best-scoring ingredient of a cuisine that a recipe lacks.

    The ingredients of recipe `recipe_idx` (non-zero columns of that row of
    `boi`) are compared against the ingredients of row `cuisine` of the
    notebook-level `tf_idf_vector`, ranked by descending tf-idf score. The
    first ranked ingredient not yet in the recipe is appended to the recipe's
    ingredient list and the result is printed. Nothing is printed when the
    recipe already contains every ingredient. The function returns None.
    """
    # Ingredients present in the recipe.
    recipe_row = boi.iloc[recipe_idx].to_dict()
    ingredients_list = [name for name, amount in recipe_row.items() if amount != 0]

    # Dense tf-idf table, columns labelled with the notebook-level ingredient names.
    tfidf_df = pd.DataFrame(tf_idf_vector.todense())
    tfidf_df.columns = bag_of_ingredients.columns.tolist()

    # (ingredient, score) pairs of the requested cuisine, best score first.
    cuisine_scores = tfidf_df.iloc[cuisine].to_dict()
    ranked = sorted(cuisine_scores.items(), key=lambda pair: pair[1], reverse=True)

    # First ranked ingredient that the recipe does not contain yet.
    suggestion = next(
        (name for name, _score in ranked if name not in ingredients_list),
        None,
    )
    if suggestion is None:
        return

    ingredients_list.append(suggestion)
    print(f'added: "{suggestion}"\n\nto recipe: {ingredients_list}')

In [42]:
# Example: suggest an ingredient for recipe 0 using row 0 of the tf-idf matrix.
predict_ingredient(0, 0, bag_of_ingredients)

added: "garlic"

to recipe: ['broth', 'chinese_cabbage', 'cornstarch', 'egg', 'ginger', 'green_onion', 'pork', 'salt', 'sesame_oil', 'shrimp', 'soy_sauce', 'sugar', 'vegetable_oil', 'water_chestnut', 'wine', 'garlic']


In [35]:
# Number of test rows divided by 12 - presumably the average number of test
# recipes per cuisine, given the 12 cuisines listed above.
X_test.shape[0]/12

70.66666666666667

In [39]:
# Inspect a single test row, selected by position (not by index label).
X_test.iloc[140]

'acorn squash'                     0
adobo                              0
'african birdseye chile pepper'    0
ale                                0
'aleppo pepper'                    0
                                  ..
'yellow squash'                    0
yogurt                             0
zaatar                             0
zest                               0
zucchini                           0
Name: 87, Length: 709, dtype: int64