In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Loading data

In [2]:
# Cuisine lookup: the file's first column becomes the index and the remaining
# column is labelled 'Cuisine'; only that column is kept, so `cuisines` is a Series.
cuisines = pd.read_csv(
    'dataset/Cuisines.csv',
    index_col=0,
    names=['Cuisine'],
)['Cuisine']

In [3]:
# show the cuisine names, indexed by the first column of Cuisines.csv
cuisines

1      Chinese
2      English
3       French
4       German
5        Greek
6       Indian
7      Italian
8     Japanese
9      Mexican
10    Moroccan
11     Spanish
12        Thai
Name: Cuisine, dtype: object

In [4]:
# Read the recipe table: one row per recipe, one column per ingredient,
# with the cuisine (i.e. class) stored in the last column.
recipes = pd.read_csv('dataset/recipes.csv')

# Dataset dimensions. The trailing cuisine column is not an ingredient,
# hence the "- 1".
NUM_RECIPES = len(recipes)
NUM_INGREDIENTS = len(recipes.columns) - 1

# preview the first rows
recipes.head()

Unnamed: 0,'acorn squash',adobo,'african birdseye chile pepper',ale,'aleppo pepper','alfalfa sprouts','alfredo sauce',allspice,almond,'almond butter',...,yeast,'yellow curry paste','yellow food coloring','yellow split pea','yellow squash',yogurt,zaatar,zest,zucchini,cuisine
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Data cleaning

In [5]:
import copy
import re

### Cleaning

In [6]:
# Sanitise the column labels: spaces become underscores, then every remaining
# non-word character is removed (some labels carry literal quote characters,
# e.g. 'acorn squash'). The class column ('cuisine') is still part of this list.
ingredients = [
    re.sub(r'[^\w]', '', column_name.replace(' ', '_'))
    for column_name in recipes.columns
]

# Independent working copy, so that `recipes` itself is never modified.
bag_of_ingredients = recipes.copy(deep=True)

In [7]:
# NOTE: run this cell exactly once after the previous one. It rebinds both
# `ingredients` and `bag_of_ingredients`, so a second run would trim another
# name from `ingredients` and then fail on the missing 'cuisine' column.

# Apply the sanitised labels (class column included) to the working copy.
bag_of_ingredients.columns = ingredients
# Everything except the trailing class label (i.e. cuisine) is an ingredient.
ingredients = ingredients[:-1]

# Keep the labels separately, export the renamed table (labels included),
# then strip the labels so that only ingredient columns remain.
classes = bag_of_ingredients.loc[:, 'cuisine'].copy()
# NOTE: the row index is written too, as an unnamed first column of the CSV.
bag_of_ingredients.to_csv('bag_of_ingre_w_cuisine.csv')
bag_of_ingredients = bag_of_ingredients.drop(columns=['cuisine'])

### 1. Bag of Ingredients

In [8]:
# preview the ingredient matrix (sanitised column names, cuisine column dropped)
bag_of_ingredients.head()

Unnamed: 0,acorn_squash,adobo,african_birdseye_chile_pepper,ale,aleppo_pepper,alfalfa_sprouts,alfredo_sauce,allspice,almond,almond_butter,...,yams,yeast,yellow_curry_paste,yellow_food_coloring,yellow_split_pea,yellow_squash,yogurt,zaatar,zest,zucchini
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 2. TF-IDF features 

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer

![tfidf](imgs/tfidf.png)

In [None]:
# Build one text "document" per recipe: the space-separated names of the
# ingredients it uses (i.e. the columns holding a non-zero value), in column order.
#
# The rows are taken straight from the underlying NumPy array and the names are
# selected with a boolean mask. This avoids building a pandas Series and a Python
# list for every recipe, and the sizes come from the data itself rather than from
# the NUM_RECIPES / NUM_INGREDIENTS globals.
ingredient_names = np.array(ingredients, dtype=object)  # object dtype keeps plain str items
corpus = [
    " ".join(ingredient_names[recipe_row != 0])
    for recipe_row in bag_of_ingredients.to_numpy()
]

#### Compute the IDF values

In [None]:
# Learn the IDF weights from the ingredient matrix (rows = recipes, columns =
# ingredients), with IDF weighting and smoothing explicitly switched on.
tfidf_transformer = TfidfTransformer(use_idf=True, smooth_idf=True)
tfidf_transformer.fit(bag_of_ingredients)

# One IDF weight per ingredient, labelled with the ingredient name.
df_idf = pd.DataFrame(
    {"idf_weights": tfidf_transformer.idf_},
    index=ingredients,
)

# Ascending order: the ingredients with the lowest IDF come first.
df_idf.sort_values(by="idf_weights")

Notice that ingredients like ‘garlic’, ‘onion’ and ‘olive oil’ have the lowest IDF values. This is expected, as these ingredients appear in many recipes. The lower the IDF value of a word, the less unique it is to any particular document.

#### Compute the TFIDF score for the recipes

Once you have the IDF values, you can compute the tf-idf scores for any recipe. Let’s compute the tf-idf scores for all the recipes.

In [None]:
# tf-idf scores
# this does tf * idf where your term frequency is weighted by its IDF values
# this does tf * idf: each recipe's term frequencies (the rows of
# bag_of_ingredients) are weighted by the IDF values learned during fit().
# NOTE(review): no `norm` was passed to TfidfTransformer, so scikit-learn's
# default row normalisation presumably applies as well — confirm in the docs.
tf_idf_vector=tfidf_transformer.transform(bag_of_ingredients) 

#### Print the TF-IDF values of the first recipe

In [None]:
# tf-idf row of the first recipe
first_document_vector = tf_idf_vector[0]

# Turn that row into a dense column so every score lines up with its
# ingredient name.
first_document_scores = first_document_vector.T.toarray()
df = pd.DataFrame(first_document_scores, index=ingredients, columns=["tfidf"])

# highest-scoring ingredients first
df.sort_values(by="tfidf", ascending=False)

Notice that only certain ingredients have non-zero scores. This is because the recipe uses only a few of the ingredients.

The more common an ingredient is across recipes, the lower its score; the more unique an ingredient is to our first recipe (e.g. ‘chinese_cabbage’), the higher its score. So it’s working as expected.