In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Load Recipes

In [6]:
recipes = pd.read_csv('dataset/recipes.csv')

# number of recipes and ingredients
NUM_RECIPES = recipes.shape[0]
NUM_INGREDIENTS = recipes.shape[1] - 1 # last column is the cuisine (i.e. class)

recipes.head()

Unnamed: 0,'acorn squash',adobo,'african birdseye chile pepper',ale,'aleppo pepper','alfalfa sprouts','alfredo sauce',allspice,almond,'almond butter',...,yeast,'yellow curry paste','yellow food coloring','yellow split pea','yellow squash',yogurt,zaatar,zest,zucchini,cuisine
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
bag_of_ingredients = pd.read_pickle('bag_of_ingredients.pkl')

In [8]:
bag_of_ingredients

Unnamed: 0,acorn_squash,adobo,african_birdseye_chile_pepper,ale,aleppo_pepper,alfalfa_sprouts,alfredo_sauce,allspice,almond,almond_butter,...,yams,yeast,yellow_curry_paste,yellow_food_coloring,yellow_split_pea,yellow_squash,yogurt,zaatar,zest,zucchini
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4231,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4232,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4233,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4234,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Load tf-idf vector

In [4]:
tf_idf_vector = np.load('tf_idf_vector.npy', allow_pickle=True)

In [5]:
tf_idf_vector

array(<12x709 sparse matrix of type '<class 'numpy.float64'>'
	with 2933 stored elements in Compressed Sparse Row format>, dtype=object)

## Testing

In [9]:
from sklearn.model_selection import train_test_split
classes = bag_of_ingredients['cuisine'].copy()
    
X_train, X_test, y_train, y_test = train_test_split(recipes, classes.tolist(), test_size=0.2, random_state=444)    

KeyError: 'cuisine'

In [None]:
def calculate_recipe_score_per_cusine(ingredients_sample, cuisine_idx, tf_idf_vector):
    # get tfidf vector for first document 
    first_document_vector=tf_idf_vector[cuisine_idx] 

    # print the scores 
    tfidf_scores = pd.DataFrame(first_document_vector.T.todense(), index=ingredients, columns=["tfidf"]) 
    
    total_score = 0
    for ingr in ingredients_sample:
        total_score += tfidf_scores.loc[ingr][0]
    
    return total_score

def predict_cuisine_with_fidf(recipe_idx, tf_idf_vector, boi=bag_of_ingredients):
    # get recipe ingredients
    recipe_sample_dict = {x:y for x,y in boi.iloc[recipe_idx].to_dict().items() if y!=0}
    ingredients_sample = list(recipe_sample_dict.keys())
    
    # get the recipes real class
    real_class = classes[recipe_idx]
    
    # list that holds the scores for each cuisine
    cuisine_scores = []
    
    # calculate the score for each cuisine
    for cuisine_idx in range(12):
        cuisine_score = calculate_recipe_score_per_cusine(ingredients_sample, cuisine_idx, tf_idf_vector)
    
        cuisine_scores.append(cuisine_score)
        
    return np.argmax(cuisine_scores), real_class