In [22]:
import pandas as pd
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
from gensim.models import Word2Vec
import numpy as np

In [23]:
def clean_reviews(review):
    """Normalize a piece of raw text into a space-separated string of lemmas.

    Markup is stripped with BeautifulSoup (lxml parser), every character that
    is not an ASCII letter is replaced by a space, the text is lower-cased and
    split on whitespace, English stopwords are dropped, and each remaining
    token is passed through the WordNet lemmatizer.

    Parameters
    ----------
    review
        Text to clean (it may contain markup); handed to BeautifulSoup as-is.

    Returns
    -------
    str
        The surviving lemmatized tokens joined by single spaces.
    """
    # 1. Strip markup and keep only the text content.
    plain_text = BeautifulSoup(review, "lxml").text
    # 2. Keep letters only: anything outside a-z / A-Z becomes a space.
    letters_only = re.sub("[^a-zA-Z]", " ", plain_text)
    # 3. Lower-case, then tokenize on whitespace.
    tokens = letters_only.lower().split()
    # 4. Drop English stopwords and lemmatize what is left.
    english_stopwords = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in tokens:
        if token in english_stopwords:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return " ".join(lemmas)

In [24]:
# Read the food names from the 'Food' column of the reference CSV.
foods = pd.read_csv('../raw_data/list_of_foods.csv')
# Skip missing cells: clean_reviews passes its input straight to BeautifulSoup,
# which needs text and fails on NaN.
foods_list = list(foods['Food'].dropna())
# Normalize every food name with the same cleaning routine.
foods_list_normalized = [clean_reviews(f) for f in foods_list]
# De-duplicate, and sort so the order is identical on every kernel restart
# (the iteration order of a set of strings is not stable between processes).
foods_list_preprocessed = sorted(set(foods_list_normalized))

In [25]:
# Load a previously saved Word2Vec model from disk. Despite the `wine_` prefix
# on the variable name, the file loaded here is the *food* model.
# NOTE(review): gensim's load() relies on pickle under the hood, so only load
# model files that come from a trusted source.
wine_word2vec_model = Word2Vec.load("../raw_data/food_word2vec_model.bin")

In [26]:
# Map each normalized food name to its embedding vector.
foods_vecs = dict()

word_vectors = wine_word2vec_model.wv
for f in foods_list_preprocessed:
    try:
        foods_vecs[f] = word_vectors[f]
    except KeyError:
        # The name is not in the model's vocabulary, so it has no vector and is
        # left out. Only KeyError is caught: any other failure (or a
        # KeyboardInterrupt) should surface instead of being silently swallowed.
        continue

In [29]:
from scipy import spatial

# Seed keywords for each core (non-aroma) taste. Every keyword is looked up
# directly in the embedding below, so each one must exist in its vocabulary.
# NOTE(review): a few keywords appear twice in their list ('cassoulet' under
# 'weight', 'salty' under 'salt', 'foie_gras' under 'fat') and therefore count
# double in the average -- confirm this weighting is intended.
core_tastes_revised = {
    'weight': [
        'heavy', 'cassoulet', 'cassoulet', 'full_bodied', 'thick', 'milk', 'fat',
        'mincemeat', 'steak', 'bold', 'pizza', 'pasta', 'creamy', 'bread',
    ],
    'sweet': ['sweet', 'sugar', 'cake', 'mango', 'stevia'],
    'acid': ['acid', 'sour', 'vinegar', 'yoghurt'],
    'salt': [
        'salty', 'salty', 'parmesan', 'oyster', 'pizza', 'bacon', 'cured_meat',
        'sausage', 'potato_chip',
    ],
    'piquant': ['spicy'],
    'fat': [
        'fat', 'fried', 'creamy', 'cassoulet', 'foie_gras', 'buttery', 'cake',
        'foie_gras', 'sausage', 'brie', 'carbonara',
    ],
    'bitter': ['bitter', 'kale'],
}

# taste -> average keyword vector
average_taste_vecs = {}
# taste -> {food -> score}. Despite the name, the stored score is a cosine
# *similarity* (1 - cosine distance), so larger means closer to the taste.
core_tastes_distances = {}
for taste_name, taste_keywords in core_tastes_revised.items():
    # Represent the taste by the element-wise average of its keyword vectors.
    keyword_vecs = [word_vectors[kw] for kw in taste_keywords]
    centroid = np.average(keyword_vecs, axis=0)
    average_taste_vecs[taste_name] = centroid

    # Score every food that has a vector against the taste's average vector.
    core_tastes_distances[taste_name] = {
        food: 1 - spatial.distance.cosine(centroid, food_vec)
        for food, food_vec in foods_vecs.items()
    }

In [30]:
# For every core taste, record the lowest ('farthest') and highest ('closest')
# similarity score any food reaches, plus the taste's average vector. The two
# extremes are what is needed to build a normalized 0-to-1 scale per taste.
food_nonaroma_infos = {}
for taste_name in core_tastes_revised:
    similarity_by_food = core_tastes_distances[taste_name]
    least_similar_food = min(similarity_by_food, key=similarity_by_food.get)
    most_similar_food = max(similarity_by_food, key=similarity_by_food.get)
    # Show which foods anchor the two ends of the scale for this taste.
    print(taste_name, least_similar_food, most_similar_food)
    food_nonaroma_infos[taste_name] = {
        'farthest': similarity_by_food[least_similar_food],
        'closest': similarity_by_food[most_similar_food],
        'average_vec': average_taste_vecs[taste_name],
    }

weight dragonfruit pasta
sweet mackerel honey
acid nibble tart
salt nectar bacon
piquant foodstuff pepper
fat coffee sausage
bitter biscuit kale


In [32]:
# Transpose so that each taste becomes a row and 'farthest' / 'closest' /
# 'average_vec' become the columns, then write the table to a CSV file in the
# current working directory.
# NOTE(review): the 'average_vec' cells hold numpy arrays, which a CSV can only
# store as text -- confirm that whatever reads this file parses them back
# correctly.
food_nonaroma_infos_df = pd.DataFrame(food_nonaroma_infos).T
food_nonaroma_infos_df.to_csv('average_nonaroma_vectors.csv')

In [33]:
# Preview the per-taste table (at most the first 10 rows).
food_nonaroma_infos_df.head(10)

Unnamed: 0,farthest,closest,average_vec
weight,-0.136163,0.494628,"[-0.9903981, 0.7549157, -0.11517195, -0.026751..."
sweet,-0.206559,0.511624,"[0.03282367, 0.761109, -0.17922759, -1.2561963..."
acid,-0.135847,0.509928,"[-0.024031281, 0.7139694, -0.23145452, -0.8389..."
salt,-0.13522,0.619147,"[-0.8502851, -0.08972356, -0.52269435, 0.96423..."
piquant,-0.157514,0.491482,"[-1.8512405, 0.8314903, -0.66967815, -1.931818..."
fat,-0.103601,0.573433,"[-0.23399287, 0.41445056, -0.26073048, -0.2112..."
bitter,-0.161778,0.661107,"[-0.38563982, 0.7832479, -0.20703274, -0.67184..."
