# Clustering: an example based on word embeddings
The recipe data are taken from the Kaggle [Recipe Ingredients Dataset](https://www.kaggle.com/datasets/kaggle/recipe-ingredients-dataset).

In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [2]:
import json

In [3]:
# Location of the Kaggle training file (a JSON array of recipe objects).
RECIPES_PATH = '/Users/flint/Data/recipe/kaggle_recipes/train.json'

# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# platform's locale encoding, which can fail or mis-decode non-ASCII
# ingredient names on systems whose default is not UTF-8.
with open(RECIPES_PATH, 'r', encoding='utf-8') as infile:
    all_recipes = json.load(infile)

In [19]:
# Turn every recipe into a "pseudo-document": a fresh list holding its
# ingredient strings. Each ingredient is kept whole (multi-word names are one
# token). A finer-grained variant would additionally add the whitespace-split
# parts of multi-word ingredients to the pseudo-document.
recipes = [list(entry['ingredients']) for entry in all_recipes]

## Train word2vec

In [20]:
import gensim.models

In [28]:
# Train ingredient embeddings: every recipe acts as one sentence and every
# ingredient string as one word.
model = gensim.models.Word2Vec(
    sentences=recipes,
    vector_size=100,  # dimensionality of each ingredient vector
    window=5,         # context size within a recipe
    min_count=10,     # ignore ingredients occurring fewer than 10 times
    epochs=25,        # number of passes over the corpus
)

In [31]:
# Sanity check of the embeddings: show the ingredients most similar to
# 'couscous', each paired with its similarity score.
model.wv.most_similar('couscous')

[('dried apricot', 0.7119837999343872),
 ('chickpeas', 0.6943981051445007),
 ('harissa paste', 0.6842238903045654),
 ('preserved lemon', 0.666700541973114),
 ('ras el hanout', 0.6434797644615173),
 ('leg of lamb', 0.6352275609970093),
 ('harissa', 0.6299270987510681),
 ('green lentil', 0.5988842248916626),
 ('lamb', 0.5984291434288025),
 ('brown lentils', 0.5944010615348816)]

## Create clusters of words

In [37]:
from sklearn.cluster import KMeans

In [41]:
from collections import defaultdict

In [38]:
# Ingredients retained by the model, in the model's own index order.
vocabulary = list(model.wv.key_to_index)
# Embedding matrix with one row per ingredient, aligned with `vocabulary`;
# indexing the KeyedVectors with a list of keys stacks their vectors.
X = model.wv[vocabulary]

In [40]:
# Number of ingredient clusters to extract from the embedding space.
N_CLUSTERS = 100

# K-means starts from random centroids, so without a fixed `random_state` the
# cluster ids and their contents change on every run. `n_init` is set
# explicitly because its default differs between scikit-learn versions.
kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=42)

# clusters[i] is the cluster label assigned to row i of X.
clusters = kmeans.fit_predict(X)

In [42]:
# Map each cluster label to the list of ingredients assigned to it.
# `clusters` holds one label per entry of `vocabulary`, in the same order,
# so the two sequences can be walked in lockstep.
K = defaultdict(list)
for token, label in zip(vocabulary, clusters):
    K[label].append(token)

In [43]:
# Dump every cluster: a header line, its comma-separated ingredients, and a
# separator rule.
for label, members in K.items():
    print(f"Cluster {label}")
    print(*members, sep=", ")
    print("=" * 10)

Cluster 8
salt, kosher salt, potatoes, sea salt, coarse salt, fine sea salt, russet potatoes, yukon gold potatoes, baking potatoes, boiling potatoes
Cluster 52
onions, purple onion, yellow onion, white onion, sweet onion, diced onions
Cluster 18
olive oil, extra-virgin olive oil, fresh basil, red wine vinegar, balsamic vinegar, capers, fresh basil leaves, cherry tomatoes, fresh oregano, pinenuts, yellow bell pepper, basil, roasted red peppers, grape tomatoes, pitted kalamata olives, cannellini beans, kalamata, anchovy fillets, arugula, goat cheese, sun-dried tomatoes, olives, artichoke hearts, Italian parsley leaves, pesto, pizza doughs, orzo, chees fresh mozzarella, anchovy paste, fresh oregano leaves, artichok heart marin, olive oil flavored cooking spray, greek seasoning
Cluster 6
water, sauce, greens, rolls, herbs, marinade, agave nectar, pork loin, sirloin steak, steak, salad, liquid, pepper flakes, wine, boneless chicken thighs, pork loin chops, pineapple chunks, grated carrot, l