# Exploration of Recipe1M+ using clustering
For the subset of recipes used in this example, see the [Recipe1M+](http://pic2recipe.csail.mit.edu/) dataset.
> Marin, J., Biswas, A., Ofli, F., Hynes, N., Salvador, A., Aytar, Y., ... & Torralba, A. (2019). Recipe1m+: A dataset for learning cross-modal embeddings for cooking recipes and food images. IEEE transactions on pattern analysis and machine intelligence, 43(1), 187-203.

See the extracted dataset sample [here](https://unimi2013.sharepoint.com/:u:/s/InformationRetrieval/EaL7kid2qzdCmAA8RO-m5iQBsvCl5cuNIdn0rsJN1FUhSg?e=fdXkkB)

In [None]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [None]:
import os
import nltk

In [None]:
# Load every plain-text recipe found in the sample folder into memory.
folder = "/Users/flint/Data/recipe/text-sample/"
# os.listdir() returns entries in arbitrary order; sorting keeps the position
# of each recipe (and therefore every downstream row index) stable across runs.
files = sorted(f for f in os.listdir(folder) if f.endswith('.txt'))
recipes = []
for file in files:
    # Explicit encoding so decoding does not depend on the platform default.
    # NOTE(review): assumes the sample files are UTF-8 - confirm against the data.
    with open(os.path.join(folder, file), 'r', encoding='utf-8') as data:
        recipes.append(data.read())

## Extract ingredients to create a pseudo doc with the ingredients only

In [None]:
import re
from string import punctuation

In [None]:
def ingredient_tokenizer(recipe, pattern=r'-(.*?)\n'):
    """Extract punctuation-free ingredient tokens from a recipe text.

    Each result of ``re.findall(pattern, recipe)`` is treated as one
    ingredient line. The line is split on ``', '``; for the first piece only
    the text after the last ``'of '`` is kept, the remaining pieces are kept
    whole. Every character listed in ``string.punctuation`` is then removed
    from each piece. Surrounding whitespace is not stripped.

    Args:
        recipe: Full text of a single recipe.
        pattern: Regular expression used to locate ingredient lines.

    Returns:
        A flat list of token strings, in order of appearance.
    """
    # One translation table that deletes every punctuation character.
    drop_punctuation = str.maketrans('', '', punctuation)
    line_regex = re.compile(pattern)
    ingredients = []
    for line in line_regex.findall(recipe):
        head, *extras = line.split(', ')
        # Keep only what follows the last 'of ' (e.g. "1 cup of flour" -> "flour").
        pieces = [head.split('of ')[-1], *extras]
        ingredients.extend(piece.translate(drop_punctuation) for piece in pieces)
    return ingredients

In [None]:
# One list of ingredient tokens per recipe, in the same order as `recipes`.
corpus = list(map(ingredient_tokenizer, recipes))

## Vectorize
Try vectorizing the documents by TfIdf and by LDA

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# Build TF-IDF vectors from the ingredient pseudo-documents.
# Each recipe's tokens are joined into one space-separated string and split
# again on whitespace by the vectorizer, so multi-word ingredients contribute
# one feature per word.
# `str.split` replaces the equivalent lambda (a lambda makes the fitted
# vectorizer unpicklable); `token_pattern=None` silences scikit-learn's warning
# that the default pattern is ignored when a custom tokenizer is supplied.
tfidf_vec = TfidfVectorizer(tokenizer=str.split, token_pattern=None)
# Densified because the matrix is reused by the downstream cells as a plain array.
tfidf = tfidf_vec.fit_transform([" ".join(x) for x in corpus]).toarray()

In [None]:
# Fit a 20-topic LDA model and represent every document by its topic mixture.
# random_state is fixed because LDA starts from a random initialisation;
# without a seed the topics (and their numbering) differ on every run.
# NOTE(review): the model is fitted on TF-IDF weights, as in the original
# notebook; scikit-learn's own LDA examples use raw term counts - worth confirming.
lda_vec = LatentDirichletAllocation(n_components=20, random_state=0)
lda = lda_vec.fit_transform(tfidf)

### Explore topics

In [None]:
# Collect, for every LDA topic, its highest-weighted vocabulary terms into a
# long-format table with one row per (topic, word) pair.
n_top_words = 6
feature_names = tfidf_vec.get_feature_names_out()
description = []
for topic_idx, topic in enumerate(lda_vec.components_):
    # argsort is ascending, so walk backwards over the last n_top_words indices.
    top_features_ind = topic.argsort()[: -n_top_words - 1 : -1]
    weights = topic[top_features_ind]
    top_features = [feature_names[i] for i in top_features_ind]
    description.extend(
        {
            'topic': "T{}".format(topic_idx),
            'word': word,
            'score': round(weight, 3),
        }
        for word, weight in zip(top_features, weights)
    )
T = pd.DataFrame(description)

In [None]:
# Show the top words of the first topic.
T.loc[T['topic'] == 'T0']

### Visualize documents

In [None]:
from sklearn.decomposition import PCA

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Project both document representations onto their first two principal
# components so they can be drawn as 2-D scatter plots.
# random_state is fixed so the projection is reproducible when PCA selects a
# randomized SVD solver (it has no effect with the exact solver).
pca_tfidf = PCA(n_components=2, random_state=0)
Xtfidf = pca_tfidf.fit_transform(tfidf)
pca_lda = PCA(n_components=2, random_state=0)
Xlda = pca_lda.fit_transform(lda)

In [None]:
# Side-by-side scatter plots of the 2-D projections, one panel per representation.
fig, ax = plt.subplots(ncols=2, figsize=(14, 7))
for axis, points, title in zip(ax, (Xtfidf, Xlda), ('TfIdf', 'LDA')):
    axis.scatter(points[:, 0], points[:, 1], alpha=0.2, c='#999999')
    axis.set_title(title)
plt.tight_layout()
plt.show()

## Clustering

In [None]:
from sklearn.cluster import KMeans, AgglomerativeClustering

In [None]:
# Clustering algorithms to compare: display name -> (estimator class, keyword arguments).
models = {
    # KMeans starts from randomly chosen centroids; the fixed seed makes the
    # resulting cluster labels reproducible between runs.
    'Kmeans': (KMeans, {'n_clusters': 10, 'random_state': 0}),
    'Agglomerative': (AgglomerativeClustering, {'n_clusters': 10})
}

In [None]:
# Materialise the (name, (estimator class, params)) pairs to iterate over.
run = [*models.items()]

In [None]:
# Run every configured clustering model on the TF-IDF document vectors and
# store the predicted cluster label of each document under the model's name.
tfidf_clustering = {}
for k, (model, params) in tqdm(run):
    # Fresh estimator built from its configured keyword arguments.
    m = model(**params)
    tfidf_clustering[k] = m.fit_predict(tfidf)

In [None]:
# Run every configured clustering model on the LDA topic-mixture vectors and
# store the predicted cluster label of each document under the model's name.
lda_clustering = {}
for k, (model, params) in tqdm(run):
    # Fresh estimator built from its configured keyword arguments.
    m = model(**params)
    lda_clustering[k] = m.fit_predict(lda)

In [None]:
# For each clustering model, redraw both 2-D projections with the points
# coloured by the cluster labels obtained on the matching representation.
for model in models:
    fig, ax = plt.subplots(ncols=2, figsize=(14, 7))
    fig.suptitle(model, fontsize=16)
    panels = (
        (ax[0], Xtfidf, tfidf_clustering[model], 'TfIdf'),
        (ax[1], Xlda, lda_clustering[model], 'LDA'),
    )
    for axis, points, labels, title in panels:
        axis.scatter(points[:, 0], points[:, 1], alpha=0.2, c=labels)
        axis.set_title(title)
    plt.tight_layout()
    plt.show()

## Cluster interpretation
**Exercise:** describe clusters by selecting the **top 10 specific and representative** features of each cluster.

In [None]:
from collections import defaultdict

In [None]:
# Cluster label assigned to each document by the K-means run on the TF-IDF vectors.
assignment = tfidf_clustering['Kmeans']
assignment