# Restaurant-Menu-Analysis
## Clustering Word Embeddings of Restaurant Brands' Menu Item

### Clone Repository and Set Directory

In [3]:
# Clone this repository
!git clone https://github.com/alexdseo/Restaurant-Menu-Analysis

fatal: destination path 'Restaurant-Menu-Analysis' already exists and is not an empty directory.


In [4]:
# Change directory
%cd Restaurant-Menu-Analysis

/content/Restaurant-Menu-Analysis


### Install and Import Libraries

In [None]:
# @title
# pip install libraries
!pip install -U sentence-transformers
!pip install umap-learn
!pip install hdbscan
!pip install sister
#!python -m spacy download en_core_web_lg

In [189]:
# Import Libraries
#Basic
import torch
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity
#Embeddings
from sentence_transformers import SentenceTransformer # SBERT
#from gensim.models import FastText
#from transformers import AutoModel, AutoTokenizer
import sister #FastText
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
#import spacy
#from gensim.corpora import Dictionary
#from gensim.models.tfidfmodel import TfidfModel
#from gensim.matutils import sparse2full
#Dimensionality Reduction
import umap
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
#Clustering
import hdbscan
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.mixture import GaussianMixture
#Visulaization
import matplotlib.pyplot as plt
import plotly.express as px

### Read pre-processed dataset

In [5]:
#Preprocessed data
df = pd.read_csv('MenuItem_cleaned.csv')
df_list = df['Original food Item'].to_list()

## Embedding models
- SBERT
- FastText
- TF-IDF

In [6]:
# Set seed
np.random.seed(1996)
# SBERT
sbert = SentenceTransformer('all-MiniLM-L6-v2') #default max seq length =256 #Truncate sentence length over 256: 7 cases
# FastText
ft = sister.MeanEmbedding(lang="en") #Average of word embeddings calculated from FastText

Loading model...




In [7]:
# SBERT embeddings
embeddings_sbert = sbert.encode(df['Original food Item'])

In [8]:
# FastTexct embeddings
embeddings_ft = ft(df_list[0])
for i in range(1, len(df)):
  embedding = ft(df_list[i])
  embeddings_ft = np.vstack((embeddings_ft, embedding))

In [9]:
# TF-IDF
tfidf = TfidfVectorizer()
embeddings_tfidf = tfidf.fit_transform(df_list).toarray()

In [10]:
print(f'SBERT embedding shape: {embeddings_sbert.shape}')
print(f'FastText embedding shape: {embeddings_ft.shape}')
print(f'TF-IDF embedding shape: {embeddings_tfidf.shape}')

SBERT embedding shape: (308, 384)
FastText embedding shape: (308, 300)
TF-IDF embedding shape: (308, 2152)


## Dimensionality reduction methods
- PCA
- t-SNE
- UMAP

#### SBERT

In [11]:
# PCA
sbert_pca = PCA(n_components=2).fit_transform(embeddings_sbert)
# t-SNE
sbert_tsne = TSNE(n_components=2, learning_rate='auto', init='pca').fit_transform(embeddings_sbert)
# UMAP
sbert_umap = umap.UMAP(n_neighbors=5, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings_sbert)



#### FastText

In [12]:
# PCA
ft_pca = PCA(n_components=2).fit_transform(embeddings_ft)
# t-SNE
ft_tsne = TSNE(n_components=2, learning_rate='auto', init='pca').fit_transform(embeddings_ft)
# UMAP
ft_umap = umap.UMAP(n_neighbors=5, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings_ft)



#### TF-IDF

In [13]:
# PCA
tfidf_pca = PCA(n_components=2).fit_transform(embeddings_tfidf)
# t-SNE
tfidf_tsne = TSNE(n_components=2, learning_rate='auto', init='pca').fit_transform(embeddings_tfidf)
# UMAP
tfidf_umap = umap.UMAP(n_neighbors=5, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings_tfidf)



## Clustering
- k-means
- Gaussian Mixture Models
- HDBSCAN

#### SBERT

In [17]:
#K-means
#Set score to update
best_k = 0
best_score = 0
#Choose k from 5-10
k = np.arange(5, 10)

#Use all dataset
for i in k:
    kmeans = KMeans(n_clusters=i, random_state=1996).fit(sbert_pca) # Run for sbert_tsne and sbert_umap too
    #pick silhouette score
    sil_score = silhouette_score(sbert_pca, kmeans.labels_)
    #sil_score = 1 means clusters are well apart
    if sil_score > best_score:
        best_score = sil_score
        best_k = i

print('Chosen k:', best_k)
print('Silhouette score for chosen k:', best_score)

Chosen k: 9
Silhouette score for chosen k: 0.47177288


In [19]:
# Gaussian Mixture model
#Set score to update
best_k = 0
best_score = 0
#Choose k from 5-10
k = np.arange(5, 10)

#Use all dataset
for i in k:
    gmm = GaussianMixture(n_components=i, random_state=1996).fit_predict(sbert_pca) # Run for sbert_tsne and sbert_umap too
    #pick silhouette score
    sil_score = silhouette_score(sbert_pca, gmm)
    #sil_score = 1 means clusters are well apart
    if sil_score > best_score:
        best_score = sil_score
        best_k = i

print('Chosen k:', best_k)
print('Silhouette score for chosen k:', best_score)

Chosen k: 8
Silhouette score for chosen k: 0.39172366


In [27]:
# Run clustering models with fine tuned hyperparameter
#k-means
sbert_pca_kmeans = KMeans(n_clusters=5, random_state=1996).fit_predict(sbert_pca)
sbert_tsne_kmeans = KMeans(n_clusters=9, random_state=1996).fit_predict(sbert_tsne)
sbert_umap_kmeans = KMeans(n_clusters=9, random_state=1996).fit_predict(sbert_umap)
#GMM
sbert_pca_gmm = GaussianMixture(n_components=6, random_state=1996).fit_predict(sbert_pca)
sbert_tsne_gmm = GaussianMixture(n_components=5, random_state=1996).fit_predict(sbert_tsne)
sbert_umap_gmm = GaussianMixture(n_components=8, random_state=1996).fit_predict(sbert_umap)
#HDBSCAN
sbert_pca_hdbscan = hdbscan.HDBSCAN(min_cluster_size=5, metric='l2', cluster_selection_epsilon=0.5, min_samples=1).fit(sbert_pca)
sbert_tsne_hdbscan = hdbscan.HDBSCAN(min_cluster_size=5, metric='l2', cluster_selection_epsilon=0.5, min_samples=1).fit(sbert_tsne)
sbert_umap_hdbscan = hdbscan.HDBSCAN(min_cluster_size=5, metric='l2', cluster_selection_epsilon=0.5, min_samples=1).fit(sbert_umap)

#### FastText

In [23]:
#K-means
#Set score to update
best_k = 0
best_score = 0
#Choose k from 5-10
k = np.arange(5, 10)

#Use all dataset
for i in k:
    kmeans = KMeans(n_clusters=i, random_state=1996).fit(ft_pca) # Run for ft_tsne and ft_umap too
    #pick silhouette score
    sil_score = silhouette_score(ft_pca, kmeans.labels_)
    #sil_score = 1 means clusters are well apart
    if sil_score > best_score:
        best_score = sil_score
        best_k = i

print('Chosen k:', best_k)
print('Silhouette score for chosen k:', best_score)

Chosen k: 7
Silhouette score for chosen k: 0.37244686


In [26]:
# Gaussian Mixture model
#Set score to update
best_k = 0
best_score = 0
#Choose k from 5-10
k = np.arange(5, 10)

#Use all dataset
for i in k:
    gmm = GaussianMixture(n_components=i, random_state=1996).fit_predict(ft_pca) # Run for frt_tsne and ft_umap too
    #pick silhouette score
    sil_score = silhouette_score(ft_pca, gmm)
    #sil_score = 1 means clusters are well apart
    if sil_score > best_score:
        best_score = sil_score
        best_k = i

print('Chosen k:', best_k)
print('Silhouette score for chosen k:', best_score)

Chosen k: 9
Silhouette score for chosen k: 0.4492877


In [28]:
# Run clustering models with fine tuned hyperparameter
#k-means
ft_pca_kmeans = KMeans(n_clusters=5, random_state=1996).fit_predict(ft_pca)
ft_tsne_kmeans = KMeans(n_clusters=7, random_state=1996).fit_predict(ft_tsne)
ft_umap_kmeans = KMeans(n_clusters=9, random_state=1996).fit_predict(ft_umap)
#GMM
ft_pca_gmm = GaussianMixture(n_components=5, random_state=1996).fit_predict(ft_pca)
ft_tsne_gmm = GaussianMixture(n_components=6, random_state=1996).fit_predict(ft_tsne)
ft_umap_gmm = GaussianMixture(n_components=9, random_state=1996).fit_predict(ft_umap)
#HDBSCAN
ft_pca_hdbscan = hdbscan.HDBSCAN(min_cluster_size=5, metric='l2', cluster_selection_epsilon=0.5, min_samples=1).fit(ft_pca)
ft_tsne_hdbscan = hdbscan.HDBSCAN(min_cluster_size=5, metric='l2', cluster_selection_epsilon=0.5, min_samples=1).fit(ft_tsne)
ft_umap_hdbscan = hdbscan.HDBSCAN(min_cluster_size=5, metric='l2', cluster_selection_epsilon=0.5, min_samples=1).fit(ft_umap)

#### TF-IDF

In [31]:
#K-means
#Set score to update
best_k = 0
best_score = 0
#Choose k from 5-10
k = np.arange(5, 10)

#Use all dataset
for i in k:
    kmeans = KMeans(n_clusters=i, random_state=1996).fit(tfidf_pca) # Run for tfidf_tsne and tfidf_umap too
    #pick silhouette score
    sil_score = silhouette_score(tfidf_pca, kmeans.labels_)
    #sil_score = 1 means clusters are well apart
    if sil_score > best_score:
        best_score = sil_score
        best_k = i

print('Chosen k:', best_k)
print('Silhouette score for chosen k:', best_score)

Chosen k: 8
Silhouette score for chosen k: 0.45593092


In [34]:
# Gaussian Mixture model
#Set score to update
best_k = 0
best_score = 0
#Choose k from 5-10
k = np.arange(5, 10)

#Use all dataset
for i in k:
    gmm = GaussianMixture(n_components=i, random_state=1996).fit_predict(tfidf_pca) # Run for tfidf_tsne and tfidf_umap too
    #pick silhouette score
    sil_score = silhouette_score(tfidf_pca, gmm)
    #sil_score = 1 means clusters are well apart
    if sil_score > best_score:
        best_score = sil_score
        best_k = i

print('Chosen k:', best_k)
print('Silhouette score for chosen k:', best_score)

Chosen k: 8
Silhouette score for chosen k: 0.42113218


In [35]:
# Run clustering models with fine tuned hyperparameter
#k-means
tfidf_pca_kmeans = KMeans(n_clusters=7, random_state=1996).fit_predict(tfidf_pca)
tfidf_tsne_kmeans = KMeans(n_clusters=7, random_state=1996).fit_predict(tfidf_tsne)
tfidf_umap_kmeans = KMeans(n_clusters=8, random_state=1996).fit_predict(tfidf_umap)
#GMM
tfidf_pca_gmm = GaussianMixture(n_components=5, random_state=1996).fit_predict(tfidf_pca)
tfidf_tsne_gmm = GaussianMixture(n_components=5, random_state=1996).fit_predict(tfidf_tsne)
tfidf_umap_gmm = GaussianMixture(n_components=8, random_state=1996).fit_predict(tfidf_umap)
#HBDSCAN
tfidf_pca_hdbscan = hdbscan.HDBSCAN(min_cluster_size=5, metric='l2', cluster_selection_epsilon=0.5, min_samples=1).fit(tfidf_pca)
tfidf_tsne_hdbscan = hdbscan.HDBSCAN(min_cluster_size=5, metric='l2', cluster_selection_epsilon=0.5, min_samples=1).fit(tfidf_tsne)
tfidf_umap_hdbscan = hdbscan.HDBSCAN(min_cluster_size=5, metric='l2', cluster_selection_epsilon=0.5, min_samples=1).fit(tfidf_umap)

## Evaluation

In [161]:
#Make df for every cases total 27 cases with 3 embedding models, 3 dimensionality reduction tecniques, 3 clustering model
#SBERT
#pca
sbert_pca_df = pd.DataFrame(sbert_pca, columns=['x', 'y'])
sbert_pca_df['kmeans_labels'] = sbert_pca_kmeans
sbert_pca_df['gmm_labels'] = sbert_pca_gmm
sbert_pca_df['hdbscan_labels'] = sbert_pca_hdbscan.labels_#2 clusters
sbert_pca_df['Name'] = df['Restaurant Name']
#tsne
sbert_tsne_df = pd.DataFrame(sbert_tsne, columns=['x', 'y'])
sbert_tsne_df['kmeans_labels'] = sbert_tsne_kmeans
sbert_tsne_df['gmm_labels'] = sbert_tsne_gmm
sbert_tsne_df['hdbscan_labels'] = sbert_tsne_hdbscan.labels_#22 clusters
sbert_tsne_df['Name'] = df['Restaurant Name']
#umap
sbert_umap_df = pd.DataFrame(sbert_umap, columns=['x', 'y'])
sbert_umap_df['kmeans_labels'] = sbert_umap_kmeans
sbert_umap_df['gmm_labels'] = sbert_umap_gmm
sbert_umap_df['hdbscan_labels'] = sbert_umap_hdbscan.labels_#9 clusters
sbert_umap_df['Name'] = df['Restaurant Name']

In [162]:
#FastText
#pca
ft_pca_df = pd.DataFrame(ft_pca, columns=['x', 'y'])
ft_pca_df['kmeans_labels'] = ft_pca_kmeans
ft_pca_df['gmm_labels'] = ft_pca_gmm
ft_pca_df['hdbscan_labels'] = ft_pca_hdbscan.labels_# 2 clusters
ft_pca_df['Name'] = df['Restaurant Name']
#tsne
ft_tsne_df = pd.DataFrame(ft_tsne, columns=['x', 'y'])
ft_tsne_df['kmeans_labels'] = ft_tsne_kmeans
ft_tsne_df['gmm_labels'] = ft_tsne_gmm
ft_tsne_df['hdbscan_labels'] = ft_tsne_hdbscan.labels_#20 clusters
ft_tsne_df['Name'] = df['Restaurant Name']
#umap
ft_umap_df = pd.DataFrame(ft_umap, columns=['x', 'y'])
ft_umap_df['kmeans_labels'] = ft_umap_kmeans
ft_umap_df['gmm_labels'] = ft_umap_gmm
ft_umap_df['hdbscan_labels'] = ft_umap_hdbscan.labels_#8 clusters #no noise
ft_umap_df['Name'] = df['Restaurant Name']

In [163]:
#TF-IDF
#pca
tfidf_pca_df = pd.DataFrame(tfidf_pca, columns=['x', 'y'])
tfidf_pca_df['kmeans_labels'] = tfidf_pca_kmeans
tfidf_pca_df['gmm_labels'] = tfidf_pca_gmm
tfidf_pca_df['hdbscan_labels'] = tfidf_pca_hdbscan.labels_#2 clusters
tfidf_pca_df['Name'] = df['Restaurant Name']
#tsne
tfidf_tsne_df = pd.DataFrame(tfidf_tsne, columns=['x', 'y'])
tfidf_tsne_df['kmeans_labels'] = tfidf_tsne_kmeans
tfidf_tsne_df['gmm_labels'] = tfidf_tsne_gmm
tfidf_tsne_df['hdbscan_labels'] = tfidf_tsne_hdbscan.labels_ #25 clusters
tfidf_tsne_df['Name'] = df['Restaurant Name']
#umap
tfidf_umap_df = pd.DataFrame(tfidf_umap, columns=['x', 'y'])
tfidf_umap_df['kmeans_labels'] = tfidf_umap_kmeans
tfidf_umap_df['gmm_labels'] = tfidf_umap_gmm
tfidf_umap_df['hdbscan_labels'] = tfidf_umap_hdbscan.labels_ #14 clusters
tfidf_umap_df['Name'] = df['Restaurant Name']

In [436]:
#From this cell to next 6 cells, iterate through all the cases
#Check numbers of clusters for each cases
np.unique(tfidf_pca_kmeans)

array([0, 1, 2, 3, 4, 5, 6], dtype=int32)

In [422]:
#Save each cluster's index for all cases(27cases)
labels = pd.Series(tfidf_umap_hdbscan.labels_)
cluster_0 = labels.index[labels==0]
cluster_1 = labels.index[labels==1]
cluster_2 = labels.index[labels==2]
cluster_3 = labels.index[labels==3]
cluster_4 = labels.index[labels==4]
cluster_5 = labels.index[labels==5]
cluster_6 = labels.index[labels==6]
cluster_7 = labels.index[labels==7]
cluster_8 = labels.index[labels==8]
cluster_9 = labels.index[labels==9]
cluster_10 = labels.index[labels==10]
cluster_11 = labels.index[labels==11]
cluster_12 = labels.index[labels==12]
cluster_13 = labels.index[labels==13]
cluster_14 = labels.index[labels==14]
cluster_15 = labels.index[labels==-1]

In [423]:
#Save each cluster's index for all cases(27cases)
clusters=[cluster_0, cluster_1, cluster_2, cluster_3, cluster_4, cluster_5, cluster_6, cluster_7, cluster_8, cluster_9, cluster_10, cluster_11, cluster_12,
          cluster_13, cluster_14, cluster_15]

In [424]:
#Calculate cosine similarity for all cluster
clustering_quality=[]
for c in clusters:
  clustering_quality.append((cosine_similarity(embeddings_sbert[c]).sum() - len(c)) / (len(c)*len(c)-1))

In [425]:
clustering_quality

[0.5931564172108968,
 0.3180164761013455,
 0.3653745940237334,
 0.49824537549700054,
 0.4637500239353554,
 0.441962650844029,
 0.34545103708902997,
 0.3950981193489128,
 0.35609258015950523,
 0.3550162300605892,
 0.5544767379760742,
 0.3359201431274414,
 0.3512550337874108,
 0.4187391371954055,
 0.4187368301146808,
 0.271366452413892]

In [426]:
#Calculate pairwise distance using cosine similiarity within each clusters and average them by each cases
tfidf_umap_hdbscan_q=np.mean(clustering_quality)

In [411]:
#Make dataframe with all models used and the clustering quality
overall = pd.DataFrame(columns=['Embedding model', 'Dimension reduction model', 'Clustering model', 'Quality'])

In [412]:
overall['Embedding model']=['SBERT','SBERT','SBERT','SBERT','SBERT','SBERT','SBERT','SBERT','SBERT',
                            'FastText','FastText','FastText','FastText','FastText','FastText','FastText','FastText','FastText',
                            'TF-IDF','TF-IDF','TF-IDF','TF-IDF','TF-IDF','TF-IDF','TF-IDF','TF-IDF','TF-IDF']

In [414]:
overall['Dimension reduction model']=['PCA','PCA','PCA','t-SNE','t-SNE','t-SNE','UMAP','UMAP','UMAP',
                                      'PCA','PCA','PCA','t-SNE','t-SNE','t-SNE','UMAP','UMAP','UMAP',
                                      'PCA','PCA','PCA','t-SNE','t-SNE','t-SNE','UMAP','UMAP','UMAP']

In [416]:
overall['Clustering model']=['k-means','GMM','HDBSCAN','k-means','GMM','HDBSCAN','k-means','GMM','HDBSCAN',
                             'k-means','GMM','HDBSCAN','k-means','GMM','HDBSCAN','k-means','GMM','HDBSCAN',
                             'k-means','GMM','HDBSCAN','k-means','GMM','HDBSCAN','k-means','GMM','HDBSCAN']

In [444]:
overall['Quality']=[sbert_pca_kmeans_q, sbert_pca_gmm_q, sbert_pca_hdbscan_q, sbert_tsne_kmeans_q, sbert_tsne_gmm_q, sbert_tsne_hdbscan_q, sbert_umap_kmeans_q, sbert_umap_gmm_q, sbert_umap_hdbscan_q,
                    ft_pca_kmeans_q, ft_pca_gmm_q, ft_pca_hdbscan_q, ft_tsne_kmeans_q, ft_tsne_gmm_q, ft_tsne_hdbscan_q, ft_umap_kmeans_q, ft_umap_gmm_q, ft_umap_hdbscan_q,
                    tfidf_pca_kmeans_q, tfidf_pca_gmm_q, tfidf_pca_hdbscan_q, tfidf_tsne_kmeans_q, tfidf_tsne_gmm_q, tfidf_tsne_hdbscan_q, tfidf_umap_kmeans_q, tfidf_umap_gmm_q, tfidf_umap_hdbscan_q] 


## Overall Results

In [445]:
overall

Unnamed: 0,Embedding model,Dimension reduction model,Clustering model,Quality
0,SBERT,PCA,k-means,0.420402
1,SBERT,PCA,GMM,0.425004
2,SBERT,PCA,HDBSCAN,0.340478
3,SBERT,t-SNE,k-means,0.408841
4,SBERT,t-SNE,GMM,0.371779
5,SBERT,t-SNE,HDBSCAN,0.38342
6,SBERT,UMAP,k-means,0.4259
7,SBERT,UMAP,GMM,0.419227
8,SBERT,UMAP,HDBSCAN,0.491033
9,FastText,PCA,k-means,0.310573


## Visualization

In [470]:
# Visualize SBERT Clustering result with higest quality
fig = px.scatter(sbert_umap_df, x="x", y="y", color="hdbscan_labels", title='SBERT - UMAP - HDBSCAN',
                 width=1000,height =800)
fig.show()

In [466]:
# Visualize FastText Clustering result with higest quality
fig = px.scatter(ft_umap_df, x="x", y="y", color="hdbscan_labels", title='FastText - UMAP - HDBSCAN',
                 width=1000,height =800)
fig.show()

In [469]:
# Visualize TF-IDF Clustering result with higest quality
fig = px.scatter(tfidf_pca_df, x="x", y="y", color="kmeans_labels", title='TF-IDF - PCA - k-means',
                 width=1000,height =800)
fig.show()