In [None]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import os

## 1. Data Loading

In [None]:
ingredients = pd.read_csv('..\data\chefkoch22k\ingredients_with_titles.csv')

## 2. EDA

In [4]:
# Basic statistics
print(ingredients.info())
ingredients.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22137 entries, 0 to 22136
Data columns (total 47 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   RecipID      22137 non-null  int64 
 1   1            22137 non-null  object
 2   2            22132 non-null  object
 3   3            22080 non-null  object
 4   4            21895 non-null  object
 5   5            21465 non-null  object
 6   6            20576 non-null  object
 7   7            19291 non-null  object
 8   8            17627 non-null  object
 9   9            15575 non-null  object
 10  10           13295 non-null  object
 11  11           10880 non-null  object
 12  12           8670 non-null   object
 13  13           6658 non-null   object
 14  14           4994 non-null   object
 15  15           3678 non-null   object
 16  16           2665 non-null   object
 17  17           1900 non-null   object
 18  18           1364 non-null   object
 19  19           989 non-null

Unnamed: 0,RecipID,1,2,3,4,5,6,7,8,9,...,37,38,39,40,41,42,43,44,45,RecipeTitle
0,6971800010,Cachaça,Zucker,Limette,Eis,,,,,,...,,,,,,,,,,Caipirinha
1,7971878818,Wodka,Gin,Rum,Likör,Ananassaft,Grapefruitsaft,Zitronensaft,Grenadine,Guaranapulver,...,,,,,,,,,,Pangalaktischer Donnergurgler
2,23973004603,Petersilie,Chilischote,Knoblauchzehe,Garnele,Olivenöl,,,,,...,,,,,,,,,,Tapas-Garnelen in Knoblauchöl


In [5]:
# Check number of NaN values in 'RecipeTitle' column
print('There are\n', ingredients['RecipeTitle'].isnull().sum(), '\nNaN values in the RecipeTitle column')

There are
 26 
NaN values in the RecipeTitle column


In [6]:
# Check number of unique values in 'RecipeTitle' column
print('There are\n', ingredients['RecipeTitle'].nunique(), '(out of', ingredients['RecipeTitle'].count(), 'total),\nunique values in the RecipeTitle column')

There are
 20476 (out of 22111 total),
unique values in the RecipeTitle column


In [7]:
# Values and numbers of unique values in 'RecipeTitle' column that appear at least 5 times
# The Goal here is to get more images per recipe later on
print('Values and numbers of unique values in RecipeTitle column that appear at least 5 times:\n', ingredients['RecipeTitle'].value_counts()[ingredients['RecipeTitle'].value_counts() > 4])

Values and numbers of unique values in RecipeTitle column that appear at least 5 times:
 RecipeTitle
Pizzateig                  40
Quiche Lorraine            19
Szegediner Gulasch         17
Pizza                      15
Ratatouille                13
                           ..
Spätzle                     5
Griechischer Nudelsalat     5
Shepherds Pie               5
Griechischer Salat          5
Porridge                    5
Name: count, Length: 84, dtype: int64


> Using exact title matching we would get 84 recipes with at least 5 images (=examples) per recipe.
>
> To better handle slightly varying titles, we can use embeddings of the titles and cluster them to find similar titles.

In [15]:
# check the number of unique values in 'RecipeTitle' column with non case sensitive	
print('There are\n', ingredients['RecipeTitle'].str.lower().nunique(), '\nunique values in the RecipeTitle column with non case sensitive')

# check the number of unique values in 'RecipeTitle' column with non case sensitive and that appear at least 5 times
ingredients_lower = ingredients['RecipeTitle'].str.lower()
print('Values and numbers of unique values in RecipeTitle column with non case sensitive that appear at least 5 times:\n', ingredients_lower.value_counts()[ingredients_lower.value_counts() > 4])

There are
 20427 
unique values in the RecipeTitle column with non case sensitive
Values and numbers of unique values in RecipeTitle column with non case sensitive that appear at least 5 times:
 RecipeTitle
pizzateig                    40
quiche lorraine              19
szegediner gulasch           17
pizza                        15
ratatouille                  13
                             ..
saure kutteln                 5
pastitsio                     5
schwarzwälder kirschtorte     5
basler brunsli                5
schweizer wurstsalat          5
Name: count, Length: 92, dtype: int64


### 2.1 Better Title Grouping

In [9]:
# Save 'RecipeTitle' column to txt file, one title per line
# for batch-embedding via jina.ai online platform
ingredients = ingredients.dropna(subset=['RecipeTitle'])
ingredients['RecipeTitle'].to_csv('data\chefkoch22k\RecipeTitles.txt', index=False, header=False)

> txt file was batch embedded via https://jina.ai/embeddings/#apiform -> "BATCH JOB"
>
> csv with embedded titles received via mail

In [10]:
# load embeddings csv
embeddings = pd.read_csv('data\chefkoch22k\\20240428_152946_RecipeTitles_with_ids_embedding.csv', header=None)

embeddings.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,759,760,761,762,763,764,765,766,767,768
0,Caipirinha,0.120758,0.304606,0.056581,-0.270121,-0.245992,0.05924,0.055542,-0.071887,0.021748,...,0.408936,0.04894,-0.291239,-0.11613,-0.097061,-0.180908,0.062099,0.050821,0.054998,0.000848
1,Pangalaktischer Donnergurgler,-0.139787,0.226951,0.027728,-0.064323,-0.030274,-0.14573,-0.208995,0.055307,0.092607,...,0.189836,0.311412,-0.041982,0.037467,-0.015713,-0.001085,0.204901,0.084689,0.01198,0.150595


In [11]:
# add embeddings to ingridients dataframe

# Ensure the two DataFrames have the same number of rows
assert len(embeddings) == len(ingredients), "DataFrames have different number of rows!"

merged = pd.concat([ingredients, embeddings.iloc[:, 1:]], axis=1)

# rename the columns of the embeddings
col_rename = {i: f'EB_dim_{i}' for i in range(1, 769)}
merged.rename(columns=col_rename, inplace=True)

# save merged dataframe to csv
merged.to_csv('data\chefkoch22k\ingredients_with_titles_and_embeddings.csv', index=False)
merged.head(2)

KeyboardInterrupt: 

#### a) K-Means Clustering

In [None]:
# select only the embeddings columns
embeddings = merged.loc[:, 'EB_dim_1':]
embeddings.head(2)

Unnamed: 0,EB_dim_1,EB_dim_2,EB_dim_3,EB_dim_4,EB_dim_5,EB_dim_6,EB_dim_7,EB_dim_8,EB_dim_9,EB_dim_10,...,EB_dim_759,EB_dim_760,EB_dim_761,EB_dim_762,EB_dim_763,EB_dim_764,EB_dim_765,EB_dim_766,EB_dim_767,EB_dim_768
0,0.120758,0.304606,0.056581,-0.270121,-0.245992,0.05924,0.055542,-0.071887,0.021748,-0.161123,...,0.408936,0.04894,-0.291239,-0.11613,-0.097061,-0.180908,0.062099,0.050821,0.054998,0.000848
1,-0.139787,0.226951,0.027728,-0.064323,-0.030274,-0.14573,-0.208995,0.055307,0.092607,-0.010225,...,0.189836,0.311412,-0.041982,0.037467,-0.015713,-0.001085,0.204901,0.084689,0.01198,0.150595


In [None]:
# fit k-means clustering

# Remove rows with NaN values
embeddings = embeddings.dropna()

# check number of dropped rows
print('Number of dropped rows:', len(merged) - len(embeddings))

kmeans = KMeans(n_clusters=2000, random_state=0).fit(embeddings)

Number of dropped rows: 23


In [None]:
# show all values in 1st cluster

# drop rows with NaN values in embeddings
merged = merged.dropna(subset=(embeddings.columns.tolist()))

# add cluster column to merged dataframe
merged['cluster'] = kmeans.labels_

In [None]:
# show all titles in 1st cluster
print(merged[merged['cluster'] == 0]['RecipeTitle'])

34                                       Eier-Ragout
485                           Eier-Blumenkohl-Ragout
544                           Blumenkohl-Eier-Ragout
2652                           Ofencurry à la allure
2735                       Die weltbeste Schokotorte
6961                              Hefeteig für Pizza
7159                 Portugiesische Quittenmarmelade
7861      Hähnchen - Zucchini - Auflauf in Senfsahne
12135                     Griechische Thunfischpaste
13180    Kaninchenleber mit Rosmarin, natur gebraten
18944                Linsencurry mit Mango und Kokos
Name: RecipeTitle, dtype: object


> KMeans clustering was used to group the titles into 2000 clusters (number chosen by manual inspection of the clusters)
>
> -> Manual inspection of the clusters showed that the clusters were not very meaningful, trying different distance metrics

#### b) Cosine Similarity for Title Grouping

In [None]:
# sklearn cosine similarity for each data point to each other data point
cosine_sim = cosine_similarity(embeddings)

In [None]:
# create clusters based on cosine similarity
# set threshold
threshold = 0.99999999999999999999999999999999999

# create empty list to store the clusters
clusters = []


def get_cluster(data_point, cluster):
    # add data_point to cluster
    cluster.append(data_point)
    # get all data_points that are similar to data_point
    similar_points = np.where(cosine_sim[data_point] > threshold)[0]
    # remove data_point from similar_points
    similar_points = similar_points[similar_points != data_point]
    # remove all data_points that are already in the cluster
    similar_points = similar_points[~np.isin(similar_points, cluster)]
    # add all similar data_points to the cluster
    for point in similar_points:
        cluster = get_cluster(point, cluster)
    return cluster


# iterate over all data_points
for i in range(len(embeddings)):
    # check if data_point is already in a cluster
    if not any([i in cluster for cluster in clusters]):
        # create new cluster
        clusters.append(get_cluster(i, []))

# sort clusters by length
clusters = sorted(clusters, key=len, reverse=True)

# print number of clusters
print('Number of clusters:', len(clusters))

Number of clusters: 21355


In [None]:
# print values of first 5 clusters
for i in range(5):
    print('Cluster', i, ':', [merged.iloc[j]['RecipeTitle'] for j in clusters[i]])

Cluster 0 : ['Pizzateig', 'Pizzateig', 'Pizzateig', 'Pizzateig', 'Flammkuchen mit Quark', 'Tomatensoße für Pizza', 'Panna Cotta mit weißer Schokolade', 'Kichererbsen - Bulgur - Auflauf', 'Kartoffelpuffer sehr einfach', 'Pelmeni', 'Wirsingrouladen mit Lamm', 'Die Original Tiroler Schlipfkrapfen', 'Fladenbrotpizza', 'Lulas Recheadas I', 'Batatas douradas', 'Steirisches Wurzelfleisch mit Safterdäpfeln', 'Marinierte Wildschweinmedaillons', 'Westfälisches Zwiebelfleisch', 'Korsische Kartoffelpfanne', 'Feines Edelfisch - Ragout', 'Ungarische Beigel', 'Wackelpudding - Bowle', 'Weiße Espresso - Panna Cotta mit Himbeeren', 'Schlesische Kartoffelklöße', 'Mediterrane Gemüsepfanne mit Hähnchenbrustfilet', 'Hähnchenleber mit Schmorzwiebeln', 'Aprikosen - Brûlée', 'Nudeln mit Räucherlachs und Zitronenpesto', 'Lammhack - Kürbis Auflauf', 'Rinderfilet mit Polentasternen und Bohnen im Speckmantel', 'Spaghetti Bolognese', 'Putenoberkeule aus dem Ofen', 'Kräuterspätzle', 'Geschmorte Balsamico - Tomaten',

> Manual inspection: Cosine similarity seems to work better than KMeans clustering
>
> But this approach to clustering is not the best, trying sorting by cosine similarity

In [None]:
# sorting within the clusters by cosine similarity of its elements to the first element
sorted_clusters = []

for cluster in clusters:
    # get the first element of the cluster
    first_element = cluster[0]
    # get the cosine similarities of all elements in the cluster to the first element
    similarities = cosine_sim[first_element][cluster]
    # sort the cluster by the cosine similarities
    cluster = [x for _, x in sorted(zip(similarities, cluster), reverse=True)]
    sorted_clusters.append(cluster)

# print values of first 5 clusters
for i in range(5):
    print('Cluster', i, ':', [merged.iloc[j]['RecipeTitle'] for j in sorted_clusters[i]])

Cluster 0 : ['Pizzabrot mit Frischkäse, Zucchini und Mozzarella', 'Pizzabrot mit Frischkäse, Zucchini und Mozzarella', 'Pizzabrot mit Frischkäse, Zucchini und Mozzarella', 'Pizzabrot mit Frischkäse, Zucchini und Mozzarella', 'Pizzabrot mit Frischkäse, Zucchini und Mozzarella', 'Pizzabrot mit Frischkäse, Zucchini und Mozzarella', 'Pizzabrot mit Frischkäse, Zucchini und Mozzarella', 'Pizzabrot mit Frischkäse, Zucchini und Mozzarella', 'Pizzabrot mit Frischkäse, Zucchini und Mozzarella', 'Pizzabrot mit Frischkäse, Zucchini und Mozzarella', 'Pizzabrot mit Frischkäse, Zucchini und Mozzarella', 'Pizzabrot mit Frischkäse, Zucchini und Mozzarella', 'Pizzabrot mit Frischkäse, Zucchini und Mozzarella', 'Pizzabrot mit Frischkäse, Zucchini und Mozzarella', 'Pizzabrot mit Frischkäse, Zucchini und Mozzarella', 'Pizzabrot mit Frischkäse, Zucchini und Mozzarella', 'Pizzabrot mit Frischkäse, Zucchini und Mozzarella', 'Pizzabrot mit Frischkäse, Zucchini und Mozzarella', 'Pizzabrot mit Frischkäse, Zucchi