### Imports

In [45]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import random
import tqdm

import pandas as pd

### Read the dataset

In [4]:
df_recipies = pd.read_csv("./../dataset/recipes.csv")

print(f"Number of recipes: {len(df_recipies)}")
df_recipies.head()

Number of recipes: 231637


Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8


### Pre-processing the dataset

In [5]:
# Convert the ingredients to list
df_recipies["ingredients"] = df_recipies["ingredients"].str.strip("[]").str.replace("'","")

In [6]:
# Drop unnecessary columns
df_recipies.drop(["id", "contributor_id", "submitted", "tags", "nutrition"], axis=1, inplace=True)

In [7]:
df_recipies.head()

Unnamed: 0,name,minutes,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,55,11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"winter squash, mexican seasoning, mixed spice,...",7
1,a bit different breakfast pizza,30,9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"prepared pizza crust, sausage patty, eggs, mil...",6
2,all in the kitchen chili,130,6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"ground beef, yellow onions, diced tomatoes, to...",13
3,alouette potatoes,45,11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","spreadable cheese with garlic and herbs, new p...",11
4,amish tomato ketchup for canning,190,5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"tomato juice, apple cider vinegar, sugar, salt...",8


### Extract the ingredients vocab

In [25]:
ingredients_list = (
    df_recipies["ingredients"]
        .dropna()                 # drop missing cells
        .str.split(',')           # split each string on commas
        .explode()                # turn lists into rows
        .str.strip()              # trim spaces around each item
        .str.replace('"', "")     # replace " with nothing
        .loc[lambda s: s.ne('')]  # remove empty strings (if any)
        .tolist()                 # back to a single Python list
)
unique_ingredients_list = list(set(ingredients_list))
unique_ingredients_list

['liquid aloe vera',
 'garlic & herb salad dressing mix',
 'prepared brown mustard',
 'aubergine',
 'fat free monterey jack pepper cheese',
 'flat egg noodles',
 'whole grain fettuccine',
 'salted peanuts without skins',
 'red curry paste',
 'shell steaks',
 'bittersweet chocolate pieces',
 'freezer jam pectin',
 'johnsonville classic italian',
 'reduced-fat cream of mushroom soup',
 'sliced peaches in juice',
 'greek honey',
 'toasted sesame oil',
 'lemonade tea mix',
 'fresh cracked pepper',
 'salted pork tail',
 'whole grain dijon mustard',
 'dried birds eye chiles',
 'plain sweet biscuits',
 'apricots',
 'pineapple liqueur',
 'green chili',
 'quorn pieces',
 'homogenized milk',
 'asparagus spears',
 'mung dal',
 'green nori seaweed flakes',
 'shredded chicken with barbecue sauce',
 'sausage patty',
 'boneless beef shank',
 'vegetarian chicken strips',
 'tomatoes with basil',
 'heavy whipping cream',
 'black sticky rice',
 'reduced-sodium chicken flavor stuffing mix',
 'chocolate so

### Convert ingredients to index

In [26]:
ingredient2idx = {ing: i for i, ing in enumerate(unique_ingredients_list)}

print(f"Number of unique ingredients: {len(unique_ingredients_list)}")
ingredient2idx

Number of unique ingredients: 14902


{'liquid aloe vera': 0,
 'garlic & herb salad dressing mix': 1,
 'prepared brown mustard': 2,
 'aubergine': 3,
 'fat free monterey jack pepper cheese': 4,
 'flat egg noodles': 5,
 'whole grain fettuccine': 6,
 'salted peanuts without skins': 7,
 'red curry paste': 8,
 'shell steaks': 9,
 'bittersweet chocolate pieces': 10,
 'freezer jam pectin': 11,
 'johnsonville classic italian': 12,
 'reduced-fat cream of mushroom soup': 13,
 'sliced peaches in juice': 14,
 'greek honey': 15,
 'toasted sesame oil': 16,
 'lemonade tea mix': 17,
 'fresh cracked pepper': 18,
 'salted pork tail': 19,
 'whole grain dijon mustard': 20,
 'dried birds eye chiles': 21,
 'plain sweet biscuits': 22,
 'apricots': 23,
 'pineapple liqueur': 24,
 'green chili': 25,
 'quorn pieces': 26,
 'homogenized milk': 27,
 'asparagus spears': 28,
 'mung dal': 29,
 'green nori seaweed flakes': 30,
 'shredded chicken with barbecue sauce': 31,
 'sausage patty': 32,
 'boneless beef shank': 33,
 'vegetarian chicken strips': 34,
 '

### Extract ingredients and recipes

In [28]:
recipe_ingredients = df_recipies["ingredients"].tolist()
recipies = df_recipies["name"]

#### Test the extraction

In [29]:
recipies[10]

'berry  good sandwich spread'

In [30]:
recipe_ingredients[10]

'whole berry cranberry sauce, sour cream, prepared horseradish'

### Embedding model for ingredients

In [117]:
class RecipeEmbeddingModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, ingredient_indices):
        # Pad variable-length ingredient lists
        padded = nn.utils.rnn.pad_sequence(
            [torch.tensor(x) for x in ingredient_indices],
            batch_first=True
        )
        mask = (padded != 0)  # padding mask
        embeds = self.embedding(padded)

        # Mean pooling over ingredients
        masked_embeds = embeds * mask.unsqueeze(-1)
        recipe_embeds = masked_embeds.sum(1) / mask.sum(1, keepdim=True)

        # normalize for cosine similarity
        recipe_embeds = F.normalize(recipe_embeds, dim=-1)
        return recipe_embeds

### Contrastive loss definition

In [128]:
def contrastive_loss(batch_one, batch_two, temperature=0.2):
    batch_size = batch_one.size(0)
    z = torch.cat([batch_one, batch_two], dim=0)  # (2N, d)

    # Cosine similarity matrix
    sim = torch.matmul(z, z.T) / temperature
    sim.fill_diagonal_(-9e15)  # mask self-similarity

    # Positive pairs: i-th in first half with i-th in second half
    labels = torch.cat([torch.arange(batch_size) + batch_size,
                        torch.arange(batch_size)], dim=0).to(z.device)

    loss = F.cross_entropy(sim, labels)
    return loss

### Definition of the train run

In [129]:
def train(model, recipes, epochs=1, batch_size=4, lr=1e-3, device="cpu"):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    model.to(device)

    for epoch in range(epochs):
        random.shuffle(recipes)

        total_loss = 0
        for i in tqdm.tqdm(range(0, len(recipes), batch_size)):
            batch = recipes[i:i+batch_size]

            # Create augmented views (drop random ingredients for augmentation)
            def augment(recipe):
                if len(recipe) > 1:
                    keep = random.sample(recipe, k=max(1, len(recipe)-1))
                    return keep
                return recipe

            batch_i = [augment(r) for r in batch]
            batch_j = [augment(r) for r in batch]

            z_i = model(batch_i).to(device)
            z_j = model(batch_j).to(device)

            loss = contrastive_loss(z_i, z_j)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch+1}, Loss: {total_loss/len(recipes):.4f}")

### Execute the train run

In [131]:
vocab_size = len(unique_ingredients_list)
encoding_dim = 64

In [132]:
train_recipies = list()
for recipe_ingredient in recipe_ingredients:
    train_recipies.append([ingredient2idx[i.strip().replace('"', "")] for i in recipe_ingredient.split(',')])

In [133]:
model = RecipeEmbeddingModel(vocab_size, encoding_dim)

In [134]:
train(model, train_recipies, epochs=10, batch_size=8)

100%|██████████| 28955/28955 [00:41<00:00, 691.36it/s]


Epoch 1, Loss: 0.0297


100%|██████████| 28955/28955 [00:42<00:00, 685.98it/s]


Epoch 2, Loss: 0.0277


100%|██████████| 28955/28955 [00:41<00:00, 690.30it/s]


Epoch 3, Loss: 0.0266


100%|██████████| 28955/28955 [00:41<00:00, 692.09it/s]


Epoch 4, Loss: 0.0260


100%|██████████| 28955/28955 [00:43<00:00, 658.38it/s]


Epoch 5, Loss: 0.0255


100%|██████████| 28955/28955 [00:42<00:00, 679.28it/s]


Epoch 6, Loss: 0.0251


100%|██████████| 28955/28955 [00:39<00:00, 726.06it/s]


Epoch 7, Loss: 0.0249


100%|██████████| 28955/28955 [00:40<00:00, 720.93it/s]


Epoch 8, Loss: 0.0246


100%|██████████| 28955/28955 [00:38<00:00, 743.34it/s]


Epoch 9, Loss: 0.0245


100%|██████████| 28955/28955 [00:38<00:00, 744.19it/s]

Epoch 10, Loss: 0.0243





### Test the model

In [87]:
print(recipe_ingredients[20])
print(recipe_ingredients[21])

water, salt, boiling potatoes, fresh spinach leaves, unsalted butter, coarse salt, fresh ground black pepper, nutmeg
onion, scallion, apple juice, olive oil, spinach, fresh parsley, celery, broth, rolled oats, salt, dried thyme, white pepper


In [88]:
print(recipies[20])
print(recipies[21])

cream  of spinach soup
cream  of spinach soup  vegan


In [89]:
test_query_recipe_1 = train_recipies[20].copy()
test_query_recipe_2 = train_recipies[21].copy()

In [101]:
unique_ingredients_list

['liquid aloe vera',
 'garlic & herb salad dressing mix',
 'prepared brown mustard',
 'aubergine',
 'fat free monterey jack pepper cheese',
 'flat egg noodles',
 'whole grain fettuccine',
 'salted peanuts without skins',
 'red curry paste',
 'shell steaks',
 'bittersweet chocolate pieces',
 'freezer jam pectin',
 'johnsonville classic italian',
 'reduced-fat cream of mushroom soup',
 'sliced peaches in juice',
 'greek honey',
 'toasted sesame oil',
 'lemonade tea mix',
 'fresh cracked pepper',
 'salted pork tail',
 'whole grain dijon mustard',
 'dried birds eye chiles',
 'plain sweet biscuits',
 'apricots',
 'pineapple liqueur',
 'green chili',
 'quorn pieces',
 'homogenized milk',
 'asparagus spears',
 'mung dal',
 'green nori seaweed flakes',
 'shredded chicken with barbecue sauce',
 'sausage patty',
 'boneless beef shank',
 'vegetarian chicken strips',
 'tomatoes with basil',
 'heavy whipping cream',
 'black sticky rice',
 'reduced-sodium chicken flavor stuffing mix',
 'chocolate so

In [139]:
model.eval()
with torch.no_grad():
    test_query_recipe_1_emb = model([[ingredient2idx["apricots"], ingredient2idx["clam broth"], ingredient2idx["cooking oil"]]])
    test_query_recipe_2_emb = model([[ingredient2idx["apricots"], ingredient2idx["green chilies"], ingredient2idx["cooking oil"], ingredient2idx["clam broth"]]])

In [140]:
F.cosine_similarity(test_query_recipe_1_emb, test_query_recipe_2_emb)

tensor([0.8774])

### Save the model

In [141]:
torch.save(model.state_dict(), "./../models/recipe_embedding.pt")