### Imports

In [180]:
import pickle
import joblib

import torch
import torch.nn as nn
import torch.nn.functional as F

import random
import tqdm

import pandas as pd

### Read the dataset

In [181]:
df_recipies = pd.read_csv("./../dataset/recipes_clean.csv", low_memory=False)

print(f"Number of recipes: {len(df_recipies)}")
df_recipies.head()

Number of recipes: 2231133


Unnamed: 0,title,ingredients,directions,link,source,NER,site
0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered,"[""bite size shredded rice biscuits"", ""vanilla""...",www.cookbooks.com
1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",www.cookbooks.com/Recipe-Details.aspx?id=699419,Gathered,"[""cream of mushroom soup"", ""beef"", ""sour cream...",www.cookbooks.com
2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C...",www.cookbooks.com/Recipe-Details.aspx?id=10570,Gathered,"[""frozen corn"", ""pepper"", ""cream cheese"", ""gar...",www.cookbooks.com
3,Chicken Funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""Boil and debone chicken."", ""Put bite size pi...",www.cookbooks.com/Recipe-Details.aspx?id=897570,Gathered,"[""chicken gravy"", ""cream of mushroom soup"", ""c...",www.cookbooks.com
4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""Combine first four ingredients and press in ...",www.cookbooks.com/Recipe-Details.aspx?id=659239,Gathered,"[""graham cracker crumbs"", ""powdered sugar"", ""p...",www.cookbooks.com


In [182]:
df_recipies.dropna(axis=0, subset=["title"], inplace=True)

print(f"Number of recipes: {len(df_recipies)}")
df_recipies.head()

Number of recipes: 2231132


Unnamed: 0,title,ingredients,directions,link,source,NER,site
0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered,"[""bite size shredded rice biscuits"", ""vanilla""...",www.cookbooks.com
1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",www.cookbooks.com/Recipe-Details.aspx?id=699419,Gathered,"[""cream of mushroom soup"", ""beef"", ""sour cream...",www.cookbooks.com
2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C...",www.cookbooks.com/Recipe-Details.aspx?id=10570,Gathered,"[""frozen corn"", ""pepper"", ""cream cheese"", ""gar...",www.cookbooks.com
3,Chicken Funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""Boil and debone chicken."", ""Put bite size pi...",www.cookbooks.com/Recipe-Details.aspx?id=897570,Gathered,"[""chicken gravy"", ""cream of mushroom soup"", ""c...",www.cookbooks.com
4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""Combine first four ingredients and press in ...",www.cookbooks.com/Recipe-Details.aspx?id=659239,Gathered,"[""graham cracker crumbs"", ""powdered sugar"", ""p...",www.cookbooks.com


In [183]:
df_recipies.drop_duplicates(subset=["title"], inplace=True)

print(f"Number of recipes: {len(df_recipies)}")
df_recipies.head()

Number of recipes: 1290105


Unnamed: 0,title,ingredients,directions,link,source,NER,site
0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered,"[""bite size shredded rice biscuits"", ""vanilla""...",www.cookbooks.com
1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",www.cookbooks.com/Recipe-Details.aspx?id=699419,Gathered,"[""cream of mushroom soup"", ""beef"", ""sour cream...",www.cookbooks.com
2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C...",www.cookbooks.com/Recipe-Details.aspx?id=10570,Gathered,"[""frozen corn"", ""pepper"", ""cream cheese"", ""gar...",www.cookbooks.com
3,Chicken Funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""Boil and debone chicken."", ""Put bite size pi...",www.cookbooks.com/Recipe-Details.aspx?id=897570,Gathered,"[""chicken gravy"", ""cream of mushroom soup"", ""c...",www.cookbooks.com
4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""Combine first four ingredients and press in ...",www.cookbooks.com/Recipe-Details.aspx?id=659239,Gathered,"[""graham cracker crumbs"", ""powdered sugar"", ""p...",www.cookbooks.com


### Pre-processing the dataset

In [184]:
# Convert the ingredients to list
df_recipies["NER"] = df_recipies["NER"].str.strip("[]").str.replace('"', '')

In [185]:
# Drop unnecessary columns
df_recipies.drop(["source", "site"], axis=1, inplace=True)

df_recipies.head()

Unnamed: 0,title,ingredients,directions,link,NER
0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,"bite size shredded rice biscuits, vanilla, bro..."
1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",www.cookbooks.com/Recipe-Details.aspx?id=699419,"cream of mushroom soup, beef, sour cream, chic..."
2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C...",www.cookbooks.com/Recipe-Details.aspx?id=10570,"frozen corn, pepper, cream cheese, garlic powd..."
3,Chicken Funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""Boil and debone chicken."", ""Put bite size pi...",www.cookbooks.com/Recipe-Details.aspx?id=897570,"chicken gravy, cream of mushroom soup, chicken..."
4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""Combine first four ingredients and press in ...",www.cookbooks.com/Recipe-Details.aspx?id=659239,"graham cracker crumbs, powdered sugar, peanut ..."


### Extract the ingredients vocab

In [186]:
ingredients_list = (
    df_recipies["NER"]
        .dropna()                 # drop missing cells
        .str.split(',')           # split each string on commas
        .explode()                # turn lists into rows
        .str.strip()              # trim spaces around each item
        .str.replace("/", "")     # replace " with nothing
        .str.lower()              # all to lower case
        .loc[lambda s: s.ne('')]  # remove empty strings (if any)
        .tolist()                 # back to a single Python list
)
unique_ingredients_list = list(set(ingredients_list))
unique_ingredients_list

['seasons zesty',
 'boloney',
 'bagof spanish yellow rice',
 'andes peppermint baking chips',
 'filberts ground',
 'lemon sqeeze',
 'coconut cake frosting',
 'recipe whipped cream',
 'grain seedy crackers',
 'hemp hearts manitoba',
 'teleras',
 'shiitake soaking liquid',
 'cooking minute rice',
 'quartered cherry tomatoes',
 'using sharp knife',
 'marrow gut',
 'combine ingredients',
 'chicken andouille sausage',
 'green sherbet',
 'greek yoghurt natural',
 'olive garden italian dressing',
 'choke',
 'flat -leaf parsley',
 'tumbleweed shoots',
 "al's barbecue sauce",
 'apple wine vinegar',
 'black lumpfish roe',
 'knorr fiesta',
 'maggi vegetarian vegetable flavor',
 'shallots white pepper',
 'mixed fruit blend',
 'soy flaxseed tortilla chips',
 'vinagar',
 'non fat vanilla',
 'unripe nangka',
 'very ripe pureed bananas',
 'bella cucina',
 'jalepenos including seeds',
 'weight cured serrano',
 'either green olive',
 'orange and onion',
 'betty crocker super',
 'cajun-style crab boil se

### Convert ingredients to index

In [187]:
ingredient2idx = {ing: i for i, ing in enumerate(unique_ingredients_list)}

print(f"Number of unique ingredients: {len(unique_ingredients_list)}")
ingredient2idx

Number of unique ingredients: 165431


{'seasons zesty': 0,
 'boloney': 1,
 'bagof spanish yellow rice': 2,
 'andes peppermint baking chips': 3,
 'filberts ground': 4,
 'lemon sqeeze': 5,
 'coconut cake frosting': 6,
 'recipe whipped cream': 7,
 'grain seedy crackers': 8,
 'hemp hearts manitoba': 9,
 'teleras': 10,
 'shiitake soaking liquid': 11,
 'cooking minute rice': 12,
 'quartered cherry tomatoes': 13,
 'using sharp knife': 14,
 'marrow gut': 15,
 'combine ingredients': 16,
 'chicken andouille sausage': 17,
 'green sherbet': 18,
 'greek yoghurt natural': 19,
 'olive garden italian dressing': 20,
 'choke': 21,
 'flat -leaf parsley': 22,
 'tumbleweed shoots': 23,
 "al's barbecue sauce": 24,
 'apple wine vinegar': 25,
 'black lumpfish roe': 26,
 'knorr fiesta': 27,
 'maggi vegetarian vegetable flavor': 28,
 'shallots white pepper': 29,
 'mixed fruit blend': 30,
 'soy flaxseed tortilla chips': 31,
 'vinagar': 32,
 'non fat vanilla': 33,
 'unripe nangka': 34,
 'very ripe pureed bananas': 35,
 'bella cucina': 36,
 'jalepenos

In [9]:
with open("./../dataset/ingredient2idx.pkl", "wb") as f:
    pickle.dump(ingredient2idx, f)

### Extract ingredients and recipes

In [10]:
recipe_ingredients = df_recipies["NER"].tolist()
recipies = df_recipies["title"]

#### Test the extraction

In [11]:
recipies[10]

'Double Cherry Delight'

In [12]:
recipe_ingredients[10]

'flavor gelatin, dark sweet pitted cherries, marshmallows, ginger ale, almond extract, boiling water'

### Embedding model for ingredients

In [145]:
class RecipeEmbeddingModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, projection_dim=128, pooling="mean"):
        super().__init__()
        self.pooling = pooling
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # Projection head (SimCLR style)
        self.projection = nn.Sequential(
            nn.Linear(embedding_dim, embedding_dim),
            nn.ReLU(),
            nn.Linear(embedding_dim, projection_dim)
        )

        if pooling == "attention":
            self.attention = nn.Linear(embedding_dim, 1)

    def forward(self, ingredient_indices):
        # Pad variable-length ingredient lists
        padded = nn.utils.rnn.pad_sequence(
            [torch.tensor(x) for x in ingredient_indices],
            batch_first=True,
            padding_value=0
        )
        mask = (padded != 0)  # padding mask
        embeds = self.embedding(padded)

        if self.pooling == "attention":
            # Compute attention scores
            attn_logits = self.attention(embeds).squeeze(-1)  # [batch, seq_len]
            attn_logits[mask == 0] = -1e9  # mask padding
            attn_weights = F.softmax(attn_logits, dim=1).unsqueeze(-1)  # [batch, seq_len, 1]
            recipe_embeds = (embeds * attn_weights).sum(1)
        elif self.pooling == "mean":
            # Mean pooling over ingredients
            masked_embeds = embeds * mask.unsqueeze(-1)
            recipe_embeds = masked_embeds.sum(1) / (mask.sum(1, keepdim=True) + 1e-8)
        else:
            raise NotImplementedError(f"Unknown pool_type: {self.pooling}")

        # normalize for cosine similarity
        recipe_embeds = F.normalize(recipe_embeds, dim=-1)

        # Projection for contrastive loss
        projected = self.projection(recipe_embeds)
        projected = F.normalize(projected, dim=-1)  # keep cosine scale

        return recipe_embeds, projected

### Contrastive loss definition

In [141]:
def contrastive_loss(batch_one, batch_two, temperature=0.2):
    batch_size = batch_one.size(0)
    z = torch.cat([batch_one, batch_two], dim=0)  # (2N, d)

    # Cosine similarity matrix
    sim = torch.matmul(z, z.T) / temperature
    sim.fill_diagonal_(-9e15)  # mask self-similarity

    # Positive pairs: i-th in first half with i-th in second half
    labels = torch.cat([torch.arange(batch_size) + batch_size,
                        torch.arange(batch_size)], dim=0).to(z.device)

    loss = F.cross_entropy(sim, labels)
    return loss

### Definition of the train run

In [142]:
def train(model, recipes, epochs=1, batch_size=4, lr=1e-3, device="cpu"):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    model.to(device)

    for epoch in range(epochs):
        random.shuffle(recipes)

        total_loss, n_batches = 0, 0
        for i in tqdm.tqdm(range(0, len(recipes), batch_size)):
            batch = recipes[i:i+batch_size]
            r1, r2 = zip(*batch)

            # z_i = model(batch_i).to(device)
            # z_j = model(batch_j).to(device)

            z_i_base, z_i_proj = model(list(r1))
            z_j_base, z_j_proj = model(list(r2))

            loss = contrastive_loss(z_i_proj, z_j_proj)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch+1}, Loss: {total_loss/len(recipes):.4f}")

### Execute the train run

In [146]:
vocab_size = len(unique_ingredients_list)
encoding_dim = 128

In [148]:
ingredient_indices = list(ingredient2idx.values())

# Generate synthetic recipes
num_recipes = 20000  # adjust to create enough pairs
recipes = []
for _ in range(num_recipes):
    recipe_length = random.randint(5, 7)  # 2–5 ingredients per recipe
    recipe = random.sample(ingredient_indices, recipe_length)
    recipes.append(recipe)

# Generate positive and negative pairs
positive_pairs = list()
negative_pairs = list()

for i in range(len(recipes)):
    for j in range(i + 1, len(recipes)):
        set_i = set(recipes[i])
        set_j = set(recipes[j])
        overlap = set_i & set_j

        if len(overlap) == 1:   # exactly one shared ingredient
            positive_pairs.append((recipes[i], recipes[j]))
        else:                   # no overlap OR more than 1 overlap
            negative_pairs.append((recipes[i], recipes[j]))

# Step 3: Sample to limit size if needed
max_pairs = 15000
positive_pairs = random.sample(positive_pairs, min(max_pairs//2, len(positive_pairs)))
negative_pairs = random.sample(negative_pairs, min(max_pairs//2, len(negative_pairs)))

print(f"Positive pairs: {len(positive_pairs)}")
print(f"Negative pairs: {len(negative_pairs)}")

Positive pairs: 7500
Negative pairs: 7500


In [121]:
train_recipies = list()
for recipe_ingredient in recipe_ingredients:
    try:
        train_recipies.append([ingredient2idx[i.strip().replace("/", "").lower()] for i in recipe_ingredient.split(',')])
    except:
        pass

print(f"Number of recipes: {len(train_recipies)}")

Number of recipes: 1312143


In [149]:
model = RecipeEmbeddingModel(vocab_size, encoding_dim)

In [150]:
train(model, positive_pairs + negative_pairs, epochs=60, batch_size=128)

100%|██████████| 118/118 [00:03<00:00, 39.06it/s]


Epoch 1, Loss: 0.0427


100%|██████████| 118/118 [00:02<00:00, 39.66it/s]


Epoch 2, Loss: 0.0401


100%|██████████| 118/118 [00:03<00:00, 38.93it/s]


Epoch 3, Loss: 0.0373


100%|██████████| 118/118 [00:02<00:00, 39.62it/s]


Epoch 4, Loss: 0.0348


100%|██████████| 118/118 [00:02<00:00, 39.73it/s]


Epoch 5, Loss: 0.0324


100%|██████████| 118/118 [00:02<00:00, 39.45it/s]


Epoch 6, Loss: 0.0301


100%|██████████| 118/118 [00:02<00:00, 39.57it/s]


Epoch 7, Loss: 0.0280


100%|██████████| 118/118 [00:02<00:00, 39.73it/s]


Epoch 8, Loss: 0.0261


100%|██████████| 118/118 [00:02<00:00, 39.56it/s]


Epoch 9, Loss: 0.0245


100%|██████████| 118/118 [00:02<00:00, 39.61it/s]


Epoch 10, Loss: 0.0231


100%|██████████| 118/118 [00:02<00:00, 39.59it/s]


Epoch 11, Loss: 0.0219


100%|██████████| 118/118 [00:02<00:00, 39.62it/s]


Epoch 12, Loss: 0.0209


100%|██████████| 118/118 [00:02<00:00, 39.66it/s]


Epoch 13, Loss: 0.0201


100%|██████████| 118/118 [00:02<00:00, 39.58it/s]


Epoch 14, Loss: 0.0193


100%|██████████| 118/118 [00:02<00:00, 39.66it/s]


Epoch 15, Loss: 0.0187


100%|██████████| 118/118 [00:02<00:00, 39.74it/s]


Epoch 16, Loss: 0.0182


100%|██████████| 118/118 [00:02<00:00, 39.63it/s]


Epoch 17, Loss: 0.0177


100%|██████████| 118/118 [00:02<00:00, 39.66it/s]


Epoch 18, Loss: 0.0173


100%|██████████| 118/118 [00:02<00:00, 39.53it/s]


Epoch 19, Loss: 0.0169


100%|██████████| 118/118 [00:02<00:00, 39.48it/s]


Epoch 20, Loss: 0.0166


100%|██████████| 118/118 [00:02<00:00, 39.95it/s]


Epoch 21, Loss: 0.0163


100%|██████████| 118/118 [00:02<00:00, 39.77it/s]


Epoch 22, Loss: 0.0161


100%|██████████| 118/118 [00:02<00:00, 39.77it/s]


Epoch 23, Loss: 0.0158


100%|██████████| 118/118 [00:02<00:00, 39.81it/s]


Epoch 24, Loss: 0.0156


100%|██████████| 118/118 [00:02<00:00, 39.66it/s]


Epoch 25, Loss: 0.0154


100%|██████████| 118/118 [00:02<00:00, 39.93it/s]


Epoch 26, Loss: 0.0152


100%|██████████| 118/118 [00:02<00:00, 39.71it/s]


Epoch 27, Loss: 0.0150


100%|██████████| 118/118 [00:02<00:00, 39.88it/s]


Epoch 28, Loss: 0.0149


100%|██████████| 118/118 [00:02<00:00, 39.78it/s]


Epoch 29, Loss: 0.0147


100%|██████████| 118/118 [00:03<00:00, 38.77it/s]


Epoch 30, Loss: 0.0146


100%|██████████| 118/118 [00:02<00:00, 39.92it/s]


Epoch 31, Loss: 0.0145


100%|██████████| 118/118 [00:02<00:00, 39.73it/s]


Epoch 32, Loss: 0.0143


100%|██████████| 118/118 [00:02<00:00, 39.75it/s]


Epoch 33, Loss: 0.0142


100%|██████████| 118/118 [00:02<00:00, 39.78it/s]


Epoch 34, Loss: 0.0142


100%|██████████| 118/118 [00:02<00:00, 39.69it/s]


Epoch 35, Loss: 0.0140


100%|██████████| 118/118 [00:02<00:00, 39.84it/s]


Epoch 36, Loss: 0.0140


100%|██████████| 118/118 [00:02<00:00, 39.70it/s]


Epoch 37, Loss: 0.0139


100%|██████████| 118/118 [00:02<00:00, 39.79it/s]


Epoch 38, Loss: 0.0138


100%|██████████| 118/118 [00:02<00:00, 39.62it/s]


Epoch 39, Loss: 0.0137


100%|██████████| 118/118 [00:02<00:00, 39.78it/s]


Epoch 40, Loss: 0.0136


100%|██████████| 118/118 [00:02<00:00, 39.65it/s]


Epoch 41, Loss: 0.0136


100%|██████████| 118/118 [00:02<00:00, 39.82it/s]


Epoch 42, Loss: 0.0135


100%|██████████| 118/118 [00:02<00:00, 39.62it/s]


Epoch 43, Loss: 0.0135


100%|██████████| 118/118 [00:03<00:00, 39.02it/s]


Epoch 44, Loss: 0.0134


100%|██████████| 118/118 [00:03<00:00, 39.33it/s]


Epoch 45, Loss: 0.0134


100%|██████████| 118/118 [00:03<00:00, 38.53it/s]


Epoch 46, Loss: 0.0133


100%|██████████| 118/118 [00:02<00:00, 39.53it/s]


Epoch 47, Loss: 0.0133


100%|██████████| 118/118 [00:02<00:00, 39.60it/s]


Epoch 48, Loss: 0.0132


100%|██████████| 118/118 [00:03<00:00, 39.27it/s]


Epoch 49, Loss: 0.0132


100%|██████████| 118/118 [00:02<00:00, 39.73it/s]


Epoch 50, Loss: 0.0131


100%|██████████| 118/118 [00:02<00:00, 39.48it/s]


Epoch 51, Loss: 0.0131


100%|██████████| 118/118 [00:02<00:00, 39.58it/s]


Epoch 52, Loss: 0.0131


100%|██████████| 118/118 [00:02<00:00, 39.42it/s]


Epoch 53, Loss: 0.0131


100%|██████████| 118/118 [00:02<00:00, 39.61it/s]


Epoch 54, Loss: 0.0130


100%|██████████| 118/118 [00:03<00:00, 39.01it/s]


Epoch 55, Loss: 0.0130


100%|██████████| 118/118 [00:02<00:00, 39.34it/s]


Epoch 56, Loss: 0.0129


100%|██████████| 118/118 [00:02<00:00, 39.50it/s]


Epoch 57, Loss: 0.0129


100%|██████████| 118/118 [00:02<00:00, 39.43it/s]


Epoch 58, Loss: 0.0129


100%|██████████| 118/118 [00:02<00:00, 39.36it/s]


Epoch 59, Loss: 0.0129


100%|██████████| 118/118 [00:02<00:00, 39.44it/s]

Epoch 60, Loss: 0.0128





### Test the model

In [151]:
print(recipe_ingredients[1])
print(recipe_ingredients[2])

cream of mushroom soup, beef, sour cream, chicken breasts
frozen corn, pepper, cream cheese, garlic powder, butter, salt


In [152]:
print(recipies[1])
print(recipies[2])

Jewell Ball'S Chicken
Creamy Corn


In [153]:
model.eval()
with torch.no_grad():
    test_query_recipe_1_emb, _ = model([[ingredient2idx["beef"], ingredient2idx["spices"], ingredient2idx["salt"]]])
    test_query_recipe_2_emb, _ = model([[ingredient2idx["beef"], ingredient2idx["spices"], ingredient2idx["sauce"]]])

In [154]:
F.cosine_similarity(test_query_recipe_1_emb, test_query_recipe_2_emb)

tensor([0.5120])

### Save the model

In [155]:
torch.save(model.state_dict(), "./../models/recipe_embedding_model.pt")

### Generate embeddings for all recipies

In [188]:
df_recipies_for_app = df_recipies.sample(frac=0.1)

print(f"Number of recipes: {len(df_recipies_for_app)}")
df_recipies_for_app.head()

Number of recipes: 129010


Unnamed: 0,title,ingredients,directions,link,NER
1699455,Spicy Chocolate Cookies Recipe Adrienne,"[""1 1/2 cups flour"", ""1 cup sugar"", ""1/2 stick...","[""Preheat oven to 350"", ""Mix all of the ingred...",www.chowhound.com/recipes/spicy-chocolate-cook...,"cayenne, sugar, unsweetened cocoa, cinnamon, f..."
131298,Cape Cod Oatmeal Cookies,"[""1 1/2 c. flour"", ""1/2 tsp. baking soda"", ""1 ...","[""Set oven at 350\u00b0."", ""Sift together flou...",www.cookbooks.com/Recipe-Details.aspx?id=654568,"egg, sugar, baking soda, cinnamon, molasses, l..."
1319168,Pear And Goats' Cheese Bruschetta,"[""pear, chopped into wedges"", ""knob butter"", ""...","[""1. Heat the butter in a frying pan. Saute th...",www.epicurious.com/recipes/member/views/pear-a...,"knob butter, shallot, brown sugar, pear, rosem..."
1336030,Orange Aperol Sun,"[""Ice cubes"", ""6 tablespoons Aperol"", ""6 table...","[""Place enough ice cubes in each of 6 balloon ...",www.epicurious.com/recipes/food/views/orange-a...,"cubes, orange juice"
2062356,Tomato and Chive Salad With Goat Cheese Croutons,"[""1 small baguette, sliced into 24 thin slices...","[""Preheat the oven to 350F Put the bread on a ...",www.food.com/recipe/tomato-and-chive-salad-wit...,"tomatoes, salt, goat cheese, extra virgin oliv..."


In [189]:
df_recipies_for_app["id"] = df_recipies_for_app.index
joblib.dump(df_recipies_for_app, "./../dataset/recipes_for_app.pkl", compress=9)

['./../dataset/recipes_for_app.pkl']

In [190]:
recipe_ingredients_for_app = df_recipies_for_app["NER"].tolist()

In [191]:
def build_recipe_index(model, ingredients):
    model.eval()
    with torch.no_grad():
        generated_embedding, _ = model(ingredients)
    return generated_embedding

In [192]:
generated_embeddings = list()
for recipe_ingredient in recipe_ingredients_for_app:
    try:
        generated_embeddings.append([ingredient2idx[i.strip().replace("/", "").lower()] for i in recipe_ingredient.split(',')])
    except KeyError:
        pass

recipe_embeddings = build_recipe_index(model, generated_embeddings)
torch.save(recipe_embeddings, "./../embeddings/recipe_embeddings.pt")

### Test recipe retrival

In [193]:
loaded_embeddings = torch.load("./../embeddings/recipe_embeddings.pt")

print(f"Number of recipes: {len(loaded_embeddings)}")

Number of recipes: 128938


In [194]:
query_recipe_ingredients = [ingredient2idx["baking powder"], ingredient2idx["ham"], ingredient2idx["flour"]]
query_embedding, _ = model([query_recipe_ingredients])

similarity = F.cosine_similarity(query_embedding, loaded_embeddings)
topk = torch.topk(similarity, k=10, dim=-1)

In [195]:
results = [(idx.item(), similarity[idx].item()) for idx in topk.indices]

In [196]:
print("Top-k similar recipes:")
for idx, score in results:
    print(f"Recipe: {idx}: Menu: {df_recipies_for_app.iloc[idx]["NER"]}, Similarity: {score:.4f}")

Top-k similar recipes:
Recipe: 25684: Menu: bread, tomatoes, italian seasoning, goat cheese, parmesan cheese, freshly ground black pepper, baby spinach leaves, ricotta cheese, olive oil, Similarity: 0.7754
Recipe: 66676: Menu: mixed dried fruit, baking powder, oats, dark raisins, baking soda, brown sugar, eggs, apple pie spice, flour, margarine, confectioners sugar, Similarity: 0.6358
Recipe: 12642: Menu: pepper, young carrot, white onion, water, celery, parsley, salt, Similarity: 0.5999
Recipe: 105914: Menu: extra virgin olive oil, barley, parsley, yellow onions, garlic, chicken, Similarity: 0.5962
Recipe: 111685: Menu: bread flour, egg, sugar, shortening, water, yeast, butter, vegetable oil, salt, Similarity: 0.5810
Recipe: 120607: Menu: black peppercorns, pepper, water, thyme, bay leaves, apple juice, apple cider vinegar, maple syrup, kosher salt, garlic, Similarity: 0.5805
Recipe: 113007: Menu: chocolate, white karo, nuts, white sugar, butter, whipping cream, Similarity: 0.5802
Rec

In [58]:
df_recipies_for_app.iloc[124775]

title          Savory Pear, Sweet Potato, And Maple Syrup Sou...
ingredients    ["For the Souffle base", "2 tablespoons butter...
directions     ["Peel and cut sweet potatoes into 1-inch chun...
link           food52.com/recipes/14383-savory-pear-sweet-pot...
NER            orange zest, bartlett, eggs, parmesan cheese, ...
id                                                       1381518
Name: 1381518, dtype: object

### Test the reload of the dataset

In [76]:
df_recipies_for_app_load = joblib.load("./../dataset/recipes_for_app.pkl")
df_recipies_for_app_load.head()

Unnamed: 0,title,ingredients,directions,link,NER,id
1299563,Mexican Style Pasta Bake,"[""12 ounces dried bow tie pasta (about 5 cups)...","[""1. Preheat oven to 350 degree F. Butter six ...",www.epicurious.com/recipes/member/views/mexica...,"ground cumin, colby cheese, cilantro, chili po...",1299563
1507766,Slow & Easy Minestrone,"[""1 can (28 ounces) diced tomatoes, undrained""...","[""In a 4- or 5-qt. slow cooker, combine the fi...",www.tasteofhome.com/recipes/slow-easy-minestrone/,"cabbage, zucchini, tomatoes, mushrooms, onion,...",1507766
1850401,Manzo alla Panna,"[""3/4 teaspoons Fresh Cracked Black Pepper"", ""...","[""1."", ""Rub the pepper onto all sides of the r...",tastykitchen.com/recipes/main-courses/manzo-al...,"fresh cracked black pepper, olive oil, red win...",1850401
1039075,Tunisian Sauce Kerkennaise,"[""2 tomatoes, finely chopped"", ""1/4 cup olive ...","[""Combine all the ingredients in a bowl."", ""Wh...",www.food.com/recipe/tunisian-sauce-kerkennaise...,"fresh ground black pepper, sugar, tomatoes, fr...",1039075
909685,Chicken Cordon Bleu Ii,"[""6 skinless, boneless chicken breast halves"",...","[""Pound chicken breasts if they are too thick....",www.allrecipes.com/recipe/8669/chicken-cordon-...,"chicken bouillon granules, swiss cheese, all-p...",909685


In [77]:
df_recipies_for_app_load.iloc[59573]

title                      Bob L'S Favorite Sweet N' Sour Turkey
ingredients    ["1 (8 oz.) pkg. Green Giant harvest fresh fro...
directions     ["Cook green beans as directed on package. Dra...
link             www.cookbooks.com/Recipe-Details.aspx?id=699249
NER            vinegar, brown sugar, tomato halves, cornstarc...
id                                                        629868
Name: 629868, dtype: object