In [1]:
import json
import pickle
import collections
from tqdm.notebook import tqdm
import re

In [2]:
vocab = collections.defaultdict(int)

f = open('/home/ubuntu/recipe-dataset/json/cleaned_layers.json', 'r')
data = json.load(f)

In [5]:
for id in tqdm(data, total=len(data)):
    recipe = data[id]

    title = re.sub(r'[^\w\s]',' ',recipe['title'].lower())
    for word in title.split(' '):
        vocab[word] += 1
    
    for instruction in recipe['instructions']:
        instruction = re.sub(r'[^\w\s]',' ',instruction.lower())
        for word in instruction.split(' '):
            vocab[word] += 1
    
    for ingredient in recipe['ingredients']:
        ingredient = re.sub(r'[^\w\s]',' ',ingredient.lower())
        for word in ingredient.split(' '):
            vocab[word] += 1


  0%|          | 0/1029720 [00:00<?, ?it/s]

In [3]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [14]:
from transformers import BertTokenizer, BertModel
from models.bert_encoder import BERTEncoder
bert_encoder = BERTEncoder(BertModel.from_pretrained('bert-base-uncased'), BertTokenizer.from_pretrained('bert-base-uncased'), device=device)

bert_encoder.run('/home/ubuntu/recipe-dataset/json/vocab.pkl', '/home/ubuntu/recipe-dataset/json/vocab_bert.pkl')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Number of words in vocabulary: 151344
Creating embeddings...


100%|██████████| 1514/1514 [01:12<00:00, 20.86it/s]


Embeddings saved to /home/ubuntu/recipe-dataset/json/vocab_bert.pkl


In [21]:
import numpy as np

partition = 'test'
ids = pickle.load(open('/home/ubuntu/recipe-dataset/test/test_keys.pkl', 'rb'))
data = json.load(open('/home/ubuntu/recipe-dataset/json/cleaned_layers.json', 'r'))
new_data = {}
for i, (id, sample) in enumerate(data.items()):
    if sample['partition'] == partition:
        new_data[id] = sample

data = new_data
del new_data

image_map = json.load(open('/home/ubuntu/recipe-dataset/json/image_map.json', 'r'))

torch.random.manual_seed(42)
np.random.seed(42)
random_embedding = torch.randn(768)

bert_embeddings = pickle.load(open('/home/ubuntu/recipe-dataset/json/vocab_bert.pkl', 'rb'))
bert_embeddings = {k: torch.tensor(v) for k, v in bert_embeddings.items()}
ingredient_vocabulary = pickle.load(open('/home/ubuntu/recipe-dataset/json/ingredient_vocab.pkl', 'rb'))

In [22]:
id = 5
sample = data[ids[id]]

image_ids = image_map[ids[id]]
image_id = np.random.choice(image_ids)
dataset_images = '/home/ubuntu/recipe-dataset/test/'
image_path = dataset_images +'/'.join(list(image_id[:4])) + '/' + image_id

title = sample['title']
ingredients = sample['ingredients']
instructions = sample['instructions']

In [None]:
title_embedding = [bert_embeddings.get(word, random_embedding) for word in title.lower().split(' ')]

instruction_embedding = []
for instruction in instructions:
    temp = []
    instruction = re.sub(r"[^a-zA-Z0-9]", " ", instruction.lower())
    print(f"instruction: {instruction}")
    for word in instruction.strip().split(' '):
        e = bert_embeddings.get(word, random_embedding)
        temp.append(e)
    instruction_embedding.append(torch.cat(temp))

# ingredient embeddings contain an additional lookup in the ingredient vocabulary
ingredient_embedding = []
for ingredient in ingredients:
    temp = []
    ingredient = re.sub(r"[^a-zA-Z0-9]", " ", ingredient.lower())
    for word in ingredient.split(" "):
        if word in ingredient_vocabulary['ingredient2stem']:
            temp.append(bert_embeddings.get(word, random_embedding))
    
    ingredient_embedding.append(torch.cat(temp))

In [66]:
import os
import numpy as np
from PIL import Image

class RecipeDataset(torch.utils.data.Dataset):

    """
    Dataset class for loading the title, cleaned ingredients, instructions, list of images, ID for every recipe in the dataset, based on partition (train, validation, test)
    """
    def __init__(self, partition, ids_pkl, cleaned_layers, image_map, dataset_images, bert_embeddings, ingredient_vocabulary, image_logs='', transform=None, seed=42):
        
        self.partition = partition
        self.data = {}
        self.ids = []
        self.image_logs = image_logs
        self.dataset_images = dataset_images
        self.transform = transform
        self.seed = seed
        self.bert_embeddings_path = bert_embeddings
        self.ingredient_vocabulary_path = ingredient_vocabulary

        if self.partition not in ['train', 'val', 'test']:
            raise ValueError('Partition must be one of train, val, test')
        
        with open(cleaned_layers, 'r') as f:
            data = json.load(f)
        print(f"Loaded {len(data)} recipes from {cleaned_layers}")
        
        with open(ids_pkl, 'rb') as f:
            self.ids = pickle.load(f)
        # remove bad ids from the ids list
        remove_count = 0
        for id in self.ids:
            if id not in data or data[id]['partition'] != self.partition or data[id]['ingredients'] == [] or data[id]['instructions'] == []:
                self.ids.remove(id)
                remove_count += 1

        print(f"PARTITION: {self.partition}, TOTAL IDS AVAILABLE: {len(self.ids)}")

        # iterate through the data to obtain only samples which are from the partition
        for i, (id, sample) in enumerate(data.items()):
            if sample['partition'] == self.partition:
                self.data[id] = sample
        
        # memory cleanup
        del data

        with open(image_map, 'r') as f:
            self.image_map = json.load(f)
        print(f"Loaded {len(self.image_map)} image mappings from {image_map}")

        torch.random.manual_seed(self.seed)
        np.random.seed(self.seed)
        self.random_embedding = torch.randn(768).unsqueeze(0)
        print('random embedding', self.random_embedding.shape)

        with open(self.bert_embeddings_path, 'rb') as f:
            self.bert_embeddings = pickle.load(f)
            self.bert_embeddings = { k: torch.tensor(v).unsqueeze(0) for k, v in self.bert_embeddings.items() }
        with open(self.ingredient_vocabulary_path, 'rb') as f:
            self.ingredient_vocabulary = pickle.load(f)
            
        print(f"Loaded {len(self.bert_embeddings)} embeddings from {self.bert_embeddings_path}\nLoaded {len(self.ingredient_vocabulary['ingredients'])} ingredients from {self.ingredient_vocabulary_path}")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        print('INDEX CALLED:', index)
        id = self.ids[index]
        sample = self.data[id]

        image_ids = self.image_map[id]

        # randomly pick out an image from the list of images if train, else pick the first image
        if self.partition == 'train':
            image_id = np.random.choice(image_ids)
        else:
            image_id = image_ids[0]

        # create the image path and load the image
        image_path = self.dataset_images +'/'.join(list(image_id[:4])) + '/' + image_id

        # load image from path
        try:
            image = Image.open(image_path).convert('RGB')
            if self.transform is not None:
                image = self.transform(image)
        except:
            raise ValueError(f"Image not found at path: {image_path}")

        # obtain list of ingredients and instructions
        title = sample['title']
        ingredients = sample['ingredients']
        instructions = sample['instructions']
        print(f"{title}\n{ingredients}\n{instructions}")

        # obtain the embeddings for title, ingredients and instructions from BERT
        # check against the dictionary saved, if not available, then use the random vector generated at the start

        title_embedding = [self.bert_embeddings.get(word, self.random_embedding) for word in title.lower().split(' ')]

        instruction_embedding = []
        for instruction in instructions:
            temp = []
            instruction = re.sub(r"[^a-zA-Z0-9]", " ", instruction.strip().lower())
            # print(f"instruction: {instruction}")
            for word in instruction.split():
                e = self.bert_embeddings.get(word, self.random_embedding)
                temp.append(e)
            
            instruction_embedding.append(torch.cat(temp, dim=0))

        # ingredient embeddings contain an additional lookup in the ingredient vocabulary
        ingredient_embedding = []
        for ingredient in ingredients:
            temp = []
            ingredient = re.sub(r"[^a-zA-Z0-9]", " ", ingredient.strip().lower())
            for word in ingredient.split(" "):
                temp.append(self.bert_embeddings.get(word, self.random_embedding))
            
            ingredient_embedding.append(torch.cat(temp, dim=0))

        # convert the list of embeddings to a tensor, with zero padding to cover variable length
        title_embedding = torch.nn.utils.rnn.pad_sequence(title_embedding, batch_first=True, padding_value=0)
        instruction_embedding = torch.nn.utils.rnn.pad_sequence(instruction_embedding, batch_first=True, padding_value=0)
        ingredient_embedding = torch.nn.utils.rnn.pad_sequence(ingredient_embedding, batch_first=True, padding_value=0)

        # print(f"TITLE EMBEDDING: {torch.squeeze(title_embedding).shape}")
        # print(f"INSTRUCTION EMBEDDING: {instruction_embedding.shape}")
        # print(f"INGREDIENT EMBEDDING: {ingredient_embedding.shape}")
        
        output = {
            'id': id,
            'image_id': image_id,
            'title': title,
            'ingredients': ingredients,
            'instructions': instructions,
            'title_embedding': torch.squeeze(title_embedding),
            'ingredient_embedding': ingredient_embedding,
            'instruction_embedding': instruction_embedding,
            'image': image
        }

        return output

    def visualize_sample(self, index):
        output = self.__getitem__(index)
        print(f"ID: {output['id']}\tImage ID: {output['image_id']}")
        print(f"Title: {output['title']}")
        print(f"Ingredients:")
        for ingredient in output['ingredients']:
            print(f"\t{ingredient}")
        print(f"Instructions:")
        for instruction in output['instructions']:
            print(f"\t{instruction}")

        if self.image_logs:
            image_path = self.dataset_images + '/'.join(list(output['image_id'][:4])) + '/' + output['image_id']
            image_path = os.path.join(self.image_logs, image_path)
            image = Image.open(image_path).convert('RGB')
            image.save(f"{self.image_logs}/{output['id']}.png")

In [67]:
import torchvision.transforms as transforms

dataset = RecipeDataset(
        partition='test',
        ids_pkl='/home/ubuntu/recipe-dataset/test/test_keys.pkl', 
        cleaned_layers='/home/ubuntu/recipe-dataset/json/cleaned_layers.json', 
        image_map='/home/ubuntu/recipe-dataset/json/image_map.json', 
        dataset_images='/home/ubuntu/recipe-dataset/test/', 
        bert_embeddings='/home/ubuntu/recipe-dataset/json/vocab_bert.pkl',
        ingredient_vocabulary='/home/ubuntu/recipe-dataset/json/ingredient_vocab.pkl',
        image_logs='/home/ubuntu/cooking-cross-modal-retrieval/sequential-autoencoder/logs',
        transform=transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
    )



Loaded 1029720 recipes from /home/ubuntu/recipe-dataset/json/cleaned_layers.json
PARTITION: test, TOTAL IDS AVAILABLE: 51334
Loaded 402760 image mappings from /home/ubuntu/recipe-dataset/json/image_map.json
random embedding torch.Size([1, 768])
Loaded 151344 embeddings from /home/ubuntu/recipe-dataset/json/vocab_bert.pkl
Loaded 9225 ingredients from /home/ubuntu/recipe-dataset/json/ingredient_vocab.pkl


In [68]:
test_output = dataset.__getitem__(0)

title = test_output['title']
ingredients = test_output['ingredients']
instructions = test_output['instructions']

title_embedding = test_output['title_embedding']
ingredient_embedding = test_output['ingredient_embedding']
instruction_embedding = test_output['instruction_embedding']

print(f"TITLE: {title}\nINGREDIENTS: {ingredients}\nINSTRUCTIONS: {instructions}")
print(title_embedding.shape, ingredient_embedding.shape, instruction_embedding.shape)

INDEX CALLED: 0
Crunchy Onion Potato Bake
['milk', 'water', 'butter', 'mashed potatoes', 'whole kernel corn', 'cheddar cheese', 'French - fried onions']
['Preheat oven to 350 degrees Fahrenheit.', 'Spray pan with non stick cooking spray.', 'Heat milk, water and butter to boiling; stir in contents of both pouches of potatoes; let stand one minute.', 'Stir in corn.', 'Spoon half the potato mixture in pan.', 'Sprinkle half each of cheese and onions; top with remaining potatoes.', 'Sprinkle with remaining cheese and onions.', 'Bake 10 to 15 minutes until cheese is melted.', 'Enjoy !']
TITLE: Crunchy Onion Potato Bake
INGREDIENTS: ['milk', 'water', 'butter', 'mashed potatoes', 'whole kernel corn', 'cheddar cheese', 'French - fried onions']
INSTRUCTIONS: ['Preheat oven to 350 degrees Fahrenheit.', 'Spray pan with non stick cooking spray.', 'Heat milk, water and butter to boiling; stir in contents of both pouches of potatoes; let stand one minute.', 'Stir in corn.', 'Spoon half the potato mix

In [None]:
# create a simple dataloader and pass the inputs through the model
from torch.utils.data import DataLoader
import torch.nn.functional as F

def collate(batch):
    title_embeddings, ingredient_embeddings, instruction_embeddings, images = [], [], [], []
    ingredient_max_seq, ingredient_max_num, instruction_max_seq, instruction_max_num = 0, 0, 0, 0

    for elem in batch:
        title_embeddings.append(elem['title_embedding'])

        ingredient_max_num = max(ingredient_max_num, elem['ingredient_embedding'].shape[0])
        ingredient_max_seq = max(ingredient_max_seq, elem['ingredient_embedding'].shape[1])
        ingredient_embeddings.append(elem['ingredient_embedding'].unsqueeze(0))
        
        instruction_max_num = max(instruction_max_num, elem['instruction_embedding'].shape[0])
        instruction_max_seq = max(instruction_max_seq, elem['instruction_embedding'].shape[1])
        instruction_embeddings.append(elem['instruction_embedding'].unsqueeze(0))
        
        images.append(elem['image'])
    
    # title
    title_embeddings = torch.nn.utils.rnn.pad_sequence(title_embeddings, batch_first=True, padding_value=0)

    # ingredients
    padded_output_size = np.array([1, ingredient_max_num, ingredient_max_seq, 768])
    for i, elem in enumerate(ingredient_embeddings):
        pad = padded_output_size - np.array(elem.shape)
        ingredient_embeddings[i] = F.pad(elem, (0, pad[3], 0, pad[2], 0, pad[1], 0, pad[0]))
    ingredient_embeddings = torch.cat(ingredient_embeddings, dim=0)
    
    # instructions
    padded_output_size = np.array([1, instruction_max_num, instruction_max_seq, 768])

    for i, elem in enumerate(instruction_embeddings):
        pad = padded_output_size - np.array(elem.shape)
        instruction_embeddings[i] = F.pad(elem, (0, pad[3], 0, pad[2], 0, pad[1], 0, pad[0]))
    instruction_embeddings = torch.cat(instruction_embeddings, dim=0)

    # images
    images = torch.stack(images, dim=0)
    print(images.shape)

    print(title_embeddings.shape, ingredient_embeddings.shape, instruction_embeddings.shape)


dataloader = DataLoader(dataset, batch_size=16, shuffle=False, num_workers=0, collate_fn=collate)

for i, batch in enumerate(dataloader):
    print(f"Batch: {i}")
    print(batch)
    # print(batch['title_embedding'].shape, batch['ingredient_embedding'].shape, batch['instruction_embedding'].shape)
    # print(batch['image'].shape)
    break

In [None]:
item = dataset.__getitem__(12)
print(item)

In [71]:
with open('/home/ubuntu/recipe-dataset/json/cleaned_layers.json', 'r') as f:
    data = json.load(f)
    count = 0
    for id, item in data.items():
        title = item['title']
        ingredients = item['ingredients']
        instructions = item['instructions']

        if len(ingredients) == 0 or len(instructions) == 0:
            # print(id)
            count += 1
    
    print(count)


1018
