In [4]:
import os
import multiprocessing
from re import L
from dataset import get_loader
from models import get_model
import torch.backends.cudnn as cudnn
from config import get_args
from tqdm.notebook import tqdm
import torch
import numpy as np
import pickle
from utils.utils import load_checkpoint, count_parameters
import argparse

import json
import random

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
map_loc = None if torch.cuda.is_available() else 'cpu'
random.seed(1234)


# load the dataset and dataloader...
# current implementation has the list of images to load already selected in the dataset file
def load_test_data():
    dataloader, dataset = get_loader(
        root='/home/ubuntu/recipe-dataset',
        batch_size=4,
        resize=224,
        im_size=224,
        augment=False, 
        split='test',
        mode='test',
        drop_last=False,
        load_actual_data=True
    )

    return dataset, dataloader


# manual hack to construct arguments for loading the model
def construct_model_args():

    parser = argparse.ArgumentParser()

    parser.add_argument('--tf_n_heads', type=int, default=4,
                        help='Number of attention heads in Transformer models.')
    parser.add_argument('--tf_n_layers', type=int, default=2,
                        help='Number of layers in Transformer models.')
    parser.add_argument('--hidden_recipe', type=int, default=512,
                        help='Embedding dimensionality for recipe representation.')
    parser.add_argument('--output_size', type=int, default=1024,
                        help='Dimensionality of the output embeddings.')
    parser.add_argument('--backbone', type=str, default='resnet50',
                        help='backbone for the vision model')

    args, unknown = parser.parse_known_args()
    return args

def obtain_features(args, dataset, dataloader, checkpoints_dir, store_dict, device):

    vocab_size = len(dataset.get_vocab())
    model = get_model(args, vocab_size)
    print("recipe encoder", count_parameters(model.text_encoder))
    print("image encoder", count_parameters(model.image_encoder))

    _, model_dict, _ = load_checkpoint(checkpoints_dir, 'best', map_loc,
                                          store_dict)
    model.load_state_dict(model_dict, strict=True)
    if device != 'cpu' and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    model = model.to(device)
    model.eval()

    image_features = recipe_features = None
    all_image_ids = []

    for i, batch in tqdm(enumerate(dataloader), total=len(dataloader)):
        image, titles, title_targets, ingrs, ingredient_targets, instrs, instruction_targets, ids = batch
        image = image.to(device)
        title_targets = title_targets.to(device)
        ingredient_targets = ingredient_targets.to(device)
        instruction_targets = instruction_targets.to(device)

        all_image_ids.extend(ids)

        with torch.no_grad():
            test_images = model.image_encoder(image)
            image_feat, recipe_feat, projected_feat = model(image, title_targets, ingredient_targets, instruction_targets)
        
        if image_features is None:
            image_features = image_feat.cpu().detach().numpy()
            recipe_features = recipe_feat.cpu().detach().numpy()
        else:
            image_features = np.vstack((image_features, image_feat.cpu().detach().numpy()))
            recipe_features = np.vstack((recipe_features, recipe_feat.cpu().detach().numpy()))

    print(f"COMPLETED EXTRACTING FEATURES: FEATURE SIZE: {np.shape(image_features)}, {np.shape(recipe_features)}")

    return all_image_ids, image_features, recipe_features


def visualize_results():
    pass


# take in as input the image and recipe features and corresponding image ids
# Do the following: 
# - Compute average medRank and recall rates for top-1, 5, 10
# - For each image - compute the top-10 matching image IDs
def run_evaluation(task_type, image_features, recipe_features, image_ids, medr_N):
    indexes = np.argsort(image_ids)
    image_ids = [image_ids[i] for i in indexes]


    n = medr_N
    idxs = range(n)

    global_ranks = []
    global_recall = {1: 0.0, 5:0.0, 10:0.0}
    closest_image_dict = {}

    # repeat the exercise for 10 times
    for i in range(10):
        ids = random.sample(range(0, len(image_ids)), n)
        image_subfeatures = image_features[ids, :]
        recipe_subfeatures = recipe_features[ids, :]
        selected_ids = [image_ids[i] for i in ids]

        if task_type == 'im2recipe':
            similarities = np.dot(image_subfeatures, recipe_subfeatures.T)
        elif task_type =='recipe2im':
            similarities = np.dot(recipe_subfeatures, image_subfeatures.T)
        else:
            raise ValueError('Invalid task type')

        median_ranks = []
        recalls = {1: 0.0, 5:0.0, 10:0.0}

        for idx in idxs:
            name = selected_ids[idx]

            similarity = similarities[idx, :]
            sorting = np.argsort(similarity)[::-1].tolist()
            pos = sorting.index(idx)

            closest_image_dict[name] = {
                'top_10_images': [selected_ids[i] for i in sorting[:10]],
                'similarity_scores': [str(i) for i in similarity[sorting[:10]]]
            }

            if pos == 0:
                recalls[1] += 1
            if pos <= 5:
                recalls[5] += 1
            if pos <= 10:
                recalls[10] += 1
            
            median_ranks.append(pos)

        for k in recalls:
            recalls[k] = recalls[k] / n 

        median = np.median(median_ranks)
        for k in recalls:
            global_recall[k] += recalls[k]
        
        global_ranks.append(median)

    for k in global_recall:
        global_recall[k] = global_recall[k] / 10

    print(f"MEAN MEDIAN: {np.average(global_ranks)}")
    print(f"RECALL: {global_recall}")

    return np.average(global_ranks), global_recall, closest_image_dict

In [5]:
dataset, dataloader = load_test_data()

model_args = construct_model_args()
checkpoints_dir = '/home/ubuntu/cooking-cross-modal-retrieval/image-to-recipe-transformers-main/checkpoints/r50_ssl'
store_dict = {}

total data: 1029720
10000
0


In [6]:
image_ids, image_features, recipe_features = obtain_features(
    model_args, 
    dataset,
    dataloader,
    checkpoints_dir,
    store_dict,
    device
)

task_type = 'im2recipe'
average_median, global_recalls, closest_image_dict = run_evaluation(task_type, image_features, recipe_features, image_ids, medr_N=1000)



recipe encoder 39998976
image encoder 25606208


0it [00:00, ?it/s]

COMPLETED EXTRACTING FEATURES: FEATURE SIZE: (), ()


ValueError: Sample larger than population or is negative

In [None]:
print(json.dumps(closest_image_dict, indent=4))

In [9]:
import cv2
from matplotlib import pyplot as plt

def create_data():
    image_map = {}
    layer1 = json.load(open('/home/ubuntu/recipe-dataset/layer1.json', 'r'))
    layer2 = json.load(open('/home/ubuntu/recipe-dataset/layer2.json', 'r'))

    for sample in tqdm(layer1):
        count += 1
        if sample['partition'] == 'test':
            image_map[sample['id']] = {
                'title': sample['title'],
                'instructions': [a['text'] for a in sample['instructions']],
                'ingredients': [a['text'] for a in sample['ingredients']],
                'partition': sample['partition'],
                'image_path': layer2[sample['id']]['images']['id']
            }
    
    print(count)
    json.dump(image_map, open('/home/ubuntu/recipe-dataset/test-image-map.json', 'w'))

def load_image(image_id):
    data_root_folder = '/home/ubuntu/recipe-dataset/test'
    image_path = os.path.join(data_root_folder, image_id[0], image_id[1], image_id[2], image_id[3], image_id+'.jpg')
    image = cv2.imread(image_path)
    return image

print(f"STARTED JSON READING")
title_json = json.load(open('/home/ubuntu/recipe-dataset/title-map.json', 'r'))
print(f"DONE JSON READING")

def get_title(image_id):
    return title_json[image_id]

def visualize_results(image_dict, base_image):

    selected_image_ids = image_dict[base_image]['top_10_images'][:5]
    
    fig = plt.figure(figsize=(25, 8))

    rows, cols = 1, 6

    fig.add_subplot(rows, cols, 1)
    image = load_image(base_image)
    plt.imshow(image)
    plt.axis('off')
    plt.title(get_title(base_image))


    for i, image_id in selected_image_ids:
        fig.add_subplot(rows, cols, i+2)
        image = load_image(image_id)
        plt.imshow(image)
        plt.axis('off')
        plt.title(get_title(image_id))

STARTED JSON READING
DONE JSON READING


In [10]:
create_data()

In [None]:
visualize_results(closest_image_dict, '05096fc3ad')

In [None]:
count = 0

for root_dir, cur_dir, files in os.walk('/home/ubuntu/recipe-dataset/test'):
    print(files)
    count += len(files)

In [None]:
count

In [3]:
import pickle

with open('/home/ubuntu/recipe-dataset/img_name.pkl', 'rb') as file:
    image_file_names = pickle.load(file)

print(len(image_file_names))
print(image_file_names)

51334
['3e233001e2.jpg' '1657f23729.jpg' '3020f58577.jpg' ... 'dd7cdc4b69.jpg'
 '807de6023c.jpg' 'dd05312b3b.jpg']


In [None]:
import pickle

with open('/home/ubuntu/recipe-dataset/traindata/test.pkl', 'rb') as file:
    image_file_names = pickle.load(file)