#### Imports 

In [1]:
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import torchvision.models as models
from PIL import Image
import pandas as pd
import numpy as np
import logging
import torch

In [2]:
# Notebook-wide logger that echoes INFO-level messages to the cell output.
# The handler is attached only when none is present, so re-running this cell
# does not stack StreamHandlers and print every message several times.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
if not logger.handlers:
    logger.addHandler(logging.StreamHandler())

In [3]:
# Record the installed torch version in the cell output for reproducibility.
logger.info(f'Using torch: {torch.__version__}')

Using torch: 1.4.0


#### Essentials

In [4]:
# Root folder of the dataset. It must end with a slash: every path below (and
# the dataset class) is built by plain string concatenation onto it.
DATASET_ROOT = './data/'
# Checkpoint holding the trained encoder weights.
MODEL_PATH = f'{DATASET_ROOT}models/multi-modal-epoch-9.pth.tar'
# Caption embeddings of the split to encode, relative to DATASET_ROOT.
CAPTIONS_TEST_SET_PATH = 'embeddings/caption_embeddings.csv'

EMBEDDING_DIMENSIONALITY = 1000 # number of CNN outputs = dimensionality of the captions word2vec model
BATCH_SIZE = 4  # images per forward pass
WORKERS = 4  # DataLoader worker processes
GPU = 0  # index of the CUDA device to run on

# Destination of the computed image embeddings (no doubled slash after the root).
IMAGE_EMBEDDING_PATH = f'{DATASET_ROOT}embeddings/image_embeddings.csv'

#### Load model

In [5]:
class Model(torch.nn.Module):
    """Image encoder: a ResNet-50 whose last layer emits the embedding vector.

    Args:
        embedding_dimensionality: Number of outputs produced per image.
        pretrained: When True (the default, matching the original behaviour)
            the network starts from torchvision's ImageNet weights. Pass False
            to skip that download, e.g. when a checkpoint is loaded right
            after construction.
    """

    # Output width of the classifier in torchvision's stock ImageNet ResNet-50.
    _IMAGENET_CLASSES = 1000

    def __init__(self, embedding_dimensionality, pretrained=True):
        super(Model, self).__init__()
        if pretrained and embedding_dimensionality != self._IMAGENET_CLASSES:
            # torchvision loads the pretrained weights strictly, so requesting
            # another head size together with pretrained=True fails on the
            # `fc` shape mismatch. Load the stock network and swap the head
            # for a freshly initialised layer of the requested width instead.
            self.cnn = models.resnet50(pretrained=True)
            self.cnn.fc = torch.nn.Linear(self.cnn.fc.in_features,
                                          embedding_dimensionality)
        else:
            self.cnn = models.resnet50(pretrained=pretrained,
                                       num_classes=embedding_dimensionality)

    def forward(self, image):
        """Return the CNN output for a batch of preprocessed images."""
        return self.cnn(image)

In [6]:
# Build the encoder on the target GPU and wrap it in DataParallel before
# loading: the strict load below is performed on the wrapped model, so the
# checkpoint keys have to match the wrapped module's key names.
model_encoder = Model(EMBEDDING_DIMENSIONALITY).cuda(GPU)
model_encoder = torch.nn.DataParallel(model_encoder, device_ids=[GPU])
# map_location places the stored tensors on the configured device no matter
# which device the checkpoint was saved from.
# NOTE(review): torch.load unpickles the file - only load trusted checkpoints.
state_dict = torch.load(MODEL_PATH, map_location=f'cuda:{GPU}')
model_encoder.load_state_dict(state_dict, strict=True)

<All keys matched successfully>

#### Define and load datasets

In [7]:
class Dataset(torch.utils.data.Dataset):
    """Images paired with their caption embeddings.

    The split file is read as one record per line, `<image id>\\t<v1,v2,...>`.
    Images are read from `<root_dir>images/<image id>.jpg`.

    The base class is spelled out in full because this class reuses the name
    `Dataset`; inheriting from the bare name would make a re-run of this cell
    subclass the previous definition instead of the torch base class.

    Args:
        root_dir: Dataset root; paths are built by concatenation, so it must
            end with a path separator.
        split: Path of the embeddings file, relative to `root_dir`.
        embedding_dimensionality: Number of values kept from each vector.
        augment: When True, apply the random flip + random crop pipeline.
            The default is a deterministic centre crop so that encoding the
            same image twice yields the same tensor.
    """

    def __init__(self, root_dir, split, embedding_dimensionality, augment=False):
        self.root_dir = root_dir
        self.split = split
        self.embedding_dimensionality = embedding_dimensionality

        if augment:
            spatial = [transforms.RandomHorizontalFlip(), transforms.RandomCrop(224)]
        else:
            spatial = [transforms.CenterCrop(224)]
        self.preprocess = transforms.Compose(spatial + [
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])])
        logger.info(f'Loading data from {split}')

        self.img_ids, self.captions_embeddings = self._read_embeddings(
            f'{root_dir}{split}', embedding_dimensionality)

        # shape[1:] equals the shape of a single row but, unlike indexing
        # row 0, it is also defined when the split is empty.
        logger.info(f'Caption embedding shape = {self.captions_embeddings.shape[1:]}')
        logger.info('Data loading done.')

    @staticmethod
    def _read_embeddings(path, embedding_dimensionality):
        """Parse an embeddings file in a single pass.

        Returns:
            (img_ids, embeddings): a bytes array of ids, sized to the longest
            id so none is truncated, and a float32 array of shape
            (number of records, embedding_dimensionality).

        Raises:
            ValueError: if a non-blank line does not have exactly two
                tab-separated fields, holds fewer than
                `embedding_dimensionality` values, or holds a non-numeric
                value.
        """
        ids = []
        vectors = []
        with open(path, 'r') as f:
            for line_no, row in enumerate(f, start=1):
                if not row.strip():
                    continue  # tolerate blank lines, e.g. a trailing newline
                fields = row.rstrip('\r\n').split('\t')
                if len(fields) != 2:
                    raise ValueError(
                        f'{path}:{line_no}: expected "<id>\\t<vector>", '
                        f'got {len(fields)} tab-separated field(s)')
                uid, vec = fields
                values = vec.strip().split(',')
                if len(values) < embedding_dimensionality:
                    raise ValueError(
                        f'{path}:{line_no}: vector has {len(values)} values, '
                        f'expected at least {embedding_dimensionality}')
                ids.append(uid.encode('utf-8'))
                # Only the first `embedding_dimensionality` values are kept.
                vectors.append(np.array(
                    [float(v) for v in values[:embedding_dimensionality]],
                    dtype=np.float32))

        img_ids = np.array(ids, dtype='S')
        if vectors:
            embeddings = np.stack(vectors)
        else:
            embeddings = np.zeros((0, embedding_dimensionality), dtype=np.float32)
        return img_ids, embeddings

    def __len__(self):
        """Number of records read from the split file."""
        return len(self.img_ids)

    def __getitem__(self, idx):
        """Return `(image id, preprocessed image tensor, caption embedding)`."""
        img_id = self.img_ids[idx].decode('utf-8')
        # The context manager closes the file handle as soon as the pixels
        # have been converted instead of leaving it to garbage collection.
        with Image.open(f'{self.root_dir}images/{img_id}.jpg') as raw:
            img = raw.convert('RGB')
        img_tensor = self.preprocess(img)
        target_tensor = torch.from_numpy(self.captions_embeddings[idx, :])
        return img_id, img_tensor, target_tensor

In [8]:
# Dataset over the caption-embeddings split, served in its stored order
# (no shuffling) in fixed-size batches staged in pinned host memory.
test_dataset = Dataset(
    root_dir=DATASET_ROOT,
    split=CAPTIONS_TEST_SET_PATH,
    embedding_dimensionality=EMBEDDING_DIMENSIONALITY,
)
loader_options = dict(batch_size=BATCH_SIZE, shuffle=False,
                      num_workers=WORKERS, pin_memory=True)
test_loader = DataLoader(test_dataset, **loader_options)

Loading data from embeddings/caption_embeddings.csv
Caption embedding shape = (1000,)
Data loading done.


#### Encode images

In [9]:
# Encode every image of the split and collect the vectors keyed by image id.
# Inference only: eval mode, and no autograd graph is recorded.
model_encoder.eval()
image_embeddings = {}
with torch.no_grad():
    for img_ids, image, _ in test_loader:
        # Tensors are passed to the model directly (the Variable wrapper is
        # deprecated). The loader pins host memory, which allows the
        # asynchronous host-to-device copy requested here.
        image = image.cuda(GPU, non_blocking=True)
        outputs = model_encoder(image)

        # One device-to-host transfer per batch rather than one per sample.
        batch_vectors = outputs.cpu().tolist()
        for img_id, image_embedding in zip(img_ids, batch_vectors):
            image_embeddings[img_id] = image_embedding

In [10]:
# Number of distinct image ids encoded. The dict is keyed by image id, so an
# id that occurs more than once in the split is counted a single time.
len(image_embeddings)

20

In [11]:
# Two-column table: one row per encoded image, with the id next to its vector.
df = pd.DataFrame({
    'id': list(image_embeddings.keys()),
    'image_vec': list(image_embeddings.values()),
})

In [12]:
# Preview the first rows (head() shows five rows by default).
df.head()

Unnamed: 0,id,image_vec
0,1481007530145672379,"[0.16084320843219757, -0.033384770154953, 0.23..."
1,1490659882930594965,"[0.16006359457969666, -0.03246902674436569, 0...."
2,1481097035704453947,"[0.15905271470546722, -0.03431205824017525, 0...."
3,1489658491986857252,"[0.1590772420167923, -0.034317757934331894, 0...."
4,1487676925685022333,"[0.15962550044059753, -0.03415822237730026, 0...."


In [13]:
# Write one tab-separated row per image, without index column or header line.
# NOTE(review): `image_vec` holds Python lists, so each vector is presumably
# written as its bracketed list repr ("[0.16, -0.03, ...]") rather than the
# bare comma-separated format parsed for the caption embeddings - confirm
# that the downstream reader expects this.
df.to_csv(IMAGE_EMBEDDING_PATH, sep='\t', index=False, header=False)