In [1]:
# General Dependencies

import os
import math
import numpy as np
import torch


In [2]:
"""
Run from Colab or local
"""

# Choose project/data paths depending on whether the notebook runs on Google Colab
# or on a local machine.
try:
    from google.colab import drive
except ImportError:
    # Not on Colab: use paths relative to the current working directory.
    ROOT_PATH = os.curdir
    DATA_PATH = "../Datasets"
else:
    # On Colab: mount Google Drive. A failed mount is deliberately NOT swallowed,
    # because silently falling back to the local paths would only surface later
    # as a confusing "dataset not found" error.
    drive.mount('/content/gdrive')
    ROOT_PATH = "/content/gdrive/MyDrive/502-796-Projects/502/deneme-1"
    DATA_PATH = os.path.join(ROOT_PATH, "Datasets")

    # Change into the project folder via its absolute path so that re-running this
    # cell keeps working (a relative path only resolves from the initial directory).
    os.chdir(ROOT_PATH)
    print(os.getcwd())




Mounted at /content/gdrive
/content/gdrive/MyDrive/502-796-Projects/502/deneme-1


In [3]:
# Source dependencies
from data.dataset import FSSDataset
from common.vis import Visualizer
from common.evaluation import Evaluator

## Prepare PASCAL-$5^i$ Dataset

In [4]:
# STEP 1: Download PASCAL VOC2012 devkit (train/val data): (uncomment lines below)
# ------------------------------------------------------------------------------
#!wget http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar
#!tar -xvf 'VOCtrainval_11-May-2012.tar' -C ./Datasets/ 

# (or instead of wget, use directly the link to download)
# STEP 2: Place "VOC2012" folder from downloaded "VOCdevkit" folder under a "Datasets" folder.
# ------------------------------------------------------------------------------
#!mv Datasets/VOCdevkit/VOC2012 Datasets/VOC2012

# STEP 3: Download extended annotations from here
#!wget https://drive.google.com/file/d/10zxG2VExoEZUeyQl_uXga2OWHjGeZaf2/view

# STEP 4: Extract the downloaded extended annotations and put them under "Datasets/VOC2012/"


In [5]:
# Dataset initialization: configure FSSDataset once, then build the PASCAL
# train and validation loaders (fold 0, batch size 4, one worker).
# TODO: the paper uses an image size of 473; img_size=128 is used here.
# TODO: the paper also applies data augmentation.
# TODO: add disclaimer to dataset files https://github.com/juhongm999/hsnet/blob/main/train.py
FSSDataset.initialize(datapath=DATA_PATH, img_size=128, use_original_imgsize=False)
dataloader_trn, dataloader_val = (
    FSSDataset.build_dataloader(benchmark='pascal', bsz=4, nworker=1, fold=0, split=split_name)
    for split_name in ('trn', 'val')
)

Total (trn) images are : 11394
Total (val) images are : 346


## Visualization trial

In [6]:
# Initialise the project's Visualizer and Evaluator helpers.
# NOTE(review): the boolean presumably switches visualisation on (the disabled snippet
# in the next cell checks `Visualizer.visualize`) — confirm in common/vis.py.
Visualizer.initialize(True)
Evaluator.initialize()

In [None]:
# Disabled snippet: the whole cell body is a bare string literal, so none of the
# code inside it is executed. It is kept as a template for visualising predictions.
""" Code below visualizes prediction (to be used in testing)

for idx, batch in enumerate(dataloader_trn):
        # 1. Hypercorrelation Squeeze Networks forward pass
        #batch = utils.to_cuda(batch)
        #pred_mask = model.module.predict_mask_nshot(batch, nshot=nshot)

        #assert pred_mask.size() == batch['query_mask'].size()
        pred_mask = batch['query_mask']

        # 2. Evaluate prediction
        area_inter, area_union = Evaluator.classify_prediction(pred_mask.clone(), batch)
        
        #average_meter.update(area_inter, area_union, batch['class_id'], loss=None)
        #average_meter.write_process(idx, len(dataloader), epoch=-1, write_batch_idx=1)

        # Visualize predictions
        if Visualizer.visualize:
            Visualizer.visualize_prediction_batch(batch['support_imgs'], batch['support_masks'],
                                                  batch['query_img'], batch['query_mask'],
                                                  pred_mask, batch['class_id'], idx,
                                                  area_inter[1].float() / area_union[1].float())

        break   # TODO: delete this break to run visualization for full dataset
"""

## Extract features

## Initial Agent Tokens

In [17]:
from scipy.spatial.distance import cdist

# num_tokens is K in the algorithm 1 (not the K of K-shot images)
# Please refer to Algorithm 1 given in Supplementary Material
# to match the notation of variables in the comments.
# We assume X, the locations of foreground pixels, gives the locations of
# f_s, i.e. the foreground support pixel features.


# Shapes:
# X --> list with len(X) = batchsize; X[i] has shape [num_foreground_pixels_i, 2] (the number of foreground pixels differs per image, hence a list)
# L --> list with len(L) = batchsize; L[i] has shape [num_background_pixels_i, 2]; the last dimension is the (row, col) location
# f_s --> [batchsize, c, h, w]
# "h, w denote the height, width of the feature map." (Supplementary Material)
def init_agent_tokens(num_tokens, X, L, f_s):
    """Initialise agent tokens by farthest-point sampling over foreground pixels.

    Implements Algorithm 1 of the Supplementary Material. For every image, the
    set of "visited" locations starts as the background locations. At each of
    the ``num_tokens`` steps the foreground location whose distance to its
    nearest visited location is largest is selected, the support feature at
    that location becomes the next token, and the location is added to the
    visited set so that the following picks spread over the foreground.

    Args:
        num_tokens: Number of tokens per image (K in Algorithm 1, not the K of
            K-shot).
        X: Sequence of length batchsize; ``X[i]`` is a tensor of shape
            ``[num_foreground_pixels_i, 2]`` holding (row, col) locations.
        L: Sequence of length batchsize; ``L[i]`` is a tensor of shape
            ``[num_background_pixels_i, 2]`` holding (row, col) locations.
        f_s: Support features, indexed here as ``f_s[i, :, row, col]``, i.e.
            shape ``[batchsize, c, h, w]``.

    Returns:
        Tensor of shape ``[batchsize, num_tokens, c]`` with the dtype and
        device of ``f_s``. Images without any foreground pixel keep all-zero
        tokens. When ``num_tokens`` exceeds the number of foreground pixels,
        locations are necessarily repeated.
    """
    tokens = f_s.new_zeros((len(X), num_tokens, f_s.shape[1]))

    # TODO: can we compute this jointly for all images in a batch?
    for i in range(len(X)):
        fg_locs = X[i]
        if fg_locs.shape[0] == 0:
            # Nothing to sample from; leave the zero tokens for this image.
            continue

        # Squared distances on float64 are exact for integer pixel coordinates
        # and preserve the ordering of Euclidean distances.
        fg = fg_locs.to(torch.float64)
        bg = L[i].to(device=fg.device, dtype=torch.float64)

        # d_x[j]: squared distance from foreground pixel j to its nearest
        # visited location (line 3 of Algorithm 1).
        if bg.shape[0] > 0:
            sq_dists = (fg.unsqueeze(1) - bg.unsqueeze(0)).pow(2).sum(dim=-1)
            d_x, _ = sq_dists.min(dim=1)
        else:
            # No background at all: every foreground pixel is equally "far".
            d_x = fg.new_full((fg.shape[0],), float('inf'))

        for k in range(num_tokens):
            # p*: the foreground pixel furthest from everything visited so far.
            p_ind = int(torch.argmax(d_x))
            row, col = fg_locs[p_ind].long().tolist()
            tokens[i, k, :] = f_s[i, :, row, col]

            # L = L U {p*} (line 5 of Algorithm 1): only the distances to the
            # newly added location can lower the running minimum.
            d_x = torch.minimum(d_x, (fg - fg[p_ind]).pow(2).sum(dim=-1))

    return tokens
    
    

## Initial Agent Tokens test (to be removed)

In [11]:
from model.featureextractor import FeatureExtractor

# Instantiate the project's feature extractor.
# NOTE(review): the recorded output of this cell shows ResNet-50 weights being
# downloaded on construction, so the first run presumably needs network access.
Feat = FeatureExtractor()


Downloading: "https://download.pytorch.org/models/resnet50-11ad3fa6.pth" to /root/.cache/torch/hub/checkpoints/resnet50-11ad3fa6.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 176MB/s]


In [18]:
# Shape notes kept from the original cell (batch size 4, 16x16 feature map):
#     tmp_supp_feat  torch.Size([4, 2048, 16, 16])
#     tmp_mask       torch.Size([4, 1, 16, 16])
#     tmp_supp       torch.Size([4, 2048, 16, 16])

num_tokens = 15  # TODO: make it a hyperparameter

# Smoke test: run the feature extractor and the agent-token initialisation on the
# first training batch only.
for idx, batch in enumerate(dataloader_trn):

    query_img = batch['query_img']
    supp_imgs = batch['support_imgs']
    supp_masks = batch['support_masks']

    F_Q, F_S, s_mask_list, f_s, M_s = Feat(query_img, supp_imgs, supp_masks)

    # Collect, per sample, the (row, col) locations of foreground and background
    # pixels of the feature-resolution support mask.
    # TODO: can we get rid of for loop?
    X = []
    L = []
    for m in M_s:  # M_s has shape (batchsize, 1, 16, 16)
        m = m.squeeze(0)  # Get a single mask, shape (16, 16)

        # torch.nonzero returns an int64 tensor of shape [num_pixels, 2] in
        # row-major order, so no numpy round-trip is needed.
        foreground_pix = torch.nonzero(m == 1., as_tuple=False)
        background_pix = torch.nonzero(m == 0., as_tuple=False)

        X.append(foreground_pix)
        L.append(background_pix)

    # tokens has shape [batchsize, num_tokens, c]
    tokens = init_agent_tokens(num_tokens, X, L, f_s)
    break

In [19]:
# Sanity check: the expected shape is [batchsize, num_tokens, c], where c is the
# channel dimension of the support features ([batchsize, c, h, w]).
print(tokens.shape)

torch.Size([4, 15, 2048])
