In [1]:
# imagenet_path = "/datasets01_ontap/imagenet_full_size/061417/"
# kinetics_path = "/datasets01_ontap/kinetics/070618/"
# sunrgbd_path = "/data/home/yosuamichael/datasets/SUN_RGBD"

imagenet_path = "/Users/yosuamichael/Downloads/datasets/mini_omnivore/mini_imagenet"
kinetics_path = "/Users/yosuamichael/Downloads/datasets/mini_omnivore/mini_kinetics"
sunrgbd_path = "/Users/yosuamichael/Downloads/datasets/SUN_RGBD"

In [2]:
import os
import PIL
import torch
import torchvision
import shutil
import collections
import datetime
import torchvision.transforms as T
import json
import numpy as np

import image_presets
import video_presets

from torchvision.datasets.vision import VisionDataset
from torchvision.transforms.functional import InterpolationMode
from pathlib import Path



  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def lprint(*x):
    print(f"[{datetime.datetime.now()}]", *x)
    
lprint("HELLO")

[2022-06-15 14:19:27.231036] HELLO


# Create sunrgbd dataset

In [4]:
# !pip install opencv-contrib-python

In [5]:
# def read_sunrgbd_image(image_path):
#     rgb_dir = os.path.join(image_path, "image")
#     rgb_path = os.path.join(rgb_dir, os.listdir(rgb_dir)[0])
#     img_rgb = PIL.Image.open(rgb_path)
#     arr_rgb = np.asarray(img_rgb)
    
#     # Using depth_bfx, but maybe can also consider just using depth
#     depth_dir = os.path.join(image_path, "depth_bfx")
#     depth_path = os.path.join(depth_dir, os.listdir(depth_dir)[0])
#     img_d = PIL.Image.open(depth_path)
#     if img_d.mode == "I":
#         arr_d = (np.asarray(img_d) * 255.99999 / 2**16).astype(np.uint8)
    
#     arr_rgbd = np.dstack((arr_rgb, arr_d))
#     return arr_rgbd

    

In [6]:
# arr_rgbd = read_sunrgbd_image(f"{sunrgbd_path}/kv2/kinect2data/000065_2014-05-16_20-14-38_260595134347_rgbf000121-resize")
# arr_rgbd[0,0]

In [7]:
# sunrgbd_classes_index = {
#     "0": "bathroom", "1": "bedroom", "2": "classroom", "3": "computer_room", "4": "conference_room", "5": "corridor",
#     "6": "dining_area", "7": "dining_room", "8": "discussion_area", "9": "furniture_store", "10": "home_office",
#     "11": "kitchen", "12": "lab", "13": "lecture_theatre", "14": "library", "15": "living_room", "16": "office",
#     "17": "rest_space", "18": "study_space"
# }

# sunrgbd_classes_set = set(sunrgbd_classes_index.values())
# sunrgbd_classes_set

In [8]:
class OmnivoreSunRgbdDatasets(VisionDataset):
    def __init__(self, root, transform = None, target_transform = None, split="train"):
        super().__init__(root, transform=transform, target_transform=target_transform)
        self._data_dir = Path(self.root) / "SUNRGBD"
        self._meta_dir = Path(self.root) / "SUNRGBDtoolbox"
        
        if not self._check_exists():
            print(f"data_dir: {self._data_dir}\nmeta_dir: {self._meta_dir}")
            raise RuntimeError("Dataset not found.")
            
        self.classes = ['bathroom',
             'bedroom',
             'classroom',
             'computer_room',
             'conference_room',
             'corridor',
             'dining_area',
             'dining_room',
             'discussion_area',
             'furniture_store',
             'home_office',
             'kitchen',
             'lab',
             'lecture_theatre',
             'library',
             'living_room',
             'office',
             'rest_space',
             'study_space'
        ]
        self.class_to_idx = dict(zip(self.classes, range(len(self.classes))))
        
        # TODO: Need to change later!
        # Currently the file "sunrgbd_trainval_path.json" is manually created with a script
        # We should create this file from script that is downloaded!
        with open(Path(self.root) / "sunrgbd_trainval_path.json", "r") as fin:
            self.trainval_image_dir_map = json.load(fin)
            
        self.image_dirs = [key for key, value in self.trainval_image_dir_map.items() if value == split]
        
        
    def _check_exists(self):
        return self._data_dir.is_dir() and self._meta_dir.is_dir()
    
    def __len__(self):
        return len(self.image_dirs)

    def _read_sunrgbd_image(self, image_dir):
        rgb_dir = os.path.join(image_dir, "image")
        rgb_path = os.path.join(rgb_dir, os.listdir(rgb_dir)[0])
        img_rgb = PIL.Image.open(rgb_path)
        arr_rgb = np.asarray(img_rgb)

        # Using depth_bfx, but maybe can also consider just using depth
        depth_dir = os.path.join(image_dir, "depth_bfx")
        depth_path = os.path.join(depth_dir, os.listdir(depth_dir)[0])
        img_d = PIL.Image.open(depth_path)
        if img_d.mode == "I":
            arr_d = (np.asarray(img_d) * 255.99999 / 2**16).astype(np.uint8)

        arr_rgbd = np.dstack((arr_rgb, arr_d))
        return arr_rgbd
    
    def _get_sunrgbd_scene_class(self, image_dir):
        with open(os.path.join(image_dir, "scene.txt"), "r") as fin:
            scene_class = fin.read().strip()
        return scene_class
    
    def __getitem__(self, idx):
        # return tuple of image (H W C==4) and scene class index
        image_dir = self.image_dirs[idx]
        x_rgbd = torch.tensor(self._read_sunrgbd_image(image_dir), dtype=torch.uint8)
        x_rgbd = x_rgbd.permute(2, 0, 1) # H W C -> C H W
        scene_class = self._get_sunrgbd_scene_class(image_dir)
        scene_idx = self.class_to_idx[scene_class]
        
        if self.transform:
            x_rgbd = self.transform(x_rgbd)
            
        if self.target_transform:
            scene_idx = self.target_transform(scene_idx)
            
        return x_rgbd, scene_idx
        
        

In [9]:
sunrgbd_train = OmnivoreSunRgbdDatasets(root=sunrgbd_path, split="train")
sunrgbd_val = OmnivoreSunRgbdDatasets(root=sunrgbd_path, split="val")

In [10]:
len(sunrgbd_val)

4659

In [11]:
sunrgbd_train[0][0].shape, sunrgbd_train[0][1]

(torch.Size([4, 530, 730]), 7)

In [12]:
dl = torch.utils.data.DataLoader(sunrgbd_train)

len(iter(dl)), len(dl)

(4845, 4845)

# Create mini imagenet dataset

In [13]:
class OmnivoreImageFolder(torchvision.datasets.folder.ImageFolder):
    def __getitem__(self, idx):
        img, label = super().__getitem__(idx)
        return img, label

In [14]:
imagenet = OmnivoreImageFolder(f"{imagenet_path}/train", T.PILToTensor())

In [15]:
imagenet[0][0].shape

torch.Size([3, 250, 250])

In [16]:
len(imagenet.samples)

5000

In [17]:
label2path = {}
for sample in imagenet.samples:
    path, label = sample
    label2path[label] = label2path.get(label, []) + [path]
    

In [18]:
len(label2path)

1000

In [19]:
# new_imagenet_root = "/data/home/yosuamichael/datasets/mini_imagenet/"
# ori_imagenet_prefix = "/datasets01_ontap/imagenet_full_size/061417/"

In [20]:
# # Take 5 images for every label
# num_per_label = 5
# for label in label2path:
#     for i in range(num_per_label):
#         path = label2path[label][i]
#         new_path = os.path.join(new_imagenet_root, path[len(ori_imagenet_prefix):])
#         new_dir = os.path.dirname(new_path)
#         os.makedirs(new_dir, exist_ok=True)
#         shutil.copy(path, new_path)
#     print(f"Finish label: {label}")
    
# print("FINISH ALL")

In [21]:
# os.path.dirname("/data/home/yosuamichael/datasets/mini_imagenet/train/n01440764/n01440764_10026.JPEG")

In [22]:
mini_imagenet_train = OmnivoreImageFolder(f"{imagenet_path}/train", T.PILToTensor())
mini_imagenet_val = OmnivoreImageFolder(f"{imagenet_path}/val", T.PILToTensor())

In [23]:
len(mini_imagenet_train)

5000

# Create mini kinetics

In [24]:
# Prevent to do this because it is very slow!
# kinetics = torchvision.datasets.kinetics.Kinetics("/datasets01_ontap/kinetics/070618/train_avi-480p", frames_per_clip=32, frame_rate=16, _legacy=True)

In [25]:

# all_counter = 0

# mini_kinetics_folder = "/data/home/yosuamichael/datasets/mini_kinetics/"
# ori_kinetics_prefix = "/datasets01_ontap/kinetics/070618/"

# folder_counter = collections.Counter()
# num_file_per_folder = 3

# for root, folders, filenames in os.walk("/datasets01_ontap/kinetics/070618/val_avi-480p/"):
#     for filename in filenames:
#         ori_filepath = os.path.join(root, filename)
#         new_filepath = os.path.join(mini_kinetics_folder, ori_filepath[len(ori_kinetics_prefix):])
#         new_folder = os.path.dirname(new_filepath)
#         if folder_counter[new_folder] >= num_file_per_folder:
#             continue
#         os.makedirs(new_folder, exist_ok=True)
#         shutil.copy(ori_filepath, new_filepath)
#         folder_counter[new_folder] += 1
        
#         all_counter += 1
#         if all_counter % 100 == 0:
#             lprint(f"all_counter: {all_counter}")

# print("Finished!")

In [26]:
class OmnivoreKinetics(torchvision.datasets.kinetics.Kinetics):
    def __getitem__(self, idx):
        video, audio, label = super().__getitem__(idx)
        return video, label

In [27]:
# mini_kinetics_train = OmnivoreKinetics(
#     f"{kinetics_path}/train", 
#     frames_per_clip=32, frame_rate=32, step_between_clips=32, 
#     _legacy=True
# )

In [28]:
# lprint("Start")
# mini_kinetics_val = OmnivoreKinetics(
#     f"{kinetics_path}/val", 
#     frames_per_clip=32, frame_rate=32, step_between_clips=32, 
#     _legacy=True
# )

# lprint("End")

In [29]:
#len(mini_kinetics_train[5]), mini_kinetics_train[5][0].shape, mini_kinetics_train[5][1]

In [30]:
# mini_kinetics_train[0][0].shape, mini_kinetics_train[0][1], mini_kinetics_train[0][0].shape

# Create class to concat data loader

In [31]:
class ConcatIterable:
    def __init__(self, iterables, output_keys, repeat_factors, seed=42): 
        self.iterables = iterables
        self.output_keys = output_keys
        self.repeat_factors = repeat_factors
        self.seed = seed
        self.num_iterables = len(self.iterables)
        assert self.num_iterables == len(output_keys)
        assert self.num_iterables == len(repeat_factors)
        
        
        # The iterator len is adjusted with repeat_factors
        self.iterator_lens = [int(repeat_factors[i] * len(itb)) for i, itb in enumerate(self.iterables)]
        self.max_total_steps = sum(self.iterator_lens)
        self.indices = None
        self.iterators = None
        
        # self.step_counter == None indicate that self.indices are not yet initialized
        self.step_counter = None
        
    def init_indices(self, epoch=0, shuffle=False):
        # We should initiate indices for each epoch, especially if we want to shuffle
        self.step_counter = 0
    
        self.iterators = [iter(dl) for dl in self.iterables]
        self.indices = torch.cat([torch.ones(self.iterator_lens[i], dtype=torch.int32) * i for i in range(self.num_iterables)])
        assert self.max_total_steps == len(self.indices)
        
        if shuffle:
            g = torch.Generator()
            g.manual_seed(self.seed + epoch)
            shuffle_indices = torch.randperm(len(self.indices), generator=g)
            self.indices = self.indices[shuffle_indices]
            
    def __next__(self):
        if self.step_counter == None:
            # Initiate the indices without shuffle as default!
            self.init_indices()
        if self.step_counter >= self.max_total_steps:
            raise StopIteration
        
        idx = self.indices[self.step_counter]
        output_key = self.output_keys[idx]
        print(idx)
        try:
            batch = next(self.iterators[idx])
        except StopIteration:
            # We cycle over the data_loader to the beginning. This can happen when repeat_factor > 1
            # Take note that in this case we always use same shuffling from same data_loader in an epoch
            self.iterators[idx] = iter(self.iterables[idx])
            batch = next(self.iterators[idx])
        
        self.step_counter += 1
        # Return batch and output_key
        return batch, output_key
    
    def __len__(self):
        return self.max_total_steps
    
    def __iter__(self):
        return self
    
    
        

In [32]:
a = ConcatIterable( [[1], [1,2], [1,2,3]], ['a', 'b', 'c'], [2,1,1] )
a.indices

# a.init_indices(1, True)
# a.indices

for x in a:
    print(x)
    break

tensor(0, dtype=torch.int32)
(1, 'a')


In [33]:
# a = ConcatIterable([mini_imagenet_train, mini_kinetics_train], ['image', 'video'], [1,1])

# a.init_indices(epoch=0, shuffle=True)
# for batch, key in a:
#     print(key)
#     break
    
# print(len(a))

In [34]:
len(mini_imagenet_train) + len(mini_kinetics_train)

NameError: name 'mini_kinetics_train' is not defined

# Create dataset with augmentation

In [35]:
x = torch.randint(2, (1,))
if x:
    print("YES", x)
else:
    print("NO", x)

NO tensor([0])


In [36]:


train_crop_size = 224

imagenet_train_preset = image_presets.ImageNetClassificationPresetTrain(crop_size=train_crop_size, interpolation=InterpolationMode.BICUBIC,
                            auto_augment_policy="ra", random_erase_prob=0.25, )

In [37]:
mini_imagenet_train = OmnivoreImageFolder(f"{imagenet_path}/train", imagenet_train_preset)
mini_imagenet_train[0][0].shape  # C, H, W

torch.Size([1, 3, 224, 224])

In [38]:
train_resize_size = 256
train_crop_size = 224
video_train_preset = video_presets.VideoClassificationPresetTrain(crop_size=train_crop_size, resize_size=train_resize_size, )

mini_kinetics_train = OmnivoreKinetics(
    f"{kinetics_path}", 
    frames_per_clip=32, frame_rate=16, step_between_clips=16, 
    split="train", transform= video_train_preset
)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 75/75 [00:06<00:00, 10.87it/s]


In [39]:
mini_kinetics_train[1][0].shape  # C, D, H, W



torch.Size([3, 32, 224, 224])

In [40]:
# TODO: Create RandAugment3d, see: 
# - https://www.internalfb.com/code/fbsource/[f1a98f41bcce]/fbcode/deeplearning/projects/omnivore/vissl/data/ssl_transforms/rand_aug_3d.py
# - https://github.com/pytorch/vision/blob/main/torchvision/transforms/autoaugment.py#L287
# Basically do normal augmentation for those operation that involve geometry (shear, translate, etc)
# and only apply color operation to rgb without changing the depth
# __DONE__

In [41]:
import depth_presets

train_crop_size = 224

depth_train_preset = depth_presets.DepthClassificationPresetTrain(crop_size=train_crop_size, interpolation=InterpolationMode.NEAREST,
                            random_erase_prob=0.25, )

mini_sunrgbd_train = OmnivoreSunRgbdDatasets(root=sunrgbd_path, split="train", transform=depth_train_preset)

In [42]:
a = mini_sunrgbd_train[0][0]
a.shape # C, H, W

torch.Size([1, 4, 224, 224])

In [43]:
# Try concat data_loader

def get_single_data_loader_from_dataset(train_dataset): 
    batch_size = 8
    
    train_sampler = torch.utils.data.RandomSampler(train_dataset)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        sampler=train_sampler,
        pin_memory=True,
        collate_fn=torch.utils.data.dataloader.default_collate,
    )

    return train_data_loader

image_data_loader = get_single_data_loader_from_dataset(mini_imagenet_train)
video_data_loader = get_single_data_loader_from_dataset(mini_kinetics_train)
depth_data_loader = get_single_data_loader_from_dataset(mini_sunrgbd_train)


data_loader = ConcatIterable([image_data_loader, video_data_loader, depth_data_loader], ['image', 'video', 'depth'], [1,0,0])
data_loader.init_indices(epoch=0, shuffle=True)


In [44]:
x = next(data_loader)
x[0][0].shape, x[1]

tensor(0, dtype=torch.int32)


(torch.Size([8, 1, 3, 224, 224]), 'image')

In [45]:
# Video: B, C, D, H, W
# Depth: B, C, H, W
# Image: B, C, H, W

In [46]:
# norm_func = depth_presets.RGBToFloatAndDepthNorm(max_depth=75, clamp_max_before_scale=True)


In [47]:
# sums = []
# sumsqs = []
# pixels = []
# for_stds = []

# for i in range(len(sunrgbd_train)):
#     img = norm_func(sunrgbd_train[i][0])
#     sums.append(torch.sum(img.float(), dim=[1,2]))
#     sumsqs.append(torch.sum(img.float()**2, dim=[1,2]))
#     pixels.append(img.shape[1] * img.shape[2])
#     if i % 1000 == 0:
#         lprint(i)
   
# lprint("FINISHED")
    

In [48]:
# # Reduce the sums and sumsqs and pixels
# total_sums = torch.tensor([0., 0., 0., 0.])
# total_sumsqs = torch.tensor([0., 0., 0., 0.])
# total_pixels = 0

# for i in range(len(means)):
#     total_sums += sums[i]
#     total_sumsqs += sumsqs[i]
#     total_pixels += pixels[i]


In [49]:
# mean = total_sums / total_pixels

# variance = total_sumsqs / total_pixels - means ** 2

# stdev = variance ** 0.5

# print(f"mean: {mean}, stdev: {stdev}")

# mean: tensor([0.4975, 0.4648, 0.4412, 0.8049]), stdev: tensor([0.2781, 0.2873, 0.2910, 0.2116])

In [50]:
import torchmultimodal.models.omnivore as omnivore

In [54]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


In [55]:
m = omnivore.omnivore_swin_t()

print(count_parameters(m))

28945041
