tiny imagenet support added

acl21 · acl21 · commit fabd98929b06 · 2021-03-22T13:52:34.000+05:30
diff --git a/README.md b/README.md
@@ -13,10 +13,6 @@ The codebase currently only supports single-machine single-gpu training. We will
 
 Please see [`GETTING_STARTED`](docs/GETTING_STARTED.md) for brief installation instructions and basic usage examples.
 
-## Model Zoo
-
-We provide a large set of baseline results as proof of repository's efficiency. (coming soon)
-
 ## Active Learning Methods Supported
 * Uncertainty Sampling
   * Least Confidence
@@ -32,11 +28,16 @@ We provide a large set of baseline results as proof of repository's efficiency.
 
 
 ## Datasets Supported
-* CIFAR10
-* CIFAR100
-* MNIST
-* SVHN
-* TinyImageNet (coming soon)
+* [CIFAR10/100](https://www.cs.toronto.edu/~kriz/cifar.html)
+* [MNIST](http://yann.lecun.com/exdb/mnist/)
+* [SVHN](http://ufldl.stanford.edu/housenumbers/)
+* [TinyImageNet](https://www.kaggle.com/c/tiny-imagenet) (Download the zip file [here](http://cs231n.stanford.edu/tiny-imagenet-200.zip))
+
+
+## Model Zoo
+
+We provide a large set of baseline results as proof of the repository's efficiency. (coming soon)
+
 
 ## Citing this repository
 
diff --git a/pycls/core/config.py b/pycls/core/config.py
@@ -217,6 +217,7 @@
 # ---------------------------------------------------------------------------- #
 _C.DATASET = CN()
 _C.DATASET.NAME = None
+# For the Tiny ImageNet dataset, ROOT_DIR must be set to the dataset folder itself ("data/tiny-imagenet-200/"). For all other datasets, the outer "data" folder in which all datasets are stored is expected.
 _C.DATASET.ROOT_DIR = None
 # Specifies the proportion of data in train set that should be considered as the validation data
 _C.DATASET.VAL_RATIO = 0.1
diff --git a/pycls/datasets/data.py b/pycls/datasets/data.py
@@ -16,6 +16,7 @@
 from .utils import helpers
 import pycls.utils.logging as lu
 from pycls.datasets.sampler import IndexedSequentialSampler
+from pycls.datasets.tiny_imagenet import TinyImageNet
 
 logger = lu.get_logger(__name__)
 
@@ -148,7 +149,7 @@ def getDataset(self, save_dir, isTrain=True, isDownload=False):
         if self.dataset == "MNIST":
             mnist = datasets.MNIST(save_dir, train=isTrain, transform=preprocess_steps, download=isDownload)
             return mnist, len(mnist)
-        
+
         elif self.dataset == "CIFAR10":
             cifar10 = datasets.CIFAR10(save_dir, train=isTrain, transform=preprocess_steps, download=isDownload)
             return cifar10, len(cifar10)
@@ -159,11 +160,18 @@ def getDataset(self, save_dir, isTrain=True, isDownload=False):
 
         elif self.dataset == "SVHN":
             if isTrain:
-                svhn = SVHN(save_dir,split='train', transform=preprocess_steps, download=isDownload)
+                svhn = datasets.SVHN(save_dir, split='train', transform=preprocess_steps, download=isDownload)
             else:
-                svhn = SVHN(save_dir, split='test', transform=preprocess_steps, download=isDownload)
+                svhn = datasets.SVHN(save_dir, split='test', transform=preprocess_steps, download=isDownload)
             return svhn, len(svhn)
-        # TinyImageNet Implementation Needed
+
+        elif self.dataset == "TINYIMAGENET":
+            if isTrain:
+                tiny = TinyImageNet(save_dir, split='train', transform=preprocess_steps)
+            else:
+                tiny = TinyImageNet(save_dir, split='val', transform=preprocess_steps)
+            return tiny, len(tiny)
+
         else:
             print("Either the specified {} dataset is not added or there is no if condition in getDataset function of Data class".format(self.dataset))
             logger.info("Either the specified {} dataset is not added or there is no if condition in getDataset function of Data class".format(self.dataset))
@@ -200,7 +208,7 @@ def makeLUVSets(self, train_split_ratio, val_split_ratio, data, seed_id, save_di
 
         assert isinstance(train_split_ratio, float),"Train split ratio is of {} datatype instead of float".format(type(train_split_ratio))
         assert isinstance(val_split_ratio, float),"Val split ratio is of {} datatype instead of float".format(type(val_split_ratio))
-        assert self.dataset in ["MNIST","CIFAR10","CIFAR100", "SVHN"], "Sorry the dataset {} is not supported. Currently we support ['MNIST','CIFAR10', 'CIFAR100', 'SVHN']".format(self.dataset)
+        assert self.dataset in ["MNIST","CIFAR10","CIFAR100", "SVHN", "TINYIMAGENET"], "Sorry the dataset {} is not supported. Currently we support ['MNIST','CIFAR10', 'CIFAR100', 'SVHN', 'TINYIMAGENET']".format(self.dataset)
 
         lSet = []
         uSet = []
@@ -262,7 +270,7 @@ def makeTVSets(self, train_split_ratio, val_split_ratio, data, seed_id, save_dir
 
         assert isinstance(train_split_ratio, float),"Train split ratio is of {} datatype instead of float".format(type(train_split_ratio))
         assert isinstance(val_split_ratio, float),"Val split ratio is of {} datatype instead of float".format(type(val_split_ratio))
-        assert self.dataset in ["MNIST","CIFAR10","CIFAR100", "SVHN"], "Sorry the dataset {} is not supported. Currently we support ['MNIST','CIFAR10', 'CIFAR100', 'SVHN']".format(self.dataset)
+        assert self.dataset in ["MNIST","CIFAR10","CIFAR100", "SVHN", "TINYIMAGENET"], "Sorry the dataset {} is not supported. Currently we support ['MNIST','CIFAR10', 'CIFAR100', 'SVHN', 'TINYIMAGENET']".format(self.dataset)
 
         trainSet = []
         valSet = []
@@ -377,7 +385,7 @@ def getTestLoader(self, data, test_batch_size, seed_id=0):
         torch.manual_seed(seed_id)
         np.random.seed(seed_id)
 
-        if self.dataset in ["MNIST","CIFAR10","CIFAR100"]:
+        if self.dataset in ["MNIST","CIFAR10","CIFAR100", "TINYIMAGENET"]:
             n_datapts = len(data)
             idx = [i for i in range(n_datapts)]
             #np.random.shuffle(idx)
diff --git a/pycls/datasets/tiny_imagenet.py b/pycls/datasets/tiny_imagenet.py
@@ -0,0 +1,104 @@
+import os
+import numpy as np
+
+import torch
+import torchvision.datasets as datasets
+
+from typing import Any
+
+
+class TinyImageNet(datasets.ImageFolder):
+    """Tiny ImageNet classification dataset.
+
+    Expects ``root`` to contain ``words.txt``, a ``train`` folder with one
+    sub-folder per WordNet ID, and a ``val`` folder holding ``images/`` and
+    ``val_annotations.txt``.
+
+    Args:
+        root (string): Root directory of the Tiny ImageNet dataset.
+        split (string, optional): The dataset split, supports ``train``, or ``val``.
+        transform (callable, optional): A function/transform that takes in a PIL image
+            and returns a transformed version. E.g, ``transforms.RandomCrop``
+        target_transform (callable, optional): A function/transform that takes in the
+            target and transforms it.
+        loader (callable, optional): A function to load an image given its path.
+
+    Attributes:
+        classes (list): List of the class name tuples.
+        class_to_idx (dict): Dict with items (class_name, class_index).
+        wnids (list): List of the WordNet IDs.
+        wnid_to_idx (dict): Dict with items (wordnet_id, class_index).
+        samples (list): List of (image path, class_index) tuples
+        targets (list): The class_index value for each image in the dataset
+    """
+    def __init__(self, root: str, split: str = 'train', **kwargs: Any) -> None:
+        self.root = root
+        assert self.check_root(), "Something is wrong with the Tiny ImageNet dataset. Download the official dataset zip from http://cs231n.stanford.edu/tiny-imagenet-200.zip and unzip it inside {}.".format(self.root)
+        self.split = datasets.utils.verify_str_arg(split, "split", ("train", "val"))
+
+        wnid_to_classes = self.load_wnid_to_classes()
+
+        # Classes are always discovered from the train folder (see split_folder)
+        super(TinyImageNet, self).__init__(self.split_folder, **kwargs)
+        self.wnids = self.classes
+        self.wnid_to_idx = self.class_to_idx
+        # Split each words.txt entry on ', ' so that classes holds tuples of names;
+        # iterating a plain string below would map single characters to indices.
+        self.classes = [tuple(wnid_to_classes[wnid].split(', ')) for wnid in self.wnids]
+        self.class_to_idx = {cls: idx
+                             for idx, clss in enumerate(self.classes)
+                             for cls in clss}
+        # Tiny ImageNet val directory structure is not similar to that of train's
+        # So a custom loading function is necessary
+        if self.split == 'val':
+            # The super call left self.root at the train folder; restore it first
+            self.root = root
+            self.imgs, targets = self.load_val_data()
+            # Overwrite the train labels set by ImageFolder with the val labels
+            self.targets = targets.tolist()
+            self.samples = list(zip(self.imgs, self.targets))
+            self.root = os.path.join(self.root, 'val')
+
+    # Split folder is used for the 'super' call. Since val directory is not structured like the train,
+    # we simply use train's structure to get all classes and other stuff
+    @property
+    def split_folder(self) -> str:
+        return os.path.join(self.root, 'train')
+
+    def load_val_data(self):
+        """Read ``val/val_annotations.txt``.
+
+        Returns:
+            tuple: ``(imgs, targets)`` - a list of image paths under ``val/images``
+            and a numpy array with the class index of each of those images.
+        """
+        imgs, targets = [], []
+        with open(os.path.join(self.root, 'val', 'val_annotations.txt'), 'r') as file:
+            for line in file:
+                fields = line.strip().split('\t')
+                # Skip blank/malformed lines and WordNet IDs absent from train
+                if len(fields) < 2 or fields[1] not in self.wnid_to_idx:
+                    continue
+                img_file, wnid = fields[:2]
+                imgs.append(os.path.join(self.root, 'val', 'images', img_file))
+                targets.append(self.wnid_to_idx[wnid])
+        return imgs, np.array(targets, dtype=np.int64)
+
+    def load_wnid_to_classes(self):
+        """Parse ``words.txt`` into a dict mapping WordNet ID to its name string."""
+        wnid_to_classes = {}
+        with open(os.path.join(self.root, 'words.txt'), 'r') as file:
+            for line in file:
+                fields = line.split('\t')
+                # Ignore blank or malformed lines
+                if len(fields) >= 2:
+                    wnid_to_classes[fields[0]] = fields[1].strip()
+        return wnid_to_classes
+
+    def check_root(self):
+        """Return True if ``self.root`` holds the entries this loader reads."""
+        if not self.root or not os.path.isdir(self.root):
+            return False
+        # Extra entries (e.g. hidden OS files) are fine; missing ones are not
+        required = {'words.txt', 'train', 'val'}
+        return required.issubset(os.listdir(self.root))
diff --git a/tools/ensemble_al.py b/tools/ensemble_al.py
@@ -110,12 +110,12 @@ def main(cfg):
     if not os.path.exists(cfg.OUT_DIR):
         os.mkdir(cfg.OUT_DIR)
     # Create "DATASET" specific directory
-    dataset_out_dir = os.path.join(cfg.OUT_DIR, cfg.DATASET.NAME)
+    dataset_out_dir = os.path.join(cfg.OUT_DIR, cfg.DATASET.NAME, cfg.MODEL.TYPE)
     if not os.path.exists(dataset_out_dir):
-        os.mkdir(dataset_out_dir)
+        os.makedirs(dataset_out_dir)
     # Creating the experiment directory inside the dataset specific directory 
     # all logs, labeled, unlabeled, validation sets are stroed here 
-    # E.g., output/CIFAR10/{timestamp or cfg.EXP_NAME based on arguments passed}
+    # E.g., output/CIFAR10/resnet18/{timestamp or cfg.EXP_NAME based on arguments passed}
     if cfg.EXP_NAME == 'auto':
         now = datetime.now()
         exp_dir = f'{now.year}_{now.month}_{now.day}_{now.hour}{now.minute}{now.second}'
@@ -246,7 +246,10 @@ def main(cfg):
         save_plot_values([plot_episode_xvalues, plot_episode_yvalues], \
             ["plot_episode_xvalues", "plot_episode_yvalues"], out_dir=cfg.EXP_DIR, saveInTextFormat=True)
 
-        
+
+        # No need to perform active sampling in the last episode iteration
+        if cur_episode == cfg.ACTIVE_LEARNING.MAX_ITER:
+            break
 
         # Active Sample 
         print("======== ENSEMBLE ACTIVE SAMPLING ========\n")
diff --git a/tools/train_al.py b/tools/train_al.py
@@ -113,7 +113,7 @@ def main(cfg):
     # Create "DATASET" specific directory
     dataset_out_dir = os.path.join(cfg.OUT_DIR, cfg.DATASET.NAME, cfg.MODEL.TYPE)
     if not os.path.exists(dataset_out_dir):
-        os.mkdir(dataset_out_dir)
+        os.makedirs(dataset_out_dir)
     # Creating the experiment directory inside the dataset specific directory 
     # all logs, labeled, unlabeled, validation sets are stroed here 
     # E.g., output/CIFAR10/resnet18/{timestamp or cfg.EXP_NAME based on arguments passed}