In [1]:
import matplotlib.pyplot as plt
import numpy as np
import torch
import torchvision.transforms as TF
from collections import Counter
from tqdm import tqdm 
from torchvision import datasets, models, transforms
from typing import Any, Callable, Dict, List, Optional, Union, Tuple
import os
import sys
import copy
from torch.utils.data import Dataset
import random
sys.path.append('../src/')

import tforms
import utils
sys.path.append('../src/novelty_dfm_CL/')

from dset8_specific_scripts.eightdset_wrappers.aircraft import MY_AIRCRAFT
from dset8_specific_scripts.eightdset_wrappers.birds import MY_BIRDS
from dset8_specific_scripts.eightdset_wrappers.cars import MY_CARS
from dset8_specific_scripts.eightdset_wrappers.voc import MY_VOC
from dset8_specific_scripts.eightdset_wrappers.char import MY_CHAR
from dset8_specific_scripts.eightdset_wrappers.flowers import MY_FLOWERS
from dset8_specific_scripts.eightdset_wrappers.scenes import MY_SCENES
from dset8_specific_scripts.eightdset_wrappers.svhn import MY_SVHN


## Transform to resize images 
class Resize_tf():
    """Resize + center-crop pre-processing (originally written for inaturalist training).

    Args:
        size (int, optional): Value handed to both ``TF.Resize`` and ``TF.CenterCrop``.
            Defaults to 299, the value that used to be hard-coded, so existing
            ``Resize_tf()`` calls behave exactly as before.
    """
    # TODO maybe the transform to PIL image is messing up the normalization!
    def __init__(self, size=299):
        # Kept on the instance so the chosen size can be inspected later.
        self.size = size
        self.tf = TF.Compose([TF.Resize(size), TF.CenterCrop(size),
                            #   inaturalist_normalize(),
                              ])

    def __call__(self, img):
        """Run ``img`` through the composed resize + center-crop pipeline."""
        return self.tf(img)
    
transform = Resize_tf()







# Prepare Dataset for SHELL

### 1) Keep half of the classes of each dataset
### 2) Resize to 299 (and possibly also to 124 or 128 — TODO: ask Laurent)
### 3) Save the results to a tmpig21 folder so that everyone can access them



In [7]:


def filter_classes(all_labels_dset, percent_classes_keep=0.5):
    """Randomly keep a fraction of the classes and return the indices of their samples.

    Args:
        all_labels_dset (array-like): Class label of every sample (1-D).
        percent_classes_keep (float, optional): Fraction of the distinct classes to
            keep. The number kept is ``round(percent_classes_keep * num_classes)``.

    Returns:
        np.ndarray: Integer positions into ``all_labels_dset`` of every sample whose
        label belongs to a kept class. Positions are grouped class by class, in the
        order the classes were drawn. Empty (integer dtype) when no class is kept.

    Note:
        Uses the global ``np.random`` state; seed it beforehand for a reproducible draw.
    """
    all_labels_dset = np.asarray(all_labels_dset)

    classes = np.unique(all_labels_dset)
    num_classes = classes.shape[0]
    num_keep = int(np.round(percent_classes_keep*num_classes))

    # Draw from the actual label values, not from range(num_classes): labels that are
    # not exactly 0..C-1 (1-based, sparse ids, ...) would otherwise never match.
    # For 0..C-1 labels this consumes the RNG the same way as permuting num_classes.
    classes_keep = np.random.permutation(classes)[:num_keep]

    indices_keep = []
    for c in classes_keep:
        indices_keep.extend(np.where(all_labels_dset==c)[0].tolist())

    # Explicit dtype so that an empty selection is still usable as an index array.
    return np.array(indices_keep, dtype=int)
        




def Shell_8dset_prepare_test(dataroot, outdir, experiment_name, percent_classes_keep=0.5):
    """Write the test-partition task files of the 8dset, keeping a random fraction of classes.

    For each of the eight datasets the 'val' split is loaded, ``filter_classes`` selects
    the samples of a random subset of classes, and the selection is written out through
    ``saveTasks_to_txt`` under the 'test' partition.

    Args:
        dataroot (str): Root directory holding one sub-directory per dataset.
        outdir (str): Root directory under which the task files are written.
        experiment_name (str): Sub-directory name for this experiment.
        percent_classes_keep (float, optional): Fraction of classes kept per dataset.

    Returns:
        list: File paths returned by ``saveTasks_to_txt``, one per dataset.

    NOTE(review): the classes are drawn here independently of any draw made for the
    train/validation split, so nothing in this function makes the two selections agree.
    NOTE(review): the SVHN location is hard-coded and does not depend on ``dataroot``.
    """
    list_tasks = ['flowers', 'scenes', 'birds', 'cars', 'aircraft', 'voc', 'chars', 'svhn']
    individual_wrappers = {
        'flowers': MY_FLOWERS, 'aircraft': MY_AIRCRAFT, 'birds': MY_BIRDS,
        'voc': MY_VOC, 'cars': MY_CARS, 'svhn': MY_SVHN, 'chars': MY_CHAR, 'scenes': MY_SCENES,
    }

    # Label files of the first seven datasets; SVHN has no label file, only a placeholder.
    label_sources = ['%s/%s/labels/val.txt'%(dataroot,d) for d in list_tasks[:-1]]
    label_sources.append('test')

    # --- Load the evaluation split of every dataset
    dsets = []
    for dname, txt_path in zip(list_tasks, label_sources):
        if dname == 'svhn':
            dsets.append(MY_SVHN(img_path='/lab/arios/ProjIntel/incDFM/data/svhn/',
                                 split='val'))
            continue
        wrapper_cls = individual_wrappers[dname]
        dsets.append(wrapper_cls(img_path='%s/%s'%(dataroot, dname),
                                 txt_path=txt_path,
                                 dataset='val'))

    # --- Keep only the samples of the randomly selected classes, dataset by dataset
    tasks_list_test = []
    for dset in dsets:
        labels = np.array(dset.img_label)
        print('original number', labels.shape[0])

        kept_indices = filter_classes(labels, percent_classes_keep=percent_classes_keep)
        tasks_list_test.append(kept_indices)
        print(kept_indices.shape)

    # --- Save the selections to disk
    return saveTasks_to_txt(tasks_list_test, label_sources, '8dset', 'test', outdir, experiment_name, scenario='nc')
        


def Shell_8dset_prepare_train_val_split(dataroot, outdir, experiment_name,\
    percent_classes_keep=0.5, validation_percent=0.2):
    """Write train and validation task files of the 8dset, keeping a random fraction of classes.

    For each of the eight datasets the 'train' split is loaded, ``filter_classes`` selects
    the samples of a random subset of classes, and those samples are shuffled and divided
    into a validation part (the first ``floor(validation_percent * n_kept)`` samples) and
    a train part (the remainder). Both parts are written through ``saveTasks_to_txt``.

    Args:
        dataroot (str): Root directory holding one sub-directory per dataset.
        outdir (str): Root directory under which the task files are written.
        experiment_name (str): Sub-directory name for this experiment.
        percent_classes_keep (float, optional): Fraction of classes kept per dataset.
        validation_percent (float, optional): Fraction of the kept samples that goes to
            the validation partition.

    Returns:
        tuple: ``(task_filepaths_train, task_filepaths_val)``, each a list with one file
        path per dataset.

    NOTE(review): the SVHN location is hard-coded and does not depend on ``dataroot``.
    """
    list_tasks = ['flowers', 'scenes', 'birds', 'cars', 'aircraft', 'voc', 'chars', 'svhn']
    individual_wrappers = {
        'flowers': MY_FLOWERS, 'aircraft': MY_AIRCRAFT, 'birds': MY_BIRDS,
        'voc': MY_VOC, 'cars': MY_CARS, 'svhn': MY_SVHN, 'chars': MY_CHAR, 'scenes': MY_SCENES,
    }

    # Label files of the first seven datasets; SVHN has no label file, only a placeholder.
    tasklists_train = ['%s/%s/labels/train.txt'%(dataroot,d) for d in list_tasks[:-1]]
    tasklists_train.append('train')

    # --- Load the train split of every dataset
    dsets = []
    for dname, txt_path in zip(list_tasks, tasklists_train):
        if dname == 'svhn':
            dsets.append(MY_SVHN(img_path='/lab/arios/ProjIntel/incDFM/data/svhn/',
                                 split='train'))
            continue
        wrapper_cls = individual_wrappers[dname]
        dsets.append(wrapper_cls(img_path='%s/%s'%(dataroot, dname),
                                 txt_path=txt_path,
                                 dataset='train'))

    # --- Set up Task sequences 8dset
    tasks_list_train = []
    tasks_list_val = []
    for dset in dsets:
        labels = np.array(dset.img_label)
        print('original number', labels.shape[0])

        kept_indices = filter_classes(labels, percent_classes_keep=percent_classes_keep)
        shuffled = np.random.permutation(kept_indices)

        # The head of the shuffled selection is validation, the tail is train.
        n_val = int(np.floor(validation_percent*kept_indices.shape[0]))
        tasks_list_val.append(shuffled[:n_val])
        tasks_list_train.append(shuffled[n_val:])

        print(tasks_list_train[-1].shape, tasks_list_val[-1].shape)

    # --- save sequences to text files
    task_filepaths_train = saveTasks_to_txt(tasks_list_train, tasklists_train, '8dset', 'train', outdir, experiment_name, scenario='nc')
    task_filepaths_val = saveTasks_to_txt(tasks_list_val, tasklists_train, '8dset', 'validation', outdir, experiment_name, scenario='nc')

    return task_filepaths_train, task_filepaths_val
        




def saveTasks_to_txt(tasklists_subset_indices, tasklists_orig, dset_name, partition, outdir, experiment_name, scenario='nc'):
    """Persist the selected sample indices of every task under the experiment directory.

    Files go to ``<outdir>/<dset_name>/<experiment_name>/<partition>/``. Tasks 0-6 are
    written as ``.txt`` files holding the selected lines of the task's original label
    file; any task with index 7 or above has its index array saved as ``.npy``.

    Args:
        tasklists_subset_indices (sequence): Per-task arrays of selected sample indices.
        tasklists_orig (sequence): Per-task original label-file paths. Entries at
            index 7 and above are not read.
        dset_name (str): Name of the dataset collection (directory component).
        partition (str): Partition name (directory and file-name component).
        outdir (str): Root output directory.
        experiment_name (str): Experiment directory name.
        scenario (str, optional): Prefix used in the generated file names.

    Returns:
        list: Paths of the files written, in task order.
    """
    # --- Create directory for experiment, then the partition directory inside it
    experiment_dir = '%s/%s/%s'%(outdir, dset_name, experiment_name)
    utils.makedirectory(experiment_dir)
    dest_dir = experiment_dir + '/%s'%(partition)
    utils.makedirectory(dest_dir)

    # --- save sequences to text files
    task_filepaths = []
    for task, orig_txt in enumerate(tasklists_orig):
        subset_indices = tasklists_subset_indices[task]
        if task >= 7:
            # No label file to subset for these tasks: store the raw indices instead.
            out_path = "%s/%s_%s_task_%d.npy"%(dest_dir, scenario, partition, task)
            task_filepaths.append(out_path)
            np.save(out_path, subset_indices)
        else:
            out_path = "%s/%s_%s_task_%d.txt"%(dest_dir, scenario, partition, task)
            task_filepaths.append(out_path)
            write_to_file_subset_8dset(orig_txt, subset_indices, out_path)

    return task_filepaths



def write_to_file_subset_8dset(original_txt, subset_indices, new_txt):
    """Copy the selected lines of ``original_txt`` into ``new_txt``.

    Args:
        original_txt (str): Path of the text file to read.
        subset_indices (array-like of int): 0-based line numbers to keep.
        new_txt (str): Path of the file to write (overwritten if it exists).

    Lines are written in the order they appear in ``original_txt``; the order of
    ``subset_indices`` has no effect on the output.
    """
    # Build the lookup once: `i in ndarray` rescans the whole array for every line,
    # which makes the copy quadratic in the size of the file.
    wanted = set(np.asarray(subset_indices).ravel().tolist())

    with open(original_txt, "r") as file_input, open(new_txt, "w") as output:
        for i, line in enumerate(file_input):
            if i in wanted:
                output.write(line)
        


In [6]:
# Build the train / validation task files of the half-classes 8dset experiment.
# NOTE(review): no np.random seed is set in the visible cells before this call, so the
# kept classes and the train/validation split are expected to change on every run.
dataroot = '../data/8dset/'
outdir = '/lab/tmpig21/u/arios/8dset_shell/shell_experiments/'
experiment_name='Shell_HalfClasses_8dset'


# Keep 50% of the classes of each dataset; 20% of the kept samples go to validation.
task_filepaths_train, tasks_filepaths_val = Shell_8dset_prepare_train_val_split(dataroot, outdir, experiment_name,\
    percent_classes_keep=0.5, validation_percent=0.2)

original number 6149
(2506,) (626,)
original number 5359
(2169,) (542,)
original number 5994
(2399,) (599,)
original number 8144
(3292,) (823,)
original number 3334
(1256,) (314,)
original number 1683
(773,) (193,)
original number 4623
(1703,) (425,)
original number 73257
(23619,) (5904,)


In [4]:
# Build the test task files of the half-classes 8dset experiment (same output root and
# experiment name as the train / validation cell).
# NOTE(review): the classes kept here come from a separate random draw; confirm that
# they match the classes kept for the train / validation partitions.
dataroot = '../data/8dset/'
outdir = '/lab/tmpig21/u/arios/8dset_shell/shell_experiments/'
experiment_name='Shell_HalfClasses_8dset'


# Keep 50% of the classes of each dataset.
task_filepaths = Shell_8dset_prepare_test(dataroot, outdir, experiment_name, percent_classes_keep=0.5)

original number 2040
(1020,)
original number 1340
(687,)
original number 5794
(2872,)
original number 8041
(4004,)
original number 3333
(1404,)
original number 1651
(872,)
original number 1541
(839,)
original number 26032
(10265,)


In [None]:

def eightdset_Experiments_w_holdout_w_validation_trainset(dataroot, outdir, experiment_name, dset_name='8dset',\
    holdout_percent=0.2, validation_percent=0.1):
    """Split the train data of every 8dset dataset into train / holdout / validation files.

    All classes are kept. For each dataset the sample indices are shuffled; the first
    ``floor(validation_percent * n)`` go to validation, the next
    ``floor(holdout_percent * n)`` go to holdout, and the rest stay in train, where
    ``n`` is the total number of samples of that dataset.

    Args:
        dataroot (str): Root directory holding one sub-directory per dataset.
        outdir (str): Root directory under which the task files are written.
        experiment_name (str): Sub-directory name for this experiment.
        dset_name (str, optional): Name of the dataset collection (directory component).
        holdout_percent (float, optional): Fraction of all samples set aside as holdout.
        validation_percent (float, optional): Fraction of all samples used for validation.

    Returns:
        tuple: ``(task_filepaths_train, task_filepaths_train_holdout, task_filepaths_val)``,
        each a list with one file path per dataset.

    NOTE(review): the SVHN location is hard-coded and does not depend on ``dataroot``.
    """
    list_tasks = ['flowers', 'scenes', 'birds', 'cars', 'aircraft', 'voc', 'chars', 'svhn']
    individual_wrappers = {
        'flowers': MY_FLOWERS, 'aircraft': MY_AIRCRAFT, 'birds': MY_BIRDS,
        'voc': MY_VOC, 'cars': MY_CARS, 'svhn': MY_SVHN, 'chars': MY_CHAR, 'scenes': MY_SCENES,
    }

    # Label files of the first seven datasets; SVHN has no label file, only a placeholder.
    tasklists_train = ['%s/%s/labels/train.txt'%(dataroot,d) for d in list_tasks[:-1]]
    tasklists_train.append('train')

    # --- Load the train split of every dataset
    dsets = []
    for dname, txt_path in zip(list_tasks, tasklists_train):
        if dname == 'svhn':
            dsets.append(MY_SVHN(img_path='/lab/arios/ProjIntel/incDFM/data/svhn/',
                                 split='train'))
            continue
        wrapper_cls = individual_wrappers[dname]
        dsets.append(wrapper_cls(img_path='%s/%s'%(dataroot, dname),
                                 txt_path=txt_path,
                                 dataset='train'))

    # --- Set up Task sequences 8dset
    tasks_list_train = []
    tasks_list_val = []
    tasks_list_holdout = []
    for dset, task in zip(dsets, list_tasks):
        labels = np.array(dset.img_label)
        num_samples_task = labels.shape[0]
        print('Dset %s - Num data %d Num labels %d'%(task, num_samples_task, np.unique(labels).shape[0]))
        print('num_samples_task', num_samples_task)

        inds_shuf = np.random.permutation(np.arange(num_samples_task))

        # Both sizes are fractions of the full dataset, not of what is left over.
        n_val = int(np.floor(validation_percent*num_samples_task))
        n_holdout = int(np.floor(holdout_percent*num_samples_task))

        # Layout of the shuffled indices: [ validation | holdout | train ]
        after_val = inds_shuf[n_val:]
        tasks_list_val.append(inds_shuf[:n_val])
        tasks_list_train.append(after_val[n_holdout:])
        tasks_list_holdout.append(after_val[:n_holdout])

        print(tasks_list_train[-1].shape, tasks_list_val[-1].shape, tasks_list_holdout[-1].shape)

    # --- save sequences to text files
    task_filepaths_train = saveTasks_to_txt(tasks_list_train, tasklists_train, dset_name, 'train', outdir, experiment_name, scenario='nc')
    task_filepaths_train_holdout = saveTasks_to_txt(tasks_list_holdout, tasklists_train, dset_name, 'holdout', outdir, experiment_name, scenario='nc')
    task_filepaths_val = saveTasks_to_txt(tasks_list_val, tasklists_train, dset_name, 'validation', outdir, experiment_name, scenario='nc')

    return task_filepaths_train, task_filepaths_train_holdout, task_filepaths_val
        




class Shell8dset_Wrapper():
    """Uniform view over one of the eight datasets that also reports the task label.

    Items are ``(image, class_label, task_label, task_label)``, with the index into the
    underlying dataset appended when ``returnIDX`` is true. The task label is the
    position of the dataset in the fixed task order (flowers=0 ... svhn=7).
    """

    def __init__(self, dataroot, dset_name, split='train', tasklist='task_indices.txt', transform=None, \
        returnIDX=False):
        '''
        dataroot - for 8dset /lab/arios/ProjIntel/incDFM/data/8dset
        dset_name - among the 8 datasets (flowers, aircraft, birds, cars, chars, scenes, voc, svhn)
        split - only forwarded to the svhn wrapper
        tasklist - file with the samples of the task, forwarded to the underlying wrapper
        transform - forwarded to the underlying wrapper as data_transforms
        returnIDX - if True, __getitem__ also returns the underlying sample index
        '''
        self.transform = transform
        self.dset_name = dset_name
        self.returnIDX = returnIDX

        self.individual_wrappers = {
            'flowers': MY_FLOWERS, 'aircraft': MY_AIRCRAFT, 'birds': MY_BIRDS,
            'voc': MY_VOC, 'cars': MY_CARS, 'svhn': MY_SVHN, 'chars': MY_CHAR, 'scenes': MY_SCENES,
        }
        self.order_tasks = {'flowers':0, 'scenes':1, 'birds':2, 'cars':3, 'aircraft':4, 'voc':5,  'chars':6, 'svhn':7}
        self.task_lb = self.order_tasks[self.dset_name]

        wrapper_cls = self.individual_wrappers[dset_name]
        dset_dir = '%s/%s'%(dataroot, dset_name)
        if dset_name == 'svhn':
            # NOTE(review): the MY_SVHN class defined further down in this notebook takes
            # `indicespath`, not `tasklist` — confirm which MY_SVHN is meant to be used here.
            self.intern_dset = wrapper_cls(dset_dir,
                                           tasklist=tasklist,
                                           data_transforms=transform,
                                           split=split)
        else:
            self.intern_dset = wrapper_cls(img_path=dset_dir,
                                           txt_path=tasklist,
                                           data_transforms=transform)

        # Full index range of the underlying dataset, and the currently active selection.
        self.indices_task_init = np.arange(len(self.intern_dset))
        self.indices_task = self.indices_task_init.copy()

    def __len__(self):
        """Number of samples in the currently active selection."""
        return self.indices_task.shape[0]

    def select_random_subset(self, random_num):
        """Restrict the active selection to ``random_num`` randomly drawn samples."""
        shuffled_positions = np.random.permutation(np.arange(self.indices_task_init.shape[0]))
        self.indices_task = self.indices_task_init[shuffled_positions[:random_num]]

    def select_specific_subset(self, indices_select):
        """Restrict the active selection to the given positions of the full index range."""
        self.indices_task = self.indices_task_init[indices_select]

    def __getitem__(self, idx):
        """Return the sample at position ``idx`` of the active selection."""
        dset_idx = self.indices_task[idx]
        im, class_lbl = self.intern_dset[dset_idx]

        sample = (im, class_lbl, self.task_lb, self.task_lb)
        if not self.returnIDX:
            return sample
        return sample + (dset_idx,)



In [6]:
        
## for svhn save list with right indices 

class MY_SVHN(Dataset):
    """SVHN split held in memory, optionally restricted to a saved list of sample indices.

    NOTE(review): this definition shadows the ``MY_SVHN`` imported from
    ``dset8_specific_scripts.eightdset_wrappers.svhn`` at the top of the notebook.

    Args:
        img_path (str): Directory containing the SVHN ``.mat`` files (handed to ``load_SVHN``).
        indicespath (str, optional): Path of a ``.npy`` array of sample indices to keep,
            e.g. the indices of half of the SVHN classes (5 classes in total).
            When None, every loaded sample is kept.
        data_transforms (callable, optional): Applied to each image in ``__getitem__``.
        target_transform (callable, optional): Applied to each label in ``__getitem__``.
        split (str, optional): 'train' or 'val'. 'val' makes ``load_SVHN`` read the
            test file.
        balance (bool, optional): Forwarded to ``load_SVHN``.

    Raises:
        KeyError: If ``split`` is neither 'train' nor 'val'.
    """
    def __init__(self, img_path, indicespath=None, data_transforms=None, target_transform=None, split='train', balance=False):

        self.train_map = {'train':True, 'val':False}

        self.balance=balance

        self.max_labels=10

        self.images, self.img_label = load_SVHN(img_path, self.train_map[split], balance=self.balance)

        # Restrict to the saved subset of samples, if one was given.
        if indicespath is not None:
            tasklist = np.load(indicespath)
            self.images = self.images[tasklist,...]
            self.img_label = self.img_label[tasklist]

        # Positions of the retained samples (0..len-1 after any subsetting).
        self.img_name = np.arange(self.img_label.shape[0])

        self.data_transforms = data_transforms
        self.target_transform=target_transform


    def __len__(self):
        """Number of retained samples."""
        return self.img_name.shape[0]

    def __getitem__(self, item):
        """Return ``(image, label)`` for position ``item``.

        A failing image transform is reported and the untransformed image is returned
        (best effort). Only ``Exception`` subclasses are caught, so Ctrl-C and
        interpreter exit are no longer swallowed.
        """
        idx = self.img_name[item]

        img = self.images[idx,:]
        label = self.img_label[idx]

        if self.data_transforms is not None:
            try:
                img = self.data_transforms(img)
            except Exception as exc:
                # Keep the best-effort behaviour, but say which sample failed and why.
                print("Cannot transform image (index %d): %r" % (idx, exc))

        if self.target_transform is not None:
            label = self.target_transform(label)

        return img, label



def load_SVHN(root, train=True, balance=True):
    """Load one SVHN split from its ``.mat`` file as torch tensors.

    Args:
        root (str): Directory containing ``train_32x32.mat`` / ``test_32x32.mat``
            (``~`` is expanded).
        train (bool, optional): Read the train file if true, otherwise the test file.
        balance (bool, optional): If true, keep at most a fixed number of samples per
            class (4948 for train, 1595 for test) and shuffle the result. The selection
            is drawn from a private generator with a fixed seed, so it is identical on
            every call and leaves the global ``np.random`` state untouched.

    Returns:
        tuple: ``(data, labels)`` where ``data`` is a ``torch.FloatTensor`` with the
        sample axis first and ``labels`` is a ``torch.LongTensor`` with values 0..9.
    """
    import scipy.io as sio

    root = os.path.expanduser(root)
    filename = "train_32x32.mat" if train else "test_32x32.mat"

    # reading(loading) mat file as array
    loaded_mat = sio.loadmat(os.path.join(root, filename))

    data = loaded_mat['X']
    # loading from the .mat file gives an np array of type np.uint8
    # converting to np.int64, so that we have a LongTensor after
    # the conversion from the numpy array
    # the squeeze is needed to obtain a 1D tensor
    labels = loaded_mat['y'].astype(np.int64).squeeze()

    # the svhn dataset assigns the class label "10" to the digit 0
    # this makes it inconsistent with several loss functions
    # which expect the class labels to be in the range [0, C-1]
    np.place(labels, labels == 10, 0)

    # Move the last axis to the front: (a0, a1, a2, a3) -> (a3, a2, a0, a1).
    # Presumably X is stored as (H, W, C, N), which yields (N, C, H, W) — TODO confirm.
    data = np.transpose(data, (3, 2, 0, 1))

    if balance:
        # Per-class cap on the number of samples kept (different for train and test).
        max_ind_b = 4948 if train else 1595

        # The original seeded Python's `random` module but shuffled with np.random, so
        # the seed had no effect. A private, seeded generator makes the balanced subset
        # reproducible without touching any global RNG state.
        rng = np.random.RandomState(999)

        inds_b = []
        for i in range(10):
            arr = np.where(labels==i)[0]
            rng.shuffle(arr)
            inds_b.extend(arr[:max_ind_b].tolist())

        inds_b = np.array(inds_b, dtype=int)

        # Mix the classes, which were appended one after the other.
        rng.shuffle(inds_b)

        data = data[inds_b,...]
        labels = labels[inds_b]

    data = torch.from_numpy(data).type(torch.FloatTensor)
    labels = torch.from_numpy(labels).type(torch.LongTensor)

    return data, labels