In [2]:
import matplotlib.pyplot as plt
import numpy as np
import torch
import torchvision.transforms as TF
from collections import Counter
from tqdm import tqdm 
from torchvision import datasets, models, transforms
from typing import Any, Callable, Dict, List, Optional, Union, Tuple
import os
import sys

sys.path.append('../src/')

import tforms
import utils
sys.path.append('../src/novelty_dfm_CL/')

from dset8_specific_scripts.eightdset_wrappers.aircraft import MY_AIRCRAFT
from dset8_specific_scripts.eightdset_wrappers.birds import MY_BIRDS
from dset8_specific_scripts.eightdset_wrappers.cars import MY_CARS
from dset8_specific_scripts.eightdset_wrappers.voc import MY_VOC
from dset8_specific_scripts.eightdset_wrappers.char import MY_CHAR
from dset8_specific_scripts.eightdset_wrappers.flowers import MY_FLOWERS
from dset8_specific_scripts.eightdset_wrappers.scenes import MY_SCENES
from dset8_specific_scripts.eightdset_wrappers.svhn import MY_SVHN

    
class eightdset_normalize():
    """Callable that applies a fixed per-channel ``TF.Normalize`` to an image.

    The mean and standard deviation are three-value tuples set once in
    ``__init__``; calling the instance forwards the image to that transform.
    """

    def __init__(self):
        channel_mean = (0.485, 0.456, 0.406)
        channel_std = (0.229, 0.224, 0.225)
        self.tf = TF.Normalize(channel_mean, channel_std)

    def __call__(self, img):
        """Return ``img`` after passing it through the stored normalization."""
        normalized = self.tf(img)
        return normalized


class eightdset_train():
    """Training-time image pre-processing used for the 8-dataset tasks.

    Composes, in order: ``TF.Resize(256)``, ``TF.CenterCrop(224)``,
    ``TF.RandomHorizontalFlip()`` and ``TF.ToTensor()``. No normalization
    step is part of the pipeline.
    """
    # TODO: check whether the PIL-image conversion interferes with normalization.
    def __init__(self):
        pipeline_steps = [
            TF.Resize(256),
            TF.CenterCrop(224),
            TF.RandomHorizontalFlip(),
            TF.ToTensor(),
        ]
        self.tf = TF.Compose(pipeline_steps)

    def __call__(self, img):
        """Run ``img`` through the composed pipeline and return the result."""
        processed = self.tf(img)
        return processed
    
    
# Root folder of the eight datasets, relative to the notebook location.
data_dir = '../data/8dset/'

# Pre-processing passed to the seven label-file-backed datasets below.
transform = eightdset_train()
# SVHN uses its own pre-processing from the project's `tforms` module.
transform_svhn = tforms.svhn_train()



In [3]:

shuffle_loader = True
batchsize = 100


def _make_txt_trainset(wrapper, folder, partition='train'):
    """Instantiate one label-file-backed dataset stored under ``data_dir/<folder>``.

    Args:
        wrapper: dataset wrapper class (one of the ``MY_*`` classes).
        folder (str): sub-folder of ``data_dir`` holding images and ``labels/train.txt``.
        partition (str): value forwarded as the wrapper's ``dataset`` argument.
    """
    return wrapper(img_path=data_dir + '/' + folder,
                   txt_path=(data_dir + '/' + folder + '/labels/train.txt'),
                   data_transforms=transform,
                   dataset=partition)


def _make_train_loader(dataset):
    """Wrap ``dataset`` in a DataLoader using this cell's batch size and shuffle flag."""
    return torch.utils.data.DataLoader(dataset, batch_size=batchsize,
                                       shuffle=shuffle_loader, num_workers=2)


# One dataset per task, in task order.
# NOTE(review): flowers is the only task built with dataset='val' while still
# reading labels/train.txt; every other task uses 'train'. Kept as in the
# original cell - confirm whether this is intentional.
trainset_task1 = _make_txt_trainset(MY_FLOWERS, 'flowers', partition='val')
trainset_task2 = _make_txt_trainset(MY_SCENES, 'scenes')
trainset_task3 = _make_txt_trainset(MY_BIRDS, 'birds')
trainset_task4 = _make_txt_trainset(MY_CARS, 'cars')
trainset_task5 = _make_txt_trainset(MY_AIRCRAFT, 'aircraft')
trainset_task6 = _make_txt_trainset(MY_VOC, 'voc')
trainset_task7 = _make_txt_trainset(MY_CHAR, 'chars')
# SVHN takes no label file and uses its own transform and `split` argument.
trainset_task8 = MY_SVHN(img_path=data_dir + '/svhn',
                         data_transforms=transform_svhn,
                         split='train')

# NOTE(review): this list reuses the name `datasets`, which hides the
# `datasets` module imported from torchvision in the first cell. The name is
# kept so that code relying on it keeps working.
datasets = [trainset_task1,
            trainset_task2,
            trainset_task3,
            trainset_task4,
            trainset_task5,
            trainset_task6,
            trainset_task7,
            trainset_task8]

loaders = [_make_train_loader(task_dataset) for task_dataset in datasets]

# Keep the individual per-task loader names available.
(train_loader_task1,
 train_loader_task2,
 train_loader_task3,
 train_loader_task4,
 train_loader_task5,
 train_loader_task6,
 train_loader_task7,
 train_loader_task8) = loaders

# Class counts listed in the same task order as `datasets`.
num_classes = [102, 67, 200, 196, 70, 10, 62, 10]

In [3]:


def eightdset_Experiments_w_holdout_w_validation_trainset(dataroot, outdir, experiment_name, dset_name='8dset',\
    holdout_percent=0.2, validation_percent=0.1, seed=None, svhn_path=None):
    """Split every 8-dataset training set into train / validation / holdout parts.

    For each task the sample indices are shuffled; the first
    ``floor(validation_percent * N)`` go to validation, the next
    ``floor(holdout_percent * N)`` go to holdout and the rest stay in train,
    where ``N`` is the full size of that task's training set. The three index
    sets are then written to disk with ``saveTasks_to_txt``.

    Args:
        dataroot (string): Root directory containing one sub-folder per dataset.
        outdir (string): Directory under which the split files are written.
        experiment_name (string): Sub-folder name for this set of split files.
        dset_name (string, optional): Folder name placed between ``outdir`` and
            ``experiment_name``.
        holdout_percent (float, optional): Fraction of each training set kept aside as holdout.
        validation_percent (float, optional): Fraction of each training set used for validation.
        seed (int, optional): Seed for the shuffle. ``None`` (default) draws
            from NumPy's global random state, as before.
        svhn_path (string, optional): Location of the SVHN data. ``None``
            (default) uses ``<dataroot>/svhn``, the same layout ``eightdsetTask``
            reads from; pass a path explicitly if SVHN lives elsewhere.

    Returns:
        tuple: ``(train_files, holdout_files, validation_files)``, each the
        list of paths returned by ``saveTasks_to_txt``.

    Raises:
        ValueError: if a fraction is negative or the two fractions sum to more than 1.
    """
    if holdout_percent < 0 or validation_percent < 0 \
            or holdout_percent + validation_percent > 1:
        raise ValueError(
            'holdout_percent and validation_percent must be non-negative and sum to at most 1 '
            '(got %s and %s)' % (holdout_percent, validation_percent))

    list_tasks = ['flowers', 'scenes', 'birds', 'cars', 'aircraft', 'voc', 'chars', 'svhn']
    individual_wrappers={'flowers':MY_FLOWERS, 'aircraft':MY_AIRCRAFT, 'birds': MY_BIRDS, \
            'voc':MY_VOC, 'cars': MY_CARS, 'svhn': MY_SVHN, 'chars':MY_CHAR, 'scenes':MY_SCENES}

    # The last entry is a placeholder: svhn has no label txt file.
    tasklists_train = ['%s/%s/labels/train.txt'%(dataroot,d) for d in list_tasks[:-1]]+['train']

    if svhn_path is None:
        # Derive the location from dataroot instead of a machine-specific absolute path.
        svhn_path = '%s/%s'%(dataroot, 'svhn')

    # A dedicated generator keeps seeded runs independent of the global state.
    rng = np.random if seed is None else np.random.RandomState(seed)

    dsets = []
    for dname, txt_path in zip(list_tasks, tasklists_train):
        if dname =='svhn':
            dataset = MY_SVHN(img_path=svhn_path,
                                    split='train')
        else:
            dataset = individual_wrappers[dname](img_path='%s/%s'%(dataroot, dname),
                                            txt_path=txt_path,
                                            dataset='train')
        dsets.append(dataset)

    # --- Set up Task sequences 8dset
    tasks_list_train=[]
    tasks_list_val=[]
    tasks_list_holdout=[]
    for task, dataset in zip(list_tasks, dsets):

        labels = np.array(dataset.img_label)
        print('Dset %s - Num data %d Num labels %d'%(task, labels.shape[0], np.unique(labels).shape[0]))
        num_samples_task = labels.shape[0]

        print('num_samples_task', num_samples_task)

        inds_shuf = rng.permutation(np.arange(num_samples_task))

        # Validation is taken from the front of the shuffled indices.
        split_val = int(np.floor(validation_percent*num_samples_task))
        tasks_list_val.append(inds_shuf[:split_val])
        inds_train = inds_shuf[split_val:]

        # Holdout size is a fraction of the full task size, not of the remainder.
        split_holdout = int(np.floor(holdout_percent*num_samples_task))
        tasks_list_train.append(inds_train[split_holdout:])
        tasks_list_holdout.append(inds_train[:split_holdout])

        print(tasks_list_train[-1].shape, tasks_list_val[-1].shape, tasks_list_holdout[-1].shape)

    # --- save sequences to text files
    task_filepaths_train = saveTasks_to_txt(tasks_list_train, tasklists_train, dset_name, 'train', outdir, experiment_name, scenario='nc')
    task_filepaths_train_holdout = saveTasks_to_txt(tasks_list_holdout, tasklists_train, dset_name, 'holdout', outdir, experiment_name, scenario='nc')
    task_filepaths_val = saveTasks_to_txt(tasks_list_val, tasklists_train, dset_name, 'validation', outdir, experiment_name, scenario='nc')

    return task_filepaths_train, task_filepaths_train_holdout, task_filepaths_val
        




def saveTasks_to_txt(tasklists_subset_indices, tasklists_orig, dset_name, partition, outdir, experiment_name, scenario='nc'):
    """Write one index-subset file per task and return the list of written paths.

    Files go to ``<outdir>/<dset_name>/<experiment_name>/<partition>/``. Tasks
    with position below 7 are written as text by copying the selected lines of
    their original label file; tasks from position 7 on are saved as ``.npy``
    arrays holding the indices themselves.

    Args:
        tasklists_subset_indices: per-task collections of selected sample indices.
        tasklists_orig: per-task original label-file paths (only read for the text tasks).
        dset_name (str): folder name placed under ``outdir``.
        partition (str): sub-folder and file-name tag for this split.
        outdir (str): base output directory.
        experiment_name (str): folder name placed under ``dset_name``.
        scenario (str): prefix used in the generated file names.
    """
    # Create the experiment folder first, then the partition folder inside it.
    experiment_root = '%s/%s/%s'%(outdir, dset_name, experiment_name)
    utils.makedirectory(experiment_root)
    dest_dir = experiment_root + '/%s'%(partition)
    utils.makedirectory(dest_dir)

    task_filepaths = []
    for task, source_txt in enumerate(tasklists_orig):
        subset_indices = tasklists_subset_indices[task]

        if task >= 7:
            # No label txt exists for these tasks: store the raw indices.
            out_path = "%s/%s_%s_task_%d.npy"%(dest_dir, scenario, partition, task)
            task_filepaths.append(out_path)
            np.save(out_path, subset_indices)
            continue

        out_path = "%s/%s_%s_task_%d.txt"%(dest_dir, scenario, partition, task)
        task_filepaths.append(out_path)
        write_to_file_subset_8dset(source_txt, subset_indices, out_path)

    return task_filepaths



def write_to_file_subset_8dset(original_txt, subset_indices, new_txt):
    """Copy the selected lines of ``original_txt`` into ``new_txt``.

    Lines are kept in their original file order, whatever the order of
    ``subset_indices``.

    Args:
        original_txt (str): path of the text file to read.
        subset_indices: 0-based line numbers to keep (list, set or numpy array).
        new_txt (str): path of the text file to create or overwrite.
    """
    # Build the membership set once; testing `i in <ndarray>` rescans the whole
    # array for every line of the input file.
    if isinstance(subset_indices, (set, frozenset)):
        keep = subset_indices
    else:
        keep = set(np.asarray(subset_indices).ravel().tolist())

    with open(original_txt, "r") as file_input, open(new_txt, "w") as output:
        output.writelines(line for i, line in enumerate(file_input) if i in keep)
        


In [4]:

# Absolute locations of the 8-dataset root and of the folder that receives the
# generated split files.
dataroot = '/lab/arios/ProjIntel/incDFM/data/8dset/'
experiment_dir = '/lab/arios/ProjIntel/incDFM/src/novelty_dfm_CL/Experiments_DFM_CL/'

# Generate the train / holdout / validation split files for experiment 'Test'.
(task_filepaths_train,
 task_filepaths_train_holdout,
 task_filepaths_val) = eightdset_Experiments_w_holdout_w_validation_trainset(
    dataroot,
    experiment_dir,
    'Test',
    dset_name='8dset',
    holdout_percent=0.2,
    validation_percent=0.1,
)

Dset flowers - Num data 6149 Num labels 102
num_samples_task 6149
(4306,) (614,) (1229,)
Dset scenes - Num data 5359 Num labels 67
num_samples_task 5359
(3753,) (535,) (1071,)
Dset birds - Num data 5994 Num labels 200
num_samples_task 5994
(4197,) (599,) (1198,)
Dset cars - Num data 8144 Num labels 196
num_samples_task 8144
(5702,) (814,) (1628,)
Dset aircraft - Num data 3334 Num labels 70
num_samples_task 3334
(2335,) (333,) (666,)
Dset voc - Num data 1683 Num labels 10
num_samples_task 1683
(1179,) (168,) (336,)
Dset chars - Num data 4623 Num labels 62
num_samples_task 4623
(3237,) (462,) (924,)
Dset svhn - Num data 49191 Num labels 10
num_samples_task 49191
(34434,) (4919,) (9838,)


In [5]:
# Show the per-task training split files returned by the split-generation call.
task_filepaths_train

['/lab/arios/ProjIntel/incDFM/src/novelty_dfm_CL/Experiments_DFM_CL//8dset/Test/train/nc_train_task_0.txt',
 '/lab/arios/ProjIntel/incDFM/src/novelty_dfm_CL/Experiments_DFM_CL//8dset/Test/train/nc_train_task_1.txt',
 '/lab/arios/ProjIntel/incDFM/src/novelty_dfm_CL/Experiments_DFM_CL//8dset/Test/train/nc_train_task_2.txt',
 '/lab/arios/ProjIntel/incDFM/src/novelty_dfm_CL/Experiments_DFM_CL//8dset/Test/train/nc_train_task_3.txt',
 '/lab/arios/ProjIntel/incDFM/src/novelty_dfm_CL/Experiments_DFM_CL//8dset/Test/train/nc_train_task_4.txt',
 '/lab/arios/ProjIntel/incDFM/src/novelty_dfm_CL/Experiments_DFM_CL//8dset/Test/train/nc_train_task_5.txt',
 '/lab/arios/ProjIntel/incDFM/src/novelty_dfm_CL/Experiments_DFM_CL//8dset/Test/train/nc_train_task_6.txt',
 '/lab/arios/ProjIntel/incDFM/src/novelty_dfm_CL/Experiments_DFM_CL//8dset/Test/train/nc_train_task_7.npy']

In [6]:
import copy

class eightdsetTask():
    """One task of the 8-dataset benchmark, with an optional index subset.

    Wraps the dataset-specific ``MY_*`` class and returns, for each sample,
    the image, its class label and the integer id of the task. The visible
    samples can be narrowed with ``select_random_subset`` or
    ``select_specific_subset``; both always select from the full task.
    """

    def __init__(self, dataroot, dset_name, split='train', tasklist='task_indices.txt', transform=None, \
        returnIDX=False,  preload=False):
        '''
        Args:
            dataroot: root folder with one sub-folder per dataset,
                e.g. /lab/arios/ProjIntel/incDFM/data/8dset
            dset_name: one of 'flowers', 'scenes', 'birds', 'cars', 'aircraft',
                'voc', 'chars', 'svhn' (any other value raises KeyError)
            split: forwarded to the wrapper only when dset_name is 'svhn'
            tasklist: file describing the samples of this task; passed as
                ``txt_path`` to the label-file wrappers and as ``tasklist`` to
                the svhn wrapper
            transform: forwarded to the wrapper as ``data_transforms``
            returnIDX: when True, ``__getitem__`` also returns the sample's
                index in the underlying dataset
            preload: accepted for signature compatibility; not used here
        '''
        self.transform = transform

        self.dset_name = dset_name

        self.individual_wrappers={'flowers':MY_FLOWERS, 'aircraft':MY_AIRCRAFT, 'birds': MY_BIRDS, \
            'voc':MY_VOC, 'cars': MY_CARS, 'svhn': MY_SVHN, 'chars':MY_CHAR, 'scenes':MY_SCENES}

        # Position of each dataset in the task sequence; used as the task label.
        self.order_tasks = {'flowers':0, 'scenes':1, 'birds':2, 'cars':3, 'aircraft':4, 'voc':5,  'chars':6, 'svhn':7}

        self.task_lb = self.order_tasks[self.dset_name]

        wrapper = self.individual_wrappers[dset_name]
        dset_root = '%s/%s'%(dataroot, dset_name)
        if dset_name !='svhn':
            self.intern_dset = wrapper(img_path=dset_root,
                                        txt_path=tasklist,
                                        data_transforms=transform)
        else:
            # The svhn wrapper takes its root positionally and needs the split.
            self.intern_dset = wrapper(dset_root,
                                        tasklist=tasklist,
                                        data_transforms=transform,
                                        split=split)

        self.returnIDX = returnIDX

        # Full index range of the task; never modified after construction.
        self.indices_task_init = np.arange(len(self.intern_dset))

        # Currently visible indices; starts as an independent copy of the full range.
        self.indices_task = self.indices_task_init.copy()

    def __len__(self):
        """Number of currently visible samples."""
        return self.indices_task.shape[0]

    def select_random_subset(self, random_num):
        """Keep ``random_num`` samples drawn without replacement from the full task."""
        inds_keep = np.random.permutation(self.indices_task_init.shape[0])[:random_num]

        self.indices_task = self.indices_task_init[inds_keep]

    def select_specific_subset(self, indices_select):
        """Keep exactly the samples at ``indices_select`` (positions in the full task)."""
        self.indices_task = self.indices_task_init[indices_select]

    def __getitem__(self, idx):
        """Return ``(image, class_label, task_label)``, plus the underlying
        dataset index as a fourth element when ``returnIDX`` is set."""
        # Translate the position in the visible subset to a dataset index.
        idx = self.indices_task[idx]

        im, class_lbl = self.intern_dset[idx]

        if self.returnIDX:
            return im, class_lbl, self.task_lb, idx

        return im, class_lbl, self.task_lb


In [7]:
# Root of the 8-dataset collection (same value as in the split-generation cell).
dataroot = '/lab/arios/ProjIntel/incDFM/data/8dset/'
class eightdset_normalize():
    """Callable that applies a fixed per-channel ``TF.Normalize`` to an image.

    NOTE: this re-declares the class of the same name from the first cell
    with an identical body.
    """

    def __init__(self):
        channel_mean = (0.485, 0.456, 0.406)
        channel_std = (0.229, 0.224, 0.225)
        self.tf = TF.Normalize(channel_mean, channel_std)

    def __call__(self, img):
        """Return ``img`` after passing it through the stored normalization."""
        normalized = self.tf(img)
        return normalized


class eightdset_train():
    """Training-time image pre-processing used for the 8-dataset tasks.

    Composes, in order: ``TF.Resize(256)``, ``TF.CenterCrop(224)``,
    ``TF.RandomHorizontalFlip()`` and ``TF.ToTensor()``. No normalization
    step is part of the pipeline.

    NOTE: this re-declares the class of the same name from the first cell
    with an identical body.
    """
    # TODO: check whether the PIL-image conversion interferes with normalization.
    def __init__(self):
        pipeline_steps = [
            TF.Resize(256),
            TF.CenterCrop(224),
            TF.RandomHorizontalFlip(),
            TF.ToTensor(),
        ]
        self.tf = TF.Compose(pipeline_steps)

    def __call__(self, img):
        """Run ``img`` through the composed pipeline and return the result."""
        processed = self.tf(img)
        return processed
    
    
# Transforms are instantiated here, but the dataset below is built without one.
transform = eightdset_train()
transform_svhn = tforms.svhn_train()

# Pick the task to inspect and look up its training split file by task position.
dset_name = 'svhn'
order_tasks = {'flowers':0, 'scenes':1, 'birds':2, 'cars':3, 'aircraft':4, 'voc':5,  'chars':6, 'svhn':7}
tasklist = task_filepaths_train[order_tasks[dset_name]]

# No transform is applied to the samples returned by this dataset.
t_apply = None

dataset_8 = eightdsetTask(
    dataroot,
    dset_name,
    split='train',
    tasklist=tasklist,
    transform=t_apply,
    returnIDX=False,
    preload=False,
)

In [8]:
# Image (first element of the returned tuple) of the sample at position 10;
# dataset_8 was built with transform=None.
dataset_8.__getitem__(10)[0]

tensor([[[214., 213., 213.,  ..., 217., 217., 216.],
         [215., 214., 211.,  ..., 217., 217., 216.],
         [216., 215., 208.,  ..., 217., 217., 216.],
         ...,
         [179., 193., 208.,  ..., 218., 218., 217.],
         [211., 218., 223.,  ..., 217., 218., 217.],
         [216., 215., 212.,  ..., 216., 218., 216.]],

        [[218., 217., 217.,  ..., 221., 221., 221.],
         [216., 215., 216.,  ..., 221., 221., 221.],
         [217., 216., 212.,  ..., 221., 221., 221.],
         ...,
         [183., 195., 209.,  ..., 219., 219., 221.],
         [214., 220., 224.,  ..., 218., 219., 221.],
         [219., 217., 213.,  ..., 217., 219., 220.]],

        [[217., 218., 218.,  ..., 222., 222., 224.],
         [219., 218., 216.,  ..., 222., 222., 224.],
         [223., 221., 215.,  ..., 222., 222., 224.],
         ...,
         [188., 200., 213.,  ..., 221., 221., 220.],
         [219., 225., 228.,  ..., 220., 221., 220.],
         [224., 222., 217.,  ..., 219., 221., 219.]]]