In [None]:
#default_exp data.core

In [None]:
#export
from fastai2.torch_basics import *
from fastai2.data.load import *

In [None]:
from nbdev.showdoc import *

# Data core

> Core functionality for gathering data

The classes here provide functionality for applying a list of transforms to a set of items (`TfmdLists`, `DataSource`) or a `DataLoader` (`TfmdDL`), as well as the base class used to gather the data for model training: `DataLoaders` (formerly `DataBunch`).

## TfmdDL -

In [None]:

#export
@typedispatch
def show_batch(x, y, samples, ctxs=None, max_n=9, **kwargs):
    "Default `show_batch`: show every column of `samples`, at most `max_n` samples, on `ctxs` (created when `None`)"
    # `x` and `y` are only used to pick the implementation through type dispatch.
    contexts = Inf.nones if ctxs is None else ctxs
    for col in range_of(samples[0]):
        column = samples.itemgot(col)
        # `range(max_n)` caps the zip, so no more than `max_n` samples are shown per column
        contexts = [item.show(ctx=ctx, **kwargs) for item,ctx,_ in zip(column, contexts, range(max_n))]
    return contexts

- show_batch is a type-dispatched function that is responsible for showing decoded samples. 
- x and y are the input and the target in the batch to be shown, and are passed along to dispatch on their types. 
    - There is a different implementation of show_batch if x is a TensorImage or a TensorText for instance (see vision.core or text.data for more details). 
- ctxs can be passed but the function is responsible to create them if necessary. 
- kwargs depend on the specific implementation.

In [None]:
#export
@typedispatch
def show_results(x, y, samples, outs, ctxs=None, max_n=9, **kwargs):
    "Default `show_results`: show every column of `samples`, then every column of `outs`, on the same contexts"
    contexts = Inf.nones if ctxs is None else ctxs
    # The decoded samples are drawn first, then the outputs are drawn on the contexts they returned.
    for group in (samples, outs):
        for col in range(len(group[0])):
            column = group.itemgot(col)
            contexts = [item.show(ctx=ctx, **kwargs) for item,ctx,_ in zip(column, contexts, range(max_n))]
    return contexts

In [None]:
_all_ = ["show_batch", "show_results"]
_batch_tfms = ('after_item','before_batch','after_batch') # Callback names that `TfmdDL` wraps in a `Pipeline` each, so each one has its own encode/decode
# 'after_item': applied to a single item right after it has been grabbed from the dataset.
    # Built with as_item=False, so the transforms are applied to each element of the item tuple.
    # Example of after_item tfms: item_img_tfms = [ImageResizer(128), ToTensor()]
# 'before_batch': applied once all the items are gathered, right BEFORE they are collated into a batch.
    # The only pipeline built with as_item=True, i.e. it receives the list of items as one object;
    # the other two are applied to each value of the tuple.
# 'after_batch': applied AFTER the items have been collated into a batch by the dataloader.
    # These transforms run on a whole batch at a time.
    # GPU transformations go here


In [None]:
#export
@delegates() # Replaces `**kwargs` in the signature with the argument names of the parent class (`DataLoader`), which helps autocompletion
class TfmdDL(DataLoader): # fastai `DataLoader` that understands transforms
    "Transformed `DataLoader`"
    def __init__(self, dataset, bs=64, shuffle=False, num_workers=None, verbose=False, do_setup=True, **kwargs):
        if num_workers is None: num_workers = min(16, defaults.cpus)
        for nm in _batch_tfms:
            # Turn `after_item`, `before_batch` and `after_batch` into pipelines; only `before_batch` gets `as_item=True`
            kwargs[nm] = Pipeline(kwargs.get(nm,None), as_item=(nm=='before_batch'))
        super().__init__(dataset, bs=bs, shuffle=shuffle, num_workers=num_workers, **kwargs)
        if do_setup:
            # Give every pipeline the chance to set itself up on this dataloader
            for nm in _batch_tfms:
                pv(f"Setting up {nm}: {kwargs[nm]}", verbose)
                kwargs[nm].setup(self)

    def _one_pass(self):
        "Push the first item through the batch pipeline to record `_n_inp` and the output `_types`"
        b = self.do_batch([self.do_item(0)])
        if self.device is not None: b = to_device(b, self.device)
        its = self.after_batch(b)
        # A single output (or a 1-tuple) is one input; otherwise every element but the last is an input
        self._n_inp = 1 if not isinstance(its, (list,tuple)) or len(its)==1 else len(its)-1
        self._types = mapped(type,its)

    def _retain_dl(self,b):
        "Cast `b` back to the types recorded by `_one_pass` (this is what keeps the types at the end of the batch)"
        if not getattr(self, '_types', None): self._one_pass()
        return retain_types(b, typs=self._types)

    @delegates(DataLoader.new)
    def new(self, dataset=None, cls=None, **kwargs):
        "Create a new version of self with a few changed attributes"
        res = super().new(dataset, cls, do_setup=False, **kwargs)
        if not (hasattr(self, '_n_inp') and hasattr(self, '_types')):
            # Best effort: the copy is still returned when the pass fails, just without the cached info.
            # `Exception` (rather than a bare `except`) lets KeyboardInterrupt/SystemExit propagate.
            try: self._one_pass()
            except Exception:
                print("Could not do one pass in your dataloader, there is something wrong in it")
                return res
        res._n_inp,res._types = self._n_inp,self._types
        return res

    def before_iter(self):
        "Propagate the dataset's `split_idx` (if any) to the three batch pipelines before iterating"
        super().before_iter()
        split_idx = getattr(self.dataset, 'split_idx', None)
        for nm in _batch_tfms:
            f = getattr(self,nm)
            if isinstance(f,Pipeline): f.split_idx=split_idx

    # `_retain_dl` puts the types back first, so that decoding dispatches on the right types
    def decode(self, b): return self.before_batch.decode(to_cpu(self.after_batch.decode(self._retain_dl(b))))
    def decode_batch(self, b, max_n=9, full=True): return self._decode_batch(self.decode(b), max_n, full)
    def _decode_batch(self, b, max_n=9, full=True):
        "Split `b` into at most `max_n` samples and decode each with `after_item`, then with the dataset's `decode` if it has one"
        f = self.after_item.decode
        f = compose(f, partial(getattr(self.dataset,'decode',noop), full = full))
        return L(batch_to_samples(b, max_n=max_n)).map(f)

    def _pre_show_batch(self, b, max_n=9):
        "Decode `b` to be ready for `show_batch`"
        b = self.decode(b)
        # A batch that can show itself as a whole needs neither targets nor individual samples
        if hasattr(b, 'show'): return b,None,None
        its = self._decode_batch(b, max_n, full=False)
        if not is_listy(b): b,its = [b],L((o,) for o in its)
        return detuplify(b[:self.n_inp]),detuplify(b[self.n_inp:]),its

    def show_batch(self, b=None, max_n=9, ctxs=None, show=True, **kwargs):
        "Show `b` (defaults to `one_batch`), a list of lists of pipeline outputs (i.e. output of a `DataLoader`)"
        # 1. take the batch to show (or grab one)
        # 2. decode it with `after_batch.decode` then `before_batch.decode`, which also puts the types back
        # 3. dispatch to the type-specific module-level `show_batch`, which creates the contexts if needed
        # With `show=False` the decoded pieces are returned instead of displayed; `show_results` relies on this.
        if b is None: b = self.one_batch()
        if not show: return self._pre_show_batch(b, max_n=max_n)
        show_batch(*self._pre_show_batch(b, max_n=max_n), ctxs=ctxs, max_n=max_n, **kwargs)

    def show_results(self, b, out, max_n=9, ctxs=None, show=True, **kwargs):
        "Show each item of `b` and `out`"
        x,y,its = self.show_batch(b, max_n=max_n, show=False)
        # Pair the inputs of `b` with the outputs, then decode them the same way as the batch
        b_out = b[:self.n_inp] + (tuple(out) if is_listy(out) else (out,))
        x1,y1,outs = self.show_batch(b_out, max_n=max_n, show=False)
        # `its is None` means the batch knows how to show itself as a whole, so we pass `x` and `x1`
        res = (x,x1,None,None) if its is None else (x, y, its, outs.itemgot(slice(self.n_inp,None)))
        if not show: return res
        show_results(*res, ctxs=ctxs, max_n=max_n, **kwargs)

    @property
    def n_inp(self):
        "Number of inputs in a batch: taken from the dataset when it defines `n_inp`, otherwise found by `_one_pass`"
        if hasattr(self.dataset, 'n_inp'): return self.dataset.n_inp
        if not hasattr(self, '_n_inp'): self._one_pass()
        return self._n_inp

```python
def compose(*funcs, order=None):
    "Create a function that composes (do) all functions in `funcs`, passing along remaining `*args` and `**kwargs` to all"
    funcs = L(funcs)
    if order is not None: funcs = funcs.sorted(order)
    def _inner(x, *args, **kwargs):
        for f in L(funcs): x = f(x, *args, **kwargs)
        return x
    return _inner
```

```
TfmdDL(
    dataset,
    bs=16,
    shuffle=False,
    num_workers=None,
    drop_last=False,
    indexed=None,
    pin_memory=False,
    timeout=0,
    *,
    wif=None,
    before_iter=None,
    create_batches=None,
    sampler=None,
    create_item=None,
    after_item=None,
    before_batch=None,
    create_batch=None,
    retain=None,
    after_batch=None,
    after_iter=None,
    get_idxs=None,
)

DataLoader(
    dataset=None,
    bs=None,
    shuffle=False,
    drop_last=False,
    indexed=None,
    num_workers=0,
    pin_memory=False,
    timeout=0,
    *,
    wif=None,
    before_iter=None,
    create_batches=None,
    sampler=None,
    create_item=None,
    after_item=None,
    before_batch=None,
    create_batch=None,
    retain=None,
    after_batch=None,
    after_iter=None,
    get_idxs=None,
)
```

A `TfmdDL` is a `DataLoader` that creates `Pipeline` from a list of `Transform`s for the callbacks `after_item`, `before_batch` and `after_batch`. As a result, it can decode or show a processed `batch`.

In [None]:
# Docstrings for the public `TfmdDL` methods, kept in one place.
# NOTE(review): `before_iter` only gets the placeholder text "override" - confirm that is intended.
add_docs(TfmdDL,
         decode="Decode `b` using `tfms`",
         decode_batch="Decode `b` entirely",
         new="Create a new version of self with a few changed attributes",
         show_batch="Show `b` (defaults to `one_batch`), a list of lists of pipeline outputs (i.e. output of a `DataLoader`)",
         show_results="Show each item of `b` and `out`",
         before_iter="override")

In [None]:
# Minimal helper type for this notebook: an `int` that also inherits from `ShowTitle`.
class _Category(int, ShowTitle): pass

### Type reserve (part 2, continue from 2_05 notebook)

In [None]:
[(TensorImage([1]),)] * 4
# why a list of "tuple of 1" instead of just [TensorImage([1]),TensorImage([1]), ...]?
# since this list is supposed to be from Datasets (below), and it's always a tuple because
# Datasets have multiple lists (normally 2, 1 for X 1 for Y)
# simpler output of Datasets print(dsets) => [(1,),(0,),(0,)]

[(TensorImage([1]),),
 (TensorImage([1]),),
 (TensorImage([1]),),
 (TensorImage([1]),)]

In [None]:
# Test that the type is retained
class NegTfm(Transform):
    def encodes(self, x): return torch.neg(x)
    def decodes(self, x): return torch.neg(x)
    
tdl = TfmdDL([(TensorImage([1]),)] * 4, after_batch=NegTfm(), bs=4, num_workers=4)
b = tdl.one_batch()
test_eq(type(b[0]), TensorImage)
# Even though `NegTfm.encodes` goes through `torch.neg`, the Python type (the one given by `type(obj)`)
    # of the batch is still fastai's `TensorImage`; the pytorch type (the one given by `.type()`) is a separate notion.
# Note: this only works if the output type is a parent class of the input type (`TensorImage` is a child of `torch.Tensor`).

# See more in the cells below

In [None]:
temp = TensorImage([1])
print(type(temp),temp.type()) #input
print(type(b[0]), b[0].type()) #output

# so both the pytorch type and normal type of these 2 are the same, 
# probably since there is no IntToFloat tensor transformation. If there is, then pytorch type will change (look at 2_05_data_transforms notebook)

<class 'fastai2.torch_core.TensorImage'> torch.LongTensor
<class 'fastai2.torch_core.TensorImage'> torch.LongTensor


In [None]:
print(type(tdl.decode_batch(b)[0][0]), (tdl.decode_batch(b)[0][0]).type()) #decode the output
test_eq(type(tdl.decode_batch(b)[0][0]), TensorImage)

<class 'fastai2.torch_core.TensorImage'> torch.LongTensor


In [None]:
# Note: `TfmdDL.decode` first casts the batch back to the types it recorded from its own batches (`_retain_dl`),
# so the type is retained through decoding as well.
b = (tensor([1.,1.,1.,1.]),)
print(type(b[0]),b[0].type()) #before decode
print(type(tdl.decode_batch(b)[0][0]),(tdl.decode_batch(b)[0][0]).type()) #after decode

# Even when the batch `b` being decoded has a different Python type,
# decoding with `tdl` (a `TfmdDL` built on `TensorImage` inputs) converts it to `TensorImage`: the type is retained.

<class 'torch.Tensor'> torch.FloatTensor
<class 'fastai2.torch_core.TensorImage'> torch.FloatTensor


### Force no type reserve with ->None

In [None]:
# Same transform as before, except that `encodes` is annotated with `->None`.
# This deliberately redefines `NegTfm`: later cells use this version.
class NegTfm(Transform):
    def encodes(self, x)->None: return torch.neg(x) # the `->None` means type consistency is not enforced
    def decodes(self, x): return torch.neg(x)
    
tdl = TfmdDL([(TensorImage([1]),)] * 4, after_batch=NegTfm(), bs=4, num_workers=4)
b = tdl.one_batch()


In [None]:
test_eq(type(b[0]), torch.Tensor) # with `->None` the batch is NOT cast back to `TensorImage` (comparing against `TensorImage` fails)

In [None]:
temp = TensorImage([1])
print(type(temp),temp.type()) #input
print(type(b[0]), b[0].type()) #output. Don't reserve type. Type will be strictly depends on encode function

<class 'fastai2.torch_core.TensorImage'> torch.LongTensor
<class 'torch.Tensor'> torch.LongTensor


### TODO: The examples below are inconclusive: switching `A` to no type retention, or switching `f(x)` to type retention, gives the same results either way

In [None]:
# Identity when encoding; decoding wraps the value in `Int`.
class A(Transform): 
    def encodes(self, x): return x 
    def decodes(self, x): return Int(x) 

# Duplicates its input into a `Tuple`.
@Transform
def f(x)->None: return Tuple((x,x)) # `->None`: the output is not cast back to the input type

In [None]:
start = torch.arange(50)

In [None]:
a = A()
tdl = TfmdDL(start, after_item=lambda x: (a(x), f(x)), bs=4) 
# `after_item` receives one single item of `start` at a time (a `torch.Tensor`)

# ... and returns two things: `a(x)`, which is the item itself, and `f(x)`, which is a `Tuple`
x,y = tdl.one_batch()
test_eq(type(y), Tuple) # after encoding and batching, `y` is still a `Tuple`

In [None]:
type(start[0]), start[0].type() #input

(torch.Tensor, 'torch.LongTensor')

In [None]:
print(type(x), x.type()) #output x
print(type(y)) #output y. 
# Note that since f(x) doesn't reserve type, y normal type (Tuple) isn't converted to torch.Tensor type

<class 'torch.Tensor'> torch.LongTensor
<class 'fastcore.utils.Tuple'>


In [None]:
x,y,type(y)

(tensor([0, 1, 2, 3]),
 (tensor([0, 1, 2, 3]), tensor([0, 1, 2, 3])),
 fastcore.utils.Tuple)

In [None]:
s = tdl.decode_batch((x,y)) # since bs = 4, return L list of 4
s

(#4) [(tensor(0), (tensor(0), tensor(0))),(tensor(1), (tensor(1), tensor(1))),(tensor(2), (tensor(2), tensor(2))),(tensor(3), (tensor(3), tensor(3)))]

In [None]:
s[0]

(tensor(0), (tensor(0), tensor(0)))

In [None]:
print(type(s[0][0]),s[0][0].type()) #decode for x, which is result of A transform
# => with type reserve, both types are exactly the same with start's 

<class 'torch.Tensor'> torch.LongTensor


In [None]:
print(type(s[0][1])) # decode for y, which is f(x) transform with no type reserve. 
# => original normal type is kept

<class 'fastcore.utils.Tuple'>


In [None]:
tdl = TfmdDL(torch.arange(0,50), after_item=A(), after_batch=NegTfm(), bs=4)
test_eq(tdl.dataset[0], start[0])
test_eq(len(tdl), (50-1)//4+1)
test_eq(tdl.bs, 4)
test_stdout(tdl.show_batch, '0\n1\n2\n3')

## DataLoaders (formerly known as DataBunch)

In [None]:
??GetAttr

Why does `GetAttr` exist?
- 1st: because a plain \__getattr__ delegates EVERYTHING.

Potential problem: errors get hidden. For example, with the typo `db.on_batch()` instead of `db.one_batch()`, a plain \__getattr__ would still try to resolve the misspelled name instead of handling it properly.

- 2nd: a plain \__getattr__ gives no tab completion.

```python
def add_props(f, n=2):
    "Create properties passing each of `range(n)` to f"
    return (property(partial(f,i)) for i in range(n))
```

In [None]:
# what's going on inside add_props
tempf1 = lambda i,x: x[i]
tempf2 = [partial(tempf1,i) for i in range(2)] # just an array of 2 functions, one with default param i=0 and one with default param i=1
print(tempf2[0](['a','b']))# first function (with i=0) got called with params ['a','b']
print(tempf2[1](['a','b']))

a
b


### Old: databunch

```python
@docs
class DataBunch(GetAttr): 
    # GetAttr: wrapper around __getattr__, using '_default' to set what object will be used in default
    "Basic wrapper around several `DataLoader`s."
    _default='train_dl' # default object for tab completion. This means databunch.somefunc() ~ databunch.train_dl.somefunc()
    _xtra = 'one_batch show_batch dataset'.split() # only tab completion these things.
    # if no _xtra, then databunch instance can access ALL attributes and functions of _default (also all tab-completion)
    # TODO: not sure if _xtra still works in current version of fastai2
    
    def __init__(self, *dls, path='.', device=None): self.dls,self.path = dls,Path(path)
        #note: you can pass as many dataloader as you like
        # dls will be stored as arrays and can be accessed by databunch instance itself, such as dbch[0],dbch[1] ...
        # this helps with add_props below
    def __getitem__(self, i): return self.dls[i]
    def new_empty(self):
        dls = [dl.new(dl.dataset.new_empty()) for dl in self.dls]
        return type(self)(*dls)
    

    # add_props: add property (see above)
    train_dl,valid_dl = add_props(lambda i,x: x[i])
    # equivalent to this
    # @property def train_dl(self): return self[0]
    # @property def valid_dl(self): return self[1]
    
    train_ds,valid_ds = add_props(lambda i,x: x[i].dataset)
    
    def cuda(self, device=None):
        for dl in self.dls: dl.device = default_device() if device is None else device
        return self
    
    @classmethod
    @delegates(TfmdDL.__init__)
    def from_dblock(cls, dblock, source, path='.', type_tfms=None, item_tfms=None, batch_tfms=None, **kwargs):
        return dblock.databunch(source, path=path, type_tfms=type_tfms, item_tfms=item_tfms, batch_tfms=batch_tfms, **kwargs)

    _docs=dict(__getitem__="Retrieve `DataLoader` at `i` (`0` is training, `1` is validation)",
               train_dl="Training `DataLoader`",
               valid_dl="Validation `DataLoader`",
               train_ds="Training `Dataset`",
               valid_ds="Validation `Dataset`",
               cuda="Use `device` (defaults to `default_device()`)",
               new_empty="Create a new empty version of `self` with the same transforms",
               from_dblock="Create a databunch from a given `dblock`")
```

### New: dataloaders

In [None]:
@docs
class DataLoaders(GetAttr):
    # GetAttr: wrapper around __getattr__, using `_default` to choose the object that attribute lookups fall back to
    "Basic wrapper around several `DataLoader`s."
    _default='train' # default object for tab completion. This means dataloaders.somefunc() ~ dataloaders.train.somefunc()
    def __init__(self, *loaders, path='.', device=None):
        # Any number of dataloaders can be passed; they are kept in order and can be
        # indexed on the instance itself (dls[0], dls[1], ...), which `add_props` below relies on.
        self.loaders,self.path = loaders,Path(path)
        # Goes through the `device` setter, so `self.loaders` must already be set
        self.device = device

    def __getitem__(self, i): return self.loaders[i]
    def new_empty(self):
        loaders = [dl.new(dl.dataset.new_empty()) for dl in self.loaders]
        return type(self)(*loaders, path=self.path, device=self.device)

    # add_props creates one property per index, equivalent to:
    # @property def train(self): return self[0]
    # @property def valid(self): return self[1]
    train   ,valid    = add_props(lambda i,x: x[i])
    train_ds,valid_ds = add_props(lambda i,x: x[i].dataset)

    @property
    def device(self): return self._device

    @device.setter
    def device(self, d):
        # Keep every wrapped loader on the same device as the wrapper
        for dl in self.loaders: dl.device = d
        self._device = d

    def cuda(self, device=None):
        self.device = default_device() if device is None else device
        return self

    def cpu(self): return self.cuda(device=torch.device('cpu'))

    @classmethod
    def from_dsets(cls, *ds, path='.',  bs=64, device=None, dl_type=TfmdDL, **kwargs):
        # Only the first dataset is shuffled and drops its last incomplete batch, unless overridden in `kwargs`
        default = (True,) + (False,) * (len(ds)-1)
        per_ds_defaults = {'shuffle': default, 'drop_last': default}
        # Every keyword becomes a tuple with one value per dataset...
        kwargs = merge(per_ds_defaults, {k: tuplify(v, match=ds) for k,v in kwargs.items()})
        # ...which is then regrouped into one kwargs dict per dataset
        kwargs = [{k: v[i] for k,v in kwargs.items()} for i in range_of(ds)]
        # `bs` has to be forwarded explicitly, otherwise the loaders silently use `dl_type`'s own default
        return cls(*[dl_type(d, bs=bs, **k) for d,k in zip(ds, kwargs)], path=path, device=device)

    @classmethod
    def from_dblock(cls, dblock, source, path='.',  bs=64, val_bs=None, shuffle_train=True, device=None, **kwargs):
        return dblock.dataloaders(source, path=path, bs=bs, val_bs=val_bs, shuffle_train=shuffle_train, device=device, **kwargs)

    _docs=dict(__getitem__="Retrieve `DataLoader` at `i` (`0` is training, `1` is validation)",
               train="Training `DataLoader`",
               valid="Validation `DataLoader`",
               train_ds="Training `Dataset`",
               valid_ds="Validation `Dataset`",
               cuda="Use `device` (defaults to `default_device()`)",
               cpu="Use the cpu",
               new_empty="Create a new empty version of `self` with the same transforms",
               from_dsets="Create a dataloaders from the datasets `ds`, one `dl_type` per dataset",
               from_dblock="Create a dataloaders from a given `dblock`")

In [None]:
tdl = TfmdDL([(TensorImage([1]),)] * 4, after_batch=NegTfm(), bs=4, num_workers=4)

In [None]:
dls = DataLoaders(tdl,tdl)

In [None]:
x = dls.train.one_batch() # or dls[0].one_batch()
x2 = next(iter(tdl))
test_eq(x,x2)

In [None]:
x2 = dls.one_batch() # Note: quick tab completion, result of GetAttr. 
# This is basically dls.train.one_batch()
test_eq(x,x2)

In [None]:
x

(tensor([[-1],
         [-1],
         [-1],
         [-1]]),)

In [None]:
temp1 = dls.dataset # `dataset` is not defined on `DataLoaders`, so it is looked up on the default object, `dls.train`
temp2 = dls.train_ds # == dls.train.dataset
temp3 = dls.train.dataset
test_eq(temp1,temp2)
test_eq(temp1,temp3)

In [None]:
dls.train.dataset # note that NO tfms is done on Dataset

[(TensorImage([1]),),
 (TensorImage([1]),),
 (TensorImage([1]),),
 (TensorImage([1]),)]

In [None]:
dls.on_batch() # proper behavior on typo

AttributeError: on_batch

In [None]:

# Identity when encoding; decoding wraps the value in `TitledInt`.
class A(Transform): 
    def encodes(self, x): return x 
    def decodes(self, x): return TitledInt(x) # TitledInt: int that can be shown as title

# Duplicates its input into a `Tuple`; `->None` means the output is not cast back to the input type.
@Transform
def f(x)->None: return Tuple((x,x))

start = torch.arange(50)
test_eq_type(f(2), Tuple((2,2)))

a = A()
tdl = TfmdDL(start, after_item=lambda x: (a(x), f(x)), bs=4)
x,y = tdl.one_batch()
test_eq(type(y), Tuple) # the `Tuple` type survives batching

s = tdl.decode_batch((x,y))
test_eq(type(s[0][1]), Tuple) # ...and decoding

## TfmdLists (train and val)

**A TfmdLists combines a collection of object with a transformation Pipeline.**
- tfms applied only when index (lazy)
- Try to have Pytorch Dataset behavior?
- tfms can either be a Pipeline or a list of transforms, in which case, it will wrap them in a Pipeline. 

- use_list is passed along to L with the items, 
- as_item and split_idx are passed to each transform of the Pipeline. 
- do_setup indicates if the Pipeline.setup method should be called during initialization.

In [None]:

#export
class FilteredBase:
    "Base class for lists with subsets"
    _dl_type,_dbunch_type = TfmdDL,DataLoaders
    def __init__(self, *args, dl_type=None, **kwargs):
        if dl_type is not None: self._dl_type = dl_type
        # Expose the keyword arguments of the dataloader class in the signature of `dataloaders`
        self.dataloaders = delegates(self._dl_type.__init__)(self.dataloaders)
        super().__init__(*args, **kwargs)

    @property
    def n_subsets(self): return len(self.splits)
    def _new(self, items, **kwargs): return super()._new(items, splits=self.splits, **kwargs)
    # Abstract: subclasses return the `i`th subset. `NotImplementedError` is the exception class;
    # `NotImplemented` is a comparison sentinel and raising it fails with a `TypeError`.
    def subset(self, i=None): raise NotImplementedError

    def dataloaders(self, bs=64, val_bs=None, shuffle_train=True, n=None, path='.', dl_type=None, dl_kwargs=None, device=None,
                  **kwargs):
        "Build one dataloader per subset (subset 0 is the training one) and wrap them in `_dbunch_type`"
        if device is None: device=default_device()
        if dl_kwargs is None: dl_kwargs = [{}] * self.n_subsets
        if dl_type is None: dl_type = self._dl_type
        # The training loader is the only one that may shuffle and drop its last incomplete batch
        dl = dl_type(self.subset(0), bs=bs, shuffle=shuffle_train, drop_last=shuffle_train, n=n, device=device,
                     **merge(kwargs, dl_kwargs[0]))
        # The other loaders are derived from the training one with `new`, using `val_bs` when given
        dls = [dl] + [dl.new(self.subset(i), bs=(bs if val_bs is None else val_bs), shuffle=False, drop_last=False, 
                             n=None, **dl_kwargs[i]) for i in range(1, self.n_subsets)]
        return self._dbunch_type(*dls, path=path, device=device)

# `train` and `valid` are properties returning subset 0 and subset 1
FilteredBase.train,FilteredBase.valid = add_props(lambda i,x: x.subset(i))

In [None]:
#export
class TfmdLists(FilteredBase, L, GetAttr):
    "A `Pipeline` of `tfms` applied to a collection of `items`"
    _default='tfms'
    def __init__(self, items, tfms, use_list=None, do_setup=True, as_item=True, split_idx=None, train_setup=True,
                 splits=None, types=None, verbose=False):
        super().__init__(items, use_list=use_list)
        # Without explicit splits: one subset holding everything, plus an empty one
        if splits is None: splits = [slice(None),[]]
        self.splits = L(splits).map(mask2idxs)
        # Another `TfmdLists` lends its pipeline; a ready-made `Pipeline` is never set up again
        if isinstance(tfms,TfmdLists): tfms = tfms.tfms
        if isinstance(tfms,Pipeline): do_setup=False
        # `split_idx` is handed to the `Pipeline` so that it knows which subset it runs on
        self.tfms = Pipeline(tfms, as_item=as_item, split_idx=split_idx)
        self.types = types
        if not do_setup: return
        pv(f"Setting up {self.tfms}", verbose)
        self.setup(train_setup=train_setup)

    def _new(self, items, **kwargs):
        return super()._new(items, tfms=self.tfms, do_setup=False, types=self.types, **kwargs)

    def subset(self, i):
        picked = self._get(self.splits[i])
        return self._new(picked, split_idx=i)

    def _after_item(self, o): return self.tfms(o)

    def __repr__(self):
        return f"{self.__class__.__name__}: {self.items}\ntfms - {self.tfms.fs}"

    def __iter__(self): return (self[pos] for pos in range(len(self)))
    def show(self, o, **kwargs): return self.tfms.show(o, **kwargs)
    def decode(self, o, **kwargs): return self.tfms.decode(o, **kwargs)
    def __call__(self, o, **kwargs): return self.tfms(o, **kwargs)

    def overlapping_splits(self):
        # Count how many times each index shows up across all the splits, keep the counts above 1
        counts = Counter(self.splits.concat())
        return L(counts.values()).filter(gt(1))

    def setup(self, train_setup=True):
        self.tfms.setup(self, train_setup)
        if len(self) != 0:
            # Take one raw (untransformed) item and follow its type through the pipeline
            if self.splits is None: sample = super().__getitem__(0)
            else: sample = super().__getitem__(self.splits[0])[0]
            typs = self.types = []
            for tfm in self.tfms.fs:
                # A transform may declare the types it accepts; otherwise use the type it was actually fed
                typs.append(getattr(tfm, 'input_types', type(sample)))
                sample = tfm(sample)
            typs.append(type(sample))
        flat_types = L(t if is_listy(t) else [t] for t in self.types).concat().unique()
        self.pretty_types = '\n'.join(f'  - {t}' for t in flat_types)

    def infer_idx(self, x):
        # Position of the first recorded type that `x` is an instance of (`len(self.types)` when there is none)
        idx = next((pos for pos,t in enumerate(self.types) if isinstance(x, t)), len(self.types))
        flat_types = L(t if is_listy(t) else [t] for t in self.types).concat().unique()
        pretty_types = '\n'.join(f'  - {t}' for t in flat_types)
        assert idx < len(self.types), f"Expected an input of type in \n{pretty_types}\n but got {type(x)}"
        return idx

    def infer(self, x):
        start = self.infer_idx(x)
        return compose_tfms(x, tfms=self.tfms.fs[start:], split_idx=self.split_idx)

    def __getitem__(self, idx):
        got = super().__getitem__(idx)
        tfm = self._after_item
        if tfm is None: return got
        # A single index yields one transformed item; anything else yields a collection mapped through the pipeline
        if is_indexer(idx): return tfm(got)
        return got.map(tfm)

In [None]:
# Docstrings for the public `TfmdLists` methods.
add_docs(TfmdLists,
         setup="Transform setup with self",
         decode="From `Pipeline`",
         show="From `Pipeline`",
         overlapping_splits="All splits that are in more than one split",
         subset="New `TfmdLists` with same tfms that only includes items in `i`th split",
         infer_idx="Finds the index where `self.tfms` can be applied to `x`, depending on the type of `x`",
         infer="Apply `self.tfms` to `x` starting at the right tfm depending on the type of `x`")

In [None]:

#exports
def decode_at(o, idx):
    "Decoded item at `idx`"
    # Fetch the (encoded) item first, then let `o` decode it
    item = o[idx]
    return o.decode(item)

In [None]:
#exports
def show_at(o, idx, **kwargs):
    "Show item at `idx`"
    # `kwargs` are passed through to `o.show` unchanged
    return o.show(o[idx], **kwargs)

In [None]:

# Encoding casts to `TitledInt`, decoding casts to `TitledFloat`.
class _IntFloatTfm(Transform):
    def encodes(self, o):  return TitledInt(o)
    def decodes(self, o):  return TitledFloat(o)
int2f_tfm=_IntFloatTfm()

# Negation undoes itself, so `_neg` is used both to encode and to decode.
def _neg(o): return -o
neg_tfm = Transform(_neg, _neg)

In [None]:
# how to use TfmdList

items = L([1.,2.,3.])
tfms = [neg_tfm, int2f_tfm]
tl = TfmdLists(items, tfms=tfms)

In [None]:
tl

TfmdLists: [1.0, 2.0, 3.0]
tfms - (#2) [Transform: True (object,object) -> _neg (object,object) -> _neg,_IntFloatTfm: True (object,object) -> encodes (object,object) -> decodes]

In [None]:
tl[0],tl[1],tl[2] # transforms applied only when index (lazy)

# TODO: test if TfmdList can reserve type

(-1, -2, -3)

In [None]:
type(tl[0]),type(tl[1]),type(tl[2])

(fastai2.torch_core.TitledInt,
 fastai2.torch_core.TitledInt,
 fastai2.torch_core.TitledInt)

In [None]:
tl.decode(tl[0]),type(tl.decode(tl[0])) # change to TitledFloat then negated

(1.0, fastai2.torch_core.TitledFloat)

In [None]:
tl.types # filled by `setup`: one entry per transform (its `input_types` if it has any, else the type of the value fed to it), then the type of the final output

[float, float, fastai2.torch_core.TitledInt]

In [None]:
tl.tfms # auto turn tfms into pipeline

Pipeline: (#2) [Transform: True (object,object) -> _neg (object,object) -> _neg,_IntFloatTfm: True (object,object) -> encodes (object,object) -> decodes]

In [None]:
test_eq_type(tl[0], TitledInt(-1))
test_eq_type(tl[1], TitledInt(-2))
test_eq_type(tl.decode(tl[2]), TitledFloat(3.))
test_stdout(lambda: show_at(tl, 2), '-3')
test_eq(tl.types, [float, float, TitledInt])

Add train/val splits to TfmdLists

In [None]:
items,tfms

((#3) [1.0,2.0,3.0],
 [Transform: True (object,object) -> _neg (object,object) -> _neg,
  _IntFloatTfm: True (object,object) -> encodes (object,object) -> decodes])

In [None]:

splits = [[0,2],[1]]
tl = TfmdLists(items, tfms=tfms, splits=splits)

In [None]:
tl.train,tl.train.items

(TfmdLists: [1.0, 3.0]
 tfms - (#2) [Transform: True (object,object) -> _neg (object,object) -> _neg,_IntFloatTfm: True (object,object) -> encodes (object,object) -> decodes],
 [1.0, 3.0])

In [None]:
tl.train.tfms,tl.valid.tfms # train and val have A COPY OF a same pipeline (see cell below)
# implying we can have different pipelines for train/val

(Pipeline: (#2) [Transform: True (object,object) -> _neg (object,object) -> _neg,_IntFloatTfm: True (object,object) -> encodes (object,object) -> decodes],
 Pipeline: (#2) [Transform: True (object,object) -> _neg (object,object) -> _neg,_IntFloatTfm: True (object,object) -> encodes (object,object) -> decodes])

In [None]:
hex(id(tl.train.tfms)),hex(id(tl.valid.tfms))

('0x7f0bfa17f8d0', '0x7f0bfa17fa50')

In [None]:
test_eq(tl.n_subsets, 2)
test_eq(tl.train, tl.subset(0)) #with splits, tfmdlist can auto set train and valid
test_eq(tl.valid, tl.subset(1))

test_eq(tl.train.items, items[splits[0]])
test_eq(tl.valid.items, items[splits[1]])

In [None]:
print(type(tl.train.tfms))
test_eq(tl.train.tfms.split_idx, 0) # train tfms pipeline knows that it is executed on train dataset
# meaning EACH tfm in train pipeline knows this
test_eq(tl.valid.tfms.split_idx, 1) # val tfms pipeline knows ...

# all thanks to class Pipeline and compose functions in 6_04_transform notebook

<class 'fastcore.transform.Pipeline'>


In [None]:
test_eq_type(tl.splits, L(splits))
assert not tl.overlapping_splits() # check split overlapping. Cool feature!

In [None]:
df = pd.DataFrame(dict(a=[1,2,3],b=[2,3,4]))
df

Unnamed: 0,a,b
0,1,2
1,2,3
2,3,4


In [None]:
tl = TfmdLists(df, tfms = lambda o: o.a+1, splits=[[0],[1,2]]) # take dataframe input

In [None]:
tl # simple tfm: increase by 1

TfmdLists:    a  b
0  1  2
1  2  3
2  3  4
tfms - (#1) [Transform: True (object,object) -> <lambda> ]

In [None]:
tl.items

Unnamed: 0,a,b
0,1,2
1,2,3
2,3,4


In [None]:
tl[0],tl[1],tl[2] # column a only (post tfm)

(2, 3, 4)

In [None]:
tl[:] # get everything post tfm as L list

(#3) [2,3,4]

In [None]:
test_eq(tl[1,2], [3,4])
tr = tl.subset(0)
test_eq(tr[:], [2])
val = tl.subset(1)
test_eq(val[:], [3,4])

In [None]:
items

(#3) [1.0,2.0,3.0]

In [None]:
# Shifts values by `m`, which starts at 0 and is replaced by the mean of the items during setup.
class _B(Transform):
    def __init__(self): self.m = 0
    def encodes(self, o): return o+self.m
    def decodes(self, o): return o-self.m
    def setups(self, items): self.m = tensor(items).float().mean().item() # `.item()` turns the mean into a plain Python number

In [None]:
# test for setup, which updates `self.m`
tl = TfmdLists(items, _B())
test_eq(tl.m, 2.0)

In [None]:
type(tl.m)

float

### Here's how we can use `TfmdList.setup` to implement a simple category list, getting labels from a mock file list:

In [None]:
class _Cat(Transform):
    "Toy categorizer: maps a label to its index in the sorted vocab built in `setups`"
    # Explicit order, so that a transform with no `order` of its own (such as `_lbl`) is sorted ahead of it
    order = 1

    def encodes(self, o):
        idx = self.o2i[o]
        return int(idx)

    def decodes(self, o):
        label = self.vocab[o]
        return TitledStr(label)

    def setups(self, items):
        # `bidir=True` returns both the unique values and the value->index mapping
        vocab_and_o2i = uniqueify(L(items), sort=True, bidir=True)
        self.vocab, self.o2i = vocab_and_o2i

tcat = _Cat()

def _lbl(o):
    "Label of file name `o`: the text before the first underscore, wrapped in a `TitledStr`"
    # No `order` is set here; the sort check in the following cells shows it running before `_Cat` (order=1)
    prefix = o.split('_')[0]
    return TitledStr(prefix)

In [None]:
# Check that tfms are sorted by `order` & `_lbl` is called first
fns = ['dog_0.jpg','cat_0.jpg','cat_2.jpg','cat_1.jpg','dog_1.jpg']
tl = TfmdLists(fns, [tcat,_lbl])
exp_voc = ['cat','dog']

In [None]:
hex(id(tl.vocab)), hex(id(tl.tfms.vocab)), hex(id(tcat.vocab))
# tfm class var can be accessed by: tfmdList obj, tfmdList's pipeline, and of course the tfm itself

('0x7f0bf2bc9550', '0x7f0bf2bc9550', '0x7f0bf2bc9550')

In [None]:
tl.vocab, tl.o2i

((#2) ['cat','dog'], {'cat': 0, 'dog': 1})

In [None]:
tl.tfms

Pipeline: (#2) [Transform: True (object,object) -> _lbl ,_Cat: True (object,object) -> encodes (object,object) -> decodes]

In [None]:
# test setup to see whether vocab is created
test_eq(tcat.vocab, exp_voc)
test_eq(tl.tfms.vocab, exp_voc)
test_eq(tl.vocab, exp_voc)


test_eq(tl, (1,0,0,0,1))
test_eq([tl.decode(o) for o in tl], ('dog','cat','cat','cat','dog'))

### IMPORTANT NOTE: Check that only the training set is taken into account for Pipeline setup (best practice)

In [None]:
tl = TfmdLists(fns, [tcat,_lbl], splits=[[0,4], [1,2,3]])
test_eq(tcat.vocab, ['dog'])

In [None]:
start = torch.arange(50)

In [None]:
class A(Transform):
    "Toy transform: add 1 when encoding, wrap the value in a `TitledInt` when decoding"
    def encodes(self, x):
        return x + 1

    def decodes(self, x):
        # Decoding only changes the type; it does not subtract the 1 added by `encodes`
        return TitledInt(x)

### Use both tfmdLists and TfmdDL + The consequences of matching split_idx for different tfms

In [None]:
start

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
        36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49])

In [None]:
tfm = NegTfm(split_idx=0) # set split_idx for NegTfm (if there is train/val set, this tfm is only used on train set aka subset 0)
tds = TfmdLists(start, A())
tdl = TfmdDL(tds, after_batch=tfm, bs=4) #after_batch: run AFTER tuples being collated together in a batch.

In [None]:
print(tds.split_idx)
print(tdl.split_idx)
print(tfm.split_idx)

None
None
0


In [None]:
x = tdl.one_batch()
x # increased by one, but not neg because those tfms' split idx dont match (see cell above)

tensor([1, 2, 3, 4])

In [None]:
tds.split_idx = 0 # set tfmdlist's tfm (A()) split_idx to match NegTfm split_idx => both tfms will be executed
# note that tfmdDL split_idx is auto set to be same as tfmdLists split_idx
x = tdl.one_batch()
x #

tensor([-1, -2, -3, -4])

In [None]:
print(tds.split_idx)
print(tdl.split_idx)
print(tfm.split_idx)

0
0
0


In [None]:
tds.split_idx = 1
print(tds.split_idx)
print(tdl.split_idx) 
print(tfm.split_idx)

1
1
0


In [None]:
x = tdl.one_batch()
x

tensor([1, 2, 3, 4])

In [None]:
tfm = NegTfm() # DON'T set split_idx for tfm
tds = TfmdLists(start, A())
tdl = TfmdDL(tds, after_batch=tfm, bs=4)
print(tds.split_idx)
print(tdl.split_idx)
print(tfm.split_idx)

None
None
None


In [None]:
x = tdl.one_batch() # due to matching split_idx in cell above, all tfms are performed
x

tensor([-1, -2, -3, -4])

## Datasets (creating X and y)

Normally, when you index into a dataset, you should get back both the independent variable and the dependent variable.

This is where that happens, **using multiple `TfmdLists` with multiple pipelines (ideally 2: one for the independent variable and one for the dependent variable)**

In [None]:
#export
@docs
@delegates(TfmdLists)
class Datasets(FilteredBase):
    "A dataset that creates a tuple from each `tfms`, passed thru `item_tfms`"
    def __init__(self, items=None, tfms=None, tls=None, n_inp=None, dl_type=None, **kwargs):
        super().__init__(dl_type=dl_type)
        # Use the ready-made `tls` when given; otherwise build one `TfmdLists` over `items` per entry of `tfms`.
        # Example: `Datasets(items, [[PILImage.create], [labeller, Categorize()]])` ends up with two
        # `TfmdLists`: one for the input (x) pipeline and one for the target (y) pipeline.
        self.tls = L(tls if tls else [TfmdLists(items, t, **kwargs) for t in L(ifnone(tfms,[None]))])
        # Unless `n_inp` is given: 1 when there is a single `TfmdLists`, otherwise all of them but the last
        self.n_inp = (1 if len(self.tls)==1 else len(self.tls)-1) if n_inp is None else n_inp

    def __getitem__(self, it):
        # Index every `TfmdLists` with `it`: a single index gives one tuple, while several indices
        # (or a mask) give a list of tuples, one per selected item
        res = tuple([tl[it] for tl in self.tls])
        return res if is_indexer(it) else list(zip(*res))

    # Attributes not found on `self` are resolved by `gather_attrs` against `tls`
    def __getattr__(self,k): return gather_attrs(self, k, 'tls')
    def __dir__(self): return super().__dir__() + gather_attr_names(self, 'tls')
    # Every `TfmdLists` is built over the same items, so the first one gives the length
    def __len__(self): return len(self.tls[0])
    def __iter__(self): return (self[i] for i in range(len(self)))
    def __repr__(self): return coll_repr(self)
    def decode(self, o, full=True): return tuple(tl.decode(o_, full=full) for o_,tl in zip(o,tuplify(self.tls, match=o)))
    def subset(self, i): return type(self)(tls=L(tl.subset(i) for tl in self.tls), n_inp=self.n_inp)
    def _new(self, items, *args, **kwargs): return super()._new(items, tfms=self.tfms, do_setup=False, **kwargs)
    def overlapping_splits(self): return self.tls[0].overlapping_splits()
    # `splits`, `split_idx` and `items` are all read from the first `TfmdLists`
    @property
    def splits(self): return self.tls[0].splits
    @property
    def split_idx(self): return self.tls[0].tfms.split_idx
    @property
    def items(self): return self.tls[0].items
    @items.setter
    def items(self, v):
        # Setting `items` updates every `TfmdLists`, keeping them in sync
        for tl in self.tls: tl.items = v

    def show(self, o, ctx=None, **kwargs):
        # Each `TfmdLists` shows its own element of `o`, handing the returned `ctx` to the next one
        for o_,tl in zip(o,self.tls): ctx = tl.show(o_, ctx=ctx, **kwargs)
        return ctx

    def new_empty(self):
        tls = [tl._new([], split_idx=tl.split_idx) for tl in self.tls]
        return type(self)(tls=tls, n_inp=self.n_inp)

    @contextmanager
    def set_split_idx(self, i):
        old_split_idx = self.split_idx
        for tl in self.tls: tl.tfms.split_idx = i
        # `finally` puts the previous `split_idx` back even when the body of the `with` block raises
        try: yield self
        finally:
            for tl in self.tls: tl.tfms.split_idx = old_split_idx

    _docs=dict(
        decode="Compose `decode` of all `tuple_tfms` then all `tfms` on `i`",
        show="Show item `o` in `ctx`",
        dataloaders="Get a `DataLoaders`",
        overlapping_splits="All splits that are in more than one split",
        subset="New `Datasets` that only includes subset `i`",
        new_empty="Create a new empty version of the `self`, keeping only the transforms",
        set_split_idx="Contextmanager to use the same `Datasets` with another `split_idx`"
    )

A `Datasets` creates **a tuple from `items` (typically input,target)** by applying to them each list of `Transform` (or `Pipeline`) in `tfms`. 

Note that **if `tfms` contains only one list of `tfms`, the items given by `Datasets` will be tuples of one element.**

`n_inp` is the number of elements in the tuples that should be considered part of the input and will default to 1 if `tfms` consists of one set of transforms, `len(tfms)-1` otherwise. In most cases, the number of elements in the tuples spit out by `Datasets` will be 2 (for input,target) but it can happen that there are 3 (Siamese networks or tabular data), in which case we need to be able to determine where the inputs end and the targets begin.

In [None]:
class _IntFloatTfm(Transform):
    "Toy transform: wrap in `TitledInt` when encoding and in `TitledFloat` when decoding"
    def encodes(self, o):
        return TitledInt(o)

    def decodes(self, o):
        return TitledFloat(o)

int2f_tfm = _IntFloatTfm()



In [None]:
items = [1,2,3,4]
dsets = Datasets(items, [[neg_tfm,int2f_tfm], [add(1)]]) # 2 set of tfms aka 2 pipelines
t = dsets[0] # 2 pipelines applied only when index (lazy, because tfmdlist is used)
test_eq(t, (-1,2)) # 2 results b/c 2 pipelines on the same item (items[0])

test_eq(dsets[0,1,2], [(-1,2),(-2,3),(-3,4)])
test_eq(dsets.n_inp, 1)
dsets.decode(t)

(1.0, 2)

In [None]:
dsets.tls[0] # 2 tfmdlist b/c of 2 pipelines

TfmdLists: [1, 2, 3, 4]
tfms - (#2) [Transform: True (object,object) -> _neg (object,object) -> _neg,_IntFloatTfm: True (object,object) -> encodes (object,object) -> decodes]

In [None]:
dsets.tls[1]

TfmdLists: [1, 2, 3, 4]
tfms - (#1) [Transform: True (object,object) -> <lambda> ]

In [None]:
dsets.tls[0].tfms,dsets.tls[1].tfms

(Pipeline: (#2) [Transform: True (object,object) -> _neg (object,object) -> _neg,_IntFloatTfm: True (object,object) -> encodes (object,object) -> decodes],
 Pipeline: (#1) [Transform: True (object,object) -> <lambda> ])

In [None]:
dsets.train,dsets.valid # no split so nothing in val. Note that tfms already applied when you do .train or .valid

((#4) [(-1, 2),(-2, 3),(-3, 4),(-4, 5)], (#0) [])

In [None]:

# NOTE(review): this cell re-declares `_IntFloatTfm`/`int2f_tfm` exactly as in an earlier cell of this
# section, and `neg_tfm` is already used by an earlier `Datasets(...)` cell -- confirm whether this cell
# should move up or be dropped.
class _IntFloatTfm(Transform):
    # Wraps in `TitledInt` on encode and in `TitledFloat` on decode, so decoding is not the inverse of encoding
    def encodes(self, o):  return TitledInt(o)
    def decodes(self, o):  return TitledFloat(o)
int2f_tfm=_IntFloatTfm()

def _neg(o): return -o
# `_neg` is passed twice, as both encoder and decoder: negating again undoes the negation
neg_tfm = Transform(_neg, _neg)

In [None]:
class Norm(Transform):
    "Standardise items with the mean `m` and standard deviation `s` computed in `setups`"
    def encodes(self, o):
        centred = o - self.m
        return centred / self.s

    def decodes(self, o):
        rescaled = o * self.s
        return rescaled + self.m

    def setups(self, items):
        # The statistics are kept as tensors (no `.item()` call)
        vals = tensor(items).float()
        self.m, self.s = vals.mean(), vals.std()

In [None]:
items = [1,2,3,4]
nrm = Norm()
dsets = Datasets(items, [[neg_tfm,int2f_tfm], [neg_tfm,nrm]])
x,y = zip(*dsets)

In [None]:
x,y

((-1, -2, -3, -4),
 (tensor(1.1619), tensor(0.3873), tensor(-0.3873), tensor(-1.1619)))

In [None]:
type(x[0]) # correct type b/c of int2f_tfm

fastai2.torch_core.TitledInt

In [None]:
nrm.m,nrm.s

(tensor(-2.5000), tensor(1.2910))

In [None]:
dsets[1] # can be also indexed into, normally

(-2, tensor(0.3873))

In [None]:
x,y = zip(*dsets)
test_close(tensor(y).mean(), 0)
test_close(tensor(y).std(), 1)
test_eq(x, (-1,-2,-3,-4,))
test_eq(nrm.m, -2.5)
test_stdout(lambda:show_at(dsets, 1), '-2')

test_eq(dsets.m, nrm.m)
test_eq(dsets.norm.m, nrm.m)
test_eq(dsets.train.norm.m, nrm.m)

### tfms' split_idx when using Datasets

In [None]:
#hide
#Check filtering is properly applied
class B(Transform):
    # Adds 1 on encode and subtracts it again on decode (returned as a `TitledInt`).
    # NOTE(review): the `->None` return annotation presumably tells `Transform` not to cast the result
    # back to the input type -- confirm against the fastcore docs.
    def encodes(self, x)->None:  return int(x+1)
    def decodes(self, x):        return TitledInt(x-1)
    
add1 = B(split_idx=1) # different split_idx ( = 1 for val set only)

dsets = Datasets(items, [neg_tfm, [neg_tfm,int2f_tfm,add1]], splits=[[3],[0,1,2]])

In [None]:
dsets # add1 tfm is not applied when calling the entire dsets due to different split_idx

(#4) [(-1, -1),(-2, -2),(-3, -3),(-4, -4)]

In [None]:
dsets.train # add1 tfm not executed on train

(#1) [(-4, -4)]

In [None]:
dsets.valid # add1 tmf run on valid when use dsets.valid

(#3) [(-1, 0),(-2, -1),(-3, -2)]

In [None]:
test_fns = ['dog_0.jpg','cat_0.jpg','cat_2.jpg','cat_1.jpg','kid_1.jpg']
tcat = _Cat()
dsets = Datasets(test_fns, [[tcat,_lbl]], splits=[[0,1,2], [3,4]])
test_eq(tcat.vocab, ['cat','dog'])

In [None]:
print(dsets.train) # return tuple of 1 for each because there is only 1 pipeline [tcat_,_lbl]
print(dsets.valid[:-1]) 
# dict KeyError when trying to show the last value of dsets or the last value of dsets.valid 
# because vocab is built on train set which only has cat and dog. Valid set has new value 'kid'

(#3) [(1,),(0,),(0,)]
[(0,)]


In [None]:
test_stdout(lambda: show_at(dsets.train, 0), "dog")

In [None]:
inp = [0,1,2,3,4]
dsets = Datasets(inp, tfms=[None])

In [None]:
test_eq(*dsets[2], 2)          # Retrieve one item (subset 0 is the default)
test_eq(dsets[1,2], [(1,),(2,)])    # Retrieve two items by index
mask = [True,False,False,True,False]
test_eq(dsets[mask], [(0,),(3,)])   # Retrieve two items by mask

In [None]:
# understanding attrgetter: retrieve attribute with given name from an obj
class Temp:
    "Minimal class with a class attribute, an instance attribute and a method, used to try out `attrgetter`"
    b = 2  # class-level attribute, shared by all instances

    def __init__(self):
        self.a = 1  # per-instance attribute

    def temp_func(self):
        "Print the instance attribute `a`"
        print(self.a)
print(attrgetter('temp_func')(Temp()))
print(attrgetter('a')(Temp()))
print(attrgetter('b')(Temp()))

<bound method Temp.temp_func of <__main__.Temp object at 0x7f0bf212cc50>>
1
2


In [None]:
inp = pd.DataFrame(dict(a=[5,1,2,3,4]))
inp

Unnamed: 0,a
0,5
1,1
2,2
3,3
4,4


In [None]:
dsets = Datasets(inp, tfms=attrgetter('a')) # tfm here mean: get column a

In [None]:
dsets

(#5) [(5,),(1,),(2,),(3,),(4,)]

In [None]:
test_eq(dsets[1,2], [(1,),(2,)])    # Retrieve two items by index
mask = [True,False,False,True,False]
test_eq(dsets[mask], [(5,),(3,)])   # Retrieve two items by mask

### n_inp

`n_inp` is the number of elements in the tuples that should be considered part of the input and will default to 1 if `tfms` consists of one set of transforms, `len(tfms)-1` otherwise. In most cases, the number of elements in the tuples spit out by `Datasets` will be 2 (for input,target) but it can happen that there are 3 (Siamese networks or tabular data), in which case we need to be able to determine where the inputs end and the targets begin.

In [None]:
#test n_inp
inp = [0,1,2,3,4]
dsets = Datasets(inp, tfms=[None])
test_eq(dsets.n_inp, 1) # b/c only one set of tfms
dsets = Datasets(inp, tfms=[[None],[None],[None]])
test_eq(dsets.n_inp, 2) # 3 pipelines: first 2 for 2 inputs, last one for output (default)
dsets = Datasets(inp, tfms=[[None],[None],[None]], n_inp=1)
test_eq(dsets.n_inp, 1) # 3 pipelines, first 1 for input, last 2 for outputs (Siamese data)

### train/val splits for Datasets

In [None]:
# splits can be indices
dsets = Datasets(range(5), tfms=[None], splits=[tensor([0,2]), [1,3,4]])

test_eq(dsets.subset(0), [(0,),(2,)])
test_eq(dsets.train, [(0,),(2,)])       # Subset 0 is aliased to `train`
test_eq(dsets.subset(1), [(1,),(3,),(4,)])
test_eq(dsets.valid, [(1,),(3,),(4,)])     # Subset 1 is aliased to `valid`
test_eq(*dsets.valid[2], 4)
#assert '[(1,),(3,),(4,)]' in str(dsets) and '[(0,),(2,)]' in str(dsets)
dsets

(#5) [(0,),(1,),(2,),(3,),(4,)]

In [None]:
# splits can be boolean masks (they don't have to cover all items, BUT MUST BE DISJOINT)
splits = [[False,True,True,False,True], [True,False,False,False,False]]
dsets = Datasets(range(5), tfms=[None], splits=splits)

test_eq(dsets.train, [(1,),(2,),(4,)])
test_eq(dsets.valid, [(0,)])

In [None]:
# apply transforms to all items
tfm = [[lambda x: x*2,lambda x: x+1]] # double then add 1
splits = [[1,2],[0,3,4]]
dsets = Datasets(range(5), tfm, splits=splits)
test_eq(dsets.train,[(3,),(5,)])
test_eq(dsets.valid,[(1,),(7,),(9,)])
test_eq(dsets.train[False,True], [(5,)])

### Split_idx continue

In [None]:
# only transform val set aka subset 1 by setting split_idx = 1
class _Tfm(Transform):
    "Toy transform restricted to subset 1 (validation): double on encode, halve and wrap in `TitledStr` on decode"
    split_idx = 1

    def encodes(self, x):
        return x * 2

    def decodes(self, x):
        halved = x // 2
        return TitledStr(halved)

# or you can set split_idx this way: tfm = _Tfm(split_idx=1)

In [None]:
dsets = Datasets(range(5), [_Tfm()], splits=[[1,2],[0,3,4]])
test_eq(dsets.train,[(1,),(2,)]) # nothing happens
test_eq(dsets.valid,[(0,),(6,),(8,)]) # tfm is applied
test_eq(dsets.train[False,True], [(2,)])
dsets

(#5) [(0,),(1,),(2,),(3,),(4,)]

A context manager to change the split_idx and apply the validation transform on the training set **temporarily**

In [None]:
print(dsets.split_idx)
print(dsets.train.split_idx)
print(dsets.valid.split_idx)

None
0
1


In [None]:
ds = dsets.train
ds # nothing happen

(#2) [(1,),(2,)]

In [None]:
with ds.set_split_idx(1): # switch split_idx of train from 0 to 1 temporarily. 
    # TODO this doesn't work: dsets.train.set_split_idx(1)
    print(dsets.split_idx)
    print(ds.split_idx) # this was switched from 0 to 1
    print(dsets.valid.split_idx)
    test_eq(ds,[(2,),(4,)])
    
test_eq(dsets.train,[(1,),(2,)]) #back to normal

None
1
1


In [None]:
#hide
#Test Datasets pickles
dsrc1 = pickle.loads(pickle.dumps(dsets))
test_eq(dsets.train, dsrc1.train)
test_eq(dsets.valid, dsrc1.valid)

In [None]:
# only transform subset 1 (val) by setting split_idx = 1
# NOTE(review): identical to the `_Tfm` class defined a few cells above -- confirm whether this
# re-declaration is needed.
class _Tfm(Transform):
    # Class-level `split_idx`, so every instance is tied to subset 1 without passing it to the constructor
    split_idx=1
    def encodes(self, x): return x*2
    def decodes(self, x): return TitledStr(x//2)

In [None]:
dsets = Datasets(range(5), [_Tfm(),noop], splits=[[1,2],[0,3,4]]) # THis is actually 2 pipelines, equivalent to [[_Tfm()],[noop]]
print(dsets.train)
print(dsets.valid) # note that for val, only the first pipeline, _Tfm(), transforms the items to produce X val,
# so 0 3 4 are doubled. noop does nothing at all when producing y val, so 0 3 4 stay the same

(#2) [(1, 1),(2, 2)]
(#3) [(0, 0),(6, 3),(8, 4)]


In [None]:
start = torch.arange(0,50)
tds = Datasets(start, [A()]) # A() transformation is just adding 1
tdl = TfmdDL(tds, after_item=NegTfm(), bs=4) # transform each item in tuple (where as_item=False)
b = tdl.one_batch()
b # note that since tds Datasets only have 1 tfm => only X is produced, so it returns tuple of 1

(tensor([-1, -2, -3, -4]),)

In [None]:
tdl.decode_batch(b)

(#4) [(1,),(2,),(3,),(4,)]

### tfms split_idx on DataLoaders (datasets.dataloaders)

In [None]:
# only transform subset 1
class _Tfm(Transform):
    "Encode-only toy transform restricted to subset 1 (validation): doubles its input"
    split_idx = 1

    def encodes(self, x):
        return x * 2

dsets = Datasets(range(8), [None], splits=[[1,2,5,7],[0,3,4,6]])
dsets

(#8) [(0,),(1,),(2,),(3,),(4,),(5,),(6,),(7,)]

In [None]:
dls = dsets.dataloaders(bs=4, after_batch=_Tfm(), # tfm done after a batch is formed, normally done on GPU
                        shuffle_train=False, device=torch.device('cpu'))

test_eq(dls.train, [(tensor([1,2,5, 7]),)]) # 1 batch of train_dl: don't change
test_eq(dls.valid, [(tensor([0,6,8,12]),)]) # 1 batch of val_dl: _Tfm() is used
test_eq(dls.n_inp, 1)
# TODO: check if dls.train is a TfmdDL since isn't it dls.train.one_batch()???

## Add test set for inference