In [None]:
#default_exp data.core

In [None]:
#export
from fastai2.torch_basics import *
from fastai2.data.load import *

In [None]:
from nbdev.showdoc import *

# Data core

> Core functionality for gathering data

The classes here provide functionality for applying a list of transforms to a set of items (`TfmdList`, `DataSource`) or a `DataLoader` (`TfmdDL`), as well as the base class used to gather the data for model training: `DataBunch`.

## TfmdDL -

In [None]:
#export
@typedispatch
def show_batch(x, y, samples, ctxs=None, max_n=9, **kwargs):
    "Show at most `max_n` of `samples`, one field after the other, and return the resulting contexts"
    # `x` and `y` are not used in the body: they are only there so the call can be dispatched on their types.
    shown = Inf.nones if ctxs is None else ctxs
    for field in range_of(samples[0]):
        column = samples.itemgot(field)
        # Each pass feeds the contexts returned by the previous field back into `show`;
        # `range(max_n)` comes last in the `zip` and only caps how many items are shown.
        shown = [item.show(ctx=ctx, **kwargs) for item,ctx,_ in zip(column, shown, range(max_n))]
    return shown

- show_batch is a type-dispatched function that is responsible for showing decoded samples. 
- x and y are the input and the target in the batch to be shown, and are passed along to dispatch on their types. 
    - There is a different implementation of show_batch if x is a TensorImage or a TensorText for instance (see vision.core or text.data for more details). 
- ctxs can be passed but the function is responsible to create them if necessary. 
- kwargs depend on the specific implementation.

In [None]:
#export
@typedispatch
def show_results(x, y, samples, outs, ctxs=None, max_n=9, **kwargs):
    "Show every field of `samples`, then every field of `outs`, in the same contexts (at most `max_n` of them)"
    # `x` and `y` are not used in the body: they are only there so the call can be dispatched on their types.
    shown = Inf.nones if ctxs is None else ctxs
    # `samples` is processed entirely before `outs`, so `outs` is drawn into the contexts produced by `samples`.
    for group in (samples, outs):
        for field in range(len(group[0])):
            shown = [item.show(ctx=ctx, **kwargs) for item,ctx,_ in zip(group.itemgot(field), shown, range(max_n))]
    return shown

In [None]:
_all_ = ["show_batch", "show_results"]
_batch_tfms = ('after_item','before_batch','after_batch') # names of the callbacks that `TfmdDL` wraps in a transform `Pipeline`
# 'after_item': runs after grabbing a single item from the dataset; transforms each individual tuple coming out of the transform pipelines
# 'after_batch': runs after the tuples have been collated into a batch by the PyTorch dataloader, so it works on a whole batch at a time

# 'before_batch': runs after all the items have been gathered, right before they are collated into a batch
    # (the only one `TfmdDL` builds with `as_item=True`; the other two are built with `as_item=False`, i.e. applied as tuple transforms)


In [None]:
#export
@delegates() # Note: this decorator replaces `**kwargs` with the argument names of the parent class (`DataLoader`), which helps shift-tab docs and autocompletion
class TfmdDL(DataLoader):
    "Transformed `DataLoader`"
    def __init__(self, dataset, bs=16, shuffle=False, num_workers=None, **kwargs):
        if num_workers is None: num_workers = min(16, defaults.cpus)
        for nm in _batch_tfms:
            # Wrap each callback in a `Pipeline`; only 'before_batch' is built with `as_item=True`
            kwargs[nm] = Pipeline(kwargs.get(nm,None), as_item=(nm=='before_batch'))
        super().__init__(dataset, bs=bs, shuffle=shuffle, num_workers=num_workers, **kwargs)
        # Set the pipelines up against this loader once the parent constructor has run
        for nm in _batch_tfms: kwargs[nm].setup(self)

    def _one_pass(self):
        "Push item 0 through `do_item`, `do_batch` and `after_batch` to cache the device, input count and batch types"
        its = self.after_batch(self.do_batch([self.do_item(0)]))
        self._device = find_device(its)
        # A non-list/tuple batch or a 1-element batch has one input; otherwise all elements but the last are inputs
        self._n_inp = 1 if not isinstance(its, (list,tuple)) or len(its)==1 else len(its)-1
        # Shadow the `_retain_dl` method on this instance with a partial that restores the types recorded here
        self._retain_dl = partial(retain_types, typs=mapped(type,its))

    def _retain_dl(self,b):
        "Run `_one_pass` the first time a batch needs its types restored, then defer to the partial it installs"
        self._one_pass()
        # we just replaced ourselves, so this is *not* recursive! :)
        return self._retain_dl(b)

    def before_iter(self):
        super().before_iter()
        # Propagate the dataset's `split_idx` (None when it has none) to every batch-level pipeline
        split_idx = getattr(self.dataset, 'split_idx', None)
        for nm in _batch_tfms:
            f = getattr(self,nm)
            if isinstance(f,Pipeline): f.split_idx=split_idx

    # Decoding undoes the batch-level pipelines in reverse order: `after_batch` first, then `before_batch`
    def decode(self, b): return self.before_batch.decode(self.after_batch.decode(self._retain_dl(b)))
    def decode_batch(self, b, max_n=9, full=True): return self._decode_batch(self.decode(b), max_n, full)

    def _decode_batch(self, b, max_n=9, full=True):
        "Split an already batch-decoded `b` into at most `max_n` samples and decode each one"
        f = self.after_item.decode
        # `compose` chains `after_item.decode` with the dataset's own `decode` (or `noop` if it has none)
        f = compose(f, partial(getattr(self.dataset,'decode',noop), full = full))
        return L(batch_to_samples(b, max_n=max_n)).map(f)

    def _pre_show_batch(self, b, max_n=9):
        "Decode `b` to be ready for `show_batch`"
        b = self.decode(b)
        # A batch that has its own `show` is returned whole, with no `y` and no samples
        if hasattr(b, 'show'): return b,None,None
        its = self._decode_batch(b, max_n, full=False)
        if not is_listy(b): b,its = [b],L((o,) for o in its)
        # Split the batch into inputs and targets using `n_inp`
        return detuplify(b[:self.n_inp]),detuplify(b[self.n_inp:]),its

    def show_batch(self, b=None, max_n=9, ctxs=None, show=True, **kwargs):
        "Show `b` (defaults to `one_batch`), a list of lists of pipeline outputs (i.e. output of a `DataLoader`)"
        # 1. take the batch passed in, or grab one with `one_batch`
        # 2. decode that batch with `after_batch.decode` and `before_batch.decode` (done in `_pre_show_batch`)
        # 3. `ctxs` may be passed in; otherwise the dispatched `show_batch` creates them
        # 4. the dispatched `show_batch` calls `.show()` on the decoded samples
        # With `show=False` nothing is displayed and the `(x, y, samples)` triple is returned instead;
        # `show_results` relies on this. `show` must be a named parameter: it used to fall into `**kwargs`.
        if b is None: b = self.one_batch()
        if not show: return self._pre_show_batch(b, max_n=max_n)
        show_batch(*self._pre_show_batch(b, max_n=max_n), ctxs=ctxs, max_n=max_n, **kwargs)

    def show_results(self, b, out, max_n=9, ctxs=None, show=True, **kwargs):
        x,y,its = self.show_batch(b, max_n=max_n, show=False)
        # Build a second batch with the same inputs but `out` in place of the targets, and decode it the same way
        b_out = b[:self.n_inp] + (tuple(out) if is_listy(out) else (out,))
        x1,y1,outs = self.show_batch(b_out, max_n=max_n, show=False)
        #its == None means that a batch knows how to show itself as a whole, so we pass x, x1
        res = (x,x1,None,None) if its is None else (x, y, its, outs.itemgot(slice(self.n_inp,None)))
        if not show: return res
        show_results(*res, ctxs=ctxs, max_n=max_n, **kwargs)

    @property
    def device(self):
        # Computed lazily by `_one_pass` unless a truthy value was set through the setter
        if not getattr(self, '_device', None): self._one_pass()
        return self._device

    @device.setter
    def device(self, v): self._device = v

    @property
    def n_inp(self):
        # The dataset's own `n_inp` wins; otherwise fall back to the value cached by `_one_pass`
        if hasattr(self.dataset, 'n_inp'): return self.dataset.n_inp
        if not hasattr(self, '_n_inp'): self._one_pass()
        return self._n_inp

```python
def compose(*funcs, order=None):
    "Create a function that composes (do) all functions in `funcs`, passing along remaining `*args` and `**kwargs` to all"
    funcs = L(funcs)
    if order is not None: funcs = funcs.sorted(order)
    def _inner(x, *args, **kwargs):
        for f in L(funcs): x = f(x, *args, **kwargs)
        return x
    return _inner
```

```
TfmdDL(
    dataset,
    bs=16,
    shuffle=False,
    num_workers=None,
    drop_last=False,
    indexed=None,
    pin_memory=False,
    timeout=0,
    *,
    wif=None,
    before_iter=None,
    create_batches=None,
    sampler=None,
    create_item=None,
    after_item=None,
    before_batch=None,
    create_batch=None,
    retain=None,
    after_batch=None,
    after_iter=None,
    get_idxs=None,
)

DataLoader(
    dataset=None,
    bs=None,
    shuffle=False,
    drop_last=False,
    indexed=None,
    num_workers=0,
    pin_memory=False,
    timeout=0,
    *,
    wif=None,
    before_iter=None,
    create_batches=None,
    sampler=None,
    create_item=None,
    after_item=None,
    before_batch=None,
    create_batch=None,
    retain=None,
    after_batch=None,
    after_iter=None,
    get_idxs=None,
)
```

A `TfmdDL` is a `DataLoader` that creates `Pipeline` from a list of `Transform`s for the callbacks `after_item`, `before_batch` and `after_batch`. As a result, it can decode or show a processed `batch`.

In [None]:

# Attach a docstring to each of the `TfmdDL` methods named below
add_docs(TfmdDL,
         decode="Decode `b` using `tfms`",
         decode_batch="Decode `b` entirely",
         show_batch="Show `b` (defaults to `one_batch`), a list of lists of pipeline outputs (i.e. output of a `DataLoader`)",
         show_results="Show each item of `b` and `out`",
         before_iter="override")

In [None]:
# `int` subclass that also inherits from `ShowTitle`; it is not referenced anywhere else in the visible part of this notebook
class _Category(int, ShowTitle): pass

### Type reserve (part 2, continue from 2_05 notebook)

In [None]:
[(TensorImage([1]),)] * 4

[(tensor([1]),), (tensor([1]),), (tensor([1]),), (tensor([1]),)]

In [None]:
#Test retain type
class NegTfm(Transform):
    # Negation is its own inverse, so `decodes` undoes `encodes`
    def encodes(self, x): return torch.neg(x)
    def decodes(self, x): return torch.neg(x)
    
tdl = TfmdDL([(TensorImage([1]),)] * 4, after_batch=NegTfm(), bs=4, num_workers=4)
b = tdl.one_batch()
test_eq(type(b[0]), TensorImage)
# even though NegTfm.encodes returns a plain torch.Tensor (because of torch.neg) - its PyTorch type is the one given by .type() -
    # the Python type (the one given by type(<obj>)) is still retained (fastai TensorImage)
# Note: this only works if the output type is a parent class of the input type (TensorImage is a subclass of torch.Tensor)

#See more in cells below

In [None]:
temp = TensorImage([1])
print(type(temp),temp.type()) #input
print(type(b[0]), b[0].type()) #output

# so both the PyTorch type and the Python type of these two are the same,
# probably because there is no int-to-float tensor transformation. If there were one, the PyTorch type would change (see the 2_05_data_transforms notebook)

<class 'fastai2.torch_core.TensorImage'> torch.LongTensor
<class 'fastai2.torch_core.TensorImage'> torch.LongTensor


In [None]:
print(type(tdl.decode_batch(b)[0][0]), (tdl.decode_batch(b)[0][0]).type()) #decode the output
test_eq(type(tdl.decode_batch(b)[0][0]), TensorImage) # same as output

<class 'fastai2.torch_core.TensorImage'> torch.LongTensor


In [None]:
# Note: every transform pipeline of a TfmdDL checks, after the encode/decode of each transform, that the type is retained
b = (tensor([1.,1.,1.,1.]),)
print(type(b[0]),b[0].type()) #before decode
print(type(tdl.decode_batch(b)[0][0]),(tdl.decode_batch(b)[0][0]).type()) #after decode

# even when you decode a `b` that has a different Python type,
# decoding with tdl (a TfmdDL whose input is a TensorImage) converts b to TensorImage, hence 'type retained'

<class 'torch.Tensor'> torch.FloatTensor
<class 'fastai2.torch_core.TensorImage'> torch.FloatTensor


### Force no type reserve with ->None

In [None]:
class NegTfm(Transform):
    # Same negating transform as the earlier `NegTfm`, except that `encodes` is annotated `->None`
    def encodes(self, x)->None: return torch.neg(x) # `->None` means type consistency is not enforced: the result keeps the type `torch.neg` returns
    def decodes(self, x): return torch.neg(x)
    
tdl = TfmdDL([(TensorImage([1]),)] * 4, after_batch=NegTfm(), bs=4, num_workers=4)
b = tdl.one_batch()
# test_eq(type(b[0]), TensorImage) # failed
test_eq(type(b[0]), torch.Tensor)

In [None]:
temp = TensorImage([1])
print(type(temp),temp.type()) #input
print(type(b[0]), b[0].type()) #output. Don't reserve type. Type will be strictly depends on encode function

<class 'fastai2.torch_core.TensorImage'> torch.LongTensor
<class 'torch.Tensor'> torch.LongTensor


In [None]:
class A(Transform): 
    # Encoding is the identity; decoding wraps the value in `Int`
    def encodes(self, x): return x 
    def decodes(self, x): return Int(x) 

# Function turned into a `Transform`: duplicates its input into a `Tuple`
@Transform
def f(x)->None: return Tuple((x,x)) # `->None`: the input type is not enforced on the output, so the result stays a `Tuple`

In [None]:
start = torch.arange(50)

In [None]:
a = A()
tdl = TfmdDL(start, after_item=lambda x: (a(x), f(x)), bs=4) 
# input: 1 single item from a, torch.Tensor type (Look at _batch_tfms create_item below for more info)

# return two things: a(x) which is itself and f(x) which is Tuple type
x,y = tdl.one_batch()
test_eq(type(y), Tuple) # encode forward? type Tuple

In [None]:
type(start[0]), start[0].type() #input

(torch.Tensor, 'torch.LongTensor')

In [None]:
print(type(x), x.type()) #output x
print(type(y)) #output y. 
# Note that since f(x) doesn't reserve type, y normal type (Tuple) isn't converted to torch.Tensor type

<class 'torch.Tensor'> torch.LongTensor
<class 'fastcore.utils.Tuple'>


In [None]:
x,y,type(y)

(tensor([0, 1, 2, 3]),
 (tensor([0, 1, 2, 3]), tensor([0, 1, 2, 3])),
 fastcore.utils.Tuple)

In [None]:
s = tdl.decode_batch((x,y)) # since bs = 4, return L list of 4
s

(#4) [(tensor(0), (tensor(0), tensor(0))),(tensor(1), (tensor(1), tensor(1))),(tensor(2), (tensor(2), tensor(2))),(tensor(3), (tensor(3), tensor(3)))]

In [None]:
s[0]

(tensor(0), (tensor(0), tensor(0)))

In [None]:
print(type(s[0][0]),s[0][0].type()) #decode for x, which is result of A transform
# => with type reserve, both types are exactly the same with start

<class 'torch.Tensor'> torch.LongTensor


In [None]:
print(type(s[0][1])) # decode for y, which is f(x) transform with no type reserve. 
# => original normal type is kept

<class 'fastcore.utils.Tuple'>


In [None]:
tdl = TfmdDL(torch.arange(0,50), after_item=A(), after_batch=NegTfm(), bs=4)
test_eq(tdl.dataset[0], start[0])
test_eq(len(tdl), (50-1)//4+1)
test_eq(tdl.bs, 4)
test_stdout(tdl.show_batch, '0\n1\n2\n3')

## DataBunch -

In [None]:
??GetAttr

Why does `GetAttr` exist?
- 1st: because a plain `__getattr__` will delegate EVERYTHING

=> errors get hidden. For example, a typo such as `db.on_batch()` would definitely be passed on by a plain \__getattr__ (see below for how this is fixed)

- 2nd: no tab completion

In [None]:
# what's going on inside add_props
tempf1 = lambda i,x: x[i]
tempf2 = [partial(tempf1,i) for i in range(2)]
print(tempf2[0](['a','b']))
print(tempf2[1](['a','b']))

a
b


In [None]:
# export
@docs
class DataBunch(GetAttr): 
    # GetAttr: wrapper around `__getattr__`; `_default` names the attribute that missing lookups are delegated to
    "Basic wrapper around several `DataLoader`s."
    _default='train_dl' # delegated lookups (and their tab completion) go to `train_dl`
    _xtra = 'one_batch show_batch dataset'.split() # only these names are delegated and tab-completed
    # if no _xtra, then _xtra will be set as ALL attributes inside _default
    
    
    def __init__(self, *dls): self.dls = dls #note: you can pass as many dataloaders as you like
    def __getitem__(self, i): return self.dls[i]

    # add_props: creates one property per index from the given function
    # equivalent to this
    # @property def train_dl(self): return self[0]
    # @property def valid_dl(self): return self[1]
    train_dl,valid_dl = add_props(lambda i,x: x[i])
    train_ds,valid_ds = add_props(lambda i,x: x[i].dataset)
    
    # Docstrings for the members above, consumed by the `@docs` decorator
    _docs=dict(__getitem__="Retrieve `DataLoader` at `i` (`0` is training, `1` is validation)",
              train_dl="Training `DataLoader`",
              valid_dl="Validation `DataLoader`",
              train_ds="Training `Dataset`",
              valid_ds="Validation `Dataset`")

In [None]:
dbch = DataBunch(tdl,tdl)
x = dbch.train_dl.one_batch() # or dbch[0].one_batch()
x2 = next(iter(tdl))
test_eq(x,x2)
x2 = dbch.one_batch() # Note: quick tab completion, result of GetAttr. This calls one_batch on the 'default' dl: train_dl
test_eq(x,x2)

In [None]:
temp1 = dbch.dataset # or this dataset will be 'default' as dbch.train_ds
temp2 = dbch.train_ds
test_eq(temp1,temp2)

In [None]:
dbch.on_batch() # proper behavior on typo

AttributeError: on_batch

### Methods

In [None]:
show_doc(DataBunch.__getitem__)

<h4 id="DataBunch.__getitem__" class="doc_header"><code>DataBunch.__getitem__</code><a href="https://github.com/fastai/fastai_dev/tree/master/dev/__main__.py#L8" class="source_link" style="float:right">[source]</a></h4>

> <code>DataBunch.__getitem__</code>(**`i`**)

Retrieve [`DataLoader`](/dataloader.html#DataLoader) at `i` (`0` is training, `1` is validation)

In [None]:
x2 = dbch[0].one_batch()
test_eq(x,x2)

In [None]:
show_doc(DataBunch.train_dl, name="train_dl")

<h4 id="train_dl" class="doc_header"><code>train_dl</code><a href="https://github.com/fastai/fastai_dev/tree/master/dev/__main__.py#L10" class="source_link" style="float:right">[source]</a></h4>

Training [`DataLoader`](/dataloader.html#DataLoader)

In [None]:
show_doc(DataBunch.valid_dl, name="valid_dl")

<h4 id="valid_dl" class="doc_header"><code>valid_dl</code><a href="https://github.com/fastai/fastai_dev/tree/master/dev/__main__.py#L10" class="source_link" style="float:right">[source]</a></h4>

Validation [`DataLoader`](/dataloader.html#DataLoader)

In [None]:
show_doc(DataBunch.train_ds, name="train_ds")

<h4 id="train_ds" class="doc_header"><code>train_ds</code><a href="https://github.com/fastai/fastai_dev/tree/master/dev/__main__.py#L11" class="source_link" style="float:right">[source]</a></h4>

Training `Dataset`

In [None]:
show_doc(DataBunch.valid_ds, name="valid_ds")

<h4 id="valid_ds" class="doc_header"><code>valid_ds</code><a href="https://github.com/fastai/fastai_dev/tree/master/dev/__main__.py#L11" class="source_link" style="float:right">[source]</a></h4>

Validation `Dataset`

## TfmdList -

In [None]:
#export
class FilteredBase:
    "Base class for lists with subsets"
    _dl_type = TfmdDL
    def __init__(self, *args, **kwargs):
        # Re-wrap `databunch` on the instance so its `**kwargs` advertise the arguments of `_dl_type.__init__`
        self.databunch = delegates(self._dl_type.__init__)(self.databunch)
        super().__init__(*args, **kwargs)

    # Make sure new collections created through `_new` keep the same subset definitions
    def _new(self, items, **kwargs): return super()._new(items, filts=self.filts, **kwargs)
    def subset(self):
        "Abstract: the subclasses in this module override this with `subset(self, i)`"
        # `NotImplemented` is a comparison sentinel, not an exception: raising it gives a confusing `TypeError`
        raise NotImplementedError
    @property
    def n_subsets(self): return len(self.filts)

    def databunch(self, bs=16, val_bs=None, shuffle_train=True, **kwargs):
        "Create a `DataBunch` with one `_dl_type` loader per subset; extra `kwargs` go to every loader"
        n = self.n_subsets-1
        # Subset 0 uses `bs`; every other subset uses `val_bs`, or twice `bs` when `val_bs` is None
        bss = [bs] + [2*bs]*n if val_bs is None else [bs] + [val_bs]*n
        # Only subset 0 can be shuffled, and `drop_last` follows the shuffle flag
        shuffles = [shuffle_train] + [False]*n
        return DataBunch(*[self._dl_type(self.subset(i), bs=b, shuffle=s, drop_last=s, **kwargs)
                               for i,(b,s) in enumerate(zip(bss, shuffles))])

# `train` and `valid` are properties aliasing `subset(0)` and `subset(1)`
FilteredBase.train,FilteredBase.valid = add_props(lambda i,x: x.subset(i), 2)

In [None]:
#export
class TfmdList(FilteredBase, L):
    "A `Pipeline` of `tfms` applied to a collection of `items`"
    def __init__(self, items, tfms, use_list=None, do_setup=True, as_item=True, filt=None, train_setup=True, filts=None):
        super().__init__(items, use_list=use_list)
        # Without `filts` there is a single subset covering all items; each entry is passed through `mask2idxs`
        self.filts = L([slice(None)] if filts is None else filts).map(mask2idxs)
        # Another `TfmdList` contributes its pipeline
        if isinstance(tfms,TfmdList): tfms = tfms.tfms
        # Setup is skipped whenever `tfms` is already a `Pipeline`
        if isinstance(tfms,Pipeline): do_setup=False
        self.tfms = Pipeline(tfms, as_item=as_item, filt=filt)
        if do_setup: self.setup(train_setup=train_setup)

    # Copies share this list's pipeline and never re-run setup
    def _new(self, items, **kwargs): return super()._new(items, tfms=self.tfms, do_setup=False, **kwargs)
    # The subset index `i` doubles as the `filt` passed to the new list's pipeline
    def subset(self, i): return self._new(self._get(self.filts[i]), filt=i)
    def _after_item(self, o): return self.tfms(o)
    def __repr__(self): return f"{self.__class__.__name__}: {self.items}\ntfms - {self.tfms.fs}"
    # Iterate through `__getitem__` so that every item comes out transformed
    def __iter__(self): return (self[i] for i in range(len(self)))
    def show(self, o, **kwargs): return self.tfms.show(o, **kwargs)
    def decode(self, x, **kwargs): return self.tfms.decode(x, **kwargs)
    def __call__(self, x, **kwargs): return self.tfms.__call__(x, **kwargs)
    # With `train_setup` the pipeline is set up on `self.train` (or on `self` if there is no such attribute)
    def setup(self, train_setup=True): self.tfms.setup(getattr(self,'train',self) if train_setup else self)
    @property
    def default(self): return self.tfms

    def __getitem__(self, idx):
        res = super().__getitem__(idx)
        if self._after_item is None: return res
        # A single index yields one transformed item; anything else yields a collection mapped through the pipeline
        return self._after_item(res) if is_indexer(idx) else res.map(self._after_item)

In [None]:
# Attach docstrings to the public `TfmdList` methods (the closing backtick was missing in the two "From" entries)
add_docs(TfmdList,
         setup="Transform setup with self",
         decode="From `Pipeline`",
         show="From `Pipeline`",
         subset="New `TfmdList` that only includes subset `i`")

In [None]:
#exports
def decode_at(o, idx):
    "Fetch the item of `o` at `idx` and return it decoded by `o.decode`"
    item = o[idx]
    return o.decode(item)

In [None]:
#exports
def show_at(o, idx, **kwargs):
    "Show item at `idx`"
    # `o[idx]` is handed to `o.show`, with any extra keyword arguments forwarded unchanged
    return o.show(o[idx], **kwargs)

A `TfmdList` combines a collection of objects with a `Pipeline`. `tfms` can either be a `Pipeline` or a list of transforms, in which case it will be wrapped in a `Pipeline`. `use_list` is passed along to `L` with the `items`; `as_item` and `filt` are passed to each transform of the `Pipeline`. `do_setup` indicates if the `Pipeline.setup` method should be called during initialization.

In [None]:
class IntFloatTfm(Transform):
    # Encodes to `Int` and decodes to `Float`
    def encodes(self, x):  return Int(x)
    def decodes(self, x):  return Float(x)
    foo=1  # plain class attribute; NOTE(review): its purpose is not visible in this part of the notebook

int_tfm=IntFloatTfm()

# Negation serves as both the encoder and the decoder of `neg_tfm`
def neg(x): return -x
neg_tfm = Transform(neg, neg)

class B(Transform):
    # Adds one when encoding and removes it when decoding
    def encodes(self, x): return x+1
    def decodes(self, x): return x-1
add1 = B()
add1.filt = 1  # restrict `add1` to subset 1 (the validation set in the tests below)

In [None]:
tl = TfmdList([1.,2.,3.], [neg_tfm, int_tfm], filts=[[0,2],[1]])
t = tl[1]
test_eq_type(t, Int(-2))
test_eq(decode_at(tl, 1), 2)
test_eq_type(tl.decode(t), Float(2.0))
test_stdout(lambda: show_at(tl, 2), '-3')
tl

TfmdList: [1.0, 2.0, 3.0]
tfms - (#2) [Transform: True {'object': 'neg'} {'object': 'neg'},IntFloatTfm: True {'object': 'encodes'} {'object': 'decodes'}]

In [None]:
tl = TfmdList([1.,2.,3.], [neg_tfm, int_tfm, add1], filts=[[0,2],[1]])
test_eq(tl[0], -1)
test_eq(tl[1], -2)
test_eq(tl.valid[0], -1) #add1 is only applied on the validation set

In [None]:
p2 = tl.subset(0)
test_eq(p2, [-1,-3])
test_eq(map(type, p2), (Int,Int))
test_eq(tl[tensor(1)], tl[1])

In [None]:
df = pd.DataFrame(dict(a=[1,2,3],b=[2,3,4]))
tl = TfmdList(df, lambda o: o.a, filts=[[0],[1,2]])
test_eq(tl[1,2], [2,3])
p2 = tl.subset(1)
test_eq(p2, [2,3])

In [None]:
class B(Transform):
    # Shifts values by `self.a`: starts at 2, then `setups` replaces it with the mean of the items
    def __init__(self):   self.a = 2
    def encodes(self, x): return x+self.a
    def decodes(self, x): return x-self.a
    def setups(self, items): self.a = tensor(items).float().mean().item()

tl1 = TfmdList([1,2,3,4], B())
test_eq(tl1.tfms[0].a, 2.5)

In [None]:
tfilts = [tensor([0,2]), [1,3,4]]

In [None]:
tl = TfmdList(range(5), tfms=[None], filts=tfilts)
test_eq(len(tl.filts), 2)
test_eq(tl.subset(0), [0,2])
test_eq(tl.train, [0,2])       # Subset 0 is aliased to `train`
test_eq(tl.subset(1), [1,3,4])
test_eq(tl.valid, [1,3,4])     # Subset 1 is aliased to `valid`
test_eq(tl.valid[2], 4)

Here's how we can use `TfmdList.setup` to implement a simple category list, getting labels from a mock file list:

In [None]:
class _Cat(Transform):
    # Maps labels to integer codes through a vocab built at setup time
    order = 1  # used to sort the transforms: `_Cat` ends up after `_lbl` even though it is listed first below
    def encodes(self, o):    return int(self.o2i[o])
    def decodes(self, o):    return Str(self.vocab[o])
    # `vocab` and `o2i` both come from `uniqueify` (called with `sort=True, bidir=True`)
    def setups(self, items): self.vocab,self.o2i = uniqueify(L(items), sort=True, bidir=True)

def _lbl(o):  return Str(o.split('_')[0])
test_fns = ['dog_0.jpg','cat_0.jpg','cat_2.jpg','cat_1.jpg','dog_1.jpg']
tcat = _Cat()
# Check that tfms are sorted by `order`
tl = TfmdList(test_fns, [tcat,_lbl])

exp_voc = ['cat','dog']
test_eq(tcat.vocab, exp_voc)
test_eq(tl.tfms.vocab, exp_voc)
test_eq(tl.vocab, exp_voc)

In [None]:
test_eq(tl, (1,0,0,0,1))
t = L(tl)
test_eq(t, [1,0,0,0,1])
test_eq(tl[-1], 1)
test_eq(tl[0,1], (1,0))
test_eq([tl.decode(o) for o in t], ('dog','cat','cat','cat','dog'))
test_stdout(lambda:show_at(tl, 0), "dog")
tl

TfmdList: ['dog_0.jpg', 'cat_0.jpg', 'cat_2.jpg', 'cat_1.jpg', 'dog_1.jpg']
tfms - (#2) [Transform: True {'object': '_lbl'} {},_Cat: True {'object': 'encodes'} {'object': 'decodes'}]

In [None]:
test_fns = ['dog_0.jpg','cat_0.jpg','cat_2.jpg','cat_1.jpg','dog_1.jpg','kid_05.jpg']
tcat = _Cat()
tl = TfmdList(test_fns, [tcat,_lbl], filts=[[0,1,2,3,4], [5]])
#Check only the training set is taken into account for setup
test_eq(tcat.vocab, ['cat','dog'])

In [None]:
tfm = NegTfm(filt=1)
tds = TfmdList(start, A())
tdl = TfmdDL(tds, after_batch=tfm, bs=4)
x = tdl.one_batch()
test_eq(x, torch.arange(4))
tds.filt = 1
x = tdl.one_batch()
test_eq(x, -torch.arange(4))
tds.filt = 0
x = tdl.one_batch()
test_eq(x, torch.arange(4))

In [None]:
tds = TfmdList(start, A())
tdl = TfmdDL(tds, after_batch=NegTfm(), bs=4)
test_eq(tdl.dataset[0], start[0])
test_eq(len(tdl), (len(tds)-1)//4+1)
test_eq(tdl.bs, 4)
test_stdout(tdl.show_batch, '0\n1\n2\n3')

In [None]:
show_doc(TfmdList.subset)

<h4 id="TfmdList.subset" class="doc_header"><code>TfmdList.subset</code><a href="https://github.com/fastai/fastai_dev/tree/master/dev/__main__.py#L13" class="source_link" style="float:right">[source]</a></h4>

> <code>TfmdList.subset</code>(**`i`**)

New [`TfmdList`](/data.core.html#TfmdList) that only includes subset `i`

## DataSource -

In [None]:
#export
@docs
@delegates(TfmdList)
class DataSource(FilteredBase):
    "A dataset that creates a tuple from each `tfms`, passed thru `ds_tfms`"
    def __init__(self, items=None, tfms=None, tls=None, **kwargs):
        # Use the given `tls` when truthy; otherwise build one `TfmdList` over `items` per entry of `tfms`
        # (a single `None` pipeline when `tfms` is None). `kwargs` are forwarded to each `TfmdList`.
        # NOTE(review): `FilteredBase.__init__` is not called here, so `databunch` is not re-wrapped with `delegates` - confirm this is intended
        self.tls = L(tls if tls else [TfmdList(items, t, **kwargs) for t in L(ifnone(tfms,[None]))])

    def __getitem__(self, it):
        res = tuple([tl[it] for tl in self.tls])
        # A single index gives one tuple (one element per `TfmdList`); otherwise transpose into a list of tuples
        return res if is_indexer(it) else list(zip(*res))
    
    # Attributes missing on the `DataSource` are looked up through `gather_attrs` on `self.tls`
    def __getattr__(self,k): return gather_attrs(self, k, 'tls')
    def __len__(self): return len(self.tls[0])
    def __iter__(self): return (self[i] for i in range(len(self)))
    def __repr__(self): return coll_repr(self)
    # Each element of `o` is decoded by the `TfmdList` at the same position
    def decode(self, o): return tuple(tl.decode(o_) for o_,tl in zip(o,self.tls))
    def subset(self, i): return type(self)(tls=L(tl.subset(i) for tl in self.tls))
    # NOTE(review): `*args` is accepted but not forwarded, and `self.tfms` is not set by `__init__`
    # (presumably resolved through `__getattr__`) - verify this method is still used
    def _new(self, items, *args, **kwargs): return super()._new(items, tfms=self.tfms, do_setup=False, **kwargs)
    # `filts` and `filt` are read from the first `TfmdList` only
    @property
    def filts(self): return self.tls[0].filts
    @property
    def filt(self): return self.tls[0].tfms.filt
    
    def show(self, o, ctx=None, **kwargs):
        # Thread the same context through every `TfmdList`, so all elements of `o` are shown together
        for o_,tl in zip(o,self.tls): ctx = tl.show(o_, ctx=ctx, **kwargs)
        return ctx

    # Docstrings consumed by the `@docs` decorator
    _docs=dict(
        decode="Compose `decode` of all `tuple_tfms` then all `tfms` on `i`",
        show="Show item `o` in `ctx`",
        databunch="Get a `DataBunch`",
        subset="New `DataSource` that only includes subset `i`")

A `DataSource` creates a tuple from `items` (typically input,target) by applying each list of `Transform` (or `Pipeline`) in `tfms` to them. Note that if `tfms` contains only one list of `tfms`, the items given by `DataSource` will be tuples of one element.

In [None]:
items = [1,2,3,4]
dsrc = DataSource(items, [[neg_tfm,int_tfm]])
test_eq(dsrc[0], (-1,))
test_eq(dsrc[0,1,2], [(-1,),(-2,),(-3,)])

In [None]:
class Norm(Transform):
    # Standardizes with the mean `m` and standard deviation `s` computed in `setups`; `decodes` inverts `encodes`
    def encodes(self, o): return (o-self.m)/self.s
    def decodes(self, o): return (o*self.s)+self.m
    def setups(self, items):
        # Statistics are taken over the items received at setup, converted to a float tensor
        its = tensor(items).float()
        self.m,self.s = its.mean(),its.std()

In [None]:
items = [1,2,3,4]
nrm = Norm()
dsrc = DataSource(items, [[neg_tfm,int_tfm], [neg_tfm,nrm]])

x,y = zip(*dsrc)
test_close(tensor(y).mean(), 0)
test_close(tensor(y).std(), 1)
test_eq(x, (-1,-2,-3,-4,))
test_eq(nrm.m, -2.5)
test_stdout(lambda:show_at(dsrc, 1), '-2')

test_eq(dsrc.m, nrm.m)
test_eq(dsrc.norm.m, nrm.m)
test_eq(dsrc.train.norm.m, nrm.m)

In [None]:
#hide
#Check filtering is properly applied
class B(Transform):
    def encodes(self, x)->None:  return int(x+1)
    def decodes(self, x):        return Int(x-1)
add1 = B(filt=1)

dsrc = DataSource(items, [neg_tfm, [neg_tfm,int_tfm,add1]], filts=[[3],[0,1,2]])
test_eq(dsrc[1], [-2,-2])
test_eq(dsrc.valid[1], [-2,-1])
test_eq(dsrc.valid[[1,1]], [[-2,-1], [-2,-1]])
test_eq(dsrc.train[0], [-4,-4])

In [None]:
#hide
#Test setup works with train attribute
def _lbl(o): return o.split('_')[0]

test_fns = ['dog_0.jpg','cat_0.jpg','cat_2.jpg','cat_1.jpg','kid_1.jpg']
tcat = _Cat()
dsrc = DataSource(test_fns, [[tcat,_lbl]], filts=[[0,1,2], [3,4]])
test_eq(tcat.vocab, ['cat','dog'])
test_eq(dsrc.train, [(1,),(0,),(0,)])
test_eq(dsrc.valid[0], (0,))
test_stdout(lambda: show_at(dsrc.train, 0), "dog")

In [None]:
inp = [0,1,2,3,4]
dsrc = DataSource(inp, tfms=[None])

test_eq(*dsrc[2], 2)          # Retrieve one item (subset 0 is the default)
test_eq(dsrc[1,2], [(1,),(2,)])    # Retrieve two items by index
mask = [True,False,False,True,False]
test_eq(dsrc[mask], [(0,),(3,)])   # Retrieve two items by mask

In [None]:
inp = pd.DataFrame(dict(a=[5,1,2,3,4]))
dsrc = DataSource(inp, tfms=attrgetter('a')).subset(0)
test_eq(*dsrc[2], 2)          # Retrieve one item (subset 0 is the default)
test_eq(dsrc[1,2], [(1,),(2,)])    # Retrieve two items by index
mask = [True,False,False,True,False]
test_eq(dsrc[mask], [(5,),(3,)])   # Retrieve two items by mask

In [None]:
# filts can be indices
dsrc = DataSource(range(5), tfms=[None], filts=[tensor([0,2]), [1,3,4]])

test_eq(dsrc.subset(0), [(0,),(2,)])
test_eq(dsrc.train, [(0,),(2,)])       # Subset 0 is aliased to `train`
test_eq(dsrc.subset(1), [(1,),(3,),(4,)])
test_eq(dsrc.valid, [(1,),(3,),(4,)])     # Subset 1 is aliased to `valid`
test_eq(*dsrc.valid[2], 4)
#assert '[(1,),(3,),(4,)]' in str(dsrc) and '[(0,),(2,)]' in str(dsrc)
dsrc

(#5) [(0,),(1,),(2,),(3,),(4,)]

In [None]:
# filts can be boolean masks (they don't have to cover all items, but must be disjoint)
filts = [[False,True,True,False,True], [True,False,False,False,False]]
dsrc = DataSource(range(5), tfms=[None], filts=filts)

test_eq(dsrc.train, [(1,),(2,),(4,)])
test_eq(dsrc.valid, [(0,)])

In [None]:
# apply transforms to all items
tfm = [[lambda x: x*2,lambda x: x+1]]
filts = [[1,2],[0,3,4]]
dsrc = DataSource(range(5), tfm, filts=filts)
test_eq(dsrc.train,[(3,),(5,)])
test_eq(dsrc.valid,[(1,),(7,),(9,)])
test_eq(dsrc.train[False,True], [(5,)])

In [None]:
# only transform subset 1
class _Tfm(Transform):
    filt=1
    def encodes(self, x): return x*2
    def decodes(self, x): return Str(x//2)

In [None]:
dsrc = DataSource(range(5), [_Tfm()], filts=[[1,2],[0,3,4]])
test_eq(dsrc.train,[(1,),(2,)])
test_eq(dsrc.valid,[(0,),(6,),(8,)])
test_eq(dsrc.train[False,True], [(2,)])
dsrc

(#5) [(0,),(1,),(2,),(3,),(4,)]

In [None]:
#hide
#Test DataSource pickles
dsrc1 = pickle.loads(pickle.dumps(dsrc))
test_eq(dsrc.train, dsrc1.train)
test_eq(dsrc.valid, dsrc1.valid)

In [None]:
dsrc = DataSource(range(5), [_Tfm(),noop], filts=[[1,2],[0,3,4]])
test_eq(dsrc.train,[(1,1),(2,2)])
test_eq(dsrc.valid,[(0,0),(6,3),(8,4)])

In [None]:
start = torch.arange(0,50)
tds = DataSource(start, [A()])
tdl = TfmdDL(tds, after_item=NegTfm(), bs=4)
b = tdl.one_batch()
test_eq(tdl.decode_batch(b), ((0,),(1,),(2,),(3,)))
test_stdout(tdl.show_batch, "0\n1\n2\n3")

In [None]:
# only transform subset 1
class _Tfm(Transform):
    filt=1
    def encodes(self, x): return x*2

dsrc = DataSource(range(8), [None], filts=[[1,2,5,7],[0,3,4,6]])
dbch = dsrc.databunch(bs=4, after_batch=_Tfm(), shuffle_train=False)
test_eq(dbch.train_dl, [(tensor([1,2,5, 7]),)])
test_eq(dbch.valid_dl, [(tensor([0,6,8,12]),)])

### Methods

In [None]:
items = [1,2,3,4]
dsrc = DataSource(items, [[neg_tfm,int_tfm]])

In [None]:
show_doc(DataSource.decode)

<h4 id="DataSource.decode" class="doc_header"><code>DataSource.decode</code><a href="https://github.com/fastai/fastai_dev/tree/master/dev/__main__.py#L17" class="source_link" style="float:right">[source]</a></h4>

> <code>DataSource.decode</code>(**`o`**)

Compose `decode` of all `tuple_tfms` then all `tfms` on `i`

In [None]:
test_eq(*dsrc[0], -1)
test_eq(*dsrc.decode((-1,)), 1)

In [None]:
show_doc(DataSource.show)

<h4 id="DataSource.show" class="doc_header"><code>DataSource.show</code><a href="https://github.com/fastai/fastai_dev/tree/master/dev/__main__.py#L25" class="source_link" style="float:right">[source]</a></h4>

> <code>DataSource.show</code>(**`o`**, **`ctx`**=*`None`*, **\*\*`kwargs`**)

Show item `o` in `ctx`

In [None]:
test_stdout(lambda:dsrc.show(dsrc[1]), '-2')

## Add test set for inference

In [None]:
# `_Tfm1` only transforms subset 0 (`_Tfm`, defined above, only transforms subset 1)
class _Tfm1(Transform):
    filt=0
    def encodes(self, x): return x*3

dsrc = DataSource(range(8), [[_Tfm(),_Tfm1()]], filts=[[1,2,5,7],[0,3,4,6]])
test_eq(dsrc.train, [(3,),(6,),(15,),(21,)])
test_eq(dsrc.valid, [(0,),(6,),(8,),(12,)])

In [None]:
#export
def test_set(dsrc, test_items):
    "Build a `DataSource` over `test_items` that applies the validation transforms of `dsrc`"
    # Only the first `TfmdList` of `dsrc` is reused; `filt=1` selects the validation behaviour of its pipeline
    first_tl = dsrc.tls[0]
    tst_tl = first_tl._new(test_items, filt=1)
    return DataSource(tls=[tst_tl])

In [None]:
class _Tfm1(Transform):
    filt=0
    def encodes(self, x): return x*3

dsrc = DataSource(range(8), [[_Tfm(),_Tfm1()]], filts=[[1,2,5,7],[0,3,4,6]])
test_eq(dsrc.train, [(3,),(6,),(15,),(21,)])
test_eq(dsrc.valid, [(0,),(6,),(8,),(12,)])

#Transforms of the validation set are applied
tst = test_set(dsrc, [1,2,3])
test_eq(tst, [(2,),(4,),(6,)])

In [None]:
#export
def test_dl(dbunch, test_items):
    "Build a test dataloader over `test_items` that applies the validation transforms of `dbunch`"
    valid_ds = dbunch.valid_ds
    # The test dataset reuses the validation transforms, and the loader is derived from the validation loader
    tst_ds = test_set(valid_ds, test_items)
    return dbunch.valid_dl.new(tst_ds)

In [None]:
dbunch = dsrc.databunch(bs=4)
tst_dl = test_dl(dbunch, [2,3,4,5])
test_eq(list(tst_dl), [(tensor([ 4,  6,  8, 10]),)])

## Export -