In [None]:
#default_exp tabular.core

In [None]:
# !pip install git+http://github.com/fastai/fastai2
# !pip install git+http://github.com/fastai/fastcore

In [None]:
#export
from fastai2.torch_basics import *
from fastai2.data.all import *

In [None]:
from nbdev.showdoc import *

In [None]:
#export
pd.set_option('mode.chained_assignment','raise')

# Tabular core

> Basic functions to preprocess tabular data before assembling it in a `DataLoaders`.

## Initial preprocessing

In [None]:
#export
def make_date(df, date_field):
    "Make sure `df[date_field]` is of the right date type."
    # `is_datetime64_any_dtype` accepts naive and tz-aware datetime columns and, unlike
    # `np.issubdtype`, does not raise `TypeError` on pandas extension dtypes (string, category, ...).
    if pd.api.types.is_datetime64_any_dtype(df[date_field]): return
    # Format inference is what `to_datetime` does by default; the explicit
    # `infer_datetime_format` flag is deprecated in pandas 2.x, so it is no longer passed.
    df[date_field] = pd.to_datetime(df[date_field])

In [None]:
df = pd.DataFrame({'date': ['2019-12-04', '2019-11-29', '2019-11-15', '2019-10-24']})
make_date(df, 'date')
test_eq(df['date'].dtype, np.dtype('datetime64[ns]'))

In [None]:
#export
def add_datepart(df, field_name, prefix=None, drop=True, time=False):
    "Helper function that adds columns relevant to a date in the column `field_name` of `df`."
    make_date(df, field_name)
    field = df[field_name]
    # Default prefix: the column name with a trailing 'date'/'Date' removed
    prefix = ifnone(prefix, re.sub('[Dd]ate$', '', field_name))
    attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear', 'Is_month_end', 'Is_month_start',
            'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start']
    if time: attr = attr + ['Hour', 'Minute', 'Second']
    # `Series.dt.week` is deprecated (and removed in later pandas releases): prefer the ISO calendar
    # week when available, cast to the dtype of the other integer date parts.
    if hasattr(field.dt, 'isocalendar'): week = field.dt.isocalendar().week.astype(field.dt.day.dtype)
    else: week = field.dt.week
    for n in attr: df[prefix + n] = week if n == 'Week' else getattr(field.dt, n.lower())
    # Seconds since the Unix epoch. Going through nanoseconds keeps the result independent of the
    # column's datetime unit.
    elapsed = field.values.astype('datetime64[ns]').astype(np.int64) // 10 ** 9
    missing = field.isna().values
    # NaT has no meaningful epoch value: report NaN for those rows (integers are kept when nothing is missing)
    df[prefix + 'Elapsed'] = np.where(missing, np.nan, elapsed) if missing.any() else elapsed
    if drop: df.drop(field_name, axis=1, inplace=True)
    return df

In [None]:
df = pd.DataFrame({'date': ['2019-12-04', '2019-11-29', '2019-11-15', '2019-10-24']})
df = add_datepart(df, 'date')
test_eq(df.columns, ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear', 'Is_month_end', 'Is_month_start', 
            'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start', 'Elapsed'])
df.head()

Unnamed: 0,Year,Month,Week,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,Elapsed
0,2019,12,49,4,2,338,False,False,False,False,False,False,1575417600
1,2019,11,48,29,4,333,False,False,False,False,False,False,1574985600
2,2019,11,46,15,4,319,False,False,False,False,False,False,1573776000
3,2019,10,43,24,3,297,False,False,False,False,False,False,1571875200


In [None]:
#export
def _get_elapsed(df,field_names, date_field, base_field, prefix):
    for f in field_names:
        day1 = np.timedelta64(1, 'D')
        last_date,last_base,res = np.datetime64(),None,[]
        for b,v,d in zip(df[base_field].values, df[f].values, df[date_field].values):
            if last_base is None or b != last_base:
                last_date,last_base = np.datetime64(),b
            if v: last_date = d
            res.append(((d-last_date).astype('timedelta64[D]') / day1))
        df[prefix + f] = res
    return df

In [None]:
#export
def add_elapsed_times(df, field_names, date_field, base_field):
    "Add in `df` for each event in `field_names` the elapsed time according to `date_field` grouped by `base_field`"
    field_names = list(L(field_names))
    # Make sure the event columns are bool and `date_field` is a date (both are changed in place on `df`)
    df[field_names] = df[field_names].astype('bool')
    make_date(df, date_field)

    work_df = df[field_names + [date_field, base_field]]
    # 'After' columns: walk each `base_field` group in ascending date order
    work_df = work_df.sort_values([base_field, date_field])
    work_df = _get_elapsed(work_df, field_names, date_field, base_field, 'After')
    # 'Before' columns: same computation with the dates in descending order
    work_df = work_df.sort_values([base_field, date_field], ascending=[True, False])
    work_df = _get_elapsed(work_df, field_names, date_field, base_field, 'Before')

    # Rows for which no event has been seen yet in their group come back as NaN: report 0 for those
    for a in ['After' + f for f in field_names] + ['Before' + f for f in field_names]:
        work_df[a] = work_df[a].fillna(0).astype(int)

    # Rolling sums of the event flags over a 7-row window per group, once per sort direction of the date index
    for a,s in zip([True, False], ['_bw', '_fw']):
        work_df = work_df.set_index(date_field)
        tmp = (work_df[[base_field] + field_names].sort_index(ascending=a)
                      .groupby(base_field).rolling(7, min_periods=1).sum())
        # Depending on the pandas version the grouping column may or may not be part of the result,
        # and `axis` must be passed by keyword (the positional form is rejected by pandas 2).
        if base_field in tmp: tmp.drop(base_field, axis=1, inplace=True)
        tmp.reset_index(inplace=True)
        work_df.reset_index(inplace=True)
        work_df = work_df.merge(tmp, 'left', [date_field, base_field], suffixes=['', s])
    work_df.drop(field_names, axis=1, inplace=True)
    return df.merge(work_df, 'left', [date_field, base_field])

In [None]:
df = pd.DataFrame({'date': ['2019-12-04', '2019-11-29', '2019-11-15', '2019-10-24'],
                   'event': [False, True, False, True], 'base': [1,1,2,2]})
df = add_elapsed_times(df, ['event'], 'date', 'base')
df

Unnamed: 0,date,event,base,Afterevent,Beforeevent,event_bw,event_fw
0,2019-12-04,False,1,5,0,1.0,0.0
1,2019-11-29,True,1,0,0,1.0,1.0
2,2019-11-15,False,2,22,0,1.0,0.0
3,2019-10-24,True,2,0,0,1.0,1.0


In [None]:
#export
def cont_cat_split(df, max_card=20, dep_var=None):
    "Helper function that returns column names of cont and cat variables from given `df`."
    # `dep_var` may be a single label or a list/set of labels; those columns are left out of both results
    skipped = dep_var if isinstance(dep_var, (list, set)) else [dep_var]
    cont_names, cat_names = [], []
    for label in df:
        if label in skipped: continue
        dtype = df[label].dtype
        # The dtype predicates cover every integer/float width (int32, float32, nullable ints, ...),
        # whereas `dtype == int` / `dtype == float` only match the platform default width.
        is_float = pd.api.types.is_float_dtype(dtype)
        is_big_int = pd.api.types.is_integer_dtype(dtype) and df[label].unique().shape[0] > max_card
        # Continuous: any float column, or an integer column with more than `max_card` distinct values
        if is_big_int or is_float: cont_names.append(label)
        else: cat_names.append(label)
    return cont_names, cat_names

## Tabular -

In [None]:
#export
class _TabIloc:
    "Get/set rows by iloc and cols by name"
    def __init__(self,to): self.to = to
    def __getitem__(self, idxs):
        df = self.to.items
        if isinstance(idxs,tuple):
            rows,cols = idxs
            cols = df.columns.isin(cols) if is_listy(cols) else df.columns.get_loc(cols)
        else: rows,cols = idxs,slice(None)
        return self.to.new(df.iloc[rows, cols])

```python
# This class is to define some basic things you should have in a collection, such as list, array or your own collection class
# Note that you can just inherit from List or Array, but we will use composition
# (Composition means that an object knows another object, and explicitly delegates some tasks to it. Pros: Can selectively delegate funcs, and relax coupling between objs)
class CollBase:
    "Base class for composing a list of `items`"
    def __init__(self, items): self.items = items
    def __len__(self): return len(self.items) # delegate task EXPLICITLY to len func
    def __getitem__(self, k): return self.items[k]
    def __setitem__(self, k, v): self.items[list(k) if isinstance(k,CollBase) else k] = v
    def __delitem__(self, i): del(self.items[i])
    def __repr__(self): return self.items.__repr__() # some more delegation
    def __iter__(self): return self.items.__iter__()
```

In [None]:
#export
class Tabular(CollBase, GetAttr, FilteredBase):
    "A `DataFrame` wrapper that knows which cols are cont/cat/y, and returns rows in `__getitem__`"
    _default,with_cont='procs',True
    def __init__(self, df, procs=None, cat_names=None, cont_names=None, y_names=None, y_block=None, splits=None,
                 do_setup=True, device=None, inplace=False, reduce_memory=True):
        if inplace and splits is not None and pd.options.mode.chained_assignment is not None:
            warn("Using inplace with splits will trigger a pandas error. Set `pd.options.mode.chained_assignment=None` to avoid it.")
        if not inplace: df = df.copy()
        # Reorder the rows so that the indices of `splits[0]` come first, followed by the other splits
        if splits is not None: df = df.iloc[sum(splits, [])]
        self.dataloaders = delegates(self._dl_type.__init__)(self.dataloaders)
        super().__init__(df) # `CollBase.__init__` stores the DataFrame as `self.items`

        self.y_names,self.device = L(y_names),device
        if y_block is None and self.y_names:
            # No block given: infer one from the targets -- categorical unless every y column is numeric
            ys = df[self.y_names]
            if len(ys.select_dtypes(include='number').columns)!=len(ys.columns): y_block = CategoryBlock()
            else: y_block = RegressionBlock()
        if y_block is not None and do_setup:
            if callable(y_block): y_block = y_block()
            # Append the type transforms of the target block to the procs
            procs = L(procs) + y_block.type_tfms

        # Store the cat/cont column names and wrap the procs in a `Pipeline`.
        # NOTE: the procs are applied to the whole DataFrame ahead of time (see `setup`/`process`),
        # not lazily per item.
        self.cat_names,self.cont_names,self.procs = L(cat_names),L(cont_names),Pipeline(procs)
        # Number of leading rows that make up the first subset (`subset(0)`)
        self.split = len(df) if splits is None else len(splits[0])
        if reduce_memory:
            if len(self.cat_names) > 0: self.reduce_cats()
            if len(self.cont_names) > 0: self.reduce_conts()
        if do_setup: self.setup()

    def new(self, df):
        "Wrap `df` in an object of the same type sharing procs, column names and device, without running setup"
        return type(self)(df, do_setup=False, reduce_memory=False, y_block=TransformBlock(),
                          **attrdict(self, 'procs','cat_names','cont_names','y_names', 'device'))

    # Subset 0 is the first `self.split` rows, any other index is the remaining rows
    def subset(self, i): return self.new(self.items[slice(0,self.split) if i==0 else slice(self.split,len(self))])
    # Replaces `self.items` by a copy of itself and returns `self` (not a new object)
    def copy(self): self.items = self.items.copy(); return self
    def decode(self): return self.procs.decode(self)
    # Decodes a single row by wrapping it in a one-row DataFrame
    def decode_row(self, row): return self.new(pd.DataFrame(row).T).decode().items.iloc[0]
    # NOTE(review): `self.train` is not defined in this class; presumably provided by `FilteredBase` -- confirm
    def reduce_cats(self): self.train[self.cat_names] = self.train[self.cat_names].astype('category')
    def reduce_conts(self): self[self.cont_names] = self[self.cont_names].astype(np.float32)
    # Displays the decoded first `max_n` rows; `kwargs` are accepted but not used
    def show(self, max_n=10, **kwargs): display_df(self.new(self.all_cols[:max_n]).decode().items)
    def setup(self): self.procs.setup(self) # the return value of the procs' setup is not used
    def process(self): self.procs(self)
    # The accessors below are turned into properties by the `properties(...)` call after the class
    def loc(self): return self.items.loc
    def iloc(self): return _TabIloc(self)
    def targ(self): return self.items[self.y_names]
    def x_names (self): return self.cat_names + self.cont_names
    def n_subsets(self): return 2
    def y(self): return self[self.y_names[0]]
    def new_empty(self): return self.new(pd.DataFrame({}, columns=self.items.columns))
    def to_device(self, d=None):
        "Record `d` as the target device and return `self`"
        self.device = d
        return self

    def all_col_names (self):
        "x names followed by the y names, or only the x names when a y column is missing from `self.items`"
        ys = [n for n in self.y_names if n in self.items.columns]
        return self.x_names + self.y_names if len(ys) == len(self.y_names) else self.x_names

properties(Tabular,'loc','iloc','targ','all_col_names','n_subsets','x_names','y')

In [None]:
#export
class TabularPandas(Tabular):
    "A `Tabular` object whose columns can be transformed in place"
    def transform(self, cols, f, all_col=True):
        "Apply `f` to the columns `cols` and write the result back; with `all_col=False`, names missing from the data are skipped"
        present = cols if all_col else [c for c in cols if c in self.items.columns]
        if len(present) == 0: return
        self[present] = self[present].transform(f)

In [None]:
#export
def _add_prop(cls, nm):
    @property
    def f(o): return o[list(getattr(o,nm+'_names'))]
    @f.setter
    def fset(o, v): o[getattr(o,nm+'_names')] = v
    setattr(cls, nm+'s', f)
    setattr(cls, nm+'s', fset)

_add_prop(Tabular, 'cat')
_add_prop(Tabular, 'cont')
_add_prop(Tabular, 'y')
_add_prop(Tabular, 'x')
_add_prop(Tabular, 'all_col')

In [None]:
df = pd.DataFrame({'a':[0,1,2,0,2], 'b':[0,0,0,0,1]})
to = TabularPandas(df, cat_names='a') # we tell Tabular obj to process only 1 column as cats
# b will be ignored by Tabular obj

In [None]:
to.items # the df itself

Unnamed: 0,a,b
0,0,0
1,1,0
2,2,0
3,0,0
4,2,1


In [None]:
# object need to be pickled, for inference purposes
t = pickle.loads(pickle.dumps(to))
test_eq(t.items,to.items)

In [None]:
to.cat_names,to.cont_names,to.x_names,to.y_names 
# column `b` and the y columns are absent because they were not passed to `TabularPandas.__init__`

((#1) ['a'], (#0) [], (#1) ['a'], (#0) [])

In [None]:
to.all_cols

Unnamed: 0,a
0,0
1,1
2,2
3,0
4,2


In [None]:
to[['a']]

Unnamed: 0,a
0,0
1,1
2,2
3,0
4,2


In [None]:
to[['a','b']] # remember that Tabular class inherits Collbase, and __getitem__ will be called
# which will called pandas df.__getitem__
# thus equivalent to to.items[['a','b']]

Unnamed: 0,a,b
0,0,0
1,1,0
2,2,0
3,0,0
4,2,1


In [None]:
test_eq(to.all_cols,to[['a']])

In [None]:
??InplaceTransform

In [None]:
#export
def _apply_cats (voc, add, c):
    if not is_categorical_dtype(c):
        return pd.Categorical(c, categories=voc[c.name][add:]).codes+add
    return c.cat.codes+add #if is_categorical_dtype(c) else c.map(voc[c.name].o2i)
def _decode_cats(voc, c): return c.map(dict(enumerate(voc[c.name].items)))

In [None]:
#export
class TabularProc(InplaceTransform):
    "Base class to write a non-lazy tabular processor for dataframes"
    # An inplace transform modifies what it is given and returns that same object.
    def setup(self, items=None, train_setup=False): #TODO: properly deal with train_setup
        # Fit on the training subset when `items` exposes one, otherwise on `items` itself.
        # NOTE(review): the parent `setup` presumably dispatches to the subclass' `setups` -- confirm.
        fit_on = getattr(items,'train',items)
        super().setup(fit_on, train_setup=False)
        # Tabular procs are not lazy: the data is encoded right after setup. A `Datasets` is
        # unwrapped to its `items` first.
        target = items.items if isinstance(items,Datasets) else items
        return self(target)

    # NOTE: this `setup` returns the result of calling the proc, whereas the `setups` methods
    # of the procs below return nothing.

Reminder of CategoryMap

```python
t = CategoryMap([4,2,3,4])
test_eq(t, [2,3,4])
test_eq(t.o2i, {2:0,3:1,4:2})
test_fail(lambda: t.o2i['unseen label'])
```

In [None]:
to.cat_names

(#1) ['a']

Reminder

```python
class TabularPandas(Tabular):
    # get a list of targeted (mentioned) cols or all cols, then perform in-place 
    def transform(self, cols, f, all_col=True): 
        if not all_col: cols = [c for c in cols if c in self.items.columns]
        if len(cols) > 0: self[cols] = self[cols].transform(f)
```

In [None]:
#export
class Categorify(TabularProc):
    "Transform the categorical variables to that type."
    order = 1
    def setups(self, to):
        # One `CategoryMap` per categorical column, keyed by column name.
        # `add_na` is always True here, since `n` is drawn from `to.cat_names` itself.
        vocabs = {}
        for n in to.cat_names:
            vocabs[n] = CategoryMap(to.iloc[:,n].items, add_na=(n in to.cat_names))
        self.classes = vocabs

    def encodes(self, to):
        # Replace every categorical column by integer codes built from the vocabs found in `setups`
        encode_col = partial(_apply_cats, self.classes, 1)
        to.transform(to.cat_names, encode_col)

    def decodes(self, to):
        # Map the integer codes back to the vocab entries
        decode_col = partial(_decode_cats, self.classes)
        to.transform(to.cat_names, decode_col)

    def __getitem__(self,k): return self.classes[k]

In [None]:
#export

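# Tabular-specific `setups`/`encodes`/`decodes` for the existing `Categorize` transform
# (presumably registered through type dispatch on the `to:Tabular` annotation -- TODO confirm)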
@Categorize
def setups(self, to:Tabular):
    "Build the target vocab from the first y column (training subset when available), then encode `to`"
    if len(to.y_names) > 0:
        fit_on = getattr(to, 'train', to)
        first_y = to.y_names[0]
        self.vocab = CategoryMap(fit_on.iloc[:,first_y].items)
        self.c = len(self.vocab)
    return self(to)

@Categorize
def encodes(self, to:Tabular):
    "Replace the y columns present in `to` by integer codes taken from `self.vocab`"
    y_vocabs = {n: self.vocab for n in to.y_names}
    encode_col = partial(_apply_cats, y_vocabs, 0)
    to.transform(to.y_names, encode_col, all_col=False)
    return to

@Categorize
def decodes(self, to:Tabular):
    "Map the integer codes of the y columns present in `to` back to the entries of `self.vocab`"
    y_vocabs = {n: self.vocab for n in to.y_names}
    decode_col = partial(_decode_cats, y_vocabs)
    to.transform(to.y_names, decode_col, all_col=False)
    return to

```python
df = pd.DataFrame({'a':[0,1,2,0,2]})
to = TabularPandas(df, procs = Categorify, cat_names='a')
```

Order in which the setup functions are called:

- `Tabular.setup` (returns nothing)
    - `TabularProc.setup` (inherited from `InplaceTransform`), which returns the processed items
        - `Categorify.setups` (`Categorify` inherits from `TabularProc`), which returns nothing
    - `TabularProc.setup` then calls the proc itself, which runs `Categorify.encodes`

In [None]:
df = pd.DataFrame({'a':[0,1,2,0,2]})
to = TabularPandas(df, procs = Categorify, cat_names='a')

In [None]:
cat = to.procs.categorify
cat

Categorify: (object,object) -> encodes (object,object) -> decodes

In [None]:
cat.classes

{'a': (#4) ['#na#',0,1,2]}

In [None]:
test_eq(cat['a'], ['#na#',0,1,2]) # cat['a'] == cat.classes['a']
test_eq(to['a'], [1,2,3,1,3]) # to.items which store df, but transform has already been done INPLACE
# so no longer [0,1,2,0,2]

In [None]:
to.items

Unnamed: 0,a
0,1
1,2
2,3
3,1
4,3


In [None]:
to.show() # show will decode back

Unnamed: 0,a
0,0
1,1
2,2
3,0
4,2


In [None]:
df1 = pd.DataFrame({'a':[1,0,3,-1,2]})
to1 = to.new(df1)
to1.process()
#Values that weren't in the training df are sent to 0 (na)
test_eq(to1['a'], [2,1,0,0,3])
to2 = cat.decode(to1)
test_eq(to2['a'], [1,0,'#na#','#na#',2])

In [None]:
#test with splits
cat = Categorify()
df = pd.DataFrame({'a':[0,1,2,3,2]})
to = TabularPandas(df, cat, 'a', splits=[[0,1,2],[3,4]])
test_eq(cat['a'], ['#na#',0,1,2])
test_eq(to['a'], [1,2,3,0,3])

In [None]:
df = pd.DataFrame({'a':pd.Categorical(['M','H','L','M'], categories=['H','M','L'], ordered=True)})
to = TabularPandas(df, Categorify, 'a')
cat = to.procs.categorify
test_eq(cat['a'], ['#na#','H','M','L'])
test_eq(to.items.a, [2,1,3,2])
to2 = cat.decode(to)
test_eq(to2['a'], ['M','H','L','M'])

In [None]:
#test with targets
cat = Categorify()
df = pd.DataFrame({'a':[0,1,2,3,2], 'b': ['a', 'b', 'a', 'b', 'b']})
to = TabularPandas(df, cat, 'a', splits=[[0,1,2],[3,4]], y_names='b')
test_eq(to.vocab, ['a', 'b'])
test_eq(to['b'], [0,1,0,1,1])
to2 = to.procs.decode(to)
test_eq(to2['b'], ['a', 'b', 'a', 'b', 'b'])

In [None]:
#test with targets and train
cat = Categorify()
df = pd.DataFrame({'a':[0,1,2,3,2], 'b': ['a', 'b', 'a', 'c', 'b']})
to = TabularPandas(df, cat, 'a', splits=[[0,1,2],[3,4]], y_names='b')
test_eq(to.vocab, ['a', 'b'])

In [None]:
#export
class NormalizeTab(TabularProc):
    "Normalize the continuous variables."
    order = 2
    def setups(self, dsets):
        # Per-column statistics; the small constant keeps the later division away from zero
        self.means = dsets.conts.mean()
        self.stds = dsets.conts.std(ddof=0)+1e-7

    def encodes(self, to):
        centered = to.conts-self.means
        to.conts = centered / self.stds

    def decodes(self, to):
        rescaled = to.conts*self.stds
        to.conts = rescaled + self.means

In [None]:
#export
@Normalize
def setups(self, to:Tabular):
    "Compute per-column mean and std of the continuous columns (training subset when available), then encode `to`"
    fit_on = getattr(to, 'train', to)
    self.means = fit_on.conts.mean()
    # The small constant keeps the later division away from zero
    self.stds = fit_on.conts.std(ddof=0)+1e-7
    return self(to)

@Normalize
def encodes(self, to:Tabular):
    "Standardise the continuous columns of `to` with the stored means and stds"
    centered = to.conts-self.means
    to.conts = centered / self.stds
    return to

@Normalize
def decodes(self, to:Tabular):
    "Undo the standardisation of the continuous columns of `to`"
    rescaled = to.conts*self.stds
    to.conts = rescaled + self.means
    return to

In [None]:
norm = Normalize()
df = pd.DataFrame({'a':[0,1,2,3,4]})
to = TabularPandas(df, norm, cont_names='a')
x = np.array([0,1,2,3,4])
m,s = x.mean(),x.std()
test_eq(norm.means['a'], m)
test_close(norm.stds['a'], s)
test_close(to['a'].values, (x-m)/s)

In [None]:
df1 = pd.DataFrame({'a':[5,6,7]})
to1 = to.new(df1)
to1.process()
test_close(to1['a'].values, (np.array([5,6,7])-m)/s)
to2 = norm.decode(to1)
test_close(to2['a'].values, [5,6,7])

In [None]:
norm = Normalize()
df = pd.DataFrame({'a':[0,1,2,3,4]})
to = TabularPandas(df, norm, cont_names='a', splits=[[0,1,2],[3,4]])
x = np.array([0,1,2])
m,s = x.mean(),x.std()
test_eq(norm.means['a'], m)
test_close(norm.stds['a'], s)
test_close(to['a'].values, (np.array([0,1,2,3,4])-m)/s)

In [None]:
#export
class FillStrategy:
    "Namespace containing the various filling strategies."
    # Every strategy takes the column `c` and a user-supplied `fill` value and returns the value
    # used to replace missing entries. They are static so that they behave the same whether
    # reached through the class or through an instance.
    @staticmethod
    def median  (c,fill): return c.median()
    @staticmethod
    def constant(c,fill): return fill
    # Most frequent non-missing value of the column
    @staticmethod
    def mode    (c,fill): return c.dropna().value_counts().idxmax()

In [None]:
#export
class FillMissing(TabularProc):
    "Fill the missing values in continuous columns."
    def __init__(self, fill_strategy=FillStrategy.median, add_col=True, fill_vals=None):
        # Without explicit `fill_vals`, every column falls back to a fill value of 0
        if fill_vals is None: fill_vals = defaultdict(int)
        store_attr(self, 'fill_strategy,add_col,fill_vals')

    def setups(self, dsets):
        # Only continuous columns that actually contain missing values get an entry in `na_dict`
        missing = pd.isnull(dsets.conts).any()
        self.na_dict = {n:self.fill_strategy(dsets[n], self.fill_vals[n])
                        for n in missing[missing].keys()}

    def encodes(self, to):
        missing = pd.isnull(to.conts)
        has_na = missing.any()
        for n in has_na[has_na].keys():
            assert n in self.na_dict, f"nan values in `{n}` but not in setup training set"
        for n in self.na_dict.keys():
            # Assign the filled column back explicitly: an inplace `fillna` on `to[n]` works on a
            # temporary Series and does not reach the data under pandas Copy-on-Write.
            to[n] = to[n].fillna(self.na_dict[n])
            if self.add_col:
                # Keep track of which rows were missing in a new boolean categorical column
                to.loc[:,n+'_na'] = missing[n]
                if n+'_na' not in to.cat_names: to.cat_names.append(n+'_na')

In [None]:
show_doc(FillMissing, title_level=3)

<h3 id="FillMissing" class="doc_header"><code>class</code> <code>FillMissing</code><a href="" class="source_link" style="float:right">[source]</a></h3>

> <code>FillMissing</code>(**`fill_strategy`**=*`'median'`*, **`add_col`**=*`True`*, **`fill_vals`**=*`None`*) :: [`TabularProc`](/tabular.core#TabularProc)

Fill the missing values in continuous columns.

In [None]:
fill1,fill2,fill3 = (FillMissing(fill_strategy=s) 
                     for s in [FillStrategy.median, FillStrategy.constant, FillStrategy.mode])
df = pd.DataFrame({'a':[0,1,np.nan,1,2,3,4]})
df1 = df.copy(); df2 = df.copy()
tos = (TabularPandas(df, fill1, cont_names='a'),
       TabularPandas(df1, fill2, cont_names='a'),
       TabularPandas(df2, fill3, cont_names='a'))
test_eq(fill1.na_dict, {'a': 1.5})
test_eq(fill2.na_dict, {'a': 0})
test_eq(fill3.na_dict, {'a': 1.0})

for t in tos: test_eq(t.cat_names, ['a_na'])

for to_,v in zip(tos, [1.5, 0., 1.]):
    test_eq(to_['a'].values, np.array([0, 1, v, 1, 2, 3, 4]))
    test_eq(to_['a_na'].values, np.array([0, 0, 1, 0, 0, 0, 0]))

In [None]:
fill = FillMissing() 
df = pd.DataFrame({'a':[0,1,np.nan,1,2,3,4], 'b': [0,1,2,3,4,5,6]})
to = TabularPandas(df, fill, cont_names=['a', 'b'])
test_eq(fill.na_dict, {'a': 1.5})
test_eq(to.cat_names, ['a_na'])
test_eq(to['a'].values, np.array([0, 1, 1.5, 1, 2, 3, 4]))
test_eq(to['a_na'].values, np.array([0, 0, 1, 0, 0, 0, 0]))
test_eq(to['b'].values, np.array([0,1,2,3,4,5,6]))

## TabularPandas Pipelines -

In [None]:
procs = [Normalize, Categorify, FillMissing, noop]
df = pd.DataFrame({'a':[0,1,2,1,1,2,0], 'b':[0,1,np.nan,1,2,3,4]})
to = TabularPandas(df, procs, cat_names='a', cont_names='b')

#Test setup and apply on df_main
test_eq(to.cat_names, ['a', 'b_na'])
test_eq(to['a'], [1,2,3,2,2,3,1])
test_eq(to['b_na'], [1,1,2,1,1,1,1])
x = np.array([0,1,1.5,1,2,3,4])
m,s = x.mean(),x.std()
test_close(to['b'].values, (x-m)/s)
test_eq(to.classes, {'a': ['#na#',0,1,2], 'b_na': ['#na#',False,True]})

In [None]:
#Test apply on y_names
df = pd.DataFrame({'a':[0,1,2,1,1,2,0], 'b':[0,1,np.nan,1,2,3,4], 'c': ['b','a','b','a','a','b','a']})
to = TabularPandas(df, procs, 'a', 'b', y_names='c')

test_eq(to.cat_names, ['a', 'b_na'])
test_eq(to['a'], [1,2,3,2,2,3,1])
test_eq(to['b_na'], [1,1,2,1,1,1,1])
test_eq(to['c'], [1,0,1,0,0,1,0])
x = np.array([0,1,1.5,1,2,3,4])
m,s = x.mean(),x.std()
test_close(to['b'].values, (x-m)/s)
test_eq(to.classes, {'a': ['#na#',0,1,2], 'b_na': ['#na#',False,True]})
test_eq(to.vocab, ['a','b'])

In [None]:
df = pd.DataFrame({'a':[0,1,2,1,1,2,0], 'b':[0,1,np.nan,1,2,3,4], 'c': ['b','a','b','a','a','b','a']})
to = TabularPandas(df, procs, 'a', 'b', y_names='c')

test_eq(to.cat_names, ['a', 'b_na'])
test_eq(to['a'], [1,2,3,2,2,3,1])
test_eq(df.a.dtype,int)
test_eq(to['b_na'], [1,1,2,1,1,1,1])
test_eq(to['c'], [1,0,1,0,0,1,0])

In [None]:
df = pd.DataFrame({'a':[0,1,2,1,1,2,0], 'b':[0,np.nan,1,1,2,3,4], 'c': ['b','a','b','a','a','b','a']})
to = TabularPandas(df, procs, cat_names='a', cont_names='b', y_names='c', splits=[[0,1,4,6], [2,3,5]])

test_eq(to.cat_names, ['a', 'b_na'])
test_eq(to['a'], [1,2,2,1,0,2,0])
test_eq(df.a.dtype,int)
test_eq(to['b_na'], [1,2,1,1,1,1,1])
test_eq(to['c'], [1,0,0,0,1,0,1])

In [None]:
#export
def _maybe_expand(o): return o[:,None] if o.ndim==1 else o

In [None]:
#export
class ReadTabBatch(ItemTransform):
    "Turn a `Tabular` batch into a tuple of tensors, and decode such a tuple back into a `Tabular`"
    def __init__(self, to): self.to = to

    def encodes(self, to):
        # (cats,) or (cats, conts), depending on `with_cont`
        if not to.with_cont: res = (tensor(to.cats).long(),)
        else: res = (tensor(to.cats).long(),tensor(to.conts).float())
        # The targets are appended only when every y column is present in the data
        ys = [n for n in to.y_names if n in to.items.columns]
        if len(ys) == len(to.y_names): res = res + (tensor(to.targ),)
        if to.device is not None: res = to_device(res, to.device)
        return res

    def decodes(self, o):
        # Drop empty parts and make every part 2-d so that they can be concatenated column-wise
        o = [_maybe_expand(o_) for o_ in to_np(o) if o_.size != 0]
        vals = np.concatenate(o, axis=1)
        # A batch without targets has fewer columns than `all_col_names`: fall back to the x names.
        # Checking the width explicitly avoids hiding unrelated errors behind a bare `except`.
        cols = self.to.all_col_names
        if vals.shape[1] != len(cols): cols = self.to.x_names
        df = pd.DataFrame(vals, columns=cols)
        return self.to.new(df)

In [None]:
#export
@typedispatch
def show_batch(x: Tabular, y, its, max_n=10, ctxs=None):
    "Show a `Tabular` batch by calling `x.show()`; the remaining arguments are not used here"
    x.show()

In [None]:
from torch.utils.data.dataloader import _MultiProcessingDataLoaderIter,_SingleProcessDataLoaderIter,_DatasetKind
_loaders = (_MultiProcessingDataLoaderIter,_SingleProcessDataLoaderIter)

In [None]:
#export
@delegates()
class TabDataLoader(TfmdDL):
    "A `TfmdDL` for tabular data that builds its batches by slicing the dataset with `iloc`"
    do_item = noops
    def __init__(self, dataset, bs=16, shuffle=False, after_batch=None, num_workers=0, **kwargs):
        # Default batch pipeline: the `TransformBlock` batch transforms followed by `ReadTabBatch`
        batch_tfms = after_batch
        if batch_tfms is None: batch_tfms = L(TransformBlock().batch_tfms)+ReadTabBatch(dataset)
        super().__init__(dataset, bs=bs, shuffle=shuffle, after_batch=batch_tfms, num_workers=num_workers, **kwargs)

    # `b` holds the row positions of the batch; the whole batch is read with a single `iloc` call
    def create_batch(self, b): return self.dataset.iloc[b]

TabularPandas._dl_type = TabDataLoader

## Integration example

In [None]:
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')

In [None]:
df.shape

(32561, 15)

In [None]:
df_main,df_test = df.iloc[:10000].copy(),df.iloc[10000:].copy()

In [None]:
df_main.shape,df_test.shape

((10000, 15), (22561, 15))

In [None]:
df_test.drop('salary', axis=1, inplace=True)
df_main.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,49,Private,101320,Assoc-acdm,12.0,Married-civ-spouse,,Wife,White,Female,0,1902,40,United-States,>=50k
1,44,Private,236746,Masters,14.0,Divorced,Exec-managerial,Not-in-family,White,Male,10520,0,45,United-States,>=50k
2,38,Private,96185,HS-grad,,Divorced,,Unmarried,Black,Female,0,0,32,United-States,<50k
3,38,Self-emp-inc,112847,Prof-school,15.0,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,United-States,>=50k
4,42,Self-emp-not-inc,82297,7th-8th,,Married-civ-spouse,Other-service,Wife,Black,Female,0,0,50,United-States,<50k


In [None]:
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
splits = RandomSplitter()(range_of(df_main))

In [None]:
to = TabularPandas(df_main, procs, cat_names, cont_names, y_names="salary", splits=splits)

In [None]:
print(type(to)) # TODO: what is the differences between this and dls.train.dataset below???

<class '__main__.TabularPandas'>


In [None]:
dls = to.dataloaders()

In [None]:
print(type(dls.train.dataset)) # this is different from fastai Dataset

<class '__main__.TabularPandas'>


In [None]:
type(dls.train.items) # pandas df

pandas.core.frame.DataFrame

In [None]:
type(dls.train)

__main__.TabDataLoader

In [None]:
temp_dl = dls.train
temp_cat, temp_cont, temp_y = temp_dl.one_batch()

In [None]:
print(type(temp_cat),temp_cat.type())
print(type(temp_cont),temp_cont.type())
print(type(temp_y),temp_y.type()) # CharTensor hmmm

<class 'torch.Tensor'> torch.cuda.LongTensor
<class 'torch.Tensor'> torch.cuda.FloatTensor
<class 'torch.Tensor'> torch.cuda.CharTensor


In [None]:
temp_dl.after_iter, temp_dl.after_item # empty pipeline for after_item hmmm

(<bound method after_iter of <__main__.TabDataLoader object at 0x7ffa80de7e90>>,
 Pipeline: )

In [None]:
temp_dl.after_batch

Pipeline: ReadTabBatch

In [None]:
dls.valid.show_batch()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,education-num_na,age,fnlwgt,education-num,salary
0,Local-gov,Some-college,Never-married,Exec-managerial,Not-in-family,White,False,43.0,323627.001759,10.0,<50k
1,Private,HS-grad,Divorced,Adm-clerical,Not-in-family,White,False,51.0,241346.001596,9.0,<50k
2,Private,Some-college,Divorced,Sales,Unmarried,Black,False,38.0,224584.001244,10.0,<50k
3,Private,HS-grad,Never-married,#na#,Not-in-family,White,True,26.0,288591.999136,10.0,<50k
4,Private,5th-6th,Married-civ-spouse,Craft-repair,Husband,Other,False,53.0,195813.000085,3.0,>=50k
5,?,11th,Never-married,?,Own-child,White,True,17.0,303317.005308,10.0,<50k
6,Private,Bachelors,Divorced,Sales,Not-in-family,White,False,46.0,364548.000637,13.0,>=50k
7,Self-emp-not-inc,Some-college,Divorced,Other-service,Unmarried,White,False,41.0,154374.00032,10.0,<50k
8,Private,11th,Never-married,Sales,Own-child,White,False,19.000001,240685.998549,7.0,<50k
9,Private,12th,Never-married,Handlers-cleaners,Own-child,White,False,18.000001,128086.000352,8.0,<50k


In [None]:
to.show() # show head(10) of the dataframe (both train and val) (raw aka before tfms)

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,education-num_na,age,fnlwgt,education-num,salary
8924,Private,Masters,Married-civ-spouse,Prof-specialty,Husband,White,False,53.0,254285.0,14.0,>=50k
6094,Private,Some-college,Never-married,Adm-clerical,Not-in-family,White,False,37.0,187311.0,10.0,<50k
9951,Private,HS-grad,Divorced,Prof-specialty,Not-in-family,White,False,56.0,168625.0,9.0,<50k
1377,Private,12th,Divorced,Transport-moving,Unmarried,Black,False,44.0,139338.0,8.0,<50k
179,Private,Bachelors,Married-civ-spouse,Prof-specialty,Husband,White,False,37.0,119098.0,13.0,>=50k
1400,Private,Assoc-voc,Married-civ-spouse,Exec-managerial,Wife,White,False,26.0,282142.0,11.0,<50k
103,Private,Bachelors,Separated,#na#,Own-child,Other,True,50.0,171852.0,10.0,<50k
2244,?,11th,Never-married,?,Own-child,White,False,17.0,80077.0,7.0,<50k
9141,Private,HS-grad,Married-civ-spouse,Transport-moving,Husband,White,False,37.0,224886.0,9.0,<50k
4766,Private,HS-grad,Never-married,Other-service,Not-in-family,Black,False,32.0,206365.0,9.0,<50k


In [None]:
to.items # df of train and val (AFTER tfms)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary,education-num_na
8924,1.051302,4,0.575797,13,1.565461,3,11,1,5,Male,0,0,40,United-States,1,1
6094,-0.125433,4,-0.045186,16,-0.034448,5,2,2,5,Female,0,0,60,United-States,0,1
9951,1.271940,4,-0.218442,12,-0.434426,1,11,2,5,Female,4101,0,40,United-States,0,1
1377,0.389389,4,-0.489991,3,-0.834403,1,15,5,3,Male,0,0,40,United-States,0,1
179,-0.125433,4,-0.677657,10,1.165484,3,11,1,5,Male,0,0,50,United-States,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5510,-0.198979,4,-0.392682,12,-0.434426,3,5,6,5,Female,0,0,40,United-States,0,1
3565,0.462935,2,-0.960768,16,-0.034448,3,5,1,5,Male,7688,0,40,United-States,1,1
7895,-1.081529,7,0.771770,16,-0.034448,5,8,4,5,Female,0,0,40,United-States,0,1
632,0.021659,4,-1.505498,12,-0.434426,3,4,1,5,Male,0,0,40,United-States,0,1


In [None]:
row = to.items.iloc[0]
row

age                         1.0513
workclass                        4
fnlwgt                    0.575797
education                       13
education-num              1.56546
marital-status                   3
occupation                      11
relationship                     1
race                             5
sex                           Male
capital-gain                     0
capital-loss                     0
hours-per-week                  40
native-country       United-States
salary                           1
education-num_na                 1
Name: 8924, dtype: object

In [None]:
to.decode_row(row)

age                                  53
workclass                       Private
fnlwgt                           254285
education                       Masters
education-num                        14
marital-status       Married-civ-spouse
occupation               Prof-specialty
relationship                    Husband
race                              White
sex                                Male
capital-gain                          0
capital-loss                          0
hours-per-week                       40
native-country            United-States
salary                            >=50k
education-num_na                  False
Name: 8924, dtype: object

In [None]:
# get unprocessed test set and process it based on previous tabular obj's setup
to_tst = to.new(df_test)
to_tst.process()
to_tst.items.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,education-num_na
10000,0.462935,4,1.352977,10,1.165484,3,2,1,2,Male,0,0,40,Philippines,1
10001,-0.934438,4,1.26543,12,-0.434426,3,15,1,4,Male,0,0,40,United-States,1
10002,1.051302,4,0.156119,2,-1.23438,1,9,2,5,Female,0,0,37,United-States,1
10003,0.536481,4,-0.279591,12,-0.434426,7,2,5,5,Female,0,0,43,United-States,1
10004,0.757118,5,1.456128,9,0.365529,3,5,1,5,Male,0,0,60,United-States,1


In [None]:
tst_dl = dls.valid.new(to_tst) # TODO
tst_dl.show_batch()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,education-num_na,age,fnlwgt,education-num
0,Private,Bachelors,Married-civ-spouse,Adm-clerical,Husband,Asian-Pac-Islander,False,45.0,338105.00631,13.0
1,Private,HS-grad,Married-civ-spouse,Transport-moving,Husband,Other,False,26.0,328662.999829,9.0
2,Private,11th,Divorced,Other-service,Not-in-family,White,False,53.000001,209021.999284,7.0
3,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,False,46.0,162029.999909,9.0
4,Self-emp-inc,Assoc-voc,Married-civ-spouse,Exec-managerial,Husband,White,False,49.0,349229.995866,11.0
5,Local-gov,Some-college,Married-civ-spouse,Exec-managerial,Husband,White,False,34.0,124827.001629,10.0
6,Self-emp-inc,Some-college,Married-civ-spouse,Sales,Husband,White,False,53.000001,290639.998802,10.0
7,Private,Some-college,Never-married,Sales,Own-child,White,False,19.0,106272.998291,10.0
8,Private,Some-college,Married-civ-spouse,Protective-serv,Husband,Black,False,71.999999,53683.994218,10.0
9,Private,Some-college,Never-married,Sales,Own-child,White,False,20.0,505979.998058,10.0


## Other target types

### Multi-label categories

#### one-hot encoded label

In [None]:
def _mock_multi_label(df):
    """Turn `df` into a mock one-hot multi-label dataset, in place.

    Overwrites `salary` with a boolean flag (`salary == '>=50k'`) and adds the boolean
    columns `male` (`sex == ' Male'`) and `white` (`race == ' White'`).
    The leading space in ' Male' and ' White' is part of the compared value and is kept as is.

    Returns the same `df` object that was passed in.
    """
    # Vectorised comparisons replace a row-by-row `itertuples` loop; they always give a
    # bool column, including for an empty frame.
    # `.to_numpy()` keeps the assignment positional, so index alignment never applies.
    df['salary'] = (df['salary'] == '>=50k').to_numpy()
    df['male']   = (df['sex'] == ' Male').to_numpy()
    df['white']  = (df['race'] == ' White').to_numpy()
    return df

In [None]:
# Load the ADULT_SAMPLE csv, then split it by position: the first 10,000 rows become the
# working set and the remaining rows the test set. Both are copies, so the in-place column
# assignments made by `_mock_multi_label` act on a frame independent of `df`.
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
df_main,df_test = df.iloc[:10000].copy(),df.iloc[10000:].copy()
df_main = _mock_multi_label(df_main)

In [None]:
# Preview the frame with the mock boolean target columns.
df_main.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary,male,white
0,49,Private,101320,Assoc-acdm,12.0,Married-civ-spouse,,Wife,White,Female,0,1902,40,United-States,True,False,True
1,44,Private,236746,Masters,14.0,Divorced,Exec-managerial,Not-in-family,White,Male,10520,0,45,United-States,True,True,True
2,38,Private,96185,HS-grad,,Divorced,,Unmarried,Black,Female,0,0,32,United-States,False,False,False
3,38,Self-emp-inc,112847,Prof-school,15.0,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,United-States,True,True,False
4,42,Self-emp-not-inc,82297,7th-8th,,Married-civ-spouse,Other-service,Wife,Black,Female,0,0,50,United-States,False,False,False


In [None]:
#export
@EncodedMultiCategorize
def encodes(self, to:Tabular):
    "Pass `to` through unchanged; no encoding step is applied to the targets here."
    return to

@EncodedMultiCategorize
def decodes(self, to:Tabular):
    "Replace each column in `to.y_names` with the result of comparing its values to 1."
    to.transform(to.y_names, lambda col: col == 1)
    return to

In [None]:
# Column roles and preprocessing steps for the one-hot multi-label example.
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
# NOTE(review): no seed is passed to RandomSplitter, so the split presumably changes between runs — confirm.
splits = RandomSplitter()(range_of(df_main))
# The three boolean columns written by `_mock_multi_label` are used as targets.
y_names=["salary", "male", "white"]

In [None]:
# Build (and time) the tabular object; the y block is declared as already encoded, with the target column names as vocab.
%time to = TabularPandas(df_main, procs, cat_names, cont_names, y_names=y_names, y_block=MultiCategoryBlock(encoded=True, vocab=y_names), splits=splits)

CPU times: user 76.5 ms, sys: 1.74 ms, total: 78.2 ms
Wall time: 79.6 ms


In [None]:
# Create the DataLoaders and display a decoded validation batch, including the three target columns.
dls = to.dataloaders()
dls.valid.show_batch()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,education-num_na,age,fnlwgt,education-num,salary,male,white
0,State-gov,Some-college,Divorced,Other-service,Unmarried,Black,False,36.0,223020.001041,10.0,False,False,False
1,Private,HS-grad,Never-married,Machine-op-inspct,Unmarried,White,False,21.000001,227985.998943,9.0,False,False,True
2,Private,Bachelors,Married-civ-spouse,Adm-clerical,Husband,White,False,38.0,297449.001415,13.0,True,True,True
3,Private,Assoc-acdm,Married-civ-spouse,Craft-repair,Husband,White,False,32.0,185027.000104,12.0,True,True,True
4,Self-emp-not-inc,HS-grad,Never-married,Craft-repair,Not-in-family,White,False,29.0,241430.999848,9.0,False,True,True
5,Local-gov,Masters,Never-married,Prof-specialty,Not-in-family,White,False,45.0,33798.003413,14.0,False,False,True
6,Private,HS-grad,Married-civ-spouse,Farming-fishing,Husband,White,False,26.0,73689.005033,9.0,False,True,True
7,?,HS-grad,Married-civ-spouse,?,Wife,White,False,19.0,204868.00017,9.0,False,False,True
8,Federal-gov,Bachelors,Married-civ-spouse,Prof-specialty,Husband,White,False,43.0,369467.994072,13.0,True,True,True
9,Private,Assoc-voc,Never-married,Sales,Not-in-family,White,False,26.0,187576.999826,11.0,False,True,True


#### Not one-hot encoded

In [None]:
def _mock_multi_label(df):
    "Add a `target` column of space-separated labels ('>50k', 'male', 'white') to `df` in place and return `df`."
    def _labels_for(row):
        # Label order is fixed: salary, then sex, then race; rows matching nothing get ''.
        tags = []
        if row.salary == '>=50k': tags.append('>50k')
        if row.sex == ' Male':   tags.append('male')
        if row.race == ' White': tags.append('white')
        return ' '.join(tags)

    df['target'] = np.array([_labels_for(row) for row in df.itertuples()])
    return df

In [None]:
# Reload the ADULT_SAMPLE csv and redo the positional split (first 10,000 rows / the rest),
# then add the space-separated `target` label column to the working set.
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
df_main,df_test = df.iloc[:10000].copy(),df.iloc[10000:].copy()
df_main = _mock_multi_label(df_main)

In [None]:
# Preview the frame with the new `target` label column.
df_main.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary,target
0,49,Private,101320,Assoc-acdm,12.0,Married-civ-spouse,,Wife,White,Female,0,1902,40,United-States,>=50k,>50k white
1,44,Private,236746,Masters,14.0,Divorced,Exec-managerial,Not-in-family,White,Male,10520,0,45,United-States,>=50k,>50k male white
2,38,Private,96185,HS-grad,,Divorced,,Unmarried,Black,Female,0,0,32,United-States,<50k,
3,38,Self-emp-inc,112847,Prof-school,15.0,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,United-States,>=50k,>50k male
4,42,Self-emp-not-inc,82297,7th-8th,,Married-civ-spouse,Other-service,Wife,Black,Female,0,0,50,United-States,<50k,


In [None]:
# Tabular overloads for `MultiCategorize`. Both currently return `to` unchanged; the
# commented-out lines sketch a mapping through `_apply_cats` / `_decode_cats` that is not enabled.
@MultiCategorize
def encodes(self, to:Tabular): 
    #to.transform(to.y_names, partial(_apply_cats, {n: self.vocab for n in to.y_names}, 0))
    return to
  
@MultiCategorize
def decodes(self, to:Tabular): 
    #to.transform(to.y_names, partial(_decode_cats, {n: self.vocab for n in to.y_names}))
    return to

In [None]:
# Column roles and preprocessing steps for the label-string multi-label example.
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
# NOTE(review): no seed is passed to RandomSplitter, so the split presumably changes between runs — confirm.
splits = RandomSplitter()(range_of(df_main))

In [None]:
# Build (and time) the tabular object with the single `target` column as a multi-category y block.
%time to = TabularPandas(df_main, procs, cat_names, cont_names, y_names="target", y_block=MultiCategoryBlock(), splits=splits)

CPU times: user 71.4 ms, sys: 908 µs, total: 72.3 ms
Wall time: 76.9 ms


In [None]:
# Inspect the vocab held by the proc at index 2.
# NOTE(review): the recorded output lists single characters rather than the labels
# '>50k' / 'male' / 'white', so the multi-label vocab does not look correctly built yet — confirm.
to.procs[2].vocab

(#24) ['-','_','a','c','d','e','f','g','h','i'...]

### Regression

In [None]:
#export
@RegressionSetup
def setups(self, to:Tabular):
    "Set `self.c` to the number of entries in `to.y_names`, unless `self.c` already has a value."
    if self.c is None:
        self.c = len(to.y_names)
        return to
    # `c` was supplied beforehand: leave it alone (this path returns None)
    return None

@RegressionSetup
def encodes(self, to:Tabular):
    "Pass `to` through unchanged."
    return to

In [None]:
# Reload the ADULT_SAMPLE csv and redo the positional split (first 10,000 rows / the rest).
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
df_main,df_test = df.iloc[:10000].copy(),df.iloc[10000:].copy()
# NOTE(review): assuming cells run top to bottom, this is the most recent `_mock_multi_label`
# (the one adding `target`); the regression cells below use `age` as the target, so this call
# looks unnecessary — confirm.
df_main = _mock_multi_label(df_main)

In [None]:
# Column roles for the regression example; `age` is left out of `cont_names` because it is used as the target below.
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
# NOTE(review): no seed is passed to RandomSplitter, so the split presumably changes between runs — confirm.
splits = RandomSplitter()(range_of(df_main))

In [None]:
# Build (and time) the tabular object with `age` as the target; no `y_block` is passed here.
%time to = TabularPandas(df_main, procs, cat_names, cont_names, y_names='age', splits=splits)

CPU times: user 70.2 ms, sys: 1.47 ms, total: 71.7 ms
Wall time: 73.7 ms


In [None]:
# Inspect the means stored by the last proc; the recorded output covers only the two continuous columns, not the `age` target.
to.procs[-1].means

fnlwgt           192960.187250
education-num        10.077125
dtype: float64

In [None]:
# Create the DataLoaders and display a decoded validation batch, with `age` as the last column.
dls = to.dataloaders()
dls.valid.show_batch()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,education-num_na,fnlwgt,education-num,age
0,Private,Some-college,Divorced,#na#,Unmarried,White,True,70092.004456,10.0,41.0
1,Private,Some-college,Married-civ-spouse,Craft-repair,Husband,White,False,128143.001298,10.0,51.0
2,Private,Some-college,Divorced,Adm-clerical,Unmarried,White,False,201454.000364,10.0,38.0
3,Private,HS-grad,Never-married,Machine-op-inspct,Own-child,White,False,160300.000984,9.0,26.0
4,Private,Bachelors,Never-married,Sales,Not-in-family,White,False,176683.000798,13.0,29.0
5,Private,Some-college,Married-civ-spouse,Machine-op-inspct,Husband,White,False,247444.001407,10.0,30.0
6,Private,10th,Separated,Adm-clerical,Unmarried,White,False,150600.998845,6.0,38.0
7,Private,HS-grad,Divorced,Machine-op-inspct,Unmarried,White,False,265880.996902,9.0,36.0
8,Private,Assoc-voc,Divorced,#na#,Unmarried,White,True,142411.000408,10.0,53.0
9,Private,11th,Never-married,Other-service,Own-child,White,False,115550.999867,7.0,17.0


## Not being used now - for multi-modal

In [None]:
class TensorTabular(Tuple):
    "Tuple of tabular tensors that can build display contexts and show them as a dataframe."
    def get_ctxs(self, max_n=10, **kwargs):
        "Return one empty row per sample, capped at `max_n`; the sample count is the first dimension of `self[0]`."
        n_rows = min(self[0].shape[0], max_n)
        blank = pd.DataFrame(index=range(n_rows))
        return [blank.iloc[pos] for pos in range(n_rows)]

    def display(self, ctxs):
        "Assemble `ctxs` into a dataframe and hand it to `display_df`."
        display_df(pd.DataFrame(ctxs))

class TabularLine(pd.Series):
    "A line of a dataframe that knows how to show itself"
    def show(self, ctx=None, **kwargs):
        "Return `self` when there is no `ctx`, otherwise the result of `ctx.append(self)`."
        if ctx is None: return self
        # NOTE(review): if `ctx` is a pandas Series, `Series.append` was removed in pandas 2.0 — confirm before reviving.
        return ctx.append(self)

# Item-level transform from the unused multi-modal prototype: turns one row into a
# `TensorTabular` and back, using the names stored on `proc`.
class ReadTabLine(ItemTransform):
    # `proc` is stored as-is; the methods below read `cat_names`, `cont_names`, `y_names` and `decode` from it.
    def __init__(self, proc): self.proc = proc

    def encodes(self, row):
        # Look up every categorical and continuous name in `row`, then pack the two groups
        # as (long tensor of categoricals, float tensor of continuous values).
        cats,conts = (o.map(row.__getitem__) for o in (self.proc.cat_names,self.proc.cont_names))
        return TensorTabular(tensor(cats).long(),tensor(conts).float())

    def decodes(self, o):
        # NOTE(review): elsewhere in this notebook `TabularPandas` is called as
        # (df, procs, cat_names, cont_names, ...); here `cat_names` sits in the second
        # positional slot, so this call looks stale — confirm before reviving this code.
        to = TabularPandas(o, self.proc.cat_names, self.proc.cont_names, self.proc.y_names)
        to = self.proc.decode(to)
        # Pair the decoded values with the categorical + continuous names to build a single displayable line.
        return TabularLine(pd.Series({c: v for v,c in zip(to.items[0]+to.items[1], self.proc.cat_names+self.proc.cont_names)}))

# Item-level transform from the unused multi-modal prototype: reads the target of one row.
class ReadTabTarget(ItemTransform):
    def __init__(self, proc): self.proc = proc
    # Select the `y_names` entry of the row and cast it to int64.
    def encodes(self, row): return row[self.proc.y_names].astype(np.int64)
    # Look `o` up in `proc.classes[y_names]` and wrap the result in a `Category`.
    # NOTE(review): presumably `proc.classes` maps a column name to its vocab — confirm.
    def decodes(self, o): return Category(self.proc.classes[self.proc.y_names][o])

In [None]:
# tds = TfmdDS(to.items, tfms=[[ReadTabLine(proc)], ReadTabTarget(proc)])
# enc = tds[1]
# test_eq(enc[0][0], tensor([2,1]))
# test_close(enc[0][1], tensor([-0.628828]))
# test_eq(enc[1], 1)

# dec = tds.decode(enc)
# assert isinstance(dec[0], TabularLine)
# test_close(dec[0], pd.Series({'a': 1, 'b_na': False, 'b': 1}))
# test_eq(dec[1], 'a')

# test_stdout(lambda: print(show_at(tds, 1)), """a               1
# b_na        False
# b               1
# category        a
# dtype: object""")

## Export -

In [None]:
#hide
# Run nbdev's notebook-to-script export; the recorded output lists every notebook it converted.
from nbdev.export import notebook2script
notebook2script()

Converted 00_torch_core.ipynb.
Converted 01_layers.ipynb.
Converted 02_data.load.ipynb.
Converted 03_data.core.ipynb.
Converted 04_data.external.ipynb.
Converted 05_data.transforms.ipynb.
Converted 06_data.block.ipynb.
Converted 07_vision.core.ipynb.
Converted 08_vision.data.ipynb.
Converted 09_vision.augment.ipynb.
Converted 09b_vision.utils.ipynb.
Converted 09c_vision.widgets.ipynb.
Converted 10_tutorial.pets.ipynb.
Converted 11_vision.models.xresnet.ipynb.
Converted 12_optimizer.ipynb.
Converted 13_callback.core.ipynb.
Converted 13a_learner.ipynb.
Converted 13b_metrics.ipynb.
Converted 14_callback.schedule.ipynb.
Converted 14a_callback.data.ipynb.
Converted 15_callback.hook.ipynb.
Converted 15a_vision.models.unet.ipynb.
Converted 16_callback.progress.ipynb.
Converted 17_callback.tracker.ipynb.
Converted 18_callback.fp16.ipynb.
Converted 19_callback.mixup.ipynb.
Converted 20_interpret.ipynb.
Converted 20a_distributed.ipynb.
Converted 21_vision.learner.ipynb.
Converted 22_tutorial.ima