# READ ONLY

In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.tabular import *
from utils import *

from sklearn.metrics import cohen_kappa_score,confusion_matrix,mean_squared_error,accuracy_score,make_scorer

from fastai.callbacks.tracker import *

path = Path.cwd()/'data/'

In [3]:
from fastai.text import *
from fastai.text.data import _join_texts

In [4]:
def reset_seed(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
#     tf.set_random_seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    if torch.cuda.is_available(): 
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
reset_seed()

In [5]:
train,test = preprocess_data_tabular(for_nn=True,drop_description=False)

X_train shape: (14990, 151)
X_test shape: (3948, 149)


In [6]:
train.columns[train.isna().any()].tolist()

['sentiment_document_magnitude',
 'sentiment_magnitude',
 'sentiment_document_score',
 'sentiment_score',
 'metadata_metadata_annots_score_MEAN',
 'metadata_metadata_annots_score_SUM',
 'metadata_metadata_color_score_MEAN',
 'metadata_metadata_color_score_SUM',
 'metadata_metadata_color_pixelfrac_MEAN',
 'metadata_metadata_color_pixelfrac_SUM',
 'metadata_metadata_crop_conf_MEAN',
 'metadata_metadata_crop_conf_SUM',
 'metadata_metadata_crop_importance_MEAN',
 'metadata_metadata_crop_importance_SUM',
 'image_size_sum',
 'image_size_mean',
 'image_size_std',
 'width_sum',
 'width_mean',
 'width_std',
 'height_sum',
 'height_mean',
 'height_std']

In [7]:
def generate_val_idxs():
    seed=42
    np.random.seed(seed)
    rIDs = np.random.permutation(train.RescuerID.unique()).tolist()
    
    val_len = int(.2 * len(train.RescuerID.unique()) + 1)
    train_idxs=[]
    val_idxs=[]
    for i in range(5):
        val_rIDs = rIDs[val_len*i:val_len*(i+1)]
        train_rIDs=rIDs[0:val_len*i] + rIDs[val_len*(i+1):]
        train_idxs.append(train[train.RescuerID.isin(train_rIDs)].index.values)
        val_idxs.append(train[train.RescuerID.isin(val_rIDs)].index.values)
    return train_idxs,val_idxs

In [8]:
train_idxs,val_idxs = generate_val_idxs()
val_idxs

[array([    8,    13,    16,    19, ..., 14973, 14978, 14985, 14989]),
 array([   15,    21,    26,    27, ..., 14963, 14971, 14983, 14984]),
 array([    0,     6,     7,    24, ..., 14967, 14975, 14982, 14986]),
 array([    1,     2,    11,    12, ..., 14976, 14980, 14981, 14988]),
 array([    3,     4,     5,     9, ..., 14968, 14977, 14979, 14987])]

In [9]:
train,test=mean_encoding(train,test,train_idxs,val_idxs,for_nn=True)
print(train.shape)
print(test.shape)

(14990, 167)
(3948, 165)


In [10]:
train=train.drop(['RescuerID'],axis=1)
print(train.shape)
print(test.shape)

(14990, 166)
(3948, 165)


In [11]:
cat_names=['Type','Breed1', 'Breed2','Gender','Color1', 'Color2', 'Color3',
          'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed','Sterilized', 'Health','State','m1_label_description',
           'Is_Sterilized','Is_Free','IsMandarin','IsMalay']
# cont_names= list(set(train.columns) - set(cat_names) - {'AdoptionSpeed'})
cont_names= list(set(train.columns) - set(cat_names) - {'AdoptionSpeed','Description'})
print(f'# of continuous feas: {len(cont_names)}')
print(f'# of categorical feas: {len(cat_names)}')
dep_var = 'AdoptionSpeed'
procs = [FillMissing,Categorify, Normalize]

# of continuous feas: 145
# of categorical feas: 19


In [12]:
txt_cols=['Description']

In [13]:
len(cat_names) + len(cont_names) + 2 == train.shape[1]

True

In [14]:
def get_learner(data,layers,save_name='best_nn',emb_drop=0.0,ps=None):
    return tabular_learner(data, layers=layers, metrics=[root_mean_squared_error],emb_drop=emb_drop,ps=ps,
#                         callback_fns=[partial(EarlyStoppingCallback, monitor='quadratic_kappa_score',mode='max', min_delta=0.01, patience=3)]
                           callback_fns=[partial(SaveModelCallback, monitor='root_mean_squared_error',mode='min',every='improvement',name=save_name)],
                           
                          )
def get_learner_no_cb(data,layers,emb_drop=0.0,ps=None):
    return tabular_learner(data, layers=layers, emb_drop=emb_drop,ps=ps,metrics=[root_mean_squared_error])


# Get tabular databunch

In [16]:
def get_databunch(k=0,bs=64,val_idxs=val_idxs):
    reset_seed()
    data = (TabularList.from_df(train, path=path, cat_names=cat_names, cont_names=cont_names, procs=procs)
                           .split_by_idx(val_idxs[k])
                           .label_from_df(cols=dep_var)
                           .add_test(TabularList.from_df(test, path=path, cat_names=cat_names, cont_names=cont_names))
                           .databunch(bs=bs))
    return data

l2str= lambda layers: '_'.join([str(l) for l in layers])

# Get text databunch

In [17]:
def get_text_databunch(bs=64):
    X_train = train[['Description','AdoptionSpeed']].copy()
    X_test = test['Description'].copy()
    
    data_lm = load_data(path, 'tmp_lm.pkl', bs=bs)
    

    X_train.AdoptionSpeed = X_train.AdoptionSpeed.astype('int32')
    
    reset_seed()
    data_clas = (TextList.from_df(X_train,path, vocab=data_lm.vocab)
                 #grab all the text files in path
                 .split_by_idx(val_idxs[0])
                 #split by train and valid folder (that only keeps 'train' and 'test' so no need to filter)
                 .label_from_df(cols='AdoptionSpeed',label_cls=FloatList)
                 #label them all with their folders
                 .add_test(TabularList.from_df(X_test, path=path))
                 .databunch(bs=bs))

    # data_clas.save('data_clas.pkl')
    return data_clas
#     return load_data(path, 'data_clas.pkl', bs=32)

# Start experiment

In [18]:
import pdb

https://github.com/ohmeow/medium-finding-data-block-nirvana/blob/master/yelp-00-custom-itemlist.ipynb

Medium:

https://blog.usejournal.com/finding-data-block-nirvana-a-journey-through-the-fastai-data-block-api-c38210537fe4

## Complete code block here

In [19]:
class TabularText(TabularLine):
    "Item's that include both tabular data(`conts` and `cats`) and textual data (numericalized `ids`)"
    
    def __init__(self, cats, conts, cat_classes, col_names, txt_ids, txt_cols, txt_string):
        # tabular
        super().__init__(cats, conts, cat_classes, col_names)

        # add the text bits
        self.text_ids = txt_ids
        self.text_cols = txt_cols
        self.text = txt_string
        
        # append numericalted text data to your input (represents your X values that are fed into your model)
        # self.data = [tensor(cats), tensor(conts), tensor(txt_ids)]
        self.data += [ np.array(txt_ids, dtype=np.int64) ]
        self.obj = self.data
        
    def __str__(self):
        res = super().__str__() + f'Text: {self.text}'
        return res

class TabularTextProcessor(TabularProcessor):
    # The processors are called at the end of the labelling to apply some kind of function on your items. 
    # The default processor of the inputs can be overriden by passing a processor in the kwargs when creating the ItemList, 
    # the default processor of the targets can be overriden by passing a processor in the kwargs of the labelling function.
    def __init__(self, ds:ItemList=None, procs=None, 
                 #tokenize processor args
                 tokenizer:Tokenizer=None, chunksize:int=10000, # no mark_fields
                 include_bos:bool=True, include_eos:bool=False, #include_bos, include_eos for def proces
                 
                 # numericalize processor args
                 vocab:Vocab=None, max_vocab:int=60000, min_freq:int=2):
        super().__init__(ds, procs)
        
        #tokenize
        self.tokenizer, self.chunksize = ifnone(tokenizer, Tokenizer()), chunksize
        self.include_bos, self.include_eos = include_bos, include_eos
        
        #numericalize
        vocab = ifnone(vocab, ds.vocab if ds is not None else None)
        self.vocab, self.max_vocab, self.min_freq = vocab, max_vocab, min_freq
        
    # process a single item in a dataset
    # NOTE: THIS IS METHOD HAS NOT BEEN TESTED AT THIS POINT (WILL COVER IN A FUTURE ARTICLE)
    def process_one(self, item):
#         # process tabular data (copied form tabular.data)
#         df = pd.DataFrame([item, item])
#         for proc in self.procs: proc(df, test=True)
            
#         if len(self.cat_names) != 0:
#             codes = np.stack([c.cat.codes.values for n,c in df[self.cat_names].items()], 1).astype(np.int64) + 1
#         else: 
#             codes = [[]]
            
#         if len(self.cont_names) != 0:
#             conts = np.stack([c.astype('float32').values for n,c in df[self.cont_names].items()], 1)
#         else: 
#             conts = [[]]
            
#         classes = None
#         col_names = list(df[self.cat_names].columns.values) + list(df[self.cont_names].columns.values)
        
#         # process textual data
#         if len(self.text_cols) != 0:
#             txt = _join_texts(df[self.text_cols].values, (len(self.text_cols) > 1))
#             txt_toks = self.tokenizer._process_all_1(txt)[0]
#             text_ids = np.array(self.vocab.numericalize(txt_toks), dtype=np.int64)
#         else:
#             txt_toks, text_ids = None, [[]]
            
#         # return ItemBase
#         return MixedTabularLine(codes[0], conts[0], classes, col_names, text_ids, self.txt_cols, txt_toks)
        pass
    # processes the entire dataset
    def process(self, ds):
        '''
        ds is itembase
        '''
        # process tabular data and then set "preprocessed=False" since we still have text data possibly
        super().process(ds)
        ds.preprocessed = False
        
        # process text data from column(s) containing text
        if len(ds.text_cols) != 0:
            texts = _join_texts(ds.inner_df[ds.text_cols].values, (len(ds.text_cols) > 1), self.include_bos, self.include_eos)

            # tokenize (set = .text)
            tokens = []
            for i in progress_bar(range(0, len(ds), self.chunksize), leave=False):
                tokens += self.tokenizer.process_all(texts[i:i+self.chunksize])
            ds.text = tokens
            
            # numericalize 
            # set/build vocab
            if self.vocab is None: self.vocab = Vocab.create(ds.text, self.max_vocab, self.min_freq)
            ds.vocab = self.vocab
            ds.text_ids = [ np.array(self.vocab.numericalize(toks), dtype=np.int64) for toks in ds.text ]
        else:
            ds.text, ds.vocab, ds.text_ids = None, None, []
            
        ds.preprocessed = True
        
        
def mixed_tabular_pad_collate(samples:BatchSamples, 
                              pad_idx:int=1, pad_first:bool=True) -> Tuple[LongTensor, LongTensor]:
    "Function that collect samples and adds padding."
    # we need to add padding to the column with the text ids in order to ensure 
    # a square matrix per batch before integrating the text bits with the tabular.
    

    samples = to_data(samples)
    max_len = max([len(s[0][-1]) for s in samples])
    res = torch.zeros(len(samples), max_len).long() + pad_idx
   
    for i,s in enumerate(samples):
        if pad_first: 
            res[i,-len(s[0][-1]):] = LongTensor(s[0][-1])
        else:         
            res[i,:len(s[0][-1]):] = LongTensor(s[0][-1])
            
        # replace the text_ids array (the last thing in the inputs) with the padded tensor matrix
        s[0][-1] = res[i]
    

    # for the inputs, return a list containing 3 elements: a list of cats, a list of conts, and a list of text_ids
    # also include tensor list of classes
    return [torch.stack(x) for x in zip(*[s[0] for s in samples])],tensor(np.array([s[1] for s in samples]))


# each "ds" is of type LabelList(Dataset)
class TabularTextDataBunch(DataBunch):
    @classmethod
    def create(cls, train_ds, valid_ds, test_ds=None, path:PathOrStr='.', bs=64, 
               pad_idx=1, pad_first=True, no_check:bool=False, **kwargs) -> DataBunch:
        
        # only thing we're doing here is setting the collate_fn = to our new "pad_collate" method above
        collate_fn = partial(mixed_tabular_pad_collate, pad_idx=pad_idx, pad_first=pad_first)
        
        return super().create(train_ds, valid_ds, test_ds, path=path, bs=bs,**kwargs)

    
    
class TabularTextList(TabularList):
    "A custom `ItemList` that merges tabular data along with textual data"
    
    _item_cls = TabularText
    _processor = TabularTextProcessor
    _bunch = TabularTextDataBunch
    
    def __init__(self, items:Iterator, cat_names:OptStrList=None, cont_names:OptStrList=None, 
                 text_cols=None, vocab:Vocab=None, pad_idx:int=1, 
                 procs=None, **kwargs) -> 'MixedTabularList':
        super().__init__(items, cat_names, cont_names, procs, **kwargs)
        
        self.cols = [] if cat_names == None else cat_names.copy()
        if cont_names: self.cols += cont_names.copy()
        if txt_cols: self.cols += text_cols.copy()
        
        # from TextList
        self.text_cols, self.vocab, self.pad_idx = text_cols, vocab, pad_idx
        
        # add any ItemList state into "copy_new" that needs to be copied each time "new()" is called; 
        # your ItemList acts as a prototype for training, validation, and/or test ItemList instances that
        # are created via ItemList.new()
        self.copy_new += ['text_cols', 'vocab', 'pad_idx']
        
        self.preprocessed = False
        
    # defines how to construct an ItemBase from the data in the ItemList.items array
    def get(self, i):
        if not self.preprocessed: 
            return self.inner_df.iloc[i][self.cols] if hasattr(self, 'inner_df') else self.items[i]
        
        codes = [] if self.codes is None else self.codes[i]
        conts = [] if self.conts is None else self.conts[i]
        
        #from TextList
        text_ids = [] if self.text_ids is None else self.text_ids[i]
        text_string = None if self.text_ids is None else self.vocab.textify(self.text_ids[i])
        
        return self._item_cls(codes, conts, self.classes, self.col_names, text_ids, self.text_cols, text_string)
    
    # this is the method that is called in data.show_batch(), learn.predict() or learn.show_results() 
    # to transform a pytorch tensor back in an ItemBase. 
    # in a way, it does the opposite of calling ItemBase.data. It should take a tensor t and return 
    # the same kind of thing as the get method.
    def reconstruct(self, t:Tensor):
        # TODO?
        idx_min = (t[2] != self.pad_idx).nonzero().min()
        idx_max = (t[2] != self.pad_idx).nonzero().max()
        return self._item_cls(t[0], t[1], self.classes, self.col_names, 
                              t[2][idx_min:idx_max], self.text_cols, self.vocab.textify(t[2][idx_min:idx_max]))
        
#         return self._item_cls(t[0], t[1], self.classes, self.col_names, 
#                               t[2], self.text_cols, self.vocab.textify(t[2])) 
    

        
        
    # tells fastai how to display a custom ItemBase when data.show_batch() is called
    def show_xys(self, xs, ys) -> None:
        "Show the `xs` (inputs) and `ys` (targets)."
        from IPython.display import display, HTML
        
        # show tabular
        display(HTML('TABULAR:<br>'))
        super().show_xys(xs, ys)
        
        # show text        
        display(HTML('TEXT:<br>'))        
        names = ['text', 'target']
        items = []
        for i, (x,y) in enumerate(zip(xs,ys)):
            res = []
            res.append(' '.join([ f'{tok}({self.vocab.stoi[tok]})' 
                              for tok in x.text.split() if (not self.vocab.stoi[tok] == self.pad_idx) ]))
                
            res.append(str(y))
            items.append(res)

        items = np.array(items)
        df = pd.DataFrame({n:items[:,i] for i,n in enumerate(names)})
        with pd.option_context('display.max_colwidth', -1):
            display(HTML(df.to_html(index=False)))
        
    # tells fastai how to display a custom ItemBase when learn.show_results() is called
    def show_xyzs(self, xs, ys, zs):
        "Show `xs` (inputs), `ys` (targets) and `zs` (predictions)."
        from IPython.display import display, HTML
        
        # show tabular
        display(HTML('TABULAR:<br>'))
        super().show_xyzs(xs, ys, zs)
        
        # show text        
        display(HTML('TEXT:<br>'))        
        names = ['text', 'target','pred']
        items = []
        for i, (x,y,z) in enumerate(zip(xs,ys,zs)):
            res = []
            res.append(' '.join([ f'{tok}({self.vocab.stoi[tok]})' 
                              for tok in x.text.split() if (not self.vocab.stoi[tok] == self.pad_idx) ]))
                
            res += [str(y),str(z)]
            items.append(res)
            
        items = np.array(items)
        df = pd.DataFrame({n:items[:,i] for i,n in enumerate(names)})
        with pd.option_context('display.max_colwidth', -1):
            display(HTML(df.to_html(index=False)))
    
        
    @classmethod
    def from_df(cls, df:DataFrame, cat_names:OptStrList=None, cont_names:OptStrList=None, 
                text_cols=None, vocab=None, procs=None, **kwargs) -> 'ItemList':
        
        return cls(items=range(len(df)), cat_names=cat_names, cont_names=cont_names, 
                   text_cols=text_cols, vocab=vocab, procs=procs, inner_df=df.copy(), **kwargs)

In [20]:
def get_tabulartext_databunch(k=0,bs=64,val_idxs=val_idxs):
    data_lm = load_data(path, 'tmp_lm.pkl', bs=bs)
    collate_fn = partial(mixed_tabular_pad_collate, pad_idx=1, pad_first=True)
    reset_seed()
    return (TabularTextList.from_df(train, cat_names, cont_names, txt_cols, vocab=data_lm.vocab, procs=procs, path=path)
                            .split_by_idx(val_idxs[k])
                            .label_from_df(cols=dep_var)
                            .add_test(TabularTextList.from_df(test, cat_names, cont_names, txt_cols,path=path))
                            .databunch(bs=bs,collate_fn=collate_fn, no_check=False))

# Tab and text learner

In [22]:
# def tabular_learner(data:DataBunch, layers:Collection[int], emb_szs:Dict[str,int]=None, metrics=None,
#         ps:Collection[float]=None, emb_drop:float=0., y_range:OptRange=None, use_bn:bool=True, **learn_kwargs):
#     "Get a `Learner` using `data`, with `metrics`, including a `TabularModel` created using the remaining params."
#     emb_szs = data.get_emb_szs(ifnone(emb_szs, {}))
#     model = TabularModel(emb_szs, len(data.cont_names), out_sz=data.c, layers=layers, ps=ps, emb_drop=emb_drop,
#                          y_range=y_range, use_bn=use_bn)
#     return Learner(data, model, metrics=metrics, **learn_kwargs)


In [67]:
class PoolingLinearTabularTextClassifier(nn.Module):
    "Create a linear classifier with pooling."

    def __init__(self, rnn_lin_layers:Collection[int], ps_lin_ftrs:Collection[float],
                 # tabular params inputs
                 emb_szs,n_cont,n_class,layers,ps,emb_drop,y_range,use_bn,bn_final):
        # rnn_lin_layers: [1200, 50, 1]
        # ps_lin_ftrs: [0.4 (from output_p, layer1200), 0.1 for layer50]
        
        super().__init__()
        # text layers
        mod_layers = []
        activs = [nn.ReLU(inplace=True)] * (len(rnn_lin_layers) - 2) + [None]
        for n_in,n_out,p,actn in zip(rnn_lin_layers[:-1],rnn_lin_layers[1:], ps_lin_ftrs, activs):
            mod_layers += bn_drop_lin(n_in, n_out, p=p, actn=actn)
        mod_layers = mod_layers[:-1] # exclude the last linear output
#         self.layers = nn.Sequential(*mod_layers[:-1]) 
    
    
        #tabular layers
        ps = ifnone(ps, [0]*len(layers))
        ps = listify(ps, layers)
                
        # embedding stuff
        self.embeds = nn.ModuleList([embedding(ni, nf) for ni,nf in emb_szs])
        self.emb_drop = nn.Dropout(emb_drop) # drop for embedding
        self.bn_cont = nn.BatchNorm1d(n_cont) # bn for continuous features
        n_emb = sum(e.embedding_dim for e in self.embeds) # total length of cat embeddings
        
               
        self.n_emb,self.n_cont,self.y_range = n_emb,n_cont,y_range
        sizes = self.get_sizes(layers, rnn_lin_layers[-2],n_class) # [343, 222, 111, 1] ->convert to [343+ 50, 222, 111, 1]        
        actns = [nn.ReLU(inplace=True)] * (len(sizes)-2) + [None] # [ReLU(inplace), ReLU(inplace), None]
        
        # self.layers can stay the same
        layers = []
        for i,(n_in,n_out,dp,act) in enumerate(zip(sizes[:-1],sizes[1:],[0.]+ps,actns)):
            layers += bn_drop_lin(n_in, n_out, bn=use_bn and i!=0, p=dp, actn=act)
        if bn_final: layers.append(nn.BatchNorm1d(sizes[-1]))
        
#         print(mod_layers + layers)
        self.layers = nn.Sequential(*(mod_layers + layers))

        
        
    def get_sizes(self, layers, rnn_lin_layer, out_sz):
        # concatenate cat,conts of tabular and rnn lin layer 
        return [self.n_emb + self.n_cont + rnn_lin_layer] + layers + [out_sz]
    
#     def forward(self, input:Tuple[Tensor,Tensor, Tensor])->Tuple[Tensor,Tensor,Tensor]:
#         pdb.set_trace()
#         raw_outputs,outputs,mask = input
#         output = outputs[-1]
#         avg_pool = output.masked_fill(mask[:,:,None], 0).mean(dim=1)
#         avg_pool *= output.size(1) / (output.size(1)-mask.float().sum(dim=1))[:,None]
#         max_pool = output.masked_fill(mask[:,:,None], -float('inf')).max(dim=1)[0]
#         x = torch.cat([output[:,-1], max_pool, avg_pool], 1)
#         x = self.layers(x)
#         return x, raw_outputs, outputs # TODO: why do we need raw_outputs and outputs?
#         return None
    
    def forward(self, input:Tuple[Tensor,Tensor,Tensor,Tensor,Tensor]):
        x_cat,x_cont,raw_outputs,outputs,mask = input
        # text
        output = outputs[-1]
        avg_pool = output.masked_fill(mask[:,:,None], 0).mean(dim=1)
        avg_pool *= output.size(1) / (output.size(1)-mask.float().sum(dim=1))[:,None]
        max_pool = output.masked_fill(mask[:,:,None], -float('inf')).max(dim=1)[0]
        x_text = torch.cat([output[:,-1], max_pool, avg_pool], 1) #(bs,1200) for AWD LSTM
        
        # tabular
        if self.n_emb != 0:
            x = [e(x_cat[:,i]) for i,e in enumerate(self.embeds)]
            x = torch.cat(x, 1)
            x = self.emb_drop(x)
        if self.n_cont != 0:
            x_cont = self.bn_cont(x_cont)
            x = torch.cat([x, x_cont], 1) if self.n_emb != 0 else x_cont
            
        x = torch.cat([x_text,x],1)
        x = self.layers(x)
        if self.y_range is not None:
            x = (self.y_range[1]-self.y_range[0]) * torch.sigmoid(x) + self.y_range[0]
        return x,raw_outputs,outputs # TODO: why do we need raw_outputs and outputs?

class MultiBatchMixEncoder(MultiBatchEncoder):
    "Create an encoder over `module` that can process a full sentence."
    def __init__(self, bptt:int, max_len:int, module:nn.Module, pad_idx:int=1):
        super().__init__(bptt,max_len,module,pad_idx)

    def forward(self, x_cat:Tensor,x_cont:Tensor,x_text:LongTensor):
        bs,sl = x_text.size()
        self.reset()
        raw_outputs,outputs,masks = [],[],[]
        for i in range(0, sl, self.bptt):
            r, o = self.module(input[:,i: min(i+self.bptt, sl)])
            if i>(sl-self.max_len):
                masks.append(input[:,i: min(i+self.bptt, sl)] == self.pad_idx)
                raw_outputs.append(r)
                outputs.append(o)
        return x_cat,x_cont,self.concat(raw_outputs),self.concat(outputs),torch.cat(masks,dim=1)

                                
def get_tabular_text_classifier(emb_szs:ListSizes, n_cont:int , n_class:int, layers:Collection[int], 
                                # text classifier params inputs
                                arch:Callable, vocab_sz:int, bptt:int=70, max_len:int=20*70, config:dict=None, 
                                drop_mult:float=1., lin_ftrs:Collection[int]=None, ps_lin_ftrs:Collection[float]=None,pad_idx:int=1,
                                # tabular params inputs
                                ps:Collection[float]=None,emb_drop:float=0., y_range:OptRange=None, use_bn:bool=True, bn_final:bool=False
                               ) -> nn.Module:
    "Create a text classifier from `arch` and its `config`, maybe `pretrained`."
    
    # TODO: remove text.learner.*?
    meta = text.learner._model_meta[arch]
    config = ifnone(config, meta['config_clas'].copy())
    for k in config.keys(): 
        if k.endswith('_p'): config[k] *= drop_mult # drop_mult: multiply to different dropouts in AWD LSTM
    if lin_ftrs is None: lin_ftrs = [50]
    if ps_lin_ftrs is None:  ps_lin_ftrs = [0.1]
    
    rnn_lin_layers = [config[meta['hid_name']] * 3] + lin_ftrs + [n_class] # [1200, 50, 1]
    ps_lin_ftrs = [config.pop('output_p')] + ps_lin_ftrs #[0.4 (from output_p), 0.1 for layer50]
    init = config.pop('init') if 'init' in config else None
    encoder = MultiBatchMixEncoder(bptt, max_len, arch(vocab_sz, **config), pad_idx=pad_idx)
    
    tabtext_lin_model = PoolingLinearTabularTextClassifier(rnn_lin_layers, ps_lin_ftrs,
                                                       # tabular params inputs
                                                      emb_szs,n_cont,n_class,layers,ps,emb_drop,y_range,use_bn,bn_final)
    final_model = SequentialRNN(encoder, tabtext_lin_model)
    return final_model if init is None else final_model.apply(init)

In [68]:
def tabtext_learner(data,layers:Collection[int], 
                    arch:Callable,
                    # text classifier params inputs
                    metrics=None,
                    bptt:int=70, max_len:int=20*70, config:dict=None,                     
                    drop_mult:float=1., lin_ftrs:Collection[int]=None, ps_lin_ftrs:Collection[float]=None,pad_idx:int=1,
                    # tabular params inputs
                    emb_szs:Dict[str,int]=None, ps:Collection[float]=None,emb_drop:float=0., 
                    y_range:OptRange=None, use_bn:bool=True, bn_final:bool=False,pretrained:bool=True, **learn_kwargs):
    emb_szs = data.get_emb_szs(ifnone(emb_szs, {}))
    model = get_tabular_text_classifier(emb_szs,len(data.cont_names),data.c,layers,
                                       arch,len(data.vocab.itos),bptt=bptt,max_len=max_len,config=config,
                                        drop_mult=drop_mult, lin_ftrs=lin_ftrs,ps_lin_ftrs=ps_lin_ftrs,
                                        pad_idx=pad_idx,ps=ps,emb_drop=emb_drop,y_range=y_range,
                                       use_bn=use_bn,bn_final=bn_final)
    #text
    meta = text.learner._model_meta[arch]
    learn = RNNLearner(data, model, metrics = metrics, split_func=meta['split_clas'], **learn_kwargs)
    if pretrained:
        if 'url' not in meta: 
            warn("There are no pretrained weights for that architecture yet!")
            return learn
        model_path = untar_data(meta['url'], data=False)
        fnames = [list(model_path.glob(f'*.{ext}'))[0] for ext in ['pth', 'pkl']]
        learn.load_pretrained(*fnames, strict=False)
        learn.freeze()
    return learn

#     learn = Learner(data,model,metrics=metrics,**learn_kwargs) 

In [57]:
tabtext_db = get_tabulartext_databunch()

In [61]:
learn = tabtext_learner(tabtext_db,[222,111],AWD_LSTM,metrics=[root_mean_squared_error],
                       callback_fns=[partial(SaveModelCallback, monitor='root_mean_squared_error',mode='min',every='improvement',name='best_nn')])

In [70]:
learn.load_encoder('bs256-8e-awdlstm-enc')

In [69]:
learn.callback_fns

[functools.partial(<class 'fastai.basic_train.Recorder'>, add_time=True),
 functools.partial(<class 'fastai.callbacks.tracker.SaveModelCallback'>, monitor='root_mean_squared_error', mode='min', every='improvement', name='best_nn')]

In [38]:
# emb_szs = tabtext_db.get_emb_szs({})
# model = get_tabular_text_classifier(emb_szs,len(tabtext_db.cont_names),tabtext_db.c,[222,111],
#                                    AWD_LSTM,len(tabtext_db.vocab.itos),lin_ftrs=[55,44],ps_lin_ftrs=[0.55,0.44],ps=[0.22,0.11],
#                                    emb_drop=0.111)

In [39]:
# model

SequentialRNN(
  (0): MultiBatchEncoder(
    (module): AWD_LSTM(
      (encoder): Embedding(10131, 400, padding_idx=1)
      (encoder_dp): EmbeddingDropout(
        (emb): Embedding(10131, 400, padding_idx=1)
      )
      (rnns): ModuleList(
        (0): WeightDropout(
          (module): LSTM(400, 1150, batch_first=True)
        )
        (1): WeightDropout(
          (module): LSTM(1150, 1150, batch_first=True)
        )
        (2): WeightDropout(
          (module): LSTM(1150, 400, batch_first=True)
        )
      )
      (input_dp): RNNDropout()
      (hidden_dps): ModuleList(
        (0): RNNDropout()
        (1): RNNDropout()
        (2): RNNDropout()
      )
    )
  )
  (1): PoolingLinearTabTextClassifier(
    (embeds): ModuleList(
      (0): Embedding(3, 3)
      (1): Embedding(165, 28)
      (2): Embedding(123, 24)
      (3): Embedding(4, 3)
      (4): Embedding(8, 5)
      (5): Embedding(8, 5)
      (6): Embedding(7, 5)
      (7): Embedding(5, 4)
      (8): Embedding(4, 3)

In [97]:
# model = get_text_classifier(AWD_LSTM, len(tabtext_db.vocab.itos), tabtext_db.c, bptt=70, max_len=70*20,
#                                 drop_mult=1.)

In [98]:
# model

SequentialRNN(
  (0): MultiBatchEncoder(
    (module): AWD_LSTM(
      (encoder): Embedding(10131, 400, padding_idx=1)
      (encoder_dp): EmbeddingDropout(
        (emb): Embedding(10131, 400, padding_idx=1)
      )
      (rnns): ModuleList(
        (0): WeightDropout(
          (module): LSTM(400, 1150, batch_first=True)
        )
        (1): WeightDropout(
          (module): LSTM(1150, 1150, batch_first=True)
        )
        (2): WeightDropout(
          (module): LSTM(1150, 400, batch_first=True)
        )
      )
      (input_dp): RNNDropout()
      (hidden_dps): ModuleList(
        (0): RNNDropout()
        (1): RNNDropout()
        (2): RNNDropout()
      )
    )
  )
  (1): PoolingLinearClassifier(
    (layers): Sequential(
      (0): BatchNorm1d(1200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (1): Dropout(p=0.4)
      (2): Linear(in_features=1200, out_features=50, bias=True)
      (3): ReLU(inplace)
      (4): BatchNorm1d(50, eps=1e-05, momentum=

In [60]:
# emb_szs = tabtext_db.get_emb_szs({})
# model = TabularModel(emba_szs, len(tabtext_db.cont_names), out_sz=tabtext_db.c, layers=[222,111],ps=[0.2,0.1])

In [55]:
# model

TabularModel(
  (embeds): ModuleList(
    (0): Embedding(3, 3)
    (1): Embedding(165, 28)
    (2): Embedding(123, 24)
    (3): Embedding(4, 3)
    (4): Embedding(8, 5)
    (5): Embedding(8, 5)
    (6): Embedding(7, 5)
    (7): Embedding(5, 4)
    (8): Embedding(4, 3)
    (9): Embedding(4, 3)
    (10): Embedding(4, 3)
    (11): Embedding(4, 3)
    (12): Embedding(4, 3)
    (13): Embedding(14, 7)
    (14): Embedding(76, 18)
    (15): Embedding(3, 3)
    (16): Embedding(3, 3)
    (17): Embedding(3, 3)
    (18): Embedding(3, 3)
    (19): Embedding(3, 3)
    (20): Embedding(3, 3)
    (21): Embedding(3, 3)
    (22): Embedding(3, 3)
    (23): Embedding(3, 3)
    (24): Embedding(3, 3)
    (25): Embedding(3, 3)
    (26): Embedding(3, 3)
    (27): Embedding(3, 3)
    (28): Embedding(3, 3)
    (29): Embedding(3, 3)
    (30): Embedding(3, 3)
    (31): Embedding(3, 3)
    (32): Embedding(3, 3)
    (33): Embedding(3, 3)
    (34): Embedding(3, 3)
    (35): Embedding(3, 3)
    (36): Embedding(3, 3)
 

## Complete tabular text databunch

In [28]:
tabtext_db = get_tabulartext_databunch()

In [29]:
tabtext_db.show_batch(ds_type=DatasetType.Valid)

Type,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,FurLength,Vaccinated,Dewormed,Sterilized,Health,State,m1_label_description,Is_Sterilized,Is_Free,IsMandarin,IsMalay,width_mean_na,metadata_metadata_color_pixelfrac_MEAN_na,image_size_std_na,width_std_na,sentiment_magnitude_na,height_mean_na,metadata_metadata_annots_score_SUM_na,metadata_metadata_crop_importance_MEAN_na,image_size_mean_na,metadata_metadata_crop_conf_MEAN_na,sentiment_document_magnitude_na,metadata_metadata_crop_importance_SUM_na,image_size_sum_na,height_sum_na,metadata_metadata_crop_conf_SUM_na,sentiment_score_na,metadata_metadata_color_score_MEAN_na,metadata_metadata_color_pixelfrac_SUM_na,height_std_na,metadata_metadata_color_score_SUM_na,metadata_metadata_annots_score_MEAN_na,width_sum_na,sentiment_document_score_na,TFIDF_Description_5,RescuerID_age_mean,general health,TFIDF_metadata_annots_top_desc_15,potential for mouthiness,width_mean,Fee,Breed1_m1_vertex_y_mean,TFIDF_metadata_annots_top_desc_11,RescuerID_Fee_Mean,Quantity,RescuerID_m1LabelScore_mean,TFIDF_Description_7,Breed1_Age_mean,TFIDF_metadata_annots_top_desc_14,TFIDF_sentiment_entities_10,tolerates hot weather,State_Pop,State_RescuerID_COUNT_sum,State_m1_label_score_mean,size,adapts well to apartment living,good for novice owners,drooling potential,TFIDF_sentiment_entities_14,metadata_metadata_color_pixelfrac_MEAN,intelligence,TFIDF_metadata_annots_top_desc_2,Breed1_RescuerID_nunique,easy to train,tendency to bark or howl,TFIDF_sentiment_entities_1,image_size_std,State_Density,TFIDF_metadata_annots_top_desc_13,width_std,m1_dominant_red,TFIDF_Description_9,sentiment_magnitude,height_mean,metadata_metadata_annots_score_SUM,metadata_metadata_crop_importance_MEAN,TFIDF_Description_2,TFIDF_Description_14,wanderlust potential,TFIDF_metadata_annots_top_desc_0,RescuerID_breed1_nunique,TFIDF_Description_3,image_size_mean,State_m1_vertex_y_mean,TFIDF_Description_6,State_Area,prey drive,metadata_metadata_crop_conf_MEAN,TFIDF_sentiment_entities_15,sentiment_document_magnitude,TFIDF_sentiment_entities_11,potential for playfulness,Age,TFIDF_metadata_annots_top_desc_10,Breed1_count,Breed1_m1_label_score_sum,m1_dominant_blue,metadata_metadata_crop_importance_SUM,TFIDF_sentiment_entities_6,State_count,PhotoAmt,TFIDF_sentiment_entities_8,image_size_sum,affectionate with family,TFIDF_metadata_annots_top_desc_9,easy to groom,pet friendly,friendly toward strangers,TFIDF_sentiment_entities_9,tendency to vocalize,incredibly kid friendly dogs,height_sum,m1_dominant_pixel_frac,TFIDF_sentiment_entities_5,RescuerID_m1VertexY_mean,RescuerID_IsFree_Mean,intensity,State_Age_mean,TFIDF_metadata_annots_top_desc_12,State_GDP,m1_vertex_y,metadata_metadata_crop_conf_SUM,TFIDF_Description_0,State_m1_label_score_sum,RescuerID_m1LabelScore_sum,RescuerID_Fee_Sum,TFIDF_metadata_annots_top_desc_5,TFIDF_sentiment_entities_4,TFIDF_sentiment_entities_12,TFIDF_metadata_annots_top_desc_7,potential for weight gain,sentiment_score,TFIDF_metadata_annots_top_desc_3,metadata_metadata_color_score_MEAN,metadata_metadata_color_pixelfrac_SUM,TFIDF_Description_13,TFIDF_sentiment_entities_13,height_std,State_RescuerID_nunique,State_RescuerID_COUNT_mean,TFIDF_Description_8,sensitivity level,m1_vertex_x,name_length,Breed1_RescuerID_COUNT_sum,TFIDF_sentiment_entities_3,metadata_metadata_color_score_SUM,TFIDF_Description_10,TFIDF_Description_1,TFIDF_sentiment_entities_0,TFIDF_Description_12,energy level,m1_bounding_confidence,m1_dominant_score,exercise needs,dog friendly,RescuerID_COUNT,m1_bounding_importance,TFIDF_metadata_annots_top_desc_4,amount of shedding,TFIDF_Description_11,metadata_metadata_annots_score_MEAN,tolerates being alone,TFIDF_sentiment_entities_7,Breed1_RescuerID_COUNT_mean,TFIDF_metadata_annots_top_desc_1,m1_dominant_green,TFIDF_metadata_annots_top_desc_8,width_sum,TFIDF_Description_4,kid friendly,tolerates cold weather,TFIDF_Description_15,TFIDF_metadata_annots_top_desc_6,VideoAmt,m1_label_score,Breed1_m1_label_score_mean,sentiment_document_score,TFIDF_sentiment_entities_2,target
2,265,0,2,6,0,0,2,2,2,2,2,1,41326,cat,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,-0.2549,0.3578,-0.4543,0.3437,-0.3621,0.0207,-0.2778,-0.6611,-0.1649,-0.3371,-0.3974,0.2,0.1319,-0.4907,2.7438,-0.4767,-0.3576,0.8318,0.835,0.1177,-0.3427,-0.3405,-0.353,-0.3074,-0.3161,0.3228,-0.4474,-0.0607,-0.6231,-0.3619,-0.3487,-0.3582,-0.3003,-0.583,-0.5318,-0.8797,-1.0693,-0.4375,-0.8303,-1.0736,0.6044,-0.0248,-0.2367,0.4258,-0.3514,-0.7828,-0.1105,-0.255,-0.6207,0.4828,0.0041,0.2114,-0.3538,-0.1055,-0.0067,-0.8706,0.3268,-0.4672,-0.4719,0.7317,-0.8591,-0.8494,-0.7754,0.5891,3.1878,0.8051,0.6022,1.0712,0.0306,-0.4779,-2.4276,-0.4189,-0.2496,-0.4171,0.2029,-0.2233,-0.371,0.1765,0.0303,-0.6531,-0.6292,0.4945,-0.3593,-0.1802,-0.0046,-0.5201,-0.794,0.5871,-1.0823,0.8055,-0.3692,-0.4003,0.0344,-0.3981,0.6081,-0.4922,-0.3616,-0.6921,-0.1507,0.5295,0.7101,-0.9616,0.6769,-0.6207,0.7693,0.7718,1.3544,-0.3712,0.106,-1.1636,-0.8704,0.6612,0.7023,0.4375,0.1388,-0.192,-0.6448,-0.3657,0.1403,-0.3429,-0.3618,-0.3557,-0.3704,0.14,-0.1087,-0.4417,-0.6733,0.173,-0.3423,-2.1347,-0.8529,1.4372,-0.8293,-1.107,0.5431,-0.4783,-0.2532,-0.3537,-0.4454,0.0299,-0.1649,0.2183,0.608,-0.7934,-0.2633,1.0
2,265,0,3,1,6,7,1,2,2,2,3,1,41326,cat,False,True,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,1.3525,-0.5742,-0.4543,-0.1767,-0.3621,2.2813,-0.2778,-0.6611,0.8607,-0.3371,3.6141,0.2419,-1.0765,-0.4907,-1.5543,-1.4085,-0.3576,0.8318,0.835,0.1177,-0.3427,-0.3405,-0.353,-0.3074,-1.1138,0.7125,-0.4474,-0.0326,-0.6231,-0.3619,-0.3487,-0.0885,-0.2419,-0.583,0.4751,-0.1974,-1.1127,-0.4062,0.238,0.9175,-0.8588,-0.0248,-0.3692,-1.374,-0.3514,-1.321,-0.5044,-0.681,0.4637,0.4828,-0.4818,0.2114,-0.3538,-0.1055,-0.6482,0.2639,0.4531,-0.4672,-0.4719,-0.198,-0.8591,-0.8494,-0.699,-0.8743,-0.0304,0.8051,-0.8433,-2.059,-0.6528,-0.4779,0.6537,-0.4189,-0.2496,-0.4171,2.134,-0.2233,-0.371,-0.7792,0.0245,0.026,0.9475,0.4945,-0.3593,-0.1802,0.0995,-0.5201,0.7797,-0.8757,0.5217,0.8055,-0.4189,-0.4003,-0.3374,-0.4286,0.7245,0.293,-0.3616,0.4081,-0.1506,0.7655,-0.7812,-0.9848,0.1229,-0.6207,0.7693,0.7718,0.7019,-0.3712,1.8634,1.8753,-0.8704,1.0284,-0.836,0.5516,-0.4924,-0.1917,-0.3706,-0.3657,0.1403,0.0609,-0.3618,-0.3557,-0.4188,0.14,-0.0028,-0.4417,-0.0007,2.4888,-0.3423,0.9921,-0.8529,0.204,-0.8891,0.1822,-0.6451,0.1877,-0.2532,-0.3537,0.042,-0.6008,-0.1649,0.1945,0.608,0.8099,0.5191,1.0
2,266,0,2,2,0,0,1,1,1,1,1,1,41326,cat,True,True,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,-0.567,-0.5076,-0.4543,0.2022,-0.3621,-0.9212,-0.2778,-0.2847,-0.1629,0.2856,-0.3974,0.2531,0.5189,-0.4237,1.8355,-0.128,-0.3576,0.8318,0.835,0.1177,-0.3427,-0.3405,-0.353,-0.3074,-0.028,0.491,-0.4474,0.1421,0.4482,-0.3619,-0.3487,-0.6953,-0.2419,-0.583,-1.0581,-0.1974,0.5229,-0.4636,-0.6268,0.0642,-0.8748,-0.0248,-0.1304,-0.6529,-0.3514,-1.0142,-0.5044,0.0565,-0.3995,0.4828,0.1654,0.2114,-0.3538,-0.1055,-0.0561,-0.587,0.145,-0.4672,-0.253,-0.0717,0.0533,0.0752,0.4168,-0.8743,-0.1872,0.8051,-0.8433,0.0327,-0.7621,-0.4779,2.2284,-0.4189,-0.2496,-0.4171,-0.0133,-0.2233,-0.371,-0.8333,0.1629,-0.0948,-0.1453,-2.0504,-0.3593,-0.1802,-0.0025,-0.5201,0.1053,-0.8757,-1.3586,0.8055,-0.3986,-0.2472,1.0894,-0.1168,0.1268,-0.1511,-0.3616,2.2419,-0.1507,0.5448,-0.7978,0.7347,0.0153,-0.6207,0.7693,0.7718,0.6579,-0.3712,-0.6262,-0.2387,-0.6373,-0.162,-0.8444,0.5216,0.2345,-0.1922,-0.383,-0.3657,0.1403,0.5054,-0.3618,-0.3557,-0.3994,0.14,-0.143,-0.4417,1.8687,-0.4226,-0.3423,-0.2103,-0.7482,1.1361,0.6666,-2.5701,-0.8747,-0.2455,-0.2532,-0.3537,-0.1314,1.511,-0.1649,0.2209,0.3221,-0.2833,-0.9716,3.0
2,266,0,3,1,2,7,1,1,2,2,2,1,41326,cat,False,True,False,True,False,False,True,True,True,False,False,False,False,False,True,False,False,False,False,True,False,False,True,False,False,False,True,-0.6903,-0.441,-0.4543,-0.0641,-0.3621,-1.8631,-0.2778,-0.2847,-1.9524,-0.3371,2.2769,0.2372,0.087,-0.4237,-1.0196,-0.1286,-0.3576,0.8318,0.835,0.1177,-0.3427,-0.3405,-0.353,-0.3074,-0.0274,0.6969,-0.4474,0.0029,0.4482,-0.3619,-0.3487,-0.6999,-0.2419,-0.583,-0.0049,-0.1974,-1.5035,-0.7988,-0.2199,0.0642,-0.8707,-0.0248,-0.3268,0.1334,-0.3514,-0.9989,-0.4059,1.2022,-0.7777,0.4828,0.017,0.2114,-0.3538,-0.1055,-0.0561,-0.2466,0.1456,-0.4672,-0.5266,-4.0559,0.0533,0.0752,-1.2798,-0.8743,-0.188,0.8051,-0.8433,0.0338,-0.81,-0.4779,-0.3712,-0.4189,-0.2496,-0.4171,-0.0131,-0.2233,-0.371,-0.8333,0.7223,-0.0952,-0.4185,0.4945,-0.3593,-0.1802,0.0291,-0.5201,0.1053,-0.8757,-0.2182,0.8055,-0.4189,-0.4003,1.1023,-0.1166,0.1284,-1.6291,-0.3616,0.0414,-0.1507,-1.0954,-0.7824,0.3563,0.0164,-0.6207,0.7693,0.7718,0.5773,-0.3712,-1.3585,-1.1636,-0.6373,-0.165,-0.9066,1.0498,-0.2912,5.2294,-0.8588,-0.3657,0.1403,-0.121,-0.3618,-0.3557,-0.4188,0.14,-0.1544,-0.4417,-0.9175,0.3232,-0.3423,-0.2123,-0.7482,0.9779,-1.3678,3.5829,-0.9422,-0.5393,-0.2532,-0.3537,0.8651,2.6744,-0.1649,0.2136,0.3221,-0.2104,-0.9773,4.0
2,285,251,1,3,0,0,3,2,1,1,1,1,41326,cat,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,-0.5306,0.6874,1.2087,-0.2083,-0.3621,-0.9212,-0.2778,-2.1551,0.0745,1.3044,-0.3974,0.1861,0.8744,0.5684,-2.0212,-0.1631,-0.3576,0.8318,0.835,0.1177,-0.3427,-0.3405,-0.353,-0.3074,0.3234,0.1754,-0.4474,0.1412,-1.3688,-0.3619,-0.3487,-0.2472,-0.3672,-0.583,1.1492,-0.8797,0.7545,1.47,0.4923,0.0642,-0.2777,-0.0248,0.1225,0.4527,-0.3514,-1.0039,0.5788,-0.4528,-0.9143,0.4828,0.286,0.2114,-0.3538,-0.1055,-0.2652,0.4341,0.8179,0.6944,1.3882,1.1315,-1.2672,-1.2769,-0.1334,-0.289,-0.9264,0.8051,-0.2651,0.2395,-0.5733,1.1497,-4.4118,-0.4189,1.9573,0.9503,-0.2688,3.2499,-0.371,-0.2563,0.2207,0.9332,0.0344,-1.5636,-0.3593,-0.1802,0.0489,-0.5201,0.1053,-0.2906,0.5457,0.8055,0.1676,5.7554,1.0159,-0.7475,-0.5328,-0.8963,-0.3616,-0.3254,-0.1507,0.9369,-0.2354,1.2082,-0.3503,-0.6207,0.7693,0.7718,0.4262,-0.3712,-0.6262,-0.1066,-0.9417,1.4577,-0.1797,-0.0362,-0.6604,-0.1917,0.6201,-0.3657,0.1403,0.7169,-0.3618,-0.3557,0.1519,0.14,-0.2055,3.1045,-0.6147,0.3805,-0.3423,-0.4137,-0.8897,1.1796,0.6965,-2.1509,-0.4696,1.3644,1.8372,-0.3537,-1.0257,1.4045,-0.1649,0.227,-0.0821,0.5913,-0.0057,2.0


text,target
"xxbos(2) healthy(94) and(9) active(102) ,(10) feisty(2058) kitten(84) found(71) in(20) neighbours(1026) '(232) garden(786) .(8) xxmaj(4) not(46) sure(301) of(19) sex(1939)",1.0
"xxbos(2) xxmaj(4) hi(319) xxmaj(4) pet(149) xxmaj(4) lovers(572) !(43) xxmaj(4) this(49) is(14) my(32) first(231) posting(1619) and(9) i(16) need(123) help(161) !(43) 3(86) months(109) ago(261) we(36) befriended(5968) a(12) mother(142) stray(187) cat(50) with(23) 3(86) kittens(101) in(20) our(134) area(182) and(9) we(36) '(232) adopted(141) '(232) them(37) but(52) they(39) come(248) and(9) go(171) as(45) they(39) please(31) .(8) 2(62) months(109) ago(261) the(13) mother(142) gave(287) birth(244) to(11) a(12) litter(160) of(19) 7(286) cute(115) kittens(101) and(9) we(36) have(33) been(88) taking(367) care(79) of(19) the(13) mother(142) and(9) the(13) kittens(101) ever(625) since(195) in(20) our(134) home(25) ...(117) in(20) the(13) ground(2353) floor(1299) bathroom(4507) !(43) xxmaj(4) some(172) of(19) the(13) kittens(101) look(252) half(860) -(29) persian(804) with(23) long(157) hair(332) ,(10) xxunk(0) also(145) from(66) the(13) earlier(1557) stray(187) batch(2823) we(36) adopted(141) .(8) xxmaj(4) looking(82) for(15) good(60) caring(435) homes(378)",1.0
xxbos(2) to(11) be(27) spayed(200) on(53) /(56) 12(690) adorable(152) &(67),3.0
"xxbos(2) xxmaj(4) birth(244) xxmaj(4) date(1109) :(41) xxmaj(4) oct(1752) 30th(5133) xxmaj(4) kitty(309) 1(108) ,(10) xxmaj(4) xxunk(0) -female(6549) xxmaj(4) sangat(672) comel(1017) and(9) xxunk(0) .(8) xxmaj(4) kitty(309) 2(62) ,(10) xxmaj(4) tootsie(7369) -(29) xxmaj(4) male(143) xxmaj(4) badan(3157) putih(1662) ,(10) tapi(1206) ekor(784) hitam(2135) and(9) telinga(4065) xxunk(0) -(29) xxunk(0) ,(10) macam(2396) kucing(300) 3(86) color(516) .(8) xxmaj(4) kitty(309) 3(86) ,(10) xxmaj(4) bo(6550) -male(4787) xxmaj(4) corak(5134) badan(3157) 3(86) tompok(3443) ,(10) he(21) is(14) one(89) of(19) the(13) most(276) quite(333) and(9) xxunk(0) kitty(309) of(19) the(13) litter(160) tapi(1206) jantan(1640) .(8) xxmaj(4) kitty(309) 4(107) ,(10) xxmaj(4) xxunk(0) -(29) xxmaj(4) female(127) xxmaj(4) macam(2396) abang(4509) xxunk(0) ,(10) she(17) is(14) very(26) xxunk(0) and(9) sweet(179) ,(10) have(33) 1(108) big(289) spot(928) behind(723) her(18) back(168) and(9) 1(108) super(365) cute(115) spot(928) belakang(2916) telinga(4065) .(8) xxmaj(4) kitty(309) 5(146) ,(10) xxmaj(4) missy(2621) -(29) xxmaj(4) female(127) .(8) xxmaj(4) she(17) is(14) very(26) curious(674) and(9) adventurous(1139) .(8) xxmaj(4) very(26) lovable(579) ,(10) with(23) sweet(179) voice(1433)",4.0
"xxbos(2) xxmaj(4) garfield(2060) is(14) a(12) very(26) large(956) cat(50) .(8) xxmaj(4) needs(189) daily(580) grooming(939) .(8) xxmaj(4) he(21) has(58) been(88) groomed(1850) short(341) and(9) deflea(1753) .(8) xxmaj(4) very(26) good(60) wt(2759) other(112) cats(98) .(8) xxmaj(4) calm(610) cat(50) .(8) xxmaj(4) he(21) is(14) neutered(259) ,(10) vaccinated(191) and(9) dewormed(207) .(8) xxmaj(4) must(114) be(27) kept(353) indoors(379) .(8) xxmaj(4) needs(189) grooming(939) and(9) brushing(4276) as(45) his(68) fur(239) is(14) very(26) soft(478) and(9) thick(1332) .(8) xxmaj(4) if(35) you(24) are(22) interested(77) to(11) adopt(64) xxmaj(4) garfield(2060) ,(10) please(31) give(69) me(34) a(12) call(74) or(51) sent(546) me(34) an(106) email(250) .(8) thank(173) you(24)",2.0


In [41]:
x1,y1 = tabtext_db.one_batch(ds_type=DatasetType.Train)
y1,len(y1)

(tensor([2., 2., 4., 4., 4., 4., 4., 4., 1., 2., 4., 2., 3., 1., 2., 4., 2., 1.,
         1., 4., 4., 1., 4., 0., 2., 3., 3., 4., 3., 2., 1., 1., 2., 4., 3., 1.,
         4., 3., 4., 4., 1., 4., 1., 2., 4., 1., 3., 4., 4., 1., 4., 3., 3., 4.,
         2., 2., 3., 4., 4., 3., 4., 4., 4., 1.]), 64)

In [42]:
len(x1),len(x1[0]),len(x1[1]),x1[0].shape,x1[1].shape,x1[2].shape

(3, 64, 64, torch.Size([64, 42]), torch.Size([64, 145]), torch.Size([64, 765]))

In [32]:
x1[0][0] #cats

x1[1][0] #conts (normalized)

tensor([  2, 127,   1,   2,   6,   7,   1,   2,   2,   1,   1,   1,   1,   3,
         11,   2,   2,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1])

tensor([-3.8405e-01,  1.6226e+00, -4.5429e-01, -7.8942e-02, -3.6212e-01,
        -2.9247e-01, -2.7784e-01, -6.6113e-01, -4.0079e-01, -3.3711e-01,
        -3.9743e-01,  2.6175e-01, -1.0043e-01, -4.9071e-01, -1.4529e+00,
         1.8988e-01, -3.5761e-01,  8.3176e-01,  8.3500e-01,  1.1769e-01,
        -3.4273e-01, -3.4052e-01, -3.5297e-01, -3.0743e-01, -1.2816e-01,
         1.3650e+00, -4.4736e-01, -1.3967e-01, -6.2311e-01, -3.6190e-01,
        -3.4872e-01, -5.5261e-01, -6.3511e-01, -5.8299e-01,  1.1894e+00,
         2.3167e-01,  5.2292e-01, -3.0570e-01,  7.9755e-01, -1.0707e+00,
         1.4220e-02, -2.4799e-02, -2.1915e-01,  1.6981e-02, -3.5138e-01,
        -9.9167e-01, -4.0593e-01, -4.3433e-01, -1.2103e+00,  4.8278e-01,
         1.8928e-01,  2.1138e-01, -3.5382e-01, -1.0554e-01, -1.7935e-01,
         8.3119e-01, -2.3576e-01, -4.6717e-01,  2.0447e+00,  8.2465e-01,
        -8.5909e-01, -8.4936e-01,  8.7540e-01,  3.7276e-03,  3.2496e-02,
         8.0510e-01,  2.3973e-02, -1.7139e-01, -5.9

# Step by step to create TabularText databunch

## 1. create itemlist

itemlist: list of itembase, which contains independent variables

In [30]:
data_lm = load_data(path, 'tmp_lm.pkl', bs=32)

In [31]:
il = TabularTextList.from_df(train, cat_names, cont_names, txt_cols, vocab=data_lm.vocab, procs=procs, path=path)

In [82]:
print(f'CATS:\n{il.cat_names}')
# print(f'CONTS:\n{il.cont_names}')
print(f'TEXT COLS:\n{il.text_cols}')
print(f'PROCS:\n{il.procs}')

CATS:
['Type', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2', 'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed', 'Sterilized', 'Health', 'State', 'm1_label_description', 'Is_Sterilized', 'Is_Free', 'IsMandarin', 'IsMalay', 'image_size_std_na', 'sentiment_document_score_na', 'metadata_metadata_color_pixelfrac_MEAN_na', 'sentiment_document_magnitude_na', 'image_size_sum_na', 'metadata_metadata_crop_conf_MEAN_na', 'metadata_metadata_crop_conf_SUM_na', 'metadata_metadata_color_pixelfrac_SUM_na', 'sentiment_magnitude_na', 'height_std_na', 'metadata_metadata_color_score_SUM_na', 'metadata_metadata_color_score_MEAN_na', 'metadata_metadata_crop_importance_MEAN_na', 'metadata_metadata_annots_score_MEAN_na', 'width_sum_na', 'image_size_mean_na', 'width_std_na', 'height_mean_na', 'metadata_metadata_crop_importance_SUM_na', 'width_mean_na', 'metadata_metadata_annots_score_SUM_na', 'sentiment_score_na', 'height_sum_na']
TEXT COLS:
['Description']
PROCS:
[<class 'fastai.tabular.tran

In [86]:
il.get(0)

Type                                                                                         2
Breed1                                                                                     299
Breed2                                                                                       0
Gender                                                                                       1
Color1                                                                                       1
Color2                                                                                       7
Color3                                                                                       0
MaturitySize                                                                                 1
FurLength                                                                                    1
Vaccinated                                                                                   2
Dewormed                                          

## 2. Split train and validation

ItemLists: train ItemList and validation itemList

In [32]:
ils = il.split_by_idx(val_idxs[0])

In [48]:
len(ils.train), len(ils.valid), ils.path

(11939, 3051, PosixPath('/home/quantran/kwon/kaggle/new-comp/data'))

In [None]:
#     data_clas = (TextList.from_df(X_train,path, vocab=data_lm.vocab)
#                  #grab all the text files in path
#                  .split_by_idx(val_idxs[0])
#                  #split by train and valid folder (that only keeps 'train' and 'test' so no need to filter)
#                  .label_from_df(cols='AdoptionSpeed',label_cls=FloatList)
#                  #label them all with their folders
#                  .databunch(bs=32))


## 3: add y labels

add y label -> becomes label lists

this is essentially Pytorch Dataset, will be fed to model's forward pass

In [33]:
lls = ils.label_from_df(cols='AdoptionSpeed',label_cls=FloatList)

In [50]:
type(lls.valid)

fastai.data_block.LabelList

In [60]:
lls.valid.y[0]

FloatItem 1.0

In [59]:
lls.valid.x[0]

TabularText Type 2; Breed1 265; Breed2 0; Gender 2; Color1 6; Color2 0; Color3 0; MaturitySize 2; FurLength 2; Vaccinated 2; Dewormed 2; Sterilized 2; Health 1; State 41326; m1_label_description cat; Is_Sterilized False; Is_Free True; IsMandarin False; IsMalay False; image_size_std_na False; sentiment_document_score_na False; metadata_metadata_color_pixelfrac_MEAN_na False; sentiment_document_magnitude_na False; image_size_sum_na False; metadata_metadata_crop_conf_MEAN_na False; metadata_metadata_crop_conf_SUM_na False; metadata_metadata_color_pixelfrac_SUM_na False; sentiment_magnitude_na False; height_std_na False; metadata_metadata_color_score_SUM_na False; metadata_metadata_color_score_MEAN_na False; metadata_metadata_crop_importance_MEAN_na False; metadata_metadata_annots_score_MEAN_na False; width_sum_na False; image_size_mean_na False; width_std_na False; height_mean_na False; metadata_metadata_crop_importance_SUM_na False; width_mean_na False; metadata_metadata_annots_score_SUM

In [63]:
lls.valid.x[0].data

[tensor([  2, 127,   1,   2,   6,   1,   1,   2,   2,   2,   2,   2,   1,   3,
          11,   1,   2,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1]),
 tensor([ 0.2183,  0.4945,  0.4828,  0.1388, -0.3692, -0.3074,  0.3268, -0.3576,
         -0.1507,  0.0299, -0.3161, -0.3003, -0.4543,  0.0303, -0.3371, -0.7934,
         -0.3981, -0.4783,  2.7438, -0.0067,  1.4372, -0.8704,  0.7718, -0.2633,
         -0.4719,  0.8051,  0.3228, -1.0693, -0.3423,  0.6022, -0.5318, -0.3710,
         -0.3616, -0.4454, -2.4276, -0.0046, -0.3712, -0.8529,  0.4258, -0.4003,
         -0.6531,  0.3437,  1.3544, -0.5201, -0.8706, -0.7828,  0.8350,  0.0306,
         -0.3557,  0.7693, -0.4922,  0.3578, -0.1055, -0.9616,  0.2114, -0.4171,
          0.6080, -1.1070,  0.1319, -0.1105,  0.5871,  0.7101, -0.8303, -0.6207,
          0.2029,  0.0344,  0.7023, -0.3619, -0.6448, -0.3429,  0.5295,  0.6081,
         -0.6231, -0.5830, -0.47

In [65]:
lls.valid.x[0].text

"xxbos healthy and active , feisty kitten found in neighbours ' garden . xxmaj not sure of sex ."

In [62]:
lls.valid.x.text_ids[0]

array([   2,   94,    9,  102,   10, 2058,   84,   71,   20, 1026,  232,  786,    8,    4,   46,  301,   19, 1939,
          8])

In [67]:
len(lls.train.x.vocab.itos), len(lls.valid.x.vocab.itos)

(10131, 10131)

## 4: add test

In [34]:
lls = lls.add_test(TabularTextList.from_df(test, cat_names, cont_names, txt_cols,procs=procs, path=path))

In [70]:
lls.test.x[0].data

[tensor([  1, 164,   1,   1,   1,   1,   1,   2,   2,   2,   2,   2,   1,   3,
          20,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1]),
 tensor([ 1.2947e-01, -1.5042e+00,  4.5436e-01,  2.7206e-01,  9.8155e-01,
         -3.0743e-01, -2.2018e+00, -3.5761e-01, -1.5066e-01,  8.2639e-01,
          7.5734e-01, -5.0052e-01, -4.5429e-01, -1.6898e-02,  1.2211e+00,
          8.1120e-02,  2.1810e+00,  7.7454e-02,  3.0595e-01, -7.0714e-01,
         -8.8790e-01,  9.0379e-01,  3.5707e-01,  9.7715e-01, -4.7187e-01,
          6.5516e-01,  3.8508e-01, -1.4456e+00, -3.4227e-01, -2.6513e-01,
          3.4878e-01, -3.7101e-01, -3.6165e-01,  1.9814e-02, -3.4950e-02,
         -3.0038e+00, -3.7124e-01,  3.4839e-01, -3.9615e-01,  1.3585e+01,
          3.0051e+00, -8.5804e-01,  1.3302e-01, -5.2010e-01, -3.0337e-01,
          1.2564e+00,  4.7000e-01, -3.6701e-01, -3.5566e-01,  7.3454e-01,
         -2.0052e-01,

In [71]:
lls.test.x[0].text
lls.test.x.text_ids[0]

'xxbos xxmaj puppy is calm for a young dog , but he becomes very cheerful among people . xxmaj he likes being hugged and carried .'

array([   2,    4,  100,   14,  610,   15,   12,  279,   48,   10,   52,   21, 1485,   26,  868,  861,  156,    8,
          4,   21,  266,  169, 3208,    9, 1189,    8])

In [72]:
len(lls.test.x.vocab.itos)

10131

## 5: build databunch

collection of pytorch dataloaders

In [35]:
collate_fn = partial(mixed_tabular_pad_collate, pad_idx=1, pad_first=True)
tabtext_db = lls.databunch(bs=64,collate_fn=collate_fn, no_check=False)

In [107]:
x1,y1 = tabtext_db.one_batch(ds_type=DatasetType.Valid)

In [108]:
#label
y1,len(y1)

(tensor([1., 1., 3., 4., 2., 3., 4., 3., 4., 3., 3., 4., 4., 4., 2., 4., 3., 4.,
         1., 3., 3., 4., 3., 1., 2., 3., 2., 3., 2., 4., 4., 1., 2., 4., 3., 3.,
         2., 1., 4., 2., 2., 2., 4., 4., 4., 4., 1., 3., 2., 4., 1., 1., 2., 2.,
         2., 2., 2., 1., 4., 1., 2., 2., 2., 2.]), 64)

In [113]:
y1.shape

torch.Size([64])

In [88]:
x1,y1 = tabtext_db.one_batch(ds_type=DatasetType.Train)
# noted that each piece text (paragraph) has been patched to have equal size (= size of longest texts)
# this size can be different for each batch
x1[2][0].shape,x1[2][1].shape,x1[2][-1].shape

(torch.Size([461]), torch.Size([461]), torch.Size([461]))

In [84]:
x1,y1 = tabtext_db.one_batch(ds_type=DatasetType.Valid)

x1[2][0].shape,x1[2][1].shape,x1[2][-1].shape

(torch.Size([484]), torch.Size([484]), torch.Size([484]))

In [86]:
x1,y1 = tabtext_db.one_batch(ds_type=DatasetType.Test)

x1[2][0].shape,x1[2][1].shape,x1[2][-1].shape

(torch.Size([141]), torch.Size([141]), torch.Size([141]))

In [106]:
tabtext_db = get_tabulartext_databunch()

In [34]:
tabtext_db.show_batch()

Type,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,FurLength,Vaccinated,Dewormed,Sterilized,Health,State,m1_label_description,Is_Sterilized,Is_Free,IsMandarin,IsMalay,metadata_metadata_crop_conf_SUM_na,sentiment_document_magnitude_na,height_sum_na,width_mean_na,metadata_metadata_annots_score_MEAN_na,image_size_std_na,height_std_na,image_size_sum_na,metadata_metadata_color_pixelfrac_MEAN_na,metadata_metadata_crop_importance_SUM_na,width_std_na,sentiment_document_score_na,width_sum_na,metadata_metadata_color_pixelfrac_SUM_na,sentiment_score_na,metadata_metadata_annots_score_SUM_na,height_mean_na,metadata_metadata_crop_importance_MEAN_na,sentiment_magnitude_na,metadata_metadata_color_score_MEAN_na,metadata_metadata_crop_conf_MEAN_na,metadata_metadata_color_score_SUM_na,image_size_mean_na,TFIDF_metadata_annots_top_desc_8,State_RescuerID_nunique,metadata_metadata_crop_conf_SUM,State_RescuerID_COUNT_mean,potential for playfulness,Breed1_RescuerID_COUNT_mean,TFIDF_Description_4,sentiment_document_magnitude,height_sum,affectionate with family,sensitivity level,Quantity,TFIDF_metadata_annots_top_desc_10,RescuerID_IsFree_Mean,potential for weight gain,kid friendly,State_m1_label_score_mean,TFIDF_metadata_annots_top_desc_7,easy to groom,Breed1_AdoptionSpeed_mean,width_mean,Age,TFIDF_sentiment_entities_15,metadata_metadata_annots_score_MEAN,TFIDF_Description_7,image_size_std,drooling potential,TFIDF_metadata_annots_top_desc_15,RescuerID_m1LabelScore_mean,RescuerID_COUNT,TFIDF_Description_11,TFIDF_Description_9,TFIDF_Description_2,TFIDF_sentiment_entities_1,State_GDP,height_std,tendency to bark or howl,m1_dominant_green,tolerates hot weather,TFIDF_metadata_annots_top_desc_13,State_m1_vertex_y_mean,State_m1_label_score_sum,image_size_sum,TFIDF_metadata_annots_top_desc_11,TFIDF_metadata_annots_top_desc_6,TFIDF_metadata_annots_top_desc_5,intelligence,Breed1_RescuerID_nunique,TFIDF_Description_14,TFIDF_sentiment_entities_12,State_RescuerID_COUNT_sum,PhotoAmt,TFIDF_Description_6,metadata_metadata_color_pixelfrac_MEAN,TFIDF_Description_15,metadata_metadata_crop_importance_SUM,TFIDF_Description_10,TFIDF_Description_12,TFIDF_sentiment_entities_6,width_std,TFIDF_sentiment_entities_7,good for novice owners,State_Age_mean,Breed1_count,TFIDF_Description_3,TFIDF_sentiment_entities_5,TFIDF_sentiment_entities_8,potential for mouthiness,State_Density,m1_dominant_score,TFIDF_metadata_annots_top_desc_1,RescuerID_m1LabelScore_sum,Breed1_Age_mean,sentiment_document_score,m1_label_score,width_sum,RescuerID_Fee_Sum,incredibly kid friendly dogs,TFIDF_metadata_annots_top_desc_2,TFIDF_sentiment_entities_9,TFIDF_Description_0,RescuerID_m1VertexY_mean,metadata_metadata_color_pixelfrac_SUM,TFIDF_metadata_annots_top_desc_4,RescuerID_breed1_nunique,TFIDF_sentiment_entities_4,general health,RescuerID_Fee_Mean,State_count,m1_dominant_pixel_frac,TFIDF_metadata_annots_top_desc_9,sentiment_score,TFIDF_Description_1,TFIDF_sentiment_entities_10,adapts well to apartment living,metadata_metadata_annots_score_SUM,wanderlust potential,height_mean,metadata_metadata_crop_importance_MEAN,State_Area,sentiment_magnitude,m1_bounding_confidence,m1_bounding_importance,RescuerID_age_mean,metadata_metadata_color_score_MEAN,amount of shedding,TFIDF_sentiment_entities_0,State_Pop,Fee,energy level,metadata_metadata_crop_conf_MEAN,TFIDF_Description_8,size,name_length,Breed1_m1_vertex_y_mean,Breed1_RescuerID_COUNT_sum,intensity,TFIDF_sentiment_entities_13,easy to train,m1_vertex_y,TFIDF_sentiment_entities_3,TFIDF_metadata_annots_top_desc_0,Breed1_m1_label_score_mean,TFIDF_sentiment_entities_14,tolerates cold weather,TFIDF_metadata_annots_top_desc_14,TFIDF_metadata_annots_top_desc_3,friendly toward strangers,TFIDF_metadata_annots_top_desc_12,tolerates being alone,exercise needs,m1_dominant_red,VideoAmt,dog friendly,m1_dominant_blue,TFIDF_Description_5,metadata_metadata_color_score_SUM,prey drive,Breed1_m1_label_score_sum,TFIDF_Description_13,TFIDF_sentiment_entities_2,tendency to vocalize,m1_vertex_x,TFIDF_sentiment_entities_11,image_size_mean,pet friendly,target
2,265,0,2,6,7,0,2,2,1,1,1,1,41326,cat,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,-1.4697,0.7693,0.002,0.7718,-0.4672,-0.8529,-0.1481,0.8312,-0.2555,-0.4779,-0.3712,-0.3974,0.8246,0.4945,-0.3616,-0.2532,0.1177,0.1035,-0.4189,-0.6975,-0.2925,2.0447,-0.1794,0.1812,-0.1004,-0.6351,-0.3074,-0.0789,0.2618,-0.3898,-0.2463,-0.3057,-0.2191,-0.5526,-0.5201,1.3053,-0.3487,0.7115,-0.3576,1.1894,0.4828,0.8055,-0.5962,-0.4008,0.0087,-0.1386,-0.4474,-0.6231,0.017,-0.3516,0.835,0.024,0.1893,1.365,-0.4773,0.0037,0.5123,-0.4519,0.0325,0.2317,0.1341,-0.353,-0.1802,-0.8591,-0.4343,-0.0219,-0.1714,-0.3621,-0.583,0.3541,0.8029,-0.3883,-0.4907,0.0811,0.2268,-0.0868,-0.4003,-0.371,-0.1397,-0.1577,-0.2053,-0.4534,0.4145,-0.0245,-0.4059,0.0296,-0.4543,-0.3371,0.8051,1.025,0.0957,-0.6921,-0.0983,0.1899,-0.3405,0.0142,-0.3514,-1.0707,-0.0248,0.2114,0.7976,0.1403,0.14,1.6226,1.1286,-0.4417,-0.1921,0.8318,-0.2778,-0.3657,-0.1055,-1.194,-0.3427,-0.6351,-0.6611,-0.8704,-0.3593,-0.1731,-0.3619,-1.0907,0.0772,-0.9917,0.608,-0.1282,-0.3537,-1.4529,-0.1507,-0.4171,0.0236,-0.3423,-0.3618,0.5229,-0.1649,-0.3557,0.8754,-0.384,0.1743,-0.3538,-0.8494,0.1811,-0.625,-0.2233,0.106,-0.2358,-1.2103,-0.2496,1.0
1,307,0,1,2,0,0,1,1,2,1,1,1,41326,dog,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0.0124,0.7693,-0.2906,0.7718,-0.4672,0.5321,-0.4279,-0.3601,-0.4726,-0.4779,-0.3712,-0.3974,0.0951,0.4945,-0.3616,-0.2532,0.1177,-0.2854,-0.4189,0.8336,-1.6276,-0.4172,0.1484,0.3084,0.0182,-0.4253,-0.3074,0.0616,0.1748,-0.4285,0.1638,1.3271,-0.235,-0.6327,-0.5201,-0.6207,-0.3487,-0.3805,-0.3576,-1.212,0.4828,0.8055,-0.5598,-0.1267,-0.4984,0.37,-0.4474,0.8983,0.2813,-0.0173,0.835,-0.2651,-0.1799,-0.5302,0.0848,-0.289,-0.4695,-0.037,-0.078,-0.8797,-0.0823,-0.353,-0.1802,1.0779,-0.1025,-0.0756,0.0276,-0.3621,-0.583,-0.0415,-0.9103,-0.4291,-0.348,0.0082,0.1509,-0.6215,-0.4003,-0.371,2.8821,-0.1281,-0.8658,-0.9649,-0.3941,0.2491,-0.5044,-0.0363,-0.4543,-0.3371,0.8051,-0.0048,-0.0097,1.5084,-0.0327,-0.0876,-0.3405,-0.2789,-0.3514,-1.0736,-0.0248,0.2114,-0.3725,0.1403,0.14,-0.5076,-0.3212,-0.4417,-0.1921,0.8318,-0.2778,-0.3657,-0.1055,0.0634,-0.3427,-0.2387,0.5527,1.2307,-0.3593,-0.0137,-0.3619,-0.794,-0.093,0.6951,-0.0506,-0.1748,-0.3537,-0.6126,-0.1507,-0.4171,1.1076,-0.3423,-0.3618,-0.2298,-0.1649,-0.3557,-0.6226,-0.4215,-0.3229,-0.3538,1.0698,-0.3165,-0.7902,-0.2233,-1.1754,-0.0638,-0.8787,-0.2496,3.0
2,265,0,3,1,3,7,2,2,2,2,2,1,41326,cat,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0.2918,0.7693,-0.2906,0.7718,-0.4672,-0.8529,0.647,-0.587,-0.0832,-0.4779,-0.3712,0.9397,-1.1896,0.4945,-0.3616,-0.2532,0.1177,-0.7636,-0.4189,-0.6975,0.5231,-0.4719,-1.4452,-1.9456,-1.2234,2.0246,-0.3074,-0.0362,0.1296,-0.4188,0.3069,-0.2851,-0.3208,-0.1021,-0.5201,-0.6207,-0.3487,1.1303,-0.3576,0.6653,0.4828,0.8055,-0.184,-0.9426,1.7727,0.7867,-0.4474,-0.6231,-0.5358,0.5036,0.835,-0.2651,-0.4629,0.3817,0.0875,-0.289,0.6903,-0.9221,0.33,1.8219,1.0502,-0.353,-0.1802,-0.8591,-0.6292,-0.5942,-2.5858,-0.3621,-0.583,-0.2573,0.1948,-0.4194,-0.4907,-1.3036,0.21,-0.1591,-0.4003,-0.371,-0.0438,2.0938,0.2646,0.0186,-0.189,-0.0809,-0.4059,-0.3374,-0.4543,-0.3371,0.8051,0.2564,-0.2168,-1.4257,-0.5312,-1.4128,-0.3405,-0.3162,-0.3514,0.9744,-0.0248,0.2114,-0.5251,0.1403,0.14,-0.441,0.1259,-0.4417,-0.1917,0.8318,-0.2778,-0.3657,-0.1055,0.4891,-0.3427,-1.1636,-0.6611,-0.8704,-0.3593,0.0688,-0.3619,0.8247,0.5761,-1.2823,0.608,-1.6274,-0.3537,-0.6048,-0.1507,-0.4171,0.0496,-0.3423,-0.3618,0.9861,-0.1649,-0.3557,1.334,1.5709,-0.272,-0.3538,-0.8494,-0.7398,0.6558,-0.2233,1.8634,0.1526,0.1109,-0.2496,1.0
2,266,0,1,3,0,0,2,1,1,1,1,1,41326,cat,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,1.916,0.7693,1.1722,0.7718,-0.4672,-0.7482,0.0869,0.5476,0.703,-0.4779,-0.3712,-0.3974,2.0939,0.4945,-0.3616,-0.2532,0.1177,-1.2904,-0.4189,-0.0747,-0.3101,-0.4719,-2.2685,0.5171,0.2592,-0.5185,-0.3074,-0.0528,0.2255,-0.3027,-0.4016,2.1344,-0.0555,-0.2365,-0.5201,1.3879,-0.3487,0.6367,-0.3576,0.9293,0.4828,0.8055,0.4224,0.2119,2.1406,0.8306,-0.4474,0.4482,0.6795,-1.0021,0.835,1.1804,-0.1991,-0.4045,-0.3825,1.1745,0.1369,-0.6331,2.3631,0.0946,-0.7575,-0.353,-0.1802,0.0533,-0.3794,-0.3384,-0.2296,-0.3621,-0.583,-0.5359,1.0718,-0.2984,-0.4237,0.154,0.2241,0.8935,-0.4003,-0.371,-0.026,0.2325,0.3855,-0.367,0.8819,-0.1576,-0.4059,-0.3619,-0.4543,-0.3371,0.8051,-0.1573,-0.9828,-0.3254,-0.6212,0.2401,-0.3405,1.21,-0.3514,-0.8887,-0.0248,0.2114,0.4923,0.1403,0.14,-0.5742,-0.6381,-0.4417,-0.1918,0.8318,-0.2778,-0.3657,-0.1055,0.5823,-0.3427,0.0256,-0.2847,-0.6373,-0.3593,-2.0471,-0.3619,-1.0997,1.0864,-0.9472,0.3221,1.2903,-0.3537,-0.4185,-0.1507,-0.4171,-0.0121,-0.3423,-0.3618,0.6966,-0.1649,-0.3557,0.4474,-0.5345,0.9667,-0.3538,0.0752,0.1796,0.1398,-0.2233,0.106,-0.4287,-0.5579,-0.2496,3.0
1,152,0,2,1,2,0,1,1,1,1,1,2,41401,dog,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,-0.1619,-0.5691,1.4647,-0.7447,2.4369,1.0021,0.5909,3.2138,1.1869,2.2347,3.1264,-0.3974,-0.0044,-1.9733,3.3099,-0.2532,-0.0565,-0.2286,3.0073,-0.723,-0.3969,2.0447,0.527,0.7336,-0.0858,0.5223,0.984,0.3938,0.1355,-0.3317,-0.205,-0.1971,-0.4391,0.7891,1.6741,1.3053,0.575,-0.4703,2.5027,-0.3081,-0.7675,-0.7548,0.5065,-0.0138,0.331,-0.6482,1.2962,-1.5099,0.3361,-0.6651,-0.9728,1.4695,0.1455,-0.7559,-0.7703,1.4672,1.0491,-1.0206,0.8606,0.2317,0.2455,2.2585,0.8094,-1.3268,0.3676,0.0138,-0.3566,1.1961,1.6893,0.4136,-0.7321,-0.3311,1.9911,-0.3562,0.1958,1.0839,0.3541,1.6841,-0.774,-0.5886,1.6328,0.8432,0.8943,0.1625,-0.012,1.0615,2.0402,0.7785,-0.7534,0.4209,-0.0512,-1.0589,-0.5513,1.0129,3.7023,1.5178,4.2284,-0.4402,-0.0248,-0.93,3.1376,0.1403,0.14,0.612,-1.2829,0.9768,-0.1917,-1.2222,2.1856,2.4835,-0.1055,-1.9006,0.6543,-0.3708,-0.3942,-0.922,2.049,0.0229,1.7956,0.1053,0.5221,0.1797,0.0979,0.548,0.6377,-0.0664,-0.1507,1.634,-0.0087,2.5849,2.5268,-0.5337,-0.1649,1.1839,-0.271,-0.392,1.0117,2.1532,-1.3362,0.2886,0.1418,-0.2233,0.106,-0.3022,-0.6351,-0.2496,3.0


text,target
"xxbos(2) xxmaj(4) maya(2040) is(14) the(13) queen(2592) in(20) the(13) house(96) .(8) xxmaj(4) she(17) 's(55) beautiful(222) ,(10) sweet(179) and(9) looks(421) blur(3844) at(30) times(456) .(8) xxmaj(4) when(78) you(24) hug(1466) her(18) ,(10) it(59) 's(55) like(95) hugging(5395) a(12) soft(478) feathered(9375) xxunk(0) .(8) xxmaj(4) always(263) ready(262) for(15) hugs(1683) and(9) kisses(1867) ,(10) this(49) lady(593) xxunk(0) in(20) tickling(9298) on(53) her(18) stomach(2471) and(9) neck(1385) .(8) xxmaj(4) she(17) will(38) lean(2917) over(324) to(11) ask(630) for(15) a(12) kiss(2635) on(53) her(18) nose(1081) and(9) seldom(1681) make(230) any(148) noise(1135) .(8) xxmaj(4) feeds(2046) only(93) on(53) dry(551) kibbles(401) .(8) xxmaj(4) easy(474) maintenance(2114) .(8) p(782) /(56) s(470) :(41) xxmaj(4) no(90) cage(322) confinement(2099) please(31) ,(10) except(654) in(20) incidences(4072) of(19) illness(1492) or(51) injuries(1510) .(8)",1.0
xxbos(2) xxmaj(4) he(21) was(40) nearly(1649) became(1740) a(12) road(346) kill(1941) before(197) me(34) and(9) my(32) wife(1272) and(9) also(145) some(172) xxmaj(4) good(60) xxmaj(4) samaritans(4307) help(161) to(11) rescue(357) him(42) .(8) xxmaj(4) very(26) affectionate(338) little(124) guy(846) !(43),3.0
"xxbos(2) xxmaj(4) these(140) kittens(101) were(151) born(264) at(30) my(32) grandmother(4719) 's(55) yard(1595) ,(10) last(358) week(325) i(16) found(71) the(13) mother(142) dead(1514) ,(10) maybe(955) hit(706) by(61) a(12) car(272) around(125) 5(146) minutes(2732) before(197) i(16) saw(596) her(18) .(8) xxmaj(4) the(13) kittens(101) are(22) almost(476) 2(62) months(109) old(65) .(8) xxmaj(4) they(39) know(219) how(273) to(11) drink(1094) milk(457) by(61) themselves(2035) .(8) xxmaj(4) just(126) need(123) to(11) clean(405) them(37) up(87) every(486) time(113) they(39) poop(1614) .(8) xxmaj(4) really(163) need(123) someone(165) to(11) adopt(64) as(45) i(16) already(159) have(33) 7(286) cats(98) at(30) home(25) and(9) the(13) milking(5380) mother(142) is(14) not(46) keen(1063) on(53) giving(400) them(37) milk(457) .(8)",1.0
"xxbos(2) xxmaj(4) found(71) him(42) a(12) drain(443) 2(62) weeks(180) ago(261) .(8) xxmaj(4) super(365) playful(76) and(9) gets(430) along(283) great(302) with(23) other(112) cats(98) .(8) xxmaj(4) very(26) smart(314) and(9) never(372) messes(4199) up(87) the(13) house(96) .(8) xxmaj(4) he(21) also(145) knows(431) how(273) to(11) use(464) the(13) litter(160) box(271) and(9) is(14) not(46) used(429) to(11) be(27) alone(411) ((44) especially(749) at(30) xxunk(0) u(209) adopt(64) him(42) and(9) find(135) out(155) that(57) he(21) is(14) not(46) really(163) the(13) kitten(84) u(209) want(196) ,(10) u(209) may(249) always(263) return(517) him(42) back(168) to(11) me(34) .(8) xxmaj(4) pls(158) email(250) me(34) if(35) interested(77) :)(183)",3.0
"xxbos(2) 20(812) xxmaj(4) april(791) update(480) :(41) minpin(6178) is(14) on(53) trial(3401) adoption(54) minpin(6178) was(40) found(71) on(53) a(12) busy(578) road(346) and(9) is(14) being(169) put(310) up(87) for(15) adoption(54) since(195) no(90) one(89) claimed(2650) her(18) .(8) xxmaj(4) she(17) 's(55) now(85) believed(1486) to(11) be(27) abandoned(208) due(217) to(11) her(18) cataract(4259) .(8) xxmaj(4) due(217) to(11) her(18) early(1410) cataract(4259) which(213) is(14) probably(692) xxunk(0) inherited(5286) ,(10) she(17) can(47) only(93) see(293) xxunk(0) from(66) her(18) right(330) eye(504) although(769) she(17) has(58) good(60) vision(7216) from(66) her(18) left(254) eye(504) .(8) xxmaj(4) this(49) would(116) caused(1805) her(18) to(11) panic(4944) and(9) start(945) protecting(5532) herself(824) by(61) xxunk(0) if(35) there(130) 's(55) any(148) sudden(4807) movement(3658) .(8) xxmaj(4) however(364) ,(10) when(78) she(17) knows(431) you(24) ,(10) get(150) used(429) to(11) your(111) voice(1433) ,(10) she(17) will(38) be(27) as(45) adorable(152) as(45) a(12) kitten(84) .(8) xxmaj(4) in(20) addition(1503) ,(10) due(217) to(11) her(18) cataract(4259) problem(498) and(9) her(18) not(46) being(169) well(128) socialized(3891) ,(10) she(17) will(38) start(945) barking(1489) whenever(952) there(130) 's(55) dogs(121) or(51) cats(98) near(256) her(18) so(63) she(17) may(249) not(46) be(27) suitable(395) to(11) multi(3954) -(29) pet(149) homes(378) or(51) has(58) to(11) be(27) monitored(4853) when(78) around(125) other(112) pets(356) .(8) xxmaj(4) as(45) she(17) was(40) likely(1515) abandoned(208) by(61) her(18) owner(92) and(9) probably(692) had(166) to(11) fight(1370) for(15) food(105) while(317) on(53) the(13) streets(449) ,(10) she(17) is(14) still(178) protective(1241) over(324) her(18) food(105) while(317) she(17) 's(55) eating(556) ,(10) so(63) it(59) 's(55) best(389) not(46) to(11) disturb(2506) her(18) when(78) there(130) 's(55) food(105) in(20) front(455) of(19) her(18) .(8) xxmaj(4) behavior(2074) :(41) -(29) xxmaj(4) she(17) 's(55) toilet(241) trained(137) ,(10) will(38) only(93) pee(640) and(9) poo(711) outside(336) the(13) house(96) .(8) -(29) xxmaj(4) she(17) can(47) walk(432) off(532) leash(718) ,(10) she(17) will(38) follow(407) the(13) xxunk(0) sound(1750) when(78) you(24) want(196) her(18) to(11) follow(407) .(8) -(29) healthy(94) and(9) fit(1260) ,(10) loves(119) to(11) be(27) with(23) humans(463) .(8) -(29) xxmaj(4) her(18) personality(764) and(9) attitude(2103) is(14) more(138) suitable(395) for(15) a(12) family(122) that(57) do(80) not(46) have(33) other(112) pets(356) but(52) the(13) adopting(328) family(122) has(58) to(11) be(27) patient(960) when(78) dealing(4880) with(23) minpin(6178) and(9) she(17) will(38) eventually(2282) listen(1812) and(9) be(27) an(106) extreme(4796) cute(115) girl(206) when(78) she(17) gets(430) to(11) know(219) you(24) better(376) .(8) xxmaj(4) the(13) adopter(110) should(342) be(27) active(102) on(53) xxmaj(4) facebook(1183) so(63) that(57) we(36) can(47) get(150) pictorial(6292) updates(886) on(53) the(13) dog(48) and(9) forgo(9587) the(13) need(123) for(15) physical(1978) visits(1239) to(11) check(433) on(53) her(18) .(8) xxmaj(4) by(61) the(13) way(462) ,(10) minpin(6178) was(40) xxunk(0) in(20) xxmaj(4) star(2737) newspaper(1720) on(53) 16(1642) xxmaj(4) february(1927) under(321) the(13) xxmaj(4) dog(48) xxmaj(4) talk(1462) xxunk(0) .(8) xxmaj(4) thank(173) you(24) ,(10) xxmaj(4) ellen(6117) xxmaj(4) xxunk(0) !(43) xxmaj(4) if(35) you(24) 're(277) interested(77) to(11) adopt(64) her(18) ,(10) please(31) sent(546) a(12) message(557) ((44) via(555) petfinder.my(3042) )(28) and(9) tell(836) me(34) about(133) yourself(563) especially(749) your(111) dog(48) ownership(2531) experience(439) and(9) do(80) expect(1675) an(106) interview(1282) .(8)",3.0


# Compare to tabular data

In [53]:
# itemlist: list of itembase, which contains independent variables
data = TabularList.from_df(train, path=path, cat_names=cat_names, cont_names=cont_names, procs=procs)

In [54]:
type(data)

fastai.tabular.data.TabularList

In [40]:
item_lists = data.split_by_idx(val_idxs[0]) # train ItemList and validation itemList

In [42]:
type(item_lists.valid) # there is no label yet (e.g. item_lists.valid.y)

fastai.tabular.data.TabularList

In [48]:
label_lists = item_lists.label_from_df(cols=dep_var,label_cls = FloatList) # add y label -> label lists

In [49]:
type(label_lists.valid) # this is essentially Pytorch Dataset, will be fed to model's forward pass

fastai.data_block.LabelList

In [50]:
print(type(label_lists.valid.y))
print(type(label_lists.valid.x))

<class 'fastai.data_block.FloatList'>
<class 'fastai.tabular.data.TabularList'>


In [55]:
data_tab = get_databunch()

In [32]:
data_tab.show_batch(ds_type=DatasetType.Valid)

Type,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,FurLength,Vaccinated,Dewormed,Sterilized,Health,State,m1_label_description,Is_Sterilized,Is_Free,IsMandarin,IsMalay,image_size_mean_na,metadata_metadata_color_score_MEAN_na,metadata_metadata_crop_conf_SUM_na,image_size_std_na,metadata_metadata_color_pixelfrac_MEAN_na,metadata_metadata_annots_score_MEAN_na,metadata_metadata_crop_importance_MEAN_na,image_size_sum_na,height_sum_na,sentiment_score_na,sentiment_document_score_na,metadata_metadata_crop_conf_MEAN_na,metadata_metadata_color_pixelfrac_SUM_na,metadata_metadata_color_score_SUM_na,metadata_metadata_crop_importance_SUM_na,height_std_na,width_sum_na,metadata_metadata_annots_score_SUM_na,width_mean_na,sentiment_document_magnitude_na,height_mean_na,sentiment_magnitude_na,width_std_na,image_size_mean,State_Pop,Quantity,TFIDF_metadata_annots_top_desc_15,State_count,RescuerID_age_mean,m1_vertex_x,TFIDF_Description_3,TFIDF_metadata_annots_top_desc_11,RescuerID_m1LabelScore_sum,metadata_metadata_color_score_MEAN,State_m1_vertex_y_mean,Breed1_m1_vertex_y_mean,TFIDF_sentiment_entities_3,potential for weight gain,metadata_metadata_crop_conf_SUM,friendly toward strangers,affectionate with family,TFIDF_metadata_annots_top_desc_1,TFIDF_sentiment_entities_1,TFIDF_Description_8,Breed1_AdoptionSpeed_mean,intensity,TFIDF_sentiment_entities_9,image_size_std,State_Density,TFIDF_Description_2,State_RescuerID_COUNT_sum,TFIDF_sentiment_entities_12,energy level,incredibly kid friendly dogs,easy to train,TFIDF_Description_0,metadata_metadata_color_pixelfrac_MEAN,TFIDF_metadata_annots_top_desc_12,metadata_metadata_annots_score_MEAN,TFIDF_Description_7,metadata_metadata_crop_importance_MEAN,TFIDF_metadata_annots_top_desc_5,Breed1_RescuerID_COUNT_mean,image_size_sum,TFIDF_Description_5,PhotoAmt,TFIDF_metadata_annots_top_desc_10,tolerates cold weather,TFIDF_metadata_annots_top_desc_3,Age,State_GDP,height_sum,State_Area,Breed1_Age_mean,Breed1_RescuerID_nunique,Breed1_count,TFIDF_sentiment_entities_13,m1_bounding_confidence,tolerates being alone,TFIDF_sentiment_entities_8,TFIDF_sentiment_entities_15,TFIDF_Description_1,good for novice owners,sentiment_score,sentiment_document_score,TFIDF_Description_4,TFIDF_Description_15,TFIDF_Description_14,TFIDF_metadata_annots_top_desc_13,adapts well to apartment living,metadata_metadata_crop_conf_MEAN,TFIDF_metadata_annots_top_desc_8,exercise needs,TFIDF_metadata_annots_top_desc_7,tendency to vocalize,tendency to bark or howl,RescuerID_m1VertexY_mean,name_length,TFIDF_metadata_annots_top_desc_14,TFIDF_metadata_annots_top_desc_6,Breed1_RescuerID_COUNT_sum,intelligence,TFIDF_Description_9,TFIDF_Description_12,TFIDF_sentiment_entities_2,TFIDF_sentiment_entities_7,metadata_metadata_color_pixelfrac_SUM,wanderlust potential,metadata_metadata_color_score_SUM,RescuerID_Fee_Mean,State_m1_label_score_mean,State_m1_label_score_sum,prey drive,RescuerID_breed1_nunique,TFIDF_sentiment_entities_11,tolerates hot weather,size,m1_dominant_blue,TFIDF_sentiment_entities_6,TFIDF_sentiment_entities_5,State_RescuerID_nunique,m1_dominant_red,metadata_metadata_crop_importance_SUM,height_std,Fee,TFIDF_sentiment_entities_10,width_sum,TFIDF_metadata_annots_top_desc_2,State_Age_mean,TFIDF_Description_10,m1_dominant_green,potential for playfulness,RescuerID_m1LabelScore_mean,general health,sensitivity level,metadata_metadata_annots_score_SUM,width_mean,sentiment_document_magnitude,Breed1_m1_label_score_mean,m1_vertex_y,m1_bounding_importance,height_mean,State_RescuerID_COUNT_mean,TFIDF_Description_6,TFIDF_metadata_annots_top_desc_0,pet friendly,amount of shedding,potential for mouthiness,kid friendly,RescuerID_Fee_Sum,dog friendly,sentiment_magnitude,easy to groom,Breed1_m1_label_score_sum,VideoAmt,TFIDF_sentiment_entities_0,RescuerID_COUNT,m1_dominant_score,TFIDF_sentiment_entities_14,TFIDF_metadata_annots_top_desc_9,width_std,TFIDF_Description_13,RescuerID_IsFree_Mean,m1_dominant_pixel_frac,m1_label_score,TFIDF_sentiment_entities_4,TFIDF_Description_11,TFIDF_metadata_annots_top_desc_4,drooling potential,target
2,265,0,2,6,0,0,2,2,2,2,2,1,41326,cat,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,-0.6207,0.8318,-0.3974,0.3437,0.8051,0.3578,0.106,-0.255,-0.1649,-0.3692,0.5295,0.4828,-0.6611,0.6612,-0.3616,0.5871,-0.4171,-0.4779,1.4372,-0.3582,1.3544,-0.6975,-0.3593,0.2029,-0.3003,-0.583,-0.2367,0.835,0.6081,-0.3657,-0.371,-0.3619,-1.0823,0.3228,-0.0046,0.173,0.1319,-0.0248,0.0344,-0.8529,0.0306,-0.2549,0.6022,0.7317,-0.3537,-0.1507,-0.4719,-0.5201,0.1765,0.2114,-0.4907,-0.6231,-0.8591,0.6769,0.1403,-0.3423,1.0712,-0.0067,0.1388,-0.353,-0.6921,-0.7934,-0.4783,-0.4454,0.4258,-0.5318,-0.3405,-0.1055,-1.107,-0.3618,-0.4922,-0.2233,-0.3487,-0.6292,-1.1636,2.7438,0.0299,-0.8704,-0.4474,-0.4375,-0.6448,-0.2633,-2.1347,0.7101,-0.3514,0.7023,-0.3371,0.1177,0.8055,-0.3538,-0.1105,0.3268,-0.3576,-0.3427,-0.7754,3.1878,-0.6531,0.7693,-1.0693,0.5891,-0.6207,-0.2778,-0.4767,0.5431,-0.0607,-0.1802,0.4375,-0.8293,-0.4672,0.2,-0.4543,-0.3712,0.6044,0.0207,-0.8706,0.608,-0.794,0.14,-1.0736,0.7718,0.0041,-0.7828,-0.2496,-0.4417,-0.3621,-0.2532,-0.4003,-0.3557,-0.8303,-0.4189,-0.8494,-0.1649,-0.192,-0.3704,-0.3429,-0.3161,-2.4276,-0.8797,-0.9616,0.4945,0.0303,0.2183,-0.3981,-0.6733,-0.1087,-0.3074,1.0
2,265,0,3,1,6,7,1,2,2,2,3,1,41326,cat,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,0.4637,0.8318,3.6141,-0.1767,0.8051,-0.5742,1.8634,-0.681,0.8607,-0.4189,0.7655,0.4828,-0.6611,1.0284,-0.3616,-0.8757,-0.4171,-0.4779,0.204,-0.0885,0.7019,-0.6975,-0.3593,2.134,-0.2419,-0.583,-0.3692,0.835,0.7245,-0.3657,-0.371,-0.3619,0.5217,0.7125,0.0995,2.4888,-1.0765,-0.0248,-0.3374,-0.8529,-0.6528,1.3525,-0.8433,-0.198,-0.3537,-0.1506,-0.4719,-0.5201,-0.7792,0.2114,-0.4907,-0.6231,-0.8591,0.1229,0.1403,-0.3423,-2.059,-0.6482,-0.4924,-0.353,0.4081,0.8099,0.1877,0.042,-1.374,0.4751,-0.3405,-0.1055,0.1822,-0.3618,0.293,-0.2233,-0.3487,0.9475,1.8753,-1.5543,-0.6008,-0.8704,-0.4474,-0.4062,-0.3706,0.5191,0.9921,-0.7812,-0.3514,-0.836,-0.3371,0.1177,0.8055,-0.3538,-0.5044,0.4531,-0.3576,-0.3427,-0.699,-0.0304,0.026,0.7693,-1.1127,-0.8743,-0.6207,-0.2778,-1.4085,-0.6451,-0.0326,-0.1802,0.5516,-0.8891,-0.4672,0.2419,-0.4543,-0.3712,-0.8588,2.2813,0.2639,0.608,0.7797,0.14,0.9175,0.7718,-0.4818,-1.321,-0.2496,-0.4417,-0.3621,-0.2532,-0.4003,-0.3557,0.238,-0.4189,-0.8494,-0.1649,-0.1917,-0.4188,0.0609,-1.1138,0.6537,-0.1974,-0.9848,0.4945,0.0245,0.1945,-0.4286,-0.0007,-0.0028,-0.3074,1.0
2,266,0,2,2,0,0,1,1,1,1,1,1,41326,cat,True,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,-0.3995,0.8318,-0.3974,0.2022,0.8051,-0.5076,-0.6262,0.0565,-0.1629,-0.3986,0.5448,0.4828,-0.2847,-0.162,-0.3616,-0.8757,-0.4171,-0.4779,1.1361,-0.6953,0.6579,-0.0747,-0.3593,-0.0133,-0.2419,-0.583,-0.1304,0.835,0.1268,-0.3657,-0.371,-0.3619,-1.3586,0.491,-0.0025,-0.4226,0.5189,-0.0248,1.0894,-0.7482,-0.7621,-0.567,-0.8433,-0.0717,-0.3537,-0.1507,-0.253,-0.5201,-0.8333,0.2114,-0.4237,0.4482,0.0533,0.0153,0.1403,-0.3423,0.0327,-0.0561,0.2345,-0.353,2.2419,-0.2833,-0.2455,-0.1314,-0.6529,-1.0581,-0.3405,-0.1055,-2.5701,-0.3618,-0.1511,-0.2233,-0.3487,-0.1453,-0.2387,1.8355,1.511,-0.6373,-0.4474,-0.4636,-0.383,-0.9716,-0.2103,-0.7978,-0.3514,-0.8444,0.2856,0.1177,0.8055,-0.3538,-0.5044,0.145,-0.3576,-0.3427,0.4168,-0.1872,-0.0948,0.7693,0.5229,-0.8743,-0.6207,-0.2778,-0.128,-0.8747,0.1421,-0.1802,0.5216,0.6666,-0.4672,0.2531,-0.4543,-0.3712,-0.8748,-0.9212,-0.587,0.3221,0.1053,0.14,0.0642,0.7718,0.1654,-1.0142,-0.2496,-0.4417,-0.3621,-0.2532,-0.2472,-0.3557,-0.6268,-0.4189,0.0752,-0.1649,-0.1922,-0.3994,0.5054,-0.028,2.2284,-0.1974,0.7347,-2.0504,0.1629,0.2209,-0.1168,1.8687,-0.143,-0.3074,3.0
2,266,0,3,1,2,7,1,1,2,2,2,1,41326,cat,False,True,False,True,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,True,False,False,False,True,False,True,True,-0.7777,0.8318,2.2769,-0.0641,0.8051,-0.441,-1.3585,1.2022,-1.9524,-0.4189,-1.0954,0.4828,-0.2847,-0.165,-0.3616,-0.8757,-0.4171,-0.4779,0.9779,-0.6999,0.5773,-0.0747,-0.3593,-0.0131,-0.2419,-0.583,-0.3268,0.835,0.1284,-0.3657,-0.371,-0.3619,-0.2182,0.6969,0.0291,0.3232,0.087,-0.0248,1.1023,-0.7482,-0.81,-0.6903,-0.8433,-4.0559,-0.3537,-0.1507,-0.5266,-0.5201,-0.8333,0.2114,-0.4237,0.4482,0.0533,0.0164,0.1403,-0.3423,0.0338,-0.0561,-0.2912,-0.353,0.0414,-0.2104,-0.5393,0.8651,0.1334,-0.0049,-0.3405,-0.1055,3.5829,-0.3618,-1.6291,-0.2233,-0.3487,-0.4185,-1.1636,-1.0196,2.6744,-0.6373,-0.4474,-0.7988,-0.8588,-0.9773,-0.2123,-0.7824,-0.3514,-0.9066,-0.3371,0.1177,0.8055,-0.3538,-0.4059,0.1456,-0.3576,-0.3427,-1.2798,-0.188,-0.0952,0.7693,-1.5035,-0.8743,-0.6207,-0.2778,-0.1286,-0.9422,0.0029,-0.1802,1.0498,-1.3678,-0.4672,0.2372,-0.4543,-0.3712,-0.8707,-1.8631,-0.2466,0.3221,0.1053,0.14,0.0642,0.7718,0.017,-0.9989,-0.2496,-0.4417,-0.3621,-0.2532,-0.4003,-0.3557,-0.2199,-0.4189,0.0752,-0.1649,5.2294,-0.4188,-0.121,-0.0274,-0.3712,-0.1974,0.3563,0.4945,0.7223,0.2136,-0.1166,-0.9175,-0.1544,-0.3074,4.0
2,285,251,1,3,0,0,3,2,1,1,1,1,41326,cat,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,-0.9143,0.8318,-0.3974,-0.2083,0.8051,0.6874,-0.6262,-0.4528,0.0745,0.1676,0.9369,0.4828,-2.1551,1.4577,-0.3616,-0.2906,0.9503,1.1497,1.1796,-0.2472,0.4262,-2.0008,-0.3593,-0.2688,-0.3672,-0.583,0.1225,0.835,-0.5328,-0.3657,-0.371,-0.3619,0.5457,0.1754,0.0489,0.3805,0.8744,-0.0248,1.0159,-0.8897,-0.5733,-0.5306,-0.2651,1.1315,-0.3537,-0.1507,1.3882,-0.5201,-0.2563,0.2114,0.5684,-1.3688,-1.2672,-0.3503,0.1403,-0.3423,0.2395,-0.2652,-0.6604,-0.353,-0.3254,0.5913,1.3644,-1.0257,0.4527,1.1492,-0.3405,-0.1055,-2.1509,-0.3618,-0.8963,3.2499,-0.3487,0.0344,-0.1066,-2.0212,1.4045,-0.9417,-0.4474,1.47,0.6201,-0.0057,-0.4137,-0.2354,-0.3514,-0.1797,1.3044,0.1177,0.8055,-0.3538,0.5788,0.8179,-0.3576,-0.3427,-0.1334,-0.9264,0.9332,0.7693,0.7545,-0.289,-0.6207,-0.2778,-0.1631,-0.4696,0.1412,-0.1802,-0.0362,0.6965,0.6944,0.1861,1.2087,-0.3712,-0.2777,-0.9212,0.4341,-0.0821,0.1053,0.14,0.0642,0.7718,0.286,-1.0039,1.9573,3.1045,-0.3621,1.8372,5.7554,-0.3557,0.4923,-0.4189,-1.2769,-0.1649,-0.1917,0.1519,0.7169,0.3234,-4.4118,-0.8797,1.2082,-1.5636,0.2207,0.227,-0.7475,-0.6147,-0.2055,-0.3074,2.0


In [33]:
data_tab.show_batch(ds_type=DatasetType.Test)

Type,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,FurLength,Vaccinated,Dewormed,Sterilized,Health,State,m1_label_description,Is_Sterilized,Is_Free,IsMandarin,IsMalay,image_size_mean_na,metadata_metadata_color_score_MEAN_na,metadata_metadata_crop_conf_SUM_na,image_size_std_na,metadata_metadata_color_pixelfrac_MEAN_na,metadata_metadata_annots_score_MEAN_na,metadata_metadata_crop_importance_MEAN_na,image_size_sum_na,height_sum_na,sentiment_score_na,sentiment_document_score_na,metadata_metadata_crop_conf_MEAN_na,metadata_metadata_color_pixelfrac_SUM_na,metadata_metadata_color_score_SUM_na,metadata_metadata_crop_importance_SUM_na,height_std_na,width_sum_na,metadata_metadata_annots_score_SUM_na,width_mean_na,sentiment_document_magnitude_na,height_mean_na,sentiment_magnitude_na,width_std_na,image_size_mean,State_Pop,Quantity,TFIDF_metadata_annots_top_desc_15,State_count,RescuerID_age_mean,m1_vertex_x,TFIDF_Description_3,TFIDF_metadata_annots_top_desc_11,RescuerID_m1LabelScore_sum,metadata_metadata_color_score_MEAN,State_m1_vertex_y_mean,Breed1_m1_vertex_y_mean,TFIDF_sentiment_entities_3,potential for weight gain,metadata_metadata_crop_conf_SUM,friendly toward strangers,affectionate with family,TFIDF_metadata_annots_top_desc_1,TFIDF_sentiment_entities_1,TFIDF_Description_8,Breed1_AdoptionSpeed_mean,intensity,TFIDF_sentiment_entities_9,image_size_std,State_Density,TFIDF_Description_2,State_RescuerID_COUNT_sum,TFIDF_sentiment_entities_12,energy level,incredibly kid friendly dogs,easy to train,TFIDF_Description_0,metadata_metadata_color_pixelfrac_MEAN,TFIDF_metadata_annots_top_desc_12,metadata_metadata_annots_score_MEAN,TFIDF_Description_7,metadata_metadata_crop_importance_MEAN,TFIDF_metadata_annots_top_desc_5,Breed1_RescuerID_COUNT_mean,image_size_sum,TFIDF_Description_5,PhotoAmt,TFIDF_metadata_annots_top_desc_10,tolerates cold weather,TFIDF_metadata_annots_top_desc_3,Age,State_GDP,height_sum,State_Area,Breed1_Age_mean,Breed1_RescuerID_nunique,Breed1_count,TFIDF_sentiment_entities_13,m1_bounding_confidence,tolerates being alone,TFIDF_sentiment_entities_8,TFIDF_sentiment_entities_15,TFIDF_Description_1,good for novice owners,sentiment_score,sentiment_document_score,TFIDF_Description_4,TFIDF_Description_15,TFIDF_Description_14,TFIDF_metadata_annots_top_desc_13,adapts well to apartment living,metadata_metadata_crop_conf_MEAN,TFIDF_metadata_annots_top_desc_8,exercise needs,TFIDF_metadata_annots_top_desc_7,tendency to vocalize,tendency to bark or howl,RescuerID_m1VertexY_mean,name_length,TFIDF_metadata_annots_top_desc_14,TFIDF_metadata_annots_top_desc_6,Breed1_RescuerID_COUNT_sum,intelligence,TFIDF_Description_9,TFIDF_Description_12,TFIDF_sentiment_entities_2,TFIDF_sentiment_entities_7,metadata_metadata_color_pixelfrac_SUM,wanderlust potential,metadata_metadata_color_score_SUM,RescuerID_Fee_Mean,State_m1_label_score_mean,State_m1_label_score_sum,prey drive,RescuerID_breed1_nunique,TFIDF_sentiment_entities_11,tolerates hot weather,size,m1_dominant_blue,TFIDF_sentiment_entities_6,TFIDF_sentiment_entities_5,State_RescuerID_nunique,m1_dominant_red,metadata_metadata_crop_importance_SUM,height_std,Fee,TFIDF_sentiment_entities_10,width_sum,TFIDF_metadata_annots_top_desc_2,State_Age_mean,TFIDF_Description_10,m1_dominant_green,potential for playfulness,RescuerID_m1LabelScore_mean,general health,sensitivity level,metadata_metadata_annots_score_SUM,width_mean,sentiment_document_magnitude,Breed1_m1_label_score_mean,m1_vertex_y,m1_bounding_importance,height_mean,State_RescuerID_COUNT_mean,TFIDF_Description_6,TFIDF_metadata_annots_top_desc_0,pet friendly,amount of shedding,potential for mouthiness,kid friendly,RescuerID_Fee_Sum,dog friendly,sentiment_magnitude,easy to groom,Breed1_m1_label_score_sum,VideoAmt,TFIDF_sentiment_entities_0,RescuerID_COUNT,m1_dominant_score,TFIDF_sentiment_entities_14,TFIDF_metadata_annots_top_desc_9,width_std,TFIDF_Description_13,RescuerID_IsFree_Mean,m1_dominant_pixel_frac,m1_label_score,TFIDF_sentiment_entities_4,TFIDF_Description_11,TFIDF_metadata_annots_top_desc_4,drooling potential,target
1,307,0,1,1,0,0,2,2,2,2,2,1,41326,dog like mammal,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,-0.371,0.8318,-0.3974,-0.858,0.6552,0.6004,0.106,1.3962,0.2637,0.9816,0.3344,0.4544,0.6234,-1.424,-0.3616,-0.2906,-0.4171,-0.4779,-0.8879,0.2615,0.133,0.8526,-0.3593,-1.2768,-0.5005,-0.583,-0.0025,0.47,-0.1634,-0.3657,-0.371,-0.3619,-0.8116,0.3851,-3.0038,0.5257,0.4789,-0.0248,-1.6294,0.3484,-0.367,-1.129,-0.2651,-0.2332,-0.3537,-0.1507,-0.4719,-0.5201,-0.4005,0.2114,-0.3374,0.8953,0.9258,2.1902,0.1403,-0.3423,-0.1563,-0.7071,0.2721,-0.353,1.5084,0.0811,0.0775,0.0198,-0.3961,0.3488,-0.3405,-0.1055,-0.249,-0.3618,-0.2005,-0.2233,-0.3487,0.1297,-0.503,0.3059,0.8264,0.9038,-0.4474,1.1644,-0.1164,0.9771,-0.1664,-0.1882,-0.3514,-0.2483,1.2211,0.1079,0.6553,-0.3538,1.4651,-2.2018,-0.3576,-0.3427,-1.1423,0.7117,3.0051,0.7345,-1.4456,-0.289,1.0515,1.7751,-1.7194,-0.3346,-0.9765,-0.3302,-0.5824,-1.278,-0.4672,0.1032,-0.4543,-0.3712,-0.2753,-0.2933,-0.3034,-0.0502,-0.794,0.14,-0.6943,0.3571,-0.1979,1.2564,-0.2496,-0.4417,-0.3621,-0.2532,13.5854,-0.3557,-0.3216,-0.4189,0.9178,-0.1649,-0.192,0.974,0.8105,0.7573,-0.0349,0.0852,-0.1787,-1.5042,-0.0169,0.1295,2.181,-0.0694,-0.3075,-0.3074,0
2,266,0,1,2,7,0,2,1,1,1,1,1,41326,cat,True,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,-0.6725,0.8318,-0.3974,0.2022,0.6552,0.6004,0.106,0.5281,-0.1629,0.9816,1.1741,0.4544,-0.2544,-0.4078,-0.3616,-0.8757,-0.4171,-0.4779,1.1361,3.2258,0.6227,-0.1303,-0.3593,-0.3678,-0.2419,-0.583,0.0884,0.47,-0.1784,-0.3657,-0.371,-0.3619,-0.6646,1.2418,-0.0025,-0.196,0.4113,-0.0248,1.0894,-0.788,-0.7966,0.3811,-0.8433,-0.0717,-0.3537,-0.1507,0.7317,-0.5201,-0.9479,0.2114,-0.4328,0.4347,0.0427,0.3011,0.1403,-0.3423,-0.1009,0.3402,1.672,-0.353,-1.0589,-0.9392,1.4378,-1.6418,-0.2801,-1.0581,-0.3405,-0.1055,-2.5701,-0.3618,-0.1511,-0.2233,-0.3487,0.1297,-0.3708,1.8355,1.511,-0.6687,-0.4474,-0.7522,3.086,-2.1455,0.0279,-0.7415,-0.3514,-0.8205,1.2211,0.1079,0.6553,-0.3538,1.4651,0.197,-0.3576,-0.3427,-1.1423,-0.212,-0.3952,0.7345,-1.3588,-0.8743,-0.6207,-0.2778,0.3877,-0.8072,0.1421,-0.3302,-2.8384,-1.293,-0.4672,0.1032,-0.4543,-0.3712,-0.8736,0.0207,-1.0976,0.3092,-1.3245,0.14,-1.7448,0.3571,-0.1236,-1.0142,-0.2496,-0.4417,-0.3621,-0.2532,13.5854,-0.3557,-1.0847,-0.4189,0.0635,-0.1649,-0.192,0.974,0.7538,0.0218,2.2284,-0.1974,2.7361,-1.5042,0.9165,0.2216,-0.5776,-0.6904,-0.143,-0.3074,0
2,266,0,2,7,0,0,2,1,1,1,1,1,41326,cat,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,0.0097,0.8318,-0.3974,-0.0727,0.6552,0.6004,0.106,-0.4563,-0.3099,0.9816,0.9371,0.4544,-0.2544,1.15,-0.3616,-0.8757,-0.4171,-0.4779,1.616,1.9341,-1.3058,-0.1303,-0.3593,-0.1416,-0.2419,-0.583,-0.3218,0.47,0.2099,-0.3657,-0.371,-0.3619,0.4304,-0.0893,-0.0607,-0.5348,0.2112,-0.0248,-1.3249,-0.788,-0.7103,-1.268,-0.8433,0.2945,-0.3537,-0.1507,0.5129,-0.5201,-0.9054,0.2114,-0.4328,0.4347,0.0427,0.4747,0.1403,-0.3423,0.6185,0.1256,0.72,-0.353,-0.6921,-0.6477,-0.57,-0.245,0.3549,0.0898,-0.3405,-0.1055,0.2593,-0.3618,0.387,-0.2233,-0.3487,0.1297,-0.1066,-1.2202,-1.9742,-0.6687,-0.4474,-0.7799,-0.4564,-1.0667,-0.7343,-0.8414,-0.3514,-0.8295,1.2211,0.1079,0.6553,-0.3538,1.4651,0.153,-0.3576,-0.3427,1.1505,-1.3543,0.7846,0.7345,1.0874,-0.8743,-0.6207,1.7751,-0.1785,-0.8072,-0.2598,-0.3302,1.3921,1.2051,-0.4672,0.1032,-0.4543,-0.3712,-0.8754,0.0207,-0.3034,0.3092,-0.794,0.14,-1.0736,0.3571,0.0046,-0.6136,-0.2496,-0.4417,-0.3621,-0.2532,13.5854,-0.3557,-0.3216,-0.4189,0.0635,-0.1649,-0.1918,0.974,0.6868,-0.0665,0.52,-0.1974,-0.1873,-1.5042,-0.1472,0.2091,-1.1432,-0.4474,0.0151,-0.3074,0
2,266,252,2,1,6,7,2,1,1,1,1,1,41326,cat,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,1.3099,0.8318,-0.3974,-0.0641,0.6552,0.6004,1.8634,0.8843,-1.9524,0.9816,-0.9119,0.4544,-0.2544,-0.0465,-0.3616,-0.8757,-0.4171,-0.4779,0.9779,0.127,0.2113,-0.1303,-0.3593,-0.3014,-0.2419,-0.583,-0.211,0.47,-0.6719,-0.3657,-0.371,-0.3619,-0.386,0.1361,0.0291,0.041,0.195,-0.0248,1.1023,-0.788,-0.5457,-0.3977,-0.8433,-4.0559,-0.3537,-0.1507,-0.3077,-0.5201,-0.8152,0.2114,-0.4328,0.4347,0.0427,-0.153,0.1403,-0.3423,-0.0635,0.1834,0.9424,-0.353,0.0414,0.2269,0.3613,-0.3136,-0.3758,-0.0049,-0.3405,-0.1055,3.5829,-0.3618,-1.6291,-0.2233,-0.3487,0.1297,-0.3708,-1.0196,2.6744,-0.6687,-0.4474,-0.0413,-0.7296,-0.8543,0.1569,-0.8245,-0.3514,-0.8997,1.2211,0.1079,0.6553,-0.3538,1.4651,0.9474,-0.3576,-0.3427,-0.5156,0.0475,-0.0755,0.7345,-0.5337,-0.8743,-0.6207,1.0908,0.0472,-0.6451,0.0029,-0.3302,0.3582,-0.3356,-0.4672,0.1032,-0.4543,-0.3712,-0.8723,2.2813,0.037,0.3092,0.3301,0.14,0.3487,0.3571,-0.06,-0.9989,-0.2496,-0.4417,-0.3621,-0.2532,13.5854,-0.3557,0.0345,-0.4189,0.0635,-0.1649,-0.1921,0.974,-0.2634,0.2176,-0.3712,-0.1974,0.2007,-1.5042,0.0634,0.1788,-0.0827,-0.294,-0.1544,-0.3074,0
1,307,0,2,1,2,7,2,1,1,1,1,1,41326,dog breed,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,-0.4882,0.8318,-0.3974,-4.1919,0.6552,0.6004,-0.4798,0.9757,-0.0452,0.9816,-2.3545,0.4544,0.6234,-0.1984,-0.3616,-0.8757,-0.4171,-0.4779,-0.8894,-0.1251,-0.5454,0.8526,-0.3593,-0.9602,-0.2419,-0.583,-0.2831,0.47,1.8819,-0.3657,-0.371,-0.3619,-0.3422,-2.1271,2.2889,-0.5256,-0.3505,-0.0248,-0.052,0.3484,-0.7733,0.3575,-0.8433,0.033,-0.3537,-0.1507,-0.253,-0.5201,-0.7756,0.2114,-0.3374,0.8953,0.9258,-0.0357,0.1403,-0.3423,-0.2526,1.5968,-0.2414,-0.353,-0.3254,-0.1375,-0.444,0.7522,0.4512,-1.766,-0.3405,-0.1055,-0.1738,-0.3618,-0.07,-0.2233,-0.3487,0.1297,0.0256,-0.3715,0.0275,0.9038,-0.4474,0.3497,-0.1114,0.6047,-0.2552,-0.9942,-0.3514,-0.9544,1.2211,0.1079,0.6553,-0.3538,1.4651,1.7128,-0.3576,-0.3427,0.0806,-0.4978,-0.4795,0.7345,0.1755,-0.8743,-0.6207,1.7751,1.2234,-0.8612,1.6379,-0.3302,0.0669,0.7713,-0.4672,0.1032,-0.4543,-0.3712,-0.8754,-0.7328,-0.4736,-0.0502,0.8247,0.14,0.9744,0.3571,-0.3434,0.572,-0.2496,-0.4417,-0.3621,-0.2532,13.5854,-0.3557,-0.5251,-0.4189,0.9178,-0.1649,-0.192,0.974,-0.676,0.4775,-0.0693,-0.1974,-0.433,-1.5042,-0.4211,0.046,1.172,0.1673,-1.7202,-0.3074,0


In [56]:
x1,y1 = data_tab.one_batch(ds_type=DatasetType.Train)

In [57]:
#label
y1,len(y1)

# each batch contains 2 columns: cats, conts
len(x1),len(x1[0]),len(x1[1]),x1[0].shape,x1[1].shape

x1[0][0] #cats

x1[1][0] #conts (normalized)

(tensor([3., 2., 2., 3., 3., 1., 2., 3., 1., 2., 2., 4., 4., 4., 4., 0., 4., 1.,
         2., 4., 4., 1., 2., 2., 2., 3., 2., 4., 2., 4., 4., 1., 2., 2., 2., 2.,
         2., 4., 4., 4., 1., 0., 4., 1., 4., 4., 3., 1., 4., 3., 4., 3., 2., 2.,
         1., 2., 2., 4., 2., 3., 2., 4., 4., 0.]), 64)

(2, 64, 64, torch.Size([64, 42]), torch.Size([64, 145]))

tensor([  2, 156,   1,   2,   2,   7,   1,   2,   1,   2,   1,   2,   1,   5,
         11,   1,   2,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1])

tensor([-0.8772,  0.3806, -0.4752, -0.2254, -0.4003, -0.6432, -0.1776, -1.1442,
        -0.2383, -0.2890, -0.1921,  0.0399, -0.1515, -0.1222, -1.7294, -0.6351,
         0.0047, -0.3074, -0.3371, -0.1649, -0.0869, -0.3537,  1.7047, -0.3509,
        -0.3481,  0.0039, -0.2233, -0.2274, -1.2558,  0.0348, -0.2906, -0.2756,
        -0.0248,  2.0888, -0.4779,  0.2517, -0.7411,  0.1933, -1.0589, -0.2842,
         0.7111,  0.7377, -0.2677,  0.2045, -0.1867, -0.2487,  0.1403,  0.6815,
        -0.0841, -0.8285, -0.4189,  0.2257, -0.0235,  0.5070, -0.3538,  0.2559,
        -0.8663, -0.1055, -0.1869, -0.6207,  2.9088, -1.1089, -0.4543, -0.4417,
        -0.1507, -0.6268, -0.4719, -0.0248, -0.3423, -0.9304, -0.4091, -1.5054,
        -0.3657, -0.3514, -0.1035, -0.4544, -0.0917, -1.6352, -0.3712,  2.0422,
         0.1400, -0.5742, -0.5056, -0.8795,  0.4945, -0.3621, -1.2210,  0.4495,
         0.1493,  0.1636, -0.3557, -0.3619, -1.0004, -1.7584,  0.8247, -0.3427,
        -0.3074,  1.5305, -0.4672, -0.80

# Compare to text databunch

In [41]:
txt_data = get_text_databunch(bs=64)

In [37]:
type(txt_data)

fastai.text.data.TextClasDataBunch

In [38]:
type(txt_data.valid_ds.x)

fastai.text.data.TextList

In [44]:
txt_data.show_batch(ds_type = DatasetType.Valid)

text,target
xxbos xxmaj for more pics go here : 9th xxmaj april edit : i have xxunk the adoption fee to xxup rm to include the fee i paid her breeder . xxmaj the reason i am now making reimbursement of the breeder 's fee compulsory is to help out xxmaj malaysian xxmaj dogs xxmaj deserve xxmaj better ( xxup mddb ) - they are currently in xxunk financial straits (,3.0
xxbos xxmaj profile xxmaj xxunk : xxmaj name : xxmaj mama xxmaj kin xxmaj age : xxmaj adult cat . xxmaj probably 1 year and half . xxmaj condition : xxmaj healthy and xxmaj pregnant ( xxmaj probably will deliver in early xxmaj may ) . xxmaj believed that this is not the first time she is pregnant . xxmaj characteristics : xxmaj adorable . xxmaj soft . xxmaj very,2.0
"xxbos xxmaj my people kept many of us . xxmaj we were all semi long haired . i loved them because they were kind and fed us well . xxmaj the small ones would sometimes make us toys from rolled up news paper and bits of string . xxmaj they also let us out of the cage we lived in and i enjoyed running and chasing , climbing and rolling",4.0
"xxbos [ xxmaj adopted 17 xxmaj november by xxmaj xxunk t ] "" i 'm a cute young standard xxmaj english bull terrier with lots of love to give . i love snuggling up for cuddles and belly rubs . xxmaj canine buddies make me super happy , i really love to play ! xxmaj like most of my xxunk , i 'm a xxunk xxunk ready to xxunk you",2.0
xxbos xxmaj please send an sms before calling if possible because i might not pick up your calls when i 'm busy and might not call back unknown missed calls . xxmaj or you can send e - mail . xxmaj these two kittens are xxmaj beel and xxunk . xxmaj they are kids of on of the stray cats we feed . xxmaj we picked them up from the,2.0


In [40]:
type(txt_data.valid_ds.x[0])

fastai.text.data.Text

In [41]:
txt_data.valid_ds.x[0]

Text xxbos healthy and active , feisty kitten found in neighbours ' garden . xxmaj not sure of sex .

In [48]:
txt_data.valid_ds.x[0].data

array([   2,   94,    9,  102,   10, 2058,   84,   71,   20, 1026,  232,  786,    8,    4,   46,  301,   19, 1939,
          8])

In [42]:
x1,y1 = txt_data.one_batch(ds_type=DatasetType.Train)
#label
y1,len(y1)
# each batch contains 1 column: text_ids

x1.shape

(tensor([4., 3., 2., 1., 2., 3., 3., 4., 4., 2., 4., 3., 4., 2., 1., 1., 4., 4.,
         4., 3., 2., 2., 4., 1., 3., 1., 1., 2., 3., 1., 1., 3., 3., 2., 4., 2.,
         4., 2., 1., 2., 4., 4., 4., 2., 4., 4., 0., 2., 1., 4., 4., 4., 4., 0.,
         4., 4., 4., 0., 2., 3., 1., 2., 2., 4.]), 64)

torch.Size([64, 1560])

In [43]:
x1[0][-50:]

tensor([2693,   13,  149,   44,    9,  132,   28,   19,  148, 1888, 1923,    8,
           4,   36,   75,   72,   19,  134, 1192,    9,  616,   37,   95,  134,
         290,  465,   10,   52,   36,  145, 3286,   37,  382, 7640,    8,    4,
          36,  815,   95,   24,   11,  352,   19,   37,   45,  111,  290,  465,
         139,    8])

In [95]:
x1,y1 = txt_data.one_batch(ds_type=DatasetType.Valid)
#label
y1,len(y1)
# each batch contains 1 column: text_ids

x1.shape

(tensor([3., 2., 4., 2., 2., 4., 2., 3., 4., 3., 1., 4., 3., 1., 2., 2., 3., 1.,
         3., 3., 4., 4., 2., 1., 4., 1., 4., 4., 3., 3., 1., 4.]), 32)

32

torch.Size([32, 1347])

In [98]:
x1,y1 = txt_data.one_batch(ds_type=DatasetType.Test)
#label
y1,len(y1)
# each batch contains 1 column: text_ids

x1.shape

(tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]), 32)

32

torch.Size([32, 4])