In [None]:
#|default_exp data.tabular

# Time Series Tabular Data

>Main tabular functions used throughout the library. These are helpful when you have additional data that accompanies your time series, such as metadata or time series features.

In [None]:
#|export
from fastai.tabular.all import *
from tsai.imports import *
from tsai.utils import *

In [None]:
#|export
@delegates(TabularPandas.__init__)
def get_tabular_ds(df, procs=[Categorify, FillMissing, Normalize], cat_names=None, cont_names=None, y_names=None, groupby=None,
                   y_block=None, splits=None, do_setup=True, inplace=False, reduce_memory=True, device=None, **kwargs):
    """Build a `TabularPandas` object from the columns of `df` named in the arguments.

    Args:
        df: source DataFrame.
        procs: procs handed to `TabularPandas`.
        cat_names, cont_names, y_names, groupby: column name(s); each is passed through `str2list`.
            `None` means no columns of that kind.
        y_block: block used for the target. When it is `None` and `y_names` is given, `CategoryBlock()` is
            chosen if any target column is not numeric, otherwise `RegressionBlock()`. An explicitly passed
            `y_block` is kept as-is. It is reset to `None` when `y_names` is `None`.
        splits, do_setup, inplace, reduce_memory: forwarded unchanged to `TabularPandas`.
        device: forwarded to `TabularPandas`; defaults to `default_device()` when `None`.
        **kwargs: accepted but not used by this function.

    Returns:
        The `TabularPandas` object, with an extra `groupby` attribute holding the `groupby` column list.
    """
    device = ifnone(device, default_device())
    groupby = str2list(groupby)
    cat_names = str2list(cat_names)
    cont_names = str2list(cont_names)
    y_names = str2list(y_names)
    # De-duplicate while keeping first-seen order, so the selected column order is the same on every run
    # (a plain `set` gives an order that can change between interpreter sessions).
    cols = list(dict.fromkeys(c for _cols in (groupby, cat_names, cont_names, y_names) if _cols is not None for c in _cols))
    if y_names is None:
        # Without target columns a target block is meaningless
        y_block = None
    elif y_block is None:
        # Infer the block from the dtype of the target columns
        num_cols = df._get_numeric_data().columns
        y_block = CategoryBlock() if any(n not in num_cols for n in y_names) else RegressionBlock()
    # otherwise: keep the y_block supplied by the caller (it used to be silently overwritten with None)
    # NOTE: this changes a global pandas option and the setting persists after the call
    pd.options.mode.chained_assignment=None
    to = TabularPandas(df[cols], procs=procs, cat_names=cat_names, cont_names=cont_names, y_names=y_names, y_block=y_block,
                       splits=splits, do_setup=do_setup, inplace=inplace, reduce_memory=reduce_memory, device=device)
    to.groupby = groupby
    return to

In [None]:
#|export
@delegates(DataLoaders.__init__)
def get_tabular_dls(df, procs=[Categorify, FillMissing, Normalize], cat_names=None, cont_names=None, y_names=None, bs=64,
                    y_block=None, splits=None, do_setup=True, inplace=False, reduce_memory=True, device=None, **kwargs):
    """Create dataloaders from `df` by building a dataset with `get_tabular_ds` and calling its `dataloaders` method.

    The batch size is capped at `len(splits[0])` when `splits` is given, otherwise at `len(df)`.
    `**kwargs` are passed both to `get_tabular_ds` and to `dataloaders`.
    """
    ds_args = dict(procs=procs, cat_names=cat_names, cont_names=cont_names, y_names=y_names, y_block=y_block,
                   splits=splits, do_setup=do_setup, inplace=inplace, reduce_memory=reduce_memory, device=device)
    tab_ds = get_tabular_ds(df, **ds_args, **kwargs)
    # Never ask for a batch larger than the number of available items
    n_items = len(df) if splits is None else len(splits[0])
    return tab_ds.dataloaders(device=device, bs=min(n_items, bs), **kwargs)

In [None]:
#|export
def preprocess_df(df, procs=[Categorify, FillMissing, Normalize], cat_names=None, cont_names=None, y_names=None, sample_col=None, reduce_memory=True):
    """Run `procs` over the selected columns of `df` through a `TabularPandas` object.

    Args:
        df: source DataFrame.
        procs: procs handed to `TabularPandas`.
        cat_names, cont_names, y_names: column name(s); each is passed through `str2list`.
        sample_col: optional column name(s) taken from the original `df` and placed in front of the processed columns.
        reduce_memory: forwarded to `TabularPandas`.

    Returns:
        A tuple `(processed_df, procs)`: the column-wise concatenation of the optional `sample_col` columns with
        the `cats`, `conts` and `ys` of the `TabularPandas` object, and that object's `procs`.
    """
    cat_names, cont_names, y_names = str2list(cat_names), str2list(cont_names), str2list(y_names)
    selected = [c for names in (cat_names, cont_names, y_names) if names is not None for c in names]
    selected = list(set(selected))
    # NOTE: this changes a global pandas option and the setting persists after the call
    pd.options.mode.chained_assignment=None
    tab = TabularPandas(df[selected], procs=procs, cat_names=cat_names, cont_names=cont_names, y_names=y_names, reduce_memory=reduce_memory)
    procs = tab.procs
    parts = [tab.cats, tab.conts, tab.ys]
    if sample_col is not None:
        # Sample identifier columns come from the original frame and go first
        parts = [df[str2list(sample_col)]] + parts
    return pd.concat(parts, axis=1), procs

In [None]:
# Download the ADULT_SAMPLE dataset and load it into a DataFrame
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
# df['salary'] = np.random.rand(len(df)) # uncomment to simulate a cont dependent variable

cat_names = ['workclass', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
             'capital-gain', 'capital-loss', 'native-country']
cont_names = ['age', 'fnlwgt', 'hours-per-week']
target = ['salary']
# Random train/valid split over the row indices of `df`
splits = RandomSplitter()(range_of(df))

# NOTE(review): `device` is not defined in the visible cells; presumably it comes from the star imports above
dls = get_tabular_dls(df, cat_names=cat_names, cont_names=cont_names, y_names=target, splits=splits, bs=512, device=device)
dls.show_batch()

Unnamed: 0,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,native-country,age,fnlwgt,hours-per-week,salary
0,Private,Some-college,10.0,Divorced,Exec-managerial,Not-in-family,White,Male,0,0,United-States,48.0,190072.000005,50.0,>=50k
1,Self-emp-not-inc,Some-college,10.0,Married-civ-spouse,Sales,Husband,White,Male,0,0,United-States,72.000001,284120.002964,40.0,<50k
2,Private,Some-college,10.0,Married-civ-spouse,Protective-serv,Husband,Black,Male,0,0,United-States,72.000001,53684.002497,40.0,<50k
3,Self-emp-inc,Some-college,10.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,United-States,47.0,337049.998875,40.0,<50k
4,Private,HS-grad,9.0,Divorced,Craft-repair,Not-in-family,White,Male,0,0,United-States,46.0,207677.000707,30.0,<50k
5,Private,5th-6th,3.0,Divorced,Priv-house-serv,Unmarried,White,Female,0,0,Mexico,45.0,265082.999142,35.0,<50k
6,Private,Assoc-acdm,12.0,Never-married,Other-service,Not-in-family,White,Female,0,0,United-States,28.0,150296.001328,79.999999,<50k
7,Private,HS-grad,9.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,United-States,50.0,94080.999353,40.0,>=50k
8,Private,Assoc-voc,11.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,Germany,58.0,235624.000302,40.0,>=50k
9,Private,HS-grad,9.0,Never-married,Other-service,Unmarried,Black,Female,0,0,Japan,29.0,419721.008996,40.0,<50k


In [None]:
# Choose the metric from `dls.c`: MAE when it equals 1 (presumably a single regression output), accuracy otherwise
metrics = mae if dls.c == 1 else accuracy
learn = tabular_learner(dls, layers=[200, 100], y_range=None, metrics=metrics)
# Quick run: the saved output below shows a single epoch
learn.fit(1, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.349525,0.288922,0.866093,00:05


In [None]:
# Fetch one batch from the learner's dataloaders; the saved output below is a tuple of three tensors
# (presumably categorical codes, normalized continuous values and targets)
learn.dls.one_batch()

(tensor([[  5,  12,   9,  ...,   1,   1,  21],
         [  1,  10,  13,  ...,   1,   1,   3],
         [  5,   4,   2,  ...,   1,   1,   6],
         ...,
         [  5,   6,   4,  ...,   1,   1,  40],
         [  3,  10,  13,  ...,   1,   1,  40],
         [  5,  12,   9,  ..., 116,   1,  40]]),
 tensor([[-0.2593,  0.1234,  1.1829],
         [-0.9913, -1.4041, -0.0347],
         [-0.1129,  0.4583, -0.0347],
         ...,
         [-1.5769, -0.1989,  0.3712],
         [ 0.4727, -1.4400,  0.3712],
         [ 1.5708, -0.2222, -0.0347]]),
 tensor([[1],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [1],
         [0],
         [0],
         [0],
         [0],
         [0],
         [1],
         [0],
         [0],
         [1],
         [0],
         [0],
         [0],
         [1],
         [1],
         [1],
         [0],
         [0],
         [1],
         [1],
         [0],
         [

In [None]:
# Display the architecture of the model created by `tabular_learner`
learn.model

TabularModel(
  (embeds): ModuleList(
    (0): Embedding(10, 6)
    (1): Embedding(17, 8)
    (2): Embedding(17, 8)
    (3): Embedding(8, 5)
    (4): Embedding(16, 8)
    (5): Embedding(7, 5)
    (6): Embedding(6, 4)
    (7): Embedding(3, 3)
    (8): Embedding(117, 23)
    (9): Embedding(90, 20)
    (10): Embedding(43, 13)
  )
  (emb_drop): Dropout(p=0.0, inplace=False)
  (bn_cont): BatchNorm1d(3, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): LinBnDrop(
      (0): Linear(in_features=106, out_features=200, bias=False)
      (1): ReLU(inplace=True)
      (2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): LinBnDrop(
      (0): Linear(in_features=200, out_features=100, bias=False)
      (1): ReLU(inplace=True)
      (2): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (2): LinBnDrop(
      (0): Linear(in_features=100, out_features=2, bias=True)
   

In [None]:
# Reload the raw dataset and redefine the column lists so this example is self-contained
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
cat_names = ['workclass', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
             'capital-gain', 'capital-loss', 'native-country']
cont_names = ['age', 'fnlwgt', 'hours-per-week']
target = ['salary']
# `preprocess_df` returns the processed DataFrame together with the procs of the underlying `TabularPandas`;
# note that `df` is rebound to the processed DataFrame from here on
df, procs = preprocess_df(df, procs=[Categorify, FillMissing, Normalize], cat_names=cat_names, cont_names=cont_names, y_names=target, 
                          sample_col=None, reduce_memory=True)
df.head()

Unnamed: 0,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,native-country,age,fnlwgt,hours-per-week,salary
0,5,8,12,3,0,6,5,1,1,48,40,0.763796,-0.838084,-0.035429,1
1,5,13,14,1,5,2,5,2,101,1,40,0.397233,0.444987,0.369519,1
2,5,12,0,1,0,5,3,1,1,1,40,-0.042642,-0.886734,-0.683348,0
3,6,15,15,3,11,1,2,2,1,1,40,-0.042642,-0.728873,-0.035429,1
4,7,6,0,3,9,6,3,1,1,1,40,0.250608,-1.018314,0.774468,0


In [None]:
# Inspect what the procs stored: per-column category vocabularies (`classes`) and, presumably from `Normalize`, the means and stds
procs.classes, procs.means, procs.stds

({'workclass': ['#na#', ' ?', ' Federal-gov', ' Local-gov', ' Never-worked', ' Private', ' Self-emp-inc', ' Self-emp-not-inc', ' State-gov', ' Without-pay'],
  'education': ['#na#', ' 10th', ' 11th', ' 12th', ' 1st-4th', ' 5th-6th', ' 7th-8th', ' 9th', ' Assoc-acdm', ' Assoc-voc', ' Bachelors', ' Doctorate', ' HS-grad', ' Masters', ' Preschool', ' Prof-school', ' Some-college'],
  'education-num': ['#na#', 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0],
  'marital-status': ['#na#', ' Divorced', ' Married-AF-spouse', ' Married-civ-spouse', ' Married-spouse-absent', ' Never-married', ' Separated', ' Widowed'],
  'occupation': ['#na#', ' ?', ' Adm-clerical', ' Armed-Forces', ' Craft-repair', ' Exec-managerial', ' Farming-fishing', ' Handlers-cleaners', ' Machine-op-inspct', ' Other-service', ' Priv-house-serv', ' Prof-specialty', ' Protective-serv', ' Sales', ' Tech-support', ' Transport-moving'],
  'relationship': ['#na#', ' Husband', ' Not-in-fami

In [None]:
#|eval: false
#|hide
# Convert this notebook to library scripts; the saved output below reports the save time and the conversion result
from tsai.export import get_nb_name; nb_name = get_nb_name(locals())
from tsai.imports import create_scripts; create_scripts(nb_name)

<IPython.core.display.Javascript object>

/Users/nacho/notebooks/tsai/nbs/014_data.tabular.ipynb saved at 2023-02-19 22:11:36
Correct notebook to script conversion! 😃
Sunday 19/02/23 22:11:38 CET
