In [None]:
# default_exp data

In [None]:
# hide
%load_ext autoreload
%autoreload 2

In [None]:
# hide
from nbdev import *

In [None]:
# export
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import gc
import typing
from math import isclose
from typing import Sequence, Union, Tuple

In [None]:
# hide
from nn4tab.test_utils import fake_data

# Data

## Processors

In [None]:
# export
class TabularProc:
    """Base class for tabular preprocessing steps.

    Only `encode` is mandatory for subclasses; the other hooks do nothing
    unless overridden.
    """
    # Presumably an ordering hint for pipelines; nothing in this class reads it.
    _order = 1
    # Class-level default of the "setup has run" flag.
    isset = False

    def setup(self):
        """Hook for gathering whatever state the processor needs; no-op here."""
        return None

    def checkup(self):
        """Validation hook that subclasses call at the start of `setup`; no-op here."""
        return None

    def encode(self, x):
        """Apply the transformation to `x`. Must be provided by subclasses."""
        raise NotImplementedError

    def decode(self, x):
        """Hook for reverting `encode`; no-op here (returns None)."""
        return None

In [None]:
df, cont_names, cat_names = fake_data(preproc=False)
test_df = df.copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   cont_0  1000 non-null   float32
 1   cont_1  1000 non-null   float32
 2   cont_2  1000 non-null   float32
 3   cont_3  1000 non-null   float32
 4   cont_4  1000 non-null   float32
 5   cat_0   1000 non-null   object 
 6   cat_1   1000 non-null   object 
 7   targ    1000 non-null   float32
dtypes: float32(6), object(2)
memory usage: 39.2+ KB


### ProcPipeline

In [None]:
# hide
class ProcPipeline:
    """Holds a sequence of processors and sets them up one after another."""
    def __init__(self, procs:Sequence[TabularProc]):
        self.procs = procs
        # Becomes True once `setup` has run over every processor.
        self.isset = False

    def setup(self, data):
        """Call `setup(data)` on each processor in the order given.

        `data` is forwarded unchanged as the single argument of each
        processor's `setup`. The `isset` flag is raised only after all
        processors have been set up without an exception.
        """
        for proc in self.procs:
            proc.setup(data)
        self.isset = True

### Normalize proc

In [None]:
# export
class Normalize(TabularProc):
    """
    Normalizes continuous features to zero mean and unit variance.

    `setup` records per-column statistics; `encode` and `decode` then
    overwrite the given columns of the DataFrame in place and return None.
    """
    def setup(self, df:pd.DataFrame, cont_names:Sequence):
        """Store mean and std for columns in cont_names"""
        self.checkup()
        self.mean = {col: df[col].mean() for col in cont_names}
        self.std = {col: df[col].std() for col in cont_names}
        # The flag inherited from TabularProc is spelled `isset`; the previous
        # spelling `is_set` is kept as well so existing readers keep working.
        self.isset = self.is_set = True

    def encode_one(self, df:pd.DataFrame, col:str):
        """Return column `col` shifted by its stored mean and divided by its stored std.

        Note: a column whose stored std is 0 is divided by zero here.
        """
        return (df[col] - self.mean[col])/self.std[col]

    def encode(self, df:pd.DataFrame, cont_names:Sequence):
        """Replace every column in `cont_names` with its normalized version (in place)."""
        for col in cont_names:
            df[col] = self.encode_one(df, col)

    def decode_one(self, df:pd.DataFrame, col:str):
        """Return column `col` mapped back to the original scale (inverse of `encode_one`)."""
        return df[col]*self.std[col] + self.mean[col]

    def decode(self, df:pd.DataFrame, cont_names:Sequence):
        """Replace every column in `cont_names` with its de-normalized version (in place)."""
        for col in cont_names:
            df[col] = self.decode_one(df, col)

In [None]:
def print_stat(df, cont_names=cont_names):
    """Print one line per column with the column's mean and standard deviation.

    The default for `cont_names` is the value the notebook-level `cont_names`
    had at the moment this function was defined.
    """
    # Lazy generator: each line is still computed right before it is printed.
    report = (f'{col}: mean= {df[col].mean():.4f}, std = {df[col].std():.4f}' for col in cont_names)
    for line in report:
        print(line)

In [None]:
norm = Normalize()
norm.setup(test_df, cont_names)

In [None]:
print_stat(test_df)

cont_0: mean=-1.6134, std =2.5976
cont_1: mean=4.8120, std =2.7086
cont_2: mean=-1.8440, std =2.4517
cont_3: mean=2.8474, std =1.4774
cont_4: mean=3.6936, std =2.9721


In [None]:
norm.encode(test_df, cont_names)

In [None]:
print_stat(test_df)

cont_0: mean=0.0000, std =1.0000
cont_1: mean=0.0000, std =1.0000
cont_2: mean=-0.0000, std =1.0000
cont_3: mean=0.0000, std =1.0000
cont_4: mean=-0.0000, std =1.0000


In [None]:
for x in test_df[cont_names].mean():
    assert isclose(x, 0, abs_tol=1e-5)

In [None]:
for x in test_df[cont_names].std():
    assert isclose(x, 1, abs_tol=1e-5)

In [None]:
norm.decode(test_df, cont_names)

In [None]:
print_stat(test_df)

cont_0: mean= -1.6134, std = 2.5976
cont_1: mean= 4.8120, std = 2.7086
cont_2: mean= -1.8440, std = 2.4517
cont_3: mean= 2.8474, std = 1.4774
cont_4: mean= 3.6936, std = 2.9721


In [None]:
for x in (test_df[cont_names] - df[cont_names]).abs().sum():
    assert x < 1e-4

### FillMissing proc

In [None]:
# export
class FillMissing(TabularProc):
    """Fills missing values in continuous columns

    Parameters
    ----------
    add_bool : when True, `encode` adds an int8 indicator column `<col>_na`
        for every processed column that contains missing values.
    method : statistic used as the fill value, `'mean'` or `'median'`.
    """
    def __init__(self, add_bool=True, method='mean'):
        self.add_bool = add_bool
        self.method = method

    def setup(self, df:pd.DataFrame, cont_names:Sequence, cat_names:Sequence):
        """Compute the per-column fill values from `df` and remember the column names.

        Raises
        ------
        ValueError
            If `method` is not one of the supported statistics. Previously an
            unknown method left `self.values` undefined and `encode` failed
            later with an AttributeError.
        """
        self.checkup()
        if self.method == 'mean':
            self.values = {col:df[col].mean() for col in cont_names}
        elif self.method == 'median':
            self.values = {col:df[col].median() for col in cont_names}
        else:
            raise ValueError(f"Unsupported fill method {self.method!r}; expected 'mean' or 'median'")
        self.cont_names = cont_names
        self.cat_names = cat_names  # stored only; not used by this class
        self.isset = True

    def encode(self, df:pd.DataFrame, cont_names:Sequence=None):
        """Fill missing values of `cont_names` in `df` in place.

        Falls back to the columns given to `setup` when `cont_names` is None
        or empty. Columns without missing values are left untouched and get
        no indicator column.
        """
        # Explicit None/length test so array-like column collections work too.
        if cont_names is None or len(cont_names) == 0:
            cont_names = self.cont_names
        for col in cont_names:
            na_mask = df[col].isna()
            if not na_mask.any():
                continue
            if self.add_bool:
                df[f'{col}_na'] = na_mask.astype(np.int8)
            # Assign the result back: `df[col].fillna(..., inplace=True)` works on
            # an intermediate object and does not update `df` under copy-on-write.
            df[col] = df[col].fillna(self.values[col])

    def decode(self, *args, **kwargs):
        """No-op: filled values are not restored to missing."""
        pass

In [None]:
df, cont_names, cat_names = fake_data(nons=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   cont_0  883 non-null    float32
 1   cont_1  899 non-null    float32
 2   cont_2  912 non-null    float32
 3   cont_3  902 non-null    float32
 4   cont_4  894 non-null    float32
 5   cat_0   891 non-null    float64
 6   cat_1   902 non-null    float64
 7   targ    1000 non-null   float32
dtypes: float32(6), float64(2)
memory usage: 39.2 KB


In [None]:
fillproc = FillMissing()
fillproc.setup(test_df, cont_names, cat_names)

In [None]:
fillproc.encode(df)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   cont_0     1000 non-null   float32
 1   cont_1     1000 non-null   float32
 2   cont_2     1000 non-null   float32
 3   cont_3     1000 non-null   float32
 4   cont_4     1000 non-null   float32
 5   cat_0      891 non-null    float64
 6   cat_1      902 non-null    float64
 7   targ       1000 non-null   float32
 8   cont_0_na  1000 non-null   int8   
 9   cont_1_na  1000 non-null   int8   
 10  cont_2_na  1000 non-null   int8   
 11  cont_3_na  1000 non-null   int8   
 12  cont_4_na  1000 non-null   int8   
dtypes: float32(6), float64(2), int8(5)
memory usage: 44.1 KB


In [None]:
# One check covers every continuous column; the previous loop repeated this
# same whole-frame assertion once per column without using the loop variable.
assert not df[cont_names].isna().any().any()

### Categorify proc

In [None]:
# export
def _catlist(s:pd.Series):
    c = set(s)
    c.discard('#na')
    return ['#na'] + list(c)

In [None]:
# export
class Categorify(TabularProc):
    """Numericalizes categorical columns.

    `setup` builds one category list per column (with `'#na'` at position 0);
    `encode` and `decode` overwrite the given columns of the DataFrame in
    place and return None.
    """
    def setup(self, df:pd.DataFrame, cat_names:Sequence):
        """Collect the category list of every column in `cat_names` from `df`."""
        self.checkup()
        self.cat = {col: _catlist(df[col].dropna()) for col in cat_names}
        # NOTE(review): this maps column name -> position of the column in
        # `self.cat`, which does not match the name `i2c`; left unchanged
        # because nothing in this class reads it — confirm the intent.
        self.i2c = {c: i for i, c in enumerate(self.cat)}
        self.isset = True

    def encode_one(self, df:pd.DataFrame, col:str):
        """Return the integer codes of `df[col]`; missing values become `'#na'` (code 0).

        Values absent from the stored category list get pandas' code -1.
        """
        # Read from the `df` argument (not a notebook global) and keep its index
        # so that assigning the result back to `df` aligns row by row.
        filled = df[col].fillna('#na')
        return pd.Series(pd.Categorical(filled, categories=self.cat[col]), index=df.index).cat.codes

    def encode(self, df:pd.DataFrame, cat_names:Sequence):
        """Replace every column in `cat_names` with its integer codes (in place)."""
        for col in cat_names:
            df[col] = self.encode_one(df, col)

    def decode_one(self, df:pd.DataFrame, col:str):
        """Return the labels for the integer codes in `df[col]` as a categorical Series.

        Code 0 decodes to the string `'#na'`, not to a missing value.
        """
        labels = pd.Categorical.from_codes(df[col], categories=self.cat[col])
        # Keep the caller's index for the same alignment reason as in `encode_one`.
        return pd.Series(labels, index=df.index)

    def decode(self, df:pd.DataFrame, cat_names:Sequence):
        """Replace every column in `cat_names` with its decoded labels (in place)."""
        for col in cat_names:
            df[col] = self.decode_one(df, col)

In [None]:
df, cont_names, cat_names = fake_data(preproc=False, nons=True)
test_df = df.copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   cont_0  883 non-null    float32
 1   cont_1  899 non-null    float32
 2   cont_2  912 non-null    float32
 3   cont_3  902 non-null    float32
 4   cont_4  894 non-null    float32
 5   cat_0   891 non-null    object 
 6   cat_1   901 non-null    object 
 7   targ    1000 non-null   float32
dtypes: float32(6), object(2)
memory usage: 39.2+ KB


In [None]:
cproc = Categorify()
cproc.setup(test_df, cat_names)

In [None]:
cproc.cat

{'cat_0': ['#na', 'C', 'A', 'B'], 'cat_1': ['#na', 'C', 'A', 'B']}

In [None]:
cproc.encode(test_df, cat_names)

In [None]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   cont_0  883 non-null    float32
 1   cont_1  899 non-null    float32
 2   cont_2  912 non-null    float32
 3   cont_3  902 non-null    float32
 4   cont_4  894 non-null    float32
 5   cat_0   1000 non-null   int8   
 6   cat_1   1000 non-null   int8   
 7   targ    1000 non-null   float32
dtypes: float32(6), int8(2)
memory usage: 25.5 KB


In [None]:
for col in cat_names:
    assert sum(test_df.loc[df[col].isna(), col]) == 0
    assert np.issubdtype(test_df[col].dtype, np.integer)

In [None]:
cproc.decode(test_df, cat_names)

In [None]:
for col in cat_names:
    assert (df.loc[df[col].notna(), col] == test_df.loc[df[col].notna(), col]).all()

## Dataset and dataloader

In [None]:
# export
def cont_cat_split(df, dep_var=None, max_card=np.inf):
    """Split the columns of `df` into continuous and categorical name lists.

    Parameters
    ----------
    df : DataFrame whose columns are classified.
    dep_var : a single column name or a collection of column names to leave
        out of both lists (the dependent variable(s)).
    max_card : integer columns with more than `max_card` distinct values are
        treated as continuous.

    Returns
    -------
    (cont, cat) : two lists of column names in the order of `df.columns`.
        Float columns are always continuous; everything that is neither float
        nor a high-cardinality integer column is categorical.
    """
    # Normalise `dep_var` to a list so one or several targets are handled alike.
    if dep_var is None:
        dep_cols = []
    elif isinstance(dep_var, str) or not np.iterable(dep_var):
        dep_cols = [dep_var]
    elif isinstance(dep_var, tuple) and dep_var in df.columns:
        dep_cols = [dep_var]  # a tuple that is itself a column label
    else:
        dep_cols = list(dep_var)

    cont, cat = [], []
    for col in df.columns:
        if col in dep_cols:
            continue
        dtype = df[col].dtype
        # pandas' dtype predicates also accept extension dtypes (category,
        # nullable ints, ...), on which np.issubdtype raises TypeError.
        if pd.api.types.is_float_dtype(dtype):
            cont.append(col)
        elif pd.api.types.is_integer_dtype(dtype) and len(df[col].unique()) > max_card:
            cont.append(col)
        else:
            cat.append(col)
    return cont, cat

In [None]:
# export
class TabularDataset(Dataset):
    """Dataset over a DataFrame that yields `(categorical, continuous, target)` arrays.

    Parameters
    ----------
    df : source DataFrame; it is referenced, not copied.
    cat_names, cont_names, dep_var : column names of the categorical,
        continuous and dependent variables.
    procs : accepted but not used by this class.
    """

    def __init__(self, df:pd.DataFrame, cat_names:Sequence, cont_names:Sequence, dep_var:Sequence, procs=None):
        self.data = df
        self.cat = cat_names
        self.cont = cont_names
        self.dep_var = dep_var

    def __getitem__(self, idx):
        """Return the row(s) at position `idx` as three NumPy arrays.

        Categorical values are int64, continuous and target values float32.
        """
        cat_part = self.data[self.cat].iloc[idx]
        cont_part = self.data[self.cont].iloc[idx]
        targ_part = self.data[self.dep_var].iloc[idx]
        # np.int64 instead of `np.long`: that alias was removed in NumPy 1.24
        # and, where it exists, its width depends on the platform.
        return (cat_part.to_numpy(dtype=np.int64),
                cont_part.to_numpy(dtype=np.float32),
                targ_part.to_numpy(dtype=np.float32))

    def __len__(self):
        """Number of rows in the underlying DataFrame."""
        return len(self.data)

In [None]:
# export
def get_dsets(df:pd.DataFrame, cat_names:Sequence, cont_names:Sequence, dep_var:Sequence, splits=None, stratify=True):
    """Build the training and validation `TabularDataset`s from `df`.

    Parameters
    ----------
    df : full DataFrame.
    cat_names, cont_names, dep_var : column names passed on to `TabularDataset`.
    splits : optional pair `(train, valid)` of row selectors. Each selector
        may be a boolean mask, a slice, or a collection of integer row
        positions. When omitted (or empty), a random 80/20 split is made.
    stratify : for the random split only, stratify on the first column in
        `dep_var`.

    Returns
    -------
    (train_ds, valid_ds) : tuple of two `TabularDataset`s.
    """
    def _rows(split):
        # Slices and boolean masks keep the original `df[...]` row selection.
        if isinstance(split, slice):
            return df[split]
        if np.asarray(split).dtype == bool:
            return df[split]
        # Everything else is taken as integer row positions; plain `df[...]`
        # would look such values up as column labels instead.
        return df.iloc[list(split)]

    # Explicit None/length test: `if splits:` is ambiguous for NumPy arrays.
    if splits is not None and len(splits) > 0:
        train_df, valid_df = _rows(splits[0]), _rows(splits[1])
    else:
        s = df[dep_var[0]] if stratify else None
        train_df, valid_df = train_test_split(df, test_size=0.2, stratify=s)
    return (TabularDataset(train_df, cat_names, cont_names, dep_var),
            TabularDataset(valid_df, cat_names, cont_names, dep_var))

In [None]:
# export
def get_dl(ds, bs=512, train=True, drop_last=True):
    """Wrap `ds` in a `DataLoader`.

    `bs` is the batch size, `train` decides whether batches are shuffled, and
    `drop_last` is handed to the `DataLoader` unchanged.
    """
    loader_options = dict(batch_size=bs, shuffle=train, drop_last=drop_last)
    return DataLoader(ds, **loader_options)

In [None]:
# hide
from nbdev.export import notebook2script
notebook2script()

Converted 00_core.ipynb.
Converted 00a_test_utils.ipynb.
Converted 01_data.ipynb.
Converted 02_model.ipynb.
Converted 03_learner.ipynb.
Converted index.ipynb.
