In [None]:
# default_exp test_utils

In [None]:
# hide
%load_ext autoreload
%autoreload 2

from nbdev import *

In [None]:
# export
import numpy as np
import pandas as pd
from collections import namedtuple
from pathlib import Path
import random
from math import isclose
import string
import pdb

In [None]:
# export
def cont_cat_split(df, dep_var=None, max_card=np.inf, ignore=()):
    """
    Suggest a split of the dataframe's columns into continuous and categorical ones.

    Columns named in `dep_var` or `ignore` are left out of both lists.
    The split is based on column dtype: float columns, and integer columns whose
    cardinality is greater than `max_card`, are treated as continuous; every other
    column is treated as categorical.

    Args:
        df: dataframe whose columns are classified.
        dep_var: a single column label or an iterable of labels to omit (None omits nothing).
        max_card: integer columns with more than this many unique values are continuous.
        ignore: a single column label or an iterable of labels to omit.

    Returns:
        (cont, cat): two lists of column labels, in the order they appear in `df`.
    """
    def _as_label_set(value):
        # None -> nothing to skip; a lone label (e.g. a string) -> exactly that label;
        # any other iterable -> its items. A string is never matched by substring.
        if value is None:
            return set()
        if isinstance(value, (str, bytes)) or not hasattr(value, '__iter__'):
            return {value}
        return set(value)

    skipped = _as_label_set(dep_var) | _as_label_set(ignore)
    # pandas' dtype predicates also understand extension dtypes (category, nullable
    # ints/floats, strings), which np.issubdtype cannot interpret.
    is_float = pd.api.types.is_float_dtype
    is_int = pd.api.types.is_integer_dtype

    cont, cat = [], []
    for col in df.columns:
        if col in skipped:
            continue
        dtype = df[col].dtype
        # dtype check first: the cardinality scan only runs for integer columns
        if is_float(dtype) or (is_int(dtype) and len(df[col].unique()) > max_card):
            cont.append(col)
        else:
            cat.append(col)
    return cont, cat

## Experiments

In [None]:
# Seeded generator so the exploratory cells below produce the same draws on every fresh run.
rng = np.random.default_rng(8)

In [None]:
# 10x2 array of integer codes drawn from {0, 1, 2}.
x = rng.integers(3, size=(10,2))
x

array([[2, 0],
       [0, 2],
       [0, 0],
       [1, 2],
       [1, 2],
       [0, 1],
       [1, 1],
       [1, 1],
       [0, 0],
       [1, 1]], dtype=int64)

In [None]:
# Array of the 26 uppercase ASCII letters, used as a code -> letter lookup table.
a = np.array(list(string.ascii_uppercase))

In [None]:
# Index the letter table with the code array: every integer code is replaced by its letter.
a[x]

array([['C', 'A'],
       ['A', 'C'],
       ['A', 'A'],
       ['B', 'C'],
       ['B', 'C'],
       ['A', 'B'],
       ['B', 'B'],
       ['B', 'B'],
       ['A', 'A'],
       ['B', 'B']], dtype='<U1')

In [None]:
# 10x3 sample from the standard normal distribution.
cont = rng.normal(size=(10, 3))
cont

array([[ 0.76747011, -0.05302978,  0.85979399],
       [ 1.50548116, -0.65359453,  0.61035115],
       [-0.04267383,  1.44001673, -0.83689502],
       [-0.30154661,  0.36233859,  0.25811027],
       [-1.63944796,  0.36015523, -0.1184977 ],
       [-0.23974785, -0.15530166,  0.21897171],
       [-1.81639566,  1.55246657, -0.86144167],
       [-2.24136786, -0.08197449,  1.45748042],
       [-0.51860097,  1.55127562,  1.556942  ],
       [-0.86273192, -2.46512082, -1.23518276]])

In [None]:
# Entries whose uniform draw exceeds 0.9 (about 10%) become NaN; np.where returns a
# new array, so `cont` itself is not modified.
np.where((rng.uniform(size=cont.shape) > 0.9), np.nan, cont)

array([[        nan, -0.05302978,         nan],
       [ 1.50548116, -0.65359453,  0.61035115],
       [-0.04267383,  1.44001673, -0.83689502],
       [-0.30154661,  0.36233859,  0.25811027],
       [-1.63944796,  0.36015523, -0.1184977 ],
       [-0.23974785, -0.15530166,  0.21897171],
       [-1.81639566,  1.55246657, -0.86144167],
       [-2.24136786, -0.08197449,  1.45748042],
       [-0.51860097,  1.55127562,  1.556942  ],
       [-0.86273192, -2.46512082, -1.23518276]])

## Fake data

In [None]:
# export
def fake_data(n=1000, n_cont=5, n_cat=2, task='class', preproc=True, nans=False, seed=8):
    """
    Generate a randomized tabular dataframe containing `n` samples,
    `n_cont` continuous features and `n_cat` categorical features.

    Args:
        n: number of rows.
        n_cont: number of continuous columns (named cont_0, cont_1, ...).
        n_cat: number of categorical columns (named cat_0, cat_1, ...), 3 levels each.
        task: 'class' turns the target into a 0/1 label (above / not above its mean);
            any other value leaves the raw numeric target (regression).
        preproc: if True, continuous values are drawn from a standard normal and
            categorical features are integer codes; otherwise each continuous column
            gets a random location/scale and categories are uppercase letters.
        nans: if True, roughly 10% of the feature values are replaced by NaN.
            The target never contains NaN.
        seed: seed for the random generator; the default reproduces the data this
            function has always generated.

    Returns:
        (df, cont_names, cat_names)
    """
    rng = np.random.default_rng(seed)

    # NOTE: the order of the rng draws below determines the generated values.
    loc = 0. if preproc else rng.uniform(-5, 5, size=(n_cont,))
    scale = 1. if preproc else rng.uniform(1, 5, size=(n_cont,))
    cont = rng.normal(loc, scale, size=(n, n_cont))

    # TODO: maybe support a different cardinality per categorical column
    cat = rng.integers(3, size=(n, n_cat))
    letters = np.array(list(string.ascii_uppercase))
    cat_data = cat if preproc else letters[cat]

    cont_names = [f'cont_{i}' for i in range(n_cont)]
    cat_names = [f'cat_{i}' for i in range(n_cat)]

    cont_data = np.where((rng.uniform(size=cont.shape) > 0.9), np.nan, cont) if nans else cont

    left = pd.DataFrame(cont_data, columns=cont_names, dtype=np.float32)
    right = pd.DataFrame(cat_data, columns=cat_names)
    df = left.join(right)

    if nans:
        for col in cat_names:
            drop = rng.uniform(size=len(df[col])) > 0.9
            # `where` returns an upcast copy (int -> float) instead of relying on the
            # implicit upcasting of in-place `.loc` assignment, which recent pandas
            # versions deprecate.
            df[col] = df[col].where(~drop)

    # The target is built from the NaN-free arrays, so it is always fully populated.
    df['targ'] = cont.sum(axis=1) + cat.sum(axis=1) - 1

    if task=='class':
        df['targ'] = (df['targ']>df['targ'].mean()).astype(np.float32)

    return df, cont_names, cat_names

In [None]:
# Default (preproc=True) data with NaNs injected; preview the first rows.
df, cont_names, cat_names = fake_data(nans=True)
df.head(5)

Unnamed: 0,cont_0,cont_1,cont_2,cont_3,cont_4,cat_0,cat_1,targ
0,-1.738266,-1.336643,-1.361107,-0.351617,-2.312582,2.0,1.0,0.0
1,-0.188897,-0.957229,,0.956847,1.392258,0.0,,1.0
2,0.76747,-0.05303,0.859794,1.505481,,,1.0,1.0
3,0.610351,-0.042674,1.440017,-0.836895,-0.301547,2.0,1.0,1.0
4,0.362339,0.25811,-1.639448,0.360155,-0.118498,2.0,1.0,1.0


In [None]:
# Column dtypes and non-null counts of the frame generated above.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   cont_0  883 non-null    float32
 1   cont_1  899 non-null    float32
 2   cont_2  912 non-null    float32
 3   cont_3  902 non-null    float32
 4   cont_4  894 non-null    float32
 5   cat_0   891 non-null    float64
 6   cat_1   902 non-null    float64
 7   targ    1000 non-null   float32
dtypes: float32(6), float64(2)
memory usage: 39.2 KB


In [None]:
# Inspect a single categorical column of the NaN-injected frame.
df['cat_1']

0      1.0
1      NaN
2      1.0
3      1.0
4      1.0
      ... 
995    1.0
996    2.0
997    1.0
998    1.0
999    0.0
Name: cat_1, Length: 1000, dtype: float64

In [None]:
# Raw variant (preproc=False): continuous columns get random location/scale and
# categories are letters; NaNs are injected as well.
df, cont_names, cat_names = fake_data(preproc=False, nans=True)
df.head(5)

Unnamed: 0,cont_0,cont_1,cont_2,cont_3,cont_4,cat_0,cat_1,targ
0,0.237776,4.726856,0.328851,5.035037,,,C,1.0
1,-0.16513,4.75535,1.774184,1.690559,2.819697,A,A,1.0
2,-0.801118,5.582966,-5.896749,3.399724,3.353442,C,,0.0
3,-2.345072,4.445452,-1.267434,0.292013,8.225743,A,A,0.0
4,-3.939306,-1.294417,-2.01709,4.966501,2.186796,C,B,0.0


In [None]:
# Column dtypes and non-null counts of the raw frame generated above.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   cont_0  883 non-null    float32
 1   cont_1  899 non-null    float32
 2   cont_2  912 non-null    float32
 3   cont_3  902 non-null    float32
 4   cont_4  894 non-null    float32
 5   cat_0   891 non-null    object 
 6   cat_1   901 non-null    object 
 7   targ    1000 non-null   float32
dtypes: float32(6), object(2)
memory usage: 39.2+ KB


In [None]:
# NaN injection only touches feature columns; the target must stay fully populated.
assert not df['targ'].isna().any()

# For the classification task the target is 0/1, so the mean is the share of 1s.
df['targ'].mean()

0.512

In [None]:
# Same raw data with NaNs, but with a non-'class' task: the target is left as a numeric value.
df, cont_names, cat_names = fake_data(preproc=False, nans=True, task='reg')
df.head(5)

Unnamed: 0,cont_0,cont_1,cont_2,cont_3,cont_4,cat_0,cat_1,targ
0,0.237776,4.726856,0.328851,5.035037,,,C,15.121694
1,-0.16513,4.75535,1.774184,1.690559,2.819697,A,A,9.87466
2,-0.801118,5.582966,-5.896749,3.399724,3.353442,C,,7.638265
3,-2.345072,4.445452,-1.267434,0.292013,8.225743,A,A,8.350702
4,-3.939306,-1.294417,-2.01709,4.966501,2.186796,C,B,1.902485


In [None]:
# Column dtypes and non-null counts of the regression frame generated above.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   cont_0  883 non-null    float32
 1   cont_1  899 non-null    float32
 2   cont_2  912 non-null    float32
 3   cont_3  902 non-null    float32
 4   cont_4  894 non-null    float32
 5   cat_0   891 non-null    object 
 6   cat_1   901 non-null    object 
 7   targ    1000 non-null   float64
dtypes: float32(5), float64(1), object(2)
memory usage: 43.1+ KB


In [None]:
# The regression target must not contain NaN either.
assert not df['targ'].isna().any()

# Mean of the raw numeric target.
df['targ'].mean()

8.900451063493145

In [None]:
def print_stat(df, cols=None):
    """
    Print the mean and standard deviation of each listed column of `df`.

    Args:
        df: dataframe to summarize.
        cols: iterable of column names. When omitted, the notebook-level
            `cont_names` variable is used (the original behavior).
    """
    if cols is None:
        # Backward-compatible fallback to the notebook global.
        cols = cont_names
    for col in cols:
        print(f'{col}: mean={df[col].mean():.4f}, std ={df[col].std():.4f}')

## Tests for dataframes

In [None]:
# export
def test_normalized(df, cont_names):
    """Test if all columns in cont_names of the dataframe are close to standard normal"""
    for m in df[cont_names].mean():
        assert isclose(m, 0, abs_tol=0.2), f'mean is {m}'
    for s in df[cont_names].std():
        assert isclose(s, 1, abs_tol=0.2), f'std is {s}'

In [None]:
# Positive case: preprocessed fake data is drawn from a standard normal, so the check passes.
df, cont_names, cat_names = fake_data()
dep_var = ['targ']
test_normalized(df, cont_names)

In [None]:
# Negative case: raw (preproc=False) data is not normalized, so the check must fail.
df, cont_names, cat_names = fake_data(preproc=False)
dep_var = ['targ']
try:
    test_normalized(df, cont_names)
except AssertionError:
    print('Test not passed as intended')
else:
    # Without this branch the cell would succeed silently if the check stopped detecting the problem.
    raise AssertionError('test_normalized unexpectedly passed on non-normalized data')

Test not passed as intended


In [None]:
# export
def test_nans(df, cont_names, cat_names):
    assert df[cont_names].notna().all().all(), 'There are NaNs in continiuous columns'
    assert df[cat_names].notna().all().all(), 'There are NaNs in categorical columns'

In [None]:
# Positive case: without nans=True the generated frame has no missing values.
df, cont_names, cat_names = fake_data()
dep_var = ['targ']
test_nans(df, cont_names, cat_names)

In [None]:
# Negative case: with nans=True the frame contains missing values, so the check must fail.
df, cont_names, cat_names = fake_data(nans=True)
dep_var = ['targ']
try:
    test_nans(df, cont_names, cat_names)
except AssertionError as e:
    print(e)
else:
    # Without this branch the cell would succeed silently if the check stopped detecting NaNs.
    raise AssertionError('test_nans unexpectedly passed on data containing NaNs')

There are NaNs in continiuous columns


In [None]:
# export
def test_categorical(df, cat_names):
    for col in cat_names:
        assert np.issubdtype(df[col].dtype, np.integer), f'{col} dtype is not int'

In [None]:
# Positive case: preprocessed fake data stores categories as integer codes.
df, cont_names, cat_names = fake_data()
dep_var = ['targ']
test_categorical(df, cat_names)

In [None]:
# Negative case: raw (preproc=False) data stores categories as letters, so the check must fail.
df, cont_names, cat_names = fake_data(preproc=False)
dep_var = ['targ']
try:
    test_categorical(df, cat_names)
except AssertionError as e:
    print(e)
else:
    # Without this branch the cell would succeed silently if the check stopped detecting the problem.
    raise AssertionError('test_categorical unexpectedly passed on non-numericalized data')

cat_0 dtype is not int


In [None]:
# export
def test_df_processed(df, cont_names=[], cat_names=[], dep_var=[]):
    """
    Run all dataframe checks: normalized continuous columns, no NaNs, integer categoricals.

    An empty `dep_var` falls back to ['targ']. When neither `cont_names` nor
    `cat_names` is given, both are inferred from `df` with `cont_cat_split`.
    The first failing check raises AssertionError.
    """
    dep_var = dep_var or ['targ']
    names_missing = not cont_names and not cat_names
    if names_missing:
        cont_names, cat_names = cont_cat_split(df, dep_var=dep_var)

    # Order matters for which error surfaces first.
    test_normalized(df, cont_names)
    test_nans(df, cont_names, cat_names)
    test_categorical(df, cat_names)

In [None]:
# Preprocessed fake data should satisfy all three checks at once.
df, cont_names, cat_names = fake_data()
dep_var = ['targ']
test_df_processed(df, cont_names, cat_names, dep_var)

In [None]:
# Run the checks on a CSV loaded from ./datasets (path is relative to the working directory).
# No column lists are passed, so they are inferred with cont_cat_split, omitting 'salary'.
# NOTE(review): presumably an already-preprocessed Adult dataset — confirm the file is available.
df = pd.read_csv(Path('./datasets/adult_preproc.csv'), index_col=0)
test_df_processed(df, dep_var=['salary'])

In [None]:
# hide
# Run nbdev's notebook export; the output lists the notebooks that were converted.
from nbdev.export import notebook2script
notebook2script()

Converted 00_core.ipynb.
Converted 00a_test_utils.ipynb.
Converted 01_data.ipynb.
Converted 02_model.ipynb.
Converted 03_learner.ipynb.
Converted index.ipynb.
