# Tutorial: Data preprocessing

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd

## Load UCI Data
UCI Adult data are going to be used in this tutorial. The data contain both numerical and categorical features, and the target to predict is whether a person makes over 50K a year. It is a very simple binary classification task.

In [3]:
train = pd.read_csv('kuma_utils/datasets/adult_train.csv')
test = pd.read_csv('kuma_utils/datasets/adult_test.csv')

In [4]:
train.head()

Unnamed: 0,Age,Workclass,fnlwgt,Education,Education_Num,Martial_Status,Occupation,Relationship,Race,Sex,Capital_Gain,Capital_Loss,Hours_per_week,Country,Target
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
test.head()

Unnamed: 0,Age,Workclass,fnlwgt,Education,Education_Num,Martial_Status,Occupation,Relationship,Race,Sex,Capital_Gain,Capital_Loss,Hours_per_week,Country,Target
0,25,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K
1,38,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K
2,28,Local-gov,336951.0,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K
3,44,Private,160323.0,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,>50K
4,18,,103497.0,Some-college,10.0,Never-married,,Own-child,White,Female,0.0,0.0,30.0,United-States,<=50K


In [6]:
target_dict = {
    ' <=50K': 0,
    ' >50K': 1
}

In [7]:
from xfeat import SelectCategorical, SelectNumerical
from kuma_utils.preprocessing.xfeat import Pipeline, TargetEncoder
from category_encoders import OrdinalEncoder

In [8]:
num_enc = Pipeline([
    SelectNumerical(),
])

cat_enc = Pipeline([
    SelectCategorical(exclude_cols=['Target']),
    OrdinalEncoder(handle_missing='return_nan')
], target_col='Target')

In [9]:
train_cat, train_num = cat_enc.fit_transform(train).astype(int, errors='ignore'), num_enc.fit_transform(train).astype(float)
categorical_index = list(range(train_cat.shape[1]))
train_encoded = pd.concat([train_cat, train_num], axis=1) 
train_encoded.head()

Unnamed: 0,Workclass,Education,Martial_Status,Occupation,Relationship,Race,Sex,Country,Age,fnlwgt,Education_Num,Capital_Gain,Capital_Loss,Hours_per_week
0,1.0,1,1,1.0,1,1,1,1.0,39.0,77516.0,13.0,2174.0,0.0,40.0
1,2.0,1,2,2.0,2,1,1,1.0,50.0,83311.0,13.0,0.0,0.0,13.0
2,3.0,2,3,3.0,1,1,1,1.0,38.0,215646.0,9.0,0.0,0.0,40.0
3,3.0,3,2,3.0,2,2,1,1.0,53.0,234721.0,7.0,0.0,0.0,40.0
4,3.0,1,2,4.0,3,2,2,2.0,28.0,338409.0,13.0,0.0,0.0,40.0


In [10]:
test_encoded = pd.concat([
    cat_enc.transform(test).astype(int, errors='ignore'), 
    num_enc.transform(test).astype(float)], axis=1)
test_encoded.head()

Unnamed: 0,Workclass,Education,Martial_Status,Occupation,Relationship,Race,Sex,Country,Age,fnlwgt,Education_Num,Capital_Gain,Capital_Loss,Hours_per_week
0,3.0,3,1,10.0,4,2,1,1.0,25.0,226802.0,7.0,0.0,0.0,40.0
1,3.0,2,2,9.0,2,1,1,1.0,38.0,89814.0,9.0,0.0,0.0,50.0
2,5.0,7,2,13.0,2,1,1,1.0,28.0,336951.0,12.0,0.0,0.0,40.0
3,3.0,6,2,10.0,2,2,1,1.0,44.0,160323.0,10.0,7688.0,0.0,40.0
4,,6,1,,4,1,2,1.0,18.0,103497.0,10.0,0.0,0.0,30.0


## Drop data points 

In [11]:
def random_drop(df, ratio=0.1):
    for col in df.columns:
        if col in 'Target':
            continue
        drop_num = int(df.shape[0] * ratio)
        drop_idx = np.random.choice(np.arange(df.shape[0]), drop_num)
        df[col].iloc[drop_idx] = np.nan

In [12]:
random_drop(train_encoded, 0.2)
random_drop(test_encoded, 0.2)

## Regression imputer and Simple imputer

In [13]:
from kuma_utils.preprocessing.imputer import LGBMImputer
from sklearn.impute import SimpleImputer

In [14]:
imputer = LGBMImputer(cat_features=categorical_index)
train_encoded2 = imputer.fit_transform(train_encoded)
test_encoded2 = imputer.transform(test_encoded)

100%|██████████| 14/14 [00:16<00:00,  1.15s/it]


In [15]:
simple_imputer = SimpleImputer(strategy='most_frequent')
simple_imputer.fit(train_encoded)
train_encoded3 = pd.DataFrame(simple_imputer.transform(train_encoded), columns=train_encoded.columns)
test_encoded3 = pd.DataFrame(simple_imputer.transform(test_encoded), columns=test_encoded.columns)

In [16]:
train_encoded2.info(), test_encoded2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Workclass       32561 non-null  int64  
 1   Education       32561 non-null  int64  
 2   Martial_Status  32561 non-null  int64  
 3   Occupation      32561 non-null  int64  
 4   Relationship    32561 non-null  int64  
 5   Race            32561 non-null  int64  
 6   Sex             32561 non-null  int64  
 7   Country         32561 non-null  int64  
 8   Age             32561 non-null  float64
 9   fnlwgt          32561 non-null  float64
 10  Education_Num   32561 non-null  float64
 11  Capital_Gain    32561 non-null  float64
 12  Capital_Loss    32561 non-null  float64
 13  Hours_per_week  32561 non-null  float64
dtypes: float64(6), int64(8)
memory usage: 3.5 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16281 entries, 0 to 16280
Data columns (total 14 columns):
 #   Column 

(None, None)

## Compare performance

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from kuma_utils.training import CrossValidator
from kuma_utils.metrics import AUC

In [18]:
labels = train.Target.replace(target_dict)
test_labels = test.Target.replace(target_dict)

In [21]:
cv0 = CrossValidator(RandomForestClassifier)
cv0.train(
    data=(train_encoded3, labels),
    folds=StratifiedKFold(n_splits=5, random_state=0)
)
AUC()(test_labels, np.stack(cv0.smart_predict(test_encoded3)).mean(0))

Logger created at 20/11/25:09:36:37
09:36:37 [cv0] Starting fold 0
eval_metric automatically selected.
09:36:39 [None]	best score is 0.884687
09:36:39 [cv0] Fold 0: eval=0.884687 (iter=None)
09:36:39 [cv0] Starting fold 1
eval_metric automatically selected.
09:36:41 [None]	best score is 0.884826
09:36:41 [cv0] Fold 1: eval=0.884826 (iter=None)
09:36:41 [cv0] Starting fold 2
eval_metric automatically selected.
09:36:43 [None]	best score is 0.885389
09:36:43 [cv0] Fold 2: eval=0.885389 (iter=None)
09:36:43 [cv0] Starting fold 3
eval_metric automatically selected.
09:36:45 [None]	best score is 0.886907
09:36:45 [cv0] Fold 3: eval=0.886907 (iter=None)
09:36:46 [cv0] Starting fold 4
eval_metric automatically selected.
09:36:48 [None]	best score is 0.891252
09:36:48 [cv0] Fold 4: eval=0.891252 (iter=None)
09:36:48 [cv0] Overall metric: 0.886612 + 0.002450


0.8901309168571004

In [22]:
cv1 = CrossValidator(RandomForestClassifier)
cv1.train(
    data=(train_encoded2, labels),
    folds=StratifiedKFold(n_splits=5, random_state=0)
)
AUC()(test_labels, np.stack(cv1.smart_predict(test_encoded2)).mean(0))

Logger created at 20/11/25:09:36:49
09:36:49 [cv0] Starting fold 0
eval_metric automatically selected.
09:36:51 [None]	best score is 0.892954
09:36:51 [cv0] Fold 0: eval=0.892954 (iter=None)
09:36:51 [cv0] Starting fold 1
eval_metric automatically selected.
09:36:54 [None]	best score is 0.887376
09:36:54 [cv0] Fold 1: eval=0.887376 (iter=None)
09:36:54 [cv0] Starting fold 2
eval_metric automatically selected.
09:36:56 [None]	best score is 0.891081
09:36:56 [cv0] Fold 2: eval=0.891081 (iter=None)
09:36:56 [cv0] Starting fold 3
eval_metric automatically selected.
09:36:59 [None]	best score is 0.896422
09:36:59 [cv0] Fold 3: eval=0.896422 (iter=None)
09:36:59 [cv0] Starting fold 4
eval_metric automatically selected.
09:37:01 [None]	best score is 0.897924
09:37:01 [cv0] Fold 4: eval=0.897924 (iter=None)
09:37:01 [cv0] Overall metric: 0.893152 + 0.003772


0.896468385474462

**LGBMImputer is better than SimpleImputer**