# Tutorial: Data preprocessing

In [2]:
import numpy as np
import pandas as pd

## Load UCI Data
UCI Adult data are going to be used in this tutorial. The data contain both numerical and categorical features, and the target to predict is whether a person makes over 50K a year. It is a very simple binary classification task.

In [3]:
# Load the Adult dataset through CatBoost's dataset helper; the call is
# unpacked into the training split and the test split.
from catboost.datasets import adult
train, test = adult()

In [5]:
# Preview the first rows of the training split (features plus the raw 'income' label).
train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39.0,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50.0,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38.0,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,53.0,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28.0,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K


In [6]:
# Preview the first rows of the test split; note that some categorical cells are already missing.
test.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,25.0,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K
1,38.0,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K
2,28.0,Local-gov,336951.0,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K
3,44.0,Private,160323.0,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,>50K
4,18.0,,103497.0,Some-college,10.0,Never-married,,Own-child,White,Female,0.0,0.0,30.0,United-States,<=50K


In [7]:
# Binary encoding of the raw income label: 0 = at most 50K, 1 = more than 50K.
target_dict = {'<=50K': 0, '>50K': 1}

In [8]:
# Encode the string income labels as 0/1 using target_dict.
#
# Series.map turns every value that is not a key of the mapping into NaN, so
# re-running an unguarded version of this cell would wipe the already-encoded
# labels. The guard makes the cell idempotent: a frame is only mapped while it
# still contains raw string labels.
for _frame in (train, test):
    if _frame['income'].isin(list(target_dict)).any():
        _frame['income'] = _frame['income'].map(target_dict)

## Encode categorical and numerical features
For categorical features, ordinal (label) encoding is applied: each category is replaced by an integer code. For numerical features, standardization is applied.

In [9]:
from kuma_utils.preprocessing import (
    PrepPipeline, SelectCategorical, SelectNumerical, DistTransformer)
from sklearn.preprocessing import TargetEncoder, OrdinalEncoder

In [10]:
# Numerical branch: select the numerical columns (the target 'income' is
# excluded) and pass them through DistTransformer('standard').
# NOTE(review): the exact selection and scaling rules live in kuma_utils - confirm there.
num_enc = PrepPipeline([
    SelectNumerical(exclude_cols=['income']),
    DistTransformer('standard')
])

# Categorical branch: select the categorical columns (target excluded) and
# ordinal-encode them. With these OrdinalEncoder settings, missing values and
# categories unseen during fit are both encoded as NaN instead of raising.
# NOTE(review): target_col='income' is presumably only needed by target-aware
# encoders such as the TargetEncoder alternative noted below - confirm.
cat_enc = PrepPipeline([
    SelectCategorical(exclude_cols=['income']),
    OrdinalEncoder(encoded_missing_value=np.nan, handle_unknown='use_encoded_value', unknown_value=np.nan)
    # Alternative encoder that can be swapped in here: TargetEncoder()
], target_col='income')

In [11]:
# Fit both pipelines on the training split.
# astype(int, errors='ignore') leaves the frame unchanged when the cast to int
# fails, which is why the codes can still show up as floats.
train_cat = cat_enc.fit_transform(train).astype(int, errors='ignore')
train_num = num_enc.fit_transform(train).astype(float)

# The categorical columns are placed first, so their positions are 0..n_cat-1.
categorical_index = list(range(train_cat.shape[1]))
train_encoded = pd.concat([train_cat, train_num], axis=1)
train_encoded.head()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,6.0,9.0,4.0,0.0,1.0,4.0,1.0,38.0,0.030671,-1.063611,1.134739,0.148453,-0.21666,-0.035429
1,5.0,9.0,2.0,3.0,0.0,4.0,1.0,38.0,0.837109,-1.008707,1.134739,-0.14592,-0.21666,-2.222153
2,3.0,11.0,0.0,5.0,1.0,4.0,1.0,38.0,-0.042642,0.245079,-0.42006,-0.14592,-0.21666,-0.035429
3,3.0,1.0,2.0,5.0,0.0,2.0,1.0,38.0,1.057047,0.425801,-1.197459,-0.14592,-0.21666,-0.035429
4,3.0,9.0,2.0,9.0,5.0,2.0,0.0,4.0,-0.775768,1.408176,1.134739,-0.14592,-0.21666,-0.035429


In [12]:
# Apply the pipelines fitted on the training split to the test split
# (transform only, no refit), keeping the same column order as train_encoded.
test_cat = cat_enc.transform(test).astype(int, errors='ignore')
test_num = num_enc.transform(test).astype(float)
test_encoded = pd.concat([test_cat, test_num], axis=1)
test_encoded.head()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,3.0,1.0,4.0,6.0,3.0,2.0,1.0,38.0,-0.995706,0.350774,-1.197459,-0.14592,-0.21666,-0.035429
1,3.0,11.0,2.0,4.0,0.0,4.0,1.0,38.0,-0.042642,-0.947095,-0.42006,-0.14592,-0.21666,0.774468
2,1.0,7.0,2.0,10.0,0.0,4.0,1.0,38.0,-0.775768,1.394362,0.746039,-0.14592,-0.21666,-0.035429
3,3.0,15.0,2.0,6.0,0.0,2.0,1.0,38.0,0.397233,-0.27907,-0.03136,0.895083,-0.21666,-0.035429
4,,15.0,4.0,,3.0,4.0,0.0,38.0,-1.508894,-0.817458,-0.03136,-0.14592,-0.21666,-0.845327


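## Drop data points

To simulate missing data, roughly 20% of the values in every feature column are randomly replaced with NaN (rows themselves are not removed).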

In [13]:
def random_drop(df, ratio=0.1, seed=None):
    """Return a copy of ``df`` with a random fraction of each feature set to NaN.

    For every column except the target column ``'income'``,
    ``int(len(df) * ratio)`` distinct rows are picked at random and their value
    is replaced by ``np.nan``. The input frame is left untouched.

    Args:
        df: Input DataFrame.
        ratio: Fraction of rows to blank out per column, within ``[0, 1]``.
        seed: Optional seed for a dedicated random generator. When ``None``
            (the default) the global ``np.random`` state is used, so an earlier
            ``np.random.seed(...)`` call still controls the result.

    Returns:
        A new DataFrame with the same shape, columns and index as ``df``.

    Raises:
        ValueError: If ``ratio`` is outside ``[0, 1]``.
    """
    if not 0 <= ratio <= 1:
        raise ValueError(f"ratio must be within [0, 1], got {ratio!r}")

    rng = np.random if seed is None else np.random.default_rng(seed)
    n_rows = df.shape[0]
    drop_num = int(n_rows * ratio)
    df_drop = df.copy()
    if drop_num == 0:
        # Nothing to blank out (empty frame or very small ratio).
        return df_drop

    for col_pos, col in enumerate(df.columns):
        # Exact comparison: `col in 'income'` is a substring test, which also
        # skips columns such as 'in' and fails on non-string column names.
        if col == 'income':
            continue
        # replace=False yields distinct rows, so exactly drop_num cells are
        # blanked instead of fewer because of repeated picks.
        drop_pos = rng.choice(n_rows, size=drop_num, replace=False)
        # The sampled values are row positions, so assign positionally; using
        # them as labels with .loc only works on a default RangeIndex.
        df_drop.iloc[drop_pos, col_pos] = np.nan
    return df_drop

In [14]:
# Seed NumPy's global generator so the same cells are blanked out on every
# Restart & Run All.
# Note: this cell overwrites its own inputs, so re-running it without first
# re-running the encoding cells stacks additional NaNs on top of the earlier ones.
np.random.seed(0)
train_encoded = random_drop(train_encoded, 0.2)
test_encoded = random_drop(test_encoded, 0.2)

## Regression imputer and Simple imputer

In [15]:
from kuma_utils.preprocessing.imputer import LGBMImputer
from sklearn.impute import SimpleImputer

In [16]:
# Inspect the training features after random_drop: NaNs now appear in the feature columns.
train_encoded.head()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,6.0,9.0,4.0,0.0,,,1.0,38.0,,-1.063611,1.134739,0.148453,,-0.035429
1,5.0,9.0,,3.0,,,1.0,38.0,,-1.008707,1.134739,,-0.21666,-2.222153
2,,11.0,0.0,5.0,1.0,4.0,1.0,38.0,-0.042642,,-0.42006,-0.14592,-0.21666,-0.035429
3,3.0,1.0,2.0,5.0,0.0,,,38.0,1.057047,0.425801,-1.197459,-0.14592,-0.21666,-0.035429
4,,,2.0,9.0,5.0,2.0,0.0,4.0,-0.775768,1.408176,1.134739,-0.14592,-0.21666,-0.035429


In [17]:
# Fit the regression imputer on the training frame, then reuse the fitted
# imputer (transform only) on the test frame.
# cat_features receives categorical_index, i.e. the positions of the
# categorical columns, which come first in the concatenated frames.
# NOTE(review): n_iter=100 is presumably the number of boosting iterations per
# column model - confirm against the kuma_utils LGBMImputer documentation.
imputer = LGBMImputer(cat_features=categorical_index, n_iter=100, verbose=True)
train_encoded2 = imputer.fit_transform(train_encoded)
test_encoded2 = imputer.transform(test_encoded)

  0%|          | 0/14 [00:00<?, ?it/s]

In [18]:
# Baseline imputer: fill every column with its most frequent training value.
simple_imputer = SimpleImputer(strategy='most_frequent')
simple_imputer.fit(train_encoded)

# By default transform() returns a plain NumPy array, so the frames are rebuilt
# with the source columns AND the source index; without index= the result would
# silently get a fresh RangeIndex and could misalign with the labels.
train_encoded3 = pd.DataFrame(
    simple_imputer.transform(train_encoded),
    columns=train_encoded.columns,
    index=train_encoded.index,
)
test_encoded3 = pd.DataFrame(
    simple_imputer.transform(test_encoded),
    columns=test_encoded.columns,
    index=test_encoded.index,
)

In [19]:
# Inspect the training frame returned by the LGBMImputer.
train_encoded2.head()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,6,9,4,0,1,4,1,38,0.031979,-1.063611,1.134739,0.148453,0.031204,-0.035429
1,5,9,2,3,0,4,1,38,0.515819,-1.008707,1.134739,0.126848,-0.21666,-2.222153
2,3,11,0,5,1,4,1,38,-0.042642,0.047536,-0.42006,-0.14592,-0.21666,-0.035429
3,3,1,2,5,0,4,1,38,1.057047,0.425801,-1.197459,-0.14592,-0.21666,-0.035429
4,3,9,2,9,5,2,0,4,-0.775768,1.408176,1.134739,-0.14592,-0.21666,-0.035429


## Compare performance

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from kuma_utils.training import CrossValidator
from kuma_utils.metrics import AUC

In [21]:
# Targets for the comparison: the 0/1-encoded 'income' column of each split.
labels = train['income']
test_labels = test['income']

In [22]:
# Baseline run: logistic regression on the SimpleImputer-filled features,
# trained with 5-fold stratified cross-validation.
cv0_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
cv0 = CrossValidator(LogisticRegression)
cv0.train(
    data=(train_encoded3, labels),
    folds=cv0_folds,
    params={'max_iter': 1000},
)

# Stack the test predictions returned by smart_predict, average them over
# axis 0 and score the averaged prediction with AUC against the test labels.
cv0_test_pred = np.stack(cv0.smart_predict(test_encoded3)).mean(0)
AUC()(test_labels, cv0_test_pred)

Logger created at 24/02/09:06:11:56
06:11:56 [cv0] Starting fold 0
eval_metric automatically selected.
06:11:56 [None]	best score is 0.820522
06:11:56 [cv0] Fold 0: eval=0.820522 (iter=None)
06:11:56 [cv0] Starting fold 1
eval_metric automatically selected.
06:11:56 [None]	best score is 0.827455
06:11:56 [cv0] Fold 1: eval=0.827455 (iter=None)
06:11:56 [cv0] Starting fold 2
eval_metric automatically selected.
06:11:57 [None]	best score is 0.832709
06:11:57 [cv0] Fold 2: eval=0.832709 (iter=None)
06:11:57 [cv0] Starting fold 3
eval_metric automatically selected.
06:11:57 [None]	best score is 0.828149
06:11:57 [cv0] Fold 3: eval=0.828149 (iter=None)
06:11:57 [cv0] Starting fold 4
eval_metric automatically selected.
06:11:57 [None]	best score is 0.832683
06:11:57 [cv0] Fold 4: eval=0.832683 (iter=None)
06:11:57 [cv0] Overall metric: 0.828304 + 0.004469


0.8267886091398622

In [23]:
# Same model and fold setup as the baseline run, but trained on the
# LGBMImputer-filled features.
cv1_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
cv1 = CrossValidator(LogisticRegression)
cv1.train(
    data=(train_encoded2, labels),
    folds=cv1_folds,
    params={'max_iter': 1000},
)

# Stack the test predictions returned by smart_predict, average them over
# axis 0 and score the averaged prediction with AUC against the test labels.
cv1_test_pred = np.stack(cv1.smart_predict(test_encoded2)).mean(0)
AUC()(test_labels, cv1_test_pred)

Logger created at 24/02/09:06:11:57
06:11:57 [cv0] Starting fold 0
eval_metric automatically selected.
06:11:58 [None]	best score is 0.833885
06:11:58 [cv0] Fold 0: eval=0.833885 (iter=None)
06:11:58 [cv0] Starting fold 1
eval_metric automatically selected.
06:11:58 [None]	best score is 0.842492
06:11:58 [cv0] Fold 1: eval=0.842492 (iter=None)
06:11:58 [cv0] Starting fold 2
eval_metric automatically selected.
06:11:58 [None]	best score is 0.846669
06:11:58 [cv0] Fold 2: eval=0.846669 (iter=None)
06:11:58 [cv0] Starting fold 3
eval_metric automatically selected.
06:11:58 [None]	best score is 0.845375
06:11:58 [cv0] Fold 3: eval=0.845375 (iter=None)
06:11:58 [cv0] Starting fold 4
eval_metric automatically selected.
06:11:59 [None]	best score is 0.850993
06:11:59 [cv0] Fold 4: eval=0.850993 (iter=None)
06:11:59 [cv0] Overall metric: 0.843883 + 0.005700


0.8424649675974977

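**LGBMImputer is better than SimpleImputer!** On this task, with the same logistic regression and folds, the test AUC rises from about 0.827 (SimpleImputer) to about 0.842 (LGBMImputer).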