In [2]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
import lightgbm as lgb
from lightgbm import LGBMClassifier
from kuma_utils.stats import PropensityScoreMatching, make_demographic_table
from kuma_utils.preprocessing import PrepPipeline, SelectCategorical, SelectNumerical, DummyVarible, DistTransformer, Cast

In [3]:
from catboost.datasets import adult

# Load the two Adult dataset splits, tag every row with the split it came
# from, and stack them into one frame; 'group' is the column the later cells
# compare and match on.
train, test = adult()
full_dataset = pd.concat(
    [
        train.assign(group='train'),
        test.assign(group='test'),
    ],
    axis=0,
)

# Make demographic table (a.k.a. Table 1)
for numeric variables:
- run KS test
    - if the variable follows a normal distribution, run a t-test
    - if not, run a Mann-Whitney U test
    
for categorical variables:
- run chi-squared test

In [4]:
# To make it simple: keep only the matching covariates plus the 'group' label.
# .copy() detaches the subset from the original concatenated frame.
full_dataset = full_dataset.loc[
    :,
    ['occupation', 'education', 'age', 'sex', 'fnlwgt', 'capital-loss', 'group'],
].copy()

In [5]:
# Table 1 for the unmatched data: compare every column between the
# 'train' and 'test' groups.
make_demographic_table(
    full_dataset,
    group_col='group',
    display_cols=full_dataset.columns,
)

Unnamed: 0,_item,_type,_ks_stat,_stat_test,_nan_info,train,test,p-value
0,N,numerical,,,False,32561,16281,
1,"occupation=Armed-Forces, n(%)",categorical,,Chi2,False,9 (0.0%),6 (0.0%),0.784203
2,"occupation=Craft-repair, n(%)",categorical,,Chi2,False,4099 (12.6%),2013 (12.4%),0.488556
3,"occupation=Exec-managerial, n(%)",categorical,,Chi2,False,4066 (12.5%),2020 (12.4%),0.811452
4,"occupation=Farming-fishing, n(%)",categorical,,Chi2,False,994 (3.1%),496 (3.0%),0.992125
5,"occupation=Handlers-cleaners, n(%)",categorical,,Chi2,False,1370 (4.2%),702 (4.3%),0.606379
6,"occupation=Machine-op-inspct, n(%)",categorical,,Chi2,False,2002 (6.1%),1020 (6.3%),0.628453
7,"occupation=NaN, n(%)",categorical,,Chi2,True,1843 (5.7%),966 (5.9%),0.229485
8,"occupation=Other-service, n(%)",categorical,,Chi2,False,3295 (10.1%),1628 (10.0%),0.689444
9,"occupation=Priv-house-serv, n(%)",categorical,,Chi2,False,149 (0.5%),93 (0.6%),0.105789


# Propensity score matching
Matching methods
- Greedy: very slow / more matched pairs
- Hungarian algorithm: very fast / slightly fewer pairs

In [6]:
# Propensity score matching with the greedy matcher (the slow option).
psm_greedy = PropensityScoreMatching(
    match_cols=['occupation', 'education', 'age', 'sex', 'fnlwgt', 'capital-loss'],
    group_col='group',
    # Categorical columns go through DummyVarible with dummy_na=True
    # (presumably adds an indicator for missing values -- confirm in kuma_utils).
    categorical_encoder=PrepPipeline([SelectCategorical(), DummyVarible(dummy_na=True)]),
    # Numerical columns go through DistTransformer('standard').
    numerical_encoder=PrepPipeline([SelectNumerical(), DistTransformer('standard')]),
    matching_method='greedy')
# Seed the 10,000-row subsample: an unseeded sample() draws different rows on
# every execution, so the resulting table could not be reproduced.
# NOTE(review): any randomness inside PropensityScoreMatching itself is not
# controlled by this seed.
make_demographic_table(
    psm_greedy.run(full_dataset.sample(10000, random_state=0)),
    group_col='group', display_cols=full_dataset.columns)  # ~60s

Logger created at 24/02/12:09:39:20
eval_metric automatically selected.
09:39:20 [None]	best score is 0.536398


Unnamed: 0,_item,_type,_ks_stat,_stat_test,_nan_info,test,train,p-value
0,N,numerical,,,False,3346,3346,
1,"occupation=Armed-Forces, n(%)",categorical,,Chi2,False,1 (0.0%),1 (0.0%),1.0
2,"occupation=Craft-repair, n(%)",categorical,,Chi2,False,471 (14.1%),451 (13.5%),0.500392
3,"occupation=Exec-managerial, n(%)",categorical,,Chi2,False,410 (12.3%),407 (12.2%),0.940471
4,"occupation=Farming-fishing, n(%)",categorical,,Chi2,False,107 (3.2%),115 (3.4%),0.632792
5,"occupation=Handlers-cleaners, n(%)",categorical,,Chi2,False,118 (3.5%),120 (3.6%),0.947374
6,"occupation=Machine-op-inspct, n(%)",categorical,,Chi2,False,209 (6.2%),207 (6.2%),0.959622
7,"occupation=NaN, n(%)",categorical,,Chi2,True,183 (5.5%),178 (5.3%),0.828641
8,"occupation=Other-service, n(%)",categorical,,Chi2,False,361 (10.8%),363 (10.8%),0.968608
9,"occupation=Priv-house-serv, n(%)",categorical,,Chi2,False,16 (0.5%),19 (0.6%),0.734648


In [7]:
# Propensity score matching with the Hungarian-algorithm matcher (the fast option).
psm_fast = PropensityScoreMatching(
    match_cols=['occupation', 'education', 'age', 'sex', 'fnlwgt', 'capital-loss'],
    group_col='group',
    # Categorical columns go through DummyVarible with dummy_na=True
    # (presumably adds an indicator for missing values -- confirm in kuma_utils).
    categorical_encoder=PrepPipeline([SelectCategorical(), DummyVarible(dummy_na=True)]),
    # Numerical columns go through DistTransformer('standard').
    numerical_encoder=PrepPipeline([SelectNumerical(), DistTransformer('standard')]),
    matching_method='hungarian')
# Seed the 10,000-row subsample: an unseeded sample() draws different rows on
# every execution, so the resulting table could not be reproduced.
# NOTE(review): any randomness inside PropensityScoreMatching itself is not
# controlled by this seed.
make_demographic_table(
    psm_fast.run(full_dataset.sample(10000, random_state=0)),
    group_col='group', display_cols=full_dataset.columns)  # ~0.5s

Logger created at 24/02/12:09:40:19
eval_metric automatically selected.
09:40:19 [None]	best score is 0.524922


Unnamed: 0,_item,_type,_ks_stat,_stat_test,_nan_info,test,train,p-value
0,N,numerical,,,False,3341,3341,
1,"occupation=Craft-repair, n(%)",categorical,,Chi2,False,424 (12.7%),416 (12.5%),0.796172
2,"occupation=Exec-managerial, n(%)",categorical,,Chi2,False,412 (12.3%),427 (12.8%),0.605246
3,"occupation=Farming-fishing, n(%)",categorical,,Chi2,False,101 (3.0%),99 (3.0%),0.942766
4,"occupation=Handlers-cleaners, n(%)",categorical,,Chi2,False,148 (4.4%),151 (4.5%),0.905797
5,"occupation=Machine-op-inspct, n(%)",categorical,,Chi2,False,210 (6.3%),226 (6.8%),0.45747
6,"occupation=NaN, n(%)",categorical,,Chi2,True,176 (5.3%),195 (5.8%),0.336256
7,"occupation=Other-service, n(%)",categorical,,Chi2,False,324 (9.7%),292 (8.7%),0.189888
8,"occupation=Priv-house-serv, n(%)",categorical,,Chi2,False,18 (0.5%),15 (0.4%),0.727076
9,"occupation=Prof-specialty, n(%)",categorical,,Chi2,False,426 (12.8%),424 (12.7%),0.970713


In [8]:
# Propensity score matching with a LightGBM classifier supplied as the model,
# followed by Hungarian-algorithm matching.
psm_lgb = PropensityScoreMatching(
    match_cols=['occupation', 'education', 'age', 'sex', 'fnlwgt', 'capital-loss'],
    group_col='group',
    # Categorical columns are ordinal-encoded and numerical columns are passed
    # through unchanged (no dummy variables or scaling in this configuration).
    categorical_encoder=PrepPipeline([SelectCategorical(), OrdinalEncoder()]),
    numerical_encoder=PrepPipeline([SelectNumerical()]),
    model=LGBMClassifier,
    trainer_params={
        'params': {
            'objective': 'binary',
            'metric': 'auc',
            'verbosity': 0,
        },
        'fit_params': {
            # Up to 500 boosting rounds; stop once the validation metric has
            # not improved for 50 rounds, logging the metric every 25 rounds.
            'num_boost_round': 500,
            'callbacks': [lgb.early_stopping(stopping_rounds=50, verbose=True), lgb.log_evaluation(25)]
        }
    },
    fit_method='cv',
    matching_method='hungarian')
# Seed the 10,000-row subsample: an unseeded sample() draws different rows on
# every execution, so the resulting table could not be reproduced.
# NOTE(review): fold splitting and model randomness inside
# PropensityScoreMatching are not controlled by this seed.
make_demographic_table(
    psm_lgb.run(full_dataset.sample(10000, random_state=0)),
    group_col='group', display_cols=full_dataset.columns)  # ~10s

Logger created at 24/02/12:09:40:20
Training until validation scores don't improve for 50 rounds
[25]	cv_agg's valid auc: 0.507111 + 0.00409765
[50]	cv_agg's valid auc: 0.504013 + 0.00338955
Early stopping, best iteration is:
[8]	cv_agg's valid auc: 0.510841 + 0.0111668
09:40:21 [8]	best score is 0.510841


Unnamed: 0,_item,_type,_ks_stat,_stat_test,_nan_info,test,train,p-value
0,N,numerical,,,False,3003,3003,
1,"occupation=Craft-repair, n(%)",categorical,,Chi2,False,397 (13.2%),429 (14.3%),0.24546
2,"occupation=Exec-managerial, n(%)",categorical,,Chi2,False,380 (12.7%),378 (12.6%),0.969005
3,"occupation=Farming-fishing, n(%)",categorical,,Chi2,False,81 (2.7%),77 (2.6%),0.808881
4,"occupation=Handlers-cleaners, n(%)",categorical,,Chi2,False,129 (4.3%),117 (3.9%),0.473896
5,"occupation=Machine-op-inspct, n(%)",categorical,,Chi2,False,202 (6.7%),198 (6.6%),0.876617
6,"occupation=NaN, n(%)",categorical,,Chi2,True,162 (5.4%),191 (6.4%),0.12451
7,"occupation=Other-service, n(%)",categorical,,Chi2,False,310 (10.3%),313 (10.4%),0.932549
8,"occupation=Priv-house-serv, n(%)",categorical,,Chi2,False,16 (0.5%),20 (0.7%),0.616016
9,"occupation=Prof-specialty, n(%)",categorical,,Chi2,False,356 (11.9%),323 (10.8%),0.192245
