In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np

from helpers import utils, pipelines, models

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN

import dice_ml

import multiprocessing

from collections.abc import Iterable
from functools import partial

from copy import deepcopy

### Define research parameters

In [11]:
scoring = 'f1'#roc_auc#f1#accuracy

test_size_proportion=0.33

sample_frac = 0.05

search_pipelines = pipelines.get_classification_pipelines()
search_parameters = models.parameters

### Read and preprocess data

In [12]:
raw_data = pd.read_csv("./datasets/adult.csv")
df = utils.preprocess_adult(raw_data)
target = 'income'

#### split data

In [13]:
# all data
X = df.drop(target, axis=1)
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size_proportion, random_state=42)

# sampled data
df_sample = df.sample(frac=sample_frac, random_state=42)
X_sample = df_sample.drop(target, axis=1)
y_sample = df_sample[target]
X_sample_train, X_sample_test, y_sample_train, y_sample_test = train_test_split(X_sample, y_sample, test_size=test_size_proportion, random_state=42)

In [14]:
display(y_train.value_counts())
display(y_sample_train.value_counts())
X_sample_test.shape

0    22750
1     7517
Name: income, dtype: int64

0    1119
1     394
Name: income, dtype: int64

(746, 11)

### Whole adult dataset scores

In [7]:
print(f'whole adult dataset {scoring} scores:')
_, whole_scores = utils.fit_and_evaluate(X_train, y_train, X_test, y_test,
                    search_estimators=search_pipelines, search_params=search_parameters, scoring=scoring)
whole_dataset_result_df = pd.DataFrame.from_dict(whole_scores, orient='index', columns=[f'full data {scoring} score'])
whole_dataset_result_df.T

whole adult dataset f1 scores:


Unnamed: 0,lg,rf,xgb
full data f1 score,0.618785,0.573201,0.632986


### Sampled adult dataset scores

In [8]:
print(f'{sample_frac*100}% sampled adult dataset {scoring} scores:')
_, sampled_scores = utils.fit_and_evaluate(X_sample_train, y_sample_train, X_sample_test, y_sample_test,
                        search_estimators=search_pipelines, search_params=search_parameters, scoring=scoring)
sampled_dataset_result_df = pd.DataFrame.from_dict(sampled_scores, orient='index', columns=[f'sampled data {scoring} score'])
sampled_dataset_result_df.T

5.0% sampled adult dataset f1 scores:


Unnamed: 0,lg,rf,xgb
sampled data f1 score,0.581315,0.35514,0.571429


### Random over sampling (all classes but the majority class until balanced)

In [9]:
# ros = RandomOverSampler(random_state=42) # resample all classes but the majority class
# # add to pipeline
# ros_pipelines = deepcopy(search_pipelines)
# for n, p in ros_pipelines.items():
#     p.steps.insert(1, ('ros', ros))


# print(f'{sample_frac*100}% sampled adult dataset with Random Over Sampling {scoring} scores:')
# _, sampled_scores = utils.fit_and_evaluate(X_sample_train, y_sample_train, X_sample_test, y_sample_test,
#                         search_estimators=ros_pipelines, search_params=search_parameters, scoring=scoring)
# ros_result_df = pd.DataFrame.from_dict(sampled_scores, orient='index', columns=[f'ROS {scoring} score'])
# ros_result_df.T

In [10]:
ros = RandomOverSampler(random_state=42) # resample all classes but the majority class
X_resampled_ros, y_resampled_ros = ros.fit_resample(X_sample_train, y_sample_train)
display(y_resampled_ros.value_counts())

print(f'{sample_frac*100}% sampled adult dataset with Random Over Sampling {scoring} scores:')

_, sampled_scores = utils.fit_and_evaluate(X_resampled_ros, y_resampled_ros, X_sample_test, y_sample_test,
                        search_estimators=search_pipelines, search_params=search_parameters, scoring=scoring)
ros_result_df = pd.DataFrame.from_dict(sampled_scores, orient='index', columns=[f'ROS {scoring} score'])
ros_result_df.T

0    1119
1    1119
Name: income, dtype: int64

5.0% sampled adult dataset with Random Over Sampling f1 scores:


Unnamed: 0,lg,rf,xgb
ROS f1 score,0.632391,0.575,0.618926


### SMOTE & ADASYN (all classes but the majority class until balanced)
working just on numerical so need to transform before, so we insert it to the pipeline after the preproccess step

In [11]:
from sklearn.compose import ColumnTransformer, make_column_selector

ct = ColumnTransformer([
    ('categorical', pipelines.categorical_pipe, make_column_selector(dtype_include=['object'])),
], remainder='passthrough')

preprocess_X_sample_train = ct.fit_transform(X_sample_train)
preprocess_X_sample_test = ct.transform(X_sample_test)

In [12]:
# # add to pipeline
# smote_pipelines = deepcopy(search_pipelines)
# for n, p in smote_pipelines.items():
#     p.steps.insert(1, ('smote', SMOTE(random_state=42)))


# print(f'{sample_frac*100}% sampled adult dataset with SMOTE {scoring} scores:')
# _, sampled_scores = utils.fit_and_evaluate(X_sample_train, y_sample_train, X_sample_test, y_sample_test,
#                         search_estimators=smote_pipelines, search_params=search_parameters, scoring=scoring)
# smote_result_df = pd.DataFrame.from_dict(sampled_scores, orient='index', columns=[f'SMOTE {scoring} score'])
# smote_result_df.T

In [13]:
smote = SMOTE(random_state=42) # resample all classes but the majority class
X_resampled_smote, y_resampled_smote = smote.fit_resample(pd.DataFrame(preprocess_X_sample_train), y_sample_train)
display(y_resampled_smote.value_counts())

print(f'{sample_frac*100}% sampled adult dataset with SMOTE {scoring} scores:')
_, sampled_scores = utils.fit_and_evaluate(X_resampled_smote, y_resampled_smote, preprocess_X_sample_test, y_sample_test,
                        search_estimators=search_pipelines, search_params=search_parameters, scoring=scoring)
smote_result_df = pd.DataFrame.from_dict(sampled_scores, orient='index', columns=[f'SMOTE {scoring} score'])
smote_result_df.T

0    1119
1    1119
Name: income, dtype: int64

5.0% sampled adult dataset with SMOTE f1 scores:


Unnamed: 0,lg,rf,xgb
SMOTE f1 score,0.618182,0.569343,0.595092


In [14]:
adasyn = ADASYN(random_state=42) # resample all classes but the majority class
X_resampled_adasyn, y_resampled_adasyn = adasyn.fit_resample(pd.DataFrame(preprocess_X_sample_train), y_sample_train) #X_sample

print(f'{sample_frac*100}% sampled adult dataset with ADASYN {scoring} scores:')
_, sampled_scores = utils.fit_and_evaluate(X_resampled_adasyn, y_resampled_adasyn, preprocess_X_sample_test, y_sample_test,
                        search_estimators=search_pipelines, search_params=search_parameters, scoring=scoring)
adasyn_result_df = pd.DataFrame.from_dict(sampled_scores, orient='index', columns=[f'ADASYN {scoring} score'])
adasyn_result_df.T

5.0% sampled adult dataset with ADASYN f1 scores:


Unnamed: 0,lg,rf,xgb
ADASYN f1 score,0.631043,0.564815,0.590062


### Counter Factuals

need to test many variables (differents generation methods / proximity vs diversity / balancing data / etc.)

In [15]:
from sklearn.pipeline import Pipeline
# from sklearn.svm import SVC

# svc_pipeline = {'svc': Pipeline([('column_transformer', pipelines.preprocessor),('model', SVC(random_state=42))])}
# svc_params = {'svc': {'model__C': [0.5, 1, 5], 'model__kernel': ['linear', 'rbf'], 'model__gamma': ['scale', 'auto']}}
# best_svc_estimator, sampled_scores = utils.fit_and_evaluate(X_sample, y_sample,
#                         search_estimators=svc_pipeline, search_params=svc_params, scoring=scoring)
# pd.DataFrame.from_dict(sampled_scores, orient='index', columns=[f'sampled data {scoring} score']).T

from lightgbm import LGBMClassifier

gbm_pipeline = {'gbm': Pipeline([('column_transformer', pipelines.preprocessor),('model', LGBMClassifier(random_state=42, verbose=-1))])}
gbm_params = {'gbm': {'model__max_depth': [5, 6, 7], 'model__min_child_weight': [30, 50], 
                        'model__num_leaves': [25, 55, 80]}}
best_gbm_estimator, sampled_scores = utils.fit_and_evaluate(X_sample_train, y_sample_train, X_sample_test, y_sample_test,
                        search_estimators=gbm_pipeline, search_params=gbm_params, scoring=scoring)
pd.DataFrame.from_dict(sampled_scores, orient='index', columns=[f'sampled data {scoring} score']).T

Unnamed: 0,gbm
sampled data f1 score,0.583051


In [16]:
d = dice_ml.Data(dataframe=df_sample, continuous_features=['age', 'fnlwgt', 'hours-per-week'], outcome_name=target)
m = dice_ml.Model(model=best_gbm_estimator['gbm'], backend="sklearn")

In [17]:
exp = dice_ml.Dice(d, m, method="random") # need to check diferent method (random, genetic, kdtree)

minority_class = y_sample_train.value_counts().idxmin()
majority_class = y_sample_train.value_counts().idxmax()
classes_gap = y_sample_train.value_counts()[majority_class]-y_sample_train.value_counts()[minority_class]

# X_sample_minority_class = df_sample[df_sample[target] == minority_class].drop(target, axis=1)
X_sample_majority_class = X_sample_train[y_sample_train == majority_class]

augmented_data = pd.DataFrame()
cf_per = 1
cf_counter = 0
for i, (index, row) in enumerate(X_sample_majority_class.iterrows()):
    if cf_counter >= classes_gap: break
    if i%500 == 0: print(f'{i}/{classes_gap}')
    try:
        e1 = exp.generate_counterfactuals(pd.DataFrame(row).T, total_CFs=cf_per, desired_class="opposite",
                                          proximity_weight=1, diversity_weight=0)#, verbose=False)
        cf_df = e1.cf_examples_list[0].final_cfs_df
        if cf_df[target].iloc[0]!=minority_class: continue
        augmented_data = pd.concat([augmented_data, cf_df])
        cf_counter += len(cf_df)
    except:
        pass

0/725


100%|██████████| 1/1 [00:00<00:00,  6.77it/s]
100%|██████████| 1/1 [00:00<00:00,  6.11it/s]
100%|██████████| 1/1 [00:00<00:00,  6.45it/s]
100%|██████████| 1/1 [00:00<00:00,  7.17it/s]
100%|██████████| 1/1 [00:00<00:00,  6.66it/s]
100%|██████████| 1/1 [00:00<00:00,  3.00it/s]
100%|██████████| 1/1 [00:00<00:00,  4.34it/s]
100%|██████████| 1/1 [00:00<00:00,  6.19it/s]
100%|██████████| 1/1 [00:00<00:00,  4.19it/s]
100%|██████████| 1/1 [00:00<00:00,  3.69it/s]
100%|██████████| 1/1 [00:00<00:00,  5.52it/s]
100%|██████████| 1/1 [00:00<00:00,  3.67it/s]
100%|██████████| 1/1 [00:00<00:00,  6.75it/s]
100%|██████████| 1/1 [00:00<00:00,  4.48it/s]
100%|██████████| 1/1 [00:00<00:00,  6.65it/s]
100%|██████████| 1/1 [00:00<00:00,  7.04it/s]
100%|██████████| 1/1 [00:00<00:00,  6.26it/s]
100%|██████████| 1/1 [00:00<00:00,  6.13it/s]
100%|██████████| 1/1 [00:00<00:00,  3.76it/s]
100%|██████████| 1/1 [00:00<00:00,  6.25it/s]
100%|██████████| 1/1 [00:00<00:00,  6.17it/s]
100%|██████████| 1/1 [00:00<00:00,

500/725


100%|██████████| 1/1 [00:00<00:00,  5.28it/s]
100%|██████████| 1/1 [00:00<00:00,  4.64it/s]
100%|██████████| 1/1 [00:00<00:00,  6.25it/s]
100%|██████████| 1/1 [00:00<00:00,  5.22it/s]
100%|██████████| 1/1 [00:00<00:00,  7.04it/s]
100%|██████████| 1/1 [00:00<00:00,  3.34it/s]
100%|██████████| 1/1 [00:00<00:00,  6.86it/s]
100%|██████████| 1/1 [00:00<00:00,  6.62it/s]
100%|██████████| 1/1 [00:00<00:00,  6.45it/s]
100%|██████████| 1/1 [00:00<00:00,  5.56it/s]
100%|██████████| 1/1 [00:00<00:00,  4.94it/s]
100%|██████████| 1/1 [00:00<00:00,  6.42it/s]
100%|██████████| 1/1 [00:00<00:00,  5.79it/s]
100%|██████████| 1/1 [00:00<00:00,  4.42it/s]
100%|██████████| 1/1 [00:00<00:00,  4.30it/s]
100%|██████████| 1/1 [00:00<00:00,  3.61it/s]
100%|██████████| 1/1 [00:00<00:00,  6.42it/s]
100%|██████████| 1/1 [00:00<00:00,  6.53it/s]
100%|██████████| 1/1 [00:00<00:00,  6.82it/s]
100%|██████████| 1/1 [00:00<00:00,  6.96it/s]
100%|██████████| 1/1 [00:00<00:00,  5.90it/s]
100%|██████████| 1/1 [00:00<00:00,

No Counterfactuals found for the given configuration, perhaps try with different parameters... ; total time taken: 00 min 00 sec


100%|██████████| 1/1 [00:00<00:00,  6.01it/s]
100%|██████████| 1/1 [00:00<00:00,  6.41it/s]
100%|██████████| 1/1 [00:00<00:00,  6.84it/s]
100%|██████████| 1/1 [00:00<00:00,  5.91it/s]
100%|██████████| 1/1 [00:00<00:00,  6.09it/s]
100%|██████████| 1/1 [00:00<00:00,  5.32it/s]
100%|██████████| 1/1 [00:00<00:00,  6.25it/s]
100%|██████████| 1/1 [00:00<00:00,  6.45it/s]
100%|██████████| 1/1 [00:00<00:00,  5.16it/s]
100%|██████████| 1/1 [00:00<00:00,  7.09it/s]
100%|██████████| 1/1 [00:00<00:00,  6.34it/s]
100%|██████████| 1/1 [00:00<00:00,  6.12it/s]
100%|██████████| 1/1 [00:00<00:00,  7.43it/s]
100%|██████████| 1/1 [00:00<00:00,  6.83it/s]
100%|██████████| 1/1 [00:00<00:00,  7.13it/s]
100%|██████████| 1/1 [00:00<00:00,  6.06it/s]
100%|██████████| 1/1 [00:00<00:00,  4.38it/s]
100%|██████████| 1/1 [00:00<00:00,  6.80it/s]
100%|██████████| 1/1 [00:00<00:00,  7.07it/s]
100%|██████████| 1/1 [00:00<00:00,  4.91it/s]
100%|██████████| 1/1 [00:00<00:00,  6.72it/s]
100%|██████████| 1/1 [00:00<00:00,

In [18]:
X_augmented_cf = pd.concat([X_sample_train, augmented_data.drop(target, axis=1)]).astype(X_sample_train.dtypes)
y_augmented_cf = pd.concat([y_sample_train, pd.Series([minority_class]*len(augmented_data))])

display(y_augmented_cf.value_counts())

print(f'{sample_frac*100}% sampled adult dataset with CF {scoring} scores:')
best_est_cf_random, sampled_scores = utils.fit_and_evaluate(X_augmented_cf, y_augmented_cf, X_sample_test, y_sample_test,
                        search_estimators=search_pipelines, search_params=search_parameters, scoring=scoring)
cf_result_df = pd.DataFrame.from_dict(sampled_scores, orient='index', columns=[f'CF {scoring} score'])
cf_result_df.T

0    1119
1    1119
dtype: int64

5.0% sampled adult dataset with CF f1 scores:


Unnamed: 0,lg,rf,xgb
CF f1 score,0.619318,0.589041,0.602273


# summary

In [19]:
result_df = pd.concat([whole_dataset_result_df, sampled_dataset_result_df, ros_result_df, smote_result_df, adasyn_result_df, cf_result_df], axis=1)
result_df

Unnamed: 0,full data f1 score,sampled data f1 score,ROS f1 score,SMOTE f1 score,ADASYN f1 score,CF f1 score
lg,0.618785,0.581315,0.632391,0.618182,0.631043,0.619318
rf,0.573201,0.35514,0.575,0.569343,0.564815,0.589041
xgb,0.632986,0.571429,0.618926,0.595092,0.590062,0.602273


In [20]:
result_df.to_csv(rf'log/adult_{scoring}_{str(int(sample_frac*100))}%.csv') 