In [1]:
import multiprocessing
from collections.abc import Iterable
from functools import partial
from pathlib import Path

import dice_ml
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN

from helpers import utils, pipelines, models

# TODO: augmentation must be applied to the training set only - this still needs to be fixed.

### Read and preprocess data

In [2]:
# Name of the label column to predict.
target = 'income'

# Load the raw Adult CSV and run the project's preprocessing helper on it.
raw_data = pd.read_csv("./datasets/adult.csv")
df = utils.preprocess_adult(raw_data)

# Separate the label column from the feature columns.
y = df[target]
X = df.drop(columns=target)

### Define research parameters

In [3]:
# Metric handed to the model search (other options noted by the author: roc_auc, f1).
scoring = 'accuracy'

# Estimators and search parameters, both provided by the project helpers.
search_pipelines = pipelines.get_adult_pipelines()
search_parameters = models.parameters

# Fraction of the preprocessed dataset used for the "sampled" experiments.
sample_frac = 0.2

# Fixed-seed sub-sample, split into label and feature columns.
df_sample = df.sample(frac=sample_frac, random_state=42)
y_sample = df_sample[target]
X_sample = df_sample.drop(columns=target)

### Whole adult dataset scores

In [8]:
# Baseline: run the model search on the full preprocessed dataset.
print(f'whole adult dataset {scoring} scores:')
_, whole_scores = utils.fit_and_evaluate(
    X,
    y,
    search_estimators=search_pipelines,
    search_params=search_parameters,
    scoring=scoring,
)

# One row per key of the returned score mapping, a single column for this experiment.
whole_dataset_result_df = pd.DataFrame.from_dict(
    whole_scores,
    orient='index',
    columns=[f'full data {scoring} score'],
)

# Displayed transposed so the scores read as one row.
whole_dataset_result_df.T

whole adult dataset accuracy scores:


Unnamed: 0,lg,rf,xgb
full data accuracy score,0.828951,0.824591,0.832372


### Sampled adult dataset scores

In [9]:
# Reference point for the augmentation experiments: same search, sub-sample only.
print(f'{sample_frac*100}% sampled adult dataset {scoring} scores:')
_, sampled_scores = utils.fit_and_evaluate(
    X_sample,
    y_sample,
    search_estimators=search_pipelines,
    search_params=search_parameters,
    scoring=scoring,
)

# One row per key of the returned score mapping, a single column for this experiment.
sampled_dataset_result_df = pd.DataFrame.from_dict(
    sampled_scores,
    orient='index',
    columns=[f'sampled data {scoring} score'],
)

# Displayed transposed so the scores read as one row.
sampled_dataset_result_df.T

20.0% sampled adult dataset accuracy scores:


Unnamed: 0,lg,rf,xgb
sampled data accuracy score,0.815225,0.812542,0.813548


### Random over sampling (all classes but the majority class until balanced)

In [None]:
# Random over-sampling with a fixed seed: resample all classes but the majority class.
# NOTE(review): the resampling is applied to the whole sample before fit_and_evaluate;
# if that helper holds out evaluation data internally, resampled rows may end up on
# both sides of the split - confirm against the helper.
ros = RandomOverSampler(random_state=42)
X_resampled_ros, y_resampled_ros = ros.fit_resample(X_sample, y_sample)

# Class sizes after resampling.
display(y_resampled_ros.value_counts())

print(f'{sample_frac*100}% sampled adult dataset with Random Over Sampling {scoring} scores:')
_, sampled_scores = utils.fit_and_evaluate(
    X_resampled_ros,
    y_resampled_ros,
    search_estimators=search_pipelines,
    search_params=search_parameters,
    scoring=scoring,
)

# One row per key of the returned score mapping, a single column for this experiment.
ros_result_df = pd.DataFrame.from_dict(
    sampled_scores,
    orient='index',
    columns=[f'ROS {scoring} score'],
)
ros_result_df.T

### SMOTE & ADASYN (all classes but the majority class until balanced)
SMOTE and ADASYN work only on numerical features, so the categorical columns need to be transformed first.

In [None]:
from sklearn.compose import ColumnTransformer, make_column_selector

# Apply the project's categorical pipeline to the object-dtype columns and pass every
# other column through unchanged, so the result can be fed to SMOTE / ADASYN.
ct = ColumnTransformer(
    transformers=[
        (
            'categorical',
            pipelines.categorical_pipe,
            make_column_selector(dtype_include=['object']),
        ),
    ],
    remainder='passthrough',
)

# Fitted on, and applied to, the sampled feature frame.
preprocess_X_sample = ct.fit_transform(X_sample)

In [None]:
# SMOTE with a fixed seed: resample all classes but the majority class.
# The input is the transformed sample (not the raw X_sample), wrapped in a DataFrame.
smote = SMOTE(random_state=42)
X_resampled_smote, y_resampled_smote = smote.fit_resample(
    pd.DataFrame(preprocess_X_sample),
    y_sample,
)

print(f'{sample_frac*100}% sampled adult dataset with SMOTE {scoring} scores:')
_, sampled_scores = utils.fit_and_evaluate(
    X_resampled_smote,
    y_resampled_smote,
    search_estimators=search_pipelines,
    search_params=search_parameters,
    scoring=scoring,
)

# One row per key of the returned score mapping, a single column for this experiment.
smote_result_df = pd.DataFrame.from_dict(
    sampled_scores,
    orient='index',
    columns=[f'SMOTE {scoring} score'],
)
smote_result_df.T

In [None]:
# ADASYN with a fixed seed: resample all classes but the majority class.
# The input is the transformed sample (not the raw X_sample), wrapped in a DataFrame.
adasyn = ADASYN(random_state=42)
X_resampled_adasyn, y_resampled_adasyn = adasyn.fit_resample(
    pd.DataFrame(preprocess_X_sample),
    y_sample,
)

print(f'{sample_frac*100}% sampled adult dataset with ADASYN {scoring} scores:')
_, sampled_scores = utils.fit_and_evaluate(
    X_resampled_adasyn,
    y_resampled_adasyn,
    search_estimators=search_pipelines,
    search_params=search_parameters,
    scoring=scoring,
)

# One row per key of the returned score mapping, a single column for this experiment.
adasyn_result_df = pd.DataFrame.from_dict(
    sampled_scores,
    orient='index',
    columns=[f'ADASYN {scoring} score'],
)
adasyn_result_df.T

### Counterfactuals

Many variables still need to be tested (different generation methods, proximity vs. diversity, data balancing, etc.).

In [None]:
from sklearn.pipeline import Pipeline
from lightgbm import LGBMClassifier

# SVC variant of this search, left commented out by the author:
# from sklearn.svm import SVC
# svc_pipeline = {'svc': Pipeline([('column_transformer', pipelines.preprocessor),('model', SVC(random_state=42))])}
# svc_params = {'svc': {'model__C': [0.5, 1, 5], 'model__kernel': ['linear', 'rbf'], 'model__gamma': ['scale', 'auto']}}
# best_svc_estimator, sampled_scores = utils.fit_and_evaluate(X_sample, y_sample,
#                         search_estimators=svc_pipeline, search_params=svc_params, scoring=scoring)
# pd.DataFrame.from_dict(sampled_scores, orient='index', columns=[f'sampled data {scoring} score']).T

# LightGBM behind the project's preprocessor; this is the model handed to DiCE below.
gbm_pipeline = {
    'gbm': Pipeline(steps=[
        ('column_transformer', pipelines.preprocessor),
        ('model', LGBMClassifier(random_state=42, verbose=-1)),
    ]),
}

# Search values for the 'model' step of the pipeline above.
gbm_params = {
    'gbm': {
        'model__max_depth': [5, 6, 7],
        'model__min_child_weight': [30, 50],
        'model__num_leaves': [25, 55, 80],
    },
}

best_gbm_estimator, sampled_scores = utils.fit_and_evaluate(
    X_sample,
    y_sample,
    search_estimators=gbm_pipeline,
    search_params=gbm_params,
    scoring=scoring,
)

# Scores of the GBM search, shown as a single row.
pd.DataFrame.from_dict(
    sampled_scores,
    orient='index',
    columns=[f'sampled data {scoring} score'],
).T

In [None]:
# DiCE data interface built from the sampled frame (label column included).
d = dice_ml.Data(
    dataframe=df_sample,
    continuous_features=['age', 'fnlwgt', 'hours-per-week'],
    outcome_name=target,
)

# DiCE model interface around the 'gbm' entry returned by the GBM search.
m = dice_ml.Model(
    model=best_gbm_estimator['gbm'],
    backend="sklearn",
)

In [12]:
# DiCE explainer used to synthesise extra minority-class rows.
exp = dice_ml.Dice(d, m, method="random")  # TODO: compare against other generation methods

# Class sizes of the sample; the gap is the number of minority rows needed to balance it.
class_counts = y_sample.value_counts()
minority_class = class_counts.idxmin()
majority_class = class_counts.idxmax()
classes_gap = class_counts[majority_class] - class_counts[minority_class]

# Majority-class rows (label dropped) are the query instances passed to DiCE.
X_sample_majority_class = df_sample[df_sample[target] == majority_class].drop(target, axis=1)

cf_per = 1           # counterfactuals requested per query row
cf_counter = 0       # accepted counterfactual rows so far
cf_frames = []       # accepted frames, concatenated once after the loop
failed_queries = 0   # query rows for which DiCE raised or returned nothing usable
last_error = None
for i, (index, row) in enumerate(X_sample_majority_class.iterrows()):
    if cf_counter >= classes_gap:
        break
    # Progress: i counts query rows tried so far, not counterfactuals accepted.
    if i % 500 == 0:
        print(f'{i}/{classes_gap}')
    try:
        # NOTE(review): iterrows() yields each row as a single Series, so the query frame
        # may not keep the per-column dtypes of X_sample - confirm DiCE restores them.
        e1 = exp.generate_counterfactuals(pd.DataFrame(row).T, total_CFs=cf_per, desired_class="opposite")
        cf_df = e1.cf_examples_list[0].final_cfs_df
        # Keep only the rows labelled as the minority class (all of them are checked,
        # not just the first, so the loop stays correct when cf_per > 1).
        accepted_cfs = cf_df[cf_df[target] == minority_class]
    except Exception as err:
        # Best effort: one failed query must not stop the run. KeyboardInterrupt is
        # deliberately not caught so the loop can still be interrupted by hand.
        failed_queries += 1
        last_error = err
        continue
    if accepted_cfs.empty:
        continue
    cf_frames.append(accepted_cfs)
    cf_counter += len(accepted_cfs)

if failed_queries:
    print(f'{failed_queries} query rows failed; last error: {last_error!r}')

# Single concatenation after the loop; trimmed so the classes end up balanced at most.
augmented_data = pd.concat(cf_frames).head(classes_gap) if cf_frames else pd.DataFrame()

100%|██████████| 1/1 [00:00<00:00,  4.10it/s]
100%|██████████| 1/1 [00:00<00:00,  1.56it/s]
100%|██████████| 1/1 [00:00<00:00,  5.07it/s]
100%|██████████| 1/1 [00:00<00:00,  2.94it/s]
100%|██████████| 1/1 [00:00<00:00,  3.44it/s]
100%|██████████| 1/1 [00:00<00:00,  4.59it/s]
100%|██████████| 1/1 [00:00<00:00,  4.98it/s]
100%|██████████| 1/1 [00:00<00:00,  5.47it/s]
100%|██████████| 1/1 [00:00<00:00,  5.92it/s]
100%|██████████| 1/1 [00:00<00:00,  2.94it/s]
100%|██████████| 1/1 [00:00<00:00,  5.16it/s]
100%|██████████| 1/1 [00:00<00:00,  4.55it/s]
100%|██████████| 1/1 [00:00<00:00,  3.29it/s]
100%|██████████| 1/1 [00:00<00:00,  4.42it/s]
100%|██████████| 1/1 [00:00<00:00,  4.87it/s]
100%|██████████| 1/1 [00:00<00:00,  2.63it/s]
100%|██████████| 1/1 [00:00<00:00,  5.67it/s]
100%|██████████| 1/1 [00:00<00:00,  5.71it/s]
100%|██████████| 1/1 [00:00<00:00,  5.00it/s]
100%|██████████| 1/1 [00:00<00:00,  1.79it/s]
100%|██████████| 1/1 [00:00<00:00,  3.45it/s]
100%|██████████| 1/1 [00:00<00:00,

4500/4649


100%|██████████| 1/1 [00:00<00:00,  5.45it/s]
100%|██████████| 1/1 [00:00<00:00,  5.39it/s]
100%|██████████| 1/1 [00:00<00:00,  5.10it/s]
100%|██████████| 1/1 [00:00<00:00,  2.83it/s]
100%|██████████| 1/1 [00:00<00:00,  2.61it/s]
100%|██████████| 1/1 [00:00<00:00,  3.91it/s]
100%|██████████| 1/1 [00:00<00:00,  5.47it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Fail with a clear message instead of an opaque KeyError when nothing was generated.
if augmented_data.empty:
    raise ValueError('No counterfactual rows were generated; nothing to augment the sample with.')

# The label column of the counterfactual frame is dropped and every counterfactual row is
# labelled with the minority class explicitly.
cf_features = augmented_data.drop(target, axis=1)
cf_labels = pd.Series([minority_class] * len(cf_features), name=y_sample.name)

# ignore_index gives features and labels the same fresh 0..n-1 index; without it the two
# objects carry different, duplicated index labels for the appended rows.
X_augmented_cf = pd.concat([X_sample, cf_features], ignore_index=True)
y_augmented_cf = pd.concat([y_sample, cf_labels], ignore_index=True)
assert len(X_augmented_cf) == len(y_augmented_cf)

# Class sizes after augmentation.
display(y_augmented_cf.value_counts())

print(f'{sample_frac*100}% sampled adult dataset with CF {scoring} scores:')
best_est_cf_random, sampled_scores = utils.fit_and_evaluate(
    X_augmented_cf,
    y_augmented_cf,
    search_estimators=search_pipelines,
    search_params=search_parameters,
    scoring=scoring,
)

# One row per key of the returned score mapping, a single column for this experiment.
cf_result_df = pd.DataFrame.from_dict(
    sampled_scores,
    orient='index',
    columns=[f'CF {scoring} score'],
)
cf_result_df.T

# summary

In [None]:
# Side-by-side comparison: one column per experiment, one row per score key.
result_df = pd.concat(
    [
        whole_dataset_result_df,
        sampled_dataset_result_df,
        ros_result_df,
        smote_result_df,
        adasyn_result_df,
        cf_result_df,
    ],
    axis='columns',
)
result_df

In [None]:
# Persist the summary table under log/, creating the folder if it does not exist yet.
log_dir = Path('log')
log_dir.mkdir(parents=True, exist_ok=True)

# round() instead of int(): sample_frac * 100 can land just below a whole number
# (0.29 * 100 == 28.999999999999996), which int() would truncate to the wrong percentage.
result_df.to_csv(log_dir / f'adult_{scoring}_{round(sample_frac * 100)}%.csv')