In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
import numpy as np

from helpers import utils, pipelines, models

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler, SMOTE, SMOTENC, ADASYN

import dice_ml

import multiprocessing

from collections.abc import Iterable
from functools import partial

from copy import deepcopy

### Define research parameters

In [4]:
scoring = 'f1'#roc_auc#f1#accuracy

test_size_proportion=0.33

sample_frac = 0.05

search_pipelines = pipelines.get_classification_pipelines()
search_parameters = models.parameters

### Read and preprocess data

In [5]:
raw_data = pd.read_csv(r"datasets\german_credit.csv", index_col=0)
df = utils.preprocess_german(raw_data)
target = 'Risk'

#### split data

In [6]:
# all data
X = df.drop(target, axis=1)
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size_proportion, random_state=42)

In [7]:
display(y_train.value_counts())

1    471
0    199
Name: Risk, dtype: int64

### Whole dataset scores

In [7]:
print(f'whole adult dataset {scoring} scores:')
whole_best_ests, whole_scores = utils.fit_and_evaluate(X_train, y_train, X_test, y_test,
                    search_estimators=search_pipelines, search_params=search_parameters, scoring=scoring)
whole_dataset_result_df = pd.DataFrame.from_dict(whole_scores, orient='index', columns=[f'full data {scoring} score'])
whole_dataset_result_df.T

whole adult dataset f1 scores:


Unnamed: 0,lg,rf,xgb
full data f1 score,0.823529,0.822669,0.815109


### Random over sampling (all classes but the majority class until balanced)

In [8]:
ros = RandomOverSampler(random_state=42) # resample all classes but the majority class
X_resampled_ros, y_resampled_ros = ros.fit_resample(X_train, y_train)
display(y_resampled_ros.value_counts())

print(f'{sample_frac*100}% sampled adult dataset with Random Over Sampling {scoring} scores:')

_, sampled_scores = utils.fit_and_evaluate(X_resampled_ros, y_resampled_ros, X_test, y_test,
                        search_estimators=search_pipelines, search_params=search_parameters, scoring=scoring)
ros_result_df = pd.DataFrame.from_dict(sampled_scores, orient='index', columns=[f'ROS {scoring} score'])
ros_result_df.T

1    471
0    471
Name: Risk, dtype: int64

5.0% sampled adult dataset with Random Over Sampling f1 scores:


Unnamed: 0,lg,rf,xgb
ROS f1 score,0.759259,0.775414,0.788546


### SMOTE & ADASYN (all classes but the majority class until balanced)
working just on numerical so need to transform before, so we insert it to the pipeline after the preproccess step

In [9]:
from sklearn.compose import ColumnTransformer, make_column_selector

ct = ColumnTransformer([
    ('categorical', pipelines.categorical_pipe, make_column_selector(dtype_include=['object'])),
], remainder='passthrough')

preprocess_X_train = ct.fit_transform(X_train)
preprocess_X_test = ct.transform(X_test)

In [10]:
# # add to pipeline
# smote_pipelines = deepcopy(search_pipelines)
# for n, p in smote_pipelines.items():
#     p.steps.insert(1, ('smote', SMOTE(random_state=42)))


# print(f'{sample_frac*100}% sampled adult dataset with SMOTE {scoring} scores:')
# _, sampled_scores = utils.fit_and_evaluate(X_sample_train, y_sample_train, X_sample_test, y_sample_test,
#                         search_estimators=smote_pipelines, search_params=search_parameters, scoring=scoring)
# smote_result_df = pd.DataFrame.from_dict(sampled_scores, orient='index', columns=[f'SMOTE {scoring} score'])
# smote_result_df.T

In [11]:
smote = SMOTE(random_state=42) # resample all classes but the majority class
X_resampled_smote, y_resampled_smote = smote.fit_resample(pd.DataFrame(preprocess_X_train), y_train)
display(y_resampled_smote.value_counts())

print(f'{sample_frac*100}% sampled adult dataset with SMOTE {scoring} scores:')
smote_best_ests, sampled_scores = utils.fit_and_evaluate(X_resampled_smote, y_resampled_smote, preprocess_X_test, y_test,
                        search_estimators=search_pipelines, search_params=search_parameters, scoring=scoring)
smote_result_df = pd.DataFrame.from_dict(sampled_scores, orient='index', columns=[f'SMOTE {scoring} score'])
smote_result_df.T

1    471
0    471
Name: Risk, dtype: int64

5.0% sampled adult dataset with SMOTE f1 scores:


Unnamed: 0,lg,rf,xgb
SMOTE f1 score,0.763889,0.786957,0.816327


In [12]:
smote = SMOTENC(random_state=42, categorical_features=['Sex', 'Job', 'Housing', 'Saving accounts', 'Checking account', 'Purpose']) # resample all classes but the majority class
X_resampled_smote, y_resampled_smote = smote.fit_resample(X_train, y_train)
display(y_resampled_smote.value_counts())

print(f'{sample_frac*100}% sampled adult dataset with SMOTENC {scoring} scores:')
smote_best_ests, sampled_scores = utils.fit_and_evaluate(X_resampled_smote, y_resampled_smote, X_test, y_test,
                        search_estimators=search_pipelines, search_params=search_parameters, scoring=scoring)
smote_result_df = pd.DataFrame.from_dict(sampled_scores, orient='index', columns=[f'SMOTENC {scoring} score'])
smote_result_df.T

1    471
0    471
Name: Risk, dtype: int64

5.0% sampled adult dataset with SMOTENC f1 scores:


Unnamed: 0,lg,rf,xgb
SMOTENC f1 score,0.785219,0.791946,0.812362


In [31]:
additional_samples = ((y_train.value_counts(normalize=True))*(len(y_train)*0.5)).astype(int)
sample_strategy = (additional_samples + y_train.value_counts()).to_dict()
sampler = SMOTENC(random_state=42, categorical_features=['Sex', 'Job', 'Housing', 'Saving accounts', 'Checking account', 'Purpose'],
                    sampling_strategy=sample_strategy)

X_resampled_smote, y_resampled_smote = sampler.fit_resample(X_train, y_train)
display(y_resampled_smote.value_counts())

print(f'{sample_frac*100}% sampled adult dataset with SMOTENC {scoring} scores:')
smote_best_ests, sampled_scores = utils.fit_and_evaluate(X_resampled_smote, y_resampled_smote, X_test, y_test,
                        search_estimators=search_pipelines, search_params=search_parameters, scoring=scoring)
smote_result_df = pd.DataFrame.from_dict(sampled_scores, orient='index', columns=[f'SMOTENC {scoring} score'])
smote_result_df.T

{1: 706, 0: 298}


1    706
0    298
Name: Risk, dtype: int64

5.0% sampled adult dataset with SMOTENC f1 scores:


Unnamed: 0,lg,rf,xgb
SMOTENC f1 score,0.819277,0.819853,0.817427


In [12]:
adasyn = ADASYN(random_state=42) # resample all classes but the majority class
X_resampled_adasyn, y_resampled_adasyn = adasyn.fit_resample(pd.DataFrame(preprocess_X_train), y_train) #X_sample

print(f'{sample_frac*100}% sampled adult dataset with ADASYN {scoring} scores:')
_, sampled_scores = utils.fit_and_evaluate(X_resampled_adasyn, y_resampled_adasyn, preprocess_X_test, y_test,
                        search_estimators=search_pipelines, search_params=search_parameters, scoring=scoring)
adasyn_result_df = pd.DataFrame.from_dict(sampled_scores, orient='index', columns=[f'ADASYN {scoring} score'])
adasyn_result_df.T

5.0% sampled adult dataset with ADASYN f1 scores:


Unnamed: 0,lg,rf,xgb
ADASYN f1 score,0.76815,0.785714,0.813142


### Counter Factuals

need to test many variables (differents generation methods / proximity vs diversity / balancing data / etc.)

In [13]:
from sklearn.pipeline import Pipeline
# from sklearn.svm import SVC

# svc_pipeline = {'svc': Pipeline([('column_transformer', pipelines.preprocessor),('model', SVC(random_state=42))])}
# svc_params = {'svc': {'model__C': [0.5, 1, 5], 'model__kernel': ['linear', 'rbf'], 'model__gamma': ['scale', 'auto']}}
# best_svc_estimator, sampled_scores = utils.fit_and_evaluate(X_sample, y_sample,
#                         search_estimators=svc_pipeline, search_params=svc_params, scoring=scoring)
# pd.DataFrame.from_dict(sampled_scores, orient='index', columns=[f'sampled data {scoring} score']).T

# from lightgbm import LGBMClassifier

# gbm_pipeline = {'gbm': Pipeline([('column_transformer', pipelines.preprocessor),('model', LGBMClassifier(random_state=42, verbose=-1))])}
# gbm_params = {'gbm': {
#     'model__max_depth': [5, 6, 7], 
#                       'model__min_child_samples': [30, 50], 
#                         'model__num_leaves': [25, 55], 
#                         'model__learning_rate': [0.1, 0.3, 0.5],
#                         'model__reg_lambda': [0, 0.5, 1.5, 3],
#                         }}
# best_gbm_estimator, sampled_scores = utils.fit_and_evaluate(X_train, y_train, X_test, y_test,
#                         search_estimators=gbm_pipeline, search_params=gbm_params, scoring=scoring)
# pd.DataFrame.from_dict(sampled_scores, orient='index', columns=[f'sampled data {scoring} score']).T

from sklearn.neighbors import KNeighborsClassifier


knn_pipeline = {'knn': Pipeline([('column_transformer', pipelines.preprocessor),('model', KNeighborsClassifier())])}
knn_params = {'knn': {'model__n_neighbors': list(range(1,21)), 'model__weights': ['uniform', 'distance'],
                      'model__p': [1,2], 'model__algorithm': ['ball_tree', 'kd_tree', 'brute'],}}
best_cf_estimator, sampled_scores = utils.fit_and_evaluate(X_train, y_train, X_test, y_test,
                        search_estimators=knn_pipeline, search_params=knn_params, scoring=scoring)
pd.DataFrame.from_dict(sampled_scores, orient='index', columns=[f'sampled data {scoring} score']).T

Unnamed: 0,knn
sampled data f1 score,0.829175


In [14]:
best_cf_estimator['knn']['model'].get_params()

{'algorithm': 'ball_tree',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 19,
 'p': 1,
 'weights': 'uniform'}

In [15]:
d = dice_ml.Data(dataframe=df, continuous_features=['Age', 'Credit amount', 'Duration'], outcome_name=target)
m = dice_ml.Model(model=best_cf_estimator['knn'], backend="sklearn")

In [16]:
exp = dice_ml.Dice(d, m, method="random") # need to check diferent method (random, genetic, kdtree)

minority_class = y_train.value_counts().idxmin()
majority_class = y_train.value_counts().idxmax()
classes_gap = y_train.value_counts()[majority_class]-y_train.value_counts()[minority_class]

# X_sample_minority_class = df_sample[df_sample[target] == minority_class].drop(target, axis=1)
X_sample_majority_class = X_train[y_train == majority_class]

augmented_data = pd.DataFrame()
cf_per = 1
cf_counter = 0
for i, (index, row) in enumerate(X_sample_majority_class.iterrows()):
    if cf_counter >= classes_gap: break
    if i%500 == 0: print(f'{i}/{classes_gap}')
    try:
        e1 = exp.generate_counterfactuals(pd.DataFrame(row).T, total_CFs=cf_per, desired_class="opposite",
                                          verbose=False,
                                        #   proximity_weight=1, diversity_weight=0
                                          )#, verbose=False)
        cf_df = e1.cf_examples_list[0].final_cfs_df
        if cf_df[target].iloc[0]!=minority_class: continue
        augmented_data = pd.concat([augmented_data, cf_df])
        cf_counter += len(cf_df)
    except:
        pass

0/272


100%|██████████| 1/1 [00:00<00:00,  3.98it/s]
100%|██████████| 1/1 [00:00<00:00,  2.62it/s]
100%|██████████| 1/1 [00:00<00:00,  2.62it/s]
100%|██████████| 1/1 [00:00<00:00,  4.47it/s]
100%|██████████| 1/1 [00:00<00:00,  3.80it/s]
100%|██████████| 1/1 [00:00<00:00,  3.87it/s]
100%|██████████| 1/1 [00:00<00:00,  3.84it/s]
100%|██████████| 1/1 [00:00<00:00,  4.52it/s]
100%|██████████| 1/1 [00:00<00:00,  5.82it/s]
100%|██████████| 1/1 [00:00<00:00,  5.22it/s]
100%|██████████| 1/1 [00:00<00:00,  4.34it/s]
100%|██████████| 1/1 [00:00<00:00,  4.04it/s]
100%|██████████| 1/1 [00:00<00:00,  3.78it/s]
100%|██████████| 1/1 [00:00<00:00,  3.79it/s]
100%|██████████| 1/1 [00:00<00:00,  3.16it/s]
100%|██████████| 1/1 [00:00<00:00,  2.83it/s]
100%|██████████| 1/1 [00:00<00:00,  3.93it/s]
100%|██████████| 1/1 [00:00<00:00,  4.82it/s]
100%|██████████| 1/1 [00:00<00:00,  3.88it/s]
100%|██████████| 1/1 [00:00<00:00,  2.46it/s]
100%|██████████| 1/1 [00:00<00:00,  3.67it/s]
100%|██████████| 1/1 [00:00<00:00,

In [17]:
X_augmented_cf = pd.concat([X_train, augmented_data.drop(target, axis=1)]).astype(X_train.dtypes)
y_augmented_cf = pd.concat([y_train, pd.Series([minority_class]*len(augmented_data))])

display(y_augmented_cf.value_counts())

print(f'{sample_frac*100}% sampled adult dataset with CF {scoring} scores:')
best_est_cf_random, sampled_scores = utils.fit_and_evaluate(X_augmented_cf, y_augmented_cf, X_test, y_test,
                        search_estimators=search_pipelines, search_params=search_parameters, scoring=scoring)
cf_result_df = pd.DataFrame.from_dict(sampled_scores, orient='index', columns=[f'CF {scoring} score'])
cf_result_df.T

1    471
0    471
dtype: int64

5.0% sampled adult dataset with CF f1 scores:


Unnamed: 0,lg,rf,xgb
CF f1 score,0.802632,0.768519,0.809524


# summary

In [18]:
result_df = pd.concat([whole_dataset_result_df, ros_result_df, smote_result_df, adasyn_result_df, cf_result_df], axis=1)
result_df

Unnamed: 0,full data f1 score,ROS f1 score,SMOTE f1 score,ADASYN f1 score,CF f1 score
lg,0.823529,0.759259,0.763889,0.76815,0.802632
rf,0.822669,0.775414,0.786957,0.785714,0.768519
xgb,0.815109,0.788546,0.816327,0.813142,0.809524


In [19]:
result_df.to_csv(rf'log/german_{scoring}.csv')

In [20]:
best_est_cf_random['xgb']['model'].get_params()['reg_lambda']
smote_best_ests['xgb']['model'].get_params()['reg_lambda']

0.5