In [1]:
import sklearn
# import shap

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

# Import base classifiers
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier
from baselines import AdaFairClassifier
from imbens.ensemble import SMOTEBoostClassifier, SMOTEBaggingClassifier, RUSBoostClassifier, UnderBaggingClassifier, SelfPacedEnsembleClassifier
from fairlearn.postprocessing import ThresholdOptimizer
from fairens import FairAugEnsemble, FairEnsemble

# Set GPU for matrix computations
import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'
if device == 'cuda':
    torch.cuda.set_device(0)
    device_id = torch.cuda.current_device()
    print (f"Now using GPU #{device_id}:\n{torch.cuda.get_device_name(device_id)}")

# Import utilities
from data import FairDataset    # This is a custom class that we will use to load the datasets
from eval import evaluate_multi_split, verbose_print
from trainer import Trainer
from utils import seed_generator, dict_info, describe_data

pip install 'aif360[AdversarialDebiasing]'
pip install 'aif360[FairAdapt]'


Now using GPU #0:
Tesla V100-SXM2-32GB


# Load dataset


In [2]:
SEED = 42
n_splits = 5
i_split = 0

dataset_kwargs = {
    'y_col': 'label',
    'train_size': 0.6,
    'val_size': 0.2,
    'test_size': 0.2,
    'concat_train_val': True,
    'normalize': True,
    'random_state': 42,
}

dataname = 'adult'
s_attr = 'gender'
# dataname = 'compas'
# s_attr = 'sex'
data = FairDataset(
    dataname=dataname,
    csv_path=f'./data/{dataname}.csv',
    s_col=s_attr,
    x_with_s=True,
    **dataset_kwargs
)

data.describe()

(
    (X_train, y_train, s_train),
    (X_val, y_val, s_val),
    (X_test, y_test, s_test),
    (idx_train, idx_val, idx_test)
) = data.get_subgroup_split(i_split=i_split, random_state=SEED, n_splits=n_splits)

classes = np.unique(y_train)
n_feat = X_train.shape[1]
n_class = len(classes)

Dataset    : adult (45222, 99) load from ./data/adult.csv
Sens/Res   : gender/label
Split      : train/test = 0.8/0.2, random_state = 42, x_with_s = True
train data [#samples 36177 #features 98]:
+-----+-------+-------+------------+
|     |   y=0 |   y=1 |   pos_rate |
| s=0 | 10421 |  1335 |     0.1136 |
+-----+-------+-------+------------+
| s=1 | 16790 |  7631 |     0.3125 |
+-----+-------+-------+------------+
test data [#samples 9045 #features 98]:
+-----+-------+-------+------------+
|     |   y=0 |   y=1 |   pos_rate |
| s=0 |  2605 |   334 |     0.1136 |
+-----+-------+-------+------------+
| s=1 |  4198 |  1908 |     0.3125 |
+-----+-------+-------+------------+



In [3]:
data_split_info = f"{data.fullname}_split[{i_split+1}of{n_splits}]_seed[{SEED}]"
print(data_split_info)

adult_gender_split[1of5]_seed[42]


In [4]:
data.df.describe()

Unnamed: 0,gender,capital-gain,race,age,education-num,capital-loss,hours-per-week,workclass=Federal-gov,workclass=Local-gov,workclass=Private,...,native-country=Puerto-Rico,native-country=Scotland,native-country=South,native-country=Taiwan,native-country=Thailand,native-country=Trinadad&Tobago,native-country=United-States,native-country=Vietnam,native-country=Yugoslavia,label
count,45222.0,45222.0,45222.0,45222.0,45222.0,45222.0,45222.0,45222.0,45222.0,45222.0,...,45222.0,45222.0,45222.0,45222.0,45222.0,45222.0,45222.0,45222.0,45222.0,45222.0
mean,0.675048,0.011014,0.860267,0.295177,0.607897,0.020339,0.407531,0.031091,0.068551,0.736522,...,0.00387,0.000442,0.002233,0.001216,0.000641,0.000575,0.913095,0.001835,0.000509,0.247844
std,0.468362,0.075065,0.346714,0.181067,0.170192,0.092965,0.122526,0.173566,0.252691,0.440524,...,0.062088,0.021026,0.047207,0.034854,0.025316,0.023971,0.281698,0.042803,0.022547,0.431766
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.0,0.150685,0.533333,0.0,0.397959,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
50%,1.0,0.0,1.0,0.273973,0.6,0.0,0.397959,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
75%,1.0,0.0,1.0,0.410959,0.8,0.0,0.44898,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
s_prv, s_prt = 1, 0

msk_prv, msk_prt = s_train == s_prv, s_train == s_prt
n_prv, n_prt = msk_prv.sum(), msk_prt.sum()
idx_prv, idx_prt = np.where(msk_prv)[0], np.where(msk_prt)[0]

X_prv, y_prv = X_train[idx_prv], y_train[idx_prv]
X_prt, y_prt = X_train[idx_prt], y_train[idx_prt]

X_prv.shape, X_prt.shape

((18316, 98), (8817, 98))

In [6]:
from utils_unloc import DataProcessor

dp = DataProcessor()

df_train = pd.DataFrame(X_train, columns=data.feature_names)
df_train = df_train.drop(data.s_col, axis=1)
idx_feats, num_feats, name_feats = dp.parse_feature_types(df_train, verbose=True)

////// Feature Numbers //////
Total:        97 
Continuous:   5  	 e.g., ['capital-gain', 'age', 'education-num', 'capital-loss', 'hours-per-week'] 
Categorical:  1  	 e.g., ['race'] 
One-hot:      91 	 e.g., ['workclass=Federal-gov', 'workclass=Local-gov', 'workclass=Private', 'workclass=Self-emp-inc', 'workclass=Self-emp-not-inc']
Raw one-hot:  6  	 e.g., ['education', 'occupation', 'workclass', 'relationship', 'marital-status']
Raw total:    12 


In [7]:
df_train_encoded, feats_con, feats_cat = dp.get_reverse_onehot_dataframe(df_train, name_feats)
df_train_encoded

Unnamed: 0,capital-gain,age,education-num,capital-loss,hours-per-week,race,workclass,education,marital-status,occupation,relationship,native-country
0,0.000000,0.287671,0.533333,0.0,0.397959,1,2,11,0,5,1,38
1,0.000000,0.150685,0.800000,0.0,0.397959,0,2,9,2,9,5,4
2,0.000000,0.479452,0.533333,0.0,0.448980,1,4,11,2,3,0,38
3,0.000000,0.205479,0.733333,0.0,0.500000,0,2,7,4,11,1,38
4,0.000000,0.232877,0.200000,0.0,0.448980,0,2,5,2,13,0,25
...,...,...,...,...,...,...,...,...,...,...,...,...
27128,0.000000,0.136986,0.600000,0.0,0.448980,1,2,15,4,11,1,38
27129,0.000000,0.068493,0.600000,0.0,0.397959,1,2,15,4,2,3,38
27130,0.000000,0.520548,0.533333,0.0,0.316327,1,2,11,5,8,1,38
27131,0.000000,0.561644,0.733333,0.0,0.357143,1,2,7,0,9,1,38


In [8]:
df_train_raw = dp.get_raw_dataframe(df_train, data, train_idx=idx_train, name_feats=name_feats)
df_train_raw

Unnamed: 0,capital-gain,age,education-num,capital-loss,hours-per-week,race,workclass,education,marital-status,occupation,relationship,native-country,Y,S
0,0.0,38.0,9.0,0.0,40.0,1.0,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,United-States,0.0,1.0
1,0.0,28.0,13.0,0.0,40.0,0.0,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Cuba,0.0,0.0
2,0.0,52.0,9.0,0.0,45.0,1.0,Self-emp-not-inc,HS-grad,Married-civ-spouse,Exec-managerial,Husband,United-States,1.0,1.0
3,0.0,32.0,12.0,0.0,50.0,0.0,Private,Assoc-acdm,Never-married,Sales,Not-in-family,United-States,0.0,1.0
4,0.0,34.0,4.0,0.0,45.0,0.0,Private,7th-8th,Married-civ-spouse,Transport-moving,Husband,Mexico,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27128,0.0,27.0,10.0,0.0,45.0,1.0,Private,Some-college,Never-married,Sales,Not-in-family,United-States,0.0,0.0
27129,0.0,22.0,10.0,0.0,40.0,1.0,Private,Some-college,Never-married,Craft-repair,Own-child,United-States,0.0,1.0
27130,0.0,55.0,9.0,0.0,32.0,1.0,Private,HS-grad,Separated,Priv-house-serv,Not-in-family,United-States,0.0,0.0
27131,0.0,58.0,12.0,0.0,36.0,1.0,Private,Assoc-acdm,Divorced,Prof-specialty,Not-in-family,United-States,0.0,1.0


In [9]:
from utils_unloc import ComparableSampleAnalyzer

prox_kwargs = {
    't_con': 0.1, 't_cat': 1, 'restart_prob': 0.1, 'mat_norm': 'sym', 
    'relax': True, 'max_relax': 3, 'relax_factor': 0.1, 
    'include_self': False
}

csa = ComparableSampleAnalyzer(
    df_train_encoded, feats_con, feats_cat, 
    save_path='data_cache', data_setting=data_split_info, 
    compress=False, verbose=True
)
P = csa.get_proximity(**prox_kwargs)

////// Comparable Analyzer //////
Data shape: (27133, 12)
[5] Continuous features: ['capital-gain', 'age', 'education-num', 'capital-loss', 'hours-per-week']
[7] Categorical features: ['race', 'education', 'occupation', 'workclass', 'relationship', 'marital-status', 'native-country']



Matrix loaded from data_cache/proximity/adult_gender_split[1of5]_seed[42]_TC=0.1_TD=1_RP=0.1_norm=sym_matrix.h5


In [10]:
def get_filter_mask(i, y, s, group_constraint='na', label_constraint='na'):
    filter_msk = np.ones(len(y), dtype=bool)
    if group_constraint == 'na':
        pass
    elif group_constraint == 'same':
        filter_msk[s != s[i]] = False
    elif group_constraint == 'diff':
        filter_msk[s == s[i]] = False
    else:
        raise NotImplementedError(f"Invalid group_constraint: {group_constraint}")

    if label_constraint == 'na':
        pass
    elif label_constraint == 'same':
        filter_msk[y != y[i]] = False
    elif label_constraint == 'diff':
        filter_msk[y == y[i]] = False
    else:
        raise NotImplementedError(f"Invalid label_constraint: {label_constraint}")
    
    return filter_msk

def get_mixup_idx(i, P, y, s, group_constraint='na', label_constraint='na', top_n=10, random_seed=None):
    """Get the most similar index with highest correspondence score (P)"""
    # print (f"P shape: {P.shape} y shape: {y.shape} s shape: {s.shape}, i: {i}")
    filter_msk = get_filter_mask(i, y, s, group_constraint, label_constraint)
    # print (f"group_constraint: {group_constraint} s: s[{i}] = {s[i]}, s[filter_msk] = {s[filter_msk]}")
    # print (f"label_constraint: {label_constraint} y: y[{i}] = {y[i]}, y[filter_msk] = {y[filter_msk]}")
    filter_idx = np.where(filter_msk)[0]
    filter_sort_idx = np.argsort(P[i][filter_idx])
    cand_idx = filter_idx[filter_sort_idx][-top_n:][::-1]
    # print (cand_idx, P[i][cand_idx])
    # permute the comparable indices
    if random_seed is not None:
        np.random.seed(random_seed)
    # randomly select one index from the top_n
    return np.random.choice(cand_idx)

def mixup_instance(i, mixup_i, df_x, df_x_input, name_feats, seed=None):
    feats_encoded = df_x.columns
    feats_input = df_x_input.columns

    if seed is not None:
        np.random.seed(seed)
    mix_weight = np.random.uniform(0, 1, 1)[0]

    x_seed_enc = df_x.iloc[i].values
    x_mix_enc = df_x.iloc[mixup_i].values
    
    x_new_enc = []
    x_new_input = np.zeros(len(feats_input))
    
    for idx_feat_enc, feat_enc in enumerate(feats_encoded):
        if feat_enc in name_feats['con']:
            # continuous feature, linearly mix the two samples
            value = value_input = mix_weight * x_seed_enc[idx_feat_enc] + (1-mix_weight) * x_mix_enc[idx_feat_enc]
            feat_input = feat_enc
        elif feat_enc in name_feats['cat']:
            # categorical feature, randomly choose one of the two samples with the given mix_weight
            value = value_input = np.random.choice([x_seed_enc[idx_feat_enc], x_mix_enc[idx_feat_enc]], p=[mix_weight, 1-mix_weight])
            feat_input = feat_enc
        elif feat_enc in name_feats['onehot_raw']:
            # onehot feature, randomly choose one of the two samples with the given mix_weight
            value = np.random.choice([x_seed_enc[idx_feat_enc], x_mix_enc[idx_feat_enc]], p=[mix_weight, 1-mix_weight])
            value_input = 1
            feat_input = f"{feat_enc}={value}"
        else:
            raise NotImplementedError(f"Feature type not found for: {feat_enc}")

        # fill value in the encoded space
        x_new_enc.append(value)
        # fill value in the input space
        idx_feat_input = feats_input.get_loc(feat_input)
        x_new_input[idx_feat_input] = value_input

    x_new_enc = np.array(x_new_enc)
    return x_new_enc, x_new_input

i = 29
mixup_i = get_mixup_idx(i, P, y_train, s_train, group_constraint='same', label_constraint='diff', top_n=10, random_seed=42)
print(i, mixup_i)
print(f's_i = {s_train[i]}, s_j = {s_train[mixup_i]}')
print(f'y_i = {y_train[i]}, y_j = {y_train[mixup_i]}')

df_train_nooh, _, _= dp.get_reverse_onehot_dataframe(df_train, name_feats, encode=False)

x_new_enc, x_new_input = mixup_instance(i, mixup_i, df_train_nooh, df_train, name_feats, seed=42)

x_new_enc, x_new_input

29 24194
s_i = 0.0, s_j = 0.0
y_i = 0.0, y_j = 1.0


(array(['0.030860499141062547', '0.3544709540199714', '0.8', '0.0',
        '0.4298704020996243', '1.0', 'Private', 'Bachelors', 'Divorced',
        'Exec-managerial', 'Not-in-family', 'United-States'], dtype='<U32'),
 array([0.0308605 , 1.        , 0.35447095, 0.8       , 0.        ,
        0.4298704 , 0.        , 0.        , 1.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 1.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 1.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 1.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 1.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.

In [11]:
from utils_unloc import DataProcessor, UnfairnessAttributer, ComparableSampleAnalyzer
from utils import generate_random_seeds

class UnLoc():
    
    def __init__(self) -> None:
        pass

    def _save_df(self, df, name, path='./data_cache/score'):
        df.to_csv(f"{path}/{name}.csv", index=False)

    def _load_df(self, name, path='./data_cache/score'):
        return pd.read_csv(f"{path}/{name}.csv")

    def compute_df_encoded(self, X, features, s_col):
        dp = DataProcessor()
        df_train_withs = pd.DataFrame(X, columns=features)
        idx_feats, num_feats, name_feats = dp.parse_feature_types(df_train_withs, verbose=False)
        df_train_withs_nooh, feats_con, feats_cat = dp.get_reverse_onehot_dataframe(df_train_withs, name_feats, encode=False)
        self.df_train_withs = df_train_withs
        self.df_train_withs_nooh = df_train_withs_nooh
        self.name_feats_withs = name_feats
        
        # we dont use sensitive attribute for computing comparable samples
        df_train = pd.DataFrame(X, columns=features).drop(s_col, axis=1)
        idx_feats, num_feats, name_feats = dp.parse_feature_types(df_train, verbose=False)
        df_train_encoded, feats_con, feats_cat = dp.get_reverse_onehot_dataframe(df_train, name_feats)

        self.df_train = df_train
        self.df_train_encoded = df_train_encoded
        self.feats_con = feats_con
        self.feats_cat = feats_cat
        self.name_feats = name_feats
        
    def fit_transform(self, X, y, s, data_split_info, features, s_col, prox_kwargs, s_prv=1, s_prt=0):
        # guarantee X's 1st column is sensitive attribute
        assert (X[:, 0] == s).all()
        
        file_name = f"{data_split_info}_{str(prox_kwargs)}_score"
        
        self.compute_df_encoded(X, features, s_col)

        try:
            df_score = self._load_df(file_name)
            print(f"Loaded score from {file_name}.csv")
            return df_score
        except:
            pass

        P = self.get_proximity(X, y, s, data_split_info, features, s_col, prox_kwargs, s_prv=s_prv, s_prt=s_prt)

        # compute confidence score
        attributer = UnfairnessAttributer(P)
        df_score = attributer.compute_confidence(X, y, s, s_prv=s_prv, s_prt=s_prt)

        # save df_score
        self._save_df(df_score, file_name)
        print(f"Saved score to {file_name}.csv")

        return df_score

    def get_proximity(self, X, y, s, data_split_info, features, s_col, prox_kwargs, s_prv=1, s_prt=0):
        
        df_train_encoded, feats_con, feats_cat = self.df_train_encoded, self.feats_con, self.feats_cat
        
        csa = ComparableSampleAnalyzer(
            df_train_encoded, feats_con, feats_cat, 
            save_path='data_cache', data_setting=data_split_info, 
            compress=False, verbose=True
        )
        P = csa.get_proximity(**prox_kwargs)
        return P
    

    def fair_aug(
        self, aug_ratio, X_train, y_train, s_train, P, 
        s_prv=1, s_prt=0, filter=None, weights=None, dummy=False, random_seed=None, verbose=False
    ):
        if aug_ratio == 0:
            return X_train, y_train, s_train, 0
        
        if filter is not None:
            if filter == 'prt-pos':
                mask_seeds = (y_train == 1) & (s_train == s_prt)
            elif filter == 'pos':
                mask_seeds = (y_train == 1)
            else:
                raise NotImplementedError
        else: mask_seeds = np.ones(len(y_train), dtype=bool)
        
        if weights is None or dummy:
            weights = np.ones(len(y_train), dtype=float)
        else:
            assert weights.shape == y_train.shape

        seed_weights = weights[mask_seeds]
        seed_weights /= seed_weights.sum()
        
        # compute n aug
        n_aug = int(aug_ratio * len(seed_weights))
        
        # sample seeds
        idx_seeds = np.where(mask_seeds)[0]
        np.random.seed(random_seed)
        idx_aug_seeds = np.random.choice(
            idx_seeds, n_aug, p=seed_weights, replace=True
        )

        random_seeds = generate_random_seeds(n_aug, random_seed)

        X_new, y_new, s_new = [], [], []

        for i, idx_seed in enumerate(idx_aug_seeds):
            rand_seed = random_seeds[i]
            idx_mixup = get_mixup_idx(
                idx_seed, P, y_train, s_train,
                group_constraint='same', label_constraint='diff', 
                top_n=10, random_seed=rand_seed
            )
            _, x_new_input = mixup_instance(
                idx_seed, idx_mixup, 
                self.df_train_withs_nooh, 
                self.df_train_withs, 
                self.name_feats_withs, 
                seed=rand_seed
            )
            X_new.append(x_new_input)
            y_new.append(y_train[idx_seed])
            s_new.append(s_train[idx_seed])
        
        X_edited = np.concatenate([X_train, np.array(X_new)], axis=0)
        y_edited = np.concatenate([y_train, np.array(y_new)], axis=0)
        s_edited = np.concatenate([s_train, np.array(s_new)], axis=0)
        
        if verbose:
            print(
                f"{len(y_edited) - len(y_train)} samples added from the dataset.\n"
                f"Dataset shape: {X_train.shape} -> {X_edited.shape}\n"
                f"S distribution {np.unique(s_train, return_counts=True)} -> {np.unique(s_edited, return_counts=True)}\n"
                f"Y distribution {np.unique(y_train, return_counts=True)} -> {np.unique(y_edited, return_counts=True)}"
            )

        return X_edited, y_edited, s_edited, n_aug
        

    def fair_removal(
        self, X_train, y_train, s_train, weights, edit_ratio, how='removal',
        s_prv=1, s_prt=0, filter=None, verbose=False, dummy=False, random_state=42,
    ):
        # Sort indices by descending weights
        sorted_indices = np.argsort(weights)[::-1]

        if filter is not None:
            if filter == 'prt-neg':
                filter_idx = (y_train[sorted_indices] == 0) & (s_train[sorted_indices] == s_prt)
            elif filter == 'prv-pos':
                filter_idx = (y_train[sorted_indices] == 1) & (s_train[sorted_indices] == s_prv)
            elif filter == 'prt-neg-prv-pos':
                filter_idx = ((y_train[sorted_indices] == 0) & (s_train[sorted_indices] == s_prt)) | ((y_train[sorted_indices] == 1) & (s_train[sorted_indices] == s_prv))
            elif filter == 'prv':
                filter_idx = (s_train[sorted_indices] == s_prv)
            elif filter == 'prt':
                filter_idx = (s_train[sorted_indices] == s_prt)
            elif filter == 'neg':
                filter_idx = (y_train[sorted_indices] == 0)
            elif filter == 'pos':
                filter_idx = (y_train[sorted_indices] == 1)
            else:
                raise NotImplementedError

            sorted_indices = sorted_indices[filter_idx]
        
        n_edit = int(edit_ratio * len(sorted_indices))
        # select the removal indices
        edit_indices = sorted_indices[:n_edit]

        if dummy:
            # set seed
            np.random.seed(random_state)
            edit_indices = np.random.choice(sorted_indices, n_edit, replace=False)
        else:
            edit_indices = sorted_indices[:n_edit]
        
        # Create edit mask
        if how == 'removal':
            keep_mask = np.ones(len(X_train), dtype=bool)
            keep_mask[edit_indices] = False
            # Apply undersampled mask
            X_edited = X_train[keep_mask]
            y_edited = y_train[keep_mask]
            s_edited = s_train[keep_mask]
        elif how == 'relabel':
            relabel_mask = np.zeros(len(X_train), dtype=bool)
            relabel_mask[edit_indices] = True
            y_edited = y_train.copy()
            y_edited[relabel_mask] = 1 - y_edited[relabel_mask]
            X_edited = X_train
            s_edited = s_train
        else: raise NotImplementedError
        
        if verbose:
            print(
                f"{len(y_train) - len(y_edited)} samples removed from the dataset.\n"
                f"Dataset shape: {X_train.shape} -> {X_edited.shape}\n"
                f"S distribution {np.unique(s_train, return_counts=True)} -> {np.unique(s_edited, return_counts=True)}\n"
                f"Y distribution {np.unique(y_train, return_counts=True)} -> {np.unique(y_edited, return_counts=True)}"
            )
        return X_edited, y_edited, s_edited, n_edit

prox_kwargs = {
    't_con': 0.1, 't_cat': 2, 'restart_prob': 0.1, 'mat_norm': 'sym', 
    'relax': True, 'max_relax': 3, 'relax_factor': 0.1, 
    'include_self': False
}

unloc = UnLoc()
df_score = unloc.fit_transform(X_train, y_train, s_train, data_split_info, data.feature_names, data.s_col, prox_kwargs)
X_edited, y_edited, s_edited, n_edit = unloc.fair_removal(
    X_train, y_train, s_train, 
    # how='removal',
    how='relabel',
    weights=df_score['unf'], edit_ratio=1, 
    filter='prt-neg', verbose=True
)

X_edited, y_edited, s_edited, n_edit = unloc.fair_aug(
    aug_ratio=0.1, X_train=X_train, y_train=y_train, s_train=s_train, P=P, 
    s_prv=1, s_prt=0, filter='pos', weights=df_score['unf'], dummy=False, random_seed=42, verbose=True
)

Loaded score from adult_gender_split[1of5]_seed[42]_{'t_con': 0.1, 't_cat': 2, 'restart_prob': 0.1, 'mat_norm': 'sym', 'relax': True, 'max_relax': 3, 'relax_factor': 0.1, 'include_self': False}_score.csv
0 samples removed from the dataset.
Dataset shape: (27133, 98) -> (27133, 98)
S distribution (array([0., 1.]), array([ 8817, 18316])) -> (array([0., 1.]), array([ 8817, 18316]))
Y distribution (array([0., 1.]), array([20409,  6724])) -> (array([0., 1.]), array([12593, 14540]))
672 samples added from the dataset.
Dataset shape: (27133, 98) -> (27805, 98)
S distribution (array([0., 1.]), array([ 8817, 18316])) -> (array([0., 1.]), array([ 8903, 18902]))
Y distribution (array([0., 1.]), array([20409,  6724])) -> (array([0., 1.]), array([20409,  7396]))


In [12]:
"""Load Datasets"""

dataset_kwargs = {
    'y_col': 'label',
    'train_size': 0.6,
    'val_size': 0.2,
    'test_size': 0.2,
    'concat_train_val': True,
    'normalize': True,
    'random_state': 42,
}

all_datasets = {
    # 'compas': ['sex', 'race'],
    # 'adult': ['gender', 'race'],
    # 'bank': ['age', 'marital=married'],
    # 'lsa': ['gender', 'race'],
    # 'meps': ['SEX', 'RACE'],
    # 'adult': ['race'],
    'meps': ['SEX'],
    # 'meps': ['RACE'],
    # 'lsa_unfair_gender_race': ['gender', 'race'],
}

"""
Create a dictionary of datasets: dataset_zoo
key: dataset name
value: FairDataset object
"""
dataset_zoo = {}
for dataname, s_attrs in all_datasets.items():
    for s_attr in s_attrs:
        dataset = FairDataset(
            dataname=dataname,
            csv_path=f'./data/{dataname}.csv',
            s_col=s_attr,
            **dataset_kwargs
        )
        dataset_zoo[dataset.fullname] = dataset

        # dataset.describe()
        dataset.brief()

# Print the information of the datasets and models
print(
    f"////// Dataset ZOO //////\n"
    f"{dict_info(dataset_zoo)}\n"
)

Dataset    : meps (15839, 126) load from ./data/meps.csv
Sens/Res   : SEX/label
Split      : train/test = 0.8/0.2, random_state = 42, x_with_s = True
train      | size {0: 6067, 1: 6604} | grp_pos_ratio: {0: 0.1327, 1: 0.2078}
test       | size {0: 1517, 1: 1651} | grp_pos_ratio: {0: 0.1325, 1: 0.2078}

////// Dataset ZOO //////
meps_SEX: <data.FairDataset object at 0x7f8e868b6c10>




In [14]:
import tqdm
from sklearn.base import clone
from eval import evaluate

def run_unloc_aug_exp(
        unloc, dataset_zoo, base_model_zoo, edit_ratio_space, setting_space, prox_kwargs, how,
        n_splits=5, n_runs=1, random_state=42, verbose=False
):
    print(
        f"////// Baseline Experiment //////\n"
        f"Base Model Zoo:       {list(base_model_zoo.keys())}\n"
        f"Dataset Zoo:          {list(dataset_zoo.keys())}\n"
        f"edit_ratio_space:     {edit_ratio_space}\n"
        # f"setting_space:        {setting_space}\n"
        f"n_splits:             {n_splits}\n"
        f"n_runs:               {n_runs}\n"
        f"random_state:         {random_state}\n"
    )
    
    all_res = []

    for data_name, data in dataset_zoo.items():

        for base_model_name, base_model in base_model_zoo.items():

            for i_run in range(n_runs):

                print (f"Data: {data_name} | Run: {i_run} | Base: {base_model_name}")
                rand_seed = random_state + i_run

                base_model = base_model.set_params(random_state=rand_seed)
                
                for i_split in range(n_splits):
                    
                    print (f"Data: {data_name} | Run: {i_run} | Base: {base_model_name} | split {i_split}")
                    
                    # get the i-th split of a n-fold cross validation
                    (
                        (X_train, y_train, s_train),
                        (X_val, y_val, s_val),
                        (X_test, y_test, s_test),
                        (idx_train, idx_val, idx_test)
                    ) = data.get_subgroup_split(
                        i_split=i_split, 
                        random_state=rand_seed,
                        n_splits=n_splits
                    )
                    
                    data_split_info = f"{data.fullname}_split[{i_split+1}of{n_splits}]_seed[{rand_seed}]"
                    unloc = UnLoc()
                    df_score = unloc.fit_transform(
                        X_train, y_train, s_train, data_split_info, 
                        features=data.feature_names, 
                        s_col=data.s_col,
                        prox_kwargs=prox_kwargs,
                    )
                    df_score['unf_inv'] = df_score['unf'].max() - df_score['unf']
                    P = unloc.get_proximity(
                        X_train, y_train, s_train, data_split_info, 
                        features=data.feature_names, 
                        s_col=data.s_col,
                        prox_kwargs=prox_kwargs,
                    )

                    for setting_name, setting_kwargs in setting_space.items():
                            
                        weights = df_score[setting_kwargs['weights']]
                        dummy = setting_kwargs['dummy']
                        f = setting_kwargs['filter']

                        for edit_ratio in tqdm.tqdm(edit_ratio_space, disable=verbose, desc=f"{setting_name} - n_edit"):

                            X_edited, y_edited, s_edited, n_edit = unloc.fair_aug(
                                aug_ratio=edit_ratio, 
                                X_train=X_train, y_train=y_train, s_train=s_train, 
                                P=P, 
                                s_prv=1, s_prt=0, 
                                filter=f, 
                                weights=weights, 
                                dummy=dummy, 
                                random_seed=rand_seed, 
                                verbose=False
                            )
                            
                            clf = clone(base_model)
                            clf.set_params(random_state=rand_seed)
                            clf.fit(X_edited, y_edited)
                            
                            res = evaluate(clf, X_test, y_test, s_test)
                            
                            all_res.append({
                                **res,
                                'how': how,
                                'setting': setting_name,
                                'n_edit': n_edit,
                                'edit_ratio': edit_ratio,
                                'dataset': data_name,
                                'base_model': base_model_name,
                                'i_run': i_run,
                                'i_split': i_split,
                            })
                            if verbose:
                                res_vis = res.copy()
                                for k, v in res_vis.items():
                                    if isinstance(v, float):
                                        res_vis[k] = np.round(v*100, 2)
                                print (f"split: {i_split} | n_edit {n_edit} {edit_ratio:.2f} | {res_vis}")
        
    df_res = pd.DataFrame(all_res)
    
    return df_res        


base_model_zoo = {
    'LR': LogisticRegression(),
    # 'KN': KNeighborsClassifier(n_neighbors=5),
    # 'DT': DecisionTreeClassifier(max_depth=10),
    # 'MLP': MLPClassifier(hidden_layer_sizes=(8), max_iter=50),
    # 'ADA': AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=None), n_estimators=5),
    # 'BAG': BaggingClassifier(estimator=DecisionTreeClassifier(max_depth=None), n_estimators=5),
}
# n_space = np.linspace(0, 5000, 21).astype(int)
# edit_ratio_space = np.linspace(0, 7, 36) # adult gender - inv
# edit_ratio_space = np.linspace(0, 4, 41) # adult race - inv
edit_ratio_space = np.linspace(0, 2, 41) # meps sex - inv
setting_space = setting_space = {
    # 'rand_prtneg': {'dummy': True, 'filter': 'prt-neg', 'weights': 'unf'},
    # 'disag_prtneg': {'dummy': False, 'filter': 'prt-neg', 'weights': 'unf'},
    # 'disag_contr_prtneg': {'dummy': False, 'filter': 'prt-neg', 'weights': 'unf_contr'},
    # 'rand_prtneg': {'dummy': True, 'filter': 'prt-pos', 'weights': 'unf'},
    'disag_inv_prtpos': {'dummy': False, 'filter': 'prt-pos', 'weights': 'unf_inv'},
    # 'disag_prtpos': {'dummy': False, 'filter': 'prt-pos', 'weights': 'unf'},
}
prox_kwargs = {
    't_con': 0.1, 't_cat': 2, 'restart_prob': 0.1, 'mat_norm': 'sym', 
    'relax': True, 'max_relax': 3, 'relax_factor': 0.1, 
    'include_self': False
}
n_splits = 5
n_runs = 1
random_state = 42
verbose = False
verbose = True
how = 'remove'

df_res = run_unloc_aug_exp(
    unloc, dataset_zoo, base_model_zoo, edit_ratio_space, setting_space, prox_kwargs=prox_kwargs, how=how, 
    n_splits=n_splits, n_runs=n_runs, random_state=random_state, verbose=verbose
)
df_res

////// Baseline Experiment //////
Base Model Zoo:       ['LR']
Dataset Zoo:          ['meps_SEX']
edit_ratio_space:     [0.   0.05 0.1  0.15 0.2  0.25 0.3  0.35 0.4  0.45 0.5  0.55 0.6  0.65
 0.7  0.75 0.8  0.85 0.9  0.95 1.   1.05 1.1  1.15 1.2  1.25 1.3  1.35
 1.4  1.45 1.5  1.55 1.6  1.65 1.7  1.75 1.8  1.85 1.9  1.95 2.  ]
n_splits:             5
n_runs:               1
random_state:         42

Data: meps_SEX | Run: 0 | Base: LR
Data: meps_SEX | Run: 0 | Base: LR | split 0


Loaded score from meps_SEX_split[1of5]_seed[42]_{'t_con': 0.1, 't_cat': 2, 'restart_prob': 0.1, 'mat_norm': 'sym', 'relax': True, 'max_relax': 3, 'relax_factor': 0.1, 'include_self': False}_score.csv
////// Comparable Analyzer //////
Data shape: (9504, 40)
[6] Continuous features: ['AGE', 'PCS42', 'MCS42', 'K6SUM42', 'PHQ242', 'POVCAT']
[34] Categorical features: ['RACE', 'ASTHDX', 'REGION', 'FTSTU', 'CHOLDX', 'ADSMOK42', 'INSCOV', 'MNHLTH', 'ACTLIM', 'SOCLIM', 'ACTDTY', 'EMPHDX', 'ARTHTYPE', 'PREGNT', 'CHDDX', 'ADHDADDX', 'JTPAIN', 'EMPST', 'STRKDX', 'ANGIDX', 'CANCERDX', 'OHRTDX', 'DIABDX', 'DFSEE42', 'DFHEAR42', 'COGLIM', 'WLKLIM', 'HIBPDX', 'HONRDC', 'MARRY', 'CHBRON', 'ARTHDX', 'MIDX', 'RTHLTH']

Matrix loaded from data_cache/proximity/meps_SEX_split[1of5]_seed[42]_TC=0.1_TD=2_RP=0.1_norm=sym_matrix.h5
split: 0 | n_edit 0 0.00 | {'acc': 86.71, 'bacc': 69.08, 'ap': 38.83, 'roc': 69.08, 'f1': 72.25, 'dp': 7.83, 'eo': 11.99, 'ge': 7.37, 'si': 3.0, 'acc_grp': {0: 0.894, 1: 0.843}, 'po

Unnamed: 0,acc,bacc,ap,roc,f1,dp,eo,ge,si,acc_grp,...,g_adv,acc_cls,how,setting,n_edit,edit_ratio,dataset,base_model,i_run,i_split
0,0.867109,0.690803,0.388313,0.690803,0.722479,0.078289,0.119938,0.073669,0.029987,"{0: 0.894, 1: 0.843}",...,1,"{0: 0.96, 1: 0.422}",remove,disag_inv_prtpos,0,0.00,meps_SEX,LR,0,0
1,0.865215,0.688933,0.382753,0.688933,0.719590,0.071858,0.101290,0.074647,0.026515,"{0: 0.892, 1: 0.841}",...,1,"{0: 0.958, 1: 0.42}",remove,disag_inv_prtpos,30,0.05,meps_SEX,LR,0,0
2,0.866162,0.690231,0.385757,0.690231,0.721296,0.067457,0.080608,0.074133,0.019886,"{0: 0.894, 1: 0.841}",...,1,"{0: 0.958, 1: 0.422}",remove,disag_inv_prtpos,60,0.10,meps_SEX,LR,0,0
3,0.866162,0.692412,0.387218,0.692412,0.722870,0.064767,0.057890,0.073971,0.017361,"{0: 0.896, 1: 0.839}",...,1,"{0: 0.957, 1: 0.428}",remove,disag_inv_prtpos,90,0.15,meps_SEX,LR,0,0
4,0.867109,0.694437,0.390716,0.694437,0.725089,0.061578,0.047989,0.073405,0.014520,"{0: 0.897, 1: 0.84}",...,1,"{0: 0.958, 1: 0.431}",remove,disag_inv_prtpos,120,0.20,meps_SEX,LR,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,0.857594,0.704223,0.378457,0.704223,0.723851,0.015905,0.121608,0.076331,0.057468,"{0: 0.889, 1: 0.829}",...,1,"{0: 0.938, 1: 0.471}",remove,disag_inv_prtpos,1087,1.80,meps_SEX,LR,0,4
201,0.857278,0.705489,0.378892,0.705489,0.724412,0.013872,0.123667,0.076317,0.059362,"{0: 0.887, 1: 0.83}",...,1,"{0: 0.937, 1: 0.474}",remove,disag_inv_prtpos,1117,1.85,meps_SEX,LR,0,4
202,0.857278,0.705489,0.378892,0.705489,0.724412,0.012606,0.131558,0.076317,0.061572,"{0: 0.888, 1: 0.829}",...,1,"{0: 0.937, 1: 0.474}",remove,disag_inv_prtpos,1147,1.90,meps_SEX,LR,0,4
203,0.856015,0.703998,0.375630,0.703998,0.722443,0.012553,0.134473,0.076924,0.063151,"{0: 0.887, 1: 0.827}",...,1,"{0: 0.936, 1: 0.472}",remove,disag_inv_prtpos,1177,1.95,meps_SEX,LR,0,4


In [15]:
# file_name = f'./res_cache/unloc_aug_base{list(base_model_zoo.keys())}_seed{random_state}_split{n_splits}_data{all_datasets}.csv'
# df_res.to_csv(file_name, index=False)

# print (f"Saved results to {file_name}")

Saved results to ./res_cache/unloc_aug_base['LR']_seed42_split5_data{'meps': ['SEX']}.csv


In [None]:
def plot_line_xy_tradeoff(
        df, x, y, group_key, style, monotonous='x', 
        ax=None, title=None, errorbar=False, **kwargs
):
    assert x in df.columns and y in df.columns
    assert group_key in df.columns and style in df.columns
    df_plot = df.groupby([group_key, style]).agg({x: ['mean', 'std'], y: ['mean', 'std']}).reset_index()

    if ax is None:
        fig, ax = plt.subplots(1, 1, figsize=(5, 5))
    for style_val in df_plot[style].unique():
        df_subplot = df_plot[df_plot[style] == style_val]
        if monotonous == 'x':
            df_subplot = df_subplot.loc[:df_subplot[x]['mean'].idxmin()]
        else:
            raise NotImplementedError
        x_mean, x_std = df_subplot[x]['mean'], df_subplot[x]['std']
        y_mean, y_std = df_subplot[y]['mean'], df_subplot[y]['std']
        sns.lineplot(x_mean, y_mean, label=style_val, marker="o", markersize=8, ax=ax)
        if errorbar:
            ax.errorbar(
                x=x_mean, y=y_mean, xerr=x_std, yerr=y_std,
                ecolor='k', fmt='none', alpha=0.2,
                # capthick=2, capsize=5, 
            )
        ax.set(
            xlabel=x.upper(),
            ylabel=y.upper(),
        )
        ax.legend()
    return ax

def plot_scatter_xy_tradeoff(
        df, x, y, group_key, style, 
        ax=None, title=None, errorbar=False, **kwargs
):
    assert x in df.columns and y in df.columns
    assert group_key in df.columns and style in df.columns
    df_plot = df.groupby([group_key, style]).agg({x: ['mean', 'std'], y: ['mean', 'std']}).reset_index()

    if ax is None:
        fig, ax = plt.subplots(1, 1, figsize=(5, 5))
    for style_val in df_plot[style].unique():
        df_subplot = df_plot[df_plot[style] == style_val]
        x_mean, x_std = df_subplot[x]['mean'], df_subplot[x]['std']
        y_mean, y_std = df_subplot[y]['mean'], df_subplot[y]['std']
        
        ax.scatter(x_mean, y_mean, label=style_val, marker="o", s=50)
        if errorbar:
            ax.errorbar(
                x=x_mean, y=y_mean, xerr=x_std, yerr=y_std,
                ecolor='k', fmt='none', alpha=0.5,
                # capthick=2, capsize=5, 
            )
        ax.set(
            xlabel=x.upper(),
            ylabel=y.upper(),
        )
        ax.legend()
    return ax


def plot_xy_group_tradeoff_with_baseline(
        df, df_baseline, xs, ys, group_key, style, monotonous='x', 
        subfig_size=(3, 3), **kwargs
):
    n_x, n_y = len(xs), len(ys)
    h_ax, w_ax = subfig_size

    dataset_unique = df['dataset'].unique()

    for data_name in dataset_unique:
        df_data = df[df['dataset'] == data_name]
        df_data_baseline = df_baseline[df_baseline['dataset'] == data_name]
        fig, axs = plt.subplots(n_y, n_x, figsize=(w_ax*n_x, h_ax*n_y))
        for i, x in enumerate(xs):
            for j, y in enumerate(ys):
                ax = plot_line_xy_tradeoff(df_data, x, y, group_key, style, monotonous, ax=axs[j, i], title=None, **kwargs)
                ax = plot_scatter_xy_tradeoff(df_data_baseline, x, y, group_key, 'method', ax=axs[j, i], title=None, **kwargs)
        plt.suptitle(f"Dataset: {data_name}")
        plt.tight_layout()
        plt.show()

df_baseline = pd.read_csv('./res_cache/baseline_seed42_split5.csv')
df_baseline

plot_xy_group_tradeoff_with_baseline(
    df_res, df_baseline, xs=['si', 'ge', 'dp', 'eo'], ys=['acc', 'bacc'], 
    group_key='n_edit', style='setting', monotonous='x',
    subfig_size=(4, 4), errorbar=True,
)