In [1]:
import sklearn
# import shap

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

# Import base classifiers
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier
from baselines import AdaFairClassifier
from imbens.ensemble import SMOTEBoostClassifier, SMOTEBaggingClassifier, RUSBoostClassifier, UnderBaggingClassifier, SelfPacedEnsembleClassifier
from fairlearn.postprocessing import ThresholdOptimizer
from fairens import FairAugEnsemble, FairEnsemble

# Set GPU for matrix computations
import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'
if device == 'cuda':
    torch.cuda.set_device(0)
    device_id = torch.cuda.current_device()
    print (f"Now using GPU #{device_id}:\n{torch.cuda.get_device_name(device_id)}")

# Import utilities
from data import FairDataset    # This is a custom class that we will use to load the datasets
from eval import evaluate_multi_split, verbose_print
from trainer import Trainer
from utils import seed_generator, dict_info, describe_data

pip install 'aif360[AdversarialDebiasing]'
pip install 'aif360[FairAdapt]'


In [2]:
import torch
import torch.nn as nn

class MLP(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out

class TorchMLPClassifier(sklearn.base.BaseEstimator, sklearn.base.ClassifierMixin):
    def __init__(
        self,
        input_size,
        hidden_size,
        output_size,
        learning_rate=0.01,
        num_epochs=50,
        batch_size=32,
    ):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.learning_rate = learning_rate
        self.num_epochs = num_epochs
        self.batch_size = batch_size
        self.model = MLP(input_size, hidden_size, output_size)

    def _validate_input(self, X, y):
        X, y = self._validate_data(
            X,
            y,
            accept_sparse=["csr", "csc"],
            multi_output=True,
            dtype=(np.float64, np.float32),
            reset=True,
        )
        self.classes_ = sklearn.utils.multiclass.unique_labels(y)
        return X, y

    def fit(self, X, y):

        X, y = self._validate_input(X, y)

        # Convert data to PyTorch tensors
        X_tensor = torch.tensor(X, dtype=torch.float32)
        y_tensor = torch.tensor(y, dtype=torch.long)

        # Define loss function and optimizer
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)

        # Train the model
        for epoch in range(self.num_epochs):
            for i in range(0, len(X), self.batch_size):
                # Forward pass
                outputs = self.model(X_tensor[i : i + self.batch_size])

                # Compute loss
                loss = criterion(outputs, y_tensor[i : i + self.batch_size])

                # Backward and optimize
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

    def predict(self, X):
        # Convert data to PyTorch tensor
        X_tensor = torch.tensor(X, dtype=torch.float32)

        # Forward pass and get predictions
        outputs = self.model(X_tensor)
        _, predicted = torch.max(outputs.data, 1)

        # Convert predictions to numpy array and return
        return predicted.numpy()

    def predict_proba(self, X):
        # Convert data to PyTorch tensor
        X_tensor = torch.tensor(X, dtype=torch.float32)

        # Forward pass and get softmax probabilities
        outputs = self.model(X_tensor)
        softmax = nn.Softmax(dim=1)
        probabilities = softmax(outputs).detach().numpy()

        # Return probabilities
        return probabilities

# Load dataset


In [14]:
SEED = 42
n_splits = 5
i_split = 0

dataset_kwargs = {
    'y_col': 'label',
    'train_size': 0.6,
    'val_size': 0.2,
    'test_size': 0.2,
    'concat_train_val': True,
    'normalize': True,
    'random_state': 42,
}

# dataname = 'adult'
# s_attr = 'gender'
# s_attr = 'race'
# dataname = 'compas'
# s_attr = 'sex'
# dataname = 'lsa'
# s_attr = 'race'
dataname = 'meps'
# s_attr = 'SEX'
s_attr = 'RACE'
data = FairDataset(
    dataname=dataname,
    csv_path=f'./data/{dataname}.csv',
    s_col=s_attr,
    x_with_s=True,
    **dataset_kwargs
)

data.describe()

(
    (X_train, y_train, s_train),
    (X_val, y_val, s_val),
    (X_test, y_test, s_test),
    (idx_train, idx_val, idx_test)
) = data.get_subgroup_split(i_split=i_split, random_state=SEED, n_splits=n_splits)

classes = np.unique(y_train)
n_feat = X_train.shape[1]
n_class = len(classes)

dataset_zoo = {
    f'{data.fullname}': data,
}
dataset_zoo

Dataset    : meps (15839, 126) load from ./data/meps.csv
Sens/Res   : RACE/label
Split      : train/test = 0.8/0.2, random_state = 42, x_with_s = True
train data [#samples 12671 #features 125]:
+-----+-------+-------+------------+
|     |   y=0 |   y=1 |   pos_rate |
| s=0 |  7125 |  1019 |     0.1251 |
+-----+-------+-------+------------+
| s=1 |  3369 |  1158 |     0.2558 |
+-----+-------+-------+------------+
test data [#samples 3168 #features 125]:
+-----+-------+-------+------------+
|     |   y=0 |   y=1 |   pos_rate |
| s=0 |  1781 |   255 |     0.1252 |
+-----+-------+-------+------------+
| s=1 |   843 |   289 |     0.2553 |
+-----+-------+-------+------------+



{'meps_RACE': <data.FairDataset at 0x7f10b434eee0>}

In [4]:
# from sklearn.decomposition import TruncatedSVD

# def compl_svd_projector(names, svd=-1):
#     if svd > 0:
#         tSVD = TruncatedSVD(n_components=svd)
#         tSVD.fit(names)
#         basis = tSVD.components_.T
#         print('Singular values:')
#         print(tSVD.singular_values_)
#     else:
#         basis = names.T

#     proj = np.linalg.inv(np.matmul(basis.T, basis))
#     proj = np.matmul(basis, proj)
#     proj = np.matmul(proj, basis.T)
#     proj_compl = np.eye(proj.shape[0]) - proj
#     return proj_compl

# logreg = LogisticRegression(random_state=SEED)
# logreg.fit(X_train[:, 1:], s_train)
# sen_directions = logreg.coef_


# tSVD = TruncatedSVD(n_components= 2 + 1)
# tSVD.fit(sen_directions)
# sen_directions = tSVD.components_


# proj = compl_svd_projector(sen_directions)

# # X_train.shape, X_train[:, 1:].shape

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

from inFairness.fairalgo import SenSeI, SenSR
from inFairness import distances
from inFairness.auditor import SenSRAuditor, SenSeIAuditor

output_size = 2
hidden_size = 100
EPOCHS = 5

class AdultDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __getitem__(self, idx):
        data = self.data[idx]
        label = self.labels[idx]
        return data, label
    
    def __len__(self):
        return len(self.labels)


def to_tensor(X, y, s):
    return (
        torch.tensor(X, dtype=torch.float32, device=device),
        torch.tensor(y, dtype=torch.long, device=device),
        torch.tensor(s, dtype=torch.long, device=device)
    )

def to_dataset(X, y, s):
    X_tensor, y_tensor, s_tensor = to_tensor(X, y, s)
    ds = AdultDataset(X_tensor, y_tensor)
    return ds

def to_dataloader(X, y, s, batch_size, shuffle):
    ds = to_dataset(X, y, s)
    dl = DataLoader(ds, batch_size=batch_size, shuffle=shuffle)
    return dl

class Model(nn.Module):

    def __init__(self, input_size, hidden_size, output_size):

        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        # self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fcout = nn.Linear(hidden_size, output_size)

    def forward(self, x):

        x = F.relu(self.fc1(x))
        # x = F.relu(self.fc2(x))
        x = self.fcout(x)
        return x

    def predict_proba(self, x):
        if isinstance(x, np.ndarray):
            x = torch.tensor(x).float()

        logits = self.forward(x)
        y_pred_proba = F.softmax(logits, dim=1).detach().numpy()
        return y_pred_proba
    
    def predict(self, x):
        y_pred_proba = self.predict_proba(x)
        y_pred = np.argmax(y_pred_proba, axis=1)
        return y_pred

In [6]:
from eval import evaluate

all_res = []

for i_split in range(n_splits):

    (
        (X_train, y_train, s_train),
        (X_val, y_val, s_val),
        (X_test, y_test, s_test),
        (idx_train, idx_val, idx_test)
    ) = data.get_subgroup_split(i_split=i_split, random_state=SEED, n_splits=n_splits)
    

    X_train, y_train, s_train = to_tensor(X_train, y_train, s_train)

    train_dl = to_dataloader(X_train, y_train, s_train, batch_size=128, shuffle=True)
    test_dl = to_dataloader(X_test, y_test, s_test, batch_size=1000, shuffle=False)

    input_size = X_train.shape[1]
    
    clf = Model(input_size, hidden_size, output_size).to(device) # classifier
    optimizer = torch.optim.Adam(clf.parameters(), lr=1e-3)
    loss_fn = F.cross_entropy

    # distance_x_LR = distances.LogisticRegSensitiveSubspace()
    # distance_y = distances.SquaredEuclideanDistance()

    # distance_x_LR.fit(X_train, data_SensitiveAttrs=s_train[:, np.newaxis])
    # distance_y.fit(num_dims=output_size)

    # distance_x_LR.to(device)
    # distance_y.to(device)

    rho = 5.0
    eps = 0.1
    auditor_nsteps = 100
    auditor_lr = 1e-3

    for epoch in tqdm(range(EPOCHS)):

        for x, y in train_dl:

            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            y_pred = clf(x).squeeze()
            loss = loss_fn(y_pred, y)
            loss.backward()
            optimizer.step()
        
        print (f"epoch {epoch}", evaluate(clf, X_test, y_test, s_test))

    res = evaluate(clf, X_test, y_test, s_test)
    all_res.append({
        **res,
        # 'method': model_name,
        # 'n_edit': 0,
        # 'dataset': data_name,
        # 'base_model': base_model_name,
        # 'i_run': i_run,
        'i_split': i_split,
    })

df_res = pd.DataFrame(all_res)
df_res

  0%|          | 0/5 [00:00<?, ?it/s]

epoch 0 {'acc': 0.8338308457711443, 'bacc': 0.7307457236358572, 'ap': 0.5005422577039799, 'roc': 0.7307457236358572, 'f1': 0.7526399772804773, 'dp': 0.1676957063523437, 'eo': 0.11535733564317907, 'ge': 0.0930685678281832, 'si': 0.02155887230514096, 'acc_grp': {0: 0.918, 1: 0.793}, 'pos_rate_grp': {0: 0.066, 1: 0.234}, 'g_adv': 1, 'acc_cls': {0: 0.935, 1: 0.526}}
epoch 1 {'acc': 0.8340519624101713, 'bacc': 0.7767947983802601, 'ap': 0.5251412306925751, 'roc': 0.7767947983802602, 'f1': 0.7771674767191132, 'dp': 0.23465933083963955, 'eo': 0.15664268946383963, 'ge': 0.08313889197525964, 'si': 0.06755113322277502, 'acc_grp': {0: 0.918, 1: 0.793}, 'pos_rate_grp': {0: 0.088, 1: 0.323}, 'g_adv': 1, 'acc_cls': {0: 0.89, 1: 0.663}}
epoch 2 {'acc': 0.840685461580984, 'bacc': 0.7466659183655004, 'ap': 0.5202489032539062, 'roc': 0.7466659183655004, 'f1': 0.7667678156730207, 'dp': 0.1874755579856247, 'eo': 0.1657439837306519, 'ge': 0.08795834841854795, 'si': 0.07330016583747927, 'acc_grp': {0: 0.919,

  0%|          | 0/5 [00:00<?, ?it/s]

epoch 0 {'acc': 0.8357103372028745, 'bacc': 0.7536753082775702, 'ap': 0.5148254838597476, 'roc': 0.7536753082775703, 'f1': 0.7671114526896748, 'dp': 0.1922448225837136, 'eo': 0.13858132791021732, 'ge': 0.08804965516650552, 'si': 0.03173023770038695, 'acc_grp': {0: 0.914, 1: 0.798}, 'pos_rate_grp': {0: 0.08, 1: 0.272}, 'g_adv': 1, 'acc_cls': {0: 0.916, 1: 0.591}}
epoch 1 {'acc': 0.8413488114980652, 'bacc': 0.7586198000226326, 'ap': 0.5268269299495518, 'roc': 0.7586198000226325, 'f1': 0.7737523729978455, 'dp': 0.17441615278765177, 'eo': 0.07944802219460445, 'ge': 0.08551808787933561, 'si': 0.042786069651741296, 'acc_grp': {0: 0.918, 1: 0.804}, 'pos_rate_grp': {0: 0.088, 1: 0.262}, 'g_adv': 1, 'acc_cls': {0: 0.923, 1: 0.595}}
epoch 2 {'acc': 0.844555002763958, 'bacc': 0.7668814579494301, 'ap': 0.5363673176750815, 'roc': 0.7668814579494301, 'f1': 0.7803206362727306, 'dp': 0.19015249142209978, 'eo': 0.11149085476845044, 'ge': 0.08300307203735012, 'si': 0.055942509673852954, 'acc_grp': {0: 0

  0%|          | 0/5 [00:00<?, ?it/s]

epoch 0 {'acc': 0.8291685095090667, 'bacc': 0.7429637868113805, 'ap': 0.49864350383030664, 'roc': 0.7429637868113805, 'f1': 0.7567365894302637, 'dp': 0.2111500991001497, 'eo': 0.19222052240920162, 'ge': 0.09199722459858037, 'si': 0.022335249889429455, 'acc_grp': {0: 0.915, 1: 0.788}, 'pos_rate_grp': {0: 0.064, 1: 0.275}, 'g_adv': 1, 'acc_cls': {0: 0.914, 1: 0.572}}
epoch 1 {'acc': 0.8371295886775763, 'bacc': 0.7558860722587215, 'ap': 0.518153609608023, 'roc': 0.7558860722587214, 'f1': 0.7692589843950743, 'dp': 0.2054763396023552, 'eo': 0.16604340189245848, 'ge': 0.08720523241842643, 'si': 0.042459088898717384, 'acc_grp': {0: 0.918, 1: 0.798}, 'pos_rate_grp': {0: 0.071, 1: 0.277}, 'g_adv': 1, 'acc_cls': {0: 0.917, 1: 0.595}}
epoch 2 {'acc': 0.8375718708536046, 'bacc': 0.7463052923799334, 'ap': 0.5142377564703084, 'roc': 0.7463052923799334, 'f1': 0.7643649407069654, 'dp': 0.20538536680210762, 'eo': 0.21963000736585636, 'ge': 0.0889880924996139, 'si': 0.06225121627598408, 'acc_grp': {0: 0

  0%|          | 0/5 [00:00<?, ?it/s]

epoch 0 {'acc': 0.8383458646616542, 'bacc': 0.7526548613648741, 'ap': 0.518607061428965, 'roc': 0.7526548613648741, 'f1': 0.7684073413086668, 'dp': 0.19547057355052114, 'eo': 0.18875149543597652, 'ge': 0.08752867505735004, 'si': 0.029632905793896505, 'acc_grp': {0: 0.912, 1: 0.803}, 'pos_rate_grp': {0: 0.071, 1: 0.266}, 'g_adv': 1, 'acc_cls': {0: 0.923, 1: 0.583}}
epoch 1 {'acc': 0.842436974789916, 'bacc': 0.7576185152847823, 'ap': 0.528169879624188, 'roc': 0.7576185152847823, 'f1': 0.7739928096544635, 'dp': 0.19079604318791735, 'eo': 0.15791489909535933, 'ge': 0.08538501414477685, 'si': 0.05672268907563025, 'acc_grp': {0: 0.917, 1: 0.807}, 'pos_rate_grp': {0: 0.073, 1: 0.264}, 'g_adv': 1, 'acc_cls': {0: 0.926, 1: 0.589}}
epoch 2 {'acc': 0.8433215391419726, 'bacc': 0.7414493094136554, 'ap': 0.5232963672369333, 'roc': 0.7414493094136554, 'f1': 0.7655454040600433, 'dp': 0.1502683642264245, 'eo': 0.09566551218485941, 'ge': 0.08790815305754986, 'si': 0.05827067669172932, 'acc_grp': {0: 0.9

  0%|          | 0/5 [00:00<?, ?it/s]

epoch 0 {'acc': 0.8279522335249889, 'bacc': 0.7482245999431871, 'ap': 0.4999305654465176, 'roc': 0.7482245999431871, 'f1': 0.7588224176425936, 'dp': 0.21469335957257019, 'eo': 0.17271431978809676, 'ge': 0.0911323309380879, 'si': 0.024325519681556832, 'acc_grp': {0: 0.915, 1: 0.786}, 'pos_rate_grp': {0: 0.072, 1: 0.286}, 'g_adv': 1, 'acc_cls': {0: 0.906, 1: 0.59}}
epoch 1 {'acc': 0.8329279080053074, 'bacc': 0.7533265390538177, 'ap': 0.5102546855781831, 'roc': 0.7533265390538176, 'f1': 0.7649764424960719, 'dp': 0.2167981275841092, 'eo': 0.18955799093636633, 'ge': 0.0888270441071845, 'si': 0.07087571870853604, 'acc_grp': {0: 0.917, 1: 0.792}, 'pos_rate_grp': {0: 0.068, 1: 0.285}, 'g_adv': 1, 'acc_cls': {0: 0.911, 1: 0.595}}
epoch 2 {'acc': 0.8345864661654135, 'bacc': 0.7614560024718553, 'ap': 0.5172333993199133, 'roc': 0.7614560024718553, 'f1': 0.7703200011950986, 'dp': 0.21915324957175925, 'eo': 0.1790099047188642, 'ge': 0.08663859194471425, 'si': 0.0861344537815126, 'acc_grp': {0: 0.917

Unnamed: 0,acc,bacc,ap,roc,f1,dp,eo,ge,si,acc_grp,pos_rate_grp,g_adv,acc_cls,i_split
0,0.842897,0.748584,0.525293,0.748584,0.769416,0.157461,0.065144,0.086892,0.0712,"{0: 0.925, 1: 0.803}","{0: 0.081, 1: 0.239}",1,"{0: 0.936, 1: 0.562}",0
1,0.84743,0.75713,0.537628,0.75713,0.77741,0.161354,0.063569,0.083914,0.054616,"{0: 0.925, 1: 0.81}","{0: 0.082, 1: 0.244}",1,"{0: 0.936, 1: 0.578}",1
2,0.842105,0.777297,0.53732,0.777297,0.783412,0.229673,0.180081,0.081425,0.065126,"{0: 0.922, 1: 0.803}","{0: 0.077, 1: 0.306}",1,"{0: 0.906, 1: 0.649}",2
3,0.851283,0.778161,0.553562,0.778161,0.790866,0.184602,0.114308,0.078946,0.066674,"{0: 0.918, 1: 0.819}","{0: 0.09, 1: 0.275}",1,"{0: 0.923, 1: 0.633}",3
4,0.841221,0.759438,0.527034,0.759438,0.774093,0.208344,0.163509,0.085397,0.088125,"{0: 0.924, 1: 0.801}","{0: 0.066, 1: 0.275}",1,"{0: 0.922, 1: 0.597}",4


In [7]:
file_name = f"./res_cache/Infair_base_{data.fullname}_seed{42}_split{5}.csv"
df_res.to_csv(file_name, index=False)

print (f"Saved results to {file_name}")

# df_res.save(f'./results/{dataname}_{s_attr}_results.csv', index=False)

df_res.mean(axis=0)

Saved results to ./res_cache/Infair_base_adult_gender_seed42_split5.csv


acc        0.844987
bacc       0.764122
ap         0.536167
roc        0.764122
f1         0.779040
dp         0.188287
eo         0.117322
ge         0.083315
si         0.069148
g_adv      1.000000
i_split    2.000000
dtype: float64

In [15]:
from aif360.sklearn.preprocessing import LearnedFairRepresentations
from eval import evaluate

class TransformPredictor():
    def __init__(self, estimator, transformer):
        self.estimator = estimator
        self.transformer = transformer

    def predict(self, X):
        X_trans = self.transformer.transform(pd.DataFrame(X))
        X_trans = torch.tensor(X_trans.values).float()
        return self.estimator.predict(X_trans)

    def predict_proba(self, X):
        X_trans = self.transformer.transform(pd.DataFrame(X))
        X_trans = torch.tensor(X_trans.values).float()
        return self.estimator.predict_proba(X_trans)

all_res = []

for i_split in range(n_splits):

    (
        (X_train, y_train, s_train),
        (X_val, y_val, s_val),
        (X_test, y_test, s_test),
        (idx_train, idx_val, idx_test)
    ) = data.get_subgroup_split(i_split=i_split, random_state=SEED, n_splits=n_splits)

    lfr = LearnedFairRepresentations(
        prot_attr=X_train[:, 0],
        # random_state=SEED,
        n_prototypes=50,
        reconstruct_weight=0.01,
        target_weight=10.0,
        fairness_weight=50,
        tol=0.0000001,
        max_iter=1000,
        verbose=True,
    )
    df_X_train = pd.DataFrame(X_train)
    lfr.fit(df_X_train, y_train, priv_group=1)

    X_train_edit = lfr.transform(pd.DataFrame(X_train)).values
    # y_train_edit = lfr.predict(X_train)
    X_test_edit = lfr.transform(pd.DataFrame(X_test)).values

    X_train_, y_train_, s_train_ = to_tensor(X_train_edit, y_train, s_train)

    train_dl = to_dataloader(X_train_, y_train_, s_train_, batch_size=64, shuffle=True)
    test_dl = to_dataloader(X_test, y_test, s_test, batch_size=1000, shuffle=False)

    input_size = X_train.shape[1]
    
    clf = Model(input_size, hidden_size, output_size).to(device) # classifier
    optimizer = torch.optim.Adam(clf.parameters(), lr=1e-3)
    loss_fn = F.cross_entropy

    # distance_x_LR = distances.LogisticRegSensitiveSubspace()
    # distance_y = distances.SquaredEuclideanDistance()

    # distance_x_LR.fit(X_train, data_SensitiveAttrs=s_train[:, np.newaxis])
    # distance_y.fit(num_dims=output_size)

    # distance_x_LR.to(device)
    # distance_y.to(device)

    rho = 5.0
    eps = 0.1
    auditor_nsteps = 100
    auditor_lr = 1e-3

    # for epoch in tqdm(range(EPOCHS)):
    for epoch in tqdm(range(10)):

        for x, y in train_dl:

            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            y_pred = clf(x).squeeze()
            loss = loss_fn(y_pred, y)
            loss.backward()
            optimizer.step()
        
        print (f"epoch {epoch}", evaluate(clf, X_test, y_test, s_test))

    lfrclf = TransformPredictor(clf, lfr)
    # print (X_test)

    res = evaluate(lfrclf, X_test, y_test, s_test)
    all_res.append({
        **res,
        # 'method': model_name,
        # 'n_edit': 0,
        # 'dataset': data_name,
        # 'base_model': base_model_name,
        # 'i_run': i_run,
        'i_split': i_split,
    })

df_res = pd.DataFrame(all_res)
df_res.mean(axis=0)

Converged! iter: 182, loss: 3.518


  0%|          | 0/10 [00:00<?, ?it/s]

epoch 0 {'acc': 0.8525883838383839, 'bacc': 0.6994781519864851, 'ap': 0.367153692353447, 'roc': 0.6994781519864851, 'f1': 0.7169575136670828, 'dp': 0.06096274132748339, 'eo': 0.01588911426639622, 'ge': 0.0786536340354149, 'si': 0.014204545454545454, 'acc_grp': {0: 0.88, 1: 0.804}, 'pos_rate_grp': {0: 0.114, 1: 0.175}, 'g_adv': 1, 'acc_cls': {0: 0.933, 1: 0.466}}
epoch 1 {'acc': 0.8320707070707071, 'bacc': 0.7263354867142113, 'ap': 0.36347127823490283, 'roc': 0.7263354867142112, 'f1': 0.7170205163886643, 'dp': 0.09145452525911682, 'eo': 0.03376362528190814, 'ge': 0.08081097899160139, 'si': 0.015467171717171718, 'acc_grp': {0: 0.855, 1: 0.792}, 'pos_rate_grp': {0: 0.158, 1: 0.249}, 'g_adv': 1, 'acc_cls': {0: 0.888, 1: 0.565}}
epoch 2 {'acc': 0.836489898989899, 'bacc': 0.7217360890079642, 'ap': 0.3643346923956516, 'roc': 0.7217360890079642, 'f1': 0.7179066331024591, 'dp': 0.08066290863398753, 'eo': 0.025287356321839094, 'ge': 0.08050200689694165, 'si': 0.017361111111111112, 'acc_grp': {0:

  0%|          | 0/10 [00:00<?, ?it/s]

epoch 0 {'acc': 0.8279671717171717, 'bacc': 0.5, 'ap': 0.1720328282828283, 'roc': 0.5, 'f1': 0.45294422379554483, 'dp': 0.0, 'eo': 0.0, 'ge': 0.10388867708730469, 'si': 0.0, 'acc_grp': {0: 0.875, 1: 0.744}, 'pos_rate_grp': {0: 0.0, 1: 0.0}, 'g_adv': None, 'acc_cls': {0: 1.0, 1: 0.0}}
epoch 1 {'acc': 0.8279671717171717, 'bacc': 0.5, 'ap': 0.1720328282828283, 'roc': 0.5, 'f1': 0.45294422379554483, 'dp': 0.0, 'eo': 0.0, 'ge': 0.10388867708730469, 'si': 0.0, 'acc_grp': {0: 0.875, 1: 0.744}, 'pos_rate_grp': {0: 0.0, 1: 0.0}, 'g_adv': None, 'acc_cls': {0: 1.0, 1: 0.0}}
epoch 2 {'acc': 0.8279671717171717, 'bacc': 0.5, 'ap': 0.1720328282828283, 'roc': 0.5, 'f1': 0.45294422379554483, 'dp': 0.0, 'eo': 0.0, 'ge': 0.10388867708730469, 'si': 0.0, 'acc_grp': {0: 0.875, 1: 0.744}, 'pos_rate_grp': {0: 0.0, 1: 0.0}, 'g_adv': None, 'acc_cls': {0: 1.0, 1: 0.0}}
epoch 3 {'acc': 0.8279671717171717, 'bacc': 0.5, 'ap': 0.1720328282828283, 'roc': 0.5, 'f1': 0.45294422379554483, 'dp': 0.0, 'eo': 0.0, 'ge': 0.1

  0%|          | 0/10 [00:00<?, ?it/s]

epoch 0 {'acc': 0.829229797979798, 'bacc': 0.7541136119081779, 'ap': 0.38310627281215515, 'roc': 0.7541136119081778, 'f1': 0.7282761706251785, 'dp': 0.11047956569730713, 'eo': 0.04572650682737184, 'ge': 0.07687746836193157, 'si': 0.019886363636363636, 'acc_grp': {0: 0.85, 1: 0.792}, 'pos_rate_grp': {0: 0.179, 1: 0.29}, 'g_adv': 1, 'acc_cls': {0: 0.869, 1: 0.64}}
epoch 1 {'acc': 0.8099747474747475, 'bacc': 0.7526901004304161, 'ap': 0.3654925971008414, 'roc': 0.7526901004304161, 'f1': 0.7129216679762647, 'dp': 0.11350114893055738, 'eo': 0.05255154747322968, 'ge': 0.0797568382140736, 'si': 0.026199494949494948, 'acc_grp': {0: 0.831, 1: 0.773}, 'pos_rate_grp': {0: 0.206, 1: 0.32}, 'g_adv': 1, 'acc_cls': {0: 0.84, 1: 0.665}}
epoch 2 {'acc': 0.7910353535353535, 'bacc': 0.7543714131994261, 'ap': 0.35409247912597774, 'roc': 0.754371413199426, 'f1': 0.6998706872124594, 'dp': 0.12408102910855484, 'eo': 0.06817847278142886, 'ge': 0.0809548007322893, 'si': 0.03156565656565657, 'acc_grp': {0: 0.814

  0%|          | 0/10 [00:00<?, ?it/s]

epoch 0 {'acc': 0.8563762626262627, 'bacc': 0.685986818507891, 'ap': 0.36232798573975045, 'roc': 0.685986818507891, 'f1': 0.7104508978404543, 'dp': 0.08522739105986242, 'eo': 0.0403008426231015, 'ge': 0.07854563280568899, 'si': 0.01893939393939394, 'acc_grp': {0: 0.89, 1: 0.796}, 'pos_rate_grp': {0: 0.088, 1: 0.173}, 'g_adv': 1, 'acc_cls': {0: 0.946, 1: 0.426}}
epoch 1 {'acc': 0.8551136363636364, 'bacc': 0.6801246413199427, 'ap': 0.35565516160802224, 'roc': 0.6801246413199425, 'f1': 0.7052361952613172, 'dp': 0.08189167424521163, 'eo': 0.040237567629312444, 'ge': 0.07958650884625366, 'si': 0.018308080808080808, 'acc_grp': {0: 0.89, 1: 0.792}, 'pos_rate_grp': {0: 0.086, 1: 0.168}, 'g_adv': 1, 'acc_cls': {0: 0.947, 1: 0.414}}
epoch 2 {'acc': 0.8535353535353535, 'bacc': 0.6915575681492109, 'ap': 0.36179959907755477, 'roc': 0.6915575681492108, 'f1': 0.7122156127540807, 'dp': 0.08472928974570798, 'eo': 0.04011901027252873, 'ge': 0.07909184031383001, 'si': 0.021464646464646464, 'acc_grp': {0:

  0%|          | 0/10 [00:00<?, ?it/s]

epoch 0 {'acc': 0.8285443637511841, 'bacc': 0.5, 'ap': 0.1714556362488159, 'roc': 0.5, 'f1': 0.4531169055430841, 'dp': 0.0, 'eo': 0.0, 'ge': 0.10346798780487791, 'si': 0.0, 'acc_grp': {0: 0.875, 1: 0.744}, 'pos_rate_grp': {0: 0.0, 1: 0.0}, 'g_adv': None, 'acc_cls': {0: 1.0, 1: 0.0}}
epoch 1 {'acc': 0.8285443637511841, 'bacc': 0.5, 'ap': 0.1714556362488159, 'roc': 0.5, 'f1': 0.4531169055430841, 'dp': 0.0, 'eo': 0.0, 'ge': 0.10346798780487791, 'si': 0.0, 'acc_grp': {0: 0.875, 1: 0.744}, 'pos_rate_grp': {0: 0.0, 1: 0.0}, 'g_adv': None, 'acc_cls': {0: 1.0, 1: 0.0}}
epoch 2 {'acc': 0.8285443637511841, 'bacc': 0.5, 'ap': 0.1714556362488159, 'roc': 0.5, 'f1': 0.4531169055430841, 'dp': 0.0, 'eo': 0.0, 'ge': 0.10346798780487791, 'si': 0.0, 'acc_grp': {0: 0.875, 1: 0.744}, 'pos_rate_grp': {0: 0.0, 1: 0.0}, 'g_adv': None, 'acc_cls': {0: 1.0, 1: 0.0}}
epoch 3 {'acc': 0.8285443637511841, 'bacc': 0.5, 'ap': 0.1714556362488159, 'roc': 0.5, 'f1': 0.4531169055430841, 'dp': 0.0, 'eo': 0.0, 'ge': 0.10346

acc        0.845886
bacc       0.615225
ap         0.290595
roc        0.615225
f1         0.610044
dp         0.044792
eo         0.026726
ge         0.087867
si         0.006124
g_adv      1.000000
i_split    2.000000
dtype: float64

In [17]:
file_name = f"./res_cache/Infair_LFR_{data.fullname}_seed{42}_split{5}.csv"
df_res.to_csv(file_name, index=False)

print (f"Saved results to {file_name}")

# df_res.save(f'./results/{dataname}_{s_attr}_results.csv', index=False)

df_res.mean(axis=0)

Saved results to ./res_cache/Infair_LFR_meps_RACE_seed42_split5.csv


acc        0.845886
bacc       0.615225
ap         0.290595
roc        0.615225
f1         0.610044
dp         0.044792
eo         0.026726
ge         0.087867
si         0.006124
g_adv      1.000000
i_split    2.000000
dtype: float64

In [23]:
# UnLoc Remove

from sklearn.base import clone
from eval import evaluate
from unloc import UnLoc

def run_unloc_rem_exp(
        unloc, mlpclass, dataset_zoo, edit_ratio_space, setting_space, prox_kwargs, how,
        n_splits=5, n_runs=1, random_state=42, verbose=False
):
    print(
        f"////// Baseline Experiment //////\n"
        f"Base Model Zoo:       {list(base_model_zoo.keys())}\n"
        f"Dataset Zoo:          {list(dataset_zoo.keys())}\n"
        f"edit_ratio_space:     {edit_ratio_space}\n"
        # f"setting_space:        {setting_space}\n"
        f"n_splits:             {n_splits}\n"
        f"n_runs:               {n_runs}\n"
        f"random_state:         {random_state}\n"
    )
    
    all_res = []

    for data_name, data in dataset_zoo.items():

        for i_run in range(n_runs):

            print (f"Data: {data_name} | Run: {i_run} | Base: {Model}")
            rand_seed = random_state + i_run
            
            for i_split in range(n_splits):
                
                print (f"Data: {data_name} | Run: {i_run} | split {i_split}")
                
                # get the i-th split of a n-fold cross validation
                (
                    (X_train, y_train, s_train),
                    (X_val, y_val, s_val),
                    (X_test, y_test, s_test),
                    (idx_train, idx_val, idx_test)
                ) = data.get_subgroup_split(
                    i_split=i_split, 
                    random_state=rand_seed,
                    n_splits=n_splits
                )
                print (((y_train==0) & (s_train==0)).sum())
                
                data_split_info = f"{data.fullname}_split[{i_split+1}of{n_splits}]_seed[{rand_seed}]"
                unloc = UnLoc()
                df_score = unloc.fit_transform(
                    X_train, y_train, s_train, data_split_info, 
                    features=data.feature_names, 
                    s_col=data.s_col,
                    prox_kwargs=prox_kwargs,
                )
                df_score['unf_inv'] = df_score['unf'].max() - df_score['unf']

                for setting_name, setting_kwargs in setting_space.items():
                        
                    weights = df_score[setting_kwargs['weights']]
                    dummy = setting_kwargs['dummy']
                    f = setting_kwargs['filter']

                    for edit_ratio in tqdm(edit_ratio_space, disable=verbose, desc=f"{setting_name} - n_edit"):

                        X_edited, y_edited, s_edited, n_edit = unloc.fair_removal(
                            X_train, y_train, s_train,
                            how=how,
                            edit_ratio=edit_ratio,
                            weights=weights,
                            dummy=dummy,
                            filter=f, 
                            verbose=False
                        )


                        # clf = clone(base_model)
                        # clf.set_params(random_state=rand_seed)
                        # clf.fit(X_edited, y_edited)

                        X_tr, y_tr, s_tr = to_tensor(X_edited, y_edited, s_edited)

                        train_dl = to_dataloader(X_tr, y_tr, s_tr, batch_size=128, shuffle=True)
                        test_dl = to_dataloader(X_tr, y_tr, s_tr, batch_size=1000, shuffle=False)

                        input_size = X_train.shape[1]
                        
                        clf = mlpclass(input_size, hidden_size, output_size).to(device) # classifier
                        optimizer = torch.optim.Adam(clf.parameters(), lr=1e-3)
                        loss_fn = F.cross_entropy
                        
                        # for epoch in tqdm(range(EPOCHS)):
                        for epoch in range(EPOCHS):

                            for x, y in train_dl:
                                
                                try:
                                    x, y = x.to(device), y.to(device)
                                    optimizer.zero_grad()
                                    y_pred = clf(x).squeeze()
                                    loss = loss_fn(y_pred, y)
                                    loss.backward()
                                    optimizer.step()
                                except Exception as e:
                                    print (e)
                                    print (y.shape, y_pred.shape)
                                    # print unique values
                                    print (np.unique(y), y)
                                    pass

                        # clf = Model(input_size, hidden_size, output_size).to(device) # classifier
                        # optimizer = torch.optim.Adam(clf.parameters(), lr=1e-3)
                        # lossfn = F.cross_entropy

                        # distance_x_LR = distances.LogisticRegSensitiveSubspace()
                        # distance_y = distances.SquaredEuclideanDistance()

                        # distance_x_LR.fit(X_tr, data_SensitiveAttrs=s_tr[:, np.newaxis])
                        # distance_y.fit(num_dims=output_size)

                        # distance_x_LR.to(device)
                        # distance_y.to(device)

                        # rho = 5.0
                        # eps = 0.1
                        # auditor_nsteps = 10
                        # auditor_lr = 1e-3
                        # EPOCHS = 5

                        # fairalgo_LR = SenSeI(clf, distance_x_LR, distance_y, lossfn, rho, eps, auditor_nsteps, auditor_lr)  # infairness algorithm
                        # fairalgo_LR.train()

                        # for epoch in tqdm(range(EPOCHS)):
                        #     for x, y in train_dl:
                        #         x, y = x.to(device), y.to(device)
                        #         optimizer.zero_grad()
                        #         result = fairalgo_LR(x, y)
                        #         result.loss.backward()
                        #         optimizer.step()
                            
                        #     print (f"epoch {epoch}", evaluate(clf, X_test, y_test, s_test))


                        res = evaluate(clf, X_test, y_test, s_test)
                        
                        all_res.append({
                            **res,
                            'how': how,
                            'setting': setting_name,
                            'n_edit': n_edit,
                            'edit_ratio': edit_ratio,
                            'dataset': data_name,
                            'base_model': 'torchmlp',
                            'i_run': i_run,
                            'i_split': i_split,
                        })
                        if verbose:
                            res_vis = res.copy()
                            for k, v in res_vis.items():
                                if isinstance(v, float):
                                    res_vis[k] = np.round(v*100, 2)
                            print (f"split: {i_split} | n_edit {n_edit} {edit_ratio:.2f} | {res_vis}")
        
    df_res = pd.DataFrame(all_res)
    
    return df_res        


base_model_zoo = {
    'MLP': LogisticRegression(),
    # 'KN': KNeighborsClassifier(n_neighbors=5),
    # 'DT': DecisionTreeClassifier(max_depth=10),
    # 'MLP': MLPClassifier(hidden_layer_sizes=(8), max_iter=50),
    # 'ADA': AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=None), n_estimators=5),
    # 'BAG': BaggingClassifier(estimator=DecisionTreeClassifier(max_depth=None), n_estimators=5),
}
assert len(dataset_zoo) == 1

if list(dataset_zoo.values())[0].fullname == 'adult_gender':
    edit_ratio_space = np.linspace(0, 0.4, 41) # adult gender
elif list(dataset_zoo.values())[0].fullname == 'adult_race':
    edit_ratio_space = np.linspace(0, 0.2, 41) # adult race
elif list(dataset_zoo.values())[0].fullname == 'meps_SEX':
    edit_ratio_space = np.linspace(0, 0.5, 51)
elif list(dataset_zoo.values())[0].fullname == 'meps_RACE':
    edit_ratio_space = np.linspace(0, 0.5, 51) # adult race
else:
    raise NotImplementedError

# edit_ratio_space = np.linspace(0, 1, 11)

setting_space = setting_space = {
    # 'rand_prtneg': {'dummy': True, 'filter': 'prt-neg', 'weights': 'unf'},
    'disag_prtneg': {'dummy': False, 'filter': 'prt-neg', 'weights': 'unf'},
    # 'disag_contr_prtneg': {'dummy': False, 'filter': 'prt-neg', 'weights': 'unf_contr'},
}
prox_kwargs = {
    't_con': 0.1, 't_cat': 2, 'restart_prob': 0.1, 'mat_norm': 'sym', 
    'relax': True, 'max_relax': 3, 'relax_factor': 0.1, 
    'include_self': False
}
n_splits = 5
n_runs = 1
random_state = 42
verbose = False
verbose = True
how = 'removal'

unloc = UnLoc()

df_res = run_unloc_rem_exp(
    unloc, Model, dataset_zoo, edit_ratio_space, setting_space, prox_kwargs=prox_kwargs, how=how, 
    n_splits=n_splits, n_runs=n_runs, random_state=random_state, verbose=verbose
)
df_res.mean(axis=0)

////// Baseline Experiment //////
Base Model Zoo:       ['MLP']
Dataset Zoo:          ['meps_SEX']
edit_ratio_space:     [0.   0.01 0.02 0.03 0.04 0.05 0.06 0.07 0.08 0.09 0.1  0.11 0.12 0.13
 0.14 0.15 0.16 0.17 0.18 0.19 0.2  0.21 0.22 0.23 0.24 0.25 0.26 0.27
 0.28 0.29 0.3  0.31 0.32 0.33 0.34 0.35 0.36 0.37 0.38 0.39 0.4  0.41
 0.42 0.43 0.44 0.45 0.46 0.47 0.48 0.49 0.5 ]
n_splits:             5
n_runs:               1
random_state:         42

Data: meps_SEX | Run: 0 | Base: <class '__main__.Model'>
Data: meps_SEX | Run: 0 | split 0
3948


Loaded score from meps_SEX_split[1of5]_seed[42]_{'t_con': 0.1, 't_cat': 2, 'restart_prob': 0.1, 'mat_norm': 'sym', 'relax': True, 'max_relax': 3, 'relax_factor': 0.1, 'include_self': False}_score.csv
split: 0 | n_edit 0 0.00 | {'acc': 85.7, 'bacc': 72.76, 'ap': 39.61, 'roc': 72.76, 'f1': 73.76, 'dp': 10.59, 'eo': 10.32, 'ge': 7.41, 'si': 5.21, 'acc_grp': {0: 0.893, 1: 0.824}, 'pos_rate_grp': {0: 0.098, 1: 0.204}, 'g_adv': 1, 'acc_cls': {0: 0.925, 1: 0.53}}
split: 0 | n_edit 39 0.01 | {'acc': 85.7, 'bacc': 71.3, 'ap': 38.48, 'roc': 71.3, 'f1': 72.91, 'dp': 9.81, 'eo': 11.57, 'ge': 7.57, 'si': 4.55, 'acc_grp': {0: 0.889, 1: 0.827}, 'pos_rate_grp': {0: 0.09, 1: 0.188}, 'g_adv': 1, 'acc_cls': {0: 0.933, 1: 0.494}}
split: 0 | n_edit 78 0.02 | {'acc': 86.02, 'bacc': 69.1, 'ap': 37.37, 'roc': 69.1, 'f1': 71.71, 'dp': 7.36, 'eo': 7.45, 'ge': 7.66, 'si': 3.12, 'acc_grp': {0: 0.891, 1: 0.832}, 'pos_rate_grp': {0: 0.078, 1: 0.152}, 'g_adv': 1, 'acc_cls': {0: 0.949, 1: 0.433}}
split: 0 | n_edit 11

acc             0.860133
bacc            0.704481
ap              0.384003
roc             0.704481
f1              0.725376
dp              0.041976
eo              0.053805
ge              0.075178
si              0.015526
g_adv           0.988235
n_edit        986.215686
edit_ratio      0.250000
i_run           0.000000
i_split         2.000000
dtype: float64

In [24]:
file_name = f"./res_cache/Infair_unloc_rem_{data.fullname}_({min(edit_ratio_space)}-{max(edit_ratio_space)}-{len(edit_ratio_space)})_seed{42}_split{5}.csv"
df_res.to_csv(file_name, index=False)

print (f"Saved results to {file_name}")

df_res.mean(axis=0)

Saved results to ./res_cache/Infair_unloc_rem_meps_SEX_(0.0-0.5-51)_seed42_split5.csv


acc             0.860133
bacc            0.704481
ap              0.384003
roc             0.704481
f1              0.725376
dp              0.041976
eo              0.053805
ge              0.075178
si              0.015526
g_adv           0.988235
n_edit        986.215686
edit_ratio      0.250000
i_run           0.000000
i_split         2.000000
dtype: float64

In [10]:
# UnLoc Remove

from sklearn.base import clone
from eval import evaluate
from unloc import UnLoc

def run_unloc_aug_exp(
        unloc, mlpclass, dataset_zoo, edit_ratio_space, setting_space, prox_kwargs, how,
        n_splits=5, n_runs=1, random_state=42, verbose=False
):
    print(
        f"////// Baseline Experiment //////\n"
        f"Base Model Zoo:       {list(base_model_zoo.keys())}\n"
        f"Dataset Zoo:          {list(dataset_zoo.keys())}\n"
        f"edit_ratio_space:     {edit_ratio_space}\n"
        # f"setting_space:        {setting_space}\n"
        f"n_splits:             {n_splits}\n"
        f"n_runs:               {n_runs}\n"
        f"random_state:         {random_state}\n"
    )
    
    all_res = []

    for data_name, data in dataset_zoo.items():

        for i_run in range(n_runs):

            print (f"Data: {data_name} | Run: {i_run} | Base: {Model}")
            rand_seed = random_state + i_run
            
            for i_split in range(n_splits):
                
                print (f"Data: {data_name} | Run: {i_run} | split {i_split}")
                
                # get the i-th split of a n-fold cross validation
                (
                    (X_train, y_train, s_train),
                    (X_val, y_val, s_val),
                    (X_test, y_test, s_test),
                    (idx_train, idx_val, idx_test)
                ) = data.get_subgroup_split(
                    i_split=i_split, 
                    random_state=rand_seed,
                    n_splits=n_splits
                )
                
                data_split_info = f"{data.fullname}_split[{i_split+1}of{n_splits}]_seed[{rand_seed}]"
                unloc = UnLoc()
                df_score = unloc.fit_transform(
                    X_train, y_train, s_train, data_split_info, 
                    features=data.feature_names, 
                    s_col=data.s_col,
                    prox_kwargs=prox_kwargs,
                )
                df_score['unf_inv'] = df_score['unf'].max() - df_score['unf']
                P = unloc.get_proximity(
                    X_train, y_train, s_train, data_split_info, 
                    features=data.feature_names, 
                    s_col=data.s_col,
                    prox_kwargs=prox_kwargs,
                )

                for setting_name, setting_kwargs in setting_space.items():
                        
                    weights = df_score[setting_kwargs['weights']]
                    dummy = setting_kwargs['dummy']
                    f = setting_kwargs['filter']

                    for edit_ratio in tqdm(edit_ratio_space, disable=verbose, desc=f"{setting_name} - n_edit"):

                        X_edited, y_edited, s_edited, n_edit = unloc.fair_aug(
                            aug_ratio=edit_ratio, 
                            X_train=X_train, y_train=y_train, s_train=s_train, 
                            P=P, 
                            s_prv=1, s_prt=0, 
                            filter=f, 
                            weights=weights, 
                            dummy=dummy, 
                            random_seed=rand_seed, 
                            verbose=False
                        )
                        
                        # clf = clone(base_model)
                        # clf.set_params(random_state=rand_seed)
                        # clf.fit(X_edited, y_edited)

                        X_tr, y_tr, s_tr = to_tensor(X_edited, y_edited, s_edited)

                        train_dl = to_dataloader(X_tr, y_tr, s_tr, batch_size=128, shuffle=True)
                        test_dl = to_dataloader(X_tr, y_tr, s_tr, batch_size=1000, shuffle=False)

                        input_size = X_train.shape[1]
                        
                        clf = mlpclass(input_size, hidden_size, output_size).to(device) # classifier
                        optimizer = torch.optim.Adam(clf.parameters(), lr=1e-3)
                        loss_fn = F.cross_entropy

                        # for epoch in tqdm(range(EPOCHS)):
                        for epoch in range(EPOCHS):

                            for x, y in train_dl:
                                
                                try:
                                    x, y = x.to(device), y.to(device)
                                    optimizer.zero_grad()
                                    y_pred = clf(x).squeeze()
                                    loss = loss_fn(y_pred, y)
                                    loss.backward()
                                    optimizer.step()
                                except Exception as e:
                                    print (e)
                                    print (y.shape, y_pred.shape)
                                    # print unique values
                                    print (np.unique(y), y)
                                    # raise e
                                    pass
                                                
                        res = evaluate(clf, X_test, y_test, s_test)
                        
                        all_res.append({
                            **res,
                            'how': how,
                            'setting': setting_name,
                            'n_edit': n_edit,
                            'edit_ratio': edit_ratio,
                            'dataset': data_name,
                            'base_model': 'torchmlp',
                            'i_run': i_run,
                            'i_split': i_split,
                        })
                        if verbose:
                            res_vis = res.copy()
                            for k, v in res_vis.items():
                                if isinstance(v, float):
                                    res_vis[k] = np.round(v*100, 2)
                            print (f"split: {i_split} | n_edit {n_edit} {edit_ratio:.2f} | {res_vis}")
        
    df_res = pd.DataFrame(all_res)
    
    return df_res        


base_model_zoo = {
    'MLP': LogisticRegression(),
    # 'KN': KNeighborsClassifier(n_neighbors=5),
    # 'DT': DecisionTreeClassifier(max_depth=10),
    # 'MLP': MLPClassifier(hidden_layer_sizes=(8), max_iter=50),
    # 'ADA': AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=None), n_estimators=5),
    # 'BAG': BaggingClassifier(estimator=DecisionTreeClassifier(max_depth=None), n_estimators=5),
}

assert len(dataset_zoo) == 1
if list(dataset_zoo.values())[0].fullname == 'adult_gender':
    edit_ratio_space = np.linspace(0, 2, 41) # adult gender
elif list(dataset_zoo.values())[0].fullname == 'adult_race':
    edit_ratio_space = np.linspace(0, 0.3, 31) # adult race
elif list(dataset_zoo.values())[0].fullname == 'meps_SEX':
    edit_ratio_space = np.linspace(0, 1, 26)
elif list(dataset_zoo.values())[0].fullname == 'meps_RACE':
    edit_ratio_space = np.linspace(0, 1, 26)
else:
    raise NotImplementedError

# n_space = np.linspace(0, 5000, 21).astype(int)
# edit_ratio_space = np.linspace(0, 2, 11)
# edit_ratio_space = [1]
setting_space = seting_space = {
    # 'rand_prtneg': {'dummy': True, 'filter': 'prt-neg', 'weights': 'unf'},
    # 'disag_prtneg': {'dummy': False, 'filter': 'prt-neg', 'weights': 'unf'},
    # 'disag_contr_prtneg': {'dummy': False, 'filter': 'prt-neg', 'weights': 'unf_contr'},
    # 'rand_prtneg': {'dummy': True, 'filter': 'prt-pos', 'weights': 'unf'},
    'disag_inv_prtpos': {'dummy': False, 'filter': 'prt-pos', 'weights': 'unf_inv'},
    # 'disag_prtpos': {'dummy': False, 'filter': 'prt-pos', 'weights': 'unf'},
}
prox_kwargs = {
    't_con': 0.1, 't_cat': 2, 'restart_prob': 0.1, 'mat_norm': 'sym', 
    'relax': True, 'max_relax': 3, 'relax_factor': 0.1, 
    'include_self': False
}
n_splits = 5
n_runs = 1
random_state = 42
verbose = False
verbose = True
how = 'remove'

unloc = UnLoc()

df_res = run_unloc_aug_exp(
    unloc, Model, dataset_zoo, edit_ratio_space, setting_space, prox_kwargs=prox_kwargs, how=how, 
    n_splits=n_splits, n_runs=n_runs, random_state=random_state, verbose=verbose
)
df_res.mean(axis=1)

////// Baseline Experiment //////
Base Model Zoo:       ['MLP']
Dataset Zoo:          ['meps_RACE']
edit_ratio_space:     [0.   0.04 0.08 0.12 0.16 0.2  0.24 0.28 0.32 0.36 0.4  0.44 0.48 0.52
 0.56 0.6  0.64 0.68 0.72 0.76 0.8  0.84 0.88 0.92 0.96 1.  ]
n_splits:             5
n_runs:               1
random_state:         42

Data: meps_RACE | Run: 0 | Base: <class '__main__.Model'>
Data: meps_RACE | Run: 0 | split 0


Loaded score from meps_RACE_split[1of5]_seed[42]_{'t_con': 0.1, 't_cat': 2, 'restart_prob': 0.1, 'mat_norm': 'sym', 'relax': True, 'max_relax': 3, 'relax_factor': 0.1, 'include_self': False}_score.csv
////// Comparable Analyzer //////
Data shape: (9504, 40)
[6] Continuous features: ['AGE', 'PCS42', 'MCS42', 'K6SUM42', 'PHQ242', 'POVCAT']
[34] Categorical features: ['SEX', 'ASTHDX', 'CANCERDX', 'ACTLIM', 'MIDX', 'DIABDX', 'ARTHTYPE', 'MARRY', 'SOCLIM', 'PREGNT', 'DFHEAR42', 'HONRDC', 'STRKDX', 'ANGIDX', 'WLKLIM', 'CHBRON', 'CHOLDX', 'HIBPDX', 'REGION', 'COGLIM', 'ADSMOK42', 'ADHDADDX', 'RTHLTH', 'FTSTU', 'OHRTDX', 'CHDDX', 'EMPHDX', 'JTPAIN', 'DFSEE42', 'ARTHDX', 'ACTDTY', 'MNHLTH', 'EMPST', 'INSCOV']

Matrix loaded from data_cache/proximity/meps_RACE_split[1of5]_seed[42]_TC=0.1_TD=2_RP=0.1_norm=sym_matrix.h5
split: 0 | n_edit 0 0.00 | {'acc': 85.86, 'bacc': 71.11, 'ap': 38.6, 'roc': 71.11, 'f1': 72.92, 'dp': 14.75, 'eo': 15.47, 'ge': 7.53, 'si': 5.11, 'acc_grp': {0: 0.892, 1: 0.799}, '

0       0.344608
1       2.476491
2       4.697563
3       6.841687
4       9.054725
         ...    
125    46.531543
126    48.751979
127    50.894281
128    53.118282
129    55.328100
Length: 130, dtype: float64

In [11]:
file_name = f"./res_cache/Infair_unloc_aug_{data.fullname}_({min(edit_ratio_space)}-{max(edit_ratio_space)}-{len(edit_ratio_space)})_seed{42}_split{5}.csv"
df_res.to_csv(file_name, index=False)

print (f"Saved results to {file_name}")

df_res.mean(axis=0)

Saved results to ./res_cache/Infair_unloc_aug_meps_RACE_(0.0-1.0-26)_seed42_split5.csv


acc             0.860442
bacc            0.704533
ap              0.384460
roc             0.704533
f1              0.725921
dp              0.105666
eo              0.093503
ge              0.075072
si              0.014208
g_adv           1.000000
n_edit        381.769231
edit_ratio      0.500000
i_run           0.000000
i_split         2.000000
dtype: float64

In [12]:
from eval import evaluate

all_res = []

for i_split in range(n_splits):

    (
        (X_train, y_train, s_train),
        (X_val, y_val, s_val),
        (X_test, y_test, s_test),
        (idx_train, idx_val, idx_test)
    ) = data.get_subgroup_split(i_split=i_split, random_state=SEED, n_splits=n_splits)
    

    X_train, y_train, s_train = to_tensor(X_train, y_train, s_train)

    train_dl = to_dataloader(X_train, y_train, s_train, batch_size=128, shuffle=True)
    test_dl = to_dataloader(X_test, y_test, s_test, batch_size=1000, shuffle=False)

    input_size = X_train.shape[1]
    # output_size = 2
    # hidden_size = 100
    
    clf = Model(input_size, hidden_size, output_size).to(device) # classifier
    optimizer = torch.optim.Adam(clf.parameters(), lr=1e-3)
    lossfn = F.cross_entropy

    distance_x_LR = distances.LogisticRegSensitiveSubspace()
    distance_y = distances.SquaredEuclideanDistance()

    distance_x_LR.fit(X_train, data_SensitiveAttrs=s_train[:, np.newaxis])
    distance_y.fit(num_dims=output_size)

    distance_x_LR.to(device)
    distance_y.to(device)

    rho = 5.0
    eps = 0.1
    auditor_nsteps = 10
    auditor_lr = 1e-3

    fairalgo_LR = SenSeI(clf, distance_x_LR, distance_y, lossfn, rho, eps, auditor_nsteps, auditor_lr)  # infairness algorithm
    fairalgo_LR.train()

    for epoch in tqdm(range(EPOCHS)):
        for x, y in train_dl:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            result = fairalgo_LR(x, y)
            result.loss.backward()
            optimizer.step()
        
        print (f"epoch {epoch}", evaluate(clf, X_test, y_test, s_test))

    res = evaluate(clf, X_test, y_test, s_test)
    all_res.append({
        **res,
        # 'method': model_name,
        # 'n_edit': 0,
        # 'dataset': data_name,
        # 'base_model': base_model_name,
        # 'i_run': i_run,
        'i_split': i_split,
    })

df_res = pd.DataFrame(all_res)
df_res

  0%|          | 0/5 [00:00<?, ?it/s]

epoch 0 {'acc': 0.8589015151515151, 'bacc': 0.6662232823960239, 'ap': 0.3534507923269391, 'roc': 0.6662232823960239, 'f1': 0.6972212309645566, 'dp': 0.07002575548258555, 'eo': 0.04408384043272484, 'ge': 0.07915055488099383, 'si': 0.00946969696969697, 'acc_grp': {0: 0.89, 1: 0.803}, 'pos_rate_grp': {0: 0.072, 1: 0.142}, 'g_adv': 1, 'acc_cls': {0: 0.96, 1: 0.372}}
epoch 1 {'acc': 0.8585858585858586, 'bacc': 0.6754811879387354, 'ap': 0.35939530696531247, 'roc': 0.6754811879387354, 'f1': 0.7043988269794721, 'dp': 0.08700458878005096, 'eo': 0.07417173766058144, 'ge': 0.07858291949360006, 'si': 0.014204545454545454, 'acc_grp': {0: 0.889, 1: 0.803}, 'pos_rate_grp': {0: 0.075, 1: 0.162}, 'g_adv': 1, 'acc_cls': {0: 0.955, 1: 0.396}}
epoch 2 {'acc': 0.8607954545454546, 'bacc': 0.6644597718838643, 'ap': 0.3563681421971068, 'roc': 0.6644597718838643, 'f1': 0.6970743387388896, 'dp': 0.07690198338042446, 'eo': 0.08187964841108852, 'ge': 0.0783937244921164, 'si': 0.01672979797979798, 'acc_grp': {0: 0

  0%|          | 0/5 [00:00<?, ?it/s]

epoch 0 {'acc': 0.858270202020202, 'bacc': 0.6454913660735834, 'ap': 0.3380238763419794, 'roc': 0.6454913660735835, 'f1': 0.6784783580161494, 'dp': 0.06768277020694635, 'eo': 0.050709939148073035, 'ge': 0.08081008523607644, 'si': 0.011994949494949494, 'acc_grp': {0: 0.892, 1: 0.797}, 'pos_rate_grp': {0: 0.056, 1: 0.124}, 'g_adv': 1, 'acc_cls': {0: 0.97, 1: 0.321}}
epoch 1 {'acc': 0.8607954545454546, 'bacc': 0.665913391417489, 'ap': 0.35734517143605254, 'roc': 0.665913391417489, 'f1': 0.6982921647933669, 'dp': 0.10105382271064305, 'eo': 0.1035158891142664, 'ge': 0.07829590139331499, 'si': 0.022727272727272728, 'acc_grp': {0: 0.895, 1: 0.799}, 'pos_rate_grp': {0: 0.058, 1: 0.159}, 'g_adv': 1, 'acc_cls': {0: 0.963, 1: 0.369}}
epoch 2 {'acc': 0.8630050505050505, 'bacc': 0.6752426488333619, 'ap': 0.368583460791341, 'roc': 0.675242648833362, 'f1': 0.7074732496419244, 'dp': 0.11018105201774422, 'eo': 0.11196754563894523, 'ge': 0.07662838093682481, 'si': 0.022727272727272728, 'acc_grp': {0: 0.

  0%|          | 0/5 [00:00<?, ?it/s]

epoch 0 {'acc': 0.8589015151515151, 'bacc': 0.6518113342898135, 'ap': 0.3428020639369863, 'roc': 0.6518113342898135, 'f1': 0.6846251134971969, 'dp': 0.07406089679063083, 'eo': 0.07220299884659748, 'ge': 0.08008119769891663, 'si': 0.009785353535353536, 'acc_grp': {0: 0.891, 1: 0.801}, 'pos_rate_grp': {0: 0.058, 1: 0.133}, 'g_adv': 1, 'acc_cls': {0: 0.967, 1: 0.336}}
epoch 1 {'acc': 0.8592171717171717, 'bacc': 0.6702161047345767, 'ap': 0.35615309588672517, 'roc': 0.6702161047345768, 'f1': 0.7005612696001431, 'dp': 0.09947794816969462, 'eo': 0.14394463667820068, 'ge': 0.07867098368133343, 'si': 0.0211489898989899, 'acc_grp': {0: 0.887, 1: 0.81}, 'pos_rate_grp': {0: 0.065, 1: 0.164}, 'g_adv': 1, 'acc_cls': {0: 0.958, 1: 0.382}}
epoch 2 {'acc': 0.8607954545454546, 'bacc': 0.6529546269727403, 'ap': 0.34802297748227073, 'roc': 0.6529546269727403, 'f1': 0.6868851698479326, 'dp': 0.09350246794449033, 'eo': 0.16078431372549018, 'ge': 0.07907399713177408, 'si': 0.021464646464646464, 'acc_grp': {0

  0%|          | 0/5 [00:00<?, ?it/s]

epoch 0 {'acc': 0.8598484848484849, 'bacc': 0.6669543579626973, 'ap': 0.35519391580161475, 'roc': 0.6669543579626972, 'f1': 0.6983568870493877, 'dp': 0.10812096051983032, 'eo': 0.12664359861591695, 'ge': 0.07862339689454728, 'si': 0.014204545454545454, 'acc_grp': {0: 0.893, 1: 0.799}, 'pos_rate_grp': {0: 0.058, 1: 0.166}, 'g_adv': 1, 'acc_cls': {0: 0.961, 1: 0.373}}
epoch 1 {'acc': 0.8592171717171717, 'bacc': 0.6585590028694405, 'ap': 0.34805064216828924, 'roc': 0.6585590028694406, 'f1': 0.6908240298644633, 'dp': 0.11107138642248712, 'eo': 0.1402537485582468, 'ge': 0.07949622701668609, 'si': 0.02241161616161616, 'acc_grp': {0: 0.894, 1: 0.797}, 'pos_rate_grp': {0: 0.051, 1: 0.162}, 'g_adv': 1, 'acc_cls': {0: 0.964, 1: 0.353}}
epoch 2 {'acc': 0.8626893939393939, 'bacc': 0.6715835724533716, 'ap': 0.3646272269464169, 'roc': 0.6715835724533715, 'f1': 0.7041771073420569, 'dp': 0.12235763327247356, 'eo': 0.17001153402537483, 'ge': 0.0769875677246443, 'si': 0.025252525252525252, 'acc_grp': {0

  0%|          | 0/5 [00:00<?, ?it/s]

epoch 0 {'acc': 0.8620145247868646, 'bacc': 0.6574873388581952, 'ap': 0.35316842462621245, 'roc': 0.6574873388581952, 'f1': 0.6916661486846711, 'dp': 0.07643756329482228, 'eo': 0.06613900771054138, 'ge': 0.0781739165320239, 'si': 0.007893905904641616, 'acc_grp': {0: 0.895, 1: 0.803}, 'pos_rate_grp': {0: 0.058, 1: 0.134}, 'g_adv': 1, 'acc_cls': {0: 0.969, 1: 0.346}}
epoch 1 {'acc': 0.8658035996210925, 'bacc': 0.6721883702780398, 'ap': 0.3717739188375913, 'roc': 0.6721883702780398, 'f1': 0.7068655923287086, 'dp': 0.09176337854950414, 'eo': 0.10277089066288864, 'ge': 0.07544193522893422, 'si': 0.01673508051784023, 'acc_grp': {0: 0.896, 1: 0.811}, 'pos_rate_grp': {0: 0.059, 1: 0.151}, 'g_adv': 1, 'acc_cls': {0: 0.967, 1: 0.378}}
epoch 2 {'acc': 0.8616987685506788, 'bacc': 0.6667901900013475, 'ap': 0.358504844617676, 'roc': 0.6667901900013475, 'f1': 0.6994240470123251, 'dp': 0.09893490990638878, 'eo': 0.1185189221589516, 'ge': 0.07774217816866641, 'si': 0.024313230186296178, 'acc_grp': {0: 

Unnamed: 0,acc,bacc,ap,roc,f1,dp,eo,ge,si,acc_grp,pos_rate_grp,g_adv,acc_cls,i_split
0,0.86048,0.672991,0.361511,0.672991,0.703826,0.086616,0.07904,0.077939,0.019255,"{0: 0.891, 1: 0.805}","{0: 0.07, 1: 0.156}",1,"{0: 0.959, 1: 0.387}",0
1,0.86048,0.690434,0.373958,0.690434,0.716931,0.137266,0.161799,0.076541,0.030934,"{0: 0.894, 1: 0.799}","{0: 0.067, 1: 0.204}",1,"{0: 0.95, 1: 0.431}",1
2,0.859848,0.660397,0.350701,0.660397,0.692839,0.096344,0.154556,0.079074,0.020202,"{0: 0.887, 1: 0.811}","{0: 0.056, 1: 0.153}",1,"{0: 0.964, 1: 0.357}",2
3,0.864583,0.680741,0.375054,0.680741,0.712816,0.133743,0.200692,0.075484,0.030619,"{0: 0.895, 1: 0.81}","{0: 0.054, 1: 0.187}",1,"{0: 0.961, 1: 0.401}",3
4,0.862646,0.665901,0.360103,0.665901,0.699357,0.100998,0.141187,0.077351,0.024629,"{0: 0.892, 1: 0.809}","{0: 0.056, 1: 0.156}",1,"{0: 0.965, 1: 0.366}",4


In [13]:
file_name = f"./res_cache/Infair_SenSeI_{data.fullname}_seed{42}_split{5}.csv"
df_res.to_csv(file_name, index=False)

print (f"Saved results to {file_name}")

df_res.mean(axis=0)

Saved results to ./res_cache/Infair_SenSeI_meps_RACE_seed42_split5.csv


acc        0.861607
bacc       0.674093
ap         0.364265
roc        0.674093
f1         0.705154
dp         0.110993
eo         0.147455
ge         0.077278
si         0.025128
g_adv      1.000000
i_split    2.000000
dtype: float64

In [14]:
all_res = []

for i_split in range(n_splits):

    (
        (X_train, y_train, s_train),
        (X_val, y_val, s_val),
        (X_test, y_test, s_test),
        (idx_train, idx_val, idx_test)
    ) = data.get_subgroup_split(i_split=i_split, random_state=SEED, n_splits=n_splits)
    

    X_train, y_train, s_train = to_tensor(X_train, y_train, s_train)

    train_dl = to_dataloader(X_train, y_train, s_train, batch_size=64, shuffle=True)
    test_dl = to_dataloader(X_test, y_test, s_test, batch_size=1000, shuffle=False)

    input_size = X_train.shape[1]
    # output_size = 2
    # hidden_size = 64
    
    clf = Model(input_size, hidden_size, output_size).to(device) # classifier
    optimizer = torch.optim.Adam(clf.parameters(), lr=1e-3)
    lossfn = F.cross_entropy

    distance_x_LR = distances.LogisticRegSensitiveSubspace()
    distance_y = distances.SquaredEuclideanDistance()

    distance_x_LR.fit(X_train, data_SensitiveAttrs=s_train[:, np.newaxis])
    distance_y.fit(num_dims=output_size)

    distance_x_LR.to(device)
    distance_y.to(device)

    rho = 5.0
    eps = 0.1
    auditor_nsteps = 10
    auditor_lr = 1e-3

    fairalgo_LR = SenSR(clf, distance_x_LR, lossfn, eps, lr_lamb=10, lr_param=1e-3, auditor_nsteps=auditor_nsteps, auditor_lr=0.1)  # infairness algorithm
    fairalgo_LR.train()

    for epoch in tqdm(range(EPOCHS)):
        for x, y in train_dl:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            result = fairalgo_LR(x, y)
            result.loss.backward()
            optimizer.step()
        
        print (f"epoch {epoch}", evaluate(clf, X_test, y_test, s_test))

    res = evaluate(clf, X_test, y_test, s_test)
    all_res.append({
        **res,
        # 'method': model_name,
        # 'n_edit': 0,
        # 'dataset': data_name,
        # 'base_model': base_model_name,
        # 'i_run': i_run,
        'i_split': i_split,
    })

df_res = pd.DataFrame(all_res)
df_res.mean(axis=0)

  0%|          | 0/5 [00:00<?, ?it/s]

epoch 0 {'acc': 0.8576388888888888, 'bacc': 0.6458369329886992, 'ap': 0.3367745488811736, 'roc': 0.6458369329886992, 'f1': 0.6784226149949618, 'dp': 0.04784896596249835, 'eo': 0.017308992562542258, 'ge': 0.08110680021748981, 'si': 0.0003156565656565657, 'acc_grp': {0: 0.889, 1: 0.801}, 'pos_rate_grp': {0: 0.064, 1: 0.112}, 'g_adv': 1, 'acc_cls': {0: 0.969, 1: 0.323}}
epoch 1 {'acc': 0.8576388888888888, 'bacc': 0.6400224548542008, 'ap': 0.332999037497578, 'roc': 0.6400224548542008, 'f1': 0.6728309163389674, 'dp': 0.04333654987608212, 'eo': 0.0074259703574681805, 'ge': 0.08144020174885551, 'si': 0.0015782828282828283, 'acc_grp': {0: 0.89, 1: 0.799}, 'pos_rate_grp': {0: 0.061, 1: 0.104}, 'g_adv': 1, 'acc_cls': {0: 0.972, 1: 0.308}}
epoch 2 {'acc': 0.8614267676767676, 'bacc': 0.6662946342691854, 'ap': 0.35902597760395927, 'roc': 0.6662946342691853, 'f1': 0.6990557841470844, 'dp': 0.06354870285392963, 'eo': 0.044557133198106835, 'ge': 0.07797321091885855, 'si': 0.0006313131313131314, 'acc_g

  0%|          | 0/5 [00:00<?, ?it/s]

epoch 0 {'acc': 0.8570075757575758, 'bacc': 0.6505433585046886, 'ap': 0.3385293790378167, 'roc': 0.6505433585046887, 'f1': 0.682388302290423, 'dp': 0.06060869021916458, 'eo': 0.022368601802344887, 'ge': 0.08112274740786812, 'si': 0.001893939393939394, 'acc_grp': {0: 0.891, 1: 0.795}, 'pos_rate_grp': {0: 0.065, 1: 0.125}, 'g_adv': 1, 'acc_cls': {0: 0.965, 1: 0.336}}
epoch 1 {'acc': 0.8576388888888888, 'bacc': 0.6501977915895729, 'ap': 0.3396869690791709, 'roc': 0.6501977915895729, 'f1': 0.6824688334446392, 'dp': 0.0639478781231126, 'eo': 0.027551310280994556, 'ge': 0.0808382367823736, 'si': 0.0022095959595959595, 'acc_grp': {0: 0.894, 1: 0.792}, 'pos_rate_grp': {0: 0.062, 1: 0.126}, 'g_adv': 1, 'acc_cls': {0: 0.966, 1: 0.334}}
epoch 2 {'acc': 0.8547979797979798, 'bacc': 0.6186829983176347, 'ap': 0.31269732908651554, 'roc': 0.6186829983176347, 'f1': 0.6489114733055343, 'dp': 0.0530677487209036, 'eo': 0.029276538201487484, 'ge': 0.08401086464463564, 'si': 0.0012626262626262627, 'acc_grp':

  0%|          | 0/5 [00:00<?, ?it/s]

epoch 0 {'acc': 0.8573232323232324, 'bacc': 0.6428443328550932, 'ap': 0.33322723028605383, 'roc': 0.6428443328550931, 'f1': 0.6752861781833355, 'dp': 0.05904322894610787, 'eo': 0.04890426758938865, 'ge': 0.0814055605742019, 'si': 0.0031565656565656565, 'acc_grp': {0: 0.889, 1: 0.8}, 'pos_rate_grp': {0: 0.058, 1: 0.117}, 'g_adv': 1, 'acc_cls': {0: 0.97, 1: 0.316}}
epoch 1 {'acc': 0.8601641414141414, 'bacc': 0.6700591822094691, 'ap': 0.35802656332162064, 'roc': 0.6700591822094691, 'f1': 0.7011131714749197, 'dp': 0.0785663706984526, 'eo': 0.08143021914648207, 'ge': 0.07825572987799972, 'si': 0.0006313131313131314, 'acc_grp': {0: 0.889, 1: 0.809}, 'pos_rate_grp': {0: 0.071, 1: 0.149}, 'g_adv': 1, 'acc_cls': {0: 0.96, 1: 0.381}}
epoch 2 {'acc': 0.8585858585858586, 'bacc': 0.6603636119081779, 'ap': 0.3479154919543575, 'roc': 0.6603636119081778, 'f1': 0.6919548093426899, 'dp': 0.07552743201871612, 'eo': 0.09158016147635523, 'ge': 0.07967163703720764, 'si': 0.000946969696969697, 'acc_grp': {0:

  0%|          | 0/5 [00:00<?, ?it/s]

epoch 0 {'acc': 0.8607954545454546, 'bacc': 0.6828259505021521, 'ap': 0.3683587796119765, 'roc': 0.682825950502152, 'f1': 0.711487505222119, 'dp': 0.10408581921178504, 'eo': 0.08119953863898499, 'ge': 0.07699903607060772, 'si': 0.0, 'acc_grp': {0: 0.895, 1: 0.799}, 'pos_rate_grp': {0: 0.072, 1: 0.176}, 'g_adv': 1, 'acc_cls': {0: 0.954, 1: 0.412}}
epoch 1 {'acc': 0.8614267676767676, 'bacc': 0.6671785329985653, 'ap': 0.3588152576323932, 'roc': 0.6671785329985652, 'f1': 0.6996604542954378, 'dp': 0.08956625268141648, 'eo': 0.08627450980392154, 'ge': 0.07787558861970892, 'si': 0.0015782828282828283, 'acc_grp': {0: 0.894, 1: 0.803}, 'pos_rate_grp': {0: 0.062, 1: 0.152}, 'g_adv': 1, 'acc_cls': {0: 0.963, 1: 0.371}}
epoch 2 {'acc': 0.8598484848484849, 'bacc': 0.6800685975609756, 'ap': 0.3644753455608719, 'roc': 0.6800685975609756, 'f1': 0.7087017209600928, 'dp': 0.10693384798017315, 'eo': 0.10772779700115342, 'ge': 0.07762667844016864, 'si': 0.0006313131313131314, 'acc_grp': {0: 0.892, 1: 0.80

  0%|          | 0/5 [00:00<?, ?it/s]

epoch 0 {'acc': 0.8604357436059362, 'bacc': 0.6455806719669407, 'ap': 0.34180067971552724, 'roc': 0.6455806719669406, 'f1': 0.6796946077202026, 'dp': 0.06533589031387282, 'eo': 0.05122197095605263, 'ge': 0.07963700381999562, 'si': 0.0006315124723713293, 'acc_grp': {0: 0.894, 1: 0.8}, 'pos_rate_grp': {0: 0.054, 1: 0.119}, 'g_adv': 1, 'acc_cls': {0: 0.973, 1: 0.319}}
epoch 1 {'acc': 0.8620145247868646, 'bacc': 0.6567570773256075, 'ap': 0.3527063532010323, 'roc': 0.6567570773256074, 'f1': 0.6910176991051656, 'dp': 0.06779255453125786, 'eo': 0.04048715363866717, 'ge': 0.07821655676268192, 'si': 0.0012630249447426586, 'acc_grp': {0: 0.895, 1: 0.802}, 'pos_rate_grp': {0: 0.06, 1: 0.128}, 'g_adv': 1, 'acc_cls': {0: 0.969, 1: 0.344}}
epoch 2 {'acc': 0.8585412061888222, 'bacc': 0.6320229332300229, 'ap': 0.3287307489246405, 'roc': 0.6320229332300229, 'f1': 0.6650765773421714, 'dp': 0.05708302717312947, 'eo': 0.044178949949595425, 'ge': 0.0813092742031905, 'si': 0.001894537417113988, 'acc_grp': {

acc        0.859272
bacc       0.651439
ap         0.343714
roc        0.651439
f1         0.684230
dp         0.070336
eo         0.061306
ge         0.079909
si         0.001136
g_adv      1.000000
i_split    2.000000
dtype: float64

In [15]:
file_name = f"./res_cache/Infair_SenSR_{data.fullname}_seed{42}_split{5}.csv"
df_res.to_csv(file_name, index=False)

print (f"Saved results to {file_name}")

df_res.mean(axis=0)

Saved results to ./res_cache/Infair_SenSR_meps_RACE_seed42_split5.csv


acc        0.859272
bacc       0.651439
ap         0.343714
roc        0.651439
f1         0.684230
dp         0.070336
eo         0.061306
ge         0.079909
si         0.001136
g_adv      1.000000
i_split    2.000000
dtype: float64

In [16]:
# from inFairness.fairalgo import SenSTIR
# from eval import evaluate

# all_res = []

# for i_split in range(n_splits):

#     (
#         (X_train, y_train, s_train),
#         (X_val, y_val, s_val),
#         (X_test, y_test, s_test),
#         (idx_train, idx_val, idx_test)
#     ) = data.get_subgroup_split(i_split=i_split, random_state=SEED, n_splits=n_splits)
    

#     X_train, y_train, s_train = to_tensor(X_train, y_train, s_train)

#     train_dl = to_dataloader(X_train, y_train, s_train, batch_size=128, shuffle=True)
#     test_dl = to_dataloader(X_test, y_test, s_test, batch_size=1000, shuffle=False)

#     input_size = X_train.shape[1]
#     # output_size = 2
#     # hidden_size = 100
    
#     clf = Model(input_size, hidden_size, output_size).to(device) # classifier
#     optimizer = torch.optim.Adam(clf.parameters(), lr=1e-3)
#     lossfn = F.cross_entropy

#     distance_x_LR = distances.LogisticRegSensitiveSubspace()
#     distance_y = distances.SquaredEuclideanDistance()

#     distance_x_LR.fit(X_train, data_SensitiveAttrs=s_train[:, np.newaxis])
#     distance_y.fit(num_dims=output_size)

#     distance_x_LR.to(device)
#     distance_y.to(device)

#     rho = 5.0
#     eps = 0.1
#     auditor_nsteps = 10
#     auditor_lr = 1e-3
#     monte_carlo_samples_ndcg = 10

#     EPOCHS = 10

#     fairalgo_LR = SenSTIR(clf, distance_x_LR, distance_y, rho, eps, auditor_nsteps, auditor_lr, monte_carlo_samples_ndcg)  # infairness algorithm
#     fairalgo_LR.train()

#     for epoch in tqdm(range(EPOCHS)):
#         fairalgo_LR.train()
#         for x, y in train_dl:
#             x, y = x.to(device), y.to(device)
#             optimizer.zero_grad()
#             result = fairalgo_LR(x, y)
#             result.loss.backward()
#             optimizer.step()
        
#         fairalgo_LR.eval()
#         print (f"epoch {epoch}", evaluate(clf, X_test, y_test, s_test))

#     res = evaluate(clf, X_test, y_test, s_test)
#     all_res.append({
#         **res,
#         # 'method': model_name,
#         # 'n_edit': 0,
#         # 'dataset': data_name,
#         # 'base_model': base_model_name,
#         # 'i_run': i_run,
#         'i_split': i_split,
#     })

# df_res = pd.DataFrame(all_res)
# df_res

In [17]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

from inFairness.fairalgo import SenSeI
from inFairness import distances
from inFairness.auditor import SenSRAuditor, SenSeIAuditor

class AdultDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __getitem__(self, idx):
        data = self.data[idx]
        label = self.labels[idx]
        return data, label
    
    def __len__(self):
        return len(self.labels)


def to_tensor(X, y, s):
    return (
        torch.tensor(X, dtype=torch.float32, device=device),
        torch.tensor(y, dtype=torch.long, device=device),
        torch.tensor(s, dtype=torch.long, device=device)
    )

def to_dataset(X, y, s):
    X_tensor, y_tensor, s_tensor = to_tensor(X, y, s)
    ds = AdultDataset(X_tensor, y_tensor)
    return ds

def to_dataloader(X, y, s, batch_size, shuffle):
    ds = to_dataset(X, y, s)
    dl = DataLoader(ds, batch_size=batch_size, shuffle=shuffle)
    return dl

data.s_col
protected_vars = [data.s_col]

(
    (X_train, y_train, s_train),
    (X_val, y_val, s_val),
    (X_test, y_test, s_test),
    (idx_train, idx_val, idx_test)
) = data.get_subgroup_split(i_split=i_split, random_state=SEED, n_splits=n_splits)

X_train, y_train, s_train = to_tensor(X_train, y_train, s_train)

train_dl = to_dataloader(X_train, y_train, s_train, batch_size=64, shuffle=True)
test_dl = to_dataloader(X_test, y_test, s_test, batch_size=1000, shuffle=False)

In [18]:
class Model(nn.Module):

    def __init__(self, input_size, hidden_size, output_size):

        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fcout = nn.Linear(hidden_size, output_size)

    def forward(self, x):

        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fcout(x)
        return x

    def predict_proba(self, x):
        if isinstance(x, np.ndarray):
            x = torch.tensor(x).float()

        logits = self.forward(x)
        y_pred_proba = F.softmax(logits, dim=1).detach().numpy()
        return y_pred_proba
    
    def predict(self, x):
        y_pred_proba = self.predict_proba(x)
        y_pred = np.argmax(y_pred_proba, axis=1)
        return y_pred

input_size = X_train.shape[1]
output_size = 2
hidden_size = 64

clf = Model(input_size, hidden_size, output_size).to(device)
optimizer = torch.optim.Adam(clf.parameters(), lr=1e-3)
loss_fn = F.cross_entropy

EPOCHS = 1

clf.train()

Model(
  (fc1): Linear(in_features=125, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=64, bias=True)
  (fcout): Linear(in_features=64, out_features=2, bias=True)
)

In [19]:
optimizer = torch.optim.Adam(clf.parameters(), lr=1e-3)
lossfn = F.cross_entropy

distance_x_LR = distances.LogisticRegSensitiveSubspace()
distance_y = distances.SquaredEuclideanDistance()

distance_x_LR.fit(X_train, data_SensitiveAttrs=s_train[:, np.newaxis])
distance_y.fit(num_dims=output_size)

distance_x_LR.to(device)
distance_y.to(device)

rho = 5.0
eps = 0.1
auditor_nsteps = 100
auditor_lr = 1e-3
EPOCHS = 10

fairalgo_LR = SenSeI(clf, distance_x_LR, distance_y, lossfn, rho, eps, auditor_nsteps, auditor_lr)
fairalgo_LR.train()

for epoch in tqdm(range(EPOCHS)):
    for x, y in train_dl:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        result = fairalgo_LR(x, y)
        result.loss.backward()
        optimizer.step()

  0%|          | 0/10 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
from eval import evaluate

evaluate(clf, X_test, y_test, s_test)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

from inFairness.fairalgo import SenSeI
from inFairness import distances
from inFairness.auditor import SenSRAuditor, SenSeIAuditor

%load_ext autoreload
%autoreload 2

class AdultDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __getitem__(self, idx):
        data = self.data[idx]
        label = self.labels[idx]
        return data, label
    
    def __len__(self):
        return len(self.labels)

import data_infair as data


train_df, test_df = data.load_data()

X_train_df, Y_train_df = train_df
X_test_df, Y_test_df = test_df

# Let's drop the protected attributes from the training and test data and store them in a
# separate dataframe that we'll use later to train the individually fair metric.
protected_vars = ['sex_Male']

X_protected_df = X_train_df[protected_vars]
X_train_df = X_train_df.drop(columns=protected_vars)
X_test_df = X_test_df.drop(columns=protected_vars)

# Create test data with spouse variable flipped
X_test_df_spouse_flipped = X_test_df.copy()
X_test_df_spouse_flipped.relationship_Wife = 1 - X_test_df_spouse_flipped.relationship_Wife

device = torch.device('cpu')

# Convert all pandas dataframes to PyTorch tensors
X_train, y_train = data.convert_df_to_tensor(X_train_df, Y_train_df)
X_test, y_test = data.convert_df_to_tensor(X_test_df, Y_test_df)
X_test_flip, y_test_flip = data.convert_df_to_tensor(X_test_df_spouse_flipped, Y_test_df)
X_protected = torch.tensor(X_protected_df.values).float()

# Create the training and testing dataset
train_ds = AdultDataset(X_train, y_train)
test_ds = AdultDataset(X_test, y_test)
test_ds_flip = AdultDataset(X_test_flip, y_test_flip)

# Create train and test dataloaders
train_dl = DataLoader(train_ds, batch_size=64, shuffle=True)
test_dl = DataLoader(test_ds, batch_size=1000, shuffle=False)
test_dl_flip = DataLoader(test_ds_flip, batch_size=1000, shuffle=False)


X_train_df.head()

In [None]:
# Create a fully connected neural network

class Model(nn.Module):

    def __init__(self, input_size, output_size):

        super().__init__()
        self.fc1 = nn.Linear(input_size, 100)
        self.fc2 = nn.Linear(100, 100)
        self.fcout = nn.Linear(100, output_size)

    def forward(self, x):

        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fcout(x)
        return x

    def predict_proba(self, x):
        if isinstance(x, np.ndarray):
            x = torch.tensor(x).float()

        logits = self.forward(x)
        y_pred_proba = F.softmax(logits, dim=1).detach().numpy()
        return y_pred_proba
    
    def predict(self, x):
        y_pred_proba = self.predict_proba(x)
        y_pred = np.argmax(y_pred_proba, axis=1)
        return y_pred


In [None]:
input_size = X_train.shape[1]
output_size = 2

network_standard = Model(input_size, output_size).to(device)
optimizer = torch.optim.Adam(network_standard.parameters(), lr=1e-3)
loss_fn = F.cross_entropy

EPOCHS = 1

network_standard.train()

for epoch in tqdm(range(EPOCHS)):

    for x, y in train_dl:

        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        y_pred = network_standard(x).squeeze()
        loss = loss_fn(y_pred, y)
        loss.backward()
        optimizer.step()

# from eval import evaluate

# evaluate(network_standard, X_test, y_test)

In [None]:
input_size = X_train.shape[1]
output_size = 2

network_fair_LR = Model(input_size, output_size).to(device)
optimizer = torch.optim.Adam(network_fair_LR.parameters(), lr=1e-3)
lossfn = F.cross_entropy

distance_x_LR = distances.LogisticRegSensitiveSubspace()
distance_y = distances.SquaredEuclideanDistance()

distance_x_LR.fit(X_train, data_SensitiveAttrs=X_protected)
distance_y.fit(num_dims=output_size)

distance_x_LR.to(device)
distance_y.to(device)

rho = 5.0
eps = 0.1
auditor_nsteps = 100
auditor_lr = 1e-3

fairalgo_LR = SenSeI(network_fair_LR, distance_x_LR, distance_y, lossfn, rho, eps, auditor_nsteps, auditor_lr)
fairalgo_LR.train()

for epoch in tqdm(range(EPOCHS)):
    for x, y in train_dl:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        result = fairalgo_LR(x, y)
        result.loss.backward()
        optimizer.step()

In [None]:
network_fair_LR.predict(X_test)

In [None]:
"""Load Datasets"""

dataset_kwargs = {
    'y_col': 'label',
    'train_size': 0.6,
    'val_size': 0.2,
    'test_size': 0.2,
    'concat_train_val': True,
    'normalize': True,
    'random_state': 42,
}

all_datasets = {
    'compas': ['sex', 'race'],
    'adult': ['gender', 'race'],
    'bank': ['age', 'marital=married'],
    'lsa': ['gender', 'race'],
    'lsa_unfair_gender_race': ['gender', 'race'],
    'meps': ['SEX', 'RACE'],
    'german': ['sex', 'foreign_worker', 'marital_status=single'],
}

"""
Create a dictionary of datasets: dataset_zoo
key: dataset name
value: FairDataset object
"""
dataset_zoo = {}
for dataname, s_attrs in all_datasets.items():
    for s_attr in s_attrs:
        dataset = FairDataset(
            dataname=dataname,
            csv_path=f'./data/{dataname}.csv',
            s_col=s_attr,
            **dataset_kwargs
        )
        dataset_zoo[dataset.fullname] = dataset

        # dataset.describe()
        dataset.brief()

# Print the information of the datasets and models
print(
    f"////// Dataset ZOO //////\n"
    f"{dict_info(dataset_zoo)}\n"
)

dataset_zoo_subset = {
    'compas_sex': dataset_zoo['compas_sex'],
    'compas_race': dataset_zoo['compas_race'],
}

In [None]:
from baselines import ReweightClassifier, AdaFairClassifier, ReductionClassifier, ThresholdClassifier, MimicClassifier
import tqdm
from eval import evaluate


def run_baseline_exp(dataset_zoo, base_model_zoo, model_zoo, n_splits=5, n_runs=1, random_state=42, verbose=False):
    """
    Run baseline experiment with different base models and different datasets.
    """
    print(
        f"////// Baseline Experiment //////\n"
        f"Base Model Zoo: {list(base_model_zoo.keys())}\n"
        f"Model Zoo: {list(model_zoo.keys())}\n"
        f"Dataset Zoo: {list(dataset_zoo.keys())}\n"
        f"n_splits: {n_splits}\n"
        f"n_runs: {n_runs}\n"
        f"random_state: {random_state}\n"
    )

    all_res = []

    for data_name, data in dataset_zoo.items():

        for base_model_name, base_model in base_model_zoo.items():
            
            for model_name, (model, model_kwargs) in model_zoo.items():

                for i_run in range(n_runs):

                    print (f"Data: {data_name} | Run: {i_run} | Base: {base_model_name} | Model: {model_name}")
                    rand_seed = random_state + i_run

                    base_model = base_model.set_params(random_state=rand_seed)
                    
                    for i_split in tqdm.tqdm(range(n_splits), disable=verbose):
                        
                        # get the i-th split of a n-fold cross validation
                        (
                            (X_train, y_train, s_train),
                            (X_val, y_val, s_val),
                            (X_test, y_test, s_test),
                            (idx_train, idx_val, idx_test)
                        ) = data.get_subgroup_split(
                            i_split=i_split, 
                            random_state=rand_seed,
                            n_splits=n_splits
                        )

                        clf = model(
                            estimator=base_model,
                            random_state=rand_seed,
                            **model_kwargs,
                        )
                        try:
                            clf.fit(X_train, y_train)
                        except Exception as e:
                            # print (f"Error: {e}")
                            clf.fit(X_train, y_train, sensitive_features=s_train)
                        
                        res = evaluate(clf, X_test, y_test, s_test)

                        all_res.append({
                            **res,
                            'method': model_name,
                            'n_edit': 0,
                            'dataset': data_name,
                            'base_model': base_model_name,
                            'i_run': i_run,
                        })
                        if verbose:
                            res_vis = res.copy()
                            for k, v in res_vis.items():
                                if isinstance(v, float):
                                    res_vis[k] = np.round(v*100, 2)
                            print (f"Split: {i_split} | {res_vis}")

    df_res = pd.DataFrame(all_res)
    df_res['n_edit'] = 0
    
    return df_res

ensemble_kwargs = {
    'n_estimators': 10,
    # 'random_state': 42,
}

base_model_zoo = {
    'LR': LogisticRegression(),
    # 'KN': KNeighborsClassifier(n_neighbors=5),
    # 'DT': DecisionTreeClassifier(max_depth=10),
    # 'MLP': MLPClassifier(hidden_layer_sizes=(8), max_iter=50),
    # 'ADA': AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=None), n_estimators=5),
    # 'BAG': BaggingClassifier(estimator=DecisionTreeClassifier(max_depth=None), n_estimators=5),
}

model_zoo = {
    'ERM': (MimicClassifier, {}),
    'RW': (ReweightClassifier, {}),
    'ThrDP': (ThresholdClassifier, {'constraints': 'demographic_parity'}),
    'ThrEO': (ThresholdClassifier, {'constraints': 'equalized_odds'}),
    'RedDP': (ReductionClassifier, {'constraints': 'DemographicParity'}),
    'RedEO': (ReductionClassifier, {'constraints': 'EqualizedOdds'}),
    'AdaF1': (AdaFairClassifier, {'saIndex': 0, 'saValue': 0, 'CSB': 'CSB1', **ensemble_kwargs}),
    # 'AdaF2': (AdaFairClassifier, {'saIndex': 0, 'saValue': 0, 'CSB': 'CSB2', **ensemble_kwargs}),
}

seed = 42
n_runs = 1
n_split = 5

df_res = run_baseline_exp(dataset_zoo, base_model_zoo, model_zoo, n_splits=n_split, n_runs=n_runs, random_state=seed, verbose=True)

In [None]:
def plot_scatter_xy_tradeoff(
        df, x, y, group_key, style, 
        ax=None, title=None, errorbar=False, 
        **kwargs
):
    assert x in df.columns and y in df.columns
    assert group_key in df.columns and style in df.columns
    df_plot = df.groupby([group_key, style]).agg({x: ['mean', 'std'], y: ['mean', 'std']}).reset_index()

    if ax is None:
        fig, ax = plt.subplots(1, 1, figsize=(5, 5))
    for style_val in df_plot[style].unique():
        df_subplot = df_plot[df_plot[style] == style_val]
        x_mean, x_std = df_subplot[x]['mean'], df_subplot[x]['std']
        y_mean, y_std = df_subplot[y]['mean'], df_subplot[y]['std']
        
        ax.scatter(x_mean, y_mean, label=style_val, marker="o", s=50)
        if errorbar:
            ax.errorbar(
                x=x_mean, y=y_mean, 
                xerr=x_std, yerr=y_std, 
                ecolor='k', fmt='none', alpha=0.5,
                # capthick=2, capsize=5, 
            )
        ax.set(
            xlabel=x.upper(),
            ylabel=y.upper(),
        )
        ax.legend()
    return ax

def plot_xy_group_scatter_tradeoff(
        df, xs, ys, group_key, style,
        subfig_size=(3, 3), **kwargs
):
    n_x, n_y = len(xs), len(ys)
    h_ax, w_ax = subfig_size
    
    # one plot for one dataset
    dataset_unique = df['dataset'].unique()

    for data_name in dataset_unique:
        df_data = df[df['dataset'] == data_name]
        fig, axs = plt.subplots(n_y, n_x, figsize=(w_ax*n_x, h_ax*n_y))
        for i, x in enumerate(xs):
            for j, y in enumerate(ys):
                ax = plot_scatter_xy_tradeoff(
                    df_data, x, y, group_key, style, ax=axs[j, i], title=None, **kwargs
                )
        plt.suptitle(f"Dataset: {data_name}")
        plt.tight_layout()
        plt.show()


df_res['n_edit'] = 0
plot_xy_group_scatter_tradeoff(
    df_res, xs=['si', 'ge', 'dp', 'eo'], ys=['acc', 'bacc'], 
    group_key='n_edit', style='method', errorbar=True,
    subfig_size=(4, 4)
)

In [None]:
df_res[df_res['method'] == 'AdaF1']

In [None]:
file_name = f'./res_cache/baseline_clf{list(base_model_zoo.keys())}_data{list(dataset_zoo.keys())}_seed{seed}_split{n_split}.csv'
df_res.to_csv(file_name, index=False)

# df_res.to_csv(f'./res_cache/baseline_{dataname}_{s_attr}_seed{seed}_split{n_split}.csv', index=False))