In [6]:
import numpy as np
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
from ferm import FERM
from ferm_mt import FERM_MT
from data_loader import load_dataset

In [48]:
np.random.seed(912)

In [3]:
param_grid = [{
    'C': [0.1, 1.0, 10.0],
    'gamma': [0.001, 0.01, 0.1, 1.0],
    'kernel': ['rbf']
}]

# dtrain, dtest, sf = load_dataset('arrhythm')

In [61]:
data = np.vstack([dtrain.data, dtest.data])
target = np.hstack([dtrain.target, dtest.target])
sens = np.hstack([dtrain.sens, dtest.sens])

# perm = np.random.permutation(data.shape[0])
# data_ = data[perm]
# target_ = target[perm]
# sens_ = sens[perm]

# ntrain = int(data_.shape[0]*0.8)
# X_train, y_train, sens_train = data_[:ntrain, :], target_[:ntrain], sens_[:ntrain]
# X_test, y_test, sens_test = data_[ntrain:, :], target_[ntrain:], sens_[ntrain:]

In [62]:
from joblib import Parallel, delayed

def job_fn(model, data, target, sens, param_grid, split=0.8):
    '''
    Run job in parallel on diff random sets 
    and get fpr, tpr etc on the values. Sensitive 
    group agnostic permutations.
    '''
    perm = np.random.permutation(data.shape[0])
    ntrain = int(data.shape[0]*split)
    data, target, sens = data[perm], target[perm], sens[perm]
    X_train, y_train, sens_train = data[:ntrain, :], target[:ntrain], sens[:ntrain]
    X_test, y_test, sens_test = data[ntrain:, :], target[ntrain:], sens[ntrain:]
    clf = GridSearchCV(model, param_grid)
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    acc = accuracy_score(y_test, pred)
    tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()
    tpr = tp / (tp + fp)
    fpr = fp / (fp + tn)
    gt_0, gt_1 = y_test[sens_test == 0], y_test[sens_test == 1]
    p_0, p_1 = pred[sens_test == 0], pred[sens_test==1]
    tn_0, fp_0, fn_0, tp_0 = confusion_matrix(gt_0, p_0).ravel()
    tn_1, fp_1, fn_1, tp_1 = confusion_matrix(gt_1, p_1).ravel()
    fpr_sens = {'0': fp_0 / (fp_0 + tn_0), '1': fp_1 / (fp_1 + tn_1)}
    tpr_sens = {'0': tp_0 / (tp_0 + fp_0), '1': tp_1 / (tp_1 + fp_1)}
    return acc, tpr, fpr, tpr_sens, fpr_sens


def get_results(res):
    acc = np.array([r[0] for r in res])
    tpr = np.array([r[1] for r in res])
    fpr = np.array([r[2] for r in res])
    tpr_sens_0 = np.array([r[3]['0'] for r in res])
    tpr_sens_1 = np.array([r[3]['1'] for r in res])
    fpr_sens_0 = np.array([r[4]['0'] for r in res])
    fpr_sens_1 = np.array([r[4]['1'] for r in res])
    
    print(f'acc: {np.mean(acc)} ± {np.std(acc)}')
    print(f'TPR: {np.mean(tpr)} ± {np.std(tpr)}')
    print(f'dFPR: {np.mean(np.abs(fpr_sens_0 - fpr_sens_1))} ± {np.std(np.abs(fpr_sens_0 - fpr_sens_1))}')

In [63]:
svc = svm.SVC()
res = Parallel(n_jobs=8)([delayed(job_fn)(svc, data, target, sens, param_grid) for _ in range(5)])

In [64]:
get_results(res)

acc: 0.7736263736263735 ± 0.047444028889940454
TPR: 0.7472649843521697 ± 0.06155661322432872
dFPR: 0.15830461477520302 ± 0.10385149537310387


In [81]:
fmt = FERM_MT(sensible_feature=sens, rho=0.1)
res1 = Parallel(n_jobs=16)([delayed(job_fn)(fmt, data, target, sens, param_grid) for _ in range(5)])

In [82]:
get_results(res1)

acc: 0.6835164835164835 ± 0.030611842367437633
TPR: 0.7038564188522762 ± 0.0721365272974951
dFPR: 0.05626652237299702 ± 0.0473861393478715


In [71]:
ferm = FERM(sensible_feature=sens)
res2 = Parallel(n_jobs=16)([delayed(job_fn)(ferm, data, target, sens, param_grid) for _ in range(5)])

In [72]:
get_results(res2)

acc: 0.7450549450549451 ± 0.03139089419139712
TPR: 0.7442373997731182 ± 0.04801370476815631
dFPR: 0.21062759462759462 ± 0.12426776061245264


In [42]:
fpr_fmt_0 = [r[4]['0'] for r in res1]
fpr_fm_1 = [r[4]['1'] for r in res1]
fpr_ferm_0 = [r[4]['0'] for r in res2]
fpr_ferm_1 = [r[4]['1'] for r in res2]

In [44]:
np.mean(fpr_fmt_0), np.mean(fpr_fm_1)

(0.29986714975845413, 0.29342586352293654)

In [45]:
np.mean(fpr_ferm_0), np.mean(fpr_ferm_1)

(0.23407114624505931, 0.2400731652767091)

In [4]:
dtrain, dtest, sv = load_dataset('adult')

loading adult dataset...


In [7]:
import time
ferm_mt = FERM_MT(sensible_feature=dtrain.sens)
start = time.time()
ferm_mt.fit(dtrain.data, dtrain.target)
print('Elapsed: ', time.time() - start)

KeyboardInterrupt: 