In [1]:
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
from ferm import FERM
from ferm_mt import FERM_MT
from data_loader import load_dataset
from sklearn.utils import resample

np.random.seed(912)

In [45]:
param_grid = [{
    'C': [0.1, 1.0, 10.0],
    'gamma': [0.001, 0.01, 0.1, 1.0],
    'kernel': ['rbf']
}]

dtrain, dtest, sf = load_dataset('arrhythm')

loading arrhythmia dataset...


  df.colums = [i for i in range(279)]


In [48]:
data = np.vstack([dtrain.data, dtest.data])
target = np.hstack([dtrain.target, dtest.target])
sens = np.hstack([dtrain.sens, dtest.sens])

id_y1 = target == 1.0
id_ny1 = target == -1.0

data_y1 = data[id_y1]
data_ny1 = data[id_ny1]
y1 = target[id_y1]
ny1 = target[id_ny1]

ny1_ds = resample(ny1, replace=True, n_samples=y1.shape[0], random_state=912)
data_ny1_ds = resample(data_ny1, replace=True, n_samples=y1.shape[0], random_state=912)

data_n = np.vstack([data_y1, data_ny1_ds])
target_n = np.hstack([y1, ny1_ds])
sens_n = resample(sens, replace=True, n_samples=target_n.shape[0], random_state=912)

In [24]:
from joblib import Parallel, delayed

def job_fn(model, data, target, sens, param_grid, split=0.8):
    '''
    Run job in parallel on diff random sets 
    and get fpr, tpr etc on the values. Sensitive 
    group agnostic permutations.
    '''
    perm = np.random.permutation(data.shape[0])
    ntrain = int(data.shape[0]*split)
    data, target, sens = data[perm], target[perm], sens[perm]
    X_train, y_train, sens_train = data[:ntrain, :], target[:ntrain], sens[:ntrain]
    X_test, y_test, sens_test = data[ntrain:, :], target[ntrain:], sens[ntrain:]
    clf = GridSearchCV(model, param_grid)
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    acc = accuracy_score(y_test, pred)
    tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()
    tpr = tp / (tp + fp)
    fpr = fp / (fp + tn)
    gt_0, gt_1 = y_test[sens_test == 0], y_test[sens_test == 1]
    p_0, p_1 = pred[sens_test == 0], pred[sens_test==1]
    tn_0, fp_0, fn_0, tp_0 = confusion_matrix(gt_0, p_0).ravel()
    tn_1, fp_1, fn_1, tp_1 = confusion_matrix(gt_1, p_1).ravel()
    fpr_sens = {'0': fp_0 / (fp_0 + tn_0), '1': fp_1 / (fp_1 + tn_1)}
    tpr_sens = {'0': tp_0 / (tp_0 + fp_0), '1': tp_1 / (tp_1 + fp_1)}
    return acc, tpr, fpr, tpr_sens, fpr_sens


def get_results(res):
    acc = np.array([r[0] for r in res])
    tpr = np.array([r[1] for r in res])
    fpr = np.array([r[2] for r in res])
    tpr_sens_0 = np.array([r[3]['0'] for r in res])
    tpr_sens_1 = np.array([r[3]['1'] for r in res])
    fpr_sens_0 = np.array([r[4]['0'] for r in res])
    fpr_sens_1 = np.array([r[4]['1'] for r in res])
    
    print(f'acc: {np.mean(acc)} ± {np.std(acc)}')
    print(f'TPR: {np.mean(tpr)} ± {np.std(tpr)}')
    print(f'dFPR: {np.mean(np.abs(fpr_sens_0 - fpr_sens_1))} ± {np.std(np.abs(fpr_sens_0 - fpr_sens_1))}')

In [25]:
svc = svm.SVC()
res = Parallel(n_jobs=8)([delayed(job_fn)(svc, data, target, sens, param_grid) for _ in range(5)])

In [26]:
get_results(res)

acc: 0.6976127320954907 ± 0.018756148042083472
TPR: 0.6728084727678224 ± 0.018810578050465116
dFPR: 0.12690230625411061 ± 0.08754575554499024


In [27]:
fmt = FERM_MT(sensible_feature=sens_n, rho=0.1)
res1 = Parallel(n_jobs=16)([delayed(job_fn)(fmt, data, target, sens, param_grid) for _ in range(5)])

In [28]:
get_results(res1)

acc: 0.6944297082228117 ± 0.011051812549601962
TPR: 0.661987073926553 ± 0.03294750101954578
dFPR: 0.07449569629941082 ± 0.04708200449115961


In [29]:
ferm = FERM(sensible_feature=sens_n)
res2 = Parallel(n_jobs=16)([delayed(job_fn)(ferm, data, target, sens, param_grid) for _ in range(5)])

In [30]:
get_results(res2)

acc: 0.6870026525198939 ± 0.017833141977688715
TPR: 0.6483849263409208 ± 0.017989104731772027
dFPR: 0.08537689607738352 ± 0.035846356385355244


In [5]:
data_n.shape, target_n.shape, sens_n.shape

((490, 278), (490,), (490,))

In [41]:
s1 = (sens == 1).sum()/target[id_y1].shape[0]
s2 = (sens == 0).sum()/target[id_y1].shape[0]
s1n = (sens == 1).sum()/target[id_ny1].shape[0]
s2n = (sens == 0).sum()/target[id_ny1].shape[0]
s1, s2, s1n, s2n

(2.0306965761511218,
 0.1936245572609209,
 1.6586306653809064,
 0.15814850530376084)

In [49]:
(sens[id_y1] == 1).sum(), (sens[id_y1] == 0).sum()

(160, 85)

In [50]:
(sens[id_ny1] == 1).sum(),(sens[id_ny1] == 0).sum()

(89, 117)