## Pre-processing Mitigation Methods: AIF360



In [None]:
import numpy as np
from numpy.random import choice as np_choice
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import seaborn as sns
from IPython.display import Markdown, display
# from sklearn.decomposition import PCA

import fairness_helpers as fh
import global_variables as gv
import utilities

from aif360.algorithms.preprocessing import DisparateImpactRemover
from aif360.datasets import BinaryLabelDataset
from aif360.metrics import BinaryLabelDatasetMetric

#### get data

In [None]:
# Load the two objects returned by the project helper `fh.get_aif360_data()`.
# In the cells shown here only `X1` is used: it is passed as `df=` to every
# BinaryLabelDataset with 'CVD' as the label column and 'sex-binary',
# 'race-binary' and 'age-binary' as protected-attribute columns.
# NOTE(review): what `X` holds is not visible in this notebook; confirm against
# fairness_helpers.get_aif360_data.
X, X1 = fh.get_aif360_data()

#### get models

#### Step 1. Convert Pandas DataFrame into [BinaryLabelDataset datatype](https://aif360.readthedocs.io/en/latest/modules/generated/aif360.datasets.BinaryLabelDataset.html#aif360.datasets.BinaryLabelDataset)

In [None]:
# One BinaryLabelDataset per protected attribute, all built from the same frame
# (X1) with 'CVD' as the label column:
#   dataset1b -> 'sex-binary', dataset2b -> 'race-binary', dataset3b -> 'age-binary'
dataset1b, dataset2b, dataset3b = [
    BinaryLabelDataset(
        df=X1,
        label_names=['CVD'],
        protected_attribute_names=[attribute],
    )
    for attribute in ('sex-binary', 'race-binary', 'age-binary')
]

### Baseline. Direct Removal of Protected Attributes

### Method 1. Disparate Impact Remover

Edits feature values to increase group fairness while preserving rank-ordering within each group. You must specify the `repair_level` parameter (0.0 = no repair, 1.0 = full repair) to indicate how much the group distributions should overlap.

We want each individual's ranking within their group to be preserved after repair, i.e., if an individual has the highest ranking in group Q before repair, they will still have the highest ranking in group Q after repair.

In [None]:
# Build the BinaryLabelDataset that the Disparate Impact Remover will repair.
#
# The class is referenced through the name imported at the top of the notebook
# (`from aif360.datasets import BinaryLabelDataset`); the bare `aif360` package
# name is never imported, so `aif360.datasets...` would raise NameError.
# The frame and column names are the ones used throughout this notebook.
# NOTE(review): 'sex-binary' is used as the protected attribute for this demo;
# switch to 'race-binary' / 'age-binary' if another attribute is intended.
# NOTE(review): favorable_label=1 treats CVD == 1 as the favourable outcome;
# confirm this is the intended convention for the 'CVD' label.
binaryLabelDataset = BinaryLabelDataset(
    favorable_label=1,
    unfavorable_label=0,
    df=X1,
    label_names=['CVD'],
    protected_attribute_names=['sex-binary'])

In [None]:
# Transform the original dataset with the Disparate Impact Remover.
# repair_level = 1.0 is the upper end of the 0..1 range swept in the
# retraining loop further down.

di = DisparateImpactRemover(repair_level = 1.0)
# fit_transform() is called on the BinaryLabelDataset built in the previous cell;
# its result is kept under its own name rather than overwriting the input.
dataset_transf_train = di.fit_transform(binaryLabelDataset)
# Only the first element of what convert_to_dataframe() returns is kept.
# NOTE(review): presumably that element is the repaired data as a pandas
# DataFrame; confirm against the AIF360 docs.
transformed = dataset_transf_train.convert_to_dataframe()[0]

In [None]:
# Retrain a classifier on data repaired at increasing repair levels and record
# the disparate impact (DI) of its predictions at each level.
#
# NOTE(review): `LogisticRegression` is not imported anywhere in this notebook;
# add `from sklearn.linear_model import LogisticRegression` to the imports cell
# before running this cell.

protected = 'sex-binary'   # protected attribute audited in this experiment
SPLIT_SEED = 0             # fixed so the shuffled split is reproducible

# Shuffled 70/30 train/test split of the dataset whose protected attribute is
# `protected` (dataset1b was built with protected_attribute_names=['sex-binary']).
# NOTE(review): the 70/30 ratio is a placeholder choice; adjust as needed.
train, test = dataset1b.split([0.7], shuffle=True, seed=SPLIT_SEED)

# Column position of the protected attribute inside `.features`; that column is
# dropped before fitting so the classifier never sees the attribute directly.
index = train.feature_names.index(protected)

# Group definitions do not depend on the repair level, so build them once.
# NOTE(review): assumes value 1 of `protected` marks the privileged group; confirm.
p = [{protected: 1}]
u = [{protected: 0}]

DIs = []
for level in tqdm(np.linspace(0., 1., 11)):
    di = DisparateImpactRemover(repair_level=level)
    # The remover is applied to the train and test splits independently.
    train_repd = di.fit_transform(train)
    test_repd = di.fit_transform(test)

    X_tr = np.delete(train_repd.features, index, axis=1)
    X_te = np.delete(test_repd.features, index, axis=1)
    y_tr = train_repd.labels.ravel()

    lmod = LogisticRegression(class_weight='balanced', solver='liblinear')
    lmod.fit(X_tr, y_tr)

    # Copy the repaired test set and overwrite its labels with the predictions,
    # so the dataset metric below measures the classifier's output.
    test_repd_pred = test_repd.copy()
    test_repd_pred.labels = lmod.predict(X_te)

    cm = BinaryLabelDatasetMetric(test_repd_pred, privileged_groups=p, unprivileged_groups=u)
    DIs.append(cm.disparate_impact())

In [None]:
# bias evaluation
%matplotlib notebook

plt.plot(np.linspace(0, 1, 11), DIs, marker='o')
plt.plot([0, 1], [1, 1], 'g')
plt.plot([0, 1], [0.8, 0.8], 'r')
plt.ylim([0.4, 1.2])
plt.ylabel('Disparate Impact (DI)')
plt.xlabel('repair level')
plt.show()

### Method 2. Learning Fair Representations

Finds a latent representation that encodes the data well but obfuscates information about the protected attributes.

### Method 3. Optimized Preprocessing

Learns a probabilistic transformation that edits the features and labels in the data, subject to group fairness, individual distortion, and data fidelity constraints and objectives.

### Method 4. Reweighing 

Weights the examples in each (group, label) combination differently to ensure fairness before classification