In [1]:
import numpy as np

import pandas as pd

import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
from tensorflow.keras.optimizers import SGD, Adam

import copy

import aif360
from aif360.datasets import AdultDataset, BankDataset, CompasDataset, GermanDataset
from aif360.metrics import BinaryLabelDatasetMetric, ClassificationMetric
from aif360.algorithms import preprocessing, inprocessing, postprocessing
from aif360.algorithms.preprocessing.optim_preproc import OptimPreproc
from aif360.algorithms.preprocessing.optim_preproc_helpers.data_preproc_functions\
            import load_preproc_data_adult, load_preproc_data_german, load_preproc_data_compas
from aif360.algorithms.preprocessing.optim_preproc_helpers.distortion_functions\
            import get_distortion_adult, get_distortion_german, get_distortion_compas
from aif360.algorithms.preprocessing.optim_preproc_helpers.opt_tools import OptTools

from IPython.display import Markdown, display
import warnings
warnings.filterwarnings('ignore')
%load_ext jupyternotify
#np.random.seed(1)

<IPython.core.display.Javascript object>

In [2]:
def get_comparison_algo(inprocessing_algo):
    if isinstance(inprocessing_algo, inprocessing.PrejudiceRemover):
        return sklearn.linear_model.LogisticRegression()
    if isinstance(inprocessing_algo, inprocessing.GerryFairClassifier):
        return sklearn.linear_model.LogisticRegression()
    if isinstance(inprocessing_algo, inprocessing.MetaFairClassifier):
        return BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5)
    if isinstance(inprocessing_algo, inprocessing.ExponentiatedGradientReduction):
        return sklearn.linear_model.LogisticRegression()
    if isinstance(inprocessing_algo, inprocessing.GridSearchReduction):
        return sklearn.linear_model.LogisticRegression()

In [3]:
def get_model_name(model):
    if isinstance(model, sklearn.linear_model.LogisticRegression):
        return "Logistic Regression"
    if isinstance(model, sklearn.linear_model.LinearRegression):
        return "Linear Regression"
    if isinstance(model, sklearn.ensemble.BaggingClassifier):
        return "Meta Classifier"
    
    if isinstance(model, preprocessing.DisparateImpactRemover):
        return "DIR"
    if isinstance(model, preprocessing.Reweighing):
        return "RW"
    
    if isinstance(model, inprocessing.PrejudiceRemover):
        return "PR"
    if isinstance(model, inprocessing.ExponentiatedGradientReduction):
        return "EGR"
    if isinstance(model, inprocessing.GerryFairClassifier):
        return "GFC"
    if isinstance(model, inprocessing.GridSearchReduction):
        return "GSR"
    if isinstance(model, inprocessing.MetaFairClassifier):
        return "MFC"
    
    if isinstance(model, postprocessing.EqOddsPostprocessing):
        return "EOP"
    if isinstance(model, postprocessing.CalibratedEqOddsPostprocessing):
        return "CEOP"
    if isinstance(model, postprocessing.RejectOptionClassification):
        return "ROC"
    
    return "Logistic Regression"

In [4]:
def analyze_algo(dataset_train, dataset_test, privileged_groups, unprivileged_groups, classifier=None, 
                 preprocessing_algo=None, inprocessing_algo=None, postprocessing_algo=None):
    base = sklearn.linear_model.LogisticRegression()
    if inprocessing_algo is not None:
        base = get_comparison_algo(inprocessing_algo)
    '''base.fit(dataset_train.features, dataset_train.labels.ravel())
    results = base.predict(dataset_test.features)
    if isinstance(base, sklearn.linear_model.LinearRegression):
        results = np.rint(results)
    dataset_test_pred = dataset_test.copy()
    dataset_test_pred.labels = np.array([results]).transpose()'''
    
    dataset_train_pred = dataset_train.copy(deepcopy=True)
    dataset_test_pred = dataset_test.copy(deepcopy=True)
    
    dataset_test_features = dataset_test.features
    
    scale_orig = StandardScaler()
    X_train = dataset_train.features#scale_orig.fit_transform(dataset_train.features)
    y_train = dataset_train.labels.ravel()
    model = base
    model.fit(X_train, y_train)

    fav_idx = np.where(model.classes_ == dataset_train.favorable_label)[0][0]
    y_train_pred_prob = model.predict_proba(X_train)[:,fav_idx]

    # Prediction probs for testing data
    X_test = dataset_test.features#scale_orig.transform(dataset_test.features)
    y_test_pred_prob = model.predict_proba(X_test)[:,fav_idx]

    dataset_train_pred.scores = y_train_pred_prob.reshape(-1,1)
    dataset_test_pred.scores = y_test_pred_prob.reshape(-1,1)

    class_thresh = 0.5
    y_train_pred = np.zeros_like(dataset_train_pred.labels)
    y_train_pred[y_train_pred_prob >= class_thresh] = dataset_train_pred.favorable_label
    y_train_pred[~(y_train_pred_prob >= class_thresh)] = dataset_train_pred.unfavorable_label
    dataset_train_pred.labels = y_train_pred

    y_test_pred = np.zeros_like(dataset_test_pred.labels)
    y_test_pred[y_test_pred_prob >= class_thresh] = dataset_test_pred.favorable_label
    y_test_pred[~(y_test_pred_prob >= class_thresh)] = dataset_test_pred.unfavorable_label
    dataset_test_pred.labels = y_test_pred
    
    dataset_test.features = dataset_test_features
    
    CM = ClassificationMetric(dataset_test,
                              dataset_test_pred,
                              unprivileged_groups=unprivileged_groups,
                              privileged_groups=privileged_groups)
    #print(f"Orig Consistency: {CM.consistency()}")
    
    if preprocessing_algo is not None:
        dataset_train = preprocessing_algo.fit_transform(dataset_train)
    
    if inprocessing_algo is not None:
        model = inprocessing_algo
        dataset_train_pred = dataset_train.copy(deepcopy=True)
        dataset_test_pred = dataset_test.copy(deepcopy=True)

        scale_orig = StandardScaler()
        #X_train = scale_orig.fit_transform(dataset_train.features)
        dataset_train.features = dataset_train.features#scale_orig.fit_transform(dataset_train.features)
        y_train = dataset_train.labels.ravel()
        #model = GSR
        model.fit(dataset_train)

        fav_idx = np.where(np.array([0, 1]) == dataset_train.favorable_label)[0][0]
        y_train_pred_prob = model.predict(dataset_train).scores
        #print(y_train_pred_prob)
        y_train_pred_prob = y_train_pred_prob#[:,fav_idx]

        # Prediction probs for testing data
        #X_test = scale_orig.transform(dataset_test.features)
        dataset_transf_test = dataset_test.copy(deepcopy=True)
        dataset_transf_test.features = dataset_test.features#scale_orig.transform(dataset_test.features)
        y_test_pred_prob = model.predict(dataset_transf_test).scores#[:,fav_idx]

        dataset_train_pred.scores = y_train_pred_prob.reshape(-1,1)
        dataset_test_pred.scores = y_test_pred_prob.reshape(-1,1)

        class_thresh = 0.5
        y_train_pred = np.zeros_like(dataset_train_pred.labels)
        y_train_pred[y_train_pred_prob >= class_thresh] = dataset_train_pred.favorable_label
        y_train_pred[~(y_train_pred_prob >= class_thresh)] = dataset_train_pred.unfavorable_label
        dataset_train_pred.labels = y_train_pred

        y_test_pred = np.zeros_like(dataset_test_pred.labels)
        y_test_pred[y_test_pred_prob >= class_thresh] = dataset_test_pred.favorable_label
        y_test_pred[~(y_test_pred_prob >= class_thresh)] = dataset_test_pred.unfavorable_label
        dataset_test_pred.labels = y_test_pred
    else:
        dataset_train_pred = dataset_train.copy(deepcopy=True)
        dataset_test_pred = dataset_test.copy(deepcopy=True)

        scale_orig = StandardScaler()
        X_train = dataset_train.features#scale_orig.fit_transform(dataset_train.features)
        y_train = dataset_train.labels.ravel()
        model = base
        model.fit(X_train, y_train)

        fav_idx = np.where(model.classes_ == dataset_train.favorable_label)[0][0]
        y_train_pred_prob = model.predict_proba(X_train)[:,fav_idx]

        # Prediction probs for testing data
        X_test = dataset_test.features#scale_orig.transform(dataset_test.features)
        y_test_pred_prob = model.predict_proba(X_test)[:,fav_idx]

        dataset_train_pred.scores = y_train_pred_prob.reshape(-1,1)
        dataset_test_pred.scores = y_test_pred_prob.reshape(-1,1)

        class_thresh = 0.5
        y_train_pred = np.zeros_like(dataset_train_pred.labels)
        y_train_pred[y_train_pred_prob >= class_thresh] = dataset_train_pred.favorable_label
        y_train_pred[~(y_train_pred_prob >= class_thresh)] = dataset_train_pred.unfavorable_label
        dataset_train_pred.labels = y_train_pred

        y_test_pred = np.zeros_like(dataset_test_pred.labels)
        y_test_pred[y_test_pred_prob >= class_thresh] = dataset_test_pred.favorable_label
        y_test_pred[~(y_test_pred_prob >= class_thresh)] = dataset_test_pred.unfavorable_label
        dataset_test_pred.labels = y_test_pred
        
    #dataset_test.features = dataset_test_features
    
    if postprocessing_algo is not None:
        dataset_test_pred = postprocessing_algo.fit_predict(dataset_test, dataset_test_pred)
        
    dataset_test.features = dataset_test_features
    
    CM = ClassificationMetric(dataset_test,
                              dataset_test_pred,
                              unprivileged_groups=unprivileged_groups,
                              privileged_groups=privileged_groups)
    name = []
    if preprocessing_algo is not None:
        name.append(get_model_name(preprocessing_algo))
    if inprocessing_algo is not None:
        name.append(get_model_name(inprocessing_algo))
    if postprocessing_algo is not None:
        name.append(get_model_name(postprocessing_algo))
    if len(name) == 0:
        name.append("Logistic Regression")
    print(f"Consistency Score for {' + '.join(name)}: {CM.consistency()}")

In [5]:
%%notify
dataset = BankDataset(
    protected_attribute_names=['age'],
    privileged_classes=[lambda x: x >= 25], #age >= 25 is privileged
    features_to_drop=['day_of_week'] #ignore sex-related stuff
)
privileged_groups = [{'age': 1}]
unprivileged_groups = [{'age': 0}]

dataset_train, dataset_test = dataset.split([0.7], shuffle = True)

preprocessing_algos = [None,
                       preprocessing.Reweighing(unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups),
                       preprocessing.DisparateImpactRemover()]
inprocessing_algos = [None,
                      inprocessing.ExponentiatedGradientReduction(sklearn.linear_model.LogisticRegression(), constraints="DemographicParity", drop_prot_attr=False),
                      inprocessing.GridSearchReduction(sklearn.linear_model.LogisticRegression(), constraints="DemographicParity", drop_prot_attr=False),
                      inprocessing.PrejudiceRemover()]
postprocessing_algos = [None,
                        postprocessing.CalibratedEqOddsPostprocessing(unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups),
                        postprocessing.RejectOptionClassification(unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups),
                        postprocessing.EqOddsPostprocessing(unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups)]

for post in postprocessing_algos:
    for inproc in inprocessing_algos:
        for pre in preprocessing_algos:
            analyze_algo(dataset_train, dataset_test, privileged_groups, unprivileged_groups, 
                 preprocessing_algo=copy.deepcopy(pre), 
                 inprocessing_algo=copy.deepcopy(inproc),
                 postprocessing_algo=copy.deepcopy(post))



Consistency Score for Logistic Regression: [0.89452279]
Consistency Score for RW: [0.89452279]
Consistency Score for DIR: [0.89452279]
Consistency Score for EGR: [0.89452279]
Consistency Score for RW + EGR: [0.89452279]
Consistency Score for DIR + EGR: [0.89452279]
Consistency Score for GSR: [0.89452279]
Consistency Score for RW + GSR: [0.89452279]
Consistency Score for DIR + GSR: [0.89452279]
Consistency Score for PR: [0.89452279]
Consistency Score for RW + PR: [0.89452279]
Consistency Score for DIR + PR: [0.89452279]
Consistency Score for CEOP: [0.89452279]
Consistency Score for RW + CEOP: [0.89452279]
Consistency Score for DIR + CEOP: [0.89452279]
Consistency Score for EGR + CEOP: [0.89452279]
Consistency Score for RW + EGR + CEOP: [0.89452279]
Consistency Score for DIR + EGR + CEOP: [0.89452279]
Consistency Score for GSR + CEOP: [0.89452279]
Consistency Score for RW + GSR + CEOP: [0.89452279]
Consistency Score for DIR + GSR + CEOP: [0.89452279]
Consistency Score for PR + CEOP: [0.8

<IPython.core.display.Javascript object>

In [6]:
#Logistic Regression Only

dataset = BankDataset(
    protected_attribute_names=['age'],
    privileged_classes=[lambda x: x >= 25], #age >= 25 is privileged
    features_to_drop=['day_of_week'] #ignore sex-related stuff
)
privileged_groups = [{'age': 1}]
unprivileged_groups = [{'age': 0}]
base = sklearn.linear_model.LogisticRegression()

dataset_train_pred = dataset_train.copy(deepcopy=True)
dataset_test_pred = dataset_test.copy(deepcopy=True)

dataset_test_features = dataset_test.features

scale_orig = StandardScaler()
X_train = dataset_train.features#scale_orig.fit_transform(dataset_train.features)
y_train = dataset_train.labels.ravel()
model = base
model.fit(X_train, y_train)

fav_idx = np.where(model.classes_ == dataset_train.favorable_label)[0][0]
y_train_pred_prob = model.predict_proba(X_train)[:,fav_idx]

# Prediction probs for testing data
X_test = dataset_test.features#scale_orig.transform(dataset_test.features)
y_test_pred_prob = model.predict_proba(X_test)[:,fav_idx]

dataset_train_pred.scores = y_train_pred_prob.reshape(-1,1)
dataset_test_pred.scores = y_test_pred_prob.reshape(-1,1)

class_thresh = 0.5
y_train_pred = np.zeros_like(dataset_train_pred.labels)
y_train_pred[y_train_pred_prob >= class_thresh] = dataset_train_pred.favorable_label
y_train_pred[~(y_train_pred_prob >= class_thresh)] = dataset_train_pred.unfavorable_label
dataset_train_pred.labels = y_train_pred

y_test_pred = np.zeros_like(dataset_test_pred.labels)
y_test_pred[y_test_pred_prob >= class_thresh] = dataset_test_pred.favorable_label
y_test_pred[~(y_test_pred_prob >= class_thresh)] = dataset_test_pred.unfavorable_label
dataset_test_pred.labels = y_test_pred

dataset_test.features = dataset_test_features

CM = ClassificationMetric(dataset_test,
                          dataset_test_pred,
                          unprivileged_groups=unprivileged_groups,
                          privileged_groups=privileged_groups)
print(f"Orig Consistency: {CM.consistency()}")



Orig Consistency: [0.89452279]


In [7]:
dataset = BankDataset(
    protected_attribute_names=['age'],
    privileged_classes=[lambda x: x >= 25], #age >= 25 is privileged
    features_to_drop=['day_of_week'] #ignore sex-related stuff
)
privileged_groups = [{'age': 1}]
unprivileged_groups = [{'age': 0}]
base = sklearn.linear_model.LogisticRegression()

RW = preprocessing.Reweighing(unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups)
dataset_train = RW.fit_transform(dataset_train)

dataset_train_pred = dataset_train.copy(deepcopy=True)
dataset_test_pred = dataset_test.copy(deepcopy=True)

dataset_test_features = dataset_test.features

scale_orig = StandardScaler()
X_train = dataset_train.features#scale_orig.fit_transform(dataset_train.features)
y_train = dataset_train.labels.ravel()
model = base
model.fit(X_train, y_train)

fav_idx = np.where(model.classes_ == dataset_train.favorable_label)[0][0]
y_train_pred_prob = model.predict_proba(X_train)[:,fav_idx]

# Prediction probs for testing data
X_test = dataset_test.features#scale_orig.transform(dataset_test.features)
y_test_pred_prob = model.predict_proba(X_test)[:,fav_idx]

dataset_train_pred.scores = y_train_pred_prob.reshape(-1,1)
dataset_test_pred.scores = y_test_pred_prob.reshape(-1,1)

class_thresh = 0.5
y_train_pred = np.zeros_like(dataset_train_pred.labels)
y_train_pred[y_train_pred_prob >= class_thresh] = dataset_train_pred.favorable_label
y_train_pred[~(y_train_pred_prob >= class_thresh)] = dataset_train_pred.unfavorable_label
dataset_train_pred.labels = y_train_pred

y_test_pred = np.zeros_like(dataset_test_pred.labels)
y_test_pred[y_test_pred_prob >= class_thresh] = dataset_test_pred.favorable_label
y_test_pred[~(y_test_pred_prob >= class_thresh)] = dataset_test_pred.unfavorable_label
dataset_test_pred.labels = y_test_pred

dataset_test.features = dataset_test_features

CM = ClassificationMetric(dataset_test,
                          dataset_test_pred,
                          unprivileged_groups=unprivileged_groups,
                          privileged_groups=privileged_groups)
print(f"RW Consistency: {CM.consistency()}")



RW Consistency: [0.89452279]
