This file shows steps to construct synthetic data using the method shown in the paper. <br>
We first import packages and set a seed

In [1]:
import sys
sys.path.append("models")
sys.path.append("AIF360/")
import numpy as np
from scipy.stats import bernoulli
from aif360.datasets import StandardDataset
#from fairness_metrics.tot_metrics import TPR, TNR
from aif360.algorithms.preprocessing.optim_preproc import OptimPreproc
from sklearn.linear_model import LogisticRegression
from aif360.metrics import BinaryLabelDatasetMetric
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import pandas as pd
from aif360.algorithms.preprocessing.optim_preproc_helpers.opt_tools import OptTools
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter("ignore", UserWarning)
#from compas_model import get_evaluation
np.random.seed(10)

pip install 'aif360[AdversarialDebiasing]'
pip install 'aif360[AdversarialDebiasing]'
pip install 'aif360[Reductions]'
pip install 'aif360[Reductions]'
pip install 'aif360[inFairness]'
pip install 'aif360[Reductions]'


We then define label column. In this example, I have 10000 0's and 10000 1's for the label column. 0 is the negative outcome and 1 is the positive outcome. <br>
We also need to define the sensitive attribute value and the correlation between sensitive attribute and the outcome

In [2]:
Y = [0] * 10000
Y.extend([1] * 10000)
# S here is the sensitive attribute where 0 is the unprivileged group and 1 is the privileged group
S = []
for i in Y:
    if i == 0:
        S.append(bernoulli.rvs(0.35))
    else:
        S.append(bernoulli.rvs(0.6))

The next section is to define non-sensitive features for prediction. In non-sensitive features, we need to determine the correlation between the non-sensitive feature and outcome, sensitive attibute and other non-sensitive features

In [3]:
df = pd.DataFrame()
df['y'] = Y
df['sens'] = S
df['tmp'] = 1
df.groupby(['y', 'sens']).count()
tmp = []
for i, j in zip(df['y'], df['sens']):
    if i == 0 and j == 0:
        tmp.append(np.random.poisson(lam=6))
    if i == 1 and j == 0:
        tmp.append(np.random.poisson(lam=3))
    if i == 0 and j == 1:
        tmp.append(np.random.poisson(lam=4))
    if i == 1 and j == 1:
        tmp.append(np.random.poisson(lam=2))
df['edu_orig'] = tmp
tmp = df['edu_orig'].sort_values().to_list()

# create 4 equal-width bins to bin continuous data into categorical
def cat_edu(x):
    if x <= tmp[int(len(tmp) / 4)]:
        return 'edu_cat1'
    elif x >= tmp[int(3 * len(tmp) / 4)]:
        return 'edu_cat4'
    elif x >= tmp[int(2 * len(tmp) / 4)]:
        return 'edu_cat3'
    else:
        return 'edu_cat2'

df['edu'] = df['edu_orig'].apply(lambda x: cat_edu(x))

The code above defines the first non-sensitive feature, education and how to convert the feature into categorical. 

In [4]:
temp = []
tmp_list = [(i * 2 - 1) * 0.1 + (j * 2 - 1) * 0.4 + np.log(z + 1) for i, j,
            z in zip(df['sens'].tolist(), df['y'].tolist(), df['edu_orig'].tolist())]
for i in tmp_list:
    temp.append(
        np.random.normal(
            loc=i,
            scale=1.5) +
        np.random.exponential(1) -
        1)
df['occuption_orig'] = temp
tmp = temp


def cat_occ(x):
    if x <= tmp[int(len(tmp) / 4)]:
        return 'occ_cat1'
    elif x >= tmp[int(3 * len(tmp) / 4)]:
        return 'occ_cat4'
    elif x >= tmp[int(2 * len(tmp) / 4)]:
        return 'occ_cat3'
    else:
        return 'occ_cat2'


df['occuption'] = df['occuption_orig'].apply(lambda x: cat_occ(x))


temp = []
tmp_list = [(i * 2 - 1) * 0.05 + (j * 2 -1)*0.05 +np.log(z +1) /2 +
            np.log(np.abs(k)+1)/5 for i,j,z,k in zip(df['sens'].tolist(),
                                                     df['y'].tolist(),
                                                     df['edu_orig'].tolist(),
                                                     df['occuption_orig'].tolist())]
for i in tmp_list:
    temp.append(np.random.normal(loc=i,scale=1.5) + np.random.exponential(1)-1)
df['age_orig'] = temp
tmp = temp
def cat_age(x):
    if x <= tmp[int(len(tmp) / 4)]:
        return 'age_cat1'
    elif x >= tmp[int(3 * len(tmp) / 4)]:
        return 'age_cat4'
    elif x >= tmp[int(2 * len(tmp) / 4)]:
        return 'age_cat3'
    else:
        return 'age_cat2'


df['age'] = df['age_orig'].apply(lambda x: cat_age(x))

In the part above, we define the other two non-sensitive features. Now, we have finished the synthetic data construction. <br>
Next, we will use the synthetic data to do our missing value experiment. 

In [5]:
# we first define the distort score metrics to use the categorical fix
def custom_distort(vold, vnew):
    distort = {}
    distort['edu'] = pd.DataFrame(
        {'edu_cat1': [0., 1., 2., 3., 100.],
         'edu_cat2': [1., 0., 1., 2., 100.],
         'edu_cat3': [2., 1., 0., 1., 100.],
         'edu_cat4': [3., 2., 1., 0., 100.],
         'missing': [0., 0., 0., 0., 1.]},
        index=['edu_cat1', 'edu_cat2', 'edu_cat3', 'edu_cat4', 'missing'])
    distort['occuption'] = pd.DataFrame(
        {'occ_cat1': [0., 1., 2., 3.],
         'occ_cat2': [1., 0., 1., 2.],
         'occ_cat3': [2., 1., 0., 1.],
         'occ_cat4': [3., 2., 1., 0.]},
        index=['occ_cat1', 'occ_cat2', 'occ_cat3', 'occ_cat4'])
    distort['age'] = pd.DataFrame(
        {'age_cat1': [0., 1., 2., 3.],
         'age_cat2': [1., 0., 1., 2.],
         'age_cat3': [2., 1., 0., 1.],
         'age_cat4': [3., 2., 1., 0.]},
        index=['age_cat1', 'age_cat2', 'age_cat3', 'age_cat4'])
    distort['sens'] = pd.DataFrame(
        {0.0: [0., 2.],
         1.0: [2., 0.]},
        index=[0.0, 1.0])
    distort['y'] = pd.DataFrame(
        {0.0: [0., 2.],
         1.0: [2., 0.]},
        index=[0.0, 1.0])

    total_cost = 0.0
    for k in vold:
        if k in vnew:
            total_cost += distort[k].loc[vnew[k], vold[k]]

    return total_cost


class CustomDataset(StandardDataset):
    """Adult Census Income Dataset.

    See :file:`aif360/data/raw/adult/README.md`.
    """

    def __init__(self, label_name='y',
                 favorable_classes=['1'],
                 protected_attribute_names=['x_control'],
                 privileged_classes=['0'],
                 instance_weights_name=None,
                 categorical_features=[],
                 features_to_keep=[], features_to_drop=[],
                 na_values=[''], custom_preprocessing=None,
                 df=None,
                 metadata=None):

        super().__init__(
            df=df,
            label_name=label_name,
            favorable_classes=favorable_classes,
            protected_attribute_names=protected_attribute_names,
            privileged_classes=privileged_classes,
            instance_weights_name=instance_weights_name,
            categorical_features=categorical_features,
            features_to_keep=features_to_keep,
            features_to_drop=features_to_drop,
            na_values=na_values,
            custom_preprocessing=custom_preprocessing,
            metadata=metadata)


In [6]:
print(df)

       y  sens  tmp  edu_orig       edu  occuption_orig occuption  age_orig  \
0      0     1    1         5  edu_cat4        0.880949  occ_cat1  2.367865   
1      0     0    1         6  edu_cat4        3.012828  occ_cat4 -0.077540   
2      0     0    1         4  edu_cat3        1.810059  occ_cat4 -1.949661   
3      0     1    1         3  edu_cat3        2.441403  occ_cat4 -0.745622   
4      0     0    1         2  edu_cat1        0.141826  occ_cat1 -0.425448   
...   ..   ...  ...       ...       ...             ...       ...       ...   
19995  1     1    1         3  edu_cat3       -0.707350  occ_cat1  0.056468   
19996  1     0    1         3  edu_cat3        1.771835  occ_cat4 -0.078363   
19997  1     0    1         4  edu_cat3        3.687200  occ_cat4  1.844894   
19998  1     0    1         4  edu_cat3        1.045248  occ_cat1 -0.908072   
19999  1     0    1         3  edu_cat3        0.706447  occ_cat1  2.685175   

            age  
0      age_cat4  
1      age_cat1

The section below is how we create missing values.

In [None]:
df['y'] = df['y'].astype(int)
df1 = df[['y', 'edu', 'occuption', 'age', 'sens']]
tot = []
for index, row in df1.iterrows():
    result = ''
    for j in df1.columns:
        result = result + str(row[j])
    tot.append(result)
df['tmp_feature'] = tot
df['mis_prob'] = 0
# here, the first element in the column 'tmp_feature' is the label (either 0 or 1 with 0 being negative outcome) 
# and the last element is sensitive value (either 0 or 1, with 0 being unprivileged)
# we define the proportion of missing values in the data. Here the missing values are under MAR
for i in df['tmp_feature'].unique():
    if i[0] == '1' and i[-1] == '0':
        df.loc[df['tmp_feature'] == i, 'mis_prob'] = 0.05
    elif i[-1] == '0':
        df.loc[df['tmp_feature'] == i, 'mis_prob'] = 0.5
    elif i[-1] != '0' and i[0] == '1':
        df.loc[df['tmp_feature'] == i, 'mis_prob'] = 0.05
    else:
        df.loc[df['tmp_feature'] == i, 'mis_prob'] = 0.03
new_label = []
for i, j in zip(df['mis_prob'], df['edu']):
    if np.random.binomial(1, i, 1)[0] == 1:
        new_label.append(-1)
    else:
        new_label.append(j)
df['edu'] = new_label
print('Total number of missing values')
print(len(df.loc[df['edu'] == -1, :].index))
print('Total number of observations')
print(len(df.index))

Total number of missing values
3970
Total number of observations
20000


In this section, we create our training and test set for training classifier and validation

In [None]:
def mod_edu(x):
    if x == -1:
        return 'missing'
    else:
        return x


df['edu'] = df['edu'].apply(lambda x: mod_edu(x))


df_pos = df.loc[df['y'] == 1, :]
df_neg = df.loc[df['y'] == 0, :]

df_train_pos, df_test_pos = train_test_split(
    df_pos, test_size=1000, random_state=10)
df_train_neg, df_test_neg = train_test_split(
    df_neg, test_size=1000, random_state=10)
df_test = df_test_pos.append(df_test_neg)

df_train_tot = df_train_pos.append(df_train_neg)

_, df_train_tot_pos = train_test_split(
    df_train_pos, test_size=4000, random_state=10)
_, df_train_tot_neg = train_test_split(
    df_train_neg, test_size=4000, random_state=10)
df_train = df_train_tot_pos.append(df_train_tot_neg)

Do the categorical fix using the training set

In [None]:
orig_cat_train = df_train[['y','age', 'sens', 'occuption', 'edu']]
orig_cat_test = df_test[['y', 'age','sens', 'occuption', 'edu']]

all_protected_attribute_maps = {"sens": {0.0: 0, 1.0: 1}}
D_features = ['sens']
dataset_orig_cat_train = CustomDataset(
    favorable_classes=[1], protected_attribute_names=['sens'], privileged_classes=[
        [1]], categorical_features=[
            'occuption', 'edu','age'], features_to_keep=[
                'occuption', 'edu', 'y', 'sens','age'], df=orig_cat_train, metadata={
                    'label_maps': [{1.0: 1, 0.0: 0}], 'protected_attribute_maps': [
                                all_protected_attribute_maps[x] for x in D_features]})

dataset_orig_cat_test = CustomDataset(
    favorable_classes=[1], protected_attribute_names=['sens'], privileged_classes=[
        [1]], categorical_features=[
            'occuption', 'edu','age'], features_to_keep=[
                'occuption', 'edu', 'y', 'sens','age'], df=orig_cat_test, metadata={
                    'label_maps': [{1.0: 1, 0.0: 0}], 'protected_attribute_maps': [
                                all_protected_attribute_maps[x] for x in D_features]})


privileged_groups = [{'sens': 1}]
unprivileged_groups = [{'sens': 0}]
optim_options = {
    "distortion_fun": custom_distort,
    "epsilon": 0.08,
    "clist": [0.99, 1.99, 2.99],
    "dlist": [.2, 0.1, 0]
}


OP = OptimPreproc(OptTools, optim_options,
                  unprivileged_groups=unprivileged_groups,
                  privileged_groups=privileged_groups)

OP = OP.fit(dataset_orig_cat_train)


dataset_transf_cat_test = OP.transform(dataset_orig_cat_test, transform_Y=True)
dataset_transf_cat_test = dataset_orig_cat_test.align_datasets(
    dataset_transf_cat_test)


dataset_transf_cat_train = OP.transform(
    dataset_orig_cat_train, transform_Y=True)
dataset_transf_cat_train = dataset_orig_cat_train.align_datasets(
    dataset_transf_cat_train)

Optimized Preprocessing: Objective converged to 0.161315
