<img src="background_1.png">

<h1> AEQUITAS Hands-on Demonstrator </h1>

<img src="background_3.png">

<h3> Libraries </h3>

In [1]:
import sys
sys.path.append("../")
import numpy as np
import pandas as pd
pd.set_option('display.width', 500)
from aequitas.engine import Aequitas, NpEncoder
import aequitas.tools.data_manip as dm
import aequitas.tools as tools
from aequitas.gateway import Gateway

<h5> Initialize connection with the context engine</h5>

In [2]:
# Flag forwarded as the `filesystem` argument when elements are saved through the gateway.
fs_only = False
# Handle to the context engine at the given host.
# NOTE(review): 'review' is presumably the project name — confirm against Gateway.
gw = Gateway('review', host='http://rain.cs.umu.se/aeq/')

<h3> Importing the dataset </h3>

In [3]:
# Read the candidates CSV from the datasets folder one level above the notebook.
dataset_name = "IT_candidates.csv"
dataset_directory = f"../datasets/{dataset_name}"  # full relative path to the CSV file
dataset = pd.read_csv(dataset_directory)

<h3> Pre-processing the original dataset </h3>

<h5> Addressing missing values in data </h5>

In [4]:
# Remove the 'education' column since 'educational-num' is also present.
# errors='ignore' keeps this cell re-runnable once the column is already gone.
dataset = dataset.drop(columns='education', errors='ignore')

num_data = dataset.shape[0]
col_names = dataset.columns

# Turn every "?" entry into NaN across all columns in one vectorised call.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
dataset = dataset.replace("?", np.nan)

# Fill the missing entries of each column with that column's most frequent value.
# A column with no observed values is left untouched, because value_counts()
# would be empty and indexing its first entry would raise IndexError.
dataset = dataset.apply(
    lambda x: x.fillna(x.value_counts().index[0]) if x.notna().any() else x
)

<h3> Examples of use for the AEQUITAS objects </h3>

<p> Example 1 - Empty parameters file </p>

In [5]:
# Example 1: build an Aequitas object from an empty parameters dictionary.
parameters = dict()
Aeq_dataset = Aequitas(dataset, parameters)
# Report the structure of the dataset (verbose output).
Aeq_dataset.structure(verbose=True)

Dataset:
        Column Name Data Type Column Type (suggestion)  Number_Values                                             Values
0               age     int64               Continuous             74                                                  -
1         workclass      text      Categorical/Ordinal              8  [Private, Local-gov, Self-emp-not-inc, Federal...
2   educational-num     int64      Categorical/Ordinal             16  [7, 9, 12, 10, 6, 15, 4, 13, 14, 16, 3, 11, 5,...
3    marital-status      text      Categorical/Ordinal              7  [Never-married, Married-civ-spouse, Widowed, D...
4            skills      text      Categorical/Ordinal             14  [C++;Git, C;C++;Python, C;Python, Python, Swif...
5      relationship      text      Categorical/Ordinal              6  [Own-child, Husband, Not-in-family, Unmarried,...
6              race      text      Categorical/Ordinal              5  [Black, White, Asian-Pac-Islander, Other, Amer...
7            gender    

##### Regrouping categories according to 'race' values

In [6]:
# Collapse the 'race' values into two labels: 'White' and 'Minority'.
# Each entry of `groups` lists the original values merged under the label
# at the same position in `labels`.
groups = [
    ['White'],
    ['Black', 'Asian-Pac-Islander', 'Other', 'Amer-Indian-Eskimo'],
]
labels = ['White', 'Minority']
dataset["race"] = dm.merge_values(dataset["race"], groups, labels)
# Show the values remaining after the merge.
print("Unique values: ", dataset["race"].unique())

Unique values:  ['Minority' 'White']


<p> Example 2 - Basic parameters file </p>

In [7]:
# Example 2: parameters that only name the class attribute.
parameters = dict(
    class_attribute=dict(name="candidate"),
)
Aeq_dataset = Aequitas(dataset, parameters)
# Report descriptive statistics (verbose output).
Aeq_dataset.descriptive_stats(verbose=True)

Proportions: (candidate)
            0
No   0.760718
Yes  0.239282



<p> Example 3 - Parameters file with sensitive attribute definition</p>

In [8]:
# Example 3: class attribute with its positive value, plus two sensitive attributes.
parameters = dict(
    class_attribute=dict(name="candidate", positive_value="Yes"),
    sensitive_attributes=[dict(name=attribute) for attribute in ("gender", "race")],
)
Aeq_dataset = Aequitas(dataset, parameters)
# Report descriptive statistics (verbose output).
Aeq_dataset.descriptive_stats(verbose=True)

Proportions: (candidate)
            0
No   0.760718
Yes  0.239282

Proportions: (gender)
               0
Male    0.668482
Female  0.331518

Proportions: (race)
                 0
White     0.855043
Minority  0.144957

Outcome distribution by group:
              No       Yes
Female  0.890749  0.109251
Male    0.696233  0.303767

Outcome distribution by group:
                No       Yes
Minority  0.847458  0.152542
White     0.746013  0.253987


Association between gender and race.
Contingency Table:
race    Minority  White
gender                 
Female      3165  13027
Male        3915  28735

Chi-squared statistic: 497.9678182429906
Cramer's V: 0.10087228311688282
Degrees of Freedom: 1
p-value: 2.6310785315092373e-110
There is a statistically significant association between gender and race.

Association between gender and candidate.
Contingency Table:
candidate     No   Yes
gender                
Female     14423  1769
Male       22732  9918

Chi-squared statistic: 2248.847679013

<h5> Save dataset analysis so far to the context engine </h5>

In [9]:
# Save the parameters of the current Aequitas object under the "dataset" key.
# `fs_only` is forwarded as the `filesystem` argument.
# NOTE(review): the recorded run of this cell failed with a ConnectionError
# (the gateway host name could not be resolved) and the cells after it were
# not executed.
gw.save_element(Aeq_dataset.parameters, element_key="dataset", filesystem=fs_only)

ConnectionError: HTTPConnectionPool(host='rain.cs.umu.se', port=80): Max retries exceeded with url: /aeq/project/review_1/data/dataset/0 (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7f939257bcd0>: Failed to resolve 'rain.cs.umu.se' ([Errno -3] Temporary failure in name resolution)"))

<p> Splitting in training and testing samples </p>

In [None]:
# Split the dataset into a training sample and a test sample.
# random_state is fixed at 51 (presumably to make the split reproducible).
# NOTE(review): ratio=0.3 is presumably the share of rows assigned to the test
# sample — confirm against dm.split_dataset.
training_sample,test_sample = dm.split_dataset(dataset,ratio=0.3, random_state=51)

<p> Definition of the parameters dictionary and the two corresponding Aequitas objects</p>

In [None]:
# Parameters shared by the training and test objects: class attribute
# 'candidate' (positive value 'Yes') and one sensitive attribute, 'gender',
# with 'Male' declared as the privileged group.
parameters = dict(
    class_attribute=dict(name="candidate", positive_value="Yes"),
    sensitive_attributes=[
        dict(name="gender", privileged_group="Male"),
    ],
)

# One Aequitas object per sample, both built from the same parameters.
Aeq_training = Aequitas(training_sample, parameters)
Aeq_test = Aequitas(test_sample, parameters)

Preliminary metric evaluation: Statistical Parity

In [None]:
# Compute the descriptive statistics silently, then report statistical parity
# on the training sample.
# NOTE(review): presumably descriptive_stats must run first to populate what
# statistical_parity needs — confirm.
Aeq_training.descriptive_stats(verbose=False)
Aeq_training.statistical_parity(verbose=True)

Preliminary metric evaluation: Disparate Impact

In [None]:
# Report disparate impact on the training sample (verbose output).
Aeq_training.disparate_impact(verbose=True)

Definition of the parameters for encoding transformations

In [None]:
# Per-column transformation instructions, keyed by column name.
# Each entry may carry an "encode" method (with optional explicit "labels")
# and/or a "scaling" method.
transform_dictionary = {
    # Two-valued columns: "labeling" with an explicit value -> integer map.
    **{
        column: {"encode": "labeling", "labels": {low: 0, high: 1}}
        for column, (low, high) in (
            ("candidate", ("No", "Yes")),
            ("gender", ("Female", "Male")),
            ("race", ("Minority", "White")),
        )
    },
    # Columns that are label-encoded and then min-max scaled.
    **{
        column: {"encode": "labeling", "scaling": "min-max"}
        for column in (
            "workclass",
            "marital-status",
            "skills",
            "relationship",
            "native-country",
        )
    },
    # Columns that are only scaled.
    "age": {"scaling": "standard"},
    "educational-num": {"scaling": "min-max"},
    **{
        column: {"scaling": "standard"}
        for column in ("capital-gain", "capital-loss", "hours-per-week")
    },
}

# Hand the same transformation instructions to both the training and the test object.
Aeq_training.transform_instructions(transform_dictionary)
Aeq_test.transform_instructions(transform_dictionary)

<h3> Mitigation of Bias with AEQUITAS library </h3>

<p> Applying 'Massaging' to the training dataset over the sensitive attribute 'gender' and class 'candidate'</p>

In [None]:
# Apply the 'massaging' mitigation method for the sensitive attribute 'gender';
# the result is bound to a separate name, Aeq_training_unbiased.
Aeq_training_unbiased=Aeq_training.mitigation(method='massaging', sensitive_attribute='gender')
# Report statistical parity on the mitigated training object.
Aeq_training_unbiased.statistical_parity(verbose=True)

<h3> Train and test a classifier using the modified "unbiased" training sample</h3>

<p> Define classifier parameters </p>

In [None]:
# Classifier choice and the keyword settings passed along when training it.
classifier_type = "Decision_Tree"
classifier_params = dict(random_state=42, min_samples_leaf=10)
# Name of the class attribute, read back from the training object's parameters.
class_attribute=Aeq_training.parameters["class_attribute"]["name"]

In [None]:
# Apply the registered transformations so both samples hold numerical values.
Aeq_training_unbiased.transform()
Aeq_test.transform()

# Train on the mitigated training data, then evaluate on the test data.
# Only the first value returned by test_classifier (the predicted test sample)
# is kept; the other three are discarded.
clf=tools.train_classifier(Aeq_training_unbiased.dataset,class_attribute,classifier_type,classifier_params)
predicted_test_sample, _, _, _= tools.test_classifier(clf,Aeq_test.dataset,class_attribute,verbose=True)

# Revert both objects to their text values for the following cells.
Aeq_training_unbiased.inverse_transform()
Aeq_test.inverse_transform()

Define a new Aequitas object for the predicted test sample

In [None]:
# Start from a copy of the test object and swap in the predicted test sample.
Aeq_predicted_test=Aeq_test.copy()
Aeq_predicted_test.set_dataset(predicted_test_sample)
# NOTE(review): predicted_test_sample presumably still holds the numerical
# encoding, hence the inverse transform back to text values — confirm.
Aeq_predicted_test.inverse_transform()

Checking the statistical parity and the disparate impact on the predicted test sample

In [None]:
# Report statistical parity and disparate impact on the predicted test sample.
Aeq_predicted_test.statistical_parity(verbose=True)
Aeq_predicted_test.disparate_impact(verbose=True)

<p> Computing Equal Opportunity and Equalized Odds on the prediction results</p>

In [None]:
# Predicted class values, taken from the class-attribute column of the
# predicted test sample and converted to a NumPy array.
prediction=np.array(Aeq_predicted_test.dataset[class_attribute])
# Evaluate equal opportunity and equal odds on the test object, passing the predictions.
Aeq_test.equal_opportunity(prediction,verbose=True)
Aeq_test.equal_odds(prediction,verbose=True)

<h5> Update the context engine: Training sample, Modified Training Sample, Test sample </h5>

In [None]:
# Save the parameters of the training, mitigated-training and test objects
# to the context engine.
# `filesystem=fs_only` is forwarded on every call, as the earlier dataset save
# already does, so the single `fs_only` switch governs all saves.
gw.save_element(Aeq_training.parameters, element_key="dataset", version="training", filesystem=fs_only)
# NOTE(review): this entry uses element_key="training" while the other two use
# "dataset" — confirm this is intended.
gw.save_element(Aeq_training_unbiased.parameters, element_key="training", version="mitigated", filesystem=fs_only)
gw.save_element(Aeq_test.parameters, element_key="dataset", version="test", filesystem=fs_only)
# Show the test object.
Aeq_test.display()