# Measuring and characterizing fairness as a notion of inclusiveness.

Certain Machine Learning models perform classification tasks whose labels are subjective, which means that different users of the models might judge the label of the same sample differently depending on their personal experience.

The predictions of the models might contain biases towards certain types of judgements which are more common than others and consequently easier to learn, and ignore other judgements. These biases might already be contained in the training dataset or generated by the classification model. 

However, for the predictions to be fair towards each user of the model, they should be inclusive of all the different judgements, and possibly be tuned to each user.

In this tutorial we teach:
- how to use metrics to measure how fair the models are according to this notion of inclusiveness,
- and how to use various characterizations of the predictions to understand where the unfairness might come from.

The tutorial is based on the example use-case of a Machine Learning model to classify the toxicity of a sentence (see image below).
We train a classifier (Logistic Regression) using the toxicity dataset (sentences and toxicity labels) to predict sentence toxicity, and evaluate how fair the outputs of the process are based on the ground truth annotations provided by multiple judges (crowdsourcing annotators).

![title](images/overview_tutorial_fairness_inclusiveness.png)

# Import statements

### Load all necessary packages

In [1]:
import sys
sys.path.append("../")  

import os

from aif360.datasets import ToxicityDataset
from aif360.metrics import InclusivenessLabelDatasetMetric

from IPython.display import Markdown, display

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import sklearn.metrics as sk_met
from sklearn.base import TransformerMixin, BaseEstimator

import numpy as np
import pickle
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import os

### Example to load the full toxicity dataset

In [2]:
from aif360.datasets import BinaryLabelDataset
import copy
### The toxicity dataset (toxicity_annotations.tsv, toxicity_annotated_comments.tsv, toxicity_worker_demographics.tsv) must be downloaded from https://figshare.com/articles/Wikipedia_Talk_Labels_Toxicity/4563973
### and placed, as-is, in the folder "aif360/data/raw/toxicity" (the folder read by the loading cell below).
# Example on how to load the full dataset.
#tox_dataset = ToxicityDataset()

## Train a model

### Load training and evaluation data

In [3]:
from nltk.tokenize import sent_tokenize, word_tokenize, regexp, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

def clean_data(annotations, worker_demo, comments):
    """Attach worker demographics and comment text to every annotation.

    Args:
        annotations: DataFrame with one row per annotation. It must have a
            'worker_id' column and a named index that becomes the 'rev_id'
            column once the index is reset.
        worker_demo: DataFrame with a 'worker_id' column and the columns
            'gender', 'english_first_language', 'age_group' and 'education'.
        comments: DataFrame whose index resets to 'rev_id' and which has a
            'comment' column.

    Returns:
        A new DataFrame (the inputs are not modified) indexed by 'index',
        holding the annotation columns, 'rev_id', the demographic columns,
        'general_split' (1 for rows with at least one unknown demographic
        value, NaN otherwise), 'pop_label' ("<gender> <age_group>
        <education>") and 'comment'.
    """
    demographic_cols = ['english_first_language', 'gender', 'age_group', 'education']

    # Missing demographics are encoded as the literal string 'nan' so that
    # they can be treated as a category of their own further down.
    # (np.nan is used because the np.NaN alias no longer exists in NumPy 2.)
    worker_demo = worker_demo.replace(np.nan, 'nan')

    #### Add all the information to the annotations.
    # Add the worker demographics; workers without a demographics row get NaN.
    index_names = list(annotations.index.names)
    annotations = (annotations.reset_index()
                   .merge(worker_demo, on='worker_id', how='left')
                   .set_index(index_names))
    annotations = annotations.replace(np.nan, 'nan')

    # Flag the annotations with at least one unknown demographic value so
    # that they can be put in a general test set.
    has_unknown = annotations[demographic_cols].eq('nan').any(axis=1)
    annotations.loc[has_unknown, 'general_split'] = 1  # 'test'
    annotations = annotations.reset_index()

    # Assign the result back instead of calling replace(inplace=True) on the
    # selected column: the in-place form silently does nothing under pandas
    # Copy-on-Write.
    annotations['english_first_language'] = annotations['english_first_language'].replace('nan', 2)
    annotations['pop_label'] = (annotations['gender'].astype(str) + ' '
                                + annotations['age_group'].astype(str) + ' '
                                + annotations['education'].astype(str))

    # Add the comments in order to train / test the ML models.
    annotations = (annotations.reset_index()
                   .merge(comments.loc[:, ['comment']].reset_index(), on='rev_id', how='left')
                   .set_index('index'))
    return annotations
    
from functools import lru_cache


@lru_cache(maxsize=None)
def _word_tokenizer():
    """Return the shared tokenizer that keeps only runs of word characters."""
    return RegexpTokenizer(r'\w+')


@lru_cache(maxsize=None)
def _english_stopwords():
    """Return the NLTK English stop words as a set, loaded only once."""
    return frozenset(stopwords.words('english'))


def normalize_text(text):
    """Lower-case `text`, strip punctuation and remove English stop words.

    The text is lower-cased, '<br />' breaks are replaced by spaces, the
    result is split into `\\w+` tokens (which discards all punctuation and
    symbols) and the tokens that are not English stop words are joined back
    with single spaces. No stemming is applied.

    Args:
        text: the raw string to normalize.

    Returns:
        The normalized string.
    """
    # Replace breaks with spaces so that the tag name "br" does not end up
    # as a token.
    norm_text = text.lower().replace('<br />', ' ')
    # Padding punctuation with spaces is unnecessary: the `\w+` tokenizer
    # never returns punctuation, padded or not.
    tokens = _word_tokenizer().tokenize(norm_text)
    # The tokenizer and the stop-word set are cached because this function
    # is applied to every comment of the dataset.
    stopword_set = _english_stopwords()
    return " ".join(w for w in tokens if w not in stopword_set)

    
def clean_comments(comments):
    """Normalize the 'comment' column of `comments` in place and return it.

    Each comment has its "NEWLINE_TOKEN" and "TAB_TOKEN" markers replaced by
    a space and is then passed through `normalize_text`.
    """
    def _clean(raw_comment):
        without_markers = raw_comment.replace("NEWLINE_TOKEN", " ").replace("TAB_TOKEN", " ")
        return normalize_text(without_markers)

    comments['comment'] = comments['comment'].apply(_clean)
    return comments

### Read the documents  

In [4]:
# Folder expected to contain the three toxicity TSV files. "__file__" is a
# plain string here (a notebook defines no __file__ variable), so the path is
# resolved relative to the current working directory.
filepath = os.path.join(os.path.dirname(os.path.abspath("__file__")), '..', 'aif360', 'data', 'raw', 'toxicity')
try:
    print("Load the dataset.")
    print(filepath)
    comments = pd.read_csv(os.path.join(filepath, 'toxicity_annotated_comments.tsv'), sep = '\t', dtype = {'rev_id':int, 'comment':str}, index_col = 0)
    annotations = pd.read_csv(os.path.join(filepath, 'toxicity_annotations.tsv'),  sep = '\t', index_col = 0)
    worker_demo = pd.read_csv(os.path.join(filepath, 'toxicity_worker_demographics.tsv'), sep='\t')
except IOError as err:
    print("IOError: {}".format(err))
    print("To use this class, please download the following files from https://figshare.com/articles/Wikipedia_Talk_Labels_Toxicity/4563973:")
    print("\n\ttoxicity_annotated_comments.tsv")
    print("\ttoxicity_annotations.tsv")
    print("\ttoxicity_worker_demographics.tsv")
    print("\nand place them, as-is, in the folder:")
    # Report the folder that was actually read. Referencing the undefined
    # __file__ variable here would raise a NameError and hide the real error.
    print("\n\t{}\n".format(os.path.abspath(filepath)))
    sys.exit(1)

Load the dataset.
C:\Users\AgatheBalayn\Documents\thesis_related\AIF360\examples\..\aif360\data\raw\toxicity


  mask |= (ar1 == a)


##### Clean the dataset

In [5]:
### Optional speed-up: keep only the first n_lim annotations (0 keeps the whole dataset).
n_lim = 0
comments = clean_comments(comments)
annotations = annotations.iloc[0:n_lim] if n_lim > 0 else annotations
# Attach the worker demographics and the comment text to every annotation.
annotations = clean_data(annotations, worker_demo, comments)


# Ground-truth (majority vote) label of a comment: 1 when the mean of its
# 'toxicity' votes is at least 0.5, else 0.
def _majority_vote(votes):
    return (votes.mean() >= 0.5).astype(int)


annotations['MV'] = annotations.groupby(['rev_id'])['toxicity'].transform(_majority_vote)

### Create the training and test sets

In [6]:
### Build the per-comment (one row per 'rev_id') data used to train and test the ML models.
def prepare_aggregated_data(comment_):
    """Return the 'comment' and 'MV' columns of the first row of each 'rev_id'.

    The input frame is left untouched.
    """
    is_first_occurrence = ~comment_.duplicated('rev_id')
    unique_comments = comment_.loc[is_first_occurrence]
    return unique_comments.loc[:, ['comment', 'MV']]

In [7]:
from sklearn.model_selection import train_test_split
# Training and test sets with the individual annotations as ground truth (70% / 30%).
# NOTE(review): the split is made on annotation rows, so annotations of the same
# comment ('rev_id') can land in both sets; no random_state is given, so the split
# differs between runs.
annotations_train, annotations_test = train_test_split(annotations, test_size=0.3)
# To ensure they are copies and not views of annotations:
annotations_train = annotations_train.copy()
annotations_test = annotations_test.copy()
# Training and test sets with the majority vote as ground truth: one row per comment,
# deduplicated separately inside each split.
comments_train = prepare_aggregated_data(annotations_train.copy())
comments_test = prepare_aggregated_data(annotations_test.copy())

### Train the ML models

In [8]:
# Pipeline helper used by the models below.
class DataFrameColumnExtracter_doc(BaseEstimator, TransformerMixin):
    """Pipeline step that extracts one DataFrame column as an array of unicode strings."""

    def __init__(self, column):
        # Name of the column to extract in `transform`.
        self.column = column

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return the values of `X[self.column]` cast to a unicode array."""
        selected = X[self.column]
        return selected.values.astype('U')

##### The ML model is trained on the majority vote labels

In [9]:
# Build the model: TF-IDF weighted character n-gram features followed by a Logistic Regression.
clf_LR = Pipeline([# Sentences.
                  ('sentences_features', Pipeline([
                      # Extract the 'comment' column as an array of unicode strings.
                      ('sentence_extractor', DataFrameColumnExtracter_doc('comment')),#.values.astype('U'),
                    # Character n-grams of length 1 to 5, limited to 1500 features.
                    ('vect', CountVectorizer(max_features = 1500, ngram_range = (1,5), analyzer = 'char')),
                     ('tf', TfidfTransformer(norm = 'l2'))
                  ])),
            # Classifier.
            ('clf', LogisticRegression(solver='liblinear'))#C=LR_C, tol=LR_C_tol))
        ])
## 1) Perform grid search over the parameters of the model.

# Parameters of the grid search (regularisation strength C and stopping tolerance of the classifier step):
tuned_parameters = {'clf__C': [1e-4, 1e-2, 1, 10], 'clf__tol': [1, 1e-2, 1e-4]} 

# Initialize the grid search (5-fold cross-validation). It is only fitted in the
# commented-out section below.
clf = GridSearchCV(clf_LR, tuned_parameters, cv=5, verbose=0)

# Helpers to persist and reload the results of the grid search.
def save_obj(obj, name ):
    """Pickle `obj` into the file '<name>.pkl' with the highest pickle protocol."""
    with open(name + '.pkl', 'wb') as handle:
        handle.write(pickle.dumps(obj, pickle.HIGHEST_PROTOCOL))


def load_obj(name ):
    """Return the object pickled in the file '<name>.pkl'.

    NOTE: unpickling executes arbitrary code; only load files you created.
    """
    with open(name + '.pkl', 'rb') as handle:
        return pickle.loads(handle.read())
    
# Train on the first nb_data comments only, to train the model faster (this is used for quick testing).
nb_data = 10000

# Train the grid search.
# Comment this section out once it has been run and the parameters were saved.
# =============
#best_model = clf.fit(comments_train[0:nb_data], comments_train['MV'][0:nb_data])
#best_parameters = best_model.best_params_  
#print(best_parameters)  
#best_result = best_model.best_score_  
#print(best_result)  
#save_obj(best_parameters, 'best_param_LR_aggregated')
# =============


## 2) Train the final model.
# NOTE: this requires the file 'best_param_LR_aggregated.pkl' written by the
# (commented-out) grid search section above; run that section once if it is missing.
best_parameters = load_obj('best_param_LR_aggregated')
clf_LR.set_params(**best_parameters)
# Fit on the first nb_data comments, with the majority-vote column 'MV' as target.
clf_LR.fit(comments_train.iloc[0:nb_data], comments_train.iloc[0:nb_data, comments_train.columns.get_loc('MV')])


## 3) Evaluate general performance.
#train_pred = clf_LR.predict(comments_train.iloc[0:nb_data])
#test_pred = clf_LR.predict(comments_test.iloc[0:nb_data])
#print("Training accuracy: ", sk_met.accuracy_score(comments_train.iloc[0:nb_data, comments_train.columns.get_loc('MV')], train_pred))
#print("Test accuracy: ", sk_met.accuracy_score(comments_test.iloc[0:nb_data, comments_test.columns.get_loc('MV')], test_pred))

#C_train = sk_met.confusion_matrix(comments_train.iloc[0:nb_data, comments_train.columns.get_loc('MV')], train_pred)
#C_train = C_train / C_train.astype(np.float).sum(axis=0)
#C_test = sk_met.confusion_matrix(comments_test.iloc[0:nb_data, comments_test.columns.get_loc('MV')], test_pred)
#C_test = C_test / C_test.astype(np.float).sum(axis=0)
#print("Training confusion matrix:", C_train)
#print("Test confusion matrix:", C_test)

Pipeline(memory=None,
     steps=[('sentences_features', Pipeline(memory=None,
     steps=[('sentence_extractor', DataFrameColumnExtracter_doc(column='comment')), ('vect', CountVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lower...alty='l2', random_state=None, solver='liblinear',
          tol=0.01, verbose=0, warm_start=False))])

##### Train a second ML model on the annotation (not majority-vote) labels in order to compare the fairness measures later

In [10]:
# Build the second model: same pipeline as clf_LR, but trained on the individual annotations.
clf_LR_annotations = Pipeline([# Sentences.
                  ('sentences_features', Pipeline([
                      # Extract the 'comment' column as an array of unicode strings.
                      ('sentence_extractor', DataFrameColumnExtracter_doc('comment')),#.values.astype('U'),
                    # Character n-grams of length 1 to 5, limited to 1500 features.
                    ('vect', CountVectorizer(max_features = 1500, ngram_range = (1,5), analyzer = 'char')),
                     ('tf', TfidfTransformer(norm = 'l2'))
                  ])),
            # Classifier.
            ('clf', LogisticRegression(solver='liblinear'))#C=LR_C, tol=LR_C_tol))
        ])

## 1) Perform grid search over the parameters of the model.

# Parameters of the grid search (regularisation strength C and stopping tolerance of the classifier step):
tuned_parameters = {'clf__C': [1e-4, 1e-2, 1, 10], 'clf__tol': [1, 1e-2, 1e-4]} 

# Initialize the grid search (5-fold cross-validation). It is only fitted in the
# commented-out section below.
clf_annotations = GridSearchCV(clf_LR_annotations, tuned_parameters, cv=5, verbose=0)

# Train on the first nb_data annotations only, to train the model faster.
nb_data = 10000

# Train the grid search.
# Comment this section out once it has been run and the parameters were saved.
# =============
#best_model = clf_annotations.fit(annotations_train[0:nb_data], annotations_train['toxicity'][0:nb_data])
#best_parameters = best_model.best_params_  
#print(best_parameters)  
#best_result = best_model.best_score_  
#print(best_result)  
#save_obj(best_parameters, 'best_param_LR_annotations')
# =============


## 2) Train the final model.
# NOTE: this requires the file 'best_param_LR_annotations.pkl' written by the
# (commented-out) grid search section above; run that section once if it is missing.
best_parameters = load_obj('best_param_LR_annotations')
clf_LR_annotations.set_params(**best_parameters)
# Fit on the first nb_data annotations, with each annotator's own 'toxicity' vote as target.
clf_LR_annotations.fit(annotations_train.iloc[0:nb_data], annotations_train.iloc[0:nb_data, annotations_train.columns.get_loc('toxicity')])


## 3) Evaluate general performance.
#train_pred = clf_LR_annotations.predict(annotations_train.iloc[0:nb_data])
#test_pred = clf_LR_annotations.predict(annotations_test.iloc[0:nb_data])
#print("Training accuracy: ", sk_met.accuracy_score(annotations_train.iloc[0:nb_data, annotations_train.columns.get_loc('toxicity')], train_pred))
#print("Test accuracy: ", sk_met.accuracy_score(annotations_test.iloc[0:nb_data, annotations_test.columns.get_loc('toxicity')], test_pred))

#C_train = sk_met.confusion_matrix(annotations_train.iloc[0:nb_data, annotations_train.columns.get_loc('toxicity')], train_pred)
#C_train = C_train / C_train.astype(np.float).sum(axis=0)
#C_test = sk_met.confusion_matrix(annotations_test.iloc[0:nb_data, annotations_test.columns.get_loc('toxicity')], test_pred)
#C_test = C_test / C_test.astype(np.float).sum(axis=0)
#print("Training confusion matrix:", C_train)
#print("Test confusion matrix:", C_test)

Pipeline(memory=None,
     steps=[('sentences_features', Pipeline(memory=None,
     steps=[('sentence_extractor', DataFrameColumnExtracter_doc(column='comment')), ('vect', CountVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lower...alty='l2', random_state=None, solver='liblinear',
          tol=0.01, verbose=0, warm_start=False))])

In [11]:
# Free memory: the raw frames and the per-comment training/test sets are not used
# after this point (annotations_train and annotations_test are kept).
del annotations, comments, worker_demo, comments_train, comments_test #, train_pred, test_pred

## Compute the fairness performance

In [12]:
from aif360.datasets import SubjectivityDataset

def default_preprocessing(df):
    """Return `df` unchanged; default `custom_preprocessing` hook of `subjectivity_dataset_wrapper`."""
    return df

# Wrapper to load the datasets to compute the fairness on.
def subjectivity_dataset_wrapper(annotations, label_name,
                 protected_attribute_names=['gender', 'english_first_language', 'age_group', 'education', 'rev_id', 'worker_id', 'pop_label'], privileged_classes=None,
                 instance_weights_name=None, categorical_features=[],
                 features_to_keep=[], features_to_drop=[], na_values=[],
                 custom_preprocessing=default_preprocessing, 
                metadata={'label_maps': [{1.0: 'Toxic', 0.0: 'Non-toxic'}],},
                mapping_categorical_protected=(('gender',('female','male', 'other', 'nan')), ('age_group',('Under 18', '18-30', '30-45', '45-60', 'Over 60', 'nan')), ('education',('none', 'hs', 'some', 'bachelors', 'masters', 'professional', 'doctorate', 'nan')))):
    """Build a SubjectivityDataset from an annotation table.

    Args:
        annotations: DataFrame with one row per annotation. It must contain
            'rev_id', 'worker_id', 'toxicity_score', 'gender',
            'english_first_language', 'age_group', 'education', the column
            `label_name`, and 'toxicity' (the annotator's own label, copied
            to 'GT'). It is not modified.
        label_name: column holding the label to evaluate. If it is not
            'toxicity', that column is renamed to 'toxicity' and replaces
            the original one; otherwise a 'pred_1' column, if any, is dropped.
        mapping_categorical_protected: tuples of the form
            (column, categories); every category is replaced by its position
            in `categories`.
        The remaining arguments are passed unchanged to SubjectivityDataset.
        Their mutable defaults are never modified by this function.

    Returns:
        (dataset, annotations): the SubjectivityDataset and the numeric
        DataFrame it was built from.
    """
    # Work on a new frame so that the caller's DataFrame is left untouched.
    annotations = annotations.drop(columns=['comment', 'general_split'], errors='ignore')

    # Create the ground truth data: the annotator's own judgement.
    if 'toxicity' in annotations.columns:
        annotations['GT'] = annotations['toxicity'].copy()
    if label_name != 'toxicity':
        # The requested label column takes the place of the 'toxicity' column.
        annotations = annotations.drop(columns=['toxicity'], errors='ignore')
        annotations = annotations.rename(columns={label_name: 'toxicity'})
        label_name = 'toxicity'
    else:
        annotations = annotations.drop(columns=['pred_1'], errors='ignore')
    annotations = annotations.astype({"toxicity": float})

    # Make the categorical data numbers: each category becomes its position.
    for column, *category_groups in mapping_categorical_protected:
        for categories in category_groups:
            codes = {}
            for code, category in enumerate(categories):
                codes.setdefault(category, code)
            if codes:
                # Assign the result back instead of calling
                # replace(inplace=True) on the selected column, which silently
                # does nothing under pandas Copy-on-Write.
                annotations[column] = annotations[column].replace(codes)

    # Population label: the three category codes concatenated and read as an
    # integer (leading zeros are therefore dropped).
    annotations['pop_label'] = (annotations['gender'].astype(str)
                                + annotations['age_group'].astype(str)
                                + annotations['education'].astype(str)).astype(int)
    annotations = annotations.loc[:, ['rev_id', 'worker_id', 'toxicity', 'toxicity_score', 'gender', 'english_first_language', 'age_group', 'education', 'pop_label', 'GT']]#, 'MV']]

    dataset = SubjectivityDataset(annotations, label_name, 'GT',
                 protected_attribute_names=protected_attribute_names, privileged_classes=privileged_classes,
                 instance_weights_name=instance_weights_name, categorical_features=categorical_features,
                 features_to_keep=features_to_keep, features_to_drop=features_to_drop, na_values=na_values,
                 custom_preprocessing=custom_preprocessing, metadata=metadata)
    return dataset, annotations


# The dataset might be too large for the classifier to process all the data at once. It is therefore split into batches to get the predictions on all the data.
def compute_pred_dataset(classifier, dataset_orig_train, prediction_col, nb_data):
    """Fill `prediction_col` of `dataset_orig_train` with predictions, batch by batch.

    Args:
        classifier: object with a `predict(frame)` method returning one
            prediction per row of `frame`.
        dataset_orig_train: DataFrame to predict on. It must already contain
            the column `prediction_col` and is modified in place.
        prediction_col: name of the column receiving the predictions.
        nb_data: maximum number of rows given to `predict` at once.

    Returns:
        `dataset_orig_train` itself, with `prediction_col` filled in.

    Raises:
        ValueError: if `nb_data` is not positive.
    """
    if nb_data <= 0:
        raise ValueError("nb_data must be a positive number of rows per batch.")
    prediction_position = dataset_orig_train.columns.get_loc(prediction_col)
    # Stepping through the start offsets also covers the last, shorter batch
    # and the case where the whole dataset is smaller than one batch.
    for low_interval in range(0, len(dataset_orig_train), nb_data):
        high_interval = low_interval + nb_data
        batch = dataset_orig_train.iloc[low_interval:high_interval]
        dataset_orig_train.iloc[low_interval:high_interval, prediction_position] = classifier.predict(batch)
    return dataset_orig_train

##### Create SubjectivityDataset datasets out of the training and test results and ground truth on the classifier trained on the MV

In [13]:
# Column that receives the classifier outputs; -1 marks "not predicted yet".
prediction_col = "pred_1"
annotations_train.loc[:, prediction_col] = -1
annotations_test.loc[:, prediction_col] = -1
nb_data = 5000 # Number of rows passed to the classifier at once by compute_pred_dataset (batch size).

# compute_pred_dataset fills annotations_test in place and returns that same frame.
annotations_test_pred = compute_pred_dataset(clf_LR, annotations_test, prediction_col, nb_data)
# Ground-truth dataset: the label is each annotator's own 'toxicity' judgement.
subjectivity_dataset_test_GT, annotations_subjectivity_dataset_test_GT = subjectivity_dataset_wrapper(annotations_test.copy(), 'toxicity')
# Output dataset: the label is the prediction of the classifier trained on the majority vote.
subjectivity_dataset_test_outputs, annotations_subjectivity_dataset_test_outputs = subjectivity_dataset_wrapper(annotations_test_pred.copy(), prediction_col)

# Instantiate the fairness metric class.
test_metric_inclusion = InclusivenessLabelDatasetMetric(subjectivity_dataset_test_GT, subjectivity_dataset_test_outputs)

MemoryError: 

##### Create SubjectivityDataset datasets out of the training and test results and ground truth on the classifier trained on the annotations

In [None]:
# Overwrite the prediction column with the outputs of the classifier trained on the individual annotations.
annotations_test_pred_2 = compute_pred_dataset(clf_LR_annotations, annotations_test, prediction_col, nb_data)
# Build the output dataset from this classifier's own result (annotations_test_pred_2).
# Using annotations_test_pred here only worked because compute_pred_dataset returns
# the very frame it fills in place.
subjectivity_dataset_test_outputs_2, annotations_subjectivity_dataset_test_outputs_2 = subjectivity_dataset_wrapper(annotations_test_pred_2.copy(), prediction_col)

# Instantiate the fairness metric class.
test_metric_inclusion_2 = InclusivenessLabelDatasetMetric(subjectivity_dataset_test_GT, subjectivity_dataset_test_outputs_2)

In [None]:
# Free some memory: only the two metric objects (test_metric_inclusion and
# test_metric_inclusion_2) are used from here on.
del clf_LR, annotations_test, annotations_test_pred, subjectivity_dataset_test_outputs, annotations_subjectivity_dataset_test_outputs
del clf_LR_annotations, annotations_test_pred_2, subjectivity_dataset_test_outputs_2, annotations_subjectivity_dataset_test_outputs_2
del clf

### Examples of the computation of different metrics

##### For the classifier trained on the majority vote.

In [None]:
# Accuracy and TPR / TNR / FPR / FNR of the majority-vote classifier for the
# 'annotator_disagreement' criterion, with the default binning and no filtering.
results_test_annotator_acc = test_metric_inclusion.average_accuracy('annotator_disagreement', number_bins=None, filtering_value=None)
results_test_annotator_TPR = test_metric_inclusion.true_positive_rate('annotator_disagreement', number_bins=None, filtering_value=None)
results_test_annotator_TNR = test_metric_inclusion.true_negative_rate('annotator_disagreement', number_bins=None, filtering_value=None)
results_test_annotator_FPR = test_metric_inclusion.false_positive_rate('annotator_disagreement', number_bins=None, filtering_value=None)
results_test_annotator_FNR = test_metric_inclusion.false_negative_rate('annotator_disagreement', number_bins=None, filtering_value=None)

In [None]:
# Same five measures for the 'annotation_popularity' criterion.
results_test_annotation_acc = test_metric_inclusion.average_accuracy('annotation_popularity', number_bins=None, filtering_value=None)
results_test_annotation_TPR = test_metric_inclusion.true_positive_rate('annotation_popularity', number_bins=None, filtering_value=None)
results_test_annotation_TNR = test_metric_inclusion.true_negative_rate('annotation_popularity', number_bins=None, filtering_value=None)
results_test_annotation_FPR = test_metric_inclusion.false_positive_rate('annotation_popularity', number_bins=None, filtering_value=None)
results_test_annotation_FNR = test_metric_inclusion.false_negative_rate('annotation_popularity', number_bins=None, filtering_value=None)

In [None]:
# Same five measures for the 'demography' criterion.
results_test_demography_acc = test_metric_inclusion.average_accuracy('demography', number_bins=None, filtering_value=None)
results_test_demography_TPR = test_metric_inclusion.true_positive_rate('demography', number_bins=None, filtering_value=None)
results_test_demography_TNR = test_metric_inclusion.true_negative_rate('demography', number_bins=None, filtering_value=None)
results_test_demography_FPR = test_metric_inclusion.false_positive_rate('demography', number_bins=None, filtering_value=None)
results_test_demography_FNR = test_metric_inclusion.false_negative_rate('demography', number_bins=None, filtering_value=None)

In [None]:
# Same five measures for the 'sample_ambiguity' criterion.
results_test_sample_acc = test_metric_inclusion.average_accuracy('sample_ambiguity', number_bins=None, filtering_value=None)
results_test_sample_TNR = test_metric_inclusion.true_negative_rate('sample_ambiguity', number_bins=None, filtering_value=None)
results_test_sample_TPR = test_metric_inclusion.true_positive_rate('sample_ambiguity', number_bins=None, filtering_value=None)
results_test_sample_FPR = test_metric_inclusion.false_positive_rate('sample_ambiguity', number_bins=None, filtering_value=None)
results_test_sample_FNR = test_metric_inclusion.false_negative_rate('sample_ambiguity', number_bins=None, filtering_value=None)

In [None]:
# Print the accuracy-based results of the majority-vote classifier, one criterion at a time.
print("Results computed on accuracy:")
for heading, accuracy_result in (
        ("Results: fairness on annotator", results_test_annotator_acc),
        ("Results: fairness on samples", results_test_sample_acc),
        ("Results: fairness on annotations", results_test_annotation_acc),
        ("Results: fairness on demographic categories", results_test_demography_acc),
):
    print(heading)
    print("Classifier with majority vote: ", accuracy_result)

In [None]:
# Method to transform the age, gender, education ordinal categories into human-readable categories (for later visualization).
def translate_demography_index(data):
    """Rename the numeric demography codes in the index of `data` to readable labels.

    An index value is the integer made of three digits (gender, age group,
    education), whose leading zeros are lost: 13 stands for gender 0, age
    group 1, education 3. It is renamed to "<gender> <age group> <education>",
    e.g. "female 18-30 bachelors". A digit without a known category yields an
    empty string for that part.

    Args:
        data: DataFrame whose index holds the numeric codes. Modified in place.

    Returns:
        `data` itself, with the renamed index.
    """
    # Category order matches the encoding used to build the codes (position == digit).
    genders = ('female', 'male', 'other', 'nan')
    age_groups = ('Under 18', '18-30', '30-45', '45-60', 'Over 60', 'nan')
    educations = ('none', 'hs', 'some', 'bachelors', 'masters', 'professional', 'doctorate', 'nan')

    def _category(categories, digit):
        """Return the category at position `digit`, or '' if there is none."""
        return categories[digit] if 0 <= digit < len(categories) else ''

    translations = {}
    for index in data.index:
        # Decode arithmetically rather than by string position, so that the
        # dropped leading zeros cannot shift the digits.
        gender_digit, remainder = divmod(int(index), 100)
        age_digit, education_digit = divmod(remainder, 10)
        translations[index] = ' '.join((
            _category(genders, gender_digit),
            _category(age_groups, age_digit),
            _category(educations, education_digit),
        ))
    # Rename once, after the loop, instead of mutating the index while iterating.
    data.rename(index=translations, inplace=True)
    return data

In [None]:
# Replace the numeric demography codes in the index of the table stored at
# position [2] of the accuracy result by human-readable labels (done in place).
translate_demography_index(results_test_demography_acc[2])

### Examples of unfairness visualizations

In [None]:
def annotation_plot_color(value):
    """Return the text colour for a heatmap cell: 'b' above 0.5, otherwise 'w'."""
    return 'b' if value > 0.5 else 'w'

def plot_fairness(data, metric, bin_name='', title_name=''):
    """Plot one column of a per-bin result table as a single-column heatmap.

    Args:
        data: DataFrame with one row per bin; its index values are used as
            y tick labels.
        metric: name of the column of `data` to plot; also used as the
            colour bar label.
        bin_name: label of the y axis.
        title_name: title of the plot; no title is set when it is ''.
    """
    yticks = list(data.index.values)

    # Plot the bins. DataFrame.as_matrix no longer exists in pandas >= 1.0;
    # selecting the column as a one-column frame gives the same 2-D array.
    values = data[[metric]].values
    fig, ax = plt.subplots()
    heatmap = ax.pcolor(values)
    cbar = fig.colorbar(heatmap, ax=ax)
    cbar.set_label(metric, rotation=90)

    ax.set_yticks(np.arange(values.shape[0]) + 0.5, minor=False)
    ax.set_yticklabels(yticks,rotation=0)
    ax.set_ylabel(bin_name)
    # Booleans are required here: the string 'off' is truthy and would turn
    # the ticks on in current matplotlib versions.
    ax.tick_params(left=False, bottom=False, labelbottom=False, color='grey', labelsize='small')

    # Add the exact evaluation measure per bin
    for row, value in enumerate(values[:, 0]):
        ax.text(0.5, row + 0.5, np.round(value, 2), ha="center", va="center", color=annotation_plot_color(value))
    if title_name != '':
        ax.set_title(title_name)
    fig.set_figwidth(4)
    # plt.show() also works with non-GUI backends such as the notebook inline
    # backend, where fig.show() only emits a warning.
    plt.show()

In [None]:
# Example visualisation of the results: accuracy of the classifier trained on the majority vote, per annotator-disagreement bin.
plot_fairness(results_test_annotator_acc[2], 'accuracy', 'annotator disagreement rate', 'Performance of the classifier trained on the MV \n based on annotator bins.')

### Results with the classifier trained on the annotations

In [None]:
#del test_metric_inclusion

# Measures for the classifier trained on the individual annotations
# (test_metric_inclusion_2), over the same four criteria as above.
results_test_annotator_2_acc = test_metric_inclusion_2.average_accuracy('annotator_disagreement', number_bins=None, filtering_value=None)
results_test_annotation_2_acc = test_metric_inclusion_2.average_accuracy('annotation_popularity', number_bins=None, filtering_value=None)
results_test_annotation_2_TPR = test_metric_inclusion_2.true_positive_rate('annotation_popularity', number_bins=None, filtering_value=None)
results_test_annotation_2_TNR = test_metric_inclusion_2.true_negative_rate('annotation_popularity', number_bins=None, filtering_value=None)
results_test_annotation_2_FPR = test_metric_inclusion_2.false_positive_rate('annotation_popularity', number_bins=None, filtering_value=None)
results_test_annotation_2_FNR = test_metric_inclusion_2.false_negative_rate('annotation_popularity', number_bins=None, filtering_value=None)
results_test_demography_2_acc = test_metric_inclusion_2.average_accuracy('demography', number_bins=None, filtering_value=None)
results_test_demography_2_TPR = test_metric_inclusion_2.true_positive_rate('demography', number_bins=None, filtering_value=None)
results_test_sample_2_acc = test_metric_inclusion_2.average_accuracy('sample_ambiguity', number_bins=None, filtering_value=None)

In [None]:
# Print the accuracy-based results of the classifier trained on the individual
# annotations. The label names that classifier; the "majority vote" label
# printed before was a copy-paste leftover.
print("Results computed on accuracy:")
for heading, accuracy_result in (
        ("Results: fairness on annotator", results_test_annotator_2_acc),
        ("Results: fairness on samples", results_test_sample_2_acc),
        ("Results: fairness on annotations", results_test_annotation_2_acc),
        ("Results: fairness on demographic categories", results_test_demography_2_acc),
):
    print(heading)
    print("Classifier with annotations: ", accuracy_result)

In [None]:
plot_fairness(results_test_annotator_2_acc[2], 'accuracy', 'annotator disagreement rate', 'Performance of the classifier trained on the annotations \n based on annotator bins.')
# We observe that, although the second classifier performs less well for the annotators who often agree with the majority (bin [0.001; 0.2]), it performs better for the annotators who disagree a lot with the majority vote (bin [0.8; 1.0]), which makes it globally fairer.

### Comparison of the 2 classifiers

In [None]:
# Print the accuracy-based results of both classifiers side by side, one criterion at a time.
print("Compare results computed on accuracy:")
for heading, majority_vote_result, annotations_result in (
        ("Results: fairness on annotator", results_test_annotator_acc, results_test_annotator_2_acc),
        ("Results: fairness on samples", results_test_sample_acc, results_test_sample_2_acc),
        ("Results: fairness on annotations", results_test_annotation_acc, results_test_annotation_2_acc),
        ("Results: fairness on demographic categories", results_test_demography_acc, results_test_demography_2_acc),
):
    print(heading)
    print("Classifier with majority vote: ", majority_vote_result, " Classifier with annotations: ", annotations_result)