In [1]:
import argparse
import errno
import os
import pickle
import sys

import numpy as np
import pandas as pd
from pandas.io.json import json_normalize

from readWriteRankings.readAndWriteRankings import loadPickleFromDisk
from utilsAndConstants.constants import ESSENTIALLY_ZERO
from utilsAndConstants.utils import setMemoryLimit
from dataset_creator import *
from main import rankAndDump
# from src.dataset_creator import *
# from src.evaluator import Evaluator
# from src.evaluator.failProbabilityYangStoyanovich import determineFailProbOfGroupFairnessTesterForStoyanovichRanking
# from post_processing_methods import fair_ranker
# from src.post_processing_methods.fair_ranker.create import fairRanking, feldmanRanking

In [17]:
def convertFormat(inpath, outpath, filename):
    """
    converts a pickled candidate ranking into a pickled pandas DataFrame

    the ranking is read from ``inpath + filename`` with ``loadPickleFromDisk`` and the resulting
    DataFrame is written to ``outpath + filename``; the directory part of the target path is
    created if it does not exist yet

    Parameters:
    ----------
    inpath : str
        prefix of the source path; it is concatenated with ``filename`` as-is, so it has to end
        with a path separator if it names a directory

    outpath : str
        prefix of the target path; concatenated with ``filename`` in the same way

    filename : str
        name of the pickle file, used for both the source and the target

    Return:
    ------
    None; the DataFrame is written to disk with one row per candidate and the columns
    ``y_pred`` (candidate.qualification), ``y`` (candidate.originalQualification) and
    ``g`` (candidate.isProtected)
    """
    target = outpath + filename
    targetDir = os.path.dirname(target)

    # exist_ok=True makes the creation safe against a concurrent creator without an errno check;
    # an empty directory part means "current directory", and os.makedirs('') would raise
    if targetDir:
        os.makedirs(targetDir, exist_ok=True)

    # NOTE(review): loadPickleFromDisk presumably unpickles the file -- only feed it ranking dumps
    # from a trusted source
    candidates = loadPickleFromDisk(inpath + filename)
    rows = [[candidate.qualification, candidate.originalQualification, candidate.isProtected]
            for candidate in candidates]

    df = pd.DataFrame(rows, columns=['y_pred', 'y', 'g'])
    df.to_pickle(target)

In [9]:
def fairRanking(k, protectedCandidates, nonProtectedCandidates, minProp, alpha):
    """
    builds a ranking of (at most) k candidates that passes the test of
    :class:'FairnessInRankingsTester'

    at every position a protected candidate is forced whenever fewer protected candidates have
    been placed so far than ``candidates_needed`` of the tester states for that position;
    otherwise the better qualified head of the two lists is taken (ties go to the protected
    candidate). Once one of the lists is used up, the remaining positions are filled from the
    other list

    Parameters:
    ----------
    k : int
        the expected length of the ranking

    protectedCandidates : [Candidates]
        protected candidates, assumed to be sorted by qualification in descending order

    nonProtectedCandidates : [Candidates]
        non-protected candidates, assumed to be sorted by qualification in descending order

    minProp : float
        minimal proportion of protected candidates to appear in the fair ranking result

    alpha : float
        significance level handed to the fairness tester

    Return:
    ------
    a tuple ``(ranking, leftOvers)``

    ranking is the list of selected candidates; it is shorter than k if both input lists run
    out of candidates first

    leftOvers are the candidates that were not selected, merged by ``__mergeTwoRankings``
    (empty list if both input lists were used up)
    """
    tester = FairnessInRankingsTester(minProp, alpha, k, correctedAlpha=True)

    numProtected = len(protectedCandidates)
    numNonProtected = len(nonProtectedCandidates)

    ranking = []
    # index of the next unused candidate in each list; since every protected pick advances
    # nextProtected by one, it is also the number of protected candidates placed so far
    nextProtected = 0
    nextNonProtected = 0

    for position in range(k):
        protectedLeft = nextProtected < numProtected
        nonProtectedLeft = nextNonProtected < numNonProtected

        if not (protectedLeft or nonProtectedLeft):
            # both lists are used up, the ranking stays shorter than k
            return ranking, []

        if not protectedLeft:
            # only non-protected candidates remain
            takeProtected = False
        elif not nonProtectedLeft:
            # only protected candidates remain
            takeProtected = True
        elif nextProtected < tester.candidates_needed[position]:
            # the fairness constraint demands a protected candidate here
            takeProtected = True
        else:
            # no constraint to satisfy: pick the better qualified head, protected wins ties
            bestProtected = protectedCandidates[nextProtected]
            bestNonProtected = nonProtectedCandidates[nextNonProtected]
            takeProtected = bestProtected.qualification >= bestNonProtected.qualification

        if takeProtected:
            ranking.append(protectedCandidates[nextProtected])
            nextProtected += 1
        else:
            ranking.append(nonProtectedCandidates[nextNonProtected])
            nextNonProtected += 1

    leftOvers = __mergeTwoRankings(protectedCandidates[nextProtected:],
                                   nonProtectedCandidates[nextNonProtected:])
    return ranking, leftOvers




In [10]:
def __mergeTwoRankings(ranking1, ranking2):
    """
    concatenates two candidate lists and returns them as one new list, ordered by
    ``originalQualification`` in descending order

    the sort is stable, so candidates with equal ``originalQualification`` keep the order in
    which they appear in ``ranking1 + ranking2``; the input lists are not modified
    """
    return sorted(ranking1 + ranking2,
                  key=lambda candidate: candidate.originalQualification,
                  reverse=True)

In [18]:
# German Credit
# use prefix = full list
k = 200

# (p, alpha) pairs handed to rankAndDump -- presumably the minimal protected proportion and the
# significance level used for each fair ranking; confirm against rankAndDump
pairsOfPAndAlpha = [
    (0.1, 0.1),  # no real results, skip in evaluation
    (0.2, 0.1),  # no real results, skip in evaluation
    (0.3, 0.0220),
    (0.4, 0.0222),
    (0.5, 0.0207),
    (0.6, 0.0209),
    (0.7, 0.0216),
    (0.8, 0.0216),
    (0.9, 0.0256),
]

# split the data set into the two candidate lists, with "younger25" as protected attribute
protectedGermanCreditAge25, nonProtectedGermanCreditAge25 = germanCreditData.create(
    "../rawData/GermanCredit/GermanCredit_age25.csv",
    "DurationMonth",
    "CreditAmount",
    "score",
    "age25",
    protectedAttribute=["younger25"]
)

# create the rankings and dump them below the given results directory
rankAndDump(
    protectedGermanCreditAge25,
    nonProtectedGermanCreditAge25,
    k,
    "GermanCreditAge25",
    "../results/rankingDumps/German Credit/Age25",
    pairsOfPAndAlpha
)

# #Compas
# k = 6890
# pairsOfPAndAlpha = [(0.1, 0.0140),
#                         (0.2, 0.0115),
#                         (0.3, 0.0103),
#                         (0.4, 0.0099),
#                         (0.5, 0.0096),
#                         (0.6, 0.0093),
#                         (0.7, 0.0094),
#                         (0.8, 0.0095),
#                         (0.9, 0.0100)]

# protectedCompasRace, nonProtectedCompasRace = compasData.createRace(
#        "../rawData/COMPAS/ProPublica_race.csv", "race", "Violence_rawscore", "Recidivism_rawscore",
#        "priors_count")
# rankAndDump(protectedCompasRace, nonProtectedCompasRace, k, "CompasRace",
#                        "../results/rankingDumps/Compas/Race", pairsOfPAndAlpha)

# protectedCompasGender, nonProtectedCompasGender = compasData.createGender(
#        "../rawData/COMPAS/ProPublica_sex.csv", "sex", "Violence_rawscore", "Recidivism_rawscore",
#        "priors_count")
# rankAndDump(protectedCompasGender, nonProtectedCompasGender, k, "CompasGender",
#                       "../results/rankingDumps/Compas/Gender", pairsOfPAndAlpha)

create rankings of GermanCreditAge25
colorblind ranking [Done]
fair rankings [Done]
feldman ranking [Done]
Write rankings to disk [Done]


In [6]:
#German Credit
#use prefix=full list
# k = 200
# pairsOfPAndAlpha = [(0.1, 0.1),  # no real results, skip in evaluation
#                         (0.2, 0.1),  # no real results, skip in evaluation
#                         (0.3, 0.0220),
#                         (0.4, 0.0222),
#                         (0.5, 0.0207),
#                         (0.6, 0.0209),
#                         (0.7, 0.0216),
#                         (0.8, 0.0216),
#                         (0.9, 0.0256)]

# protectedGermanCreditAge25, nonProtectedGermanCreditAge25 = germanCreditData.create(
#         "../rawData/GermanCredit/GermanCredit_age25.csv", "DurationMonth", "CreditAmount",
#         "score", "age25", protectedAttribute=["younger25"])
# rankAndDump(protectedGermanCreditAge25, nonProtectedGermanCreditAge25, k,
#         "GermanCreditAge25", "../results/rankingDumps/German Credit/Age25",
#         pairsOfPAndAlpha)

# Compas
# alternative ranking length, kept for reference: k = 6890
k = 3000

# (p, alpha) pairs handed to rankAndDump -- presumably the minimal protected proportion and the
# significance level used for each fair ranking; confirm against rankAndDump
pairsOfPAndAlpha = [
    (0.1, 0.0140),
    (0.2, 0.0115),
    (0.3, 0.0103),
    (0.4, 0.0099),
    (0.5, 0.0096),
    (0.6, 0.0093),
    (0.7, 0.0094),
    (0.8, 0.0095),
    (0.9, 0.0100),
]

# rankings with the candidate lists built by compasData.createRace
protectedCompasRace, nonProtectedCompasRace = compasData.createRace(
    "../rawData/COMPAS/ProPublica_race.csv",
    "race",
    "Violence_rawscore",
    "Recidivism_rawscore",
    "priors_count"
)
rankAndDump(
    protectedCompasRace,
    nonProtectedCompasRace,
    k,
    "CompasRace",
    "../results/rankingDumps/Compas/Race",
    pairsOfPAndAlpha
)

# rankings with the candidate lists built by compasData.createGender
protectedCompasGender, nonProtectedCompasGender = compasData.createGender(
    "../rawData/COMPAS/ProPublica_sex.csv",
    "sex",
    "Violence_rawscore",
    "Recidivism_rawscore",
    "priors_count"
)
rankAndDump(
    protectedCompasGender,
    nonProtectedCompasGender,
    k,
    "CompasGender",
    "../results/rankingDumps/Compas/Gender",
    pairsOfPAndAlpha
)

create rankings of CompasRace
colorblind rankingComputing m: 2000 of 3000
Computing m inverse: 2000 of 3000
 [Done]
fair rankingsComputing m: 2000 of 3000
Computing m inverse: 2000 of 3000
Computing m: 2000 of 3000
Computing m inverse: 2000 of 3000
Computing m: 2000 of 3000
Computing m inverse: 2000 of 3000
Computing m: 2000 of 3000
Computing m inverse: 2000 of 3000
Computing m: 2000 of 3000
Computing m inverse: 2000 of 3000
Computing m: 2000 of 3000
Computing m inverse: 2000 of 3000
Computing m: 2000 of 3000
Computing m inverse: 2000 of 3000
Computing m: 2000 of 3000
Computing m inverse: 2000 of 3000
Computing m: 2000 of 3000
Computing m inverse: 2000 of 3000
 [Done]
feldman rankingComputing m: 2000 of 3000
Computing m inverse: 2000 of 3000
 [Done]
Write rankings to disk [Done]
create rankings of CompasGender
colorblind rankingComputing m: 2000 of 3000
Computing m inverse: 2000 of 3000
 [Done]
fair rankingsComputing m: 2000 of 3000
Computing m inverse: 2000 of 3000
Computing m: 2000 o

In [20]:
# #convert datasets to dataframes and save to our fair ranking dir
# inpath = '../results/rankingDumps/German_Credit_varyk/Age25/'
# outpath = '../../fair_ranking/data/processed/fa_ir_german_credit'+str(k)+'/'
# print(outpath)
# convertFormat(inpath, outpath, 'GermanCreditAge25ColorblindRanking.pickle')
# convertFormat(inpath, outpath, 'GermanCreditAge25FeldmanRanking.pickle')
# convertFormat(inpath, outpath, 'GermanCreditAge25FairRanking01PercentProtected.pickle')
# convertFormat(inpath, outpath, 'GermanCreditAge25FairRanking02PercentProtected.pickle')
# convertFormat(inpath, outpath, 'GermanCreditAge25FairRanking03PercentProtected.pickle')
# convertFormat(inpath, outpath, 'GermanCreditAge25FairRanking04PercentProtected.pickle')
# convertFormat(inpath, outpath, 'GermanCreditAge25FairRanking05PercentProtected.pickle')
# convertFormat(inpath, outpath, 'GermanCreditAge25FairRanking06PercentProtected.pickle')
# convertFormat(inpath, outpath, 'GermanCreditAge25FairRanking07PercentProtected.pickle')
# convertFormat(inpath, outpath, 'GermanCreditAge25FairRanking08PercentProtected.pickle')
# convertFormat(inpath, outpath, 'GermanCreditAge25FairRanking09PercentProtected.pickle')

In [16]:
# inpath = '../results/rankingDumps/Compas/Race/'
# outpath = '../../fair_ranking/data/processed/fa_ir_compas_race/'
# convertFormat(inpath, outpath, 'CompasRaceColorblindRanking.pickle')
# convertFormat(inpath, outpath, 'CompasRaceFeldmanRanking.pickle')
# convertFormat(inpath, outpath, 'CompasRaceFairRanking01PercentProtected.pickle')
# convertFormat(inpath, outpath, 'CompasRaceFairRanking02PercentProtected.pickle')
# convertFormat(inpath, outpath, 'CompasRaceFairRanking03PercentProtected.pickle')
# convertFormat(inpath, outpath, 'CompasRaceFairRanking04PercentProtected.pickle')
# convertFormat(inpath, outpath, 'CompasRaceFairRanking05PercentProtected.pickle')
# convertFormat(inpath, outpath, 'CompasRaceFairRanking06PercentProtected.pickle')
# convertFormat(inpath, outpath, 'CompasRaceFairRanking07PercentProtected.pickle')
# convertFormat(inpath, outpath, 'CompasRaceFairRanking08PercentProtected.pickle')
# convertFormat(inpath, outpath, 'CompasRaceFairRanking09PercentProtected.pickle')

In [15]:
# inpath = '../results/rankingDumps/Compas/Gender/'
# outpath = '../../fair_ranking/data/processed/fa_ir_compas_gender/'
# convertFormat(inpath, outpath, 'CompasGenderColorblindRanking.pickle')
# convertFormat(inpath, outpath, 'CompasGenderFeldmanRanking.pickle')
# convertFormat(inpath, outpath, 'CompasGenderFairRanking01PercentProtected.pickle')
# convertFormat(inpath, outpath, 'CompasGenderFairRanking02PercentProtected.pickle')
# convertFormat(inpath, outpath, 'CompasGenderFairRanking03PercentProtected.pickle')
# convertFormat(inpath, outpath, 'CompasGenderFairRanking04PercentProtected.pickle')
# convertFormat(inpath, outpath, 'CompasGenderFairRanking05PercentProtected.pickle')
# convertFormat(inpath, outpath, 'CompasGenderFairRanking06PercentProtected.pickle')
# convertFormat(inpath, outpath, 'CompasGenderFairRanking07PercentProtected.pickle')
# convertFormat(inpath, outpath, 'CompasGenderFairRanking08PercentProtected.pickle')
# convertFormat(inpath, outpath, 'CompasGenderFairRanking09PercentProtected.pickle')

In [None]:
# [[x.qualification, x.originalQualification, x.isProtected] for x in german_cb]