This script prepares the datasets for the situation testing pipeline. All datasets are stored within the 'data' folder. Original datasets are read from the 'data/raw' folder; modified datasets are saved into the 'data' folder with 'clean_' as a prefix. Here, we also indicate the origin of all original datasets.

In [2]:
import os
import pandas as pd
import numpy as np
# ...

In [3]:
# Set the working directory. NOTE: all code is expected to run from the 'src'
# folder, so the project root is the parent of the current working directory.
wrk_dir = os.path.dirname(os.getcwd())
print(wrk_dir)
# Set the data path. Joining with a trailing empty component appends the
# platform's own separator, so the value still ends with '\' on Windows while
# no longer hard-coding backslashes on other operating systems.
data_path = os.path.join(wrk_dir, 'data', '')
print(data_path)

C:\Users\Jose Alvarez\Documents\Projects\CounterfactualSituationTesting
C:\Users\Jose Alvarez\Documents\Projects\CounterfactualSituationTesting\data\


### German Credit Data (Revised)

We use the revised German credit dataset by [Grömping (2019)](http://www1.beuth-hochschule.de/FB_II/reports/Report-2019-004.pdf), which is also hosted by [UCI](https://archive.ics.uci.edu/ml/datasets/South+German+Credit+%28UPDATE%29#). For the original version see, for example, [a class assignment using the old data](https://online.stat.psu.edu/stat857/node/215/) with the following [descriptions](https://online.stat.psu.edu/stat857/node/222/). Also see [UCI's archives](https://archive.ics.uci.edu/ml/datasets/statlog+(german+credit+data)) for more details on the old dataset. For other credit scoring datasets see [here](https://github.com/kozodoi/Fair_Credit_Scoring/blob/main/data/README.md).

In [4]:
# Load the raw South German Credit file (space-separated). The path is built
# with os.path.join instead of hard-coded '\\' separators so that the
# sub-folders resolve on any operating system.
data1 = pd.read_csv(
    os.path.join(data_path, 'raw', 'SouthGermanCredit', 'south_formatted.csv'),
    index_col=False,
    sep=" ",
)
print(data1.shape)
print(data1.columns.tolist())
# preview the first rows (displayed as the cell output)
data1.head(5)

(1000, 21)
['laufkont', 'laufzeit', 'moral', 'verw', 'hoehe', 'sparkont', 'beszeit', 'rate', 'famges', 'buerge', 'wohnzeit', 'verm', 'alter', 'weitkred', 'wohn', 'bishkred', 'beruf', 'pers', 'telef', 'gastarb', 'kredit']


Unnamed: 0,laufkont,laufzeit,moral,verw,hoehe,sparkont,beszeit,rate,famges,buerge,...,verm,alter,weitkred,wohn,bishkred,beruf,pers,telef,gastarb,kredit
0,1,18,4,2,1049,1,2,4,2,1,...,2,21,3,1,1,3,2,1,2,1
1,1,9,4,0,2799,1,3,2,3,1,...,1,36,3,1,2,3,1,1,2,1
2,2,12,2,9,841,2,4,2,2,1,...,1,23,3,1,1,2,2,1,2,1
3,1,12,4,0,2122,1,3,3,3,1,...,1,39,3,1,2,2,1,1,1,1
4,1,12,4,0,2171,1,3,4,3,1,...,2,38,1,2,2,2,2,1,1,1


In [5]:
# from German (original column names) to English
rename_data1 = {'laufkont': 'status',
               'laufzeit': 'duration',
               'moral': 'credit_history',
               'verw': 'purpose',
               'hoehe': 'amount',
               'sparkont': 'savings',
               'beszeit': 'employment_duration',
               'rate': 'installment_rate',
               'famges': 'personal_status_sex',
               'buerge': 'other_debtors',
               'wohnzeit': 'present_residence',
               'verm': 'property',
               'alter': 'age',
               'weitkred': 'other_installment_plans',
               'wohn': 'housing',
               'bishkred': 'number_credits',
               'beruf': 'job',
               'pers': 'people_liable',
               'telef': 'telephone',
               'gastarb': 'foreign_worker',
               'kredit': 'credit_risk'}
# rename the columns of data1 in place using the mapping above
data1.rename(columns=rename_data1, inplace=True)

In [6]:
# personal_status_sex 
# --- male : divorced/separated 1 
# --- female : non-single or male : single 2 
# --- male : married/widowed 3 
# --- female : single 4

def g(x):
    """Translate a 'personal_status_sex' code into a gender label.

    Codes 1 and 3 become 'Male'; codes 2 and 4 become 'Female'. Any other
    value is handed back unchanged.

    NOTE: per the coding listed for this variable, code 2 covers both
    "female : non-single" and "male : single", so labelling it 'Female'
    is an approximation.
    """
    if x in (1, 3):
        label = 'Male'
    elif x in (2, 4):
        label = 'Female'
    else:
        # unknown code: pass the input through untouched
        label = x
    return label

# sanity check: number of rows carrying one of the male codes (1 or 3)
print(len(data1[data1['personal_status_sex'].isin([1, 3])]))
# derive 'gender' (our protected attribute) from the status/sex codes
data1['gender'] = data1['personal_status_sex'].map(g)
# the count of 'Male' labels should match the number printed above
print(len(data1[data1['gender'] == 'Male']))
# the original coded column is no longer needed
data1 = data1.drop('personal_status_sex', axis=1)

598
598


In [8]:
# check gender: number of non-null entries in every column for each gender group
data1.groupby('gender').count()

Unnamed: 0_level_0,status,duration,credit_history,purpose,amount,savings,employment_duration,installment_rate,other_debtors,present_residence,property,age,other_installment_plans,housing,number_credits,job,people_liable,telephone,foreign_worker,credit_risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
Female,402,402,402,402,402,402,402,402,402,402,402,402,402,402,402,402,402,402,402,402
Male,598,598,598,598,598,598,598,598,598,598,598,598,598,598,598,598,598,598,598,598


In [10]:
# check other A candidates... (A presumably denotes the protected attribute - TODO confirm)
# number of non-null entries in every column for each foreign_worker value
data1.groupby('foreign_worker').count()

Unnamed: 0_level_0,status,duration,credit_history,purpose,amount,savings,employment_duration,installment_rate,other_debtors,present_residence,property,age,other_installment_plans,housing,number_credits,job,people_liable,telephone,credit_risk,gender
foreign_worker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37
2,963,963,963,963,963,963,963,963,963,963,963,963,963,963,963,963,963,963,963,963


In [7]:
# store in data folder ('|'-separated, without the index column).
# os.path.join inserts a separator only when data_path does not already end
# with one, which avoids the doubled separator produced by appending '\\'.
data1.to_csv(os.path.join(data_path, 'clean_GermanCreditData.csv'), sep='|', index=False)

### Law School Data

The original dataset was taken from [Kusner et al. (2017)](https://github.com/mkusner/counterfactual-fairness). This dataset in turn comes from a survey study conducted in the late 1980s by the Law School Admission Council. For the original study see [Wightman (1998)](https://archive.lawschooltransparency.com/reform/projects/investigations/2015/documents/NLBPS.pdf).

Notes: (1) values for the var 'sex': sex==1 for women, sex==2 for men; (2) nowadays the LSAT ranges from 120 - 180, but from 1981 to 1991 it used to be a 48-point scale, ranging from 10 to 48 as in the var 'LSAT'; (3) the data doesn't contain a decision outcome. 

In [3]:
# Load the raw law school file. The path is built with os.path.join instead
# of a hard-coded '\\' separator so that it resolves on any operating system.
data2 = pd.read_csv(os.path.join(data_path, 'raw', 'law_data.csv'), index_col=False)
print(data2.shape)
print(data2.columns.tolist())
# preview the first rows (displayed as the cell output)
data2.head(5)

(21791, 9)
['Unnamed: 0', 'race', 'sex', 'LSAT', 'UGPA', 'region_first', 'ZFYA', 'sander_index', 'first_pf']


Unnamed: 0.1,Unnamed: 0,race,sex,LSAT,UGPA,region_first,ZFYA,sander_index,first_pf
0,0,White,1,39.0,3.1,GL,-0.98,0.782738,1.0
1,1,White,1,36.0,3.0,GL,0.09,0.735714,1.0
2,2,White,2,30.0,3.1,MS,-0.35,0.670238,1.0
3,5,Hispanic,2,39.0,2.2,NE,0.58,0.697024,1.0
4,6,White,1,37.0,3.4,GL,-1.26,0.78631,1.0


In [4]:
# as in Kusner et al., exclude the rows whose region_first is 'PO',
# then renumber the index from 0 (the old index is discarded)
data2 = data2.loc[data2['region_first'] != 'PO'].reset_index(drop=True)

In [5]:
# remove the columns that are not kept in the cleaned dataset
data2 = data2.drop(['Unnamed: 0', 'sander_index', 'first_pf', 'region_first'], axis=1)
# map sex: recode 1 -> 'Female' and 2 -> 'Male'
data2 = data2.assign(sex=data2['sex'].map({1: 'Female', 2: 'Male'}))

In [6]:
# check sex: number of non-null entries in every column for each sex
data2.groupby('sex').count()

Unnamed: 0_level_0,race,LSAT,UGPA,ZFYA
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,9537,9537,9537,9537
Male,12253,12253,12253,12253


In [7]:
# check race before mapping: number of non-null entries in every column for each race category
data2.groupby('race').count()

Unnamed: 0_level_0,sex,LSAT,UGPA,ZFYA
race,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Amerindian,99,99,99,99
Asian,845,845,845,845
Black,1282,1282,1282,1282
Hispanic,488,488,488,488
Mexican,389,389,389,389
Other,293,293,293,293
Puertorican,110,110,110,110
White,18284,18284,18284,18284


In [8]:
# two 'simpler' options

def r_nonwhite(x):
    """Collapse a race label into a binary one.

    Returns 'White' when x equals 'White' and 'NonWhite' for every other
    value.
    """
    return 'White' if x == 'White' else 'NonWhite'

def r_simpler(x):
    """Collapse a race label into one of five coarser groups.

    'Hispanic', 'Mexican' and 'Puertorican' become 'Latino'; 'Amerindian'
    and 'Other' become 'Other'; 'White', 'Black' and 'Asian' are kept as
    they are. Any other value yields None.
    """
    if x in ('Hispanic', 'Mexican', 'Puertorican'):
        group = 'Latino'
    elif x in ('Amerindian', 'Other'):
        group = 'Other'
    elif x in ('White', 'Black', 'Asian'):
        # these labels already name their own group
        group = x
    else:
        # unrecognised label: no group assigned
        group = None
    return group


In [9]:
# add both simplified versions of 'race' as new columns
data2 = data2.assign(
    race_nonwhite=data2['race'].map(r_nonwhite),
    race_simpler=data2['race'].map(r_simpler),
)

In [10]:
# check race_nonwhite: number of non-null entries in every column for each group
data2.groupby('race_nonwhite').count()

Unnamed: 0_level_0,race,sex,LSAT,UGPA,ZFYA,race_simpler
race_nonwhite,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
NonWhite,3506,3506,3506,3506,3506,3506
White,18284,18284,18284,18284,18284,18284


In [11]:
# check race_simpler: number of non-null entries in every column for each group
data2.groupby('race_simpler').count()

Unnamed: 0_level_0,race,sex,LSAT,UGPA,ZFYA,race_nonwhite
race_simpler,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Asian,845,845,845,845,845,845
Black,1282,1282,1282,1282,1282,1282
Latino,987,987,987,987,987,987
Other,392,392,392,392,392,392
White,18284,18284,18284,18284,18284,18284


In [12]:
# store in data folder ('|'-separated, without the index column).
# os.path.join inserts a separator only when data_path does not already end
# with one, which avoids the doubled separator produced by appending '\\'.
data2.to_csv(os.path.join(data_path, 'clean_LawData.csv'), sep='|', index=False)

### Adult Census Data (Revised)

TODO: see https://github.com/zykls/folktables