In [1]:
# LIBRARIES
import numpy as np
import pandas as pd
import random
import pickle

import string

from aif360.datasets import AdultDataset

from matplotlib import pyplot as plt 

pip install 'aif360[LawSchoolGPA]'


In [2]:
# METHODS
def swap_vals(dirty_df, n):
    """Swap ``n`` randomly chosen pairs of cells, each pair within one column.

    For every iteration a column and two row positions are drawn with the
    ``random`` module and the two cell values are exchanged. The two rows are
    drawn independently, so they may coincide (a no-op swap that is still
    logged).

    Parameters
    ----------
    dirty_df : pandas.DataFrame
        Frame to corrupt. It is modified IN PLACE; pass a copy to keep the
        original intact.
    n : int
        Number of swaps to perform.

    Returns
    -------
    tuple
        ``(dirty_df, mods)`` where ``dirty_df`` is the same (mutated) object
        that was passed in and ``mods`` is a DataFrame with one row per swap
        and columns ``'col'``, ``'row 1'``, ``'val 1'``, ``'row 2'``,
        ``'val 2'`` holding the values as they were BEFORE the swap.
    """
    mod_columns = ['col', 'row 1', 'val 1', 'row 2', 'val 2']
    # The swaps never change the frame's shape, so read it once.
    n_rows, n_cols = dirty_df.shape

    # Collect plain dicts and build the log once at the end: concatenating a
    # one-row frame per iteration copies the whole log every time (quadratic).
    records = []
    for _ in range(n):
        col_i = random.randint(0, n_cols - 1)
        row_1 = random.randint(0, n_rows - 1)
        row_2 = random.randint(0, n_rows - 1)

        val_1 = dirty_df.iat[row_1, col_i]
        val_2 = dirty_df.iat[row_2, col_i]

        dirty_df.iat[row_1, col_i] = val_2
        dirty_df.iat[row_2, col_i] = val_1

        records.append({'col': dirty_df.columns[col_i],
                        'row 1': row_1, 'val 1': val_1,
                        'row 2': row_2, 'val 2': val_2})

    # Explicit columns keep the log well-formed even when n == 0.
    mods = pd.DataFrame(records, columns=mod_columns)
    return dirty_df, mods

def rand_mod(dirty_df, n):
    """Apply ``n`` random within-column cell swaps to ``dirty_df`` in place.

    NOTE(review): despite the name, this performs exactly the same operation
    as ``swap_vals`` (exchange two randomly chosen cells of one randomly
    chosen column). If a different kind of random modification was intended,
    it is not implemented here - confirm with the author.

    Parameters
    ----------
    dirty_df : pandas.DataFrame
        Frame to corrupt. It is modified IN PLACE.
    n : int
        Number of swaps to perform.

    Returns
    -------
    tuple
        ``(dirty_df, mods)`` where ``dirty_df`` is the mutated input object
        and ``mods`` is a DataFrame with one row per swap and columns
        ``'col'``, ``'row 1'``, ``'val 1'``, ``'row 2'``, ``'val 2'`` holding
        the pre-swap values.
    """
    mod_columns = ['col', 'row 1', 'val 1', 'row 2', 'val 2']
    # Swapping values never changes the shape, so it can be read up front.
    n_rows, n_cols = dirty_df.shape

    # Accumulate records in a list; growing a DataFrame with pd.concat on
    # every iteration re-copies the whole log each time (quadratic cost).
    records = []
    for _ in range(n):
        col_i = random.randint(0, n_cols - 1)
        row_1 = random.randint(0, n_rows - 1)
        row_2 = random.randint(0, n_rows - 1)

        val_1 = dirty_df.iat[row_1, col_i]
        val_2 = dirty_df.iat[row_2, col_i]

        dirty_df.iat[row_1, col_i] = val_2
        dirty_df.iat[row_2, col_i] = val_1

        records.append({'col': dirty_df.columns[col_i],
                        'row 1': row_1, 'val 1': val_1,
                        'row 2': row_2, 'val 2': val_2})

    # Explicit columns keep the log well-formed even when n == 0.
    mods = pd.DataFrame(records, columns=mod_columns)
    return dirty_df, mods

### Note

Downloaded the needed files:
'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test'
'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names'

In [3]:
# GET DATA
# Display names for the encoded label values and protected-attribute values.
label_map = {1.0: '>50K', 0.0: '<=50K'}
protected_attribute_maps = [{1.0: 'Male', 0.0: 'Female'}]

data = AdultDataset(
    protected_attribute_names=['sex'],
    categorical_features=[
        'workclass', 'education', 'marital-status', 'occupation',
        'relationship', 'native-country', 'race',
    ],
    privileged_classes=[['Male']],
    metadata={
        'label_map': label_map,
        'protected_attribute_maps': protected_attribute_maps,
    },
)

# Features as a labelled frame; labels and protected attribute as flat arrays.
df = pd.DataFrame(data.features, columns=data.feature_names)
Y = data.labels.ravel()
P = data.protected_attributes.ravel()
# Map every numeric protected-attribute code to its display name.
P_val = np.vectorize(protected_attribute_maps[0].get)(P)

print('Dataset Retrieved')
for _template, _arr in (
    (" Shape of X (df): {}", df),
    (" Shape of Y: {}", Y),
    (" Shape of P: {}", P),
    (" Shape of P_val: {}", P_val),
):
    print(_template.format(_arr.shape))



Dataset Retrieved
 Shape of X (df): (45222, 102)
 Shape of Y: (45222,)
 Shape of P: (45222,)
 Shape of P_val: (45222,)


In [4]:
# could not process full dataset, taking a smaller sample
# The row draw is seeded so that Restart & Run All selects the same rows (and
# therefore writes the same clean/dirty files) every time.
SAMPLE_SEED = 0
n_sample = 20000
n = Y.shape[0]
sample_rng = np.random.default_rng(SAMPLE_SEED)
ind = sample_rng.permutation(n)  # shuffled row positions 0..n-1
i_sample = ind[:n_sample]        # slicing caps at n if n_sample > n

# Apply the same row selection to features, labels and protected attribute
# so that they stay aligned.
df = df.iloc[i_sample]
Y = Y[i_sample]
P = P[i_sample]
P_val = P_val[i_sample]

print('Dataset Downsized')
print(" Shape of X (df): {}".format(df.shape))
print(" Shape of Y: {}".format(Y.shape))
print(" Shape of P: {}".format(P.shape))
print(" Shape of P_val: {}".format(P_val.shape))

Dataset Downsized
 Shape of X (df): (20000, 102)
 Shape of Y: (20000,)
 Shape of P: (20000,)
 Shape of P_val: (20000,)


In [5]:
# convert Y to multiclass
# Two boolean columns: column 1 is the truthiness of the original label,
# column 0 is the truthiness of |label - 1| (its complement for 0/1 labels).
y_multi = np.column_stack((np.abs(Y - 1), Y)).astype(bool)
Y = y_multi
print(" Shape of Y: {}".format(Y.shape))

 Shape of Y: (20000, 2)


In [6]:
# swap_vals / rand_mod draw from the stdlib `random` module; seed it so the
# corrupted frame (and the modification logs) are reproducible across runs.
CORRUPTION_SEED = 0
random.seed(CORRUPTION_SEED)

# Work on a copy: both helpers modify the frame they are given in place.
df_dirty = df.copy()
cells = df_dirty.shape[0] * df_dirty.shape[1]
print(f'cell cnt: {cells}')

# First pass: number of swaps equal to 5% of the cell count (each swap
# touches two cells).
swap_cnt = int(0.05*cells)
print(f'swap cell cnt: {swap_cnt} x2')
df_dirty, swap_mods = swap_vals(df_dirty, swap_cnt)

# Second pass: number of rand_mod operations equal to 10% of the cell count.
mod_cnt = int(0.1*cells)
print(f'rand mod cell cnt: {mod_cnt}')
df_dirty, rand_mods = rand_mod(df_dirty, mod_cnt)
# swap_mods / rand_mods hold one log row per applied operation

cell cnt: 2040000
swap cell cnt: 102000 x2
rand mod cell cnt: 204000


In [7]:
# checking for columns that are weirdly highly correlated
# Attach the positive-class column of Y to a copy of the clean features and
# list every column whose absolute correlation with it exceeds 0.55.
tmp = df.assign(Y=Y[:, 1])
corr = tmp.astype(float).corr()
strongly_correlated = corr['Y'].abs() > 0.55
print(corr.loc[strongly_correlated, 'Y'])

# Disabled drop logic; the column names below do not look like Adult features
# (NOTE(review): presumably left over from another dataset's notebook).
# weird =['r_offense_date', 'r_charge_degree_O']

# df = df.drop(columns=['is_recid']+weird) # remove label
# df_dirty = df_dirty.drop(columns=['is_recid']+weird) # remove label

Y    1.0
Name: Y, dtype: float64


In [8]:
# Labels, protected attribute and fairness conventions, pickled alongside the
# feature CSVs.
data_dict = {
    'Y': Y,
    'P_val': P_val,
    'P': P,
    'priv_group': 1,  # priv group is male
    'pos_label': 1,  # best label is that they are predicted to have high income (Y = 1, >50K)
}
with open('datasets/adult/data.p', "wb") as fileObj:
    pickle.dump(data_dict, fileObj)


# Disabled alternative export (targets a different dataset directory):
# np.savetxt("datasets/compas/clean.csv", df.to_numpy(), delimiter=",")
# np.savetxt("datasets/compas/dirty.csv", df_dirty.to_numpy(), delimiter=",")

# Cast both frames to float before writing them out.
df = df.astype(float)
df_dirty = df_dirty.astype(float)

# Write clean and corrupted features with a header row and no index column.
for _frame, _csv_path in (
    (df, "datasets/adult/clean.csv"),
    (df_dirty, "datasets/adult/dirty.csv"),
):
    _frame.to_csv(_csv_path, sep=",", header=True, index=False)

In [9]:
# Display the corrupted frame's dimensions (the swaps only exchange cell
# values, so this should equal the clean frame's shape).
df_dirty.shape


(20000, 102)

In [10]:
# Summary counts for the sampled dataset.
print(f'Total Data Points: {Y.shape[0]}')
# Y has one boolean column per class here, so the column sums are per-class counts.
print(f'By Label: {Y.sum(axis=0)}')
# Group sizes keyed by the numeric protected-attribute code ...
tmp_unique,tmp_cnts = np.unique(P, return_counts=True)
print(f'By Prot Group: {dict(zip(tmp_unique,tmp_cnts))}')
# ... and the same sizes keyed by the display name.
tmp_unique,tmp_cnts = np.unique(P_val, return_counts=True)
print(f'By Prot Value: {dict(zip(tmp_unique,tmp_cnts))}')


Total Data Points: 20000
By Label: [15014  4986]
By Prot Group: {0.0: 6546, 1.0: 13454}
By Prot Value: {'Female': 6546, 'Male': 13454}
