In [2]:
import sys
sys.path.insert(1, "../")

import numpy as np
np.random.seed(0)

from aif360.datasets import BinaryLabelDataset
#from aif360.metrics.BinaryLabelDatasetMetric 
from aif360.algorithms.preprocessing import Reweighing, LFR, DisparateImpactRemover
from aif360.algorithms.postprocessing.calibrated_eq_odds_postprocessing import CalibratedEqOddsPostprocessing
from aif360.algorithms.postprocessing import EqOddsPostprocessing, RejectOptionClassification

from IPython.display import Markdown, display

import import_ipynb
import csv
import pandas as pd
from copy import deepcopy

#from bias_inspection import username_cols, protected_cols, pretest_filenames
from fetch_data import events_without_movement, plot_names



In [None]:
# Zero-based index of the username column in each study year's pretest CSV.
username_cols = {
    2018: 17,
    2019: 18,
}

# Zero-based index of each protected attribute's column, keyed first by
# attribute name and then by study year.
protected_cols = {
    'Gender': {
        2018: 18,
        2019: 20,
    },
    'Prior_exp': {
        2018: 24,
        2019: 25,
    },
}

# Path of the pretest CSV export for each study year.
pretest_filenames = {
    2018: "Datasets/2018_REFLECT_Pretest.csv",
    2019: "Datasets/2019_REFLECT_Pretest.csv",
}

In [None]:
def reassign_gender_list(attrs):
    """Placeholder hook for recoding gender values.

    The function currently performs no recoding at all: `attrs` is left
    exactly as it was passed in and nothing is returned.

    Args:
        attrs: List of gender codes (not read or modified).

    Returns:
        None.
    """
    return None

In [None]:
def reassign_exp_list(attrs):
    """Collapse prior-experience codes in place.

    Code '3' is rewritten to '2', and codes '4' and '5' are rewritten to '3'.
    Every other value is left untouched. The list is modified in place.

    Args:
        attrs: Mutable list of experience codes (compared as strings).

    Returns:
        None.
    """
    for position, code in enumerate(attrs):
        if code in ('4', '5'):
            attrs[position] = '3'
        elif code == '3':
            attrs[position] = '2'

    return None

In [None]:
def get_protected_attr_by_ids(ids, protected_labels=["Gender", "Prior_exp"]):
    """Look up protected attributes for each sample id in the pretest CSVs.

    For every id the 2018 file is searched first, then the 2019 file. A row
    matches when its (non-empty) username is contained in the id. The first
    matching row supplies one value per protected label, so every returned
    list has exactly one entry per id, in the same order as `ids`.

    Args:
        ids: Iterable of sample id strings.
        protected_labels: Names of the protected attributes to collect; each
            must be a key of the module-level `protected_cols`.

    Returns:
        Dict mapping each protected label to a list of raw CSV values. The
        "Gender" and "Prior_exp" lists are passed through
        `reassign_gender_list` / `reassign_exp_list` before returning.

    Raises:
        ValueError: If an id matches no row in any pretest file. Silently
            skipping it would misalign the result with `ids`.
    """
    studies = [2018, 2019]
    # Pre-create every list so an empty `ids` yields empty lists instead of a
    # KeyError in the recoding step below.
    protected_attrs = {protected_label: [] for protected_label in protected_labels}

    # Each CSV is parsed at most once and only when it is actually needed.
    rows_by_year = {}

    def _load_rows(year):
        """Return the cached rows of the pretest file for `year`."""
        if year not in rows_by_year:
            # newline='' is what the csv module expects for files it reads.
            with open(pretest_filenames[year], newline='') as csv_file:
                rows_by_year[year] = list(csv.reader(csv_file, delimiter=','))
        return rows_by_year[year]

    for s_id in ids:
        matched_row = None
        matched_year = None
        for year in studies:
            username_col = username_cols[year]
            for row in _load_rows(year):
                # Blank lines / truncated rows cannot carry a username.
                if len(row) <= username_col:
                    continue
                username = row[username_col]
                # An empty username is a substring of every id, so it must not
                # be allowed to match.
                # NOTE(review): substring matching is kept from the original;
                # confirm no username is a substring of another student's id.
                if username and username in s_id:
                    matched_row = row
                    matched_year = year
                    break
            if matched_row is not None:
                break

        if matched_row is None:
            raise ValueError("No pretest row found for id %r" % (s_id,))

        # Exactly one value per id keeps the lists aligned with `ids`.
        for protected_label in protected_labels:
            column = protected_cols[protected_label][matched_year]
            protected_attrs[protected_label].append(matched_row[column])

    for protected_label in protected_labels:
        if protected_label == "Gender":
            reassign_gender_list(protected_attrs[protected_label])
        elif protected_label == "Prior_exp":
            reassign_exp_list(protected_attrs[protected_label])

    return protected_attrs

In [None]:
def convert_arrays_to_df(data, labels, ids, emb_dim=32, seq_length=20,
                         protected_labels=['Gender', 'Prior_exp'], label_names=['posttest'],
                         binarize=False):
    """Flatten per-sample action sequences into one wide DataFrame row each.

    Each sample contributes its pretest value, the requested label column(s),
    one column per (action feature, timestep) pair and one column per
    protected attribute.

    Args:
        data: Pair where `data[0]` holds one sequence per sample (indexable by
            timestep, each timestep an iterable of feature values) and
            `data[1]` holds the per-sample pretest values.
        labels: Indexable where `labels[0]` holds posttest values and
            `labels[1]` holds ratings, both per sample.
        ids: Mapping whose `'ids'` entry lists the sample ids, used to look up
            the protected attributes.
        emb_dim: Number of `emb_dim<i>` feature columns per timestep.
        seq_length: Number of timesteps read from each sequence.
        protected_labels: Protected attribute names to add as columns.
        label_names: Label columns to create; only 'posttest' and 'ratings'
            are filled.
        binarize: When true, posttest is stored as `value >= 5/17` and ratings
            as `value >= 2.5/5`; otherwise raw values are stored.
            # NOTE(review): thresholds presumably assume scores normalised to
            # [0, 1] -- confirm against the label pipeline.

    Returns:
        Tuple `(df, action_cols)` where `action_cols` lists the per-timestep
        feature names without their `_<timestep>` suffix.

    Raises:
        ValueError: If a timestep does not contain exactly
            `len(action_cols)` values.
    """
    protected_attrs = get_protected_attr_by_ids(ids['ids'], protected_labels=protected_labels)

    # Per-timestep feature names: events, then goals, then embedding dims.
    # `emb_dim` is honoured instead of a hard-coded 32.
    action_cols = ['event_' + e for e in events_without_movement]
    action_cols.extend('goal_' + p for p in plot_names)
    action_cols.extend('emb_dim' + str(i) for i in range(emb_dim))
    n_actions = len(action_cols)

    # Column names use the same `seq_length` as the fill loop below, so the
    # two can no longer disagree when a caller passes seq_length != 20.
    all_cols = [a + "_" + str(s) for s in range(seq_length) for a in action_cols]

    # Insertion order fixes the DataFrame column order:
    # pretests, labels, action features, protected attributes.
    new_data = {'pretests': []}
    for label in label_names:
        new_data[label] = []
    for col in all_cols:
        new_data[col] = []
    for protected_label in protected_labels:
        new_data[protected_label] = []

    for n, subseq in enumerate(data[0]):
        new_data['pretests'].append(data[1][n])
        if 'posttest' in label_names:
            if binarize:
                new_data['posttest'].append(labels[0][n] >= 5/17)
            else:
                new_data['posttest'].append(labels[0][n])
        if 'ratings' in label_names:
            if binarize:
                new_data['ratings'].append(labels[1][n] >= 2.5/5)
            else:
                new_data['ratings'].append(labels[1][n])
        for protected_label in protected_labels:
            new_data[protected_label].append(protected_attrs[protected_label][n])

        for s in range(seq_length):
            step_values = list(subseq[s])
            # A short or long timestep would shift every later value into the
            # wrong column, so fail loudly instead.
            if len(step_values) != n_actions:
                raise ValueError(
                    "sample %d, timestep %d has %d values but %d action columns are expected"
                    % (n, s, len(step_values), n_actions))
            offset = s * n_actions
            for k, value in enumerate(step_values):
                new_data[all_cols[offset + k]].append(value)

    return pd.DataFrame(data=new_data), action_cols

In [None]:
def convert_df_to_arrays(df, action_cols, emb_dim=32, seq_length=20):
    """Rebuild the `[sequences, pretests]` arrays from a wide DataFrame.

    Rows are read by position, so the result does not depend on the frame's
    index labels (for example a string index or a non-contiguous integer one).

    Args:
        df: DataFrame with a 'pretests' column and one `<col>_<timestep>`
            column for every name in `action_cols` and every timestep in
            `range(seq_length)`.
        action_cols: Per-timestep feature names without the timestep suffix.
        emb_dim: Unused; kept for call compatibility.
        seq_length: Number of timesteps per sample.

    Returns:
        List `[sequences, pretests]` where `sequences` has shape
        `(len(df), seq_length, len(action_cols))` and `pretests` has one
        entry per row.
    """
    # Timestep-major, feature-minor ordering, so one reshape restores the
    # (sample, timestep, feature) layout.
    step_cols = [col + "_" + str(s) for s in range(seq_length) for col in action_cols]

    # Selecting whole columns and reshaping replaces the per-cell
    # `df[col][sample_num]` lookups, which were label-based.
    sequences = df[step_cols].to_numpy().reshape(len(df.index), seq_length, len(action_cols))
    pretests = np.array(df['pretests'].tolist())

    print(sequences.shape)
    print(pretests.shape)

    return [sequences, pretests]

In [None]:
def convert_df_to_labels(df, label_names=['ratings']):
    """Return the first requested label column as a plain Python list.

    Args:
        df: DataFrame containing the label column.
        label_names: Label column names; only the first entry is used.

    Returns:
        List with the values of column `label_names[0]`.
    """
    primary_label = label_names[0]
    label_column = df[primary_label]
    return label_column.tolist()

In [None]:
def bias_preprocessing(data, labels, ids, algorithm='Reweighing', emb_dim=32, 
                       seq_length=20, protected_labels=['Gender', 'Prior_exp'], label_names=['posttest']):
    """Run an AIF360 pre-processing algorithm over the training data.

    The arrays are flattened into a DataFrame with binarised labels, wrapped
    in a `BinaryLabelDataset`, transformed by the selected algorithm and
    converted back to arrays.

    Args:
        data: `[sequences, pretests]` as accepted by `convert_arrays_to_df`.
        labels: Label arrays as accepted by `convert_arrays_to_df`.
        ids: Mapping with an `'ids'` entry listing the sample ids.
        algorithm: One of 'Reweighing', 'LFR' or 'DisparateImpactRemover'.
        emb_dim: Forwarded to the array/DataFrame converters.
        seq_length: Forwarded to the array/DataFrame converters.
        protected_labels: Protected attribute column names. For
            'DisparateImpactRemover' only the first entry is used.
        label_names: Label column names for the dataset.

    Returns:
        Tuple `(transf_data, instance_weights)`: the transformed
        `[sequences, pretests]` arrays and the `instance_weights` of the
        transformed dataset.

    Raises:
        NotImplementedError: For 'OptimPreproc', which this notebook never
            imports.
        ValueError: For any other unrecognised `algorithm`.
    """
    # Validate before the expensive conversion; previously an unknown name
    # only surfaced later as an unbound `transf_df`.
    if algorithm == 'OptimPreproc':
        # The original branch could only raise NameError (OptimPreproc is not
        # imported here).
        # NOTE(review): the AIF360 constructor presumably also needs an
        # optimizer and optim_options -- confirm before wiring it up.
        raise NotImplementedError(
            "OptimPreproc is not wired up: it is never imported in this notebook "
            "and is called without any optimizer configuration.")
    supported_algorithms = ('Reweighing', 'LFR', 'DisparateImpactRemover')
    if algorithm not in supported_algorithms:
        raise ValueError(
            "Unsupported pre-processing algorithm %r; expected one of %s"
            % (algorithm, ", ".join(supported_algorithms)))

    df, action_cols = convert_arrays_to_df(data, labels, ids, emb_dim=emb_dim, seq_length=seq_length, 
                         protected_labels=protected_labels, label_names=label_names, binarize=True)

    bld = BinaryLabelDataset(df=df, favorable_label=1, unfavorable_label=0, 
                             label_names=label_names, protected_attribute_names=protected_labels)

    print("Algorithm: ", algorithm, "\nProtected labels: ", protected_labels)

    if algorithm == 'Reweighing':
        privileged_groups = []
        unprivileged_groups = []
        if 'Gender' in protected_labels:
            privileged_groups.append({'Gender': 2})
            unprivileged_groups.append({'Gender': 1})
        if 'Prior_exp' in protected_labels:
            privileged_groups.extend([{'Prior_exp': 3}, {'Prior_exp': 2}])
            unprivileged_groups.append({'Prior_exp': 1})
        transformer = Reweighing(unprivileged_groups=unprivileged_groups,
                                 privileged_groups=privileged_groups)

    elif algorithm == 'LFR':
        # Only one attribute is configured here: Gender takes precedence over
        # Prior_exp (kept from the original `if`/`elif`).
        privileged_groups = []
        unprivileged_groups = []
        if 'Gender' in protected_labels:
            privileged_groups = [{'Gender': 2}]
            unprivileged_groups = [{'Gender': 1}]
        elif 'Prior_exp' in protected_labels:
            privileged_groups.append({'Prior_exp': 2})
            unprivileged_groups.append({'Prior_exp': 1})
        transformer = LFR(unprivileged_groups=unprivileged_groups,
                          privileged_groups=privileged_groups)

    else:  # 'DisparateImpactRemover'
        # This algorithm is configured by attribute name only, so no
        # privileged/unprivileged groups are needed.
        transformer = DisparateImpactRemover(repair_level=0.75,
                                             sensitive_attribute=protected_labels[0])

    dataset_transf_train = transformer.fit_transform(bld)
    transf_df = dataset_transf_train.convert_to_dataframe()[0]

    transf_data = convert_df_to_arrays(transf_df, action_cols, emb_dim=emb_dim, seq_length=seq_length)

    return transf_data, dataset_transf_train.instance_weights

In [None]:
def bias_postprocessing(data, true_labels, pred_labels, ids, algorithm='CalibratedEqOddsPostprocessing', 
                        emb_dim=32, seq_length=20, protected_labels=['Gender'], 
                        label_names=['ratings']):
    """Run an AIF360 post-processing algorithm over model predictions.

    A ground-truth `BinaryLabelDataset` is built from `data` / `true_labels`
    with binarised labels. A copy of it receives `pred_labels` as scores and
    their `>= 2.5/5` thresholding as labels. The selected post-processor is
    fitted on both and the scores of its output are returned.

    Args:
        data: `[sequences, pretests]` as accepted by `convert_arrays_to_df`.
        true_labels: Ground-truth label arrays for `convert_arrays_to_df`.
        pred_labels: Predicted scores, one per sample.
        ids: Mapping with an `'ids'` entry listing the sample ids.
        algorithm: One of 'CalibratedEqOddsPostprocessing',
            'EqOddsPostprocessing' or 'RejectOptionClassification'.
        emb_dim: Forwarded to `convert_arrays_to_df`.
        seq_length: Forwarded to `convert_arrays_to_df`.
        protected_labels: Currently ignored; see the NOTE in the body.
        label_names: Label column names for the dataset.

    Returns:
        List with the first score of every row of the post-processed
        dataset's `scores`.

    Raises:
        ValueError: If `algorithm` is not one of the supported names.
    """
    # Validate before the expensive conversion; previously an unknown name
    # only surfaced later as an unbound `transf_df`.
    supported_algorithms = ('CalibratedEqOddsPostprocessing', 'EqOddsPostprocessing',
                            'RejectOptionClassification')
    if algorithm not in supported_algorithms:
        raise ValueError(
            "Unsupported post-processing algorithm %r; expected one of %s"
            % (algorithm, ", ".join(supported_algorithms)))

    # NOTE(review): this override discards the caller's `protected_labels`.
    # It is kept because existing results depend on it; remove it once
    # callers pass the attribute they actually want.
    protected_labels = ['Prior_exp'] 
    df_true, _ = convert_arrays_to_df(data, true_labels, ids, emb_dim=emb_dim, seq_length=seq_length, 
                                      protected_labels=protected_labels, label_names=label_names,
                                      binarize=True)

    bld_true = BinaryLabelDataset(df=df_true, favorable_label=1, unfavorable_label=0, 
                                  label_names=label_names, protected_attribute_names=protected_labels)

    # The prediction dataset shares features and protected attributes with
    # the ground truth; only scores and labels are replaced.
    bld_pred = deepcopy(bld_true)
    pred_scores = np.array(pred_labels)
    bld_pred.scores = pred_scores.reshape(-1,1)

    favorable_mask = pred_scores >= 2.5/5
    y_pred = np.zeros_like(bld_pred.labels)
    y_pred[favorable_mask] = bld_pred.favorable_label
    y_pred[~favorable_mask] = bld_pred.unfavorable_label
    bld_pred.labels = y_pred

    # Pick the post-processor class and the order in which privileged
    # Prior_exp values were listed for it originally.
    if algorithm == 'CalibratedEqOddsPostprocessing':
        postprocessor_cls = CalibratedEqOddsPostprocessing
        prior_exp_privileged = (2, 3)
    elif algorithm == 'EqOddsPostprocessing':
        postprocessor_cls = EqOddsPostprocessing
        prior_exp_privileged = (3, 2)
    else:  # 'RejectOptionClassification'
        postprocessor_cls = RejectOptionClassification
        prior_exp_privileged = (3, 2)

    privileged_groups = []
    unprivileged_groups = []
    if 'Gender' in protected_labels:
        privileged_groups.append({'Gender': 2})
        unprivileged_groups.append({'Gender': 1})
    if 'Prior_exp' in protected_labels:
        privileged_groups.extend({'Prior_exp': level} for level in prior_exp_privileged)
        unprivileged_groups.append({'Prior_exp': 1})
    print("Algorithm: ", algorithm, "\nProtected labels: ", protected_labels)

    postprocessor = postprocessor_cls(unprivileged_groups=unprivileged_groups,
                                      privileged_groups=privileged_groups)
    dataset_transf_pred = postprocessor.fit_predict(bld_true, bld_pred)

    # NOTE(review): only `.scores` is returned. Confirm that every supported
    # post-processor actually updates scores (and not just labels); otherwise
    # the caller gets the unmodified predictions back.
    return [d[0] for d in dataset_transf_pred.scores.tolist()]