In [None]:
import copy 
import os 
from os.path import join
import shutil
import itertools
from collections import Counter
import json
import pickle
import pprint
import pandas as pd
import numpy as np
import pprint
import torch 
import torch.nn.functional as F 

import matplotlib.pyplot as plt 
plt.rcParams['figure.figsize'] = [6, 8]

In [None]:
import os
import sys

# Root of the causal_discovery checkout that provides `data_processing`,
# `models` and `utils`. The original author's location stays the default so
# existing runs are unaffected; set CAUSAL_DISCOVERY_BASEDIR to run elsewhere.
basedir = os.environ.get(
    'CAUSAL_DISCOVERY_BASEDIR',
    '/Users/RobertAdragna/Documents/School/Fourth_Year/ESC499-Thesis/codebases/causal_discovery',
)

# Guard the append so re-running this cell does not grow sys.path.
if basedir not in sys.path:
    sys.path.append(basedir)

import data_processing as dp
import models
from utils import proc_fteng

In [None]:
# Sanity check: show the contents of the working directory, which is where
# the results folder configured in the next cell is looked up.
print(os.listdir(os.getcwd()))

In [None]:
# Name of the results folder (relative to the current working directory).
res_dir = '0525_smallsample'

# One (initially empty) record per algorithm; 'icp' is currently left out.
invariance_algos = {'irm': {}}
non_invariance_algos = {'linreg': {}}

# Fill every algorithm record with its experiment directory, its processed
# results directory and its parameter table.
for als in [invariance_algos, non_invariance_algos]:
    for al in als:
        algo_root = join(os.getcwd(), res_dir, al)
        entry = als[al]

        entry['expdir'] = join(algo_root, 'causal_discovery')
        entry['processed_dir'] = join(algo_root, 'processed_results')
        entry['params'] = pd.read_pickle(join(algo_root, '{}_paramfile.pkl'.format(al)))

        # The analysis below requires the processing step to have been run.
        if not os.path.exists(entry['processed_dir']):
            raise Exception('Directory has not yet been processed')

In [None]:
# Preview the first rows of the IRM parameter table loaded above.
invariance_algos['irm']['params'].head(10)

# Utilities

In [None]:
def pred_binarize(v):
    '''Threshold predictions at 0.5: values >= 0.5 become 1, all others 0.

    :param v: predictions (array-like). A column vector of shape (N, 1) is
              the layout used by the callers in this notebook.
    :return: integer numpy array of 0/1 entries. An (N, 1) input yields a flat
             (N,) array (the historical output shape of this helper); any
             other input shape is thresholded element-wise and keeps its shape.
    '''
    arr = np.asarray(v)
    binary = (arr >= 0.5).astype(int)

    # Keep the legacy contract for column vectors: (N, 1) -> (N,).
    if arr.ndim == 2 and arr.shape[1] == 1:
        return binary[:, 0]
    return binary
    
        
def compute_loss(pred, ground, ltype='MSE'):
    '''Compute a loss/score between a prediction vector and the ground truth.

    :param pred: predictions (array-like)
    :param ground: ground-truth targets (array-like)
    :param ltype: 'MSE' for the mean squared error, or 'ACC' for
                  1 - MSE of the 0.5-thresholded predictions (which is the
                  accuracy when `ground` holds 0/1 labels)
    :return: numpy scalar holding the requested value
    :raises ValueError: if `ltype` is neither 'MSE' nor 'ACC'
    '''
    if ltype not in ('MSE', 'ACC'):
        raise ValueError("Unknown ltype {!r}; expected 'MSE' or 'ACC'".format(ltype))

    if ltype == 'ACC':
        pred = pred_binarize(pred)

    pred_t = torch.tensor(pred).float()
    ground_t = torch.tensor(ground).float()

    # pred_binarize flattens (N, 1) predictions to (N,). Comparing that with an
    # (N, 1) target would make mse_loss broadcast to (N, N) and average over
    # every prediction/target pair, so align the layouts first.
    if pred_t.shape != ground_t.shape and pred_t.numel() == ground_t.numel():
        pred_t = pred_t.reshape(ground_t.shape)

    mse = F.mse_loss(pred_t, ground_t).numpy()
    if ltype == 'MSE':
        return mse
    return 1 - mse
    
def fairness_dp(pred, ground, d, patts):
    '''Compute demographic parity, i.e. p(y_hat = 1 | a) for every value a of
    the protected attribute.

    :param pred: vector, binary entries (np[float])
    :param ground: vector, binary entries (np[float]); currently unused
    :param d: dataset (pandas df) holding one indicator column per non-dummy
              protected value
    :param patts: protected-attribute dict {cat: [all orig columns]}; must
                  contain exactly one category
    :return: dict {column name: fraction of positive predictions in that
             subpopulation}; NaN for a subpopulation with no rows
    '''
    # Get the protected attribute columns
    assert len(patts.keys()) == 1
    protected = next(iter(patts.values()))

    # Columns carrying '_DUMmY' are not looked up in `d`: that subpopulation is
    # the set of rows where every other protected column is 0.
    real_cols = [a for a in protected if '_DUMmY' not in a]

    # Compute p(y_hat=1 | a) for all a (demographic parity)
    probs = {}
    for aval in protected:
        if '_DUMmY' in aval:
            subpop = (d[real_cols] == 0).all(axis=1)
        else:
            subpop = (d[aval] == 1)

        selected = pred[subpop]
        if len(selected) == 0:
            # Empty subpopulation: report NaN explicitly instead of dividing 0/0.
            probs[aval] = np.nan
        else:
            probs[aval] = selected.sum() / len(selected)
    return probs
    
    #Compute p(y_hat=1 | a)  Va  (equalized odds)
    
    

In [None]:
def df_subset(df, subset):
    '''Get the rows of df whose columns named in subset equal the given values.

    :param df: Dataframe (pandas); it is not modified
    :param subset: col_name:value pairs (pandas Series, or any mapping
                   exposing .items())
    :return: a new, independent Dataframe with the matching rows (original
             index labels are kept). NaN values never match, since NaN != NaN.
    '''
    selected = df
    # Series.iteritems() was removed in pandas 2.0; .items() works everywhere.
    for col, val in subset.items():
        selected = selected[selected[col] == val]
    # Copy so the caller can modify the result without touching `df`.
    return selected.copy()

def get_dset_fname(dset, b):
    '''Return the path of the csv file backing a dataset.

    :param dset: dataset name, 'adult' or 'german'
    :param b: base directory that contains the 'data' folder
    :return: path <b>/data/<csv file of the dataset>
    :raises Exception: for any other dataset name
    '''
    csv_by_dataset = (
        ('adult', 'adult.csv'),
        ('german', 'germanCredit.csv'),
    )
    for known_name, csv_name in csv_by_dataset:
        if dset == known_name:
            return join(b, 'data', csv_name)

    raise Exception('Dataset unimplemented')

In [None]:
def generate_all_existing_results(allcols, ags):
    '''Collect the distinct parameter combinations present across algorithms.

    :param allcols: A list of the features (columns) to be included
    :param ags: A list of paramdfs, one for each algorithm
    :return: Dataframe restricted to allcols with duplicate rows removed
             (first occurrence kept). With a single non-empty paramdf its
             index is kept; with several, rows are renumbered 0..n-1 before
             duplicates are dropped. If there is nothing to collect, an empty
             Dataframe with columns allcols is returned.
    '''
    frames = [param_df[allcols] for param_df in ags]
    frames = [frame for frame in frames if not frame.empty]

    if not frames:
        return pd.DataFrame(columns=list(allcols))

    # DataFrame.append was removed in pandas 2.0; concatenate once instead of
    # growing the frame inside a loop.
    if len(frames) == 1:
        combined = frames[0]
    else:
        combined = pd.concat(frames, ignore_index=True)

    return combined[~combined.duplicated()]
    
    
def generate_results(fixed, compared):
    '''Build the grid of experiment settings: one row per combination of the
    compared parameters, each row also carrying all the fixed parameters.

    :param fixed: A list of (pname, pval) pairs that are fixed across exps
    :param compared: A dictionary of pname:full range of possible values in experiment
    :return: Dataframe whose columns are the fixed parameter names followed by
             the compared parameter names, with rows in itertools.product order
    '''
    # Single-row frame holding the fixed values. Built as object dtype from
    # the start so that mixed string/number values are stored as given,
    # rather than assigning strings into a float NaN Series.
    fixed_names = [f[0] for f in fixed]
    fixed_values = [f[1] for f in fixed]
    results = pd.DataFrame([fixed_values], columns=fixed_names, dtype=object)

    # Every combination of the compared parameter values
    compared_results = pd.DataFrame(
        list(itertools.product(*[compared[cat] for cat in compared])),
        columns=list(compared.keys()),
    )

    # Cross join through a constant helper column; the name is chosen so that
    # it cannot clash with a real parameter such as 'key'.
    join_col = '__cross_join_key__'
    results[join_col] = 0
    compared_results[join_col] = 0
    results = results.merge(compared_results, on=join_col, how='inner')

    return results.drop(columns=join_col)

In [None]:
def _predict_with_algo(al, rel, data):
    '''Load the stored model of algorithm `al` from the single-row parameter
    entry `rel` and return its predictions on `data`.

    NOTE: the model files are read with pd.read_pickle / torch.load, which can
    execute arbitrary code -- only point this at result files you trust.

    :raises ValueError: if `al` is not one of 'icp', 'irm', 'linreg'
    '''
    entry_id = rel.index[0]

    if al == 'icp':
        model = models.InvariantCausalPrediction()
        coeffs = pd.read_pickle(rel.loc[entry_id, 'coeffs'])
        print(coeffs)
        return model.predict(data, coeffs)

    if al == 'irm':
        model = models.InvariantRiskMinimization()
        phi = torch.load(rel.loc[entry_id, 'phi'])
        # Unlike the other two models, IRM is given the raw value array.
        return model.predict(data.values, phi)

    if al == 'linreg':
        model = models.Linear()
        regressors = pd.read_pickle(rel.loc[entry_id, 'linregressors'])
        return model.predict(data, regressors)

    raise ValueError('Unsupported algorithm: {}'.format(al))


def compute_results(algos, resdf, orig_cols, loss_fn='ACC'):
    '''Evaluate every stored model and record its error and fairness in resdf.

    For each algorithm in `algos`, the columns '<algo>-error' and
    '<algo>-fairness' are (re)created in `resdf`, then filled row by row for
    every experiment setting that has a matching entry in the algorithm's
    parameter table. Rows without a matching entry keep NaN.

    :param algos: dict {algo name: {'params': parameter Dataframe, ...}}
    :param resdf: Dataframe of experiment settings; MODIFIED IN PLACE
    :param orig_cols: columns of resdf that identify an experiment setting
    :param loss_fn: loss type forwarded to compute_loss ('ACC' or 'MSE')
    :return: None
    :raises ValueError: if an algorithm name is not supported
    '''
    reddata = -1

    for al in algos.keys():  # Enumerate through algos
        error_col = '{}-error'.format(al)
        fairness_col = '{}-fairness'.format(al)

        # Start from fresh result columns (placed last) so that re-running
        # this function never mixes in values from a previous run.
        stale = [c for c in (error_col, fairness_col) if c in resdf.columns]
        if stale:
            resdf.drop(stale, axis='columns', inplace=True)
        resdf[error_col] = np.nan
        # Fairness values are strings, so this column holds objects from the start.
        resdf[fairness_col] = np.nan
        resdf[fairness_col] = resdf[fairness_col].astype(object)

        for resid, row in resdf.iterrows():
            # Get row-associated entry in param dframe
            rel = df_subset(algos[al]['params'], row[orig_cols])
            assert rel.shape[0] <= 1  # Guarantee at most one matching entry
            if rel.shape[0] == 0:
                continue

            data, y_all, d_atts = dp.adult_dataset_processing(
                get_dset_fname(row['Dataset'], basedir),
                proc_fteng(row['Fteng']),
                reduce_dsize=reddata,
                bin=row['Bin'])

            # Compute Predictions
            predictions = _predict_with_algo(al, rel, data)
            if al == 'icp' and predictions.empty:
                # No predictions from ICP: mark both results as 'NA'.
                resdf[error_col] = resdf[error_col].astype(object)
                resdf.loc[resid, error_col] = 'NA'
                resdf.loc[resid, fairness_col] = 'NA'
                continue

            # Compute Metrics on Predictions
            ground = np.expand_dims(y_all.values, axis=1)
            error = compute_loss(predictions.values, ground, ltype=loss_fn)
            fairness = str(fairness_dp(pred_binarize(predictions.values), ground,
                                       data, {'race': d_atts['race']}))

            # Save computed values to resdf
            resdf.loc[resid, error_col] = float(error)
            resdf.loc[resid, fairness_col] = fairness
               

# Evaluating on Invariance Algorithms 

In [None]:
# Parameters held constant across the invariance experiments.
invar_FIXED = [
    ['Dataset', 'adult'],
    ['ReduceDsize', 10000],
    ['Eq_Estrat', -1],
]

# Parameters that vary between the invariance experiments.
invar_COMPARED = {
    'Envs': ['workclass', 'native-country'],
    'Seed': [147, 256, 304],
    'Fteng': ['1', '12'],
    'Bin': [1],
}

# Columns identifying one experiment setting: fixed names, then compared names.
invar_orig_cols = [fixed_pair[0] for fixed_pair in invar_FIXED] + list(invar_COMPARED)

# Use the settings that actually exist in the stored parameter tables.
# Alternative (full grid): invar_results = generate_results(invar_FIXED, invar_COMPARED)
invar_results = generate_all_existing_results(
    invar_orig_cols,
    [invariance_algos[name]['params'] for name in invariance_algos],
)

invar_results.head(25)

In [None]:
# Fill invar_results in place with the '<algo>-error' and '<algo>-fairness'
# columns for every invariance algorithm (error measured as 'ACC').
compute_results(invariance_algos, invar_results, invar_orig_cols, loss_fn='ACC')

In [None]:
# Widen the column display limit -- presumably so the stringified fairness
# dictionaries are shown in full -- then preview the results.
pd.options.display.max_colwidth = 400
invar_results.head(10)

# Evaluating on Non-Invariance Algorithms

In [None]:
# Parameters held constant across the non-invariance experiments.
var_FIXED = [
    ['Dataset', 'adult'],
    ['ReduceDsize', 10000],
    ['Bin', 1],
]

# Parameters that vary between the non-invariance experiments.
var_COMPARED = {
    'Fteng': ['1', '12'],
    'Seed': [147, 256, 304],
}

# Full grid of settings; its columns identify one experiment setting.
noninvar_results = generate_results(var_FIXED, var_COMPARED)
noninvar_orig_cols = noninvar_results.columns

noninvar_results.head(10)

In [None]:
# Fill noninvar_results in place with the '<algo>-error' and '<algo>-fairness'
# columns for every non-invariance algorithm.
# NOTE(review): the error here is 'MSE' while the invariance algorithms above
# are scored with 'ACC', so the two error columns are not directly comparable
# -- confirm this is intended.
compute_results(non_invariance_algos, noninvar_results, noninvar_orig_cols, loss_fn='MSE')

In [None]:
# Widen the column display limit -- presumably so the stringified fairness
# dictionaries are shown in full -- then preview the results.
pd.options.display.max_colwidth = 400
noninvar_results.head(10)