In [None]:
import copy 
import os 
from os.path import join
import shutil
import itertools
from collections import Counter
import json
import pickle
import pprint
import pandas as pd
import numpy as np
import pprint
import torch 
import torch.nn.functional as F 
import math 

import matplotlib.pyplot as plt 
plt.rcParams['figure.figsize'] = [6, 8]

In [None]:
import sys
# Root of the causal_discovery codebase. Set the CAUSAL_DISCOVERY_BASEDIR environment
# variable to point at another checkout; the fallback is the path originally hardcoded here.
basedir = os.environ.get(
    'CAUSAL_DISCOVERY_BASEDIR',
    '/Users/RobertAdragna/Documents/School/Fourth_Year/ESC499-Thesis/codebases/causal_discovery')
if basedir not in sys.path:  # re-running the cell must not keep growing sys.path
    sys.path.append(basedir)

import data_processing as dp 
import environment_processing as eproc 
import models 
from utils import proc_fteng

In [None]:
# Show what is in the current working directory (os.listdir() defaults to it).
print(os.listdir())

In [None]:
res_dir = '0602_validation'
invariance_algos = {'icp': {}}
non_invariance_algos = {'linreg': {}}

# For every algorithm, record where its experiment output and processed results live
# (under <cwd>/<res_dir>/<algo>/) and load its parameter table from the pickle next to them.
results_root = join(os.getcwd(), res_dir)
for als in (invariance_algos, non_invariance_algos):
    for al in als:
        algo_root = join(results_root, al)
        entry = als[al]
        entry['expdir'] = join(algo_root, 'causal_discovery')
        entry['processed_dir'] = join(algo_root, 'processed_results')
        entry['params'] = pd.read_pickle(join(algo_root, '{}_paramfile.pkl'.format(al)))

        # The rest of the notebook relies on the processed results being present.
        if not os.path.exists(entry['processed_dir']):
            raise Exception('Directory has not yet been processed')

In [None]:
# Remove the ICP parameter row whose index label is '2'.
# NOTE(review): the reason this run is excluded is not recorded here - confirm.
# errors='ignore' keeps the cell re-runnable: the table is reassigned in place, so a
# second execution would otherwise raise KeyError because the label is already gone.
invariance_algos['icp']['params'] = invariance_algos['icp']['params'].drop('2', axis=0, errors='ignore')
invariance_algos['icp']['params'].head(10)

# Utilities

In [None]:
def pred_binarize(v):
    '''Threshold scores at 0.5: entries >= 0.5 become 1, everything else 0.

    :param v: array-like of scores. A column vector of shape (n, 1) is returned
              as a flat vector of shape (n,); a 1-D input keeps its shape.
    :return: integer numpy array of 0/1 labels
    '''
    scores = np.asarray(v)
    # Vectorised comparison instead of a Python call per row.
    labels = (scores >= 0.5).astype(int)
    # Callers index the result with 1-D boolean masks, so a singleton axis 1 is dropped.
    if labels.ndim >= 2 and labels.shape[1] == 1:
        labels = np.squeeze(labels, axis=1)
    return labels
    
        
def compute_loss(pred, ground, ltype='MSE'):
    '''Compute a loss between a prediction vector and a ground-truth vector.

    :param pred: predictions (array-like)
    :param ground: ground-truth values (array-like) with as many entries as pred
    :param ltype: 'MSE' for mean squared error, or 'ACC' for accuracy, computed as
                  1 - MSE after thresholding pred at 0.5
    :return: the loss as a numpy scalar/0-d array
    :raises ValueError: if ltype is not one of the supported types
    '''
    if ltype not in ('MSE', 'ACC'):
        raise ValueError('Unsupported loss type: {}'.format(ltype))

    if ltype == 'ACC':
        pred = pred_binarize(pred)

    # Flatten both sides: a (n,) prediction against a (n, 1) target would otherwise be
    # broadcast by mse_loss to an n x n comparison and give a wrong value.
    pred_t = torch.tensor(pred).float().reshape(-1)
    ground_t = torch.tensor(ground).float().reshape(-1)
    mse = F.mse_loss(pred_t, ground_t).numpy()

    if ltype == 'MSE':
        return mse
    return 1 - mse
    
def fairness_dp(pred, ground, d, patts, ftype='DP'):
    '''Average pairwise gap of a group-fairness rate across the groups of one protected attribute.

    A rate is computed for every group column listed in patts, and the mean absolute
    difference over all pairs of groups is returned. The rate depends on ftype:
      'DP'  : mean of pred within the group
      'EOP' : mean of pred within the group, restricted to rows where ground == 1
      'CAL' : mean of ground within the group, restricted to rows where pred == 1
    A column whose name contains '_DUMmY' stands for the rows where every other
    group column is 0.

    :param pred: vector, binary entries (np[float])
    :param ground: vector, binary entries (np[float])
    :param d: dataset holding the group indicator columns (pandas df)
    :param patts: dict {cat:[all orig columns]} with exactly one key
    :param ftype: 'DP', 'EOP' or 'CAL'
    :return: average pairwise gap (float); 0.0 when there are fewer than two groups,
             nan when a group has no rows to compute its rate from
    :raises ValueError: if ftype is not one of the supported types
    '''
    if ftype not in ('DP', 'EOP', 'CAL'):
        raise ValueError('Unsupported fairness type: {}'.format(ftype))

    def group_rate(values, mask):
        ''' Mean of values over mask; nan (without a numpy warning) when mask selects nothing '''
        selected = values[mask]
        if len(selected) == 0:
            return float('nan')
        return selected.sum() / len(selected)

    def avg_diff_scores(p):
        ''' Given a dictionary of scores for different sensitive attributes p.keys,
            return the average absolute difference over all pairs of these values '''
        pairs = list(itertools.combinations(list(p.keys()), 2))
        if not pairs:
            return 0.0  # nothing to compare against
        tot = sum(abs(p[first] - p[second]) for first, second in pairs)
        # Divide by the number of pairs (not the number of groups) to get a true average.
        return float(tot / len(pairs))

    #Get the protected attribute columns
    assert len(patts.keys()) == 1
    protected = [patts[cat] for cat in patts.keys()][0]
    non_dummy = [a for a in protected if '_DUMmY' not in a]

    # Work on flat vectors so that (n,) and (n, 1) inputs are masked the same way.
    pred_flat = np.asarray(pred).reshape(-1)
    ground_flat = np.asarray(ground).reshape(-1)

    probs = {}
    for aval in protected:
        if '_DUMmY' in aval:
            subpop = (d[non_dummy] == 0).all(1).values
        else:
            subpop = (d[aval] == 1).values
        subpop = np.asarray(subpop).reshape(-1)

        if ftype == 'DP':
            probs[aval] = group_rate(pred_flat, subpop)
        elif ftype == 'EOP':
            probs[aval] = group_rate(pred_flat, subpop & (ground_flat == 1))
        else:
            # 'CAL': numerator and denominator are both restricted to positively predicted rows.
            probs[aval] = group_rate(ground_flat, subpop & (pred_flat == 1))

    return avg_diff_scores(probs)

In [None]:
def df_subset(df, subset):
    '''Get the rows of df whose columns named in subset equal their respective values.

    :param df: Dataframe (pandas)
    :param subset: Series of col_name:value pairs (pandas series); any object with an
                   items() method, such as a dict, works as well
    :return: a new dataframe with the matching rows; df itself is left untouched
    '''
    keep = np.ones(len(df), dtype=bool)
    # .items() replaces Series.iteritems, which no longer exists in pandas >= 2.0.
    for col, val in subset.items():
        keep &= (df[col] == val).to_numpy()
    return df[keep].copy()

def get_dset_fname(dset, b):
    '''Return the path of the csv file of a known dataset, located in <b>/data.

    :param dset: dataset name, 'adult' or 'german'
    :param b: base directory that contains the data folder
    :raises Exception: for any other dataset name
    '''
    known_files = (('adult', 'adult.csv'), ('german', 'germanCredit.csv'))
    for name, csv_name in known_files:
        if dset == name:
            return join(b, 'data', csv_name)
    raise Exception('Dataset unimplemented')

In [None]:
def generate_all_existing_results(allcols, ags):
    ''' Collect the distinct parameter combinations found across several parameter tables.

        param allcols: A list of the features to be included
        param ags: A list of paramdfs for each algorithm
        return: dataframe restricted to allcols with duplicated rows removed. Rows are
                renumbered when more than one non-empty table is combined; a single
                table keeps its own index.'''
    frames = [param_df[allcols] for param_df in ags]
    if not frames:
        return pd.DataFrame()

    non_empty = [frame for frame in frames if not frame.empty]
    if not non_empty:
        combined = frames[-1]  # nothing to combine, but keep the requested columns
    elif len(non_empty) == 1:
        combined = non_empty[0]
    else:
        # pd.concat replaces DataFrame.append, which no longer exists in pandas >= 2.0.
        combined = pd.concat(non_empty, ignore_index=True)

    # Copy so that callers can add columns to the result without touching the inputs.
    return combined.drop_duplicates().copy()
    
    
def generate_results(fixed, compared):
    '''Build one row per combination of the compared parameter values, with the
    fixed parameters repeated on every row.

    :param fixed: A list of tuples (pname, pval) that are fixed across exps
    :param compared: A dictionary of pname:full range of possible values in experiment
    :return: dataframe whose columns are the fixed names followed by the compared names,
             with rows in itertools.product order
    '''
    fixed_names = [f[0] for f in fixed]
    fixed_values = [f[1] for f in fixed]
    compared_names = list(compared.keys())

    # Build the rows directly. This keeps every fixed value exactly as given (no trip
    # through a float NaN placeholder) and needs no temporary 'key' merge column that
    # could clash with a parameter of that name.
    rows = [fixed_values + list(combo)
            for combo in itertools.product(*[compared[name] for name in compared_names])]

    return pd.DataFrame(rows, columns=fixed_names + compared_names)

In [None]:
def split_var_train_val(mod, ad, ay, seed, val):
    '''Split the data into train and validation parts using the model's own splitters.

    :param mod: object providing get_traindata and get_valdata
    :param ad: full dataset
    :param ay: full targets
    :param seed: seed handed to the splitters (converted with int)
    :param val: validation fraction strictly between 0 and 1 (converted with float),
                or the string '-1' for no validation split
    :return: train_data, train_y_all, val_data, val_y_all; the validation parts are
             empty dataframes when val is '-1'
    '''
    wants_validation = val != '-1'
    if not wants_validation:
        return ad, ay, pd.DataFrame(), pd.DataFrame()

    fraction = float(val)
    assert 0.0 < fraction < 1.0
    train_part = mod.get_traindata(ad, ay, fraction, int(seed))
    val_part = mod.get_valdata(ad, ay, fraction, int(seed))
    train_data, train_y_all = train_part
    val_data, val_y_all = val_part
    return train_data, train_y_all, val_data, val_y_all
        
def split_train_val(ad, ay, atts, val):
    '''Split the data by environment, holding one environment out for validation.

    :param ad: full dataset
    :param ay: full targets
    :param atts: attribute description passed on to eproc.get_environments
    :param val: name of the environment to hold out (plain str without '['),
                or '-1' for no validation split
    :return: train_data, train_y_all, val_data, val_y_all; the validation parts are
             empty dataframes when val is '-1'
    '''
    assert (type(val) is str) and ('[' not in val)

    environments = eproc.get_environments(ad, atts)
    if val != '-1':
        # The environment store is keyed by tuples, hence the 1-tuple lookup.
        held_out = environments.pop((val,))
        in_train = np.logical_not(held_out.values)
        in_val = held_out.values
        return ad[in_train], ay[in_train], ad[in_val], ay[in_val]

    return ad, ay, pd.DataFrame(), pd.DataFrame()
    

def compute_results(algos, resdf, orig_cols, from_scratch=False):
    '''Fill resdf in place with train/val accuracy and fairness metrics for each algorithm.

    For every algorithm in algos and every row of resdf whose metric columns for that
    algorithm are still all null, the matching entry of the algorithm's parameter table is
    looked up, the dataset is reloaded and split, the stored model is used to predict, and
    the metrics are written to the columns '<algo>-<train|val>_error-<loss>' and
    '<algo>-<train|val>_fairness-<type>'. Rows without a matching parameter entry are skipped.

    :param algos: dict {algorithm name: {'params': parameter dataframe, ...}};
                  prediction is implemented for 'icp', 'irm' and 'linreg'
    :param resdf: results dataframe (pandas df), one row per configuration; modified in place
    :param orig_cols: columns of resdf used to find a row's entry in the parameter tables
    :param from_scratch: if True, reset every metric column before computing
    :raises ValueError: if a row has to be evaluated for an algorithm that is not implemented
    '''
    reddata = -1
    loss_types = ['ACC']
    fairness_types = ['DP', 'EOP', 'CAL']
    sens_atts = {'adult': ['race', 'gender', 'relationship'],
                 'german': ['Personal']}

    #Get All The Results Columns of Interest:
    res_cols = []
    for al in algos.keys():
        for m in ['train', 'val']:
            for l in loss_types:
                res_cols.append('{}-{}_error-{}'.format(al, m, l))
            for f in fairness_types:
                res_cols.append('{}-{}_fairness-{}'.format(al, m, f))
    for col in res_cols:  #Add cols to resultsdf
        if (col not in list(resdf.columns)) or from_scratch:
            # object dtype: the cells end up holding floats as well as strings
            # ('NA' markers and the formatted fairness summaries).
            resdf[col] = np.array([np.nan] * len(resdf), dtype=object)

    for al in algos.keys():  #Enumerate through algos
        # Match on the '<algo>-' prefix so an algorithm whose name is contained in another
        # column name cannot pick up foreign columns.
        algo_rescols = [c for c in res_cols if c.startswith('{}-'.format(al))]

        for resid, row in resdf.iterrows():
            if not row[algo_rescols].isnull().all():
                continue  #Metrics for this row have already been computed

            #Get row-associated entry in param dframe
            rel = df_subset(algos[al]['params'], row[orig_cols])
            assert rel.shape[0] <= 1  #Guarantee just one (excluding multi-index mappings)
            if rel.shape[0] == 0:
                continue

            if al not in ('icp', 'irm', 'linreg'):
                raise ValueError('No prediction routine for algorithm: {}'.format(al))

            alldata, all_y_all, d_atts = dp.data_loader(get_dset_fname(row['Dataset'], basedir),
                                                        proc_fteng(row['Fteng']),
                                                        dsize=reddata,
                                                        bin=row['Bin'])

            # NOTE(review): the stored models are read with pickle / torch.load, which can
            # execute arbitrary code - only point this at result files you produced yourself.
            if al == 'linreg':
                model = models.Linear()
                learned_model = [pd.read_pickle(rel.loc[rel.index[0], 'linregressors'])]

                # Fraction-based split. split_var_train_val takes the object providing
                # get_traindata/get_valdata as its first argument.
                # NOTE(review): presumably that object is this model instance - confirm.
                train_data, train_y_all, val_data, val_y_all = split_var_train_val(model, alldata,
                                                                                   all_y_all, row['Seed'],
                                                                                   row['Val'])
                train_predictions = model.predict(train_data, *learned_model)
                val_predictions = model.predict(val_data, *learned_model)

            else:
                # Environment-based split; only rows of the invariance algorithms carry 'Envs'.
                train_data, train_y_all, val_data, val_y_all = split_train_val(alldata, all_y_all,
                                                                               {row['Envs']: d_atts[row['Envs']]},
                                                                               row['Val'])
                if al == 'icp':
                    model = models.InvariantCausalPrediction()
                    learned_model = [pd.read_pickle(rel.loc[rel.index[0], 'coeffs'])]

                    train_predictions = model.predict(train_data, *learned_model)
                    val_predictions = model.predict(val_data, *learned_model)
                else:  # 'irm'
                    model = models.InvariantRiskMinimization()
                    # A model file that fails to load now raises instead of opening a debugger.
                    learned_model = [torch.load(rel.loc[rel.index[0], 'phi'])]

                    train_predictions = model.predict(train_data.values, *learned_model)
                    val_predictions = model.predict(val_data.values, *learned_model)

            #Compute Metrics on Predictions
            for m, predictions, y_all, data in [['train', train_predictions, train_y_all, train_data],
                                                ['val', val_predictions, val_y_all, val_data]]:
                #Manage special case: nothing was predicted for this split
                if predictions.empty:
                    for ltype in loss_types:
                        resdf.loc[resid, '{}-{}_error-{}'.format(al, m, ltype)] = 'NA'
                    for ftype in fairness_types:
                        resdf.loc[resid, '{}-{}_fairness-{}'.format(al, m, ftype)] = 'NA'
                    continue

                # Each loss and each fairness measure is computed once per split.
                for ltype in loss_types:
                    error = compute_loss(predictions.values, y_all.values, ltype=ltype)
                    resdf.loc[resid, '{}-{}_error-{}'.format(al, m, ltype)] = float(error)

                binary_predictions = pred_binarize(predictions.values)
                for ftype in fairness_types:
                    # One line per sensitive attribute of the dataset.
                    full_fair = ''
                    for s in sens_atts[row['Dataset']]:
                        fairness = fairness_dp(binary_predictions, y_all.values,
                                               data, {s: d_atts[s]}, ftype=ftype)
                        full_fair = full_fair + ' {0}:{1:.3f} \n'.format(s, fairness)
                    resdf.loc[resid, '{}-{}_fairness-{}'.format(al, m, ftype)] = full_fair

# Evaluating on Invariance Algorithms 

In [None]:
# Parameters shared by all invariance experiments.
invar_FIXED = [
    ['Dataset', 'adult'],
    ['ReduceDsize', 10000],
    ['Eq_Estrat', -1],
]

# Parameters that differ between experiments, with the values that were run.
invar_COMPARED = {
    'Envs': ['workclass', 'native-country'],
    'Seed': [147, 256, 304],
    'Fteng': ['1', '12'],
    'Bin': [1],
}

# Columns identifying one experiment: fixed names, compared names and the validation setting.
invar_orig_cols = [name for name, _ in invar_FIXED] + list(invar_COMPARED) + ['Val']

# One row per distinct configuration present in the algorithms' parameter tables.
# Alternative (full grid): generate_results(invar_FIXED, invar_COMPARED)
invar_results = generate_all_existing_results(
    invar_orig_cols,
    [algo_entry['params'] for algo_entry in invariance_algos.values()])

invar_results.head(25)

In [None]:
# Fill invar_results in place with the metrics of every algorithm in invariance_algos.
# from_scratch=True resets all metric columns first, so everything is recomputed.
compute_results(invariance_algos, invar_results, invar_orig_cols, from_scratch=True)

In [None]:
# Allow up to 4000 characters per cell so long entries are displayed in full.
pd.set_option('display.max_colwidth', 4000)
invar_results.head(20)

# Evaluating on Non-Invariance Algorithms

In [None]:
# Parameters shared by all non-invariance experiments.
var_FIXED = [
    ['Dataset', 'adult'],
    ['ReduceDsize', 10000],
    ['Bin', 1],
]

# Parameters that differ between experiments, with the values that were run.
var_COMPARED = {
    'Fteng': ['1', '12'],
    'Seed': [147, 256, 304],
}

# Columns identifying one experiment: fixed names, compared names and the validation setting.
var_orig_cols = [name for name, _ in var_FIXED] + list(var_COMPARED) + ['Val']

# One row per distinct configuration present in the algorithms' parameter tables.
# Alternative (full grid): generate_results(var_FIXED, var_COMPARED)
var_results = generate_all_existing_results(
    var_orig_cols,
    [algo_entry['params'] for algo_entry in non_invariance_algos.values()])

var_results.head(10)

In [None]:
# Fill var_results in place with the metrics of every algorithm in non_invariance_algos.
# from_scratch is left at False, so rows whose metric columns are already filled are skipped.
compute_results(non_invariance_algos, var_results, var_orig_cols)

In [None]:
# Allow up to 400 characters per cell when displaying the results.
pd.set_option('display.max_colwidth', 400)
var_results.head(10)

# Save Results to Excel (latex_results folder)

In [None]:
latex_resdir = 'latex_results'
latex_fname = '0602_validation_invar.xlsx'
# to_excel does not create missing folders, so make sure the output directory exists.
os.makedirs(latex_resdir, exist_ok=True)
invar_results.to_excel(os.path.join(latex_resdir, latex_fname))