In [1]:
# Analyzing results on NIH datasets

# Import necessary packages
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import os
from tqdm import tqdm
from balancersV2 import BinaryBalancer
import updated_tools
import seaborn as sns
from sklearn.metrics import roc_auc_score, confusion_matrix
import random
from random import sample
random.seed(10)

In [2]:
# Counting fairness
# Shared configuration for the analysis cells below.

# Root directory of the results; model and condition sub-folders live under it
folder_name = 'reorganized_nih_results'

# Headers of the summary table: every metric once for the initial predictions
# and once for the post-correction predictions, in that order.
column_titles = ['Average {} {}'.format(stage, metric)
                 for stage in ('Initial', 'Post')
                 for metric in ('TPR', 'FPR', 'J-stat', 'TPR_1', 'TPR_0', 'FPR_1', 'FPR_0')]

# Model sub-folders, named train<pct>%_female_images
models = ['train{}%_female_images'.format(pct) for pct in (100, 75, 50, 25, 0)]

# Condition sub-folders found under each model folder
condition_names = [
    'Atelectasis', 'Cardiomegaly', 'Effusion', 'Infiltration',
    'Mass', 'Nodule', 'Pneumonia', 'Pneumothorax',
    'Consolidation', 'Edema', 'Emphysema', 'Fibrosis',
    'Pleural_Thickening', 'Hernia',
]

# Folds: integer ids and the matching file names Fold_0 ... Fold_<num_folds - 1>
num_folds = 20
num_list = list(range(num_folds))
fold_list = ['Fold_{}'.format(fold_id) for fold_id in num_list]

In [3]:
# See changes in performance after correcting for fairness

# Metric suffixes, in the order they appear (per stage) in `column_titles`
_RATE_NAMES = ('TPR', 'FPR', 'J-stat', 'TPR_1', 'TPR_0', 'FPR_1', 'FPR_0')


def _extract_rates(balancer):
    """Read overall and per-group rates off a BinaryBalancer.

    Parameters
    ----------
    balancer : BinaryBalancer
        Must expose `overall_rates` and `a_gr_list[0]` / `a_gr_list[1]`,
        each with `.tpr` and `.fpr` attributes.

    Returns
    -------
    dict
        Keyed by the entries of `_RATE_NAMES`; 'J-stat' is overall
        TPR minus overall FPR, and the `_0` / `_1` suffixes refer to
        `a_gr_list[0]` / `a_gr_list[1]`.
    """
    overall = balancer.overall_rates
    return {
        'TPR': overall.tpr,
        'FPR': overall.fpr,
        'J-stat': overall.tpr - overall.fpr,
        'TPR_1': balancer.a_gr_list[1].tpr,
        'TPR_0': balancer.a_gr_list[0].tpr,
        'FPR_1': balancer.a_gr_list[1].fpr,
        'FPR_0': balancer.a_gr_list[0].fpr,
    }


# Make sure the output directory exists before the (slow) loop starts
os.makedirs('tables', exist_ok=True)

# Wrap the list itself (not an enumerate generator) so tqdm knows the total
for model in tqdm(models):
    print(model)
    summary_rows = []  # one dict of averaged metrics per condition
    for condition in condition_names:
        folder_path = os.path.join(folder_name, model, condition)
        threshold_csv = pd.read_csv(os.path.join(folder_path, 'optimal_thresholds.csv'))
        thresholds = threshold_csv['Optimal Threshold'].values

        # Per-fold metric dicts, before ('Initial') and after ('Post') correction
        fold_rates = {'Initial': [], 'Post': []}
        for k in range(num_folds):

            # Load the evaluation fold and initialize balancer
            t = thresholds[k]
            df_test = pd.read_csv(os.path.join(folder_path, fold_list[k]), index_col=[0])
            y = df_test.y.values
            y_hat = (df_test.y_prob.values >= t)*1
            a = df_test.a.values
            balancer = BinaryBalancer(y, y_hat, a, a)

            # Extract tprs and fprs before correction
            fold_rates['Initial'].append(_extract_rates(balancer))

            # Load a *different* fold to learn parameters for the fairness
            # correction. Slicing with k + 1 excludes fold k itself, so the
            # correction is never fit on the data it is evaluated on.
            val_fold = sample(num_list[:k] + num_list[k + 1:], 1)[0]
            df_val = pd.read_csv(os.path.join(folder_path, fold_list[val_fold]), index_col=[0])
            y_val = df_val.y.values
            # Binarized with the same threshold t as the evaluation fold
            y_hat_val = (df_val.y_prob.values >= t)*1
            a_val = df_val.a.values
            balancer_val = BinaryBalancer(y_val, y_hat_val, a_val, a_val)

            # Correct for fairness on validation
            balancer_val.adjust(task='fair')

            # Evaluate on test
            y_fair = balancer_val.predict(y_hat, a)
            fair_balancer = BinaryBalancer(y, y_fair, a, a)

            # Extract tprs and fprs after correction
            fold_rates['Post'].append(_extract_rates(fair_balancer))

        # Average each metric over the folds; keys match `column_titles`
        summary_rows.append({
            'Average {} {}'.format(stage, name): np.mean([rates[name] for rates in per_fold])
            for stage, per_fold in fold_rates.items()
            for name in _RATE_NAMES
        })

    # Make csv file (conditions as the index, columns ordered by `column_titles`)
    correction_table = pd.DataFrame(summary_rows, index=condition_names, columns=column_titles)
    correction_table.to_csv(os.path.join('tables', model + '_correction.csv'), index=True)

0it [00:00, ?it/s]

train100%_female_images


1it [00:32, 32.39s/it]

train75%_female_images


2it [01:03, 31.57s/it]

train50%_female_images


3it [01:33, 30.77s/it]

train25%_female_images


4it [02:04, 31.05s/it]

train0%_female_images


5it [02:34, 30.90s/it]
