In [1]:
# Analyzing results on NIH datasets

# Import necessary packages
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import os
from tqdm import tqdm
from balancersV2 import BinaryBalancer
import updated_tools
import seaborn as sns
from sklearn.metrics import roc_auc_score, confusion_matrix
import random
random.seed(10)

In [2]:
# Function that uses validation df to calculate threshold that maximizes Youdens-Statistic
def get_optimal_j(df):
    thresholds = np.linspace(0,1,501)
    j_list = []
    y = df['y'].values
    for t in thresholds:
        y_hat = (df['y_prob'].values >= t)*1
        tn, fp, fn, tp = confusion_matrix(y, y_hat).ravel()
        tpr = tp/(tp+fn)
        fpr = fp/(fp+tn)
        j_list.append(tpr-fpr)
    return thresholds[np.argmax(np.array(j_list))]

In [3]:
# Counting fairness
# Set relevant variables
folder_name = 'reorganized_nih_results'
columns = ['0/100 f', '0/100 m', '25/75 f', '25/75 m', '50/50 f', '50/50 m', '75/25 f', '75/25 m', '100/0 f', '100/0 m']
models = ['train100%_female_images','train75%_female_images','train50%_female_images','train25%_female_images','train0%_female_images']
condition_names = ['Atelectasis','Cardiomegaly','Effusion','Infiltration','Mass',
                   'Nodule','Pneumonia','Pneumothorax','Consolidation','Edema',
                   'Emphysema','Fibrosis','Pleural_Thickening','Hernia']
num_folds = 20
num_list = [i for i in range(num_folds)]
fold_list = ['Fold_' + str(k) for k in range(num_folds)]

In [4]:
# Calculate optimal thresholds
for i, model in enumerate(models):
    print(model)
    favor_m = []
    favor_f = []
    for condition in tqdm(condition_names):
        bias_tpr = []
        bias_fpr = []
        df_list = []
        j_list = []
        for k in range(num_folds):
            folder_path = os.path.join(folder_name, model, condition)
            df = pd.read_csv(os.path.join(folder_path, fold_list[k]), index_col=[0])
            df_list.append(df)
        for k in range(num_folds):
            i = random.sample(num_list[:k] + num_list[k:],1)[0]
            val = df_list[i]
            j_list.append(get_optimal_j(val))
        dict = {'Fold':fold_list,'Optimal Threshold':j_list}
        pd.DataFrame(dict).to_csv(os.path.join(folder_path, 'optimal_thresholds.csv'), index=False)

train100%_female_images


100%|██████████| 14/14 [14:53<00:00, 63.80s/it]


train75%_female_images


100%|██████████| 14/14 [16:14<00:00, 69.64s/it]


train50%_female_images


100%|██████████| 14/14 [15:05<00:00, 64.71s/it]


train25%_female_images


100%|██████████| 14/14 [14:51<00:00, 63.65s/it]


train0%_female_images


100%|██████████| 14/14 [15:16<00:00, 65.48s/it]
