In [1]:
import pandas as pd
import clinicaldg.cxr.Constants as cxrConstants
import clinicaldg.cxr.process as cxrProcess

In [2]:
df_paths = cxrConstants.df_paths
df_paths

{'MIMIC': '/scratch/rc4499/thesis/data/mimic-cxr/clinicaldg/preprocessed.csv',
 'CXP': '/scratch/rc4499/thesis/data/chexpert/clinicaldg/preprocessed.csv',
 'NIH': '/scratch/rc4499/thesis/data/chestxray8/clinicaldg/preprocessed.csv',
 'PAD': '/scratch/rc4499/thesis/data/padchest/clinicaldg/preprocessed.csv'}

In [3]:
df = pd.read_csv(df_paths["MIMIC"])

for env in df_paths:
    df = pd.read_csv(df_paths[env])
    print(env)
    print(len(df))
    print(df[df["Pneumonia"] > 0]["Pneumonia"].count())

MIMIC
376206
25065
CXP
223648
6047
NIH
112120
1431
PAD
144494
7673


In [5]:
dfs = {}
for env in cxrConstants.df_paths:
    func = cxrProcess.get_process_func(env)
    df_env = func(pd.read_csv(cxrConstants.df_paths[env]), only_frontal = True)
    print(env, get_prop(df_env))
    train_df, valid_df, test_df = cxrProcess.split(df_env)
    dfs[env] = {
        'all': df_env,
        'train': train_df,
        'val': valid_df,
        'test': test_df
    }

MIMIC 0.07307391379302723
CXP 0.025103727766877873
NIH 0.012928113904724047
PAD 0.05154780337262089


In [48]:
def get_prop(df, column="Pneumonia"):
    num_instances = len(df)
    num_diseased = df[df[column] == 1][column].count()
    return num_diseased / (num_instances - num_diseased)

def get_resample_class(orig_prop, new_prop, resample_method):
    if new_prop > orig_prop:
        if resample_method == "over":
            return 1
        else:
            return 0
    if new_prop < orig_prop:
        if resample_method == "under":
            return 1
        else:
            return 0
        
def calculate_num_resample(df, orig_prop, new_prop, resample_method):
    pass

def balance_proportion(orig_df, new_df, resample_method="over", column="Pneumonia"):
    orig_df = orig_df.fillna(0.0)
    orig_prop = get_prop(orig_df, column)
    new_prop = get_prop(new_df, column)
    assert resample_method in ["over", "under"]
    resample_class = get_resample_class(orig_prop, new_prop, resample_method)
    print(f"Resampling '{column}' via '{resample_method}' on class {resample_class} from {orig_prop} to {new_prop}")
    
    # Estimate the number of items we'll need to resample
    df_diseased = orig_df[orig_df[column] == 1.0]
    df_normal = orig_df[orig_df[column] == 0.0]
    num_diseased = len(df_diseased)
    num_normal = len(df_normal)
    assert num_diseased + num_normal == len(orig_df)
    
    if resample_method == "over":
        if resample_class == 0:
            new_num_normal = int(num_diseased / new_prop)
            print(f"Resampling normal samples from {num_normal} to {new_num_normal}")
            df_normal_rs = df_normal.sample(new_num_normal, replace=True, random_state=0)
            resampled_df = pd.concat([df_normal_rs, df_diseased])
        else:
            # Resample the pneumonia class
            new_num_diseased = int(new_prop * num_normal)
            print(f"Resampling diseased samples from {num_diseased} to {new_num_diseased}")
            df_diseased_rs = df_diseased.sample(new_num_diseased, replace=True, random_state=0)
            resampled_df = pd.concat([df_normal, df_diseased_rs])
    
    resampled_df.sort_index(inplace=True)
    print(f"New df proportion: {get_prop(resampled_df, column)}")
    return resampled_df
            
balance_proportion(dfs["NIH"]["train"], dfs["PAD"]["test"])

Resampling 'Pneumonia' via 'over' on class 1 from 0.012494579976722426 to 0.05075699814834985
Resampling diseased samples from 1095 to 4448
New df proportion: 0.05075423902873183


Unnamed: 0,subject_id,path,Sex,Age,env,frontal,study_id,No Finding,Atelectasis,Cardiomegaly,Effusion,Pneumonia,Pneumothorax,Consolidation,Edema
3,2,/chestxray8/images/00000002_000.png,M,80-,NIH,True,2,True,False,False,False,False,False,False,False
4,3,/chestxray8/images/00000003_001.png,F,60-80,NIH,True,3,False,False,False,False,False,False,False,False
5,3,/chestxray8/images/00000003_002.png,F,60-80,NIH,True,3,False,False,False,False,False,False,False,False
6,3,/chestxray8/images/00000003_003.png,F,60-80,NIH,True,3,False,False,False,False,False,False,False,False
7,3,/chestxray8/images/00000003_004.png,F,60-80,NIH,True,3,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112115,30801,/chestxray8/images/00030801_001.png,M,20-40,NIH,True,30801,False,False,False,False,True,False,False,False
112115,30801,/chestxray8/images/00030801_001.png,M,20-40,NIH,True,30801,False,False,False,False,True,False,False,False
112116,30802,/chestxray8/images/00030802_000.png,M,20-40,NIH,True,30802,True,False,False,False,False,False,False,False
112117,30803,/chestxray8/images/00030803_000.png,F,40-60,NIH,True,30803,True,False,False,False,False,False,False,False


In [33]:
dfs["CXP"]["train"]

Unnamed: 0,subject_id,path,Sex,Age,env,frontal,study_id,No Finding,Atelectasis,Cardiomegaly,Effusion,Pneumonia,Pneumothorax,Consolidation,Edema
0,1,/CheXpert-v1.0/train/patient00001/study1/view1...,F,60-80,CXP,True,patient00001/study1,1.0,,,,,0.0,,
4,3,/CheXpert-v1.0/train/patient00003/study1/view1...,M,40-60,CXP,True,patient00003/study1,,,,,,0.0,,1.0
5,4,/CheXpert-v1.0/train/patient00004/study1/view1...,F,20-40,CXP,True,patient00004/study1,1.0,,,0.0,,,0.0,
7,5,/CheXpert-v1.0/train/patient00005/study1/view1...,M,20-40,CXP,True,patient00005/study1,1.0,,0.0,0.0,,,0.0,
9,5,/CheXpert-v1.0/train/patient00005/study2/view1...,M,20-40,CXP,True,patient00005/study2,,,,,,1.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
223643,64736,/CheXpert-v1.0/valid/patient64736/study1/view1...,F,40-60,CXP,True,patient64736/study1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
223644,64737,/CheXpert-v1.0/valid/patient64737/study1/view1...,M,60-80,CXP,True,patient64737/study1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
223645,64738,/CheXpert-v1.0/valid/patient64738/study1/view1...,M,60-80,CXP,True,patient64738/study1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
223646,64739,/CheXpert-v1.0/valid/patient64739/study1/view1...,F,40-60,CXP,True,patient64739/study1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

def balance_df_label(df, sampler, label_bal=0.05154780337262089, invert=False):
    target = df["Pneumonia"] == (1 if not invert else 0)
    rus = sampler(random_state=42, sampling_strategy=label_bal if not invert else 1-label_bal - 0.23)
    res_df, _ = rus.fit_resample(df, target)

    print(f"Previous pneumonia prop: {get_pneumonia_prop(df)} with {len(df)} instances")
    print(f"Resampled pneumonia prop: {get_pneumonia_prop(res_df)} with {len(res_df)} instances")

    return res_df

mimic_balanced = balance_df_label(dfs["MIMIC"], RandomOverSampler, invert=True)
cxp_balanced = balance_df_label(dfs["CXP"], RandomOverSampler, invert=False)

# # Balance the size of the two datasets
# n = len(cxp_balanced)
# mimic_balanced = mimic_balanced.sample(n)

Previous pneumonia prop: 0.07307391379302723 with 242754 instances
Resampled pneumonia prop: 0.05094534416074703 with 341016 instances
Previous pneumonia prop: 0.025103727766877873 with 191229 instances
Resampled pneumonia prop: 0.05154760756060167 with 196162 instances


In [16]:
cxp_balanced.sample(341016, replace=True, random_state=42)

Unnamed: 0,subject_id,path,Sex,Age,env,frontal,study_id,No Finding,Atelectasis,Cardiomegaly,Effusion,Pneumonia,Pneumothorax,Consolidation,Edema
121958,35892,/CheXpert-v1.0/train/patient35892/study8/view1...,M,60-80,CXP,True,patient35892/study8,,,,1.0,,,,1.0
146867,41582,/CheXpert-v1.0/train/patient41582/study2/view1...,M,60-80,CXP,True,patient41582/study2,,,,1.0,,,,
131932,37879,/CheXpert-v1.0/train/patient37879/study5/view1...,F,40-60,CXP,True,patient37879/study5,1.0,,,0.0,,,,0.0
103694,31362,/CheXpert-v1.0/train/patient31362/study1/view1...,M,40-60,CXP,True,patient31362/study1,,0.0,,,,0.0,,
119879,35518,/CheXpert-v1.0/train/patient35518/study1/view1...,M,60-80,CXP,True,patient35518/study1,,,,0.0,,0.0,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63927,19383,/CheXpert-v1.0/train/patient19383/study3/view1...,F,60-80,CXP,True,patient19383/study3,1.0,,,,,,,0.0
130112,37498,/CheXpert-v1.0/train/patient37498/study2/view1...,F,60-80,CXP,True,patient37498/study2,,,,1.0,,,,1.0
75155,22831,/CheXpert-v1.0/train/patient22831/study3/view1...,M,80-,CXP,True,patient22831/study3,,1.0,,0.0,,0.0,0.0,
136734,39007,/CheXpert-v1.0/train/patient39007/study9/view1...,M,40-60,CXP,True,patient39007/study9,,,,1.0,,0.0,,


In [14]:
TRAIN_ENVS = ["MIMIC", "CXP"]

balance_proportion(dfs["MIMIC"], 
# for env in TRAIN_ENVS:
#     dfs = dfs[env]
#     # Balance the train and validation split in this environment
#     train_env = dfs["train"]
#     train_balanced = 

{'all':        subject_id                                               path Sex  \
0        10000032  /mimic-cxr/p10/p10000032/s50414267/02aa804e-bd...   F   
2        10000032  /mimic-cxr/p10/p10000032/s53189527/2a2277a9-b0...   F   
4        10000032  /mimic-cxr/p10/p10000032/s53911762/68b5c4b1-22...   F   
5        10000032  /mimic-cxr/p10/p10000032/s53911762/fffabebf-74...   F   
6        10000032  /mimic-cxr/p10/p10000032/s56699142/ea030e7a-2e...   F   
...           ...                                                ...  ..   
376200   19999733  /mimic-cxr/p19/p19999733/s57132437/3fcd0406-9b...   F   
376201   19999733  /mimic-cxr/p19/p19999733/s57132437/428e2c18-57...   F   
376203   19999987  /mimic-cxr/p19/p19999987/s55368167/58766883-37...   F   
376204   19999987  /mimic-cxr/p19/p19999987/s58621812/7ba273af-3d...   F   
376205   19999987  /mimic-cxr/p19/p19999987/s58971208/1a1fe7e3-cb...   F   

          Age    env  frontal  study_id  No Finding  Atelectasis  \
0       40-

KeyError: 'CXP'