In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

pd.set_option('display.max_rows', None)

from termcolor import colored

In [2]:
def get_clean(data):
    """Reduce a label frame to 'Path', 'Frontal/Lateral' and five binary findings.

    The work is done on a copy, so the DataFrame passed in by the caller is
    left untouched.

    Steps:
      1. NaN entries in every column from position 5 onward are set to 0.
      2. The value -1 is mapped to 0 for Cardiomegaly, Consolidation and
         Pleural Effusion, and to 1 for Edema and Atelectasis.
      3. Only 'Path', 'Frontal/Lateral' and the five finding columns are kept.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'Path', 'Frontal/Lateral', 'Cardiomegaly', 'Edema',
        'Consolidation', 'Atelectasis' and 'Pleural Effusion'.
        NOTE(review): step 1 assumes the finding columns sit at position 5
        or later -- confirm against the CSV layout.

    Returns
    -------
    pandas.DataFrame
        A new seven-column frame; the input is not modified.
    """
    # Copy up front: the .loc assignments below would otherwise write straight
    # into the caller's DataFrame.
    data = data.copy()

    label_cols = data.columns[5:]
    data.loc[:, label_cols] = data.loc[:, label_cols].fillna(0)

    # -1 -> 0 for these findings.
    zeros = ['Cardiomegaly', 'Consolidation', 'Pleural Effusion']
    data.loc[:, zeros] = data.loc[:, zeros].replace({-1: 0})

    # -1 -> 1 for these findings.
    ones = ['Edema', 'Atelectasis']
    data.loc[:, ones] = data.loc[:, ones].replace({-1: 1})

    return data[['Path', 'Frontal/Lateral', 'Cardiomegaly', 'Edema', 'Consolidation',
                 'Atelectasis', 'Pleural Effusion']].copy()

def get_stats(data):
    """Print a summary report for one cleaned-dataset variant.

    `data` is the value interpolated into the two file names read here,
    ``not_clean_{data}.csv`` and ``clean_{data}.csv``. The report prints:
    the row count of the not-clean file, the per-finding column sums of the
    cleaned frame (after `get_clean`), the number of rows positive for both
    Atelectasis and Cardiomegaly, and the distribution of positive findings
    per row. Nothing is returned.
    """
    narrow_rule, wide_rule = '-'*40, '-'*60

    print(colored(f"for clean_{data}:", 'blue'))
    print('='*30)

    # Rows that were set aside as damaged images.
    rejected = pd.read_csv(f'../../Data/Final Data/clean data/not_clean_{data}.csv')
    print(f"{colored(len(rejected), 'red')} damaged images have been removed")

    # Usable rows, reduced to Path / view / five binary findings.
    clean = get_clean(pd.read_csv(f'../../Data/Final Data/clean data/clean_{data}.csv'))

    print(narrow_rule)
    print(clean.iloc[:, 2:].sum(axis=0))

    print(narrow_rule)
    both = (clean['Atelectasis'] == 1) & (clean['Cardiomegaly'] == 1)
    n_both = len(clean[both])
    print(f"There are {colored(n_both, 'magenta')} images that have both {colored('Atelectasis', on_color='on_yellow')} and {colored('Cardiomegaly', on_color='on_yellow')}")

    print(wide_rule)
    # Stored as a column so the printed Series carries the 'num_inter' label.
    clean['num_inter'] = clean.iloc[:, 2:].sum(axis=1)
    print(clean['num_inter'].value_counts())

## Analysis

In [3]:
get_stats(4)

[34mfor clean_4:[0m
[31m302[0m damaged images have been removed
----------------------------------------
Cardiomegaly        26960.0
Edema               65164.0
Consolidation       14760.0
Atelectasis         67026.0
Pleural Effusion    86061.0
dtype: float64
----------------------------------------
There are [35m8051[0m images that have both [43mAtelectasis[0m and [43mCardiomegaly[0m
------------------------------------------------------------
1.0    79866
0.0    66043
2.0    54956
3.0    18963
4.0     3116
5.0      168
Name: num_inter, dtype: int64


In [10]:
# "Ignore" variant of the training labels: rows carrying a -1 in any kept
# column are dropped instead of being mapped to 0 or 1.
ignore = pd.read_csv('../../../../Data/CheXpert-v1.0-small/train.csv')

# Missing values in every column from position 5 onward become 0.
finding_cols = ignore.columns[5:]
ignore.loc[:, finding_cols] = ignore.loc[:, finding_cols].fillna(0)

# Keep path, view and the five findings; turn -1 into NaN so dropna() removes
# those rows (rows with a NaN in any kept column are removed as well).
ignore = (
    ignore[['Path', 'Frontal/Lateral', 'Cardiomegaly', 'Edema', 'Consolidation',
            'Atelectasis', 'Pleural Effusion']]
    .copy()
    .replace({-1: np.nan})
    .dropna()
)

ignore.iloc[:, 2:].sum()

Cardiomegaly        19421.0
Edema               40526.0
Consolidation       11056.0
Atelectasis         27147.0
Pleural Effusion    63773.0
dtype: float64

In [11]:
# Cleaned variant 4, reduced to path / view / five binary findings.
clean_4 = get_clean(pd.read_csv('../../Data/Final Data/clean data/clean_4.csv'))

# Take the Cardiomegaly-positive rows from the `ignore` frame and every other
# row from clean_4.
clean_4_no_car = clean_4.loc[clean_4['Cardiomegaly'].ne(1)]
car_ignore = ignore.loc[ignore['Cardiomegaly'].eq(1)]

new_clean = pd.concat([clean_4_no_car, car_ignore])
new_clean.shape

(215573, 7)

In [13]:
new_clean.iloc[:, 2:].sum()

Cardiomegaly        19421.0
Edema               60908.0
Consolidation       14340.0
Atelectasis         62173.0
Pleural Effusion    83287.0
dtype: float64

In [14]:
new_clean.to_csv('../../Data/Final Data/analysis/new_clean.csv', index=False)

# NIH sample

In [6]:
# NOTE(review): absolute, machine-specific Windows path -- this cell cannot run
# on another machine. The other loads in this notebook use relative
# '../../Data/...' paths; consider moving this file under the same data root.
nih = pd.read_csv(r'D:\Graduation Project\Data\NIH\sample_labels.csv')

In [7]:
nih.head()

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImageWidth,OriginalImageHeight,OriginalImagePixelSpacing_x,OriginalImagePixelSpacing_y
0,00000013_005.png,Emphysema|Infiltration|Pleural_Thickening|Pneu...,5,13,060Y,M,AP,3056,2544,0.139,0.139
1,00000013_026.png,Cardiomegaly|Emphysema,26,13,057Y,M,AP,2500,2048,0.168,0.168
2,00000017_001.png,No Finding,1,17,077Y,M,AP,2500,2048,0.168,0.168
3,00000030_001.png,Atelectasis,1,30,079Y,M,PA,2992,2991,0.143,0.143
4,00000032_001.png,Cardiomegaly|Edema|Effusion,1,32,055Y,F,AP,2500,2048,0.168,0.168


In [8]:
nih = nih[['Image Index', 'Finding Labels']].copy()

In [9]:
nih.columns = ['Path', 'Labels']

In [10]:
nih['Frontal/Lateral'] = 'Frontal'

In [11]:
nih['Path'] = 'sample/images/' + nih['Path']

In [12]:
nih['Labels'].value_counts()

No Finding                                                                              3044
Infiltration                                                                             503
Effusion                                                                                 203
Atelectasis                                                                              192
Nodule                                                                                   144
Pneumothorax                                                                             114
Mass                                                                                      99
Consolidation                                                                             72
Effusion|Infiltration                                                                     69
Pleural_Thickening                                                                        65
Atelectasis|Infiltration                                              

In [13]:
columns = ['Cardiomegaly', 'Edema', 'Consolidation', 'Atelectasis', 'Effusion']

# 'Labels' holds '|'-separated finding names (e.g. 'Cardiomegaly|Edema|Effusion').
# get_dummies splits on '|' and matches whole tokens, so a finding whose name
# merely contains another finding's name can never be counted by accident
# (a plain `col in text` substring test could).
label_flags = nih['Labels'].str.get_dummies(sep='|')

for col in columns:
    # A finding that appears in no row has no indicator column -> all zeros.
    nih[col] = label_flags[col] if col in label_flags else 0

In [14]:
nih.drop('Labels', axis=1, inplace=True)

In [15]:
nih.head()

Unnamed: 0,Path,Frontal/Lateral,Cardiomegaly,Edema,Consolidation,Atelectasis,Effusion
0,sample/images/00000013_005.png,Frontal,0,0,0,0,0
1,sample/images/00000013_026.png,Frontal,1,0,0,0,0
2,sample/images/00000017_001.png,Frontal,0,0,0,0,0
3,sample/images/00000030_001.png,Frontal,0,0,0,1,0
4,sample/images/00000032_001.png,Frontal,1,1,0,0,1


In [16]:
nih.iloc[:, 2:].sum(axis=0)

Cardiomegaly     141
Edema            118
Consolidation    226
Atelectasis      508
Effusion         644
dtype: int64

In [17]:
nih.rename(columns={'Effusion':'Pleural Effusion'}, inplace=True)

In [19]:
# NIH rows where Cardiomegaly is the only positive among the finding columns
# (everything from column position 2 onward).
mask_1 = nih.iloc[:, 2:].sum(axis=1).eq(1)
mask_2 = nih['Cardiomegaly'].eq(1)
car_nih = nih[mask_1 & mask_2]

In [20]:
# No earlier cell defines a module-level `clean` (inside get_stats it is a
# local), so load it here to keep the notebook runnable from a fresh kernel.
# NOTE(review): new_clean_r2.csv is the file the Balancing section loads as
# `clean`; its 216940 rows are consistent with the augmented shapes recorded
# in this section -- confirm it is the intended base frame.
clean = pd.read_csv('../../Data/Final Data/analysis/new_clean_r2.csv')
aug_car = pd.concat([clean, car_nih])

In [21]:
aug_car.shape

(217006, 7)

In [22]:
aug_car.to_csv('aug_car_r2.csv', index=False)

In [23]:
# NIH rows where Atelectasis is the only positive among the finding columns
# (everything from column position 2 onward).
mask_1 = nih.iloc[:, 2:].sum(axis=1).eq(1)
mask_2 = nih['Atelectasis'].eq(1)
atl_nih = nih[mask_1 & mask_2]

In [24]:
aug_atl = pd.concat([clean, atl_nih])

In [25]:
aug_atl.shape

(217257, 7)

In [26]:
aug_atl.to_csv('aug_atl_r2.csv', index=False)

In [27]:
aug_data = pd.concat([clean, nih])

In [28]:
aug_data.to_csv('aug_data_r2.csv', index=False)

In [29]:
aug_data.shape

(222546, 7)

In [18]:
# Exactly one positive among the finding columns (position 2 onward); the same
# mask serves both selections below.
mask_1 = nih.iloc[:, 2:].sum(axis=1) == 1

# Cardiomegaly-only rows.
mask_2 = nih['Cardiomegaly'] == 1
car_nih = nih.loc[mask_1 & mask_2, :]

# Atelectasis-only rows.
mask_2 = nih['Atelectasis'] == 1
atl_nih = nih.loc[mask_1 & mask_2, :]

# `clean` must already be loaded before this cell runs.
aug_car_atl = pd.concat([clean, car_nih, atl_nih])

In [19]:
aug_car_atl.shape

(217323, 7)

In [20]:
aug_car_atl.to_csv('aug_car_atl.csv', index=False)

# Balancing

In [5]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

pd.set_option('display.max_rows', None)

In [6]:
# Base frame for the balancing experiments.
# NOTE(review): no cell in this notebook writes new_clean_r2.csv (the Analysis
# section saves 'new_clean.csv') -- confirm where the "_r2" file is produced.
clean = pd.read_csv('../../Data/Final Data/analysis/new_clean_r2.csv')

In [7]:
clean.iloc[:, 2:].sum()

Cardiomegaly        24811.0
Edema               63738.0
Consolidation       14402.0
Atelectasis         62346.0
Pleural Effusion    83441.0
dtype: float64

In [8]:
# Partition `clean`: p1 = rows positive for both Pleural Effusion and
# Atelectasis, p2 = every other row.
mask = clean['Pleural Effusion'].eq(1) & clean['Atelectasis'].eq(1)
clean_p1 = clean[mask]
clean_p2 = clean[~mask]

In [9]:
clean.shape, clean_p1.shape, clean_p2.shape

((216940, 7), (29317, 7), (187623, 7))

In [10]:
29317+187623

216940

In [11]:
clean_p2.iloc[:, 2:].sum()

Cardiomegaly        21029.0
Edema               53716.0
Consolidation       12196.0
Atelectasis         33029.0
Pleural Effusion    54124.0
dtype: float64

In [13]:
# Seeded so the draw is reproducible across kernel restarts.
# NOTE(review): with neither `n` nor `frac` given, sample() returns a single
# row, and clean_p1_sample is not used by the concat in the following cell --
# confirm the intended sample size.
clean_p1_sample = clean_p1.sample(random_state=42)

(29317, 7)

In [12]:
# NOTE(review): this concatenates the full clean_p1 (not clean_p1_sample) with
# clean_p2, i.e. the two halves of the split of `clean`, so the per-finding
# totals below are the same as for `clean` itself -- confirm whether the
# down-sampled subset was meant to be used here.
new_clean_b = pd.concat([clean_p1, clean_p2])
new_clean_b.iloc[:, 2:].sum()

Cardiomegaly        24811.0
Edema               63738.0
Consolidation       14402.0
Atelectasis         62346.0
Pleural Effusion    83441.0
dtype: float64