In [1]:
import shutil
import os
from tqdm import tqdm
import pandas as pd
import numpy as np

In [2]:
label_df = pd.read_csv('../Dataset/CXR8_Data_Entry_2017.csv', index_col=False)

In [3]:
label_map = label_df[['Image Index', 'View Position']] 

In [4]:
label_map['View Position'].value_counts()

View Position
PA    67310
AP    44810
Name: count, dtype: int64

In [5]:
label_map.loc[:,'Inclusion'] = label_map.loc[:,'View Position'] == 'PA'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_map.loc[:,'Inclusion'] = label_map.loc[:,'View Position'] == 'PA'


In [6]:
label_map['Inclusion'].value_counts()

Inclusion
True     67310
False    44810
Name: count, dtype: int64

In [7]:
label_map = dict(zip(label_map['Image Index'], label_map['Inclusion']))

# Binary

In [8]:
train_dir = '../Dataset/images/train_binary_PA'
val_dir = '../Dataset/images/validation_binary_PA'
test_dir = '../Dataset/images/test_caseonly_binary_PA'

In [19]:
class_names = ['Cases', 'Normal']

In [10]:
# creating path to store images in folders(train/val/test) with subfolders named by class-label
for path in [train_dir, val_dir, test_dir]:
    for label in list(class_names): 
        path_label = path + '/' + label
        if os.path.exists(path_label)== False:
            print('Creating ',path_label)
            os.makedirs(path_label)
        else: print('Existed path', path_label)

Existed path Dataset/images/train_binary_PA/Cases
Existed path Dataset/images/train_binary_PA/Normal
Existed path Dataset/images/validation_binary_PA/Cases
Existed path Dataset/images/validation_binary_PA/Normal
Existed path Dataset/images/test_caseonly_binary_PA/Cases
Existed path Dataset/images/test_caseonly_binary_PA/Normal


In [11]:
folders = os.listdir('../Dataset/images/')

In [12]:
old_folder_list = ['train_binary','validation_binary','test_binary']
new_folder_list = ['train_binary_PA','validation_binary_PA','test_binary_PA']

In [59]:
# training set
old = old_folder_list[0]
new = new_folder_list[0]

n = 0
for label in class_names:
    old_path = 'Dataset/images/' + old + '/' + label
    new_path = 'Dataset/images/' + new + '/' + label
    print('Copying ' + old_path + ' to ' + new_path)

    files = os.listdir(old_path)
    for file in tqdm(files):
        if label_map[file]:
            source_file = os.path.join(old_path, file)
            destination_file = os.path.join(new_path, file)
            shutil.copy(source_file, destination_file)
            n += 1 #count how many PA in train

Copying Dataset/images/train_binary/Cases to Dataset/images/train_binary_PA/Cases


100%|██████████| 31391/31391 [10:31<00:00, 49.69it/s]


Copying Dataset/images/train_binary/Normal to Dataset/images/train_binary_PA/Normal


100%|██████████| 44379/44379 [16:53<00:00, 43.79it/s]


In [60]:
print(n)

49402


In [13]:
# validation set
old = old_folder_list[1]
new = new_folder_list[1]

n = 0
for label in class_names:
    old_path = '../Dataset/images/' + old + '/' + label
    new_path = '../Dataset/images/' + new + '/' + label
    print('Copying ' + old_path + ' to ' + new_path)

    files = os.listdir(old_path)
    for file in tqdm(files):
        if label_map[file]:
            source_file = os.path.join(old_path, file)
            destination_file = os.path.join(new_path, file)
            shutil.copy(source_file, destination_file)
            n += 1

Copying Dataset/images/validation_binary/Cases to Dataset/images/validation_binary_PA/Cases


100%|██████████| 4633/4633 [01:37<00:00, 47.45it/s]


Copying Dataset/images/validation_binary/Normal to Dataset/images/validation_binary_PA/Normal


100%|██████████| 6121/6121 [02:30<00:00, 40.61it/s]


In [14]:
print(n)

6812


In [16]:
# test set
old = old_folder_list[2]
new = new_folder_list[2]

n = 0
for label in class_names:
    old_path = '../Dataset/images/' + old + '/' + label
    new_path = '../Dataset/images/' + new + '/' + label
    print('Copying ' + old_path + ' to ' + new_path)

    files = os.listdir(old_path)
    for file in tqdm(files):
        if label_map[file]:
            source_file = os.path.join(old_path, file)
            destination_file = os.path.join(new_path, file)
            shutil.copy(source_file, destination_file)
            n += 1

Copying Dataset/images/test_binary/Cases to Dataset/images/test_binary_PA/Cases


100%|██████████| 15735/15735 [03:54<00:00, 67.05it/s]


Copying Dataset/images/test_binary/Normal to Dataset/images/test_binary_PA/Normal


100%|██████████| 9861/9861 [03:16<00:00, 50.25it/s]


In [17]:
print(n)

11096


In [30]:
49402 + 6812 + 11096

67310

In [20]:
case_grouped_total = 0
for label in class_names:
    old_path = '../Dataset/images/' + 'train_binary' + '/' + label

    old_path_files = os.listdir(old_path)
    
    case_grouped_total += len(old_path_files)
    print(old_path,': ', len(old_path_files))

Dataset/images/train_binary/Cases :  31391
Dataset/images/train_binary/Normal :  44379


In [None]:
31391

### The ViT binary model has computed on PA images only dataset

# Multiclass 
## Haven't finish the following part since the number of images seems incorrect

In [18]:
train_dir = '../Dataset/images/train_caseonly_grouped_PA'
val_dir = '../Dataset/images/validation_caseonly_grouped_PA'
test_dir = '../Dataset/images/test_caseonly_grouped_PA'

In [8]:
class_names = ['Fluid_overload', 'Infection', 'Mass_Like_Lesions', 'Parenchymal_Disease', 
               'Atelectasis', 'Cardiomegaly', 'Pneumothorax', 'Pleural_Thickening']

In [20]:
# creating path to store images in folders(train/val/test) with subfolders named by class-label
for path in [train_dir, val_dir, test_dir]:
    for label in list(class_names): 
        path_label = path + '/' + label
        if os.path.exists(path_label)== False:
            print('Creating ',path_label)
            os.makedirs(path_label)
        else: print('Existed path', path_label)

Creating  Dataset/images/train_caseonly_grouped_PA/Fluid_overload
Creating  Dataset/images/train_caseonly_grouped_PA/Infection
Creating  Dataset/images/train_caseonly_grouped_PA/Mass_Like_Lesions
Creating  Dataset/images/train_caseonly_grouped_PA/Parenchymal_Disease
Creating  Dataset/images/train_caseonly_grouped_PA/Atelectasis
Creating  Dataset/images/train_caseonly_grouped_PA/Cardiomegaly
Creating  Dataset/images/train_caseonly_grouped_PA/Pneumothorax
Creating  Dataset/images/train_caseonly_grouped_PA/Pleural_Thickening
Creating  Dataset/images/validation_caseonly_grouped_PA/Fluid_overload
Creating  Dataset/images/validation_caseonly_grouped_PA/Infection
Creating  Dataset/images/validation_caseonly_grouped_PA/Mass_Like_Lesions
Creating  Dataset/images/validation_caseonly_grouped_PA/Parenchymal_Disease
Creating  Dataset/images/validation_caseonly_grouped_PA/Atelectasis
Creating  Dataset/images/validation_caseonly_grouped_PA/Cardiomegaly
Creating  Dataset/images/validation_caseonly_gro

In [21]:
folders = os.listdir('../Dataset/images/')

In [22]:
old_folder_list = ['train_caseonly_grouped','validation_caseonly_grouped','test_caseonly_grouped']
new_folder_list = ['train_caseonly_grouped_PA','validation_caseonly_grouped_PA','test_caseonly_grouped_PA']

In [24]:
# training set
old = old_folder_list[0]
new = new_folder_list[0]

n = 0
for label in class_names:
    old_path = 'Dataset/images/' + old + '/' + label
    new_path = 'Dataset/images/' + new + '/' + label
    print('Copying ' + old_path + ' to ' + new_path)

    files = os.listdir(old_path)
    for file in tqdm(files):
        if label_map[file]:
            source_file = os.path.join(old_path, file)
            destination_file = os.path.join(new_path, file)
            shutil.copy(source_file, destination_file)
            n += 1 #count how many PA in train

Copying Dataset/images/train_caseonly_grouped/Fluid_overload to Dataset/images/train_caseonly_grouped_PA/Fluid_overload


100%|██████████| 8384/8384 [03:02<00:00, 45.88it/s]


Copying Dataset/images/train_caseonly_grouped/Infection to Dataset/images/train_caseonly_grouped_PA/Infection


100%|██████████| 14259/14259 [06:29<00:00, 36.56it/s]


Copying Dataset/images/train_caseonly_grouped/Mass_Like_Lesions to Dataset/images/train_caseonly_grouped_PA/Mass_Like_Lesions


100%|██████████| 7030/7030 [03:35<00:00, 32.61it/s]


Copying Dataset/images/train_caseonly_grouped/Parenchymal_Disease to Dataset/images/train_caseonly_grouped_PA/Parenchymal_Disease


100%|██████████| 2301/2301 [01:23<00:00, 27.70it/s]


Copying Dataset/images/train_caseonly_grouped/Atelectasis to Dataset/images/train_caseonly_grouped_PA/Atelectasis


100%|██████████| 7250/7250 [03:01<00:00, 39.90it/s]


Copying Dataset/images/train_caseonly_grouped/Cardiomegaly to Dataset/images/train_caseonly_grouped_PA/Cardiomegaly


100%|██████████| 1505/1505 [00:45<00:00, 33.26it/s]


Copying Dataset/images/train_caseonly_grouped/Pneumothorax to Dataset/images/train_caseonly_grouped_PA/Pneumothorax


100%|██████████| 2320/2320 [01:10<00:00, 33.03it/s]


Copying Dataset/images/train_caseonly_grouped/Pleural_Thickening to Dataset/images/train_caseonly_grouped_PA/Pleural_Thickening


100%|██████████| 1954/1954 [01:06<00:00, 29.49it/s]


In [25]:
print(n)

26514


In [26]:
# validation set
old = old_folder_list[1]
new = new_folder_list[1]

n = 0
for label in class_names:
    old_path = '../Dataset/images/' + old + '/' + label
    new_path = '../Dataset/images/' + new + '/' + label
    print('Copying ' + old_path + ' to ' + new_path)

    files = os.listdir(old_path)
    for file in tqdm(files):
        if label_map[file]:
            source_file = os.path.join(old_path, file)
            destination_file = os.path.join(new_path, file)
            shutil.copy(source_file, destination_file)
            n += 1

Copying Dataset/images/validation_caseonly_grouped/Fluid_overload to Dataset/images/validation_caseonly_grouped_PA/Fluid_overload


100%|██████████| 1306/1306 [00:25<00:00, 52.20it/s]


Copying Dataset/images/validation_caseonly_grouped/Infection to Dataset/images/validation_caseonly_grouped_PA/Infection


100%|██████████| 2154/2154 [00:49<00:00, 43.69it/s]


Copying Dataset/images/validation_caseonly_grouped/Mass_Like_Lesions to Dataset/images/validation_caseonly_grouped_PA/Mass_Like_Lesions


100%|██████████| 1125/1125 [00:29<00:00, 37.73it/s]


Copying Dataset/images/validation_caseonly_grouped/Parenchymal_Disease to Dataset/images/validation_caseonly_grouped_PA/Parenchymal_Disease


100%|██████████| 355/355 [00:14<00:00, 25.23it/s]


Copying Dataset/images/validation_caseonly_grouped/Atelectasis to Dataset/images/validation_caseonly_grouped_PA/Atelectasis


100%|██████████| 1030/1030 [00:22<00:00, 44.81it/s]


Copying Dataset/images/validation_caseonly_grouped/Cardiomegaly to Dataset/images/validation_caseonly_grouped_PA/Cardiomegaly


100%|██████████| 202/202 [00:05<00:00, 34.63it/s]


Copying Dataset/images/validation_caseonly_grouped/Pneumothorax to Dataset/images/validation_caseonly_grouped_PA/Pneumothorax


100%|██████████| 317/317 [00:09<00:00, 32.82it/s]


Copying Dataset/images/validation_caseonly_grouped/Pleural_Thickening to Dataset/images/validation_caseonly_grouped_PA/Pleural_Thickening


100%|██████████| 288/288 [00:07<00:00, 38.22it/s]


In [27]:
print(n)

3885


In [28]:
# test set
old = old_folder_list[2]
new = new_folder_list[2]

n = 0
for label in class_names:
    old_path = '../Dataset/images/' + old + '/' + label
    new_path = '../Dataset/images/' + new + '/' + label
    print('Copying ' + old_path + ' to ' + new_path)

    files = os.listdir(old_path)
    for file in tqdm(files):
        if label_map[file]:
            source_file = os.path.join(old_path, file)
            destination_file = os.path.join(new_path, file)
            shutil.copy(source_file, destination_file)
            n += 1

Copying Dataset/images/test_caseonly_grouped/Fluid_overload to Dataset/images/test_caseonly_grouped_PA/Fluid_overload


100%|██████████| 5337/5337 [01:14<00:00, 71.81it/s]


Copying Dataset/images/test_caseonly_grouped/Infection to Dataset/images/test_caseonly_grouped_PA/Infection


100%|██████████| 7671/7671 [01:44<00:00, 73.67it/s]


Copying Dataset/images/test_caseonly_grouped/Mass_Like_Lesions to Dataset/images/test_caseonly_grouped_PA/Mass_Like_Lesions


100%|██████████| 3052/3052 [01:12<00:00, 42.13it/s]


Copying Dataset/images/test_caseonly_grouped/Parenchymal_Disease to Dataset/images/test_caseonly_grouped_PA/Parenchymal_Disease


100%|██████████| 1510/1510 [00:46<00:00, 32.42it/s]


Copying Dataset/images/test_caseonly_grouped/Atelectasis to Dataset/images/test_caseonly_grouped_PA/Atelectasis


100%|██████████| 3279/3279 [00:52<00:00, 62.44it/s]


Copying Dataset/images/test_caseonly_grouped/Cardiomegaly to Dataset/images/test_caseonly_grouped_PA/Cardiomegaly


100%|██████████| 1069/1069 [00:20<00:00, 52.04it/s]


Copying Dataset/images/test_caseonly_grouped/Pneumothorax to Dataset/images/test_caseonly_grouped_PA/Pneumothorax


100%|██████████| 2665/2665 [01:14<00:00, 35.86it/s]


Copying Dataset/images/test_caseonly_grouped/Pleural_Thickening to Dataset/images/test_caseonly_grouped_PA/Pleural_Thickening


100%|██████████| 1143/1143 [00:25<00:00, 44.91it/s]


In [29]:
print(n)

10431


In [31]:
3833 + 988 + 4303 + 7753 + 4788 + 1689 + 1515 + 1645

26514

In [21]:
26514 + 3885 + 10431

40830

In [15]:
class_names = ['Atelectasis',
                 'Cardiomegaly',
                 'Effusion',
                 'Infiltration',
                 'Mass',
                 'Nodule',
                 'Pneumonia',
                 'Pneumothorax',
                 'Consolidation',
                 'Edema',
                 'Emphysema',
                 'Fibrosis',
                 'Pleural_Thickening',
                 'Hernia']

In [16]:
case_grouped_total = 0
for label in class_names:
    old_path = '../Dataset/images/' + 'train_caseonly' + '/' + label
    #new_path = 'Dataset/images/' + 'train_caseonly_grouped_PA' + '/' + label

    old_path_files = os.listdir(old_path)
    #new_path_files = os.listdir(new_path)
    
    case_grouped_total += len(old_path_files)
    print(old_path,': ', len(old_path_files))
    #print(new_path,': ', len(new_path_files))

Dataset/images/train_caseonly/Atelectasis :  7250
Dataset/images/train_caseonly/Cardiomegaly :  1505
Dataset/images/train_caseonly/Effusion :  7475
Dataset/images/train_caseonly/Infiltration :  11958
Dataset/images/train_caseonly/Mass :  3471
Dataset/images/train_caseonly/Nodule :  4067
Dataset/images/train_caseonly/Pneumonia :  761
Dataset/images/train_caseonly/Pneumothorax :  2320
Dataset/images/train_caseonly/Consolidation :  2485
Dataset/images/train_caseonly/Edema :  1225
Dataset/images/train_caseonly/Emphysema :  1236
Dataset/images/train_caseonly/Fibrosis :  1078
Dataset/images/train_caseonly/Pleural_Thickening :  1954
Dataset/images/train_caseonly/Hernia :  131


In [17]:
case_grouped_total

46916

In [None]:
caseonly_grouped = 45003
caseonly = 46916

In [18]:
46916 - 45003

1913