In [2]:
import pandas as pd
import numpy as np
import shutil
import random
import os

In [3]:
# source directories
masks = 'masks_folder_2'
images = 'train_select_600'

# target directories
train_dir_masks = 'segmentation/masks_train'
val_dir_masks = 'segmentation/masks_val'
test_dir_masks = 'segmentation/masks_test'
train_dir_images = 'segmentation/images_train'
val_dir_images = 'segmentation/images_val'
test_dir_images = 'segmentation/images_test'

# all information about images in dataset
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,image_id,class_name,class_id,rad_id,x_min,y_min,x_max,y_max
0,50a418190bc3fb1ef1633bf9678929b3,No finding,14,R11,,,,
1,21a10246a5ec7af151081d0cd6d65dc9,No finding,14,R7,,,,
2,9a5094b2563a1ef3ff50dc5c7ff71345,Cardiomegaly,3,R10,691.0,1375.0,1653.0,1831.0
3,051132a778e61a86eb147c7c6f564dfe,Aortic enlargement,0,R10,1264.0,743.0,1611.0,1019.0
4,063319de25ce7edb9b1c6b8881290140,No finding,14,R10,,,,


In [4]:
# list of image ids of pictures in train_select_600 folder
dir_list = os.listdir('train_select_600')
train_select_600_ids = [dir_list.split('.')[0] for dir_list in dir_list]
train_select_600_ids

['0005e8e3701dfb1dd93d53e2ff537b6e',
 '0007d316f756b3fa0baea2ff514ce945',
 '000d68e42b71d3eac10ccc077aba07c1',
 '00150343289f317a0ad5629d5b7d9ef9',
 '0046f681f078851293c4e710c4466058',
 '0061cf6d35e253b6e7f03940592cc35e',
 '008b3176a7248a0a189b5731ac8d2e95',
 '009d4c31ebf87e51c5c8c160a4bd8006',
 '00aca42a24e4ea6066cca2546150c36e',
 '00f2f97f74e086e1f82acc285ee4a5c5',
 '010018c93ed33ae56ed048ee54867e46',
 '011244ab511b20130d846f5f8f0c3866',
 '0114c5cd093c742334d5a5fa14140421',
 '011ae9520e81f1efe71c9d954ec07d09',
 '011eb641c20dc4eaa472382849a99dd4',
 '013893a5fa90241c65c3efcdbdd2cec1',
 '014cca776c9bd5da35715042cdd002a0',
 '01546d3e6175ceaabd7d92f0c566579d',
 '01570ee44031e4ebab6031501293bf66',
 '0187da5fd71c7a2917346bf12a67bbe4',
 '018a2fe44c3451281ec338c27ecc9c53',
 '018b348fd699821cdbc5bdc0b329982c',
 '01a3c3d994d85ce5634d2d13c03fd4b0',
 '01b0069a8e0221a610294d4697b4abf9',
 '01b0510a8d3b0860429536653db5eb21',
 '01c2b9fcb0384c84648ed76c736552a8',
 '01cbbeab94b4d2bfd5cd8a467fee46a7',
 

In [5]:
# shuffling pictures
random.seed(4)
random.shuffle(train_select_600_ids)
train_select_600_ids

['9e63db3c1b443ca561e426b798096b8f',
 '74292e695d6b5868b89acf26363ee93e',
 '83e7cd905776606181931c7b695db12f',
 '7d610aae49bf4f979e16c2d8859f5f87',
 'e301d37e30e70046bf72db9a95b2235f',
 '696c8f0c827772481b1f896016ce4472',
 '33854ebb299f5526f8cc9a8506d80e8f',
 'c7ce99e81ceea73f99d919b38d8df460',
 '7a9bbc3d02750c716fa773dcae80363b',
 '9b00eeaa317747fe5d908526e1268393',
 '6900482a91a538ead56b483f77bcf289',
 'dfb9686c9e1146bd8ff746390ad0ab0e',
 '799915888fe8902eddfe9a19627c618c',
 '66a42146929520fe337cad37816f4384',
 '3527884ce43d577c1cc449fc0f17f646',
 '655c037ef7034935898655e0865857e0',
 '6e2a38e4cc5b4cd14dba22285a5cd210',
 '81545a6a71bbc9cb3d50ef6ba9ccaeaf',
 '20394e709ffb7128e582a7b0901dca2d',
 '4004f114ab2ecb224ac7484957b79ea4',
 'a7b7907cb8643cf431677e6163943ab4',
 'acd377b981077dfd46028b37d3efb502',
 '7237fe007c5cab239011e89137eee3a7',
 '4334f287e7a843348a24c4dfa9718d6f',
 '6e469682b53c4c558e23632c4c7e4fe5',
 'e531672ae6083e717cbe83d7fc71ddda',
 '231f7662ddb1e997de7ae4e11840960f',
 

In [6]:
# calculate split indices
train_end = int(len(train_select_600_ids) * 0.85)
validate_end = int(len(train_select_600_ids) * 0.9)

# train-validation-test split
train = train_select_600_ids[:train_end]
validate = train_select_600_ids[train_end:validate_end]
test = train_select_600_ids[validate_end:]

In [7]:
print(len(train))

3033


In [8]:
print(len(validate))

179


In [9]:
print(len(test))

357


In [18]:
# copying masks for train subset
copied_masks_train = []
file_not_found_train = []

for image in train:
    try:
        shutil.copy(f"{masks}/{image}.png", train_dir_masks)
        copied_masks_train.append(image)
    except FileNotFoundError:
        file_not_found_train.append(image)
    
print (f"""{len(copied_masks_train)} images are copied to {train_dir_masks}. 
{len(file_not_found_train)} images could not be copied.""")

2545 images are copied to segmentation/masks_train. 
488 images could not be copied.


In [19]:
# ensuring that all pictures, that could not be copied have No finding class
masks_train_df = df[df['image_id'].isin(file_not_found_train)][['image_id', 'class_name']]
pd.set_option('display.max_rows', None)
masks_train_df

Unnamed: 0,image_id,class_name
67,764205f702d380f8a4da5cffa538f48d,No finding
96,69f5d29210d98df18120534adb990291,No finding
108,c113ff68c59ef8d40cd4c0bd6e021a6e,No finding
190,4fca8a77e10e418e3851c65c0f42fc23,No finding
208,ae46578e2d798368f43af12ffb20c976,No finding
210,55522f91d2da99602696a2c6b0b5f42a,No finding
437,70c8b6a87c8664190d85784a0811a9d2,No finding
476,96289c1a25aca0025127c41f45ed24e0,No finding
609,34fda0402fb015bab250aa7b3f0372f1,No finding
744,20c912c5a00f426f19d54272bd2b4fcd,No finding


In [11]:
# copying masks for validate subset
copied_masks_validate = []
file_not_found_validate = []

for image in validate:
    try:
        shutil.copy(f"{masks}/{image}.png", val_dir_masks)
        copied_masks_validate.append(image)
    except FileNotFoundError:
        file_not_found_validate.append(image)
    
print (f"""{len(copied_masks_validate)} images are copied to {val_dir_masks}. 
{len(file_not_found_validate)} images could not be copied.""")

153 images are copied to segmentation/masks_val. 
26 images could not be copied.


In [12]:
masks_val_df = df[df['image_id'].isin(file_not_found_validate)][['image_id', 'class_name']]
pd.set_option('display.max_rows', None)
masks_val_df

Unnamed: 0,image_id,class_name
775,31449e7891efb4f8be8f7d76bf247914,No finding
1976,fe358fa490f9b97a3be8142961dd69f6,No finding
2368,f5f0ce7c7c2a1958797bf900ede184fa,No finding
3167,d0902c52f5d4f42d3c26cae4418f74d8,No finding
4559,0760fe6c93bbd24e53213405de113a76,No finding
4676,619ca3842430e97bdd23bae5b5322891,No finding
5485,938ce965d372654fbfcc844ecd3e77ca,No finding
5546,2d2095859e039db2d0201950a208dfe1,No finding
5601,2072cdb6aab12d25059bb55e7d832290,No finding
7348,f972faed5e9885f7409133166cf8ed9a,No finding


In [20]:
# copying masks for test subset
copied_masks_test = []
file_not_found_test = []

for image in test:
    try:
        shutil.copy(f"{masks}/{image}.png", test_dir_masks)
        copied_masks_test.append(image)
    except FileNotFoundError:
        file_not_found_test.append(image)
    
print (f"""{len(copied_masks_test)} images are copied to {test_dir_masks}. 
{len(file_not_found_test)} images could not be copied.""")

291 images are copied to segmentation/masks_test. 
66 images could not be copied.


In [21]:
masks_test_df = df[df['image_id'].isin(file_not_found_test)][['image_id', 'class_name']]
pd.set_option('display.max_rows', None)
masks_test_df

Unnamed: 0,image_id,class_name
794,7cd493beeffa8f16927d1a1452b64be4,No finding
1396,1d253ac85579d2171e2a34f88cb1d5b5,No finding
1581,369f57f4b8b15f18a670985874ada820,No finding
1599,da8512183f1a490e9c7b516ae9e0db5d,No finding
1792,2142468bfb44b63785a97096a69b020a,No finding
2452,73c08b6efeac411a439eb28d4c1aa0a1,No finding
2775,91ef2f7201e2b84ad11ac10797dc7356,No finding
3269,bc75c3442c357666f6db26efb97e391d,No finding
3531,6cd46a59ee847f2f63b1cc204aa4984f,No finding
3972,a1e2ef3ddab1424e24479cb95501e667,No finding


In [15]:
# list of image ids of pictures in masks_train, masks_val and masks_test folders
masks_train_list = os.listdir(train_dir_masks)
train_masks_ids = [masks_train_list.split('.')[0] for masks_train_list in masks_train_list]
print("Train: " + str(len(train_masks_ids)))

masks_val_list = os.listdir(val_dir_masks)
val_masks_ids = [masks_val_list.split('.')[0] for masks_val_list in masks_val_list]
print("Validation: " + str(len(masks_val_list)))

masks_test_list = os.listdir(test_dir_masks)
test_masks_ids = [masks_test_list.split('.')[0] for masks_test_list in masks_test_list]
print("Test: " + str(len(test_masks_ids)))

Train: 2545
Validation: 153
Test: 291


In [22]:
# copying images for train subset
copied_images_train = []
file_not_found_image_train = []

for image in train_masks_ids:
    try:
        shutil.copy(f"{images}/{image}.png", train_dir_images)
        copied_images_train.append(image)
    except FileNotFoundError:
        file_not_found_image_train.append(image)
    
print (f"""{len(copied_images_train)} images are copied to {train_dir_images}. 
{len(file_not_found_image_train)} images could not be copied.""")

2545 images are copied to segmentation/images_train. 
0 images could not be copied.


In [17]:
# copying images for validation subset
copied_images_val = []
file_not_found_image_val = []

for image in val_masks_ids:
    try:
        shutil.copy(f"{images}/{image}.png", val_dir_images)
        copied_images_val.append(image)
    except FileNotFoundError:
        file_not_found_image_val.append(image)
    
print (f"""{len(copied_images_val)} images are copied to {val_dir_images}. 
{len(file_not_found_image_val)} images could not be copied.""")

153 images are copied to segmentation/images_val. 
0 images could not be copied.


In [23]:
# copying images for test subset
copied_images_test = []
file_not_found_image_test = []

for image in test_masks_ids:
    try:
        shutil.copy(f"{images}/{image}.png", test_dir_images)
        copied_images_test.append(image)
    except FileNotFoundError:
        file_not_found_image_test.append(image)
    
print (f"""{len(copied_images_test)} images are copied to {test_dir_images}. 
{len(file_not_found_image_test)} images could not be copied.""")

291 images are copied to segmentation/images_test. 
0 images could not be copied.
