In [17]:
import os
import numpy as np
import torch
from torch.utils.data import random_split
import shutil
from sklearn.model_selection import KFold

In [19]:
K = 5
base_dir = 'dataset_base'
classes = os.listdir(base_dir)
# classes.reverse()
classes = sorted(classes)
classes

['0_Amiloidose',
 '1_Normal',
 '2_Esclerose_Pura_Sem_Crescente',
 '3_Hipercelularidade',
 '4_Hipercelularidade_Pura_Sem_Crescente']

In [39]:
dest_dir = 'static_kfold'

In [None]:
class_folders = ['0_Amiloidose', '1_Normal', '2_Esclerose_Pura_Sem_Crescente', '3_Hipercelularidade', '4_Hipercelularidade_Pura_Sem_Crescente']
stain_folders = ['AZAN', 'HE', 'PAMS', 'PAS']

In [65]:
if not os.path.exists(dest_dir):
    os.mkdir(dest_dir)
    
generator1 = torch.Generator().manual_seed(42)
for folder in os.listdir(base_dir):
    print(f"Splitting Class: {folder}")
    class_folder = str(os.path.join(base_dir, folder))

    ex_fold = 0
    for stain_folder in os.listdir(class_folder):
        print(f"Splitting: {stain_folder}")
        stain_folder_path = str(os.path.join(class_folder, stain_folder))
        file_list = []
        
        for file in os.listdir(stain_folder_path):
            stain_file_path = str(os.path.join(stain_folder_path, file))
            file_list.append(stain_file_path)
    
        train_part, test_part = random_split(file_list, [0.75, 0.25], generator=generator1)
        
        kf = KFold(n_splits=K, shuffle=True)  
        
        ex_fold = 0
        for fold, (train_idx, val_idx) in enumerate(kf.split(train_part)):
            train_array = np.asarray(train_part)            
            train_elements = train_array[train_idx]
            val_elements = train_array[val_idx]            
            
            train_dest_path = None

            if folder == '0_Amiloidose':            
                train_dest_path = str(os.path.join(dest_dir,  "fold_"+str(fold), 'train' , folder))

            else:
                train_dest_path = str(os.path.join(dest_dir, "fold_"+str(fold), 'train',  '1_Non_Amiloidose'))
            
            if not os.path.exists(str(train_dest_path)):
                os.makedirs(str(train_dest_path))
            
            for train_sample in train_elements:
                file_name = train_sample[len(stain_folder_path):].lstrip(os.sep)
                copy_to = str(os.path.join(train_dest_path, file_name))
                shutil.copy(train_sample, copy_to)

            val_dest_path = None

            if folder == '0_Amiloidose':            
                val_dest_path = str(os.path.join(dest_dir, "fold_" + str(fold), 'val', folder))

            else:
                val_dest_path = str(os.path.join(dest_dir, "fold_" + str(fold), 'val',  '1_Non_Amiloidose'))
            
            if not os.path.exists(str(val_dest_path)):
                os.makedirs(str(val_dest_path))

            for val_sample in val_elements:
                file_name = val_sample[len(stain_folder_path):].lstrip(os.sep)
                copy_to = str(os.path.join(val_dest_path, file_name))
                shutil.copy(val_sample, copy_to)
                
            

        test_dest_path = None
        
        if folder == '0_Amiloidose':
            test_dest_path = str(os.path.join(dest_dir, 'test', folder))
        else:
            test_dest_path = str(os.path.join(dest_dir, 'test', '1_Non_Amiloidose'))

        if not os.path.exists(str(test_dest_path)):
                os.makedirs(str(test_dest_path))
            
        for test_sample in test_part:
            file_name = test_sample[len(stain_folder_path):].lstrip(os.sep)
            copy_to = str(os.path.join(test_dest_path, file_name))
            shutil.copy(test_sample, copy_to)
                
        

Splitting Class: 0_Amiloidose
Splitting: AZAN
Splitting: HE
Splitting: PAMS
Splitting: PAS
Splitting Class: 1_Normal
Splitting: AZAN
Splitting: HE
Splitting: PAMS
Splitting: PAS
Splitting Class: 2_Esclerose_Pura_Sem_Crescente
Splitting: AZAN
Splitting: HE
Splitting: PAMS
Splitting: PAS
Splitting Class: 3_Hipercelularidade
Splitting: AZAN
Splitting: HE
Splitting: PAS
Splitting Class: 4_Hipercelularidade_Pura_Sem_Crescente
Splitting: AZAN
Splitting: PAS


In [3]:
for root, subdirs, files in os.walk(base_dir):
    print(subdirs)

['0_Amiloidose', '1_Normal', '2_Esclerose_Pura_Sem_Crescente', '3_Hipercelularidade', '4_Hipercelularidade_Pura_Sem_Crescente']
['AZAN', 'HE', 'PAMS', 'PAS']
[]
[]
[]
[]
['AZAN', 'HE', 'PAMS', 'PAS']
[]
[]
[]
[]
['AZAN', 'HE', 'PAMS', 'PAS']
[]
[]
[]
[]
['AZAN', 'HE', 'PAS']
[]
[]
[]
['AZAN', 'PAS']
[]
[]


In [20]:
def load_dataset(base_dir):
    
    shuffle=True
    X = []
    Y = []
    processed_image_count = 0
    classes = 0
    for root, subdirs, files in os.walk(base_dir):
        if subdirs:
            classes = subdirs
            print ("[INFO] Processando classe ", classes)
        for filename in files:
            file_path = os.path.join(root, filename)
            # if file_path.startswith(base_dir) is false then AssertionError
            assert file_path.startswith(base_dir)
            suffix = file_path[len(base_dir):]
            suffix = suffix.lstrip(os.sep)
            label = suffix.split(os.sep)[0]

            #print("label: ", label)
            #print("Classes: ", classes.index(label))
            
            X.append(file_path)
            Y.append(classes.index(label))
    
    X = np.array(X)
    Y = np.array(Y)
        
    return X, Y

In [67]:
load_dataset('data/base')

[INFO] Processando classe  ['0_Amiloidose', '1_Non_Amiloidose']


(array(['data/base/0_Amiloidose/0_Amiloidose_AZAN_9.jpg',
        'data/base/0_Amiloidose/0_Amiloidose_PAMS_65.jpg',
        'data/base/0_Amiloidose/0_Amiloidose_HE_59.jpg', ...,
        'data/base/1_Non_Amiloidose/3_Hipercelularidade_HE_5.jpg',
        'data/base/1_Non_Amiloidose/1_Normal_HE_497.jpg',
        'data/base/1_Non_Amiloidose/3_Hipercelularidade_HE_576.jpg'],
       dtype='<U77'),
 array([0, 0, 0, ..., 1, 1, 1]))

In [None]:
def create_file_list(base_dir):
    
    shuffle=True
    X = []
    Y = []
    processed_image_count = 0
    classes = 0
    for root, subdirs, files in os.walk(base_dir):
        for filename in files:
            file_path = os.path.join(root, filename)
            # if file_path.startswith(base_dir) is false then AssertionError
            assert file_path.startswith(base_dir)
            suffix = file_path[len(base_dir):]
            suffix = suffix.lstrip(os.sep)
            label = suffix.split(os.sep)[0]

            #print("label: ", label)
            #print("Classes: ", classes.index(label))
            
            X.append(file_path)
            Y.append(classes.index(label))
    
    X = np.array(X)
    Y = np.array(Y)
        
    return X, Y

In [4]:
os.path.realpath('.')

'/home/alexsandro/dev/notebooks-examples'

In [39]:
import shutil
from os import path
y = []
positive_class = '0_Amiloidose'
negative_class = '1_Non_Amiloidose'
dest_base_dir = 'data'
type_dir = 'train'
dest_dir = dest_base_dir + os.sep + type_dir

if path.exists(dest_dir):
    shutil.rmtree(dest_dir)

for root, subdirs, files in os.walk(base_dir):    
    for filename in files:
        copy_from = os.path.join(root, filename)
        assert copy_from.startswith(base_dir)
        # print(f"Copy from: {copy_from}")
        suffix = copy_from[len(base_dir):]        
        suffix = suffix.lstrip(os.sep)
        
        suffix_parts = suffix.split(os.sep)
        label = suffix_parts[0]
        img_file = suffix_parts[-1]
        # print(f"Label: {label}")
        # print(f"File name: {img_file}")

        if label == positive_class:
            copy_to_dir = dest_base_dir + os.sep + type_dir + os.sep + label
            copy_to = copy_to_dir + os.sep + img_file
        else:
            copy_to_dir = dest_base_dir + os.sep + type_dir + os.sep + negative_class 
            copy_to = copy_to_dir + os.sep + img_file

        if not path.exists(copy_to_dir):
            os.makedirs(copy_to_dir)

        shutil.copy(copy_from, copy_to)

In [8]:
import random
from os import path
import shutil

positive_class = '0_Amiloidose'
negative_class = '1_Non_Amiloidose'

In [9]:
# stain_list = ['AZAN', 'HE', 'PAMS', 'PAS']

In [9]:
def dataset_prep(
        base_dir = 'dataset_base', 
        dest_dir='dataset_run',
        positive_class='0_Amiloidose', 
        negative_class=negative_class,
        
        ):
    
    if path.exists(dest_dir):
        shutil.rmtree(dest_dir)
    
    for root, subdirs, files in os.walk(base_dir):    
        for filename in files:
            copy_from = os.path.join(root, filename)
            assert copy_from.startswith(base_dir)
            # print(f"Copy from: {copy_from}")
            suffix = copy_from[len(base_dir):]        
            suffix = suffix.lstrip(os.sep)
            
            suffix_parts = suffix.split(os.sep)
            label = suffix_parts[0]
            img_file = suffix_parts[-1]
            # print(f"Label: {label}")
            # print(f"File name: {img_file}")
    
            if label == positive_class:
                copy_to_dir = dest_dir + os.sep + label
                copy_to = copy_to_dir + os.sep + img_file
            else:
                copy_to_dir = dest_dir + os.sep + negative_class 
                copy_to = copy_to_dir + os.sep + img_file
    
            if not path.exists(copy_to_dir):
                os.makedirs(copy_to_dir)
    
            shutil.copy(copy_from, copy_to)

In [10]:
dataset_prep()

In [46]:
# from torchvision import datasets

# img_dataset = datasets.ImageFolder(root='data/base')
# img_dataset

Dataset ImageFolder
    Number of datapoints: 4390
    Root location: data/base

In [48]:
# img_dataset.classes

['0_Amiloidose', '1_Non_Amiloidose']

In [49]:
# from torch.utils.data import random_split

In [50]:
# train_set, test_set, val_test = random_split(img_dataset, [0.6, 0.2, 0.2])

In [58]:
# train_set

In [59]:
# test_set

In [60]:
# val_test

In [61]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

In [68]:
classes

['0_Amiloidose',
 '1_Normal',
 '2_Esclerose_Pura_Sem_Crescente',
 '3_Hipercelularidade',
 '4_Hipercelularidade_Pura_Sem_Crescente']

In [17]:
def create_fold(sub_X, sub_y, fold_index, base_folder, dest_folder):
    index = 0
    for img in sub_X:
        label = int(sub_y[index])
        filename = os.path.basename(img)
        save_folder = base_folder + os.sep + 'fold' + str(fold_index + 1) +os.sep + dest_folder + os.sep

        if label == 1:
            save_folder = save_folder + negative_class + os.sep
        elif label == 0:
            save_folder = save_folder + positive_class + os.sep

        if not os.path.exists(str(save_folder)):
            os.makedirs(os.path.dirname(str(save_folder)))

        save_file = save_folder + filename
        save_file = str(save_file)
        shutil.copy(img,save_file)
        index += 1

In [18]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

def generate_Kfolds(base_dir='dataset_run', K=5):
    X, y = load_dataset(base_dir)
    print(f"X: {str(X.shape[0])} Amostras")
    print(f"y: {str(y.shape[0])} Amostras")

    base_folder = base_dir

    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=0)

    for train_index, test_index in sss.split(X, y):
        xtrain_aux, xtest = X[train_index], X[test_index]
        ytrain_aux, ytest = y[train_index], y[test_index]

    print(f"Test set: {str(xtest.shape[0])} images. \n")

    index = 0
    for img in xtest:
        label = int(ytest[index])
        filename = os.path.basename(img)
        save_folder = base_folder + os.sep + 'test' + os.sep

        if label == 1:
            save_folder = save_folder + negative_class + os.sep
        elif label == 0:
            save_folder = save_folder + positive_class + os.sep

        if not os.path.exists(str(save_folder)):
            os.makedirs(os.path.dirname(str(save_folder)))

        save_file = save_folder + filename
        save_file = str(save_file)
        shutil.copy(img,save_file)
        index += 1

    skf = StratifiedKFold(n_splits=K, shuffle=True, random_state=0)

    print('[INFO] Generating Folds')

    for idx, (train_indices, val_indices) in enumerate(skf.split(xtrain_aux, ytrain_aux)):
        print(f"[INFO] Generating data for fold {str(idx+1)} \\ {str(K)}...")

        xtrain, xval = xtrain_aux[train_indices], xtrain_aux[val_indices]
        ytrain, yval = ytrain_aux[train_indices], ytrain_aux[val_indices]
        
        create_fold(xtrain, ytrain, idx, base_folder, 'train')

        create_fold(xval, yval, idx, base_folder, 'val')

        print("\n [INFO] END ")
            


In [21]:
generate_Kfolds()

[INFO] Processando classe  ['0_Amiloidose', '1_Non_Amiloidose']
X: 4389 Amostras
y: 4389 Amostras
Test set: 1098 images. 

[INFO] Generating Folds
[INFO] Generating data for fold 1 \ 5...

 [INFO] END 
[INFO] Generating data for fold 2 \ 5...

 [INFO] END 
[INFO] Generating data for fold 3 \ 5...

 [INFO] END 
[INFO] Generating data for fold 4 \ 5...

 [INFO] END 
[INFO] Generating data for fold 5 \ 5...

 [INFO] END 
