In [2]:
import os
import shutil
import pandas as pd
from tqdm import tqdm

In [30]:
def split_directory_by_labels(src_directory, dest_directory, label_map, label_map_names, reset):
    """Copy each file of a flat directory into a per-class sub-directory.

    Args:
        src_directory: Directory whose files are to be sorted by class.
        dest_directory: Root directory; one sub-directory per class name is
            created underneath it.
        label_map: Mapping from a file's base name (name without extension)
            to its label key.
        label_map_names: Mapping from label key to class (sub-directory) name.
        reset: When true, ``dest_directory`` is deleted before anything is
            copied.

    Raises:
        KeyError: If a file's base name is not in ``label_map`` or its label
            is not in ``label_map_names``.
    """
    if reset and os.path.exists(dest_directory):
        shutil.rmtree(dest_directory)

    os.makedirs(dest_directory, exist_ok=True)

    # The existence check has to look at the path *inside* dest_directory,
    # not at the bare class name relative to the working directory.
    for class_name in label_map_names.values():
        os.makedirs(os.path.join(dest_directory, class_name), exist_ok=True)

    src_files = os.listdir(src_directory)

    print("Number of files in from directory: ", len(src_files))

    for filename in tqdm(src_files):
        # splitext drops the extension whatever its length
        # (the previous [:-4] slice only worked for 3-letter extensions).
        base_name = os.path.splitext(filename)[0]
        label = label_map[base_name]

        src_file_path = os.path.join(src_directory, filename)
        dest_filepath = os.path.join(dest_directory, label_map_names[label], filename)

        # Files already present (e.g. from an earlier run) are left alone.
        if not os.path.exists(dest_filepath):
            shutil.copyfile(src_file_path, dest_filepath)

    print("Split directory by labels complete")
            
            
def train_validation_test_split(master_dir, train_dir, validation_dir, test_dir, train_percent, validation_percent, test_percent, copy):
    """Distribute the files of every class folder over train/validation/test.

    Each sub-directory of ``master_dir`` is treated as one class. Its files
    are taken in sorted order; the first share goes to the train folder, the
    next to the validation folder and the remainder to the test folder.

    Args:
        master_dir: Directory containing one sub-directory per class.
        train_dir: Root of the train split. ``None`` or ``""`` disables the
            train split, in which case files are divided between validation
            and test only.
        validation_dir: Root of the validation split.
        test_dir: Root of the test split.
        train_percent: Percentage of each class assigned to train.
        validation_percent: Percentage of each class assigned to validation.
        test_percent: Percentage of each class assigned to test (everything
            past the validation cut-off ends up here).
        copy: When true the files are copied. When false they are moved out
            of ``master_dir`` (previously a false value silently did
            nothing).
    """
    use_train = bool(train_dir)  # covers both None and ""
    transfer = shutil.copyfile if copy else shutil.move

    total_fail_count = 0

    for directory in os.listdir(master_dir):
        src_subdir = os.path.join(master_dir, directory)
        if not os.path.isdir(src_subdir):
            # Stray files next to the class folders are not classes.
            continue

        dest_train_subdir = os.path.join(train_dir, directory) if use_train else None
        dest_validation_subdir = os.path.join(validation_dir, directory)
        dest_test_subdir = os.path.join(test_dir, directory)

        for dest_subdir in (dest_train_subdir, dest_validation_subdir, dest_test_subdir):
            if dest_subdir is not None:
                os.makedirs(dest_subdir, exist_ok=True)

        # Sorted so that the split is reproducible between runs.
        src_subdir_files = sorted(os.listdir(src_subdir))
        count_files = len(src_subdir_files)

        # These are cumulative cut-off indices, not per-split sizes:
        # index < train_count -> train, index < validation_count -> validation.
        train_count = count_files*(train_percent/100) if use_train else 0
        validation_count = train_count+count_files*(validation_percent/100)
        test_count = validation_count+count_files*(test_percent/100)

        print("Moving file from: %s, total files: %d, train count: %d, validation count: %d, test count: %d" % (src_subdir, count_files, train_count, validation_count, test_count))

        i = 0
        fail_count = 0

        for filename in src_subdir_files:
            src_filepath = os.path.join(src_subdir, filename)

            if use_train and i < train_count:
                dest_filepath = os.path.join(dest_train_subdir, filename)
            elif i < validation_count:
                dest_filepath = os.path.join(dest_validation_subdir, filename)
            else:
                dest_filepath = os.path.join(dest_test_subdir, filename)

            try:
                transfer(src_filepath, dest_filepath)
            except OSError:
                # A failed file does not consume a slot in the split.
                fail_count += 1
                print("Exception for %s: filename: %s"% (i, filename))
            else:
                i += 1

        total_fail_count += fail_count
        total_train = len(os.listdir(dest_train_subdir)) if use_train else 0
        print("Move Complete total prev: %d, total_now(src): %d, total train: %d, total validation: %d, total test: %d Failed: %d" % (count_files, len(os.listdir(src_subdir)), total_train, len(os.listdir(dest_validation_subdir)), len(os.listdir(dest_test_subdir)), fail_count))

    print("Move Complete, Failed: %d" % (total_fail_count))
    

    
def train_validation_test_split_main(master_dir, master_dir_type, train_dir, validation_dir, test_dir, train_percent, validation_percent, test_percent, copy):
    """Create the split root directories and run the split.

    Args:
        master_dir: Directory containing one sub-directory per class.
        master_dir_type: ``"train"`` to split into train/validation/test, or
            ``"test"`` to split into validation/test only (the train
            directory and percentage are then not passed on).
        train_dir: Root of the train split; may be empty or ``None``.
        validation_dir: Root of the validation split.
        test_dir: Root of the test split.
        train_percent: Percentage assigned to train.
        validation_percent: Percentage assigned to validation.
        test_percent: Percentage assigned to test.
        copy: Forwarded unchanged to ``train_validation_test_split``.

    Raises:
        ValueError: If ``master_dir_type`` is neither ``"train"`` nor
            ``"test"``.
    """
    # Validate before touching the file system so that a typo does not leave
    # empty directories behind and then silently do nothing.
    if master_dir_type not in ("train", "test"):
        raise ValueError("master_dir_type must be 'train' or 'test', got %r" % (master_dir_type,))

    if train_dir:
        os.makedirs(train_dir, exist_ok=True)
    os.makedirs(validation_dir, exist_ok=True)
    os.makedirs(test_dir, exist_ok=True)

    if master_dir_type == "train":
        train_validation_test_split(master_dir, train_dir, validation_dir, test_dir, train_percent, validation_percent, test_percent, copy)
    else:
        train_validation_test_split(master_dir, None, validation_dir, test_dir, 0, validation_percent, test_percent, copy)


        
def move_file_from_dir_to_dir(from_dir, to_dir, move_percent, copy):
    """Transfer a percentage of every class folder into a parallel tree.

    For each sub-directory of ``from_dir`` a sub-directory of the same name
    is created under ``to_dir`` and the first ``move_percent`` percent of its
    files (in ``os.listdir`` order) are transferred.

    Args:
        from_dir: Directory containing one sub-directory per class.
        to_dir: Destination root; created if missing.
        move_percent: Percentage (0-100) of each sub-directory to transfer.
        copy: When true the files are copied, otherwise they are moved
            (the flag used to be ignored and files were always copied).
    """
    os.makedirs(to_dir, exist_ok=True)
    transfer = shutil.copyfile if copy else shutil.move

    tot_fail = 0

    for directory in os.listdir(from_dir):
        from_subdir = os.path.join(from_dir, directory)
        if not os.path.isdir(from_subdir):
            # Only sub-directories hold files to transfer.
            continue

        to_subdir = os.path.join(to_dir, directory)
        os.makedirs(to_subdir, exist_ok=True)

        from_subdir_files = os.listdir(from_subdir)
        count_files = len(from_subdir_files)
        print(from_subdir, count_files)

        move_count = count_files*(move_percent/100)

        print("Moving file from: %s,  to: %s, total files: %d, moving: %d" % (from_subdir, to_subdir, count_files, move_count))

        i = 0
        fail = 0

        for filename in tqdm(from_subdir_files):
            if i >= move_count:
                break
            try:
                transfer(os.path.join(from_subdir, filename), os.path.join(to_subdir, filename))
            except OSError:
                # A failed file does not count towards the quota.
                fail += 1
                print("Exception for %s: filename: %s"% (i, filename))
            else:
                i += 1

        tot_fail += fail
        print("Move Complete total prev: %d, total_now(from): %d, total_to: %d, Failed: %d" % (count_files, len(os.listdir(from_subdir)), len(os.listdir(to_subdir)), fail))

    print("Move Complete, Failed: %d" % (tot_fail))
    
def move_file_from_subdir_to_subdir(from_dir, to_dir, move_count):
    """Move up to ``move_count`` files from ``from_dir`` into ``to_dir``.

    Files are taken in ``os.listdir`` order. A file that cannot be moved is
    reported and does not count towards ``move_count``.

    Args:
        from_dir: Directory to take files from.
        to_dir: Existing directory to move the files into.
        move_count: Maximum number of files to move.
    """
    # The body used to read from_subdir / to_subdir, which are not defined in
    # this function; the parameters are from_dir / to_dir.
    from_dir_files = os.listdir(from_dir)
    i = 0
    fail = 0

    for filename in tqdm(from_dir_files):
        if i >= move_count:
            break
        try:
            shutil.move(os.path.join(from_dir, filename), os.path.join(to_dir, filename))
        except OSError:
            fail += 1
            print("Exception for %s: filename: %s"% (i, filename))
        else:
            i += 1

    print("Move Complete, Failed: %d" % (fail))


def balance_train_validation_test_set(master_dir, train_dir_type, train_dir, validation_dir, test_dir, train_percent, validation_percent, test_percent):
    """Move files between existing splits until they match the percentages.

    For every class sub-directory of ``master_dir`` the files currently in
    the train/validation/test folders of that class are counted, the target
    size of each split is derived from the percentages, and files are moved
    from over-full splits to under-full ones.

    Args:
        master_dir: Directory whose sub-directory names identify the classes.
        train_dir_type: Unused; kept for interface compatibility.
        train_dir: Root of the train split. ``None`` or ``""`` means only
            validation and test are balanced against each other.
        validation_dir: Root of the validation split.
        test_dir: Root of the test split.
        train_percent: Target percentage for train.
        validation_percent: Target percentage for validation.
        test_percent: Target percentage for test.
    """
    split_roots = {"validation": validation_dir, "test": test_dir}
    if train_dir:
        split_roots["train"] = train_dir
    split_percents = {"train": train_percent, "validation": validation_percent, "test": test_percent}

    # (receiver, donor) pairs, tried in this order for every class.
    rebalance_order = (
        ("train", "validation"),
        ("train", "test"),
        ("validation", "test"),
        ("validation", "train"),
        ("test", "validation"),
        ("test", "train"),
    )

    for directory in os.listdir(master_dir):
        if not os.path.isdir(os.path.join(master_dir, directory)):
            continue

        subdirs = {name: os.path.join(root, directory) for name, root in split_roots.items()}

        # Counts must be numbers; os.listdir alone returns the file names.
        current = {name: len(os.listdir(path)) for name, path in subdirs.items()}
        total_count = sum(current.values())

        # Multiply before dividing so whole-number targets stay exact, and
        # truncate because only whole files can be moved.
        expected = {name: int(total_count * split_percents[name] / 100) for name in subdirs}

        for receiver, donor in rebalance_order:
            if receiver not in subdirs or donor not in subdirs:
                continue
            need = expected[receiver] - current[receiver]
            extra = current[donor] - expected[donor]
            move = min(need, extra)
            if move <= 0:
                continue
            move_file_from_subdir_to_subdir(subdirs[donor], subdirs[receiver], move)
            current[receiver] += move
            current[donor] -= move

        # Recount from disk so the report reflects what was actually moved.
        final = {name: len(os.listdir(path)) for name, path in subdirs.items()}
        print("current sub-directory: %s: current_train_count:%d, current_validation_count: %d, current_test_count:%d"%(directory, final.get("train", 0), final["validation"], final["test"]))
    print("completed")

In [28]:
# Data locations. Built with os.path.join so the notebook is not tied to
# Windows path separators (on Windows the resulting strings are unchanged).
_input_root = os.path.join("data", "input")

# Source directories.
src_train_directory_all = os.path.join(_input_root, "train")
src_train_directory = os.path.join(_input_root, "train_new")
src_test_directory_all = os.path.join(_input_root, "test")
src_validation_directory = os.path.join(_input_root, "test_new")

# Destination roots of the final train / validation / test splits.
dest_train_directory = os.path.join(_input_root, "train_final")
dest_validation_directory = os.path.join(_input_root, "validation_final")
dest_test_directory = os.path.join(_input_root, "test_final")

In [5]:
train_labels = pd.read_csv("data/input/train_labels.csv")
test_labels = pd.read_csv("data/input/sample_submission.csv")

# train labels
labels_only = train_labels['label']

labels_set = set(labels_only)

num_labels = len(labels_set)

# Label value -> class (directory) name.
label_map_names = {0: "Normal", 1 : "Cancer"}

# id -> label for the training images. Zipping the two columns replaces the
# per-row Series indexing loop, which is slow for large label files; the
# NumPy arrays keep the same scalar types as indexing did.
label_map = dict(zip(train_labels['id'].to_numpy(), train_labels['label'].to_numpy()))

# test labels: id -> label taken from the sample submission file
sample_label_map = dict(zip(test_labels['id'].to_numpy(), test_labels['label'].to_numpy()))

From the label CSV files, build a map from image id to label for every image listed in them.

In [6]:
# split_directory_by_labels(src_train_directory_all, src_train_directory, label_map, label_map_names, reset=True)
# label_dir(test_from_directory, test_to_directory, sample_label_map, label_map_names, reset=True)

Using the labels loaded from the CSV file, split a directory that mixes several classes into one sub-directory per class.

In [31]:
# Arguments for train_validation_test_split_main.
master_dir, master_dir_type = src_train_directory, "train"

# Destination roots of the three splits.
# (use train_dir = None to leave the train split out)
train_dir = dest_train_directory
validation_dir = dest_validation_directory
test_dir = dest_test_directory

# Share of each class, in percent, assigned to each split.
train_percent, validation_percent, test_percent = 60, 20, 20

copy = True


In [32]:
# Run the split with the configuration defined in the previous cell.
train_validation_test_split_main(
    master_dir=master_dir,
    master_dir_type=master_dir_type,
    train_dir=train_dir,
    validation_dir=validation_dir,
    test_dir=test_dir,
    train_percent=train_percent,
    validation_percent=validation_percent,
    test_percent=test_percent,
    copy=copy,
)

aaada
Moving file from: data\input\train_new\Cancer, total files: 89117, train count: 53470, validation count: 71293, test count: 89117
Move Complete total prev: 89117, total_now(src): 89117, total train: 53471, total validation: 17823, total test: 17823 Failed: 0
Moving file from: data\input\train_new\Normal, total files: 130908, train count: 78544, validation count: 104726, test count: 130908
Move Complete total prev: 130908, total_now(src): 130908, total train: 78545, total validation: 26182, total test: 26181 Failed: 0
Move Complete, Failed: 0


In [None]:
# for directory in os.listdir(master_directory):
#     sub_directory = os.path.join(master_directory, directory)
#     sub_directory_files = os.listdir(sub_directory)
#     count_files = len(sub_directory_files)
#     print(directory, count_files)
#     train_count = count_files*.7
#     validate_count = train_count+count_files*.2
#     test_count = validate_count+count_files*.1
    
#     cat_train_dir = os.path.join(train_to_directory, directory)
#     cat_test_dir = os.path.join(test_to_directory, directory)
#     cat_validate_dir = os.path.join(validate_to_directory, directory)
    
#     if not os.path.isdir(cat_train_dir):
#         os.mkdir(cat_train_dir)
        
#     if not os.path.isdir(cat_validate_dir):
#         os.mkdir(cat_validate_dir)
    
#     if not os.path.isdir(cat_test_dir):
#         os.mkdir(cat_test_dir)
                 

    
#     from_dir_path = os.path.join(master_directory, directory)
#     from_dir_files= os.listdir(from_dir_path)
#     i = 0
#     for filename in from_dir_files:
#         try:
#             file_path_from = os.path.join(from_dir_path, filename)
#             if i<=train_count:
#                 shutil.copyfile(file_path_from, os.path.join(cat_train_dir, filename))
#             elif i<=validate_count:
#                 shutil.copyfile(file_path_from, os.path.join(cat_validate_dir, filename))
#             else:
#                 shutil.copyfile(file_path_from, os.path.join(cat_test_dir, filename))

#         except:
#             print("Exception for index: ", i, "Breed: ", label, ", File: ", file_path_from)
#         i+=1
