In [5]:
import os as os
from os import *
import random
import shutil
import itertools
from fastai.dataset import *
import numpy as np
from glob import glob

In [3]:
# Root of the dataset; the train and valid trees sit directly beneath it.
PATH = 'data'
train_root = f'{PATH}/train'  # its sub-folders are listed as the classes by the functions in this notebook
valid_root = f'{PATH}/valid'  # destination of the files moved out of train_root

In [12]:
def count_original_files(path):
    """Count the non-copy files in every class directory under `path`.

    A file is treated as a copy when its name starts with ``copy_`` (the
    prefix `increase_class_size` gives to the duplicates it writes); every
    other entry in the directory counts as an original.

    Args:
        path: Directory whose immediate sub-directories are the classes.

    Returns:
        A list with one count per class directory, in `os.listdir(path)`
        order. Empty when `path` contains no directories.
    """
    counts = []
    for class_name in os.listdir(path):
        class_dir = os.path.join(path, class_name)
        # Stray files sitting next to the class folders are not classes.
        if not os.path.isdir(class_dir):
            continue
        entries = os.listdir(class_dir)
        n_copies = sum(1 for name in entries if name.startswith('copy_'))
        # Measure each directory on its own instead of assuming every class
        # holds as many files as the first one listed.
        counts.append(len(entries) - n_copies)
    return counts

# Per-class count of non-copy files under the training root.
orig_file_counts = count_original_files(train_root)

In [13]:
def get_scaled_factors(sizes):
    """Print, for every entry of `sizes`, the sum of all entries divided by it.

    Args:
        sizes: Sequence of numbers (one per class).

    Nothing is returned; the list of ratios is only printed.
    """
    total = sum(sizes)
    factors = []
    for class_size in sizes:
        factors.append(total / class_size)
    print(factors)

# Print total / count for each of the per-class counts gathered above.
get_scaled_factors(orig_file_counts)

[34.74545454545454, 29.542028985507248, 12.324062877871826, 11.278495020287716, 18.092307692307692, 23.61081081081081, 47.18518518518518, 14.422641509433962, 12.787954830614806, 16.689956331877728, 22.75, 19.984313725490196, 13.105872267466781, 26.40414507772021, 19.841661258922777, 11.985887887103097, 21.0, 19.351898734177215]


In [31]:
def increase_class_size(path, folder, target_size, file_ext='jpg'):
    """Pad a class folder with duplicated files until it holds `target_size`.

    The files reported by `read_dir(path, folder)` are shuffled and cycled
    through; each pick is copied next to its source under the name
    ``copy_<n>.<file_ext>``. Indices already taken on disk are skipped, so
    running the function again never overwrites an earlier copy.

    Args:
        path: Root directory the entries from `read_dir` are joined onto.
        folder: Class folder passed through to `read_dir`.
        target_size: Number of files the folder should contain afterwards.
        file_ext: Extension given to every copy, regardless of the extension
            of the file it was copied from.

    Returns:
        None. Does nothing when the folder already has at least
        `target_size` files or `read_dir` reports no files.
    """
    # presumably read_dir yields paths relative to `path` - verify against fastai
    relative_paths = read_dir(path, folder)
    files_required = target_size - len(relative_paths)
    # islice() rejects a negative stop, and an empty pool has nothing to copy.
    if files_required <= 0 or len(relative_paths) == 0:
        return

    # NOTE(review): numpy's global RNG is not seeded here, so the choice of
    # duplicated files differs between runs.
    np.random.shuffle(relative_paths)
    files_to_copy = itertools.islice(itertools.cycle(relative_paths), files_required)

    base_copy_name = 'copy'
    idx = 0
    for f in files_to_copy:
        dir_head = os.path.split(f)[0]
        dst = os.path.join(path, dir_head, f'{base_copy_name}_{idx}.{file_ext}')
        # Advance past names left by a previous run instead of clobbering them.
        while os.path.exists(dst):
            idx += 1
            dst = os.path.join(path, dir_head, f'{base_copy_name}_{idx}.{file_ext}')
        src = os.path.join(path, f)
        shutil.copy(src, dst)
        idx += 1
    
def get_num_samples(path):
    """Return how many directory entries each sub-folder of `path` contains.

    The result follows `os.listdir(path)` order; every entry of `path` is
    assumed to be a directory that can itself be listed.
    """
    return [
        len(os.listdir(os.path.join(path, name)))
        for name in os.listdir(path)
    ]

def gen_extra_samples(train_root):
    """Grow every class folder under `train_root` to the largest class's size.

    The target is the maximum of `get_num_samples(train_root)`; each entry of
    `os.listdir(train_root)` is then handed to `increase_class_size` with
    that target.
    """
    target = max(get_num_samples(train_root))
    for class_name in os.listdir(train_root):
        increase_class_size(train_root, class_name, target)
    
    

In [32]:
# Top up every class folder under train_root to the size of the largest one.
gen_extra_samples(train_root)

In [32]:
def mv_to_valid(chosen_files, classname):
    """Move the named files of one class from the train tree to the valid tree.

    Args:
        chosen_files: File names located in ``{train_root}/{classname}``.
        classname: Class sub-folder shared by source and destination.
    """
    for filename in chosen_files:
        origin = f'{train_root}/{classname}/{filename}'
        target = f'{valid_root}/{classname}/{filename}'
        shutil.move(origin, target)

In [37]:
def train_val_split(val_percentage=0.2):
    """Move a random share of every class's files from train to valid.

    For each class folder under `train_root` the matching folder is created
    under `valid_root`, the class's files are listed in sorted order,
    shuffled with Python's `random` module (seeded with 1 at the start of
    the call) and the first ``int(n_files * val_percentage)`` of them are
    handed to `mv_to_valid`.

    Args:
        val_percentage: Fraction of each class's files to move.

    NOTE(review): files are picked without regard to the ``copy_`` prefix, so
    if this runs after `gen_extra_samples` a duplicate and the file it was
    copied from can land on opposite sides of the split - confirm that is
    acceptable.
    """
    random.seed(1)
    classes = os.listdir(train_root)
    for classname in classes:
        class_dir = f'{train_root}/{classname}'
        # Only directories are classes; skip stray files at the top level.
        if not os.path.isdir(class_dir):
            continue
        os.makedirs(f'{valid_root}/{classname}', exist_ok=True)
        # Listed in-process rather than through a shell `ls`: names with
        # quotes or newlines stay intact and no shell is needed. Sorting and
        # dropping dotfiles keeps what `ls` provided, i.e. a stable starting
        # order for the seeded shuffle.
        list_of_imgs = sorted(
            name for name in os.listdir(class_dir) if not name.startswith('.')
        )
        random.shuffle(list_of_imgs)
        n_files_moved = int(len(list_of_imgs) * val_percentage)
        # Slicing caps the selection at the number of files available.
        selected_files = list_of_imgs[:max(n_files_moved, 0)]
        mv_to_valid(selected_files, classname)

In [38]:
def mv_to_train(chosen_files, classname):
    """Move the named files of one class from the valid tree to the train tree.

    Args:
        chosen_files: File names located in ``{valid_root}/{classname}``.
        classname: Class sub-folder shared by source and destination.
    """
    for filename in chosen_files:
        shutil.move(
            f'{valid_root}/{classname}/{filename}',
            f'{train_root}/{classname}/{filename}',
        )
    
def put_back_to_train():
    """Move every file found under the valid tree back into the train tree.

    The class folders are read from `valid_root` itself, so the function
    does not rely on a `classes` variable left behind by another cell. For
    each class the non-hidden file names are passed to `mv_to_train`.

    Returns:
        None. Does nothing when `valid_root` does not exist yet.
    """
    # Nothing has been split off yet, so there is nothing to put back.
    if not os.path.isdir(valid_root):
        return
    for classname in os.listdir(valid_root):
        class_dir = f'{valid_root}/{classname}'
        if not os.path.isdir(class_dir):
            continue
        # Make sure the destination folder exists before moving into it.
        os.makedirs(f'{train_root}/{classname}', exist_ok=True)
        # Listed in-process instead of through a shell `ls`; dotfiles are
        # skipped as `ls` would skip them.
        list_of_file_dirs = sorted(
            name for name in os.listdir(class_dir) if not name.startswith('.')
        )
        mv_to_train(list_of_file_dirs, classname)
    

In [26]:
# Undo a previous split: move the files under the valid tree back into train.
put_back_to_train()

In [39]:
# Split with the default val_percentage of 0.2.
train_val_split()

In [36]:
# Roots of the sample dataset that the build_sample_* functions copy into.
sample_train = f'{PATH}/sample/train'
sample_valid = f'{PATH}/sample/valid'
os.makedirs(sample_train, exist_ok=True)
os.makedirs(sample_valid, exist_ok=True)
# Seed Python's `random` module, which the sample builders use to shuffle file lists.
random.seed(1)

def cp_to_dir(filenames, src_root, dest_root):
    """Copy each named file from `src_root` into `dest_root` with `shutil.copy2`.

    Args:
        filenames: Names of files that exist inside `src_root`.
        src_root: Directory to copy from.
        dest_root: Directory to copy into; the file name is kept.
    """
    for name in filenames:
        shutil.copy2(f'{src_root}/{name}', f'{dest_root}/{name}')

def build_sample_train(perc=0.1):
    """Copy a random fraction of every training class into the sample train tree.

    For each class folder under `train_root`, the matching folder is created
    under `sample_train`, the class's file list is shuffled with Python's
    `random` module and the first ``int(n_files * perc)`` names are copied
    with `cp_to_dir`.

    Args:
        perc: Fraction of each class's files to copy.
    """
    for classname in os.listdir(train_root):
        src_root = f'{train_root}/{classname}'
        dest_root = f'{sample_train}/{classname}'
        os.makedirs(dest_root, exist_ok=True)

        candidates = os.listdir(src_root)
        random.shuffle(candidates)

        how_many = int(len(candidates) * perc)
        selected = []
        for position in range(how_many):
            selected.append(candidates[position])
        cp_to_dir(selected, src_root, dest_root)

In [13]:
def build_sample_valid(perc=0.1):
    """Copy a random fraction of every validation class into the sample valid tree.

    For each class folder under `valid_root`, the matching folder is created
    under `sample_valid`, the class's file list is shuffled with Python's
    `random` module and the first ``int(n_files * perc)`` names are copied
    with `cp_to_dir`.

    Args:
        perc: Fraction of each class's files to copy.
    """
    for classname in os.listdir(valid_root):
        src_root = f'{valid_root}/{classname}'
        dest_root = f'{sample_valid}/{classname}'
        os.makedirs(dest_root, exist_ok=True)

        candidates = os.listdir(src_root)
        random.shuffle(candidates)

        how_many = int(len(candidates) * perc)
        selected = []
        for position in range(how_many):
            selected.append(candidates[position])
        cp_to_dir(selected, src_root, dest_root)

In [38]:
# Populate the sample train tree with the default fraction (0.1) of each class.
build_sample_train()

In [14]:
# Populate the sample valid tree with the default fraction (0.1) of each class.
build_sample_valid()