# Deleting empty patches before passing data to U-net
    Jakub Dvorak /jakub.dvorak@natur.cuni.cz/         2020/2021

    Student, Dept. of Applied Geoinformatics and Cartography, Faculty of Science, Charles University

In [None]:
# COMMON IMPORTS 
import os
import numpy as np
import imageio

## Delete tiles based on containing no data
This portion of the script deletes every raster in a folder that contains only a single value (i.e. its minimum value equals its maximum value).

Variable ***empty_dir*** needs to be set as the folder to delete from.

In [None]:
def check_empty(filepath):
    """Tell whether the raster stored at *filepath* holds a single value.

    The image is read with ``imageio.imread`` and its largest and smallest
    values are compared.

    Args:
        filepath: Path of the raster file to inspect.

    Returns:
        True when the maximum of the raster equals its minimum, False
        otherwise.
    """
    raster = imageio.imread(filepath)
    highest = np.amax(raster)
    lowest = np.amin(raster)
    return bool(highest == lowest)

In [None]:
empty_dir = 'E:/datasets/test_unet/Krkonose2012/overlap/testing_preprocessing/mhs'

# Go through every entry of the folder and remove those that check_empty
# reports as single-valued, counting the removals in `i`.
i = 0
for entry in os.listdir(empty_dir):
    candidate = os.path.join(empty_dir, entry)
    if not check_empty(candidate):
        continue
    os.remove(candidate)
    i += 1

print(f'Finished and deleted {str(i)} files.')

## Delete tiles based on percentile of minimums
This portion of the script deletes all rasters from a folder for which the value at a given percentile equals the raster's minimum value, i.e. rasters in which roughly that share of pixels (or more) hold the minimum value (useful when the minimum value is nodata). Variables *percentile_dir* and *percentile* need to be set:

***percentile_dir*** - folder to delete from

***percentile*** /integer 0-100/ - which percentile should be used to decide if the patch should be deleted

In [None]:
def check_percentile(filepath, perc):
    """Tell whether the *perc*-th percentile of a raster equals its minimum.

    The image is read with ``imageio.imread``; the percentile is computed
    with ``np.percentile`` over the whole array and compared with the
    array's smallest value.

    Args:
        filepath: Path of the raster file to inspect.
        perc: Percentile to evaluate, passed straight to ``np.percentile``.

    Returns:
        True when the percentile value equals the raster minimum, False
        otherwise.
    """
    raster = imageio.imread(filepath)
    percentile_value = np.percentile(raster, perc)
    lowest = np.amin(raster)
    return bool(percentile_value == lowest)

In [None]:
percentile_dir = 'E:/datasets/test_unet/Krkonose2012/overlap/testing_preprocessing/gt'
percentile = 50

# Go through every entry of the folder and remove those for which
# check_percentile returns True, counting the removals in `i`.
i = 0
for entry in os.listdir(percentile_dir):
    candidate = os.path.join(percentile_dir, entry)
    if not check_percentile(candidate, percentile):
        continue
    os.remove(candidate)
    i += 1

print(f'Finished and deleted {str(i)} files.')

## Delete tiles based on file endings in a reference folder
Change variables *dataset_dir*, *reference_dir*, and *end_len*:

***dataset_dir*** - directory containing files to be deleted

***reference_dir*** - reference directory which is already missing the corresponding files

***end_len*** - length of the unique file ending identifier including file extension, e.g.: File *gt_fullsize_05_06.tif* with a unique identifier *05_06* requires setting the value to the length of *05_06.tif*, which is 9 characters.

In [None]:
dataset_dir = 'E:/datasets/test_unet/Krkonose2012/overlap/testing_preprocessing/MHS'
reference_dir = 'E:/datasets/test_unet/Krkonose2012/overlap/testing_preprocessing/GT'

end_len = 9

# Endings (unique identifier + extension, `end_len` characters long) of the
# files that are still present in the reference folder.
ref_ending_set = {filename[-end_len:] for filename in os.listdir(reference_dir)}

# Delete every file in the dataset folder whose ending has no counterpart in
# the reference folder. The real file names are used directly instead of
# rebuilding them from the prefix of the first directory entry, so an empty
# dataset folder or files with differing prefixes are handled correctly.
deleted_count = 0
for filename in os.listdir(dataset_dir):
    filepath = os.path.join(dataset_dir, filename)
    if not os.path.isfile(filepath):
        # os.remove cannot delete directories; leave them untouched.
        continue
    if filename[-end_len:] not in ref_ending_set:
        os.remove(filepath)
        deleted_count += 1

# Report the number of files actually removed (not the last loop index).
print(f'Finished and deleted {deleted_count} files.')