In [1]:
import os
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [16]:
def load_and_preprocess_data(image_path, mask_path=None):
    # Load and preprocess the image
    image = tf.io.read_file(image_path)
    image = tf.io.decode_png(image, channels=3)
    image = image/255
    #image = tf.pad(image, [[12,12], [12,12], [0,0]], mode='REFLECT') #To be removed and model adjusted


    # Load and preprocess the mask image
    if mask_path is not None:
        mask = tf.io.read_file(mask_path)
        mask = tf.io.decode_png(mask, channels=1)
        mask = mask/255
        #mask = tf.pad(mask, [[12,12], [12,12], [0,0]], mode='REFLECT') #To be removed and model adjusted

        return image, mask
    else:
        return image

In [2]:
image_dir = "/Users/bastianberger/code/bergerbastian/inria1358/raw_data/patches/train/images"
mask_dir = "/Users/bastianberger/code/bergerbastian/inria1358/raw_data/patches/train/gt"

In [3]:
image_path = [os.path.join(image_dir, filename) for filename in os.listdir(image_dir)]
mask_path = [os.path.join(mask_dir, filename) for filename in os.listdir(mask_dir)] if mask_dir else None

In [23]:
len(image_path) * .3

33750.0

In [32]:
image_path_subset, _, mask_path_subset, _ = train_test_split(image_path, mask_path, train_size=1)

In [33]:
len(image_path_subset), len(image_path)

(1, 112500)

In [13]:
len(mask_path_subset), len(mask_path)

(56250, 112500)

In [20]:
dataset = tf.data.Dataset.from_tensor_slices((image_path_subset, mask_path_subset))
dataset = dataset.map(load_and_preprocess_data, num_parallel_calls=tf.data.AUTOTUNE)

In [21]:
len(dataset)

56250

In [59]:
def create_datasets(save_path, set="train", test_size=.1, batch_size=64, data_size=1, val_size=.3):
    if set == "predict":
        image_dir = save_path
    else:
        image_dir = f'{save_path}/{set}/images'
    mask_dir = f'{save_path}/{set}/gt' if set == "train" else None

    image_path = [os.path.join(image_dir, filename) for filename in os.listdir(image_dir)]
    mask_path = [os.path.join(mask_dir, filename) for filename in os.listdir(mask_dir)] if mask_dir else None
    
    if set in ["predict", "test"] :
        dataset = tf.data.Dataset.from_tensor_slices(image_path)
        dataset = dataset.map(load_and_preprocess_data, num_parallel_calls=tf.data.AUTOTUNE)
        
        predict_batches = dataset.batch(batch_size).prefetch(buffer_size=tf.data.AUTOTUNE)

        pics_test = len(dataset)
        print(f"✅ Predict_batches with {pics_test} images ({len(predict_batches)} batches) created")

        return predict_batches
        
    else:
        #test split
        image_path_rest, image_path_test, mask_path_rest, mask_path_test = train_test_split(image_path, mask_path, test_size=test_size, random_state=42) if test_size != 0 else (None, None, None, None)
        
        #data size split
        if test_size == 0:
            image_path_rest = image_path
            mask_path_rest = mask_path
        image_path_subset, _, mask_path_subset, _ = train_test_split(image_path_rest, mask_path_rest, train_size=data_size) if data_size != 1 else (None, None, None, None)
        
        #train val split
        if data_size == 1:
            image_path_subset = image_path_rest
            mask_path_subset = mask_path_rest
        image_path_train, image_path_val, mask_path_train, mask_path_val = train_test_split(image_path_subset, mask_path_subset, train_size=val_size)
        
        train_dataset = tf.data.Dataset.from_tensor_slices((image_path_train, mask_path_train))
        train_dataset = train_dataset.map(load_and_preprocess_data, num_parallel_calls=tf.data.AUTOTUNE)
        train_batches = train_dataset.batch(batch_size).prefetch(buffer_size=tf.data.AUTOTUNE)
        
        val_dataset = tf.data.Dataset.from_tensor_slices((image_path_val, mask_path_val))
        val_dataset = val_dataset.map(load_and_preprocess_data, num_parallel_calls=tf.data.AUTOTUNE)
        val_batches = val_dataset.batch(batch_size).prefetch(buffer_size=tf.data.AUTOTUNE)
        
        test_dataset = tf.data.Dataset.from_tensor_slices((image_path_test, mask_path_test)) if test_size != 0 else None 
        test_dataset = test_dataset.map(load_and_preprocess_data, num_parallel_calls=tf.data.AUTOTUNE) if test_size != 0 else None 
        test_batches = test_dataset.batch(batch_size).prefetch(buffer_size=tf.data.AUTOTUNE) if test_size != 0 else None 
        
        pics_train = len(train_dataset)
        pics_val = len(val_dataset)
        pics_test = len(test_dataset) if test_size != 0 else None 
        
        print(f"✅ Test_batches with {pics_test} images ({len(test_batches)} batches) created") if test_size != 0 else None 
        print(f"✅ Train_batches with {pics_train} images ({len(train_batches)} batches) created")
        print(f"✅ Val_batches with {pics_val} images ({len(val_batches)} batches) created")
        
        if test_size != 0:
            return test_batches, train_batches, val_batches 
        else:
            return train_batches, val_batches
    

In [61]:
save_path= "/Users/bastianberger/code/bergerbastian/inria1358/raw_data/patches"
test_size = 0
set ="train"
data_size=1
val_size= .3
batch_size=64

create_datasets(save_path=save_path, set=set, data_size=1.0, val_size=val_size, batch_size=batch_size, test_size=test_size)

✅ Train_batches with 33750 images (528 batches) created
✅ Val_batches with 78750 images (1231 batches) created


(<_PrefetchDataset element_spec=(TensorSpec(shape=(None, None, None, 3), dtype=tf.float32, name=None), TensorSpec(shape=(None, None, None, 1), dtype=tf.float32, name=None))>,
 <_PrefetchDataset element_spec=(TensorSpec(shape=(None, None, None, 3), dtype=tf.float32, name=None), TensorSpec(shape=(None, None, None, 1), dtype=tf.float32, name=None))>)