# Data preprocessing

Central project file that processes all manually reviewed and selected source bathymetry image tiles.

The input data was collected by downloading images with seamounts in the center, using the database https://data.unep-wcmc.org/datasets/41 and bathymetry data obtained via the API of the GMRT service https://www.gmrt.org/services/ImageServer.

Every tile contains a scale border with geodata. This scale border is removed by cropping.

The result is a cropped and downsized image set that serves as input for training, validation and testing of all the different models.

In [1]:
import os
import cv2
import config

print("Preprocessing start")

# The processed dataset is written below this folder; create it up front
# (exist_ok=True makes this a no-op when the folder is already there)
print(f"Create output folder {config.DATASETS_FOLDER} if not already exists!")
os.makedirs(config.DATASETS_FOLDER, exist_ok=True)

# Function to crop borders from an image
def crop_fixed_border(image, crop_pixels=5):
    """Remove a fixed-width border from every side of an image.

    Args:
        image: Image array exposing ``shape`` and 2D slicing, laid out as
            ``(height, width)`` or ``(height, width, channels)``.
        crop_pixels: Number of pixels to strip from each of the four edges.

    Returns:
        The cropped image, or the unchanged input when ``crop_pixels`` is not
        positive or the image is too small to keep at least one pixel.
    """
    # Only the first two axes are needed, so grayscale (2D) images work too.
    height, width = image.shape[:2]

    if crop_pixels <= 0:
        # A negative value would turn the slice bounds below into
        # from-the-end indices and silently return a wrong region.
        return image

    if height <= crop_pixels * 2 or width <= crop_pixels * 2:
        # Return original image if cropping not possible
        return image

    return image[crop_pixels:height - crop_pixels, crop_pixels:width - crop_pixels]

# Preprocess images (cropping and resizing)
print("Preprocessing images...")

print(f"Read images from input folder {config.DATASETS_SOURCE_FOLDER}")

for category in config.CATEGORIES:

    # Mirror the category subfolder of the input tree in the output tree
    input_category_folder = os.path.join(config.DATASETS_SOURCE_FOLDER, category)
    output_category_folder = os.path.join(config.DATASETS_FOLDER, category)
    os.makedirs(output_category_folder, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sort for reproducible runs
    filenames_in_catfolder = sorted(os.listdir(input_category_folder))

    print(f"Start with categorie folder {input_category_folder} found {len(filenames_in_catfolder)} images")

    # Crop, resize and store every file of this category under the same name
    for file_name in filenames_in_catfolder:
        image_path = os.path.join(input_category_folder, file_name)
        output_path = os.path.join(output_category_folder, file_name)
        try:
            image = cv2.imread(image_path)
            if image is None:
                # cv2.imread reports an unreadable file by returning None
                # instead of raising, so the skip has to be reported here.
                print(f"Skipping {image_path}: could not be read as an image")
                continue

            cropped_image = crop_fixed_border(image, config.CROP_PIXELS)
            resized_image = cv2.resize(cropped_image, config.IMAGE_SIZE)

            # cv2.imwrite returns False when the file could not be saved
            if not cv2.imwrite(output_path, resized_image):
                print(f"Error processing {image_path}: could not write {output_path}")
        except Exception as e:
            # Best effort: one broken file must not abort the whole run
            print(f"Error processing {image_path}: {e}")

print("Preprocessing of all images complete.")

Preprocessing start
Create output folder /workspaces/Seamounts/0_Dataset/processed_data if not already exists!
Preprocessing images...
Read images from input folder /workspaces/Seamounts/0_Dataset/raw_dataset
Start with categorie folder /workspaces/Seamounts/0_Dataset/raw_dataset/with_seamount found 301 images
Start with categorie folder /workspaces/Seamounts/0_Dataset/raw_dataset/without_seamount found 500 images
Preprocessing of all images complete.
