In [None]:
import os
import numpy as np
from PIL import Image
Image.MAX_IMAGE_PIXELS = None
from IPython.display import clear_output
import pandas as pd
import os
from tqdm import tqdm
from tqdm.auto import tqdm as tqdm_auto
import multiprocessing as mproc

In [None]:
# --- Inputs: root of the source dataset and the artefacts read from it ---
DATA_DIR = "/kaggle/input/UBC-OCEAN"
TRAIN_IMGS = os.path.join(DATA_DIR, "train_images")
TRAIN_CSV = os.path.join(DATA_DIR, "train.csv")

# --- Outputs: directory the generated tiles are written to ---
TILE_DATA_DIR = "/kaggle/working/ubc-ocean-train-tiles"

In [None]:
%%time
train_df = pd.read_csv(TRAIN_CSV) 
train_df["path"] = [os.path.join(TRAIN_IMGS, f"{str(i)}.png")
                   for i in train_df["image_id"]]


In [None]:
def resize_imgs(img: Image.Image, new_size = (256, 256)):
    """Return a resized copy of ``img`` using Lanczos resampling.

    Args:
        img: The PIL image to resize.
        new_size: Target ``(width, height)`` passed to ``Image.resize``;
            defaults to ``(256, 256)``.

    Returns:
        The new image produced by ``img.resize``.
    """
    resized = img.resize(size=new_size, resample=Image.LANCZOS)
    return resized


def img_to_tiles(img_path, output_dir = TILE_DATA_DIR, size: tuple = (2048, 2048),
                 post_transforms = resize_imgs, black_max_perc: float = 0.3):
    """Cut one image into non-overlapping tiles and save the ones that are not mostly black.

    The image is split on a regular grid of ``size[0]`` x ``size[1]`` pixel
    cells, walking row by row. Partial cells at the right/bottom edges are
    dropped because the grid dimensions use floor division. Each tile is
    optionally passed through ``post_transforms`` and is written to
    ``output_dir`` as ``<image stem>__<row>_<col>.png`` only when the fraction
    of zero-valued array elements in the (transformed) tile is below
    ``black_max_perc``.

    Args:
        img_path: Path of the image file to tile.
        output_dir: Directory the tiles are written to; created if missing.
        size: ``(tile_width, tile_height)`` in pixels.
        post_transforms: Optional callable applied to every tile before the
            black-content check and before saving; ``None`` disables it.
        black_max_perc: Upper bound (exclusive) on the fraction of array
            elements equal to 0 for a tile to be saved.

    Returns:
        The last tile that was processed (after ``post_transforms``), whether
        or not it was saved, or ``None`` when the image is smaller than a
        single tile and nothing was cropped.
    """
    print(f"Reading {img_path}...")
    img_filename = os.path.basename(img_path).split(".")[0]
    tile_width, tile_height = size[0], size[1]

    # exist_ok avoids the check-then-create race when several pool workers
    # reach this point at the same time.
    os.makedirs(output_dir, exist_ok=True)

    # Stays None when the grid is empty, so the final return never hits an
    # unbound local.
    tile = None

    # The context manager releases the file handle once all tiles are cut.
    with Image.open(img_path) as image:
        # Get image dimensions
        width, height = image.size

        # Calculate the number of rows and columns
        rows = height // tile_height
        cols = width // tile_width
        quadrants = [(r, c) for r in range(rows)
                     for c in range(cols)]
        print(f"Finished reading {img_path}, starting the cropping...")

        # Crop the image into tiles
        with tqdm_auto(total=len(quadrants), desc=f"cropping {img_filename=}..") as tqdm_bar:
            # Unpack in the same (row, col) order the grid was built in;
            # swapping them walks outside the image on non-square grids.
            for (row, col) in quadrants:
                tqdm_bar.update()
                left = col * tile_width
                top = row * tile_height
                right = left + tile_width
                bottom = top + tile_height

                # Crop the tile
                tile = image.crop((left, top, right, bottom))
                tile_filename = f"{img_filename}__{row}_{col}.png"
                tile_path = os.path.join(output_dir, tile_filename)

                if post_transforms is not None:
                    tile = post_transforms(tile)
                # Save the tile only if it passes the black-content threshold
                if (np.array(tile) == 0).mean() < black_max_perc:
                    tile.save(tile_path)

    return tile

In [None]:
# Sequential alternative, kept for reference:
# for i, row in tqdm(train_df.iterrows(), total=train_df.shape[0]):
#     tile = img_to_tiles(row["path"], TILE_DATA_DIR, (2048, 2048), resize_imgs)
#     clear_output(wait=True)

# *** with parallelism ***
paths = train_df["path"].to_list()
# One tick per finished source image; the context manager closes the bar even
# if a worker raises.
with tqdm_auto(total=len(paths)) as tqdm_bar:
    # create and configure the process pool
    with mproc.Pool(3) as pool:
        # tasks are handed out in order, results are consumed as they complete
        for _ in pool.imap_unordered(img_to_tiles, paths):
            tqdm_bar.update()
    # Leaving the Pool block calls terminate() on the workers; that is safe
    # here because the loop above has already consumed every result.