In [1]:
import os
import csv
import shutil

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm

import cv2

In [2]:
# Root folder of the dataset; every path used below is built relative to it.
DATA_PATH = "./data"

# Load the dataset index: one row per image with its split and file paths.
metadata_csv_path = os.path.join(DATA_PATH, "metadata.csv")
metadata_df = pd.read_csv(metadata_csv_path)

metadata_df

Unnamed: 0,image_id,split,sat_image_path,mask_path
0,100694,train,train/100694_sat.jpg,train/100694_mask.png
1,102122,train,train/102122_sat.jpg,train/102122_mask.png
2,10233,train,train/10233_sat.jpg,train/10233_mask.png
3,103665,train,train/103665_sat.jpg,train/103665_mask.png
4,103730,train,train/103730_sat.jpg,train/103730_mask.png
...,...,...,...,...
1141,979233,test,test/979233_sat.jpg,
1142,983689,test,test/983689_sat.jpg,
1143,988205,test,test/988205_sat.jpg,
1144,989953,test,test/989953_sat.jpg,


In [20]:
# Keep only the samples that ship with a mask (the "train" split), drop the
# "split" column and prefix both path columns with DATA_PATH.
#
# The whole transformation is one chain ending in .assign(), so the result is
# a fresh DataFrame. Assigning columns on a boolean-filtered slice (as a
# sequence of separate statements would do) triggers pandas'
# SettingWithCopyWarning.
#
# NOTE: this rebinds `metadata_df`; re-run the loading cell before re-running
# this one, otherwise the "split" column is already gone.
metadata_df = (
    metadata_df
    .loc[metadata_df["split"] == "train", ["image_id", "sat_image_path", "mask_path"]]
    .assign(
        sat_image_path=lambda df: df["sat_image_path"].map(
            lambda img_pth: os.path.join(DATA_PATH, img_pth)
        ),
        mask_path=lambda df: df["mask_path"].map(
            lambda mask_pth: os.path.join(DATA_PATH, mask_pth)
        ),
    )
)

metadata_df.head()

Unnamed: 0,image_id,sat_image_path,mask_path
0,100694,./data/train/100694_sat.jpg,./data/train/100694_mask.png
1,102122,./data/train/102122_sat.jpg,./data/train/102122_mask.png
2,10233,./data/train/10233_sat.jpg,./data/train/10233_mask.png
3,103665,./data/train/103665_sat.jpg,./data/train/103665_mask.png
4,103730,./data/train/103730_sat.jpg,./data/train/103730_mask.png


In [29]:
# Shuffle once with a fixed seed so the split below is reproducible.
metadata_df_shuffled = metadata_df.sample(frac=1, random_state=0).reset_index(drop=True)

# 80% / 10% / 10% positional split.
# Plain .iloc slicing is used instead of np.split: np.split on a DataFrame
# goes through DataFrame.swapaxes, which recent pandas versions deprecate.
n_samples = len(metadata_df_shuffled)
train_end = int(0.8 * n_samples)
valid_end = int(0.9 * n_samples)

train_df = metadata_df_shuffled.iloc[:train_end]
valid_df = metadata_df_shuffled.iloc[train_end:valid_end]
test_df = metadata_df_shuffled.iloc[valid_end:]

# The three parts must cover every sample exactly once.
assert len(train_df) + len(valid_df) + len(test_df) == n_samples

print("train:", len(train_df))
print("valid:", len(valid_df))
print("test:", len(test_df))

train: 642
valid: 80
test: 81


In [30]:
# Preview the first rows of the training split.
train_df.head()

Unnamed: 0,image_id,sat_image_path,mask_path
0,111335,./data/train/111335_sat.jpg,./data/train/111335_mask.png
1,727832,./data/train/727832_sat.jpg,./data/train/727832_mask.png
2,77669,./data/train/77669_sat.jpg,./data/train/77669_mask.png
3,263576,./data/train/263576_sat.jpg,./data/train/263576_mask.png
4,351228,./data/train/351228_sat.jpg,./data/train/351228_mask.png


In [26]:
# Start every run from empty output folders: wipe any patch directory left
# over from a previous run, then create it again.
for dir_name in ("train_patches", "valid_patches", "test_patches"):
    DIR_PATH = os.path.join(DATA_PATH, dir_name)

    already_there = os.path.isdir(DIR_PATH)
    if already_there:
        # remove directory
        shutil.rmtree(DIR_PATH)
        print("Directory '{}' removed".format(DIR_PATH))

    # No directory exists at DIR_PATH at this point, so creation is unconditional.
    os.mkdir(DIR_PATH)
    print("Directory '{}' created".format(DIR_PATH))

Directory './data/train_patches' removed
Directory './data/train_patches' created
Directory './data/valid_patches' removed
Directory './data/valid_patches' created
Directory './data/test_patches' removed
Directory './data/test_patches' created


In [27]:
# Reset the per-split patch metadata CSVs so each one holds only the header row.
header_row = ["image_id", "sat_image_path", "mask_path"]

for file_name in ("train", "valid", "test"):
    METADATA_PATCHES_PATH = os.path.join(DATA_PATH, file_name + "_metadata_patches.csv")

    # remove a file left over from a previous run
    stale_file_present = os.path.exists(METADATA_PATCHES_PATH)
    if stale_file_present:
        os.remove(METADATA_PATCHES_PATH)
        print("File {} removed".format(METADATA_PATCHES_PATH))

    # create file with header
    with open(METADATA_PATCHES_PATH, "w", newline="") as f:
        csv.writer(f).writerow(header_row)
        print("File {} created".format(METADATA_PATCHES_PATH))

File ./data/train_metadata_patches.csv created
File ./data/valid_metadata_patches.csv created
File ./data/test_metadata_patches.csv created


In [31]:
# patches settings
original_size = 2448           # expected side length (pixels) of the source images
scale = [1.25, 1, 0.75, 0.5]   # resize factors applied before patching
patch_size = 224               # side length (pixels) of each square patch
stride = 224                   # step (pixels) between consecutive patch origins


def compute_patch_offsets(image_size, patch_size, stride):
    """Return the start offsets of all patches that fit along one dimension.

    Parameters
    ----------
    image_size : int
        Length in pixels of the dimension being tiled.
    patch_size : int
        Length in pixels of one patch.
    stride : int
        Distance in pixels between two consecutive patch origins.

    Returns
    -------
    list of int
        Offsets ``0, stride, 2*stride, ...`` such that every patch
        ``[offset, offset + patch_size)`` lies inside ``[0, image_size)``.
        Empty when not even one patch fits.
    """
    if image_size < patch_size:
        return []
    # Count the positions whose *end* is still inside the image. Dividing the
    # image size by the stride alone is only correct when stride == patch_size;
    # with a smaller stride it would produce offsets that overrun the image.
    n_patches = (image_size - patch_size) // stride + 1
    return [stride * k for k in range(n_patches)]


# scale factor -> patch start offsets along one dimension of the resized image
patch_idx = {}
for s in scale:
    patch_idx[s] = compute_patch_offsets(int(original_size * s), patch_size, stride)

In [32]:
# Print, for a single image dimension, the patch offsets obtained at each
# scale together with the pixels left uncovered and the patch count.
separator = "-------------------------------------"
print(separator)
print("Patches information along 1 dimension")
print(separator + "\n")

format_spec = "{:<8} {:<84} {:<18} {:<10}"
header_line = format_spec.format("scale:", "patch indexes:", "discarded pixels:", "number of patches:")
print(header_line, "\n")

for scale_factor, offsets in patch_idx.items():
    # pixels between the end of the last patch and the end of the resized image
    covered_pixels = offsets[-1] + patch_size
    discarded_pixels = int(original_size * scale_factor) - covered_pixels
    table_row = format_spec.format(
        str(scale_factor), str(offsets), str(discarded_pixels), str(len(offsets))
    )
    print(table_row)

-------------------------------------
Patches information along 1 dimension
-------------------------------------

scale:   patch indexes:                                                                       discarded pixels:  number of patches: 

1.25     [0, 224, 448, 672, 896, 1120, 1344, 1568, 1792, 2016, 2240, 2464, 2688]              148                13        
1        [0, 224, 448, 672, 896, 1120, 1344, 1568, 1792, 2016]                                208                10        
0.75     [0, 224, 448, 672, 896, 1120, 1344, 1568]                                            44                 8         
0.5      [0, 224, 448, 672, 896]                                                              104                5         


In [33]:
def create_patches(metadata, scale, dir_name, METADATA_PATCHES_PATH):
    """Cut one satellite image and its mask into square patches at several scales.

    For every factor in ``scale`` the image and the mask are resized to
    ``int(original_size * factor)`` pixels per side and tiled with the offsets
    stored in the module-level ``patch_idx[factor]``. Each image/mask patch
    pair is written under ``DATA_PATH/dir_name`` and one row per pair is
    appended to the CSV file ``METADATA_PATCHES_PATH``.

    Parameters
    ----------
    metadata : sequence
        ``(image_id, sat_image_path, mask_path)`` of one sample.
    scale : iterable of numbers
        Resize factors to process; each must be a key of ``patch_idx``.
    dir_name : str
        Output directory, relative to ``DATA_PATH``. It must already exist.
    METADATA_PATCHES_PATH : str
        CSV file that receives one ``[image_id, image_patch, mask_patch]`` row
        per patch pair (patch paths are relative to ``DATA_PATH``).

    Raises
    ------
    FileNotFoundError
        If the image or the mask cannot be read.
    ValueError
        If the image is not ``original_size`` x ``original_size`` or a resized
        image cannot contain all of its patches.
    OSError
        If a patch cannot be written to disk.
    """
    image_id, image_path, mask_path = metadata[0], metadata[1], metadata[2]

    # Keep OpenCV's native BGR channel order end to end: cv2.imwrite expects
    # BGR, so converting to RGB after reading would store every patch with the
    # red and blue channels swapped relative to the source files.
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising
        raise FileNotFoundError("Could not read image '{}'".format(image_path))
    mask = cv2.imread(mask_path)
    if mask is None:
        raise FileNotFoundError("Could not read mask '{}'".format(mask_path))

    # Both dimensions are checked: the square resize below would otherwise
    # silently distort a non-square image.
    if image.shape[0] != original_size or image.shape[1] != original_size:
        raise ValueError("The image size should be " + str(original_size))

    # Open the metadata file once per image rather than once per patch.
    with open(METADATA_PATCHES_PATH, "a", newline="") as f:
        writer = csv.writer(f)

        for s in scale:

            offsets = patch_idx[s]
            if not offsets:
                # no patch fits at this scale
                continue

            # resize image and mask
            scaled_size = int(original_size * s)
            scaled_image = cv2.resize(image, (scaled_size, scaled_size))
            # Nearest-neighbour keeps the mask colours exact; the default
            # bilinear interpolation blends neighbouring colours and creates
            # values that were not present in the original mask.
            scaled_mask = cv2.resize(
                mask, (scaled_size, scaled_size), interpolation=cv2.INTER_NEAREST
            )

            if scaled_size < offsets[-1] + patch_size:
                raise ValueError("The scaled image should contain all the patches")

            # round() guards against float error (e.g. 0.57 * 100 == 56.99...)
            scale_tag = str(int(round(s * 100)))

            for i in offsets:
                for j in offsets:

                    # get patches
                    scaled_image_patch = scaled_image[i:i+patch_size, j:j+patch_size, :]
                    scaled_mask_patch = scaled_mask[i:i+patch_size, j:j+patch_size, :]

                    # create paths (relative to DATA_PATH)
                    scaled_image_patch_path = dir_name + "/{}_sat_{}_{}_{}.jpg".format(
                        str(image_id), scale_tag, str(i), str(j))
                    scaled_mask_patch_path = dir_name + "/{}_mask_{}_{}_{}.png".format(
                        str(image_id), scale_tag, str(i), str(j))

                    # save patches; cv2.imwrite returns False on failure
                    # instead of raising, so check it explicitly
                    for rel_path, patch in (
                        (scaled_image_patch_path, scaled_image_patch),
                        (scaled_mask_patch_path, scaled_mask_patch),
                    ):
                        out_path = os.path.join(DATA_PATH, rel_path)
                        if not cv2.imwrite(out_path, patch):
                            raise OSError("Could not write patch '{}'".format(out_path))

                    # update the metadata of patches
                    writer.writerow([image_id, scaled_image_patch_path, scaled_mask_patch_path])

In [34]:
# Build the training patches at every configured scale.
train_samples = train_df[["image_id", "sat_image_path", "mask_path"]].values.tolist()
for sample in tqdm(train_samples):
    create_patches(
        metadata=sample,
        scale=scale,
        dir_name='train_patches',
        METADATA_PATCHES_PATH='./data/train_metadata_patches.csv',
    )

100%|██████████| 642/642 [12:24<00:00,  1.16s/it]


In [35]:
# Build the validation patches at the original scale only.
valid_samples = valid_df[["image_id", "sat_image_path", "mask_path"]].values.tolist()
for sample in tqdm(valid_samples):
    create_patches(
        metadata=sample,
        scale=[1],
        dir_name='valid_patches',
        METADATA_PATCHES_PATH='./data/valid_metadata_patches.csv',
    )

100%|██████████| 80/80 [00:35<00:00,  2.28it/s]


In [36]:
# Build the test patches at the original scale only.
test_samples = test_df[["image_id", "sat_image_path", "mask_path"]].values.tolist()
for sample in tqdm(test_samples):
    create_patches(
        metadata=sample,
        scale=[1],
        dir_name='test_patches',
        METADATA_PATCHES_PATH='./data/test_metadata_patches.csv',
    )

100%|██████████| 81/81 [00:36<00:00,  2.24it/s]
