# Data preparation

- This notebook preprocesses data for the [HiSup](https://github.com/SarahwXU/HiSup) and [SAM-Adapter](https://github.com/tianrun-chen/SAM-Adapter-PyTorch) approaches.
- The code is mainly adapted from [HiSup](https://github.com/SarahwXU/HiSup).
- This workflow only supports binary segmentation. Feel free to adapt it for multiclass segmentation.
- You can upscale images (4 times) with a super-resolution model ([EDSR](https://github.com/aswintechguy/Deep-Learning-Projects/tree/main/Super%20Resolution%20-%20OpenCV)).
- The default structure of your data is:<br>
    Dataset1<br>
    - raw
        - train
            - images  (GeoTIFF, uint8, 3 bands (RGB); you can enhance the image data in GIS software in advance, e.g., standard deviation)
            - gt      (GeoTIFF, uint8, values: 0 (background), 255 (targets))
        - test
            - images
            - gt
        - val
            - images
            - gt<br>
    
  Dataset2<br>
  Dataset3<br>
  ... ...

In [1]:
# Set up the working directory, imports, and the dataset / split lists used below.
import glob
import os
import pathlib
from pathlib import Path

from DataProcessing import (
    data_process_hisup,
    data_process_sam,
    upscale_img_sr,
    upscale_img_nearest,
    upscale_img_bilinear,
    upscale_lab,
    set_sr_model,
    upscale_testing_data_nearest_bilinear,
    upscale_testing_data_SR,
)

path = os.getcwd() # your current working directory where your codes are stored.
print(path)

# print all datasets
path_database = "./data" # path of dataset

# Keep only real dataset folders: os.listdir also returns stray files and hidden
# folders (e.g. .ipynb_checkpoints, .DS_Store), which would otherwise be treated
# as datasets by the processing loops further down.
data_list = sorted(
    name
    for name in os.listdir(path_database)
    if not name.startswith(".") and os.path.isdir(os.path.join(path_database, name))
)

type_list = ['train_large', 'train_small', 'val'] # 'test',
data_list, type_list

/home/yunya/anaconda3/envs/sam/SAM_Adapter


(['Dagaha2017',
  'Djibo2019',
  'Kutupalong2018',
  'Minawao2017',
  'Nduta2017',
  'Ngu2017',
  'Ngu2018'],
 ['train_large', 'train_small', 'val'])

## Data preparation for SAM Adapter

In [11]:
# Run the SAM-Adapter preprocessing helper for every dataset and split
# at a patch size of 1024.
patchsize_list = [1024]

for dataset in data_list:
    # The dataset folder is the same for every split and patch size.
    path_dataset = os.path.join(path_database, dataset)

    for dtype in type_list:
        for patch_size in patchsize_list:
            print("Start processing: " + " ".join([dataset, dtype, str(patch_size)]))

            data_process_sam(path_dataset, dtype, patch_size)
            print("Done")

Start processing: Dagaha2017 train_large 1024


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [01:44<00:00, 14.92s/it]


Done
Start processing: Dagaha2017 train_small 1024


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:14<00:00,  2.08s/it]


Done
Start processing: Dagaha2017 val 1024


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00,  8.12it/s]


Done
Start processing: Djibo2019 train_large 1024


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [01:35<00:00, 13.66s/it]


Done
Start processing: Djibo2019 train_small 1024


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:17<00:00,  2.55s/it]


Done
Start processing: Djibo2019 val 1024


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00,  7.28it/s]


Done
Start processing: Kutupalong2018 train_large 1024


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [10:44<00:00, 92.04s/it]


Done
Start processing: Kutupalong2018 train_small 1024


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [02:19<00:00, 19.92s/it]


Done
Start processing: Kutupalong2018 val 1024


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:39<00:00,  5.69s/it]


Done
Start processing: Minawao2017 train_large 1024


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [01:11<00:00, 10.23s/it]


Done
Start processing: Minawao2017 train_small 1024


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:16<00:00,  2.41s/it]


Done
Start processing: Minawao2017 val 1024


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00,  8.36it/s]


Done
Start processing: Nduta2017 train_large 1024


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [07:14<00:00, 62.11s/it]


Done
Start processing: Nduta2017 train_small 1024


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [01:36<00:00, 13.83s/it]


Done
Start processing: Nduta2017 val 1024


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:25<00:00,  3.60s/it]


Done
Start processing: Ngu2017 train_large 1024


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:51<00:00,  7.35s/it]


Done
Start processing: Ngu2017 train_small 1024


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:13<00:00,  1.92s/it]


Done
Start processing: Ngu2017 val 1024


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00,  7.65it/s]


Done
Start processing: Ngu2018 train_large 1024


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [01:05<00:00,  9.36s/it]


Done
Start processing: Ngu2018 train_small 1024


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:16<00:00,  2.32s/it]


Done
Start processing: Ngu2018 val 1024


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:01<00:00,  6.27it/s]

Done





In [12]:
# Run the SAM-Adapter preprocessing helper at a patch size of 256, restricted to
# the four datasets and two splits listed here.
patchsize_list = [256]
data_sr_list = [
    'Dagaha2017',
    'Djibo2019',
    'Minawao2017',
    'Ngu2018',
]

type_sr_list = ['train_small', 'val']

for dataset in data_sr_list:
    # The dataset folder is the same for every split and patch size.
    path_dataset = os.path.join(path_database, dataset)

    for dtype in type_sr_list:
        for patch_size in patchsize_list:
            print("Start processing: " + " ".join([dataset, dtype, str(patch_size)]))

            data_process_sam(path_dataset, dtype, patch_size)
            print("Done")

Start processing: Dagaha2017 train_small 256


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:13<00:00,  1.87s/it]


Done
Start processing: Dagaha2017 val 256


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:03<00:00,  2.07it/s]


Done
Start processing: Djibo2019 train_small 256


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:17<00:00,  2.47s/it]


Done
Start processing: Djibo2019 val 256


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:03<00:00,  1.93it/s]


Done
Start processing: Minawao2017 train_small 256


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:11<00:00,  1.58s/it]


Done
Start processing: Minawao2017 val 256


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:02<00:00,  2.44it/s]


Done
Start processing: Ngu2018 train_small 256


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:12<00:00,  1.78s/it]


Done
Start processing: Ngu2018 val 256


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:02<00:00,  2.67it/s]

Done





## Upscale images (256x256) to 1024x1024

In [4]:
# Upscale the 256-pixel training patches and their labels for the SAM workflow.
# For each dataset/split the results are written below
#   <path_database>/<dataset>/SAM/SR/<dtype>/gt                 (labels)
#   <path_database>/<dataset>/SAM/SR/<dtype>/<method>/images    (one folder per method)
model_name = "SAM"

data_list = ['Dagaha2017',
             'Djibo2019',
             'Minawao2017',
             'Ngu2018']

type_list = ['train_small']#['train_small', "val"]
patchsize_list = [256]
upscaled_list = ['nearest', 'bilinear', 'EDSR']

# Interpolation-based upscalers, keyed by their output folder name.
interp_upscalers = {
    'nearest': upscale_img_nearest,
    'bilinear': upscale_img_bilinear,
}

# Set up every super-resolution model once, rather than once per image.
# Any name that is not an interpolation method is handed to set_sr_model.
# Assumes the object returned by set_sr_model can be reused across images — TODO confirm.
sr_models = {
    name: set_sr_model(name)
    for name in upscaled_list
    if name not in interp_upscalers
}

for dataset in data_list:
    for dtype in type_list:
        for patch_size in patchsize_list:

            path_patches = os.path.join(path_database, dataset, model_name, str(patch_size), dtype)
            path_dataset = os.path.join(path_patches, "images")

            # read all image names
            img_name_list = os.listdir(path_dataset)

            print("Start: {:<15}   {:<6}  {}".format(len(img_name_list), dataset, dtype))

            # Output folders do not depend on the image, so create them once here.
            path_sr = os.path.join(path_database, dataset, model_name, "SR", dtype)
            path_ups_lab = os.path.join(path_sr, "gt")
            Path(path_ups_lab).mkdir(parents=True, exist_ok=True)
            for upscaled_folder in upscaled_list:
                Path(os.path.join(path_sr, upscaled_folder, "images")).mkdir(parents=True, exist_ok=True)

            for img_name in img_name_list:

                # upscale labels (the label file is looked up under the image's file name)
                path_in_lab = os.path.join(path_patches, "gt", img_name)
                path_out_lab = os.path.join(path_ups_lab, img_name)
                upscale_lab(path_in_lab, path_out_lab)

                path_in_img = os.path.join(path_dataset, img_name)

                for upscaled_folder in upscaled_list:
                    path_out_img = os.path.join(path_sr, upscaled_folder, "images", img_name)

                    if upscaled_folder in interp_upscalers:
                        interp_upscalers[upscaled_folder](path_in_img, path_out_img)
                    else:
                        # upscale images with the SR model that was set up before the loops
                        upscale_img_sr(path_in_img, path_out_img, sr_models[upscaled_folder])

            print("Done")

Start: 686               Dagaha2017  train_small
Done
Start: 784               Djibo2019  train_small
Done
Start: 504               Minawao2017  train_small
Done
Start: 588               Ngu2018  train_small
Done


In [7]:
# Debug check: displays the current value of the global loop variable `upscaled_folder`.
# NOTE(review): the saved output ('FSRCNN') is not an entry of `upscaled_list`
# (['nearest', 'bilinear', 'EDSR']) in the cell above, so it probably stems from an
# earlier run — consider re-running or deleting this cell.
upscaled_folder

'FSRCNN'

## Upscale testing data

In [3]:
# Datasets processed by the test-data upscaling cells in this section.
data_list = [
    'Dagaha2017',
    'Djibo2019',
    'Minawao2017',
    'Ngu2018',
]

In [6]:
# Upscale the test images with nearest-neighbour and bilinear interpolation.
# Each (dataset, method) pair is handed to the helper from DataProcessing; per the
# saved output it writes <dataset>/SAM/SR/test/<method>/test_up.tif and prints the
# path and array shape.
upscale_list = ['nearest', 'bilinear']
data_type = "images"  # selects the image rasters (the ground-truth cell uses "gt")

for dataset in data_list:
    for upscaled_folder in upscale_list:
        upscale_testing_data_nearest_bilinear(path_database, dataset, upscaled_folder, data_type)

./data/Dagaha2017/SAM/SR/test/nearest/test_up.tif
(3, 12404, 19132)
./data/Dagaha2017/SAM/SR/test/bilinear/test_up.tif
(3, 12404, 19132)
./data/Djibo2019/SAM/SR/test/nearest/test_up.tif
(3, 11696, 15404)
./data/Djibo2019/SAM/SR/test/bilinear/test_up.tif
(3, 11696, 15404)
./data/Minawao2017/SAM/SR/test/nearest/test_up.tif
(3, 12660, 7268)
./data/Minawao2017/SAM/SR/test/bilinear/test_up.tif
(3, 12660, 7268)
./data/Ngu2018/SAM/SR/test/nearest/test_up.tif
(3, 9616, 19392)
./data/Ngu2018/SAM/SR/test/bilinear/test_up.tif
(3, 9616, 19392)


In [7]:
# Upscale the ground-truth rasters of the test data.
# Per the saved output the result is written to <dataset>/SAM/SR/test/gt/test_up.tif.
# NOTE(review): 'bilinear' is passed for a mask documented as 0/255; if the helper
# really interpolates bilinearly, intermediate values may appear along object edges —
# confirm inside upscale_testing_data_nearest_bilinear or switch to 'nearest'.
# NOTE(review): in the saved outputs the upscaled GT and image shapes differ for
# Dagaha2017 ((1, 12408, 19128) vs (3, 12404, 19132)) and Ngu2018
# ((1, 9612, 19392) vs (3, 9616, 19392)) — verify that images and GT are aligned.

upscale_list = ['bilinear']
data_type = "gt"

for dataset in data_list:
    for upscaled_folder in upscale_list:
        upscale_testing_data_nearest_bilinear(path_database, dataset, upscaled_folder, data_type)

./data/Dagaha2017/SAM/SR/test/gt/test_up.tif
(1, 12408, 19128)
./data/Djibo2019/SAM/SR/test/gt/test_up.tif
(1, 11696, 15404)
./data/Minawao2017/SAM/SR/test/gt/test_up.tif
(1, 12660, 7268)
./data/Ngu2018/SAM/SR/test/gt/test_up.tif
(1, 9612, 19392)


#### Upscale images with super resolution (SR)

In [None]:
# Upscale the test images of every dataset in `data_list` with the EDSR
# super-resolution model via the helper from DataProcessing.
# NOTE(review): this cell has no execution count and the saved output only shows
# Ngu2018 — the run may have been partial or interrupted; re-run to confirm that
# all datasets were processed.
upscaled_folder = "EDSR"
data_type = "images"

for dataset in data_list:
    upscale_testing_data_SR(path_database, dataset, upscaled_folder, data_type)

./data/Ngu2018/SAM/SR/test/EDSR/test_up.tif
num_patches_height: 5, num_patches_width: 10


# Print number of patches for each type of created data

### SAM

In [14]:
# Print the number of 1024-pixel patches created for every dataset and split.
model_name = "SAM"

patchsize_list = [1024]

# Build the dataset and split lists here instead of reading the globals
# `data_list` / `type_list`: the upscaling cells above overwrite both with
# narrower lists, so after "Restart & Run All" this summary would otherwise
# cover only a subset of what was created.
datasets_1024 = sorted(
    name
    for name in os.listdir(path_database)
    if not name.startswith(".") and os.path.isdir(os.path.join(path_database, name))
)
types_1024 = ['train_large', 'train_small', 'val']

for dataset in datasets_1024:
    for dtype in types_1024:
        for patch_size in patchsize_list:

            path_dataset = os.path.join(path_database, dataset, model_name, str(patch_size), dtype, "images")
            alldata_list = os.listdir(path_dataset)

            # columns: patch count, dataset, patch size, split
            print("{:<7}   {:<15}  {:<6}  {}".format(len(alldata_list), dataset, patch_size, dtype))

350       Dagaha2017       1024    train_large
56        Dagaha2017       1024    train_small
7         Dagaha2017       1024    val
280       Djibo2019        1024    train_large
56        Djibo2019        1024    train_small
7         Djibo2019        1024    val
1848      Kutupalong2018   1024    train_large
420       Kutupalong2018   1024    train_small
112       Kutupalong2018   1024    val
224       Minawao2017      1024    train_large
56        Minawao2017      1024    train_small
7         Minawao2017      1024    val
1176      Nduta2017        1024    train_large
224       Nduta2017        1024    train_small
63        Nduta2017        1024    val
224       Ngu2017          1024    train_large
56        Ngu2017          1024    train_small
7         Ngu2017          1024    val
224       Ngu2018          1024    train_large
56        Ngu2018          1024    train_small
7         Ngu2018          1024    val


In [18]:
# Print the number of 256-pixel patches for the datasets and splits listed here.
model_name = "SAM"

patchsize_list = [256]
data_list = [
    'Dagaha2017',
    'Djibo2019',
    'Minawao2017',
    'Ngu2018',
]
type_list = ["train_small"]

# columns: patch count, dataset, patch size, split
row_format = "{:<7}   {:<15}  {:<6}  {}"

for dataset in data_list:
    for dtype in type_list:
        for patch_size in patchsize_list:
            path_dataset = os.path.join(
                path_database, dataset, model_name, str(patch_size), dtype, "images"
            )
            alldata_list = os.listdir(path_dataset)

            print(row_format.format(len(alldata_list), dataset, patch_size, dtype))

686       Dagaha2017       256     train_small
784       Djibo2019        256     train_small
504       Minawao2017      256     train_small
588       Ngu2018          256     train_small
