# Road Segmentation Model Dataset Preprocessing

In [2]:
# import libraries and files
import os
from PIL import Image as I
import numpy as np
from pathlib import Path as P

Create Folders

In [4]:
# NOTE: "__file__" is a string literal here, not the __file__ variable, so
# os.path.realpath() resolves it against the current working directory.
# The first .parent is therefore the working directory itself and the
# project root ends up two levels above it.
project_root = P(os.path.realpath("__file__")).parent.parent.parent

# Destination folders for the preprocessed (split) dataset.
train_sat_loc = os.path.join(project_root, "assets/preprocessed_ML_dataset/train/sat")
train_mask_loc = os.path.join(project_root, "assets/preprocessed_ML_dataset/train/mask")
test_sat_loc = os.path.join(project_root, "assets/preprocessed_ML_dataset/test/sat")
test_mask_loc = os.path.join(project_root, "assets/preprocessed_ML_dataset/test/mask")
valid_sat_loc = os.path.join(project_root, "assets/preprocessed_ML_dataset/valid/sat")
valid_mask_loc = os.path.join(project_root, "assets/preprocessed_ML_dataset/valid/mask")

loc = [train_sat_loc, train_mask_loc, test_sat_loc, test_mask_loc, valid_sat_loc, valid_mask_loc]

# exist_ok=True makes the cell safe to re-run and avoids the race between
# checking for a folder and creating it.
for directory in loc:
    os.makedirs(directory, exist_ok=True)

Original dataset location

In [5]:
# "__file__" is a string literal, so realpath() resolves it against the current
# working directory; the dataset parent is three levels above that directory.
dataset_parent = P(os.path.realpath("__file__")).parent.parent.parent.parent

# Every split of the original dataset lives under this sub-folder.
tiff_subdir = "CV/dataset massachusetts/archive/tiff"

test_data_loc = os.path.join(dataset_parent, tiff_subdir + "/test")
test_labels_data_loc = os.path.join(dataset_parent, tiff_subdir + "/test_labels")
train_data_loc = os.path.join(dataset_parent, tiff_subdir + "/train")
train_labels_data_loc = os.path.join(dataset_parent, tiff_subdir + "/train_labels")
valid_data_loc = os.path.join(dataset_parent, tiff_subdir + "/val")
valid_labels_data_loc = os.path.join(dataset_parent, tiff_subdir + "/val_labels")

Original dataset files

In [6]:
# os.listdir() returns names in arbitrary order. The splitting cells further
# down pair the i-th sat image with the i-th mask, so both listings of a split
# are sorted to keep that pairing stable across runs and file systems.
# NOTE(review): this assumes an image and its mask share the same base name --
# confirm against the dataset.
test_files = sorted(os.listdir(test_data_loc))
testLen = len(test_files)
print("Found {} images as test sat images.".format(testLen))

test_labels_files = sorted(os.listdir(test_labels_data_loc))
print("Found {} images as test masks.".format(len(test_labels_files)))

train_files = sorted(os.listdir(train_data_loc))
trainLen = len(train_files)
print("Found {} images as train sat images.".format(trainLen))

train_labels_files = sorted(os.listdir(train_labels_data_loc))
print("Found {} images as train masks.".format(len(train_labels_files)))

valid_files = sorted(os.listdir(valid_data_loc))
validLen = len(valid_files)
print("Found {} images as validation sat images.".format(validLen))

valid_labels_files = sorted(os.listdir(valid_labels_data_loc))
print("Found {} images as validation masks.".format(len(valid_labels_files)))

# Index-based pairing only makes sense when every image has exactly one mask.
assert len(test_labels_files) == testLen, "test sat images and masks differ in count"
assert len(train_labels_files) == trainLen, "train sat images and masks differ in count"
assert len(valid_labels_files) == validLen, "validation sat images and masks differ in count"

Found 49 images as test sat images.
Found 49 images as test masks.
Found 1108 images as train sat images.
Found 1108 images as train masks.
Found 14 images as validation sat images.
Found 14 images as validation masks.


Function to convert an image array to an image file and save it at the desired location (the file extension passed in, e.g. `.tiff` or `.tif`, is appended to the file name)

In [7]:
def save(imgArr, destination, savedImgs, format):
    """Write an image array to disk under a 1-based numeric file name.

    Parameters:
        imgArr: array handed to ``I.fromarray`` to build the image.
        destination: folder path (string) the file is written into.
        savedImgs: number of images saved so far; the new file is named
            ``savedImgs + 1``.
        format: file-name suffix appended verbatim (callers in this notebook
            pass ".tiff" or ".tif").
    """
    fileName = str(savedImgs + 1) + format
    I.fromarray(imgArr).save(destination + "/" + fileName)

Function to split the dataset images and masks
* We wish to split the sat-image and its mask of size 1500x1500 into 36 parts of size 256x256.
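* The parts on the right and bottom boundaries extend 36 pixels past the image edge (6 × 256 = 1536 > 1500); this padding is filled with white in the sat-image and black in the mask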
* Many images in the dataset are partly empty (white), yet their masks still contain road markings inside the empty region. Those road markings are removed as well

In [8]:
def _loadRgbArray(imgpath):
    """Open the image at ``imgpath`` and return its pixels as an RGB array."""
    # The context manager closes the underlying file as soon as the pixels
    # have been read, instead of leaving the handle open.
    with I.open(imgpath) as img:
        return np.asarray(img.convert("RGB"))


def _cutTile(satimg, mask, rowStart, colStart, tileSize):
    """Cut one ``tileSize`` x ``tileSize`` tile out of the image and its mask.

    Returns ``(newSatImg, newMask, isBlank)``. Area outside the source image
    stays white in the sat tile and black in the mask tile. Mask pixels are
    copied only where the sat pixel is not pure white. ``isBlank`` is True
    when the tile holds no non-white sat pixel at all.
    """
    # NumPy clamps slices at the array bounds, so edge tiles simply come out
    # smaller (possibly empty) instead of raising an IndexError.
    satPatch = satimg[rowStart:rowStart + tileSize, colStart:colStart + tileSize]
    maskPatch = mask[rowStart:rowStart + tileSize, colStart:colStart + tileSize]
    patchRows, patchCols = satPatch.shape[:2]

    newSatImg = np.full((tileSize, tileSize, 3), 255, dtype = np.uint8) # complete white
    newMask = np.full((tileSize, tileSize, 3), 0, dtype = np.uint8) # complete black

    # True where at least one channel differs from 255, i.e. the pixel is not white.
    notWhite = ~np.all(satPatch == 255, axis = -1)

    # Copying the whole patch equals copying only the non-white pixels,
    # because the skipped pixels are white and so is the tile background.
    newSatImg[:patchRows, :patchCols] = satPatch
    # Road markings that sit on white (empty) sat pixels are dropped.
    newMask[:patchRows, :patchCols][notWhite] = maskPatch[notWhite]

    return newSatImg, newMask, not notWhite.any()


def splitterAndSaver(satimgpath, 
                    maskpath, 
                    satDestination, 
                    maskDestination, 
                    savedImgsAlready,
                    tileSize = 256,
                    tilesPerSide = 6):
    """Split a sat image and its mask into tiles and save the non-blank pairs.

    The image is walked as a ``tilesPerSide`` x ``tilesPerSide`` grid of
    ``tileSize`` x ``tileSize`` tiles, row by row. A tile whose sat image is
    completely white is skipped. Every other tile is written through ``save``
    as ".tiff" (sat) and ".tif" (mask) under the same running number.

    Parameters:
        satimgpath: path of the satellite image.
        maskpath: path of the matching mask image.
        satDestination: folder that receives the sat tiles.
        maskDestination: folder that receives the mask tiles.
        savedImgsAlready: number of tile pairs saved before this call.
        tileSize: edge length of a tile in pixels (default 256).
        tilesPerSide: number of tiles along each image axis (default 6).

    Returns:
        The updated count of saved tile pairs.

    Raises:
        ValueError: if the sat image and the mask differ in height or width.
    """
    satimg = _loadRgbArray(satimgpath)
    mask = _loadRgbArray(maskpath)
    if satimg.shape[:2] != mask.shape[:2]:
        raise ValueError(
            "sat image {} and mask {} differ in size: {} vs {}".format(
                satimgpath, maskpath, satimg.shape[:2], mask.shape[:2]))

    for i in range(tilesPerSide):
        for j in range(tilesPerSide):
            newSatImg, newMask, newSatImgIsBlank = _cutTile(
                satimg, mask, i * tileSize, j * tileSize, tileSize)
            if newSatImgIsBlank:
                continue
            save(newSatImg, satDestination, savedImgsAlready, ".tiff")
            save(newMask, maskDestination, savedImgsAlready, ".tif")
            savedImgsAlready += 1
    return savedImgsAlready

### Testing Dataset Images Splitting
```savedImgsAlready``` is an integer counting how many new sat-mask image pairs have been formed and saved so far. Tiles whose satellite image is completely white are not saved.

In [7]:
# Split every test image/mask pair and keep a running count of the saved tiles.
savedImgsAlready = 0
progressMsg = "{}/{} sat-mask images split. Generated {} sat-mask image pairs."

for done in range(1, testLen + 1):
    satName = test_files[done - 1]
    maskName = test_labels_files[done - 1]
    savedImgsAlready = splitterAndSaver(
        satimgpath = os.path.join(test_data_loc, satName),
        maskpath = os.path.join(test_labels_data_loc, maskName),
        satDestination = test_sat_loc,
        maskDestination = test_mask_loc,
        savedImgsAlready = savedImgsAlready
    )
    # Report every 5 pairs; the counts are doubled because each pair is two files.
    if done % 5 == 0:
        print(progressMsg.format(2 * done, 2 * testLen, savedImgsAlready))

print(progressMsg.format(2 * testLen, 2 * testLen, savedImgsAlready))

10/98 sat-mask images split. Generated 180 sat-mask image pairs.
20/98 sat-mask images split. Generated 360 sat-mask image pairs.
30/98 sat-mask images split. Generated 540 sat-mask image pairs.
40/98 sat-mask images split. Generated 720 sat-mask image pairs.
50/98 sat-mask images split. Generated 900 sat-mask image pairs.
60/98 sat-mask images split. Generated 1080 sat-mask image pairs.
70/98 sat-mask images split. Generated 1260 sat-mask image pairs.
80/98 sat-mask images split. Generated 1440 sat-mask image pairs.
90/98 sat-mask images split. Generated 1620 sat-mask image pairs.
98/98 sat-mask images split. Generated 1764 sat-mask image pairs.


### Validation Dataset Images Splitting

In [8]:
# Split every validation image/mask pair and keep a running count of the saved tiles.
savedImgsAlready = 0
progressMsg = "{}/{} sat-mask images split. Generated {} sat-mask image pairs."

for done in range(1, validLen + 1):
    satName = valid_files[done - 1]
    maskName = valid_labels_files[done - 1]
    savedImgsAlready = splitterAndSaver(
        satimgpath = os.path.join(valid_data_loc, satName),
        maskpath = os.path.join(valid_labels_data_loc, maskName),
        satDestination = valid_sat_loc,
        maskDestination = valid_mask_loc,
        savedImgsAlready = savedImgsAlready
    )
    # Report every 5 pairs; the counts are doubled because each pair is two files.
    if done % 5 == 0:
        print(progressMsg.format(2 * done, 2 * validLen, savedImgsAlready))

print(progressMsg.format(2 * validLen, 2 * validLen, savedImgsAlready))

10/28 sat-mask images split. Generated 180 sat-mask image pairs.
20/28 sat-mask images split. Generated 360 sat-mask image pairs.
28/28 sat-mask images split. Generated 504 sat-mask image pairs.


### Training Dataset Images Splitting

In [9]:
# Split every training image/mask pair and keep a running count of the saved tiles.
savedImgsAlready = 0
progressMsg = "{}/{} sat-mask images split. Generated {} sat-mask image pairs."

for done in range(1, trainLen + 1):
    satName = train_files[done - 1]
    maskName = train_labels_files[done - 1]
    savedImgsAlready = splitterAndSaver(
        satimgpath = os.path.join(train_data_loc, satName),
        maskpath = os.path.join(train_labels_data_loc, maskName),
        satDestination = train_sat_loc,
        maskDestination = train_mask_loc,
        savedImgsAlready = savedImgsAlready
    )
    # Report every 100 pairs; the counts are doubled because each pair is two files.
    if done % 100 == 0:
        print(progressMsg.format(2 * done, 2 * trainLen, savedImgsAlready))

print(progressMsg.format(2 * trainLen, 2 * trainLen, savedImgsAlready))

200/2216 sat-mask images split. Generated 3288 sat-mask image pairs.
400/2216 sat-mask images split. Generated 6689 sat-mask image pairs.
600/2216 sat-mask images split. Generated 9495 sat-mask image pairs.
800/2216 sat-mask images split. Generated 12512 sat-mask image pairs.
1000/2216 sat-mask images split. Generated 15670 sat-mask image pairs.
1200/2216 sat-mask images split. Generated 18923 sat-mask image pairs.
1400/2216 sat-mask images split. Generated 22384 sat-mask image pairs.
1600/2216 sat-mask images split. Generated 25731 sat-mask image pairs.
1800/2216 sat-mask images split. Generated 29126 sat-mask image pairs.
2000/2216 sat-mask images split. Generated 32488 sat-mask image pairs.
2200/2216 sat-mask images split. Generated 35675 sat-mask image pairs.
2216/2216 sat-mask images split. Generated 35868 sat-mask image pairs.
