In [3]:
import os
import sys

import pandas as pd

In [4]:
# Root folder of the downloaded dataset; the metadata CSV is read from here and
# the image/mask paths listed in that CSV are joined onto this folder.
RAW_DATA_PATH = "data/raw/deepglobe-2018-dataset/"
# CSV (inside RAW_DATA_PATH) with one row per image: id, split and relative file paths.
METADATA_FILE = "metadata.csv"
# NOTE(review): not referenced by any cell visible in this notebook; presumably
# the class/colour lookup table shipped with the dataset -- confirm before relying on it.
CLASS_FILE = "class_dict.csv"

# Root folder that receives the reorganised <split>/images and <split>/masks trees.
TARGET_DATA_PATH = "data/splitted/deepglobe-2018-dataset/"

In [5]:
# Read the dataset's metadata table (image id, split name and the relative
# paths of the satellite image and its mask) and show it as the cell output.
metadata_csv_path = os.path.join(RAW_DATA_PATH, METADATA_FILE)
metadata_df = pd.read_csv(metadata_csv_path)
metadata_df

Unnamed: 0,image_id,split,sat_image_path,mask_path
0,100694,train,train/100694_sat.jpg,train/100694_mask.png
1,102122,train,train/102122_sat.jpg,train/102122_mask.png
2,10233,train,train/10233_sat.jpg,train/10233_mask.png
3,103665,train,train/103665_sat.jpg,train/103665_mask.png
4,103730,train,train/103730_sat.jpg,train/103730_mask.png
...,...,...,...,...
1141,979233,test,test/979233_sat.jpg,
1142,983689,test,test/983689_sat.jpg,
1143,988205,test,test/988205_sat.jpg,
1144,989953,test,test/989953_sat.jpg,


In [6]:
# Create the images/ and masks/ folders of every split under the target root.
# exist_ok=True keeps the cell safe to re-run.
for sub_dir in (
    "train/images",
    "train/masks",
    "valid/images",
    "valid/masks",
    "test/images",
    "test/masks",
):
    os.makedirs(os.path.join(TARGET_DATA_PATH, sub_dir), exist_ok=True)

In [7]:
# Flatten the metadata table into a list with one plain dict per row
# (keys are the column names), then display it.
meta_list = metadata_df.to_dict("records")
meta_list

[{'image_id': 100694,
  'split': 'train',
  'sat_image_path': 'train/100694_sat.jpg',
  'mask_path': 'train/100694_mask.png'},
 {'image_id': 102122,
  'split': 'train',
  'sat_image_path': 'train/102122_sat.jpg',
  'mask_path': 'train/102122_mask.png'},
 {'image_id': 10233,
  'split': 'train',
  'sat_image_path': 'train/10233_sat.jpg',
  'mask_path': 'train/10233_mask.png'},
 {'image_id': 103665,
  'split': 'train',
  'sat_image_path': 'train/103665_sat.jpg',
  'mask_path': 'train/103665_mask.png'},
 {'image_id': 103730,
  'split': 'train',
  'sat_image_path': 'train/103730_sat.jpg',
  'mask_path': 'train/103730_mask.png'},
 {'image_id': 104113,
  'split': 'train',
  'sat_image_path': 'train/104113_sat.jpg',
  'mask_path': 'train/104113_mask.png'},
 {'image_id': 10452,
  'split': 'train',
  'sat_image_path': 'train/10452_sat.jpg',
  'mask_path': 'train/10452_mask.png'},
 {'image_id': 10901,
  'split': 'train',
  'sat_image_path': 'train/10901_sat.jpg',
  'mask_path': 'train/10901_mask.

In [8]:


def _is_usable_path(value):
    """Return True when a metadata cell holds a non-empty path string.

    Rows without a mask carry a missing value (float NaN or None) instead of
    a string, so a type check rejects None, NaN and "" in one go. It does not
    depend on NaN object identity, nor on the ``numpy.NaN`` alias that was
    removed in NumPy 2.0.
    """
    return isinstance(value, str) and value != ""


# Move every raw satellite image -- and its mask, when the metadata lists one --
# into <TARGET_DATA_PATH>/<split>/images and <TARGET_DATA_PATH>/<split>/masks.
for image_dict in meta_list:
    split_name = image_dict['split']

    sat_source = os.path.join(RAW_DATA_PATH, image_dict['sat_image_path'])
    # Files that are not present in the raw folder are skipped silently.
    if os.path.exists(sat_source):
        sat_image_name = image_dict['sat_image_path'].split("/")[-1]
        os.rename(sat_source, os.path.join(
            TARGET_DATA_PATH, split_name, f"images/{sat_image_name}"))

    mask_path = image_dict['mask_path']
    if not _is_usable_path(mask_path):
        # No mask listed for this row: nothing more to move.
        continue

    mask_source = os.path.join(RAW_DATA_PATH, mask_path)
    if os.path.exists(mask_source):
        mask_image_name = mask_path.split("/")[-1]
        os.rename(mask_source, os.path.join(
            TARGET_DATA_PATH, split_name, f"masks/{mask_image_name}"))

In [9]:
# Strip the "_sat" / "_mask" postfix from the image file names of every split.
splits = ["train", "test", "valid"]


def _drop_postfix(dir_path, file_names, postfix):
    """Rename each listed .jpg/.png/.jpeg file in dir_path with postfix removed from its name."""
    for file_name in file_names:
        if not file_name.endswith((".jpg", ".png", ".jpeg")):
            continue
        stripped_name = file_name.replace(postfix, "")
        os.rename(os.path.join(dir_path, file_name), os.path.join(dir_path, stripped_name))


for split in splits:
    image_dir_path = os.path.join(TARGET_DATA_PATH, split, "images/")
    mask_dir_path = os.path.join(TARGET_DATA_PATH, split, "masks/")

    # Both listings are taken before any file is renamed.
    image_dir, mask_dir = os.listdir(image_dir_path), os.listdir(mask_dir_path)

    _drop_postfix(image_dir_path, image_dir, "_sat")
    _drop_postfix(mask_dir_path, mask_dir, "_mask")

In [10]:
from sklearn.model_selection import train_test_split

image_dir_path = os.path.join(TARGET_DATA_PATH, "train", "images/")
mask_dir_path = os.path.join(TARGET_DATA_PATH, "train", "masks/")

# Sorting both listings is what lines image i up with mask i.
image_files = sorted(os.listdir(image_dir_path))
mask_files = sorted(os.listdir(mask_dir_path))

# The split below pairs images and masks purely by position, so verify that
# every image has a mask with the same base name before trusting that pairing.
image_stems = [os.path.splitext(file_name)[0] for file_name in image_files]
mask_stems = [os.path.splitext(file_name)[0] for file_name in mask_files]
if image_stems != mask_stems:
    unpaired = sorted(set(image_stems) ^ set(mask_stems))
    raise ValueError(
        f"train images and masks are not one-to-one "
        f"({len(image_files)} images, {len(mask_files)} masks); "
        f"unpaired names (first 10): {unpaired[:10]}"
    )

# Train-validation split (80% train, 20% validation); the fixed random_state
# makes the selection reproducible across runs.
train_images, val_images, train_masks, val_masks = train_test_split(
    image_files, mask_files, test_size=0.2, random_state=42
)

In [11]:

# Destination folders for the held-out validation pairs.
# NOTE(review): this is "val", not the "valid" tree created earlier in the
# notebook; confirm that downstream code reads from "val".
val_image_dir_path = os.path.join(TARGET_DATA_PATH, "val", "images/")
val_mask_dir_path = os.path.join(TARGET_DATA_PATH, "val", "masks/")

# os.rename does not create missing parent directories, and only the "train",
# "valid" and "test" trees are created earlier, so make the "val" tree here.
os.makedirs(val_image_dir_path, exist_ok=True)
os.makedirs(val_mask_dir_path, exist_ok=True)

# Move each selected image together with its mask out of the train folders.
for img, mask in zip(val_images, val_masks):
    os.rename(os.path.join(image_dir_path, img), os.path.join(val_image_dir_path, img))
    os.rename(os.path.join(mask_dir_path, mask), os.path.join(val_mask_dir_path, mask))