This fils is to create training dataset using 5K cleaned annotation
- Drop 'delete' rows
- Inclue 'uncertflag_qc' rows

This code does not distinguish between different PV types.
All PV types are encoded as same category
  0: PV_all_types

In [None]:
# Data Sanity Check
import geopandas as gpd

# Load data once
df = gpd.read_file("/home/il72/cape_town_year_of_installation/YOLO_CapeTown_5K_3categories(July_10)/Final_annotations_July10.gpkg")
print("Total rows:", len(df))

# Columns to check
cols = [
    "PV_normal_qc",
    "PV_heater_qc",
    "PV_pool_qc",
    "uncertflag_qc",
    "delete_qc",
    "PV_heater_mat",
    "PV_heater_mat_qc"
]

# Count exact matches for value == 1
flag_counts = df[cols].apply(lambda col: (col == 1).sum())

# Display result
print("\nCounts where value == 1 for each column:")
print(flag_counts)


df['num_flags_eq_1'] = df[cols].apply(lambda row: (row == 1).sum(), axis=1)

multi_flag_rows = df[df['num_flags_eq_1'] >= 2]

print(f"\n Rows with two or more columns having value : {len(multi_flag_rows)}")


Total rows: 5263

Counts where value == 1 for each column:
PV_normal_qc        2033
PV_heater_qc        1422
PV_pool_qc          1146
uncertflag_qc         13
delete_qc            101
PV_heater_mat        143
PV_heater_mat_qc     421
dtype: int64

 Rows with two or more columns having value : 16


**Combining PV_heater + PV_mat**

In [5]:
import geopandas as gpd

gdf = gpd.read_file('/home/il72/cape_town_year_of_installation/YOLO_CapeTown_5K_3categories(July_10)/Final_annotations_July10.gpkg')
gdf["PV_normal"] = gdf["PV_normal_qc"]
gdf["PV_pool"] = gdf["PV_pool_qc"]
gdf["PV_heater"] = gdf["PV_heater_qc"]
gdf["uncertflag"] = gdf["uncertflag_qc"]

gdf.columns

# Combine PV_heater_mat_qc and PV_heater_mat
gdf["PV_heater"] = gdf["PV_heater"].fillna(0).astype(int)
gdf["PV_heater_mat_combined"] = (
    gdf.get("PV_heater_mat_qc", 0).fillna(0).astype(int) |
    gdf.get("PV_heater_mat", 0).fillna(0).astype(int)
)

# Final PV_heater union includes both PV_heater_qc and the combined mat flags
gdf["PV_heater"] = (
    gdf["PV_heater"] | gdf["PV_heater_mat_combined"]
)

# Keep rows where at least one PV-related flag is 1
pv_mask = (
    (gdf['PV_normal'] == 1) |
    (gdf['PV_heater'] == 1) |
    (gdf['PV_pool'] == 1) )

# Exclude rows where delete_qc == 1
delete_mask = gdf['delete_qc'] != 1

# Apply filters
filtered_gdf = gdf[pv_mask & delete_mask].copy()

# Drop irrelevant columns if needed (optional, keep only analysis columns)
# filtered_gdf = filtered_gdf[['PV_normal', 'PV_heater', 'PV_pool', 'PV_heater_mat']]  # optional

# Save the filtered data
output_path = "final_annotations_PV_all_types_5K.gpkg"
filtered_gdf.to_file(output_path, driver="GPKG")

# Output stats
print(f"Filtered dataset contains {len(filtered_gdf)} PV-related arrays")
print("Remaining columns:", filtered_gdf.columns.tolist())


Filtered dataset contains 5162 PV-related arrays
Remaining columns: ['id', 'PV_normal', 'PV_heater', 'PV_pool', 'uncertflag', 'area', 'annotator', 'centroid_latitude', 'centroid_longitude', 'image_name', 'nw_corner_of_image_latitude', 'nw_corner_of_image_longitude', 'se_corner_of_image_latitude', 'se_corner_of_image_longitude', 'PV_normal_qc', 'PV_heater_qc', 'PV_pool_qc', 'uncertflag_qc', 'delete_qc', 'resizing_qc', 'PV_heater_mat', 'original_label', 'qc_label', 'source_file', 'PV_heater_mat_qc', 'geometry', 'PV_heater_mat_combined']


In [6]:
# Geometry error checker
from shapely.geometry import Polygon

gdf = gpd.read_file("final_annotations_PV_all_types_5K.gpkg")

# Invalid geometry
invalid_gdf = gdf[~gdf.geometry.is_valid].copy()
invalid_gdf["image_name"] = invalid_gdf["image_name"]
print(f"Invalid geometries found: {len(invalid_gdf)}")

# Error correction : buffer(0) method 
gdf["geometry"] = gdf["geometry"].apply(lambda geom: geom.buffer(0) if not geom.is_valid else geom)


still_invalid = gdf[~gdf.geometry.is_valid].copy()
print(f"Still invalid after fixing: {len(still_invalid)}")

gdf_valid = gdf[gdf.geometry.is_valid].copy()
gdf_valid.to_file("final_annotations_PV_all_types_5K_cleaned.gpkg", driver="GPKG")

Invalid geometries found: 8
Still invalid after fixing: 0


**Converging PV_heater_mat with PV_heater = PV_heater_mat -> PV_heater**

In [7]:
# Load the GeoPackage
gdf_path = "final_annotations_PV_all_types_5K_cleaned.gpkg"
gdf = gpd.read_file(gdf_path)

# Count occurrences of each condition
counts = {
    "PV_normal": (gdf['PV_normal'] == 1).sum(),
    "PV_heater": (gdf['PV_heater'] == 1).sum(),
    "PV_pool": (gdf['PV_pool'] == 1).sum(),
}

import pandas as pd
df_counts = pd.DataFrame(list(counts.items()), columns=["Category", "Count"])
print(df_counts)

    Category  Count
0  PV_normal   2033
1  PV_heater   1983
2    PV_pool   1146


**Save GPKG file**

**Code to crop tiles and annotations**
Dataset Split : Training, Validation, Test
- As of July. 6, we only have 
    PV_normal: 663.0, PV_heater: 711.0, PV_pool: 623, PV_heater_mat: 140
- [1] Assign class IDs (0/1/2/3) in YOLO labels during tiling
- As of July.15, Padding is applied

In [None]:
import os
import numpy as np
import rasterio
from rasterio.windows import Window
from shapely.geometry import box
import cv2
import geopandas as gpd
from tqdm import tqdm

def sort_points_clockwise(pts):
    center = np.mean(pts, axis=0)
    angles = np.arctan2(pts[:, 1] - center[1], pts[:, 0] - center[0])
    return pts[np.argsort(angles)]

def split_tiff_to_tiles(tif_path, label_gdf, out_img_dir, out_lbl_dir, tile_size=1024):
    os.makedirs(out_img_dir, exist_ok=True)
    os.makedirs(out_lbl_dir, exist_ok=True)

    img_name = os.path.splitext(os.path.basename(tif_path))[0]

    with rasterio.open(tif_path) as src:
        for y in range(0, src.height, tile_size):
            for x in range(0, src.width, tile_size):
                
                read_width = min(tile_size, src.width - x)
                read_height = min(tile_size, src.height - y)
                if read_width <= 0 or read_height <= 0:
                    continue

                window = Window(x, y, read_width, read_height)
                transform = src.window_transform(window)
                tile_img = src.read(window=window)

                # padding
                padded_img = np.zeros((src.count, tile_size, tile_size), dtype=tile_img.dtype)
                padded_img[:, :read_height, :read_width] = tile_img

                tile_name = f"{img_name}_tile_{x}_{y}"
                out_img_path = os.path.join(out_img_dir, f"{tile_name}.tif")
                label_file_path = os.path.join(out_lbl_dir, f"{tile_name}.txt")

                with rasterio.open(out_img_path, 'w',
                    driver='GTiff',
                    height=tile_size, width=tile_size,
                    count=src.count, dtype=padded_img.dtype,
                    crs=src.crs, transform=transform
                ) as dst:
                    dst.write(padded_img)

                tile_bounds = box(*rasterio.windows.bounds(window, src.transform))
                anns = label_gdf[label_gdf.geometry.intersects(tile_bounds)].copy()
                if anns.empty:
                    continue

                anns['geometry'] = anns.geometry.intersection(tile_bounds)
                anns = anns[~anns.geometry.is_empty]

                label_lines = []

                for _, row in anns.iterrows():
                    try:
                        if row['PV_normal'] == 1:
                            class_id = 0
                        elif row['PV_heater'] == 1:
                            class_id = 1
                        elif row['PV_pool'] == 1:
                            class_id = 2
                        else:
                            continue

                        geom = row.geometry
                        if geom.geom_type == "MultiPolygon":
                            geom = max(geom.geoms, key=lambda g: g.area)
                        elif geom.geom_type != "Polygon":
                            continue

                        coords = np.array(list(geom.exterior.coords[:-1]), dtype=np.float32)
                        pixel_coords = np.array([~transform * (lon, lat) for lon, lat in coords], dtype=np.float32)

                        if len(pixel_coords) < 3 or np.any(np.isnan(pixel_coords)) or np.any(np.isinf(pixel_coords)):
                            continue

                        original_area = cv2.contourArea(pixel_coords)
                        if original_area < 1.0:
                            continue

                        rect = cv2.minAreaRect(pixel_coords)
                        box_pts = cv2.boxPoints(rect)
                        box_pts = sort_points_clockwise(box_pts)
                        if np.any(np.isnan(box_pts)) or np.any(np.isinf(box_pts)):
                            continue

                        box_area = cv2.contourArea(box_pts)
                        if box_area < 1.0:
                            continue

                        box_pts[:, 0] /= tile_size
                        box_pts[:, 1] /= tile_size
                        box_pts = np.clip(box_pts, 0, 1)

                        if box_pts.shape != (4, 2):
                            continue

                        coords_str = " ".join([f"{pt[0]:.6f} {pt[1]:.6f}" for pt in box_pts])
                        label_lines.append(f"{class_id} {coords_str}")

                    except Exception as e:
                        print(f"[!] Error in {tile_name}: {e}")
                        continue

                if label_lines:
                    with open(label_file_path, "w") as f:
                        f.write("\n".join(label_lines))
                    print(f"Saved tile: {tile_name}.tif with {len(label_lines)} labels")

def main():
    tif_dir = "/home/il72/cape_town_annotation_checker/1.db_pipeline/download/images"

    try:
        label_gdf = gpd.read_file("final_annotations_PV_all_types_5K_cleaned.gpkg")
        print(f"Loaded {len(label_gdf)} annotations")
    except Exception as e:
        print(f"[!] Error loading annotations: {e}")
        return

    image_names = label_gdf["image_name"].unique()
    print(f"Found {len(image_names)} unique images")

    out_img_dir = "/home/il72/cape_town_year_of_installation/datasets/pv_capetown_after_qc_5K/original_dataset_tiles/images"
    out_lbl_dir = "/home/il72/cape_town_year_of_installation/datasets/pv_capetown_after_qc_5K/original_dataset_tiles/labels"

    os.makedirs(out_img_dir, exist_ok=True)
    os.makedirs(out_lbl_dir, exist_ok=True)

    for img_name in tqdm(image_names, desc="Tiling"):
        tif_path = os.path.join(tif_dir, f"{img_name}.tif")
        if not os.path.exists(tif_path):
            print(f"[!] Missing TIFF: {tif_path}")
            continue

        split_tiff_to_tiles(
            tif_path=tif_path,
            label_gdf=label_gdf[label_gdf["image_name"] == img_name],
            out_img_dir=out_img_dir,
            out_lbl_dir=out_lbl_dir
        )

    print("Finished tiling and label conversion")

if __name__ == "__main__":
    main()


Loaded 5162 annotations
Found 15 unique images


Tiling:   0%|          | 0/15 [00:00<?, ?it/s]

Saved tile: 2023_RGB_8cm_W57B_10_tile_3072_0.tif with 12 labels
Saved tile: 2023_RGB_8cm_W57B_10_tile_4096_0.tif with 5 labels
Saved tile: 2023_RGB_8cm_W57B_10_tile_7168_0.tif with 1 labels
Saved tile: 2023_RGB_8cm_W57B_10_tile_11264_0.tif with 5 labels
Saved tile: 2023_RGB_8cm_W57B_10_tile_12288_0.tif with 3 labels
Saved tile: 2023_RGB_8cm_W57B_10_tile_1024_1024.tif with 2 labels
Saved tile: 2023_RGB_8cm_W57B_10_tile_2048_1024.tif with 6 labels
Saved tile: 2023_RGB_8cm_W57B_10_tile_3072_1024.tif with 7 labels
Saved tile: 2023_RGB_8cm_W57B_10_tile_5120_1024.tif with 1 labels
Saved tile: 2023_RGB_8cm_W57B_10_tile_6144_1024.tif with 1 labels
Saved tile: 2023_RGB_8cm_W57B_10_tile_7168_1024.tif with 8 labels
Saved tile: 2023_RGB_8cm_W57B_10_tile_9216_1024.tif with 4 labels
Saved tile: 2023_RGB_8cm_W57B_10_tile_10240_1024.tif with 5 labels
Saved tile: 2023_RGB_8cm_W57B_10_tile_11264_1024.tif with 9 labels
Saved tile: 2023_RGB_8cm_W57B_10_tile_1024_2048.tif with 4 labels
Saved tile: 2023_RGB

Tiling:   7%|▋         | 1/15 [00:04<00:59,  4.27s/it]

Saved tile: 2023_RGB_8cm_W57B_4_tile_2048_0.tif with 4 labels
Saved tile: 2023_RGB_8cm_W57B_4_tile_4096_0.tif with 2 labels
Saved tile: 2023_RGB_8cm_W57B_4_tile_6144_0.tif with 3 labels
Saved tile: 2023_RGB_8cm_W57B_4_tile_7168_0.tif with 1 labels
Saved tile: 2023_RGB_8cm_W57B_4_tile_8192_0.tif with 4 labels
Saved tile: 2023_RGB_8cm_W57B_4_tile_9216_0.tif with 1 labels
Saved tile: 2023_RGB_8cm_W57B_4_tile_10240_0.tif with 3 labels
Saved tile: 2023_RGB_8cm_W57B_4_tile_2048_1024.tif with 3 labels
Saved tile: 2023_RGB_8cm_W57B_4_tile_3072_1024.tif with 2 labels
Saved tile: 2023_RGB_8cm_W57B_4_tile_4096_1024.tif with 4 labels
Saved tile: 2023_RGB_8cm_W57B_4_tile_5120_1024.tif with 6 labels
Saved tile: 2023_RGB_8cm_W57B_4_tile_6144_1024.tif with 4 labels
Saved tile: 2023_RGB_8cm_W57B_4_tile_7168_1024.tif with 6 labels
Saved tile: 2023_RGB_8cm_W57B_4_tile_8192_1024.tif with 5 labels
Saved tile: 2023_RGB_8cm_W57B_4_tile_9216_1024.tif with 7 labels
Saved tile: 2023_RGB_8cm_W57B_4_tile_10240_10

Tiling:  13%|█▎        | 2/15 [00:08<00:56,  4.31s/it]

Saved tile: 2023_RGB_8cm_W57B_15_tile_1024_0.tif with 2 labels
Saved tile: 2023_RGB_8cm_W57B_15_tile_2048_0.tif with 6 labels
Saved tile: 2023_RGB_8cm_W57B_15_tile_3072_0.tif with 6 labels
Saved tile: 2023_RGB_8cm_W57B_15_tile_4096_0.tif with 3 labels
Saved tile: 2023_RGB_8cm_W57B_15_tile_5120_0.tif with 6 labels
Saved tile: 2023_RGB_8cm_W57B_15_tile_6144_0.tif with 8 labels
Saved tile: 2023_RGB_8cm_W57B_15_tile_7168_0.tif with 5 labels
Saved tile: 2023_RGB_8cm_W57B_15_tile_8192_0.tif with 1 labels
Saved tile: 2023_RGB_8cm_W57B_15_tile_9216_0.tif with 2 labels
Saved tile: 2023_RGB_8cm_W57B_15_tile_10240_0.tif with 1 labels
Saved tile: 2023_RGB_8cm_W57B_15_tile_0_1024.tif with 10 labels
Saved tile: 2023_RGB_8cm_W57B_15_tile_1024_1024.tif with 1 labels
Saved tile: 2023_RGB_8cm_W57B_15_tile_2048_1024.tif with 1 labels
Saved tile: 2023_RGB_8cm_W57B_15_tile_3072_1024.tif with 2 labels
Saved tile: 2023_RGB_8cm_W57B_15_tile_4096_1024.tif with 6 labels
Saved tile: 2023_RGB_8cm_W57B_15_tile_512

Tiling:  20%|██        | 3/15 [00:26<02:06, 10.54s/it]

Saved tile: 2023_RGB_8cm_W57B_15_tile_5120_11264.tif with 3 labels
Saved tile: 2023_RGB_8cm_W57B_15_tile_6144_11264.tif with 3 labels
Saved tile: 2023_RGB_8cm_W57B_15_tile_7168_11264.tif with 5 labels
Saved tile: 2023_RGB_8cm_W57B_15_tile_8192_11264.tif with 7 labels
Saved tile: 2023_RGB_8cm_W57B_15_tile_9216_11264.tif with 1 labels
Saved tile: 2023_RGB_8cm_W53B_24_tile_2048_0.tif with 1 labels
Saved tile: 2023_RGB_8cm_W53B_24_tile_10240_0.tif with 1 labels
Saved tile: 2023_RGB_8cm_W53B_24_tile_11264_0.tif with 3 labels
Saved tile: 2023_RGB_8cm_W53B_24_tile_0_1024.tif with 6 labels
Saved tile: 2023_RGB_8cm_W53B_24_tile_1024_1024.tif with 9 labels
Saved tile: 2023_RGB_8cm_W53B_24_tile_4096_1024.tif with 3 labels
Saved tile: 2023_RGB_8cm_W53B_24_tile_5120_1024.tif with 2 labels
Saved tile: 2023_RGB_8cm_W53B_24_tile_6144_1024.tif with 1 labels
Saved tile: 2023_RGB_8cm_W53B_24_tile_7168_1024.tif with 2 labels
Saved tile: 2023_RGB_8cm_W53B_24_tile_9216_1024.tif with 3 labels
Saved tile: 202

Tiling:  27%|██▋       | 4/15 [00:31<01:30,  8.23s/it]

Saved tile: 2023_RGB_8cm_W53B_24_tile_5120_11264.tif with 1 labels
Saved tile: 2023_RGB_8cm_W53B_24_tile_6144_11264.tif with 2 labels
Saved tile: 2023_RGB_8cm_W53B_24_tile_7168_11264.tif with 5 labels
Saved tile: 2023_RGB_8cm_W53B_24_tile_8192_11264.tif with 16 labels
Saved tile: 2023_RGB_8cm_W53B_24_tile_9216_11264.tif with 9 labels
Saved tile: 2023_RGB_8cm_W53B_24_tile_11264_11264.tif with 6 labels
Saved tile: 2023_RGB_8cm_W53B_24_tile_12288_11264.tif with 1 labels
Saved tile: 2023_RGB_8cm_W53B_24_tile_6144_12288.tif with 3 labels
Saved tile: 2023_RGB_8cm_W53B_24_tile_7168_12288.tif with 2 labels
Saved tile: 2023_RGB_8cm_W53B_24_tile_8192_12288.tif with 2 labels
Saved tile: 2023_RGB_8cm_W53B_24_tile_9216_12288.tif with 2 labels
Saved tile: 2023_RGB_8cm_W36B_15_tile_4096_1024.tif with 1 labels
Saved tile: 2023_RGB_8cm_W36B_15_tile_5120_1024.tif with 1 labels
Saved tile: 2023_RGB_8cm_W36B_15_tile_4096_2048.tif with 9 labels
Saved tile: 2023_RGB_8cm_W36B_15_tile_5120_2048.tif with 6 lab

Tiling:  33%|███▎      | 5/15 [00:35<01:09,  6.93s/it]

Saved tile: 2023_RGB_8cm_W36B_15_tile_2048_11264.tif with 5 labels
Saved tile: 2023_RGB_8cm_W36B_15_tile_3072_11264.tif with 2 labels
Saved tile: 2023_RGB_8cm_W36B_15_tile_4096_11264.tif with 9 labels
Saved tile: 2023_RGB_8cm_W36B_15_tile_5120_11264.tif with 12 labels
Saved tile: 2023_RGB_8cm_W36B_15_tile_6144_11264.tif with 26 labels
Saved tile: 2023_RGB_8cm_W36B_15_tile_7168_11264.tif with 4 labels
Saved tile: 2023_RGB_8cm_W36B_15_tile_3072_12288.tif with 1 labels
Saved tile: 2023_RGB_8cm_W36B_15_tile_4096_12288.tif with 4 labels
Saved tile: 2023_RGB_8cm_W36B_15_tile_5120_12288.tif with 1 labels
Saved tile: 2023_RGB_8cm_W36B_15_tile_6144_12288.tif with 2 labels
Saved tile: 2023_RGB_8cm_W36B_15_tile_7168_12288.tif with 2 labels
Saved tile: 2023_RGB_8cm_W16C_20_tile_1024_1024.tif with 2 labels
Saved tile: 2023_RGB_8cm_W16C_20_tile_2048_4096.tif with 3 labels
Saved tile: 2023_RGB_8cm_W16C_20_tile_3072_4096.tif with 8 labels
Saved tile: 2023_RGB_8cm_W16C_20_tile_4096_4096.tif with 8 labe

Tiling:  40%|████      | 6/15 [01:26<03:16, 21.88s/it]

Saved tile: 2023_RGB_8cm_W16C_20_tile_4096_12288.tif with 2 labels
Saved tile: 2023_RGB_8cm_W16C_20_tile_7168_12288.tif with 2 labels
Saved tile: 2023_RGB_8cm_W16C_20_tile_8192_12288.tif with 1 labels
Saved tile: 2023_RGB_8cm_W16C_20_tile_9216_12288.tif with 4 labels
Saved tile: 2023_RGB_8cm_W16C_20_tile_12288_12288.tif with 1 labels
Saved tile: 2023_RGB_8cm_W57B_13_tile_9216_0.tif with 1 labels
Saved tile: 2023_RGB_8cm_W57B_13_tile_0_1024.tif with 1 labels
Saved tile: 2023_RGB_8cm_W57B_13_tile_8192_1024.tif with 1 labels
Saved tile: 2023_RGB_8cm_W57B_13_tile_10240_1024.tif with 1 labels
Saved tile: 2023_RGB_8cm_W57B_13_tile_1024_2048.tif with 2 labels
Saved tile: 2023_RGB_8cm_W57B_13_tile_10240_2048.tif with 1 labels
Saved tile: 2023_RGB_8cm_W57B_13_tile_1024_3072.tif with 1 labels
Saved tile: 2023_RGB_8cm_W57B_13_tile_3072_3072.tif with 18 labels
Saved tile: 2023_RGB_8cm_W57B_13_tile_8192_3072.tif with 20 labels
Saved tile: 2023_RGB_8cm_W57B_13_tile_11264_3072.tif with 2 labels
Saved

Tiling:  47%|████▋     | 7/15 [01:45<02:46, 20.76s/it]

Saved tile: 2023_RGB_8cm_W57B_13_tile_1024_11264.tif with 1 labels
Saved tile: 2023_RGB_8cm_W57B_13_tile_5120_11264.tif with 6 labels
Saved tile: 2023_RGB_8cm_W57B_13_tile_6144_11264.tif with 7 labels
Saved tile: 2023_RGB_8cm_W57B_13_tile_7168_11264.tif with 7 labels
Saved tile: 2023_RGB_8cm_W57B_13_tile_8192_11264.tif with 4 labels
Saved tile: 2023_RGB_8cm_W57B_13_tile_9216_11264.tif with 3 labels
Saved tile: 2023_RGB_8cm_W57B_13_tile_10240_11264.tif with 3 labels
Saved tile: 2023_RGB_8cm_W57B_13_tile_6144_12288.tif with 2 labels
Saved tile: 2023_RGB_8cm_W57B_13_tile_7168_12288.tif with 3 labels
Saved tile: 2023_RGB_8cm_W57B_13_tile_8192_12288.tif with 2 labels
Saved tile: 2023_RGB_8cm_W57B_13_tile_9216_12288.tif with 2 labels
Saved tile: 2023_RGB_8cm_W53B_9_tile_4096_0.tif with 1 labels
Saved tile: 2023_RGB_8cm_W53B_9_tile_4096_1024.tif with 4 labels
Saved tile: 2023_RGB_8cm_W53B_9_tile_5120_1024.tif with 1 labels
Saved tile: 2023_RGB_8cm_W53B_9_tile_4096_2048.tif with 9 labels
Saved

Tiling:  53%|█████▎    | 8/15 [01:50<01:49, 15.68s/it]

Saved tile: 2023_RGB_8cm_W53B_9_tile_10240_11264.tif with 2 labels
Saved tile: 2023_RGB_8cm_W53B_9_tile_10240_12288.tif with 4 labels
Saved tile: 2023_RGB_8cm_W25A_9_tile_1024_0.tif with 11 labels
Saved tile: 2023_RGB_8cm_W25A_9_tile_2048_0.tif with 5 labels
Saved tile: 2023_RGB_8cm_W25A_9_tile_4096_0.tif with 1 labels
Saved tile: 2023_RGB_8cm_W25A_9_tile_7168_0.tif with 2 labels
Saved tile: 2023_RGB_8cm_W25A_9_tile_8192_0.tif with 1 labels
Saved tile: 2023_RGB_8cm_W25A_9_tile_9216_0.tif with 4 labels
Saved tile: 2023_RGB_8cm_W25A_9_tile_10240_0.tif with 1 labels
Saved tile: 2023_RGB_8cm_W25A_9_tile_11264_0.tif with 3 labels
Saved tile: 2023_RGB_8cm_W25A_9_tile_0_1024.tif with 3 labels
Saved tile: 2023_RGB_8cm_W25A_9_tile_1024_1024.tif with 9 labels
Saved tile: 2023_RGB_8cm_W25A_9_tile_2048_1024.tif with 3 labels
Saved tile: 2023_RGB_8cm_W25A_9_tile_10240_1024.tif with 3 labels
Saved tile: 2023_RGB_8cm_W25A_9_tile_0_2048.tif with 9 labels
Saved tile: 2023_RGB_8cm_W25A_9_tile_1024_2048.

Tiling:  60%|██████    | 9/15 [01:53<01:11, 11.91s/it]

Saved tile: 2023_RGB_8cm_W25A_9_tile_0_11264.tif with 1 labels
Saved tile: 2023_RGB_8cm_W25A_9_tile_2048_11264.tif with 1 labels
Saved tile: 2023_RGB_8cm_W25A_9_tile_4096_11264.tif with 1 labels
Saved tile: 2023_RGB_8cm_W25A_9_tile_1024_12288.tif with 1 labels
Saved tile: 2023_RGB_8cm_W25A_9_tile_2048_12288.tif with 1 labels
Saved tile: 2023_RGB_8cm_W17B_3_tile_0_0.tif with 1 labels
Saved tile: 2023_RGB_8cm_W17B_3_tile_0_1024.tif with 7 labels
Saved tile: 2023_RGB_8cm_W17B_3_tile_2048_1024.tif with 3 labels
Saved tile: 2023_RGB_8cm_W17B_3_tile_3072_1024.tif with 4 labels
Saved tile: 2023_RGB_8cm_W17B_3_tile_0_2048.tif with 4 labels
Saved tile: 2023_RGB_8cm_W17B_3_tile_1024_2048.tif with 1 labels
Saved tile: 2023_RGB_8cm_W17B_3_tile_2048_2048.tif with 8 labels
Saved tile: 2023_RGB_8cm_W17B_3_tile_3072_2048.tif with 8 labels
Saved tile: 2023_RGB_8cm_W17B_3_tile_6144_2048.tif with 4 labels
Saved tile: 2023_RGB_8cm_W17B_3_tile_7168_2048.tif with 3 labels
Saved tile: 2023_RGB_8cm_W17B_3_til

Tiling:  67%|██████▋   | 10/15 [01:59<00:50, 10.08s/it]

Saved tile: 2023_RGB_8cm_W17B_3_tile_1024_11264.tif with 4 labels
Saved tile: 2023_RGB_8cm_W17B_3_tile_2048_11264.tif with 8 labels
Saved tile: 2023_RGB_8cm_W17B_3_tile_3072_11264.tif with 11 labels
Saved tile: 2023_RGB_8cm_W17B_3_tile_4096_11264.tif with 6 labels
Saved tile: 2023_RGB_8cm_W17B_3_tile_5120_11264.tif with 2 labels
Saved tile: 2023_RGB_8cm_W17B_3_tile_6144_11264.tif with 3 labels
Saved tile: 2023_RGB_8cm_W17B_3_tile_7168_11264.tif with 2 labels
Saved tile: 2023_RGB_8cm_W17B_3_tile_8192_11264.tif with 1 labels
Saved tile: 2023_RGB_8cm_W17B_3_tile_2048_12288.tif with 3 labels
Saved tile: 2023_RGB_8cm_W17B_3_tile_3072_12288.tif with 1 labels
Saved tile: 2023_RGB_8cm_W36B_20_tile_3072_0.tif with 3 labels
Saved tile: 2023_RGB_8cm_W36B_20_tile_4096_0.tif with 6 labels
Saved tile: 2023_RGB_8cm_W36B_20_tile_5120_0.tif with 10 labels
Saved tile: 2023_RGB_8cm_W36B_20_tile_6144_0.tif with 13 labels
Saved tile: 2023_RGB_8cm_W36B_20_tile_7168_0.tif with 8 labels
Saved tile: 2023_RGB_8

Tiling:  73%|███████▎  | 11/15 [02:06<00:36,  9.22s/it]

Saved tile: 2023_RGB_8cm_W36B_20_tile_8192_11264.tif with 1 labels
Saved tile: 2023_RGB_8cm_W17B_11_tile_2048_0.tif with 5 labels
Saved tile: 2023_RGB_8cm_W17B_11_tile_3072_0.tif with 14 labels
Saved tile: 2023_RGB_8cm_W17B_11_tile_4096_0.tif with 4 labels
Saved tile: 2023_RGB_8cm_W17B_11_tile_1024_1024.tif with 4 labels
Saved tile: 2023_RGB_8cm_W17B_11_tile_2048_1024.tif with 10 labels
Saved tile: 2023_RGB_8cm_W17B_11_tile_3072_1024.tif with 11 labels
Saved tile: 2023_RGB_8cm_W17B_11_tile_4096_1024.tif with 7 labels
Saved tile: 2023_RGB_8cm_W17B_11_tile_6144_1024.tif with 6 labels
Saved tile: 2023_RGB_8cm_W17B_11_tile_11264_1024.tif with 3 labels
Saved tile: 2023_RGB_8cm_W17B_11_tile_0_2048.tif with 18 labels
Saved tile: 2023_RGB_8cm_W17B_11_tile_1024_2048.tif with 14 labels
Saved tile: 2023_RGB_8cm_W17B_11_tile_5120_2048.tif with 2 labels
Saved tile: 2023_RGB_8cm_W17B_11_tile_6144_2048.tif with 1 labels
Saved tile: 2023_RGB_8cm_W17B_11_tile_10240_2048.tif with 9 labels
Saved tile: 20

Tiling:  80%|████████  | 12/15 [02:13<00:25,  8.53s/it]

Saved tile: 2023_RGB_8cm_W17B_11_tile_3072_11264.tif with 5 labels
Saved tile: 2023_RGB_8cm_W17B_11_tile_4096_11264.tif with 3 labels
Saved tile: 2023_RGB_8cm_W17B_11_tile_5120_11264.tif with 1 labels
Saved tile: 2023_RGB_8cm_W17B_11_tile_6144_11264.tif with 19 labels
Saved tile: 2023_RGB_8cm_W17B_11_tile_10240_11264.tif with 18 labels
Saved tile: 2023_RGB_8cm_W17B_11_tile_11264_12288.tif with 1 labels
Saved tile: 2023_RGB_8cm_W57C_5_tile_0_0.tif with 2 labels
Saved tile: 2023_RGB_8cm_W57C_5_tile_0_1024.tif with 3 labels
Saved tile: 2023_RGB_8cm_W57C_5_tile_0_2048.tif with 3 labels
Saved tile: 2023_RGB_8cm_W57C_5_tile_1024_2048.tif with 2 labels
Saved tile: 2023_RGB_8cm_W57C_5_tile_2048_2048.tif with 9 labels
Saved tile: 2023_RGB_8cm_W57C_5_tile_0_3072.tif with 2 labels
Saved tile: 2023_RGB_8cm_W57C_5_tile_1024_3072.tif with 10 labels
Saved tile: 2023_RGB_8cm_W57C_5_tile_2048_3072.tif with 2 labels
Saved tile: 2023_RGB_8cm_W57C_5_tile_3072_3072.tif with 6 labels
Saved tile: 2023_RGB_8c

Tiling:  87%|████████▋ | 13/15 [02:17<00:14,  7.03s/it]

Saved tile: 2023_RGB_8cm_W17B_6_tile_1024_0.tif with 6 labels
Saved tile: 2023_RGB_8cm_W17B_6_tile_2048_0.tif with 1 labels
Saved tile: 2023_RGB_8cm_W17B_6_tile_4096_0.tif with 2 labels
Saved tile: 2023_RGB_8cm_W17B_6_tile_5120_0.tif with 8 labels
Saved tile: 2023_RGB_8cm_W17B_6_tile_6144_0.tif with 6 labels
Saved tile: 2023_RGB_8cm_W17B_6_tile_7168_0.tif with 1 labels
Saved tile: 2023_RGB_8cm_W17B_6_tile_8192_0.tif with 6 labels
Saved tile: 2023_RGB_8cm_W17B_6_tile_9216_0.tif with 2 labels
Saved tile: 2023_RGB_8cm_W17B_6_tile_10240_0.tif with 9 labels
Saved tile: 2023_RGB_8cm_W17B_6_tile_11264_0.tif with 2 labels
Saved tile: 2023_RGB_8cm_W17B_6_tile_0_1024.tif with 7 labels
Saved tile: 2023_RGB_8cm_W17B_6_tile_3072_1024.tif with 1 labels
Saved tile: 2023_RGB_8cm_W17B_6_tile_4096_1024.tif with 1 labels
Saved tile: 2023_RGB_8cm_W17B_6_tile_5120_1024.tif with 9 labels
Saved tile: 2023_RGB_8cm_W17B_6_tile_6144_1024.tif with 10 labels
Saved tile: 2023_RGB_8cm_W17B_6_tile_7168_1024.tif with

Tiling:  93%|█████████▎| 14/15 [02:58<00:17, 17.33s/it]

Saved tile: 2023_RGB_8cm_W17B_6_tile_1024_12288.tif with 1 labels
Saved tile: 2023_RGB_8cm_W17B_6_tile_2048_12288.tif with 1 labels
Saved tile: 2023_RGB_8cm_W17B_6_tile_5120_12288.tif with 1 labels
Saved tile: 2023_RGB_8cm_W17B_7_tile_0_0.tif with 8 labels
Saved tile: 2023_RGB_8cm_W17B_7_tile_1024_0.tif with 5 labels
Saved tile: 2023_RGB_8cm_W17B_7_tile_2048_0.tif with 1 labels
Saved tile: 2023_RGB_8cm_W17B_7_tile_3072_0.tif with 2 labels
Saved tile: 2023_RGB_8cm_W17B_7_tile_5120_0.tif with 2 labels
Saved tile: 2023_RGB_8cm_W17B_7_tile_6144_0.tif with 1 labels
Saved tile: 2023_RGB_8cm_W17B_7_tile_7168_0.tif with 3 labels
Saved tile: 2023_RGB_8cm_W17B_7_tile_10240_0.tif with 1 labels
Saved tile: 2023_RGB_8cm_W17B_7_tile_1024_1024.tif with 10 labels
Saved tile: 2023_RGB_8cm_W17B_7_tile_2048_1024.tif with 3 labels
Saved tile: 2023_RGB_8cm_W17B_7_tile_6144_1024.tif with 3 labels
Saved tile: 2023_RGB_8cm_W17B_7_tile_7168_1024.tif with 2 labels
Saved tile: 2023_RGB_8cm_W17B_7_tile_2048_2048.

Tiling: 100%|██████████| 15/15 [03:38<00:00, 14.59s/it]

Saved tile: 2023_RGB_8cm_W17B_7_tile_0_12288.tif with 1 labels
Saved tile: 2023_RGB_8cm_W17B_7_tile_2048_12288.tif with 1 labels
Saved tile: 2023_RGB_8cm_W17B_7_tile_3072_12288.tif with 1 labels
Saved tile: 2023_RGB_8cm_W17B_7_tile_4096_12288.tif with 1 labels
Saved tile: 2023_RGB_8cm_W17B_7_tile_5120_12288.tif with 1 labels
Saved tile: 2023_RGB_8cm_W17B_7_tile_8192_12288.tif with 1 labels
Saved tile: 2023_RGB_8cm_W17B_7_tile_9216_12288.tif with 2 labels
Finished tiling and label conversion





For stratified sampling\
Option1. Dominant Class method\
[2] Extract dominant class per tile from label files
* This method is quite simple. Therefore, I choose the second option

In [None]:
# import os
# from collections import Counter
# import pandas as pd
# from collections import Counter

# # Paths
# base_dir = "/home/il72/cape_town_year_of_installation/datasets/pv_capetown_after_qc_5K/original_dataset_tiles"
# label_dir = os.path.join(base_dir, "labels")
# image_dir = os.path.join(base_dir, "images")

# tile_dominants = []

# for fname in os.listdir(label_dir):
#     if not fname.endswith(".txt"):
#         continue

#     tile_id = fname[:-4]  # remove .txt
#     label_path = os.path.join(label_dir, fname)
#     image_path = os.path.join(image_dir, f"{tile_id}.tif")

#     if not os.path.exists(image_path):
#         continue  # skip if image does not exist

#     with open(label_path, "r") as f:
#         lines = f.readlines()

#     class_ids = [int(line.strip().split()[0]) for line in lines if line.strip()]
#     if not class_ids:
#         continue  # skip empty label file

#     dominant = Counter(class_ids).most_common(1)[0][0]
#     tile_dominants.append({
#         "tile_id": tile_id,
#         "dominant_class": dominant
#     })

# # Save to CSV
# df = pd.DataFrame(tile_dominants)
# df.to_csv("tile_dominant_classes.csv", index=False)
# print(f"Saved tile dominant class info for {len(df)} labeled tiles (with matching images).")

# # Print class distribution
# class_counts = df["dominant_class"].value_counts().sort_index()
# print("\nTile count per class:")
# for cls, count in class_counts.items():
#     print(f"  Class {cls}: {count} tiles")

# ## Just Checker

# label_root = "/home/il72/cape_town_year_of_installation/datasets/pv_capetown_after_qc_5K/original_dataset_tiles/labels"
# class_counts = Counter()

# for fname in os.listdir(label_root):
#     if not fname.endswith(".txt"):
#         continue
#     with open(os.path.join(label_root, fname), "r") as f:
#         for line in f:
#             parts = line.strip().split()
#             if parts:
#                 class_id = int(parts[0])
#                 class_counts[class_id] += 1

# for cls_id in sorted(class_counts):
#     print(f"Class {cls_id}: {class_counts[cls_id]} objects")


Saved tile dominant class info for 1216 labeled tiles (with matching images).

Tile count per class:
  Class 0: 484 tiles
  Class 1: 401 tiles
  Class 2: 331 tiles
Class 0: 2276 objects
Class 1: 2074 objects
Class 2: 1283 objects


For stratified sampling\
Option2. Multi-label per tile - Stratify by treating the full set of classes per tile as a single unit (i.e., a class combination). This ensures the split preserves the distribution of tiles with specific class combos, instead of simplifying to a dominant class.

- store each tile’s ID and its corresponding class combination to define class distribution across tiles(=pathces)

[2] Extract dominant class per tile from label files

In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

label_dir = "/home/il72/cape_town_year_of_installation/datasets/pv_capetown_after_qc_5K/original_dataset_tiles/labels"

tile_class_combos = []

for fname in os.listdir(label_dir):
    if not fname.endswith(".txt"):
        continue

    tile_id = fname[:-4]
    label_path = os.path.join(label_dir, fname)

    with open(label_path, "r") as f:
        lines = f.readlines()

    class_ids = sorted(set(int(line.strip().split()[0]) for line in lines if line.strip()))
    if class_ids:
        combo = "_".join(map(str, class_ids))
        tile_class_combos.append({"tile_id": tile_id, "class_combo": combo})

df = pd.DataFrame(tile_class_combos)

# Stratified split by class combination
train_ids, val_ids = train_test_split(
    df["tile_id"],
    test_size=0.2,
    stratify=df["class_combo"],
    random_state=42
)

print(f"Train: {len(train_ids)}, Val: {len(val_ids)}")

# Optional: Save split lists
df[df["tile_id"].isin(train_ids)].to_csv("train_tiles.csv", index=False)
df[df["tile_id"].isin(val_ids)].to_csv("val_tiles.csv", index=False)

combo_counts = df["class_combo"].value_counts()
print("\nUnique class_combo types and their frequencies:")
print(combo_counts)


Train: 972, Val: 244

Unique class_combo types and their frequencies:
class_combo
0_1_2    236
1        230
0        194
0_1      152
1_2      144
2        144
0_2      116
Name: count, dtype: int64


In [None]:
# Class Distribution Checker
import os
from collections import Counter

def get_class_distribution(tile_ids, label_dir):
    counter = Counter()
    for tile_id in tile_ids:
        path = os.path.join(label_dir, tile_id + ".txt")
        if not os.path.exists(path):
            continue
        with open(path, "r") as f:
            for line in f:
                cls = int(line.strip().split()[0])
                counter[cls] += 1
    return counter

train_dist = get_class_distribution(train_ids, label_dir)
val_dist = get_class_distribution(val_ids, label_dir)

print(" Train Class Distribution:", dict(train_dist))
print(" Val Class Distribution:", dict(val_dist))


 Train Class Distribution: {0: 1833, 2: 1037, 1: 1701}
 Val Class Distribution: {1: 373, 0: 443, 2: 246}


[3] Stratified train/val/test split
- Corresponding to option1. dominant class, but we don't use this approach.

In [None]:
# import pandas as pd
# from sklearn.utils import shuffle

# df = pd.read_csv("tile_dominant_classes.csv")

# train_ratio = 0.6
# val_ratio = 0.2
# test_ratio = 0.2

# train_tiles, val_tiles, test_tiles = [], [], []

# for class_id, group in df.groupby("dominant_class"):
#     group = shuffle(group, random_state=42)
#     n = len(group)

#     if n < 3:
#         print(f"Class {class_id}: only {n} tile(s), assigning all to train")
#         train_tiles.extend(group["tile_id"])
#         continue

#     n_train = int(train_ratio * n)
#     n_val = int(val_ratio * n)
#     n_test = n - n_train - n_val

#     train_tiles.extend(group.iloc[:n_train]["tile_id"])
#     val_tiles.extend(group.iloc[n_train:n_train + n_val]["tile_id"])
#     test_tiles.extend(group.iloc[n_train + n_val:]["tile_id"])


[3] Stratified train/val/test split
- Corresponding to option2. tile_class_combos
- the number of patches = Train: 727, Val: 240, Test: 249
- class_combo<br>
    * 0_1_2 :    236<br>
    * 1     :   230<br>
    * 0     :   194<br>
    * 0_1   :   152<br>
    * 1_2   :   144<br>
    * 2     :   144<br>
    * 0_2   :   116<br>

In [7]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

label_dir = "/home/il72/cape_town_year_of_installation/datasets/pv_capetown_after_qc_5K/original_dataset_tiles/labels"
train_ratio = 0.6
val_ratio = 0.2
test_ratio = 0.2

tile_class_combos = []

for fname in os.listdir(label_dir):
    if not fname.endswith(".txt"):
        continue

    tile_id = fname[:-4]
    label_path = os.path.join(label_dir, fname)

    with open(label_path, "r") as f:
        lines = f.readlines()

    class_ids = sorted(set(int(line.strip().split()[0]) for line in lines if line.strip()))
    if class_ids:
        combo = "_".join(map(str, class_ids)) 
        tile_class_combos.append({"tile_id": tile_id, "class_combo": combo})

df = pd.DataFrame(tile_class_combos)

train_tiles, val_tiles, test_tiles = [], [], []

for combo, group in df.groupby("class_combo"):
    group = shuffle(group, random_state=42)
    n = len(group)

    if n < 3:
        print(f" Class combo '{combo}' has only {n} tile(s). Assigning all to train.")
        train_tiles.extend(group["tile_id"])
        continue

    n_train = int(train_ratio * n)
    n_val = int(val_ratio * n)
    n_test = n - n_train - n_val

    train_tiles.extend(group.iloc[:n_train]["tile_id"])
    val_tiles.extend(group.iloc[n_train:n_train + n_val]["tile_id"])
    test_tiles.extend(group.iloc[n_train + n_val:]["tile_id"])

print(f"\n Train: {len(train_tiles)}, Val: {len(val_tiles)}, Test: {len(test_tiles)}")


 Train: 727, Val: 240, Test: 249


[4] Imagery and label files into split folders

In [8]:
import os
import shutil

# Absolute base directory
base_dir = "/home/il72/cape_town_year_of_installation/datasets/pv_capetown_after_qc_5K"

# Input tile source directories
image_src_dir = os.path.join(base_dir, "original_dataset_tiles", "images")
label_src_dir = os.path.join(base_dir, "original_dataset_tiles", "labels")

# Train/val/test split dicts — must be predefined
splits = {
    "train": train_tiles,
    "val": val_tiles,
    "test": test_tiles
}

for split_name, tile_list in splits.items():
    img_out_dir = os.path.join(base_dir, "images", split_name)
    lbl_out_dir = os.path.join(base_dir, "labels", split_name)
    os.makedirs(img_out_dir, exist_ok=True)
    os.makedirs(lbl_out_dir, exist_ok=True)

    copied = 0
    for tile_id in tile_list:
        img_src = os.path.join(image_src_dir, f"{tile_id}.tif")
        lbl_src = os.path.join(label_src_dir, f"{tile_id}.txt")
        
        if os.path.exists(img_src) and os.path.exists(lbl_src):
            shutil.copy(img_src, os.path.join(img_out_dir, f"{tile_id}.tif"))
            shutil.copy(lbl_src, os.path.join(lbl_out_dir, f"{tile_id}.txt"))
            copied += 1
        else:
            print(f"Missing: {tile_id}")

    print(f"{split_name.upper()}: Copied {copied} labeled tiles")

TRAIN: Copied 727 labeled tiles
VAL: Copied 240 labeled tiles
TEST: Copied 249 labeled tiles


Check the number of data in Train/Val/Test

In [9]:
import os
from collections import Counter
import pandas as pd

# Define base path
base_dir = "/home/il72/cape_town_year_of_installation/datasets/pv_capetown_after_qc_5K"
splits = ["train", "val", "test"]
class_distribution = {}

for split in splits:
    label_dir = os.path.join(base_dir, "labels", split)
    class_counter = Counter()

    for fname in os.listdir(label_dir):
        if not fname.endswith(".txt"):
            continue

        with open(os.path.join(label_dir, fname), "r") as f:
            lines = f.readlines()
            for line in lines:
                parts = line.strip().split()
                if parts:
                    class_id = int(parts[0])
                    class_counter[class_id] += 1

    class_distribution[split] = dict(class_counter)

df_dist = pd.DataFrame(class_distribution).fillna(0).astype(int)
print(df_dist)

   train  val  test
0   1329  460   487
1   1298  393   383
2    750  267   266
