Make sure to import the files only after the annotation check has been completed!

In [6]:
# Data Sanity Check

import geopandas as gpd

# Read the post-QC prediction layer a single time; every check below reuses it.
df = gpd.read_file("/shared/data/climateplus2025/Prediction_for_poster_July21/0.Image_files_selection/balanced_3_predictions_July21_after_qc.gpkg")
print("Total rows:", len(df))

# QC flag columns inspected by this cell.
cols = [
    "PV_normal_qc",
    "PV_heater_qc",
    "PV_pool_qc",
    "uncertflag_qc",
    "delete_qc",
    # "PV_heater_mat" is currently disabled; only its *_qc variant is checked.
    "PV_heater_mat_qc",
]

# Per-column tally: how many rows carry exactly the value 1 in each flag.
flag_counts = df[cols].eq(1).sum()

# Show the tally.
print("\nCounts where value == 1 for each column:")
print(flag_counts)

# Per-row tally: how many of the inspected flags equal 1 on that row.
df['num_flags_eq_1'] = df[cols].eq(1).sum(axis=1)

# Rows on which at least two flags are set at the same time.
multi_flag_rows = df.loc[df['num_flags_eq_1'].ge(2)]

print(f"\n Rows with two or more columns having value : {len(multi_flag_rows)}")


Total rows: 1022

Counts where value == 1 for each column:
PV_normal_qc        337
PV_heater_qc        310
PV_pool_qc          236
uncertflag_qc        16
delete_qc           133
PV_heater_mat_qc      6
dtype: int64

 Rows with two or more columns having value : 16


In [11]:
import geopandas as gpd

# Load the post-QC layer; the *_qc columns carry the reviewed labels.
gdf = gpd.read_file('/shared/data/climateplus2025/Prediction_for_poster_July21/0.Image_files_selection/balanced_3_predictions_July21_after_qc.gpkg')

# Copy the reviewed (QC) flags into the canonical label columns.
gdf["PV_normal"] = gdf["PV_normal_qc"]
gdf["PV_pool"] = gdf["PV_pool_qc"]
# Coerce to a clean integer flag up front: the bitwise OR below raises
# TypeError on a float column (which is what a flag column holding NaN is).
gdf["PV_heater"] = gdf["PV_heater_qc"].fillna(0).astype(int)
gdf["uncertflag"] = gdf["uncertflag_qc"]

# This GPKG has no plain "PV_heater_mat" column, only "PV_heater_mat_qc",
# so the combined heater-mat flag is built from the QC column alone.
gdf["PV_heater_mat_combined"] = gdf["PV_heater_mat_qc"].fillna(0).astype(int)

# Final PV_heater is the union of PV_heater_qc and the heater-mat flag.
gdf["PV_heater"] = gdf["PV_heater"] | gdf["PV_heater_mat_combined"]

# Keep rows where at least one PV-related flag is 1.
pv_mask = (
    (gdf["PV_normal"] == 1)
    | (gdf["PV_heater"] == 1)
    | (gdf["PV_pool"] == 1)
)

# Exclude rows that QC marked for deletion (delete_qc == 1).
delete_mask = gdf["delete_qc"] != 1

# Apply both filters; .copy() detaches the result from gdf.
filtered_gdf = gdf[pv_mask & delete_mask].copy()

# Save the filtered data.
# NOTE(review): this is a relative path, so the file lands in the current
# working directory. The geometry-check cell reads a file with this name from
# the shared data directory -- confirm the notebook is run from that directory.
output_path = "balanced_3_predictions_July21_after_qc_merging_columns.gpkg"
filtered_gdf.to_file(output_path, driver="GPKG")

# Output stats
print(f"Filtered dataset contains {len(filtered_gdf)} PV-related arrays")
print("Remaining columns:", filtered_gdf.columns.tolist())


Filtered dataset contains 889 PV-related arrays
Remaining columns: ['id', 'PV_normal', 'PV_heater', 'PV_pool', 'uncertflag', 'area', 'annotator', 'centroid_latitude', 'centroid_longitude', 'image_name', 'nw_corner_of_image_latitude', 'nw_corner_of_image_longitude', 'se_corner_of_image_latitude', 'se_corner_of_image_longitude', 'PV_normal_qc', 'PV_heater_qc', 'PV_pool_qc', 'PV_heater_mat_qc', 'uncertflag_qc', 'delete_qc', 'resizing_qc', 'geometry', 'PV_heater_mat_combined']


In [12]:
# Geometry error checker
from shapely.geometry import Polygon

gdf = gpd.read_file("/shared/data/climateplus2025/Prediction_for_poster_July21/0.Image_files_selection/balanced_3_predictions_July21_after_qc_merging_columns.gpkg")

# Rows whose geometry fails the validity check before any repair is attempted.
invalid_gdf = gdf[~gdf.geometry.is_valid].copy()
print(f"Invalid geometries found: {len(invalid_gdf)}")


def _repair_geometry(geom):
    """Return ``geom`` unchanged if it is missing or already valid, else ``geom.buffer(0)``.

    The ``None`` guard matters: calling ``.is_valid`` on a missing geometry
    would raise AttributeError and abort the whole cell.
    """
    if geom is None or geom.is_valid:
        return geom
    return geom.buffer(0)


# Error correction: buffer(0) method, applied only to invalid geometries.
gdf["geometry"] = gdf["geometry"].apply(_repair_geometry)

# Anything still failing the validity check after the repair attempt.
still_invalid = gdf[~gdf.geometry.is_valid].copy()
print(f"Still invalid after fixing: {len(still_invalid)}")

# Persist only the rows that pass the validity check.
gdf_valid = gdf[gdf.geometry.is_valid].copy()
gdf_valid.to_file("final_annotations_PV_all_types_balanced_3_cleaned.gpkg", driver="GPKG")

Invalid geometries found: 0
Still invalid after fixing: 0
