In [1]:
import geopandas as gpd
import pandas as pd

Load the file(s)

In [2]:
# Set your file path
# NOTE(review): this is an absolute path on one machine; point it at your own
# copy of the GeoPackage before running the notebook.
file_path = '/Users/remaalbarakati/Downloads/all annotations_files/files that needs merging /all group annotations final.gpkg'

# Read the GPKG file
try:
    gdf = gpd.read_file(file_path)
except Exception as e:
    # Report the problem, then re-raise: if the read fails `gdf` is never
    # bound, and every later cell would die with an unrelated NameError.
    print(f"Error reading file: {e}")
    raise


Get the original labels

In [3]:
def get_original_label(row):
    """Return the original annotation label for a single row.

    The flag columns are inspected in a fixed priority order; the first one
    whose value equals 1 decides the label.

    Args:
        row: Record exposing a ``.get(key)`` method (this notebook passes the
            rows produced by ``gdf.apply(..., axis=1)``).

    Returns:
        str: 'PV_normal', 'PV_heater', 'PV_pool' or 'Uncertain' for the first
        flag that is set, otherwise 'Unlabeled'.
    """
    # (flag column, resulting label), highest priority first.
    priority = (
        ('PV_normal', 'PV_normal'),
        ('PV_heater', 'PV_heater'),
        ('PV_pool', 'PV_pool'),
        ('uncertflag', 'Uncertain'),
    )
    for flag, label in priority:
        if row.get(flag) == 1:
            return label
    # No flag was set (or the columns are absent / not equal to 1).
    return 'Unlabeled'

Get the QC labels

In [4]:
def get_qc_label(row):
    """Return the QC annotation label for a single row.

    The QC flag columns are inspected in a fixed priority order; the first one
    whose value equals 1 decides the label. Note that 'delete_qc' is checked
    before 'PV_heater_mat', so a row with both set is reported as 'Deleted'.

    Args:
        row: Record exposing a ``.get(key)`` method (this notebook passes the
            rows produced by ``gdf.apply(..., axis=1)``).

    Returns:
        str: 'PV_normal', 'PV_heater', 'PV_pool', 'Uncertain', 'Deleted' or
        'PV_heater_mat' for the first flag that is set, otherwise 'Unlabeled'.
    """
    # (flag column, resulting label), highest priority first.
    priority = (
        ('PV_normal_qc', 'PV_normal'),
        ('PV_heater_qc', 'PV_heater'),
        ('PV_pool_qc', 'PV_pool'),
        ('uncertflag_qc', 'Uncertain'),
        ('delete_qc', 'Deleted'),
        ('PV_heater_mat', 'PV_heater_mat'),
    )
    for flag, label in priority:
        if row.get(flag) == 1:
            return label
    # No QC flag was set (or the columns are absent / not equal to 1).
    return 'Unlabeled'


Preprocess the data 

In [5]:
# Columns the label functions read: the original annotation flags first,
# followed by the QC flags.
required_cols = [
    'PV_normal', 'PV_heater', 'PV_pool', 'uncertflag',
    'PV_normal_qc', 'PV_heater_qc', 'PV_pool_qc',
    'uncertflag_qc', 'delete_qc', 'PV_heater_mat',
]

# Report which of the expected columns the loaded layer lacks (if any).
available = set(gdf.columns)
missing_cols = [name for name in required_cols if name not in available]
print(
    f"Missing required columns: {missing_cols}"
    if missing_cols
    else "All required columns are present."
)


All required columns are present.


Clean and label the data 

In [6]:
# Convert QC columns to numeric so the `== 1` checks in get_qc_label behave
# the same whether a flag was stored as a number or as text. Values that
# cannot be parsed become NaN and therefore never match.
# 'PV_heater_mat' is included because get_qc_label reads it like the other
# QC flags.
qc_flag_cols = [
    'PV_normal_qc', 'PV_heater_qc', 'PV_pool_qc',
    'uncertflag_qc', 'delete_qc', 'PV_heater_mat',
]
for col in qc_flag_cols:
    gdf[col] = pd.to_numeric(gdf[col], errors='coerce')

# Apply label functions row by row; each row gets exactly one label per side.
gdf['original_label'] = gdf.apply(get_original_label, axis=1)
gdf['qc_label'] = gdf.apply(get_qc_label, axis=1)


Compare labels

In [7]:
# Filter for valid comparisons: keep rows that carry a real label on both
# sides. The label functions never return NaN -- a row with no flag set gets
# the string 'Unlabeled' -- so the notna() checks alone let every row through.
# The sentinel is therefore excluded explicitly.
has_original = gdf['original_label'].notna() & (gdf['original_label'] != 'Unlabeled')
has_qc = gdf['qc_label'].notna() & (gdf['qc_label'] != 'Unlabeled')
compare_df = gdf[has_original & has_qc]

# Match stats: a "match" means QC kept the original label unchanged.
total = len(compare_df)
matches = (compare_df['original_label'] == compare_df['qc_label']).sum()
changed = total - matches

print(f"Total compared: {total}")
print(f"Matches: {matches}")
print(f"Changed: {changed}")


Total compared: 2168
Matches: 1843
Changed: 325


Generate confusion matrix

In [8]:
# Cross-tabulate original labels (rows) against QC labels (columns); each
# cell counts the rows of compare_df with that label pair.
original_labels = compare_df['original_label']
qc_labels = compare_df['qc_label']

conf_matrix = pd.crosstab(
    index=original_labels,
    columns=qc_labels,
    rownames=['Original Label'],
    colnames=['QC Label'],
)

# Bare expression so the notebook renders the table.
conf_matrix


QC Label,Deleted,PV_heater,PV_heater_mat,PV_normal,PV_pool,Uncertain
Original Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
PV_heater,16,690,119,40,5,0
PV_normal,7,14,9,596,52,2
PV_pool,3,2,5,24,557,0
Uncertain,4,5,6,3,9,0
