# QA Training Data 
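### Visualise the clusters in the training data and flag likely outliers
The goal is to reduce confusion between classes: the training points for each class should be clean, accurate and spectrally separable. Points flagged as outliers can then optionally be excluded.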

In [1]:
# Development convenience: load IPython's autoreload extension and switch it to
# mode 2, so that edits to imported modules are picked up when a cell is run
# instead of requiring a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import geopandas as gpd
from ldn.typology import colors

In [3]:
# Load the labelled training points; `class_attr` names the column holding the class label.
class_attr = "lulc"
training_data = gpd.read_file("training_data.geojson")

# Classes actually present in the data, sorted. The colour list passed as `cmap`
# is built in the same order as `categories`.
present_classes = sorted(training_data[class_attr].unique())

# Interactive map of the training points, coloured by class.
training_data.explore(
    column=class_attr,
    categorical=True,
    categories=present_classes,
    cmap=[colors[cls_value] for cls_value in present_classes],
    legend=True,
    style_kwds=dict(radius=6, fillOpacity=0.8, weight=0.5),
)

In [4]:
# List the column names to see which attributes are available for each training point.
training_data.columns

Index(['lulc', 'green', 'smad', 'swir16', 'bcmad', 'red', 'blue', 'swir22',
       'nir08', 'emad', 'ndvi', 'ndwi', 'mndwi', 'ndti', 'bsi', 'mbi', 'baei',
       'bui', 'bg', 'ln_bg', 'elevation', 'geometry'],
      dtype='object')

In [10]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import cdist

# ── 1. Prep feature matrix ────────────────────────────────────────────────────
# Every column except the class label, the geometry and this cell's own
# 'outlier' flag is used as a feature. Excluding 'outlier' keeps the cell
# idempotent: the flag already exists when the cell is re-run, and it is also
# present in the GeoJSON the notebook writes out (the same file it loads from),
# so without the exclusion it would leak into the feature matrix.
exclude_cols = ['lulc', 'geometry', 'outlier']
feature_cols = [c for c in training_data.columns if c not in exclude_cols]

# Initialise outlier column — rows with a NaN in any feature are automatically flagged True
nan_mask = training_data[feature_cols].isna().any(axis=1)
training_data['outlier'] = False
training_data.loc[nan_mask, 'outlier'] = True

nan_rows = int(nan_mask.sum())
print(f"Rows with NaNs (auto-flagged): {nan_rows}")

# Work only on rows with complete features
valid_mask = ~nan_mask
valid_idx  = training_data.index[valid_mask]

# Features are standardised once over all valid rows (not per class), so
# distances in the per-class clustering below are in globally scaled units.
X = training_data.loc[valid_mask, feature_cols].values
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# ── 2. Per-class clustering + outlier flagging ────────────────────────────────
threshold = 2.0   # ← tune: lower = more aggressive flagging
max_k = 5                    # largest number of sub-clusters tried per class
min_points_per_cluster = 5   # a class needs at least k * this many points to try k clusters
seed = 42                    # fixed so repeated runs flag the same points

valid_classes = training_data.loc[valid_mask, 'lulc']

for cls in valid_classes.unique():
    mask = (valid_classes == cls).to_numpy()
    idx  = valid_idx[mask]
    Xc   = X_scaled[mask]
    n    = len(Xc)

    # Classes with fewer than 2 * min_points_per_cluster points cannot support
    # even k=2 and are skipped — their rows keep outlier=False.
    k_max = min(max_k, n // min_points_per_cluster)
    if k_max < 2:
        continue

    # Choose k by silhouette score. The fitted model for the best k is kept, so
    # it does not need to be refitted afterwards (with a fixed random_state the
    # refit would reproduce the same clustering anyway).
    best_km, best_k, best_score = None, None, -np.inf
    for k in range(2, k_max + 1):
        km     = KMeans(n_clusters=k, random_state=seed, n_init=10)
        labels = km.fit_predict(Xc)
        if len(np.unique(labels)) < 2:
            # silhouette_score needs at least two distinct labels; KMeans can
            # collapse to one when the points are (near-)identical.
            continue
        score = silhouette_score(Xc, labels)
        if score > best_score:
            best_km, best_k, best_score = km, k, score

    if best_km is None:
        continue  # no usable clustering for this class — leave as False

    labels  = best_km.labels_
    centers = best_km.cluster_centers_

    # Within each sub-cluster, flag points whose distance to the cluster centre
    # exceeds `threshold` times that cluster's median distance.
    outlier_mask = np.zeros(n, dtype=bool)
    for cluster_id in range(best_k):
        members = np.flatnonzero(labels == cluster_id)
        if members.size == 0:
            continue  # empty cluster: nothing to measure
        dists    = cdist(Xc[members], centers[cluster_id].reshape(1, -1)).ravel()
        median_d = np.median(dists)
        if median_d == 0:
            # A zero median distance makes a relative threshold meaningless.
            continue
        outlier_mask[members[dists > threshold * median_d]] = True

    training_data.loc[idx[outlier_mask], 'outlier'] = True

    print(f"  {str(cls):30s} | n={n:4d} | k={best_k} | sil={best_score:.3f} "
          f"| outliers={outlier_mask.sum():4d} ({100*outlier_mask.mean():.1f}%)")

# ── 3. Summary ────────────────────────────────────────────────────────────────
# The per-class percentages below are computed over ALL rows of the class,
# including the NaN rows flagged in step 1, so they can be higher than the
# percentages printed per class in step 2 (which cover complete rows only).
print(f"\nTotal points : {len(training_data):,}")
print(f"Clean        : {(~training_data['outlier']).sum():,}")
print(f"Outliers     : {training_data['outlier'].sum():,}")
print("\nOutliers per class:")
print(training_data.groupby('lulc')['outlier'].mean().mul(100).round(1).astype(str) + '%')

Rows with NaNs (auto-flagged): 13
  6                              | n=1385 | k=2 | sil=0.368 | outliers=  90 (6.5%)
  5                              | n= 502 | k=2 | sil=0.299 | outliers=  30 (6.0%)
  1                              | n= 300 | k=2 | sil=0.344 | outliers=  17 (5.7%)
  2                              | n= 300 | k=2 | sil=0.346 | outliers=  18 (6.0%)
  7                              | n= 297 | k=2 | sil=0.313 | outliers=  19 (6.4%)
  4                              | n= 300 | k=2 | sil=0.367 | outliers=  17 (5.7%)
  3                              | n= 300 | k=2 | sil=0.304 | outliers=  16 (5.3%)

Total points : 3,397
Clean        : 3,177
Outliers     : 220

Outliers per class:
lulc
1    5.7%
2    6.0%
3    5.3%
4    5.7%
5    6.2%
6    7.1%
7    7.3%
Name: outlier, dtype: object
  5                              | n= 502 | k=2 | sil=0.299 | outliers=  30 (6.0%)
  1                              | n= 300 | k=2 | sil=0.344 | outliers=  17 (5.7%)
  2                             

In [11]:
# Keep only the points that were not flagged as outliers.
training_clean = training_data.loc[~training_data["outlier"]]

# Classes still present after filtering, sorted. The colour list passed as
# `cmap` is built in the same order as `categories`.
present_classes = sorted(training_clean[class_attr].unique())

# Interactive map of the remaining (clean) training points, coloured by class.
training_clean.explore(
    column=class_attr,
    categorical=True,
    categories=present_classes,
    cmap=[colors[cls_value] for cls_value in present_classes],
    legend=True,
    style_kwds=dict(radius=6, fillOpacity=0.8, weight=0.5),
)

In [12]:
# Write the full frame back to GeoJSON. This is `training_data` (all rows, with
# the added boolean 'outlier' column), not the filtered `training_clean`.
# NOTE(review): the path is the same file the notebook reads at the top, so the
# input is overwritten in place and the next run will load a file that already
# contains an 'outlier' column. Consider writing to a separate output file —
# confirm the intent before changing.
training_data.to_file("training_data.geojson", driver="GeoJSON")