## Anomaly Clustering Evaluation
* Evaluate clustering through:
    * Silhouette Score
    * Davies-Bouldin Index and Calinski-Harabasz Index
    * Internal Cluster Density and Separation Metrics
    * Connectivity or Nearest-Neighbor Analysis

#### Imports

In [None]:
import os
import pickle

import pandas as pd

from ocean_tools.io.writers import store_pickle_variable
from ocean_tools.processing.clustering import extract_experiment_aggregated_features, compute_clustering_metrics

# Grid of clustering experiments evaluated in this notebook. Every row of
# 'experiments' is a positional record; 'experiment_structure' names its fields
# in order.
# NOTE(review): some 'experiment_name' labels (e.g. 'Smaller Geo Eps') are shared by
# a 2018 row and a full-period row, so the label alone does not identify a run.
ExperimentSet = dict(
    experiment_structure=[
        'variable', 'start_date', 'end_date', 'anomaly_threshold',
        'eps_t', 'eps_lat', 'eps_lon',
        'min_neighbors', 'min_cluster_size', 'experiment_name',
    ],
    experiments=[
        # Rows covering 2018-01-01 to 2018-12-01.
        ['sst', '2018-01-01', '2018-12-01', 2, 1, 10, 10, 100, 1000, 'Base 2018'],
        ['sst', '2018-01-01', '2018-12-01', 2, 1, 5, 5, 100, 1000, 'Smaller Geo Eps'],
        ['sst', '2018-01-01', '2018-12-01', 2, 1, 20, 20, 100, 1000, 'Larger Geo Eps'],
        ['sst', '2018-01-01', '2018-12-01', 2, 2, 10, 10, 100, 1000, 'Larger Time Eps'],
        ['sst', '2018-01-01', '2018-12-01', 2, 10, 10, 10, 100, 1000, 'Much Larger Time Eps'],
        ['sst', '2018-01-01', '2018-12-01', 2, 1, 10, 10, 50, 1000, 'Smaller MinNeighbors'],
        ['sst', '2018-01-01', '2018-12-01', 2, 1, 10, 10, 200, 1000, 'Larger MinNeighbors'],
        # Rows covering 2002-08-01 to 2024-07-01.
        ['sst', '2002-08-01', '2024-07-01', 2, 1, 10, 10, 100, 1000, 'Base Full Period'],
        ['sst', '2002-08-01', '2024-07-01', 2, 1, 20, 20, 500, 1000, 'Larger Min Neighbors'],
        ['sst', '2002-08-01', '2024-07-01', 2, 1, 20, 20, 1000, 1000, 'Much Larger Min Neighbors'],
        ['sst', '2002-08-01', '2024-07-01', 2, 1, 10, 10, 500, 1000, 'Smaller Geo Eps'],
        ['sst', '2002-08-01', '2024-07-01', 2, 2, 20, 20, 500, 1000, 'Larger Time Eps'],
        ['sst', '2002-08-01', '2024-07-01', 2, 12, 20, 20, 500, 1000, 'Much Larger Time Eps'],
    ],
)

#### Evaluation Metrics From Cluster Features

In [23]:
# Read stored features.
exports_path = './data/exports/clusters/features/'
experiment_features_file_name = 'experiment_features_run_1'

# Load inside a context manager so the file handle is closed as soon as
# unpickling finishes instead of being left open for the garbage collector.
# NOTE: pickle.load can execute arbitrary code; only load files this project wrote.
with open(os.path.join(exports_path, f"{experiment_features_file_name}.pkl"), 'rb') as features_file:
    experiment_features = pickle.load(features_file)

In [None]:
# Reduce every experiment's stored features to its aggregated summary.
experiment_level_features = list(map(extract_experiment_aggregated_features, experiment_features))

# Tabulate the per-experiment summaries and print the full table without the index column.
df_experiment_features = pd.DataFrame(experiment_level_features)
features_table_text = df_experiment_features.to_string(index=False)
print(features_table_text)

          experiment_name                                                               file_name  num_clusters  time_length_mean  time_length_std  time_length_min  time_length_max  lat_length_mean  lat_length_std  lat_length_min  lat_length_max  lon_length_mean  lon_length_std  lon_length_min  lon_length_max  cluster_size_mean  cluster_size_std  cluster_size_min  cluster_size_max  cluster_volume_mean  cluster_volume_std  cluster_volume_min  cluster_volume_max  cluster_compactness_mean  cluster_compactness_std  cluster_compactness_min  cluster_compactness_max  cluster_eccentricity_mean  cluster_eccentricity_std  cluster_eccentricity_min  cluster_eccentricity_max  cluster_density_mean  cluster_density_std  cluster_density_min  cluster_density_max
                Base 2018  clustering_experiment_2018-01-01_2018-12-01_sst_2_1_10_10_100_1000.pkl             8          3.250000         3.344772              1.0             12.0       125.625000      104.278398            53.0           395.

#### Evaluation Metrics From Clustered Data

In [25]:
# Read Stored Clusters.
exports_path = './data/exports/clusters/'
experiment_results = []
for experiment in ExperimentSet['experiments']:
    # Each experiment is a positional record ordered as ExperimentSet['experiment_structure'].
    variable, start_date, end_date, anomaly_threshold, eps_t, eps_lat, eps_lon, min_neighbors, min_cluster_size, experiment_name = experiment
    # The result file name is built from every parameter except the display name.
    file_name = f"clustering_experiment_{start_date}_{end_date}_{variable}_{anomaly_threshold}_{eps_t}_{eps_lat}_{eps_lon}_{min_neighbors}_{min_cluster_size}.pkl"
    # Use a context manager so each result file is closed before the next one is opened;
    # the bare open() call previously leaked one handle per experiment.
    # NOTE: pickle.load can execute arbitrary code; only load files this project wrote.
    with open(os.path.join(exports_path, file_name), 'rb') as clusters_file:
        clusters, n_clusters, n_discarded, run_seconds = pickle.load(clusters_file)
    experiment_results.append((clusters, n_clusters, n_discarded, run_seconds, file_name, experiment_name))

In [None]:
# Evaluate the clustering of every loaded experiment and report one summary line each.
experiment_metrics = []
# Only the first (clusters) and last (experiment_name) entries of each result tuple are used here.
for clusters, *unused_fields, experiment_name in experiment_results:
    metrics = compute_clustering_metrics(clusters)
    experiment_metrics.append((experiment_name, metrics))
    summary_line = f"Experiment: {experiment_name} | Silhouette: {metrics['silhouette']:.3f} | Davies-Bouldin: {metrics['davies_bouldin']:.3f} | Calinski-Harabasz: {metrics['calinski_harabasz']:.3f} | Noise Ratio: {metrics['noise_ratio']:.3f}"
    print(summary_line)

Experiment: Base 2018 | Silhouette: 0.185 | Davies-Bouldin: 0.603 | Calinski-Harabasz: 18661.954 | Noise Ratio: 0.001
Experiment: Smaller Geo Eps | Silhouette: 0.036 | Davies-Bouldin: 1.458 | Calinski-Harabasz: 20366.325 | Noise Ratio: 0.004
Experiment: Larger Geo Eps | Silhouette: 0.130 | Davies-Bouldin: 0.831 | Calinski-Harabasz: 24562.452 | Noise Ratio: 0.000
Experiment: Larger Time Eps | Silhouette: 0.304 | Davies-Bouldin: 0.516 | Calinski-Harabasz: 19714.683 | Noise Ratio: 0.001
Experiment: Much Larger Time Eps | Silhouette: 0.302 | Davies-Bouldin: 0.518 | Calinski-Harabasz: 19758.283 | Noise Ratio: 0.001
Experiment: Smaller MinNeighbors | Silhouette: 0.234 | Davies-Bouldin: 0.641 | Calinski-Harabasz: 19113.702 | Noise Ratio: 0.001
Experiment: Larger MinNeighbors | Silhouette: 0.056 | Davies-Bouldin: 1.287 | Calinski-Harabasz: 32230.437 | Noise Ratio: 0.002
Experiment: Base Full Period | Silhouette: -0.162 | Davies-Bouldin: 1.427 | Calinski-Harabasz: 40511.643 | Noise Ratio: 0.001

In [None]:
# Store all results.
# Persist the (experiment_name, metrics) pairs collected above via the project's pickle writer.
exports_path = './data/exports/clusters/evaluation_metrics/'
# Plain string literal: the name has no placeholders, so the f-prefix was unnecessary.
file_name = 'clustering_metrics_run_1'
store_pickle_variable(experiment_metrics, exports_path, file_name)

#### To do:
* Documentation refinement & site generation.
* Memoria:
    * Update outdated content.
    * Include clustering.
    * Include GitHub repo.
    * Include ocean_tools docs.
    * Expand references to other papers.
    * Tutor feedback.