# Imports

In [1]:
import os
import pandas as pd

from orbit_generation.experiment import read_json_to_dataframe, plot_corr_matrix, create_experiment_image_grid
from orbit_generation.dataset import get_first_period_dataset
from orbit_generation.evaluation import evaluate_metrics_and_clustering

# Data

In [2]:
data_path = r"/orbit-generation/data/orbits_fix_1500/EM_N_fix_1500.h5"
experiments_folder = "../experiments"
seq_len=100

In [3]:
data, orbit_df, labels, system_dict = get_first_period_dataset(file_path=data_path, segment_length=seq_len)
data.shape

(45211, 7, 100)

In [4]:
labels.shape

(45211,)

In [7]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import pairwise_distances, adjusted_rand_score, normalized_mutual_info_score, silhouette_score
from sklearn.metrics import homogeneity_score, completeness_score, v_measure_score, fowlkes_mallows_score
from sklearn.metrics import jaccard_score, confusion_matrix
from scipy.spatial.distance import pdist, squareform
from scipy.optimize import linear_sum_assignment
from fastdtw import fastdtw
from sklearn.cluster import KMeans, DBSCAN
from sklearn.mixture import GaussianMixture

def evaluate_metrics_and_clustering(orbit_data: np.ndarray,
                                    true_labels: np.ndarray,
                                    distance_metrics: list = None,
                                    clustering_algorithms: list = None,
                                    evaluation_metrics: list = None,
                                    n_clusters: int = None,
                                    plot_results: bool = True):
    """
    Evaluates specified distance metrics and clustering algorithms on the given orbit data.
    
    :param orbit_data: The orbit data as a multivariate time series (shape: [n_samples, n_features, n_time_steps]).
    :param true_labels: Array of true labels for the orbit data.
    :param distance_metrics: List of strings specifying distance metrics to use. If None, all available metrics are used.
    :param clustering_algorithms: List of strings specifying clustering algorithms to use. If None, all available algorithms are used.
    :param evaluation_metrics: List of strings specifying evaluation metrics to use. If None, all available metrics are used.
    :param n_clusters: Number of clusters for algorithms that require it. If None, it will be inferred from labels.
    :param plot_results: If True, plot heatmaps of the results.
    :return: A dictionary containing results for all combinations of metrics and clustering algorithms.
    """

    def calculate_clustering_accuracy(true_labels, pred_labels):
        cm = confusion_matrix(true_labels, pred_labels)
        row_ind, col_ind = linear_sum_assignment(-cm)
        return cm[row_ind, col_ind].sum() / np.sum(cm)
    
    def euclidean_distance(x):
        n = x.shape[0]
        dist_matrix = np.zeros((n, n))
        for i in range(n):
            for j in range(i+1, n):
                dist = np.sqrt(np.sum((x[i] - x[j])**2))
                dist_matrix[i, j] = dist_matrix[j, i] = dist
        return dist_matrix

    def manhattan_distance(x):
        n = x.shape[0]
        dist_matrix = np.zeros((n, n))
        for i in range(n):
            for j in range(i+1, n):
                dist = np.sum(np.abs(x[i] - x[j]))
                dist_matrix[i, j] = dist_matrix[j, i] = dist
        return dist_matrix

    def cosine_distance(x):
        n = x.shape[0]
        dist_matrix = np.zeros((n, n))
        for i in range(n):
            for j in range(i+1, n):
                dot_product = np.sum(x[i] * x[j])
                norm_i = np.sqrt(np.sum(x[i]**2))
                norm_j = np.sqrt(np.sum(x[j]**2))
                dist = 1 - (dot_product / (norm_i * norm_j))
                dist_matrix[i, j] = dist_matrix[j, i] = dist
        return dist_matrix

    def dtw_distance(x):
        n = x.shape[0]
        dist_matrix = np.zeros((n, n))
        for i in range(n):
            for j in range(i+1, n):
                distance, _ = fastdtw(x[i].T, x[j].T)
                dist_matrix[i, j] = dist_matrix[j, i] = distance
        return dist_matrix

    # Define available distance metrics
    available_distance_metrics = {
        'euclidean': euclidean_distance,
        'manhattan': manhattan_distance,
        'cosine': cosine_distance,
        'dtw': dtw_distance
    }
    
    # Define available clustering algorithms
    available_clustering_algorithms = {
        'kmeans': KMeans,
        'gmm': GaussianMixture,
        'dbscan': DBSCAN
    }
    
    # Define available evaluation metrics
    available_evaluation_metrics = {
        'ARI': adjusted_rand_score,
        'NMI': normalized_mutual_info_score,
        'Homogeneity': homogeneity_score,
        'Completeness': completeness_score,
        'V-Measure': v_measure_score,
        'FMI': fowlkes_mallows_score,
        'Purity': lambda true, pred: np.sum(np.amax(confusion_matrix(true, pred), axis=0)) / np.sum(confusion_matrix(true, pred)),
        'Silhouette': silhouette_score,
        'Jaccard': lambda true, pred: jaccard_score(true, pred, average='macro'),
        'Accuracy': calculate_clustering_accuracy
    }
    
    # If no metrics specified, use all available
    if distance_metrics is None:
        distance_metrics = list(available_distance_metrics.keys())
    
    # If no algorithms specified, use all available
    if clustering_algorithms is None:
        clustering_algorithms = list(available_clustering_algorithms.keys())
    
    # If no evaluation metrics specified, use all available
    if evaluation_metrics is None:
        evaluation_metrics = list(available_evaluation_metrics.keys())
    
    results = {}
    
    # If n_clusters is not provided, infer it from the labels
    if n_clusters is None:
        n_clusters = len(np.unique(true_labels))
    
    # Reshape orbit_data for clustering algorithms
    reshaped_orbit_data = orbit_data
    
    for metric_name in distance_metrics:
        if metric_name not in available_distance_metrics:
            print(f"Warning: {metric_name} is not an available distance metric. Skipping.")
            continue
        
        print(f"Computing {metric_name} distances...")
        metric_func = available_distance_metrics[metric_name]
        distance_matrix = metric_func(orbit_data)
        
        for algo_name in clustering_algorithms:
            if algo_name not in available_clustering_algorithms:
                print(f"Warning: {algo_name} is not an available clustering algorithm. Skipping.")
                continue
            
            print(f"Clustering with {algo_name}...")
            algo_class = available_clustering_algorithms[algo_name]
            
            if algo_name == 'dbscan':
                # For DBSCAN, we need to estimate eps
                distances = squareform(distance_matrix)
                eps = np.percentile(distances, 10)  # Use the 10th percentile of distances as eps
                clusterer = algo_class(eps=eps, min_samples=5, metric='precomputed')
                labels = clusterer.fit_predict(distance_matrix)
            elif algo_name == 'kmeans':
                clusterer = algo_class(n_clusters=n_clusters, random_state=42)
                labels = clusterer.fit_predict(reshaped_orbit_data)
            elif algo_name == 'gmm':
                clusterer = algo_class(n_components=n_clusters, random_state=42)
                labels = clusterer.fit_predict(reshaped_orbit_data)
            else:
                clusterer = algo_class()  # Use default parameters for other algorithms
                labels = clusterer.fit_predict(reshaped_orbit_data)
            
            # Evaluate clustering
            eval_results = {}
            for eval_metric in evaluation_metrics:
                if eval_metric not in available_evaluation_metrics:
                    print(f"Warning: {eval_metric} is not an available evaluation metric. Skipping.")
                    continue
                
                if eval_metric == 'Silhouette':
                    unique_labels = np.unique(labels)
                    if len(unique_labels) > 1 and len(unique_labels) < len(labels):
                        eval_results[eval_metric] = available_evaluation_metrics[eval_metric](reshaped_orbit_data, labels)
                    else:
                        eval_results[eval_metric] = np.nan
                else:
                    eval_results[eval_metric] = available_evaluation_metrics[eval_metric](true_labels, labels)
            
            # Store results
            results[f"{metric_name}_{algo_name}"] = eval_results
    
    if plot_results:
        plot_metric_heatmaps(results, distance_metrics, clustering_algorithms, evaluation_metrics)
    
    return results

def plot_metric_heatmaps(results, distance_metrics, clustering_algorithms, evaluation_metrics):
    """
    Plot heatmaps for each evaluation metric.
    """
    n_metrics = len(evaluation_metrics)
    n_cols = 3
    n_rows = (n_metrics + n_cols - 1) // n_cols
    
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(5*n_cols, 4*n_rows))
    axes = axes.flatten()
    
    for i, metric in enumerate(evaluation_metrics):
        data = np.zeros((len(distance_metrics), len(clustering_algorithms)))
        for d, distance in enumerate(distance_metrics):
            for c, clustering in enumerate(clustering_algorithms):
                data[d, c] = results[f"{distance}_{clustering}"][metric]
        
        sns.heatmap(data, annot=True, fmt=".2f", cmap="YlGnBu", 
                    xticklabels=clustering_algorithms, yticklabels=distance_metrics, ax=axes[i])
        axes[i].set_title(f"{metric} Scores")
        axes[i].set_xlabel("Clustering Algorithms")
        axes[i].set_ylabel("Distance Metrics")
    
    # Remove any unused subplots
    for i in range(n_metrics, len(axes)):
        fig.delaxes(axes[i])
    
    plt.tight_layout()
    plt.show()

In [8]:
num_samples = 100
indices = np.random.choice(len(data), num_samples, replace=False)

sampled_data = data[indices]
sampled_labels = labels[indices]

results = evaluate_metrics_and_clustering(sampled_data, sampled_labels)
print(results)

Computing euclidean distances...
Clustering with kmeans...


ValueError: Found array with dim 3. KMeans expected <= 2.

In [31]:
num_samples = 1000
indices = np.random.choice(len(data), num_samples, replace=False)

sampled_data = data[indices]
sampled_labels = labels[indices]

results = evaluate_metrics_and_clustering(sampled_data, sampled_labels)
print(results)

Computing euclidean distances...
Clustering with kmeans...
Clustering with gmm...
Clustering with dbscan...


ValueError: Distance matrix 'X' must be symmetric.

: 