# Evaluation

> Scripts to perform evaluation on the data

In [None]:
#| default_exp evaluation

In [None]:
#| export
#| hide
import numpy as np
from sklearn.cluster import KMeans, DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, homogeneity_score, completeness_score, v_measure_score, fowlkes_mallows_score, silhouette_score, jaccard_score, confusion_matrix, accuracy_score, classification_report
from sklearn.metrics.cluster import contingency_matrix
from scipy.optimize import linear_sum_assignment
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_predict
from scipy.spatial.distance import squareform
from fastdtw import fastdtw
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy.spatial import cKDTree
from typing import Tuple

Auxiliary Functions

In [None]:
#| export
def plot_metric_heatmaps(results, distance_metrics, clustering_algorithms, evaluation_metrics):
    """
    Plot one heatmap per evaluation metric.

    Each heatmap has one row per distance metric and one column per clustering
    algorithm. The value of a cell is read from
    ``results[f"{distance}_{clustering}"][metric]``.

    Parameters:
    - results: dict keyed by "<distance>_<clustering>", each value being a dict of
      {evaluation metric name: score}.
    - distance_metrics: names used for the heatmap rows.
    - clustering_algorithms: names used for the heatmap columns.
    - evaluation_metrics: names of the scores to plot; one subplot is created per name.

    Combinations (or individual scores) that are absent from ``results`` are shown
    as NaN instead of raising ``KeyError``. When ``evaluation_metrics`` is empty
    nothing is drawn and the function returns immediately.
    """
    n_metrics = len(evaluation_metrics)
    if n_metrics == 0:
        # A subplot grid needs at least one row, so there is nothing to draw.
        return

    n_cols = 3
    n_rows = (n_metrics + n_cols - 1) // n_cols  # ceiling division

    fig, axes = plt.subplots(n_rows, n_cols, figsize=(5*n_cols, 4*n_rows))
    axes = axes.flatten()

    for ax, metric in zip(axes, evaluation_metrics):
        # Start from NaN so that missing combinations stay visibly empty.
        data = np.full((len(distance_metrics), len(clustering_algorithms)), np.nan)
        for d, distance in enumerate(distance_metrics):
            for c, clustering in enumerate(clustering_algorithms):
                scores = results.get(f"{distance}_{clustering}", {})
                data[d, c] = scores.get(metric, np.nan)

        sns.heatmap(data, annot=True, fmt=".2f", cmap="YlGnBu",
                    xticklabels=clustering_algorithms, yticklabels=distance_metrics, ax=ax)
        ax.set_title(f"{metric} Scores")
        ax.set_xlabel("Clustering Algorithms")
        ax.set_ylabel("Distance Metrics")

    # Remove any unused subplots
    for unused_ax in axes[n_metrics:]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    plt.show()

In [None]:
### the following function is not being used anymore

In [None]:
#| export
def plot_comparison(orbit_df, synthetic_orbit_df):
    """
    Draw a 'period' vs 'calculated_jacobi' scatter plot for two DataFrames.

    Points from orbit_df are drawn as blue circles and points from
    synthetic_orbit_df as red crosses; every synthetic point is additionally
    labelled with its (integer-cast) index value.

    Parameters:
    - orbit_df: DataFrame containing 'period' and 'calculated_jacobi' columns
    - synthetic_orbit_df: DataFrame containing 'period' and 'calculated_jacobi' columns
    """
    x_col = 'period'
    y_col = 'calculated_jacobi'
    synthetic_x = synthetic_orbit_df[x_col]
    synthetic_y = synthetic_orbit_df[y_col]

    plt.figure(figsize=(10, 6))

    # Reference orbits first, synthetic orbits on top
    plt.scatter(orbit_df[x_col], orbit_df[y_col],
                color='blue', label='orbit_df', marker='o')
    plt.scatter(synthetic_x, synthetic_y,
                color='red', label='synthetic_orbit_df', marker='x')

    # Label each synthetic point with its index value
    for row_label in synthetic_orbit_df.index:
        plt.text(
            synthetic_x.loc[row_label],
            synthetic_y.loc[row_label],
            str(int(row_label)),
            fontsize=9,
            color='black',
            ha='right',
        )

    # Axis titles, figure title and legend
    plt.xlabel('Period')
    plt.ylabel('Calculated Jacobi')
    plt.title('Comparison of Features Between Two DataFrames')
    plt.legend()

    # Render
    plt.grid(True)
    plt.show()

def calculate_closest_feature_distances(
    orbit_df: pd.DataFrame,
    synthetic_orbit_df: pd.DataFrame,
    features: list,
    display_comparison: bool = True
) -> Tuple[np.ndarray, np.ndarray]:
    """
    For every row of synthetic_orbit_df, find the nearest row of orbit_df in the
    space spanned by the given feature columns.

    Parameters:
    - orbit_df (pd.DataFrame): DataFrame containing the training data.
    - synthetic_orbit_df (pd.DataFrame): DataFrame containing the synthetic data.
    - features (list): Feature column names used to measure the distance.
    - display_comparison (bool): When True, plot_comparison is called on both DataFrames. Defaults to True.

    Returns:
    - distances (np.ndarray): Distance from each synthetic row to its nearest orbit row.
    - closest_indices (np.ndarray): Row positions of those nearest rows within
      orbit_df[features].values (positional, not DataFrame index labels).
    """
    # Pull the raw feature matrices out of both frames
    reference_matrix = orbit_df[features].values
    query_matrix = synthetic_orbit_df[features].values

    # A k-d tree over the reference rows answers the 1-nearest-neighbour query
    reference_tree = cKDTree(reference_matrix)
    nearest_distance, nearest_position = reference_tree.query(query_matrix, k=1)

    if display_comparison:
        plot_comparison(orbit_df, synthetic_orbit_df)

    return nearest_distance, nearest_position

In [None]:
#| export
def find_non_matching_elements(main_array, check_array):
    """
    Finds elements (rows) in check_array that are not present in main_array.

    Rows are compared as tuples, so both inputs must be iterables of row-like
    sequences (e.g. 2D arrays). Each non-matching row is reported once, in the
    order of its first occurrence in check_array, which makes the output
    deterministic.

    Parameters:
    main_array (numpy.ndarray): The main array with larger set of elements.
    check_array (numpy.ndarray): The array with elements to check against the main array.

    Returns:
    numpy.ndarray: Elements in check_array that are not in main_array. When nothing
    is missing and check_array is 2D, an empty array of shape (0, n_columns) is
    returned so the column count is preserved.
    """
    # Hashable view of the reference rows for O(1) membership tests
    known_rows = set(map(tuple, main_array))

    already_reported = set()
    missing_rows = []
    for row in map(tuple, check_array):
        if row in known_rows or row in already_reported:
            continue
        already_reported.add(row)
        missing_rows.append(row)

    if not missing_rows:
        check_as_array = np.asarray(check_array)
        if check_as_array.ndim == 2:
            # Keep the column count instead of collapsing to shape (0,)
            return np.empty((0, check_as_array.shape[1]), dtype=check_as_array.dtype)

    return np.array(missing_rows)

## Evaluate Clustering with Multiple Labels

In [None]:
#| export
def evaluate_clustering_multiple_labels(latent_representations: np.ndarray,  # The latent space data.
                                        list_of_labels: list,                # List of true labels or a single true labels array.
                                        clustering_method: str = 'kmeans',   # The clustering algorithm to use ('kmeans', 'gmm', 'dbscan').
                                        label_names: list = None,            # Optional names for the label sets.
                                        **kwargs                             # Additional arguments for the clustering algorithm.
                                       ) -> dict:                            # Returns a dictionary with clustering metrics.
    """
    Evaluates the clustering quality of the latent representations for one or multiple sets of labels.

    For every label set the data is clustered (the number of clusters is the number
    of distinct labels in that set, except for DBSCAN which only receives ``kwargs``)
    and the predicted clusters are compared against the encoded true labels.

    The returned dictionary contains, per label set, the keys
    ``<name>_ari``, ``<name>_nmi``, ``<name>_homogeneity``, ``<name>_completeness``,
    ``<name>_v-measure``, ``<name>_fmi``, ``<name>_purity``, ``<name>_silhouette_score``,
    ``<name>_jaccard`` and ``<name>_accuracy``. When more than one label set is given,
    ``average_<Metric>`` entries (mean over label sets) are added as well.

    If the clustering yields a single cluster, every metric of that label set is NaN
    (and therefore the corresponding averages are NaN too).

    ``kwargs`` are forwarded to the clusterer and take precedence over the built-in
    defaults (``random_state=42``, ``n_init=10`` for k-means, and the inferred
    number of clusters/components).

    Raises:
        ValueError: if ``clustering_method`` is unsupported, or if ``label_names``
            has fewer entries than there are label sets.
    """
    # (name used for the averages, suffix used for the per-label-set keys)
    metric_keys = [
        ('ARI', 'ari'),
        ('NMI', 'nmi'),
        ('Homogeneity', 'homogeneity'),
        ('Completeness', 'completeness'),
        ('V-Measure', 'v-measure'),
        ('FMI', 'fmi'),
        ('Purity', 'purity'),
        ('Silhouette Score', 'silhouette_score'),
        ('Jaccard', 'jaccard'),
        ('Accuracy', 'accuracy'),
    ]

    def calculate_clustering_accuracy(true_labels, pred_labels):
        # Best one-to-one matching between clusters and classes (Hungarian algorithm)
        contingency = contingency_matrix(true_labels, pred_labels)
        row_ind, col_ind = linear_sum_assignment(-contingency)
        return contingency[row_ind, col_ind].sum() / np.sum(contingency)

    def build_clusterer(n_clusters):
        # Defaults come first so that user-supplied kwargs override them instead of
        # triggering a "multiple values for keyword argument" TypeError.
        if clustering_method == 'kmeans':
            return KMeans(**{'n_clusters': n_clusters, 'random_state': 42, 'n_init': 10, **kwargs})
        if clustering_method == 'gmm':
            return GaussianMixture(**{'n_components': n_clusters, 'random_state': 42, **kwargs})
        if clustering_method == 'dbscan':
            return DBSCAN(**kwargs)
        raise ValueError("Unsupported clustering method. Choose from 'kmeans', 'gmm', 'dbscan'.")

    # Ensure list_of_labels is a list of arrays
    if isinstance(list_of_labels, np.ndarray):
        list_of_labels = [list_of_labels]
    num_label_sets = len(list_of_labels)

    # Use default names if label_names are not provided
    if label_names is None:
        label_names = [f'Set_{i+1}' for i in range(num_label_sets)]
    elif len(label_names) < num_label_sets:
        # Fail before any (potentially expensive) clustering is run
        raise ValueError(
            f"label_names has {len(label_names)} entries but {num_label_sets} label sets were given."
        )

    combined_metrics = {}
    metric_totals = {display_name: 0 for display_name, _ in metric_keys}

    encoded_labels_list = [LabelEncoder().fit_transform(labels) for labels in list_of_labels]

    for set_name, true_labels in zip(label_names, encoded_labels_list):
        # Determine the number of clusters
        n_clusters = len(np.unique(true_labels))

        # Apply the selected clustering algorithm
        pred_labels = build_clusterer(n_clusters).fit_predict(latent_representations)

        # Check if only one label is predicted
        if len(np.unique(pred_labels)) == 1:
            print(f"Warning: Only one cluster predicted for {set_name}. Some metrics will be NaN.")
            scores = {display_name: float('nan') for display_name, _ in metric_keys}
        else:
            cont_matrix = contingency_matrix(true_labels, pred_labels)
            scores = {
                'ARI': adjusted_rand_score(true_labels, pred_labels),
                'NMI': normalized_mutual_info_score(true_labels, pred_labels),
                'Homogeneity': homogeneity_score(true_labels, pred_labels),
                'Completeness': completeness_score(true_labels, pred_labels),
                'V-Measure': v_measure_score(true_labels, pred_labels),
                'FMI': fowlkes_mallows_score(true_labels, pred_labels),
                # Purity: each cluster votes for its majority class
                'Purity': np.sum(np.amax(cont_matrix, axis=0)) / np.sum(cont_matrix),
                'Silhouette Score': silhouette_score(latent_representations, pred_labels),
                # NOTE(review): cluster ids are compared to class ids without any
                # alignment here, so this value depends on the arbitrary numbering
                # of the clusters - confirm whether that is intended.
                'Jaccard': jaccard_score(true_labels, pred_labels, average='macro'),
                'Accuracy': calculate_clustering_accuracy(true_labels, pred_labels),
            }

        # Store the results for this set of labels and accumulate for averaging
        for display_name, suffix in metric_keys:
            combined_metrics[f'{set_name}_{suffix}'] = scores[display_name]
            metric_totals[display_name] += scores[display_name]

    # Compute the average metrics if there are multiple sets of labels
    if num_label_sets > 1:
        for display_name, total in metric_totals.items():
            combined_metrics[f'average_{display_name}'] = total / num_label_sets

    return combined_metrics

## Physical Distances

### Euclidean

In [None]:
#| export
def euclidean_distance(point1: np.ndarray, point2: np.ndarray) -> float:
    """Return the square root of the summed squared element-wise differences of the two arrays."""
    delta = point1 - point2
    squared_total = np.sum(np.square(delta))
    return np.sqrt(squared_total)

### Manhattan

In [None]:
#| export
def manhattan_distance(point1: np.ndarray, point2: np.ndarray) -> float:
    """Return the sum of the absolute element-wise differences of the two arrays."""
    absolute_delta = np.abs(point1 - point2)
    return np.sum(absolute_delta)

### Cosine

In [None]:
#| export
def cosine_distance(point1: np.ndarray, point2: np.ndarray) -> float:
    """
    Return one minus the cosine similarity of the two arrays.

    The similarity is the summed element-wise product divided by the product of
    both norms. If either norm is exactly zero the distance is defined as 1.0.
    """
    magnitude1 = np.linalg.norm(point1)
    magnitude2 = np.linalg.norm(point2)
    inner = np.sum(point1 * point2)

    both_nonzero = magnitude1 != 0.0 and magnitude2 != 0.0
    if not both_nonzero:
        # A zero vector has no direction; treat it as maximally distant
        return 1.0

    return 1.0 - inner / (magnitude1 * magnitude2)

### Dynamic Time Warping

In [None]:
#| export
def dtw_distance(point1: np.ndarray, point2: np.ndarray) -> float:
    """Run fastdtw on the transposes of both arrays and return the first element of its result."""
    dtw_result = fastdtw(point1.T, point2.T)
    return dtw_result[0]

### Generic

In [None]:
#| export
# Registry mapping each supported distance-metric name to the function that
# implements it; every function takes two arrays and returns their distance.
DISTANCE_FUNCTIONS = {
    'euclidean': euclidean_distance,
    'manhattan': manhattan_distance,
    'cosine': cosine_distance,
    'dtw': dtw_distance
}

In [None]:
#| export
def calculate_distance(point1: np.ndarray, point2: np.ndarray, distance_metric: str = 'euclidean') -> float:
    """
    Calculates the distance between two points based on the specified distance metric.

    The metric is looked up in the module-level DISTANCE_FUNCTIONS registry, so every
    metric registered there is supported without touching this function.

    :param point1: First data point array.
    :param point2: Second data point array.
    :param distance_metric: The distance metric to use ('euclidean', 'manhattan', 'cosine', 'dtw').
    :return: Distance as a float.
    :raises ValueError: If distance_metric is not a registered metric name.
    """
    try:
        distance_func = DISTANCE_FUNCTIONS[distance_metric]
    except (KeyError, TypeError):
        # TypeError covers unhashable values passed as the metric name
        raise ValueError(f'Unknown distance metric: {distance_metric}') from None
    return distance_func(point1, point2)

In [None]:
#| export
def calculate_pairwise_distances(array1: np.ndarray, array2: np.ndarray, distance_metric: str = 'euclidean') -> np.ndarray:
    """
    Computes, row by row, the distance between array1[i] and array2[i].

    :param array1: A 2D numpy array where each row represents a data point.
    :param array2: A 2D numpy array where each row represents a data point.
    :param distance_metric: The distance metric to use ('euclidean', 'manhattan', 'cosine', 'dtw').
    :return: A 1D numpy array containing the distances between corresponding pairs.
    :raises ValueError: If the two arrays do not have the same shape.
    """
    if not (array1.shape == array2.shape):
        raise ValueError("Both input arrays must have the same shape.")

    paired_distances = []
    for row_a, row_b in zip(array1, array2):
        paired_distances.append(calculate_distance(row_a, row_b, distance_metric))

    return np.array(paired_distances)

### Batch

In [None]:
#| export
def calculate_distances_batch(single_points: np.ndarray, points_array: np.ndarray, distance_metric: str = 'euclidean') -> np.ndarray:
    """
    Computes the distance from every query point to every point of points_array.

    A 1D single_points is treated as one query point. The result is a flat array
    ordered query by query: all distances for the first query point, then all
    distances for the second, and so on.

    :param single_points: Single data point array or a batch of data points.
    :param points_array: Array of data points to compare against.
    :param distance_metric: The distance metric to use ('euclidean', 'manhattan', 'cosine', 'dtw').
    :return: Array of distances.
    """
    queries = single_points.reshape(1, -1) if single_points.ndim == 1 else single_points

    return np.array([
        calculate_distance(query, candidate, distance_metric)
        for query in queries
        for candidate in points_array
    ])

### Nearest Points

In [None]:
#| export
def find_nearest_points(single_point: np.ndarray, points_array: np.ndarray, n: int, distance_metric: str = 'euclidean') -> tuple:
    """
    Finds the n entries of points_array that are closest to single_point.

    :param single_point: The query point, passed on to calculate_distances_batch.
    :param points_array: Array of data points to compare against.
    :param n: Number of nearest points to retrieve.
    :param distance_metric: The distance metric to use ('euclidean', 'manhattan', 'cosine', 'dtw').
    :return: (indices, distances) sorted by increasing distance; when n == 1 a single
             index and a single distance are returned instead of arrays.
    """
    all_distances = calculate_distances_batch(single_point, points_array, distance_metric=distance_metric)

    # Sort once, keep the first n positions and look up their distances
    closest_order = np.argsort(all_distances)[:n]
    closest_values = all_distances[closest_order]

    if n != 1:
        return closest_order, closest_values

    # Exactly one neighbour requested: unwrap to scalars
    return closest_order[0], closest_values[0]

In [None]:
#| export
def find_nearest_points_batch(single_points: np.ndarray, points_array: np.ndarray, n: int, distance_metric: str = 'euclidean') -> tuple:
    """
    Runs find_nearest_points for every query point and stacks the results.

    :param single_points: Array of single data points (2D array).
    :param points_array: Array of data points to compare against (2D array).
    :param n: Number of nearest points to retrieve for each single point.
    :param distance_metric: The distance metric to use ('euclidean', 'manhattan', 'cosine', 'dtw').
    :return: Tuple of nearest indices and nearest distances for each single point.
    """
    per_query = [
        find_nearest_points(query, points_array, n, distance_metric)
        for query in single_points
    ]

    stacked_indices = np.array([found_indices for found_indices, _ in per_query])
    stacked_distances = np.array([found_distances for _, found_distances in per_query])
    return stacked_indices, stacked_distances

## Orbits Distance

In [None]:
#| export
def orbits_distances(
    orbit_data1: np.ndarray,                # Shape: [n_samples1, n_features, n_time_steps] or [n_features, n_time_steps]
    orbit_data2: np.ndarray,                # Shape: [n_samples2, n_features, n_time_steps] or [n_features, n_time_steps]
    distance_metric: str                     # String representing the distance metric ('euclidean', 'manhattan', 'cosine', 'dtw')
) -> np.ndarray:
    """
    Computes distances between the orbits of two datasets with the chosen metric.

    A 2D input is interpreted as a single orbit and promoted to a batch of one,
    so single orbits and batches of orbits can be mixed freely.

    :param orbit_data1: First set of orbits (shape: [n_samples1, n_features, n_time_steps] or [n_features, n_time_steps]).
    :param orbit_data2: Second set of orbits or a single orbit.
                        Shape: [n_samples2, n_features, n_time_steps] or [n_features, n_time_steps].
    :param distance_metric: A string representing the distance metric to use ('euclidean', 'manhattan', 'cosine', 'dtw').

    :return: NumPy array of distances.
             - If at least one input holds a single orbit: 1D array with one
               entry per orbit of the other input.
             - If both inputs hold several orbits: matrix of shape [n_samples1, n_samples2].
    :raises ValueError: If an input is neither 2D nor 3D, if the per-orbit shapes
                        differ, or if the metric is not registered.
    """

    def as_batch(arr: np.ndarray) -> np.ndarray:
        """Return arr unchanged when 3D, with a leading axis added when 2D."""
        if arr.ndim == 3:
            return arr
        if arr.ndim == 2:
            return arr[np.newaxis, ...]
        raise ValueError(f"Input array must be either 2D or 3D, got {arr.ndim}D array instead.")

    # Registry of supported metrics
    registry = DISTANCE_FUNCTIONS

    batch1 = as_batch(orbit_data1)
    batch2 = as_batch(orbit_data2)

    # Every orbit must share the same [n_features, n_time_steps] layout
    if not (batch1.shape[1:] == batch2.shape[1:]):
        raise ValueError("Feature and time step dimensions must match between orbit_data1 and orbit_data2.")

    if distance_metric not in registry:
        raise ValueError(f"Unsupported distance metric: {distance_metric}. Choose from {list(registry.keys())}.")
    metric_fn = registry[distance_metric]

    count1 = batch1.shape[0]
    count2 = batch2.shape[0]

    if count1 == 1:
        # One reference orbit against every orbit of the second input
        # (this also covers the single-vs-single case)
        reference = batch1[0]
        return np.array([metric_fn(reference, other) for other in batch2]).flatten()

    if count2 == 1:
        # Every orbit of the first input against one reference orbit
        reference = batch2[0]
        return np.array([metric_fn(orbit, reference) for orbit in batch1]).flatten()

    # Full cross-distance matrix
    matrix = np.zeros((count1, count2), dtype=float)
    for i, orbit_a in enumerate(batch1):
        for j, orbit_b in enumerate(batch2):
            matrix[i, j] = metric_fn(orbit_a, orbit_b)
    return matrix

### Get the Closest Orbits

In [None]:
#| export
def find_nearest_orbits(
    single_orbit: np.ndarray,
    orbit_data: np.ndarray,
    n: int,
    distance_metric: str = 'euclidean'
) -> tuple:
    """
    Returns the n orbits of orbit_data that lie closest to single_orbit.

    :param single_orbit: The reference orbit (shape: [n_features, n_time_steps]).
    :param orbit_data: The dataset of orbits (shape: [n_samples, n_features, n_time_steps]).
    :param n: The number of closest orbits to return.
    :param distance_metric: The distance metric to use ('euclidean', 'manhattan', 'cosine', 'dtw').
                            Defaults to 'euclidean'.
    :return: A tuple containing:
             - Indices of the n closest orbits in orbit_data.
             - Distances of the n closest orbits.
             For n == 1 both entries are scalars rather than arrays.
    """
    # Distance from every dataset orbit to the reference orbit
    distances = orbits_distances(orbit_data, single_orbit, distance_metric)

    # Collapse a one-row matrix into a plain vector
    if distances.ndim == 2 and distances.shape[0] == 1:
        distances = distances.flatten()

    # Positions of the n smallest distances, and the distances themselves
    closest_order = np.argsort(distances)[:n]
    closest_values = distances[closest_order]

    if n != 1:
        return closest_order, closest_values
    return closest_order[0], closest_values[0]

In [None]:
#| export
def find_nearest_orbits_batch(
    single_orbits: np.ndarray,       # Shape: [num_single_orbits, n_features, n_time_steps]
    orbit_data: np.ndarray,          # Shape: [n_samples, n_features, n_time_steps]
    n: int,                           # Number of nearest orbits to find
    distance_metric: str = 'euclidean'  # Distance metric
) -> tuple:
    """
    Iteratively finds the n closest orbits in orbit_data for each orbit in single_orbits.

    :param single_orbits: The reference orbits (shape: [num_single_orbits, n_features, n_time_steps]).
    :param orbit_data: The dataset of orbits to search within (shape: [n_samples, n_features, n_time_steps]).
    :param n: The number of closest orbits to return for each single_orbit.
              Must satisfy 1 <= n <= n_samples.
    :param distance_metric: The distance metric to use ('euclidean', 'manhattan', 'cosine', 'dtw').
                            Defaults to 'euclidean'.
    :return: A tuple containing:
             - A 2D array of shape [num_single_orbits, n] with indices of the n closest orbits.
             - A 2D array of shape [num_single_orbits, n] with distances of the n closest orbits.
    :raises ValueError: If an input is not 3D, the per-orbit shapes differ, n is not
                        positive, n exceeds the number of orbits in orbit_data, or the
                        metric is unsupported.
    """
    # Validate input dimensions
    if single_orbits.ndim != 3:
        raise ValueError(f"single_orbits must be a 3D array, got {single_orbits.ndim}D array instead.")
    if orbit_data.ndim != 3:
        raise ValueError(f"orbit_data must be a 3D array, got {orbit_data.ndim}D array instead.")
    if single_orbits.shape[1:] != orbit_data.shape[1:]:
        raise ValueError("Each single_orbit must have the same shape as the orbits in orbit_data.")
    if n <= 0:
        raise ValueError("Parameter n must be a positive integer.")

    n_samples = orbit_data.shape[0]
    if n > n_samples:
        # Fewer than n neighbours exist, so the [num_single_orbits, n] result rows
        # could not be filled correctly (the row assignment would either fail to
        # broadcast or silently repeat a single neighbour).
        raise ValueError(
            f"Parameter n ({n}) cannot exceed the number of orbits in orbit_data ({n_samples})."
        )

    if distance_metric not in ['euclidean', 'manhattan', 'cosine', 'dtw']:
        raise ValueError(f"Unsupported distance metric: {distance_metric}. Choose from 'euclidean', 'manhattan', 'cosine', 'dtw'.")

    num_single_orbits = single_orbits.shape[0]

    nearest_indices_all = np.empty((num_single_orbits, n), dtype=int)
    nearest_distances_all = np.empty((num_single_orbits, n), dtype=float)

    for i, single_orbit in enumerate(single_orbits):
        nearest_indices, nearest_distances = find_nearest_orbits(
            single_orbit=single_orbit,
            orbit_data=orbit_data,
            n=n,
            distance_metric=distance_metric
        )
        # For n == 1 the helper returns scalars, which fill the length-1 row
        nearest_indices_all[i] = nearest_indices
        nearest_distances_all[i] = nearest_distances

    return nearest_indices_all, nearest_distances_all

### Calculate Pairwise distances

In [None]:
#| export
def calculate_pairwise_orbit_distances(
    orbit_data1: np.ndarray,       # Shape: [n_samples, n_features, n_time_steps]
    orbit_data2: np.ndarray,       # Shape: [n_samples, n_features, n_time_steps]
    distance_metric: str = 'euclidean'  # Distance metric
) -> np.ndarray:
    """
    Computes the distance between orbit i of the first dataset and orbit i of the second.

    :param orbit_data1: The first set of orbits (shape: [n_samples, n_features, n_time_steps]).
    :param orbit_data2: The second set of orbits (shape: [n_samples, n_features, n_time_steps]).
    :param distance_metric: The distance metric to use ('euclidean', 'manhattan', 'cosine', 'dtw').
                            Defaults to 'euclidean'.
    :return: An array of distances with shape [n_samples].
    :raises ValueError: If the shapes differ, the inputs are not 3D, or the metric
                        is not registered.
    """
    # Shape checks come first, in this order
    if not (orbit_data1.shape == orbit_data2.shape):
        raise ValueError("Both orbit datasets must have the same shape.")
    if orbit_data1.ndim != 3:
        raise ValueError(f"orbit_data1 must be a 3D array, got {orbit_data1.ndim}D array instead.")

    # Resolve the metric through the shared registry
    registry = DISTANCE_FUNCTIONS
    if distance_metric not in registry:
        raise ValueError(f"Unsupported distance metric: {distance_metric}. Choose from {list(registry.keys())}.")
    metric_fn = registry[distance_metric]

    return np.array(
        [metric_fn(orbit_a, orbit_b) for orbit_a, orbit_b in zip(orbit_data1, orbit_data2)],
        dtype=float,
    )

## Evaluate Distance Metrics

In [None]:
#| export
def evaluate_distance_metrics_and_clustering(orbit_data: np.ndarray,
                                    true_labels: np.ndarray,
                                    distance_metrics: list = None,
                                    clustering_algorithms: list = None,
                                    evaluation_metrics: list = None,
                                    n_clusters: int = None,
                                    plot_results: bool = True):
    """
    Evaluates specified distance metrics and clustering algorithms on the given orbit data.

    For every distance metric a pairwise distance matrix is built, symmetrised, shifted
    to be non-negative if necessary and given a zero diagonal. Each clustering algorithm
    is then fitted on that matrix (DBSCAN treats it as precomputed distances with eps set
    to the 10th percentile of the pairwise distances; k-means and GMM receive its rows as
    feature vectors) and the resulting labels are scored against ``true_labels``.

    Unknown distance metrics, clustering algorithms or evaluation metrics are reported
    with a warning and skipped; they are excluded from the plot as well.

    :param orbit_data: The orbit data as either:
                      - multivariate time series (shape: [n_samples, n_features, n_time_steps])
                      - point data (shape: [n_samples, n_features])
    :param true_labels: Array of true labels for the orbit data.
    :param distance_metrics: List of strings specifying distance metrics to use. If None, all available metrics are used.
    :param clustering_algorithms: List of strings specifying clustering algorithms to use. If None, all available algorithms are used.
    :param evaluation_metrics: List of strings specifying evaluation metrics to use. If None, all available metrics are used.
    :param n_clusters: Number of clusters for algorithms that require it. If None, it will be inferred from labels.
    :param plot_results: If True, plot heatmaps of the results.
    :return: A dictionary containing results for all combinations of metrics and clustering algorithms,
             keyed by "<distance_metric>_<algorithm>".
    :raises ValueError: If orbit_data is neither 2D nor 3D.
    """

    # contingency_matrix treats true and predicted labels independently, so the
    # true labels do not have to be comparable with the integer cluster ids.
    def calculate_clustering_accuracy(true_labels, pred_labels):
        cm = contingency_matrix(true_labels, pred_labels)
        row_ind, col_ind = linear_sum_assignment(-cm)
        return cm[row_ind, col_ind].sum() / np.sum(cm)

    def calculate_purity(true_labels, pred_labels):
        cm = contingency_matrix(true_labels, pred_labels)
        return np.sum(np.amax(cm, axis=0)) / np.sum(cm)

    def keep_available(requested, available, warning_template):
        # Warn once per unknown name and keep only the usable ones (order preserved)
        usable = []
        for name in requested:
            if name in available:
                usable.append(name)
            else:
                print(warning_template.format(name))
        return usable

    # Define available distance metrics
    available_distance_metrics = list(DISTANCE_FUNCTIONS.keys())

    # Define available clustering algorithms
    available_clustering_algorithms = {
        'kmeans': KMeans,
        'gmm': GaussianMixture,
        'dbscan': DBSCAN
    }

    # Define available evaluation metrics
    available_evaluation_metrics = {
        'ARI': adjusted_rand_score,
        'NMI': normalized_mutual_info_score,
        'Homogeneity': homogeneity_score,
        'Completeness': completeness_score,
        'V-Measure': v_measure_score,
        'Purity': calculate_purity,
        'Accuracy': calculate_clustering_accuracy
    }

    # If nothing is specified, use everything that is available
    if distance_metrics is None:
        distance_metrics = available_distance_metrics
    if clustering_algorithms is None:
        clustering_algorithms = list(available_clustering_algorithms.keys())
    if evaluation_metrics is None:
        evaluation_metrics = list(available_evaluation_metrics.keys())

    # Drop unknown names up front so the computation and the plot agree on what exists
    distance_metrics = keep_available(
        distance_metrics, available_distance_metrics,
        "Warning: {} is not an available distance metric. Skipping.")
    clustering_algorithms = keep_available(
        clustering_algorithms, available_clustering_algorithms,
        "Warning: {} is not an available clustering algorithm. Skipping.")
    evaluation_metrics = keep_available(
        evaluation_metrics, available_evaluation_metrics,
        "Warning: {} is not an available evaluation metric. Skipping.")

    results = {}

    # If n_clusters is not provided, infer it from the labels
    if n_clusters is None:
        n_clusters = len(np.unique(true_labels))

    for distance_metric in distance_metrics:
        print(f"Computing {distance_metric} distances...")

        # Check input dimensionality and compute distances accordingly
        if orbit_data.ndim == 3:
            distance_matrix = orbits_distances(orbit_data, orbit_data, distance_metric)
        elif orbit_data.ndim == 2:
            n_samples = orbit_data.shape[0]
            distance_matrix = np.zeros((n_samples, n_samples))
            distance_func = DISTANCE_FUNCTIONS[distance_metric]
            # Only the upper triangle is computed; the lower one is mirrored
            for i in range(n_samples):
                for j in range(i+1, n_samples):
                    distance = distance_func(orbit_data[i], orbit_data[j])
                    distance_matrix[i,j] = distance
                    distance_matrix[j,i] = distance
        else:
            raise ValueError(f"Input data must be 2D or 3D, got {orbit_data.ndim}D array instead.")

        # Make the matrix symmetric by averaging with its transpose
        distance_matrix = 0.5 * (distance_matrix + distance_matrix.T)

        # Shift so that no distance is negative (e.g. after floating point round-off)
        if np.any(distance_matrix < 0):
            distance_matrix = distance_matrix - np.min(distance_matrix)
        np.fill_diagonal(distance_matrix, 0)  # Ensure diagonal is exactly zero

        for algo_name in clustering_algorithms:
            print(f"Clustering with {algo_name}...")
            algo_class = available_clustering_algorithms[algo_name]

            if algo_name == 'dbscan':
                # For DBSCAN, we need to estimate eps
                distances = squareform(distance_matrix)
                eps = np.percentile(distances, 10)  # Use the 10th percentile of distances as eps
                clusterer = algo_class(eps=eps, min_samples=5, metric='precomputed')
            elif algo_name == 'kmeans':
                clusterer = algo_class(n_clusters=n_clusters, random_state=42)
            elif algo_name == 'gmm':
                clusterer = algo_class(n_components=n_clusters, random_state=42)
            else:
                clusterer = algo_class()  # Use default parameters for other algorithms
            labels = clusterer.fit_predict(distance_matrix)

            # Evaluate clustering and store the scores of this combination
            results[f"{distance_metric}_{algo_name}"] = {
                eval_metric: available_evaluation_metrics[eval_metric](true_labels, labels)
                for eval_metric in evaluation_metrics
            }

    # Plot only when something was actually computed
    if plot_results and results and evaluation_metrics:
        plot_metric_heatmaps(results, distance_metrics, clustering_algorithms, evaluation_metrics)

    return results

## Machine Learning

In [None]:
#| export
def machine_learning_evaluation(X, y, print_results=False, return_best_model=False, scale_data=True, cv=5):
    """
    Evaluates multiple machine learning classifiers on the provided dataset using
    cross-validated predictions.

    Parameters:
    - X: Features as a NumPy array with at least 2 dimensions. Arrays with more than
      2 dimensions are flattened to (n_samples, n_features).
    - y: Target labels.
    - print_results: If True, plots an accuracy bar chart and a heatmap of the
      macro-averaged precision/recall/F1 of every successfully evaluated algorithm.
    - return_best_model: If True, also returns the estimator with the highest accuracy.
    - scale_data: If True, a StandardScaler is placed in front of each classifier.
    - cv: Cross-validation strategy forwarded to `cross_val_predict` (default: 5 folds).

    Returns:
    - results: Dictionary mapping each algorithm name to
      {'accuracy': float, 'report': dict}. An algorithm that raised during
      evaluation maps to {'accuracy': None, 'report': None}.
    - best_model: Only when return_best_model is True. The estimator instance with the
      highest cross-validated accuracy, or None if every algorithm failed.
      `cross_val_predict` fits clones, so this instance itself has NOT been fitted
      and does not include the scaler; fit it before predicting.

    Raises:
    - TypeError: If X is not a NumPy array.
    - ValueError: If X has fewer than 2 dimensions or cannot be flattened to 2D.
    """

    def visualize_results(results):
        # Algorithms that raised during evaluation carry None values; plotting
        # them would crash both charts, so only successful runs are shown.
        scored = {
            algo: result for algo, result in results.items()
            if result['accuracy'] is not None and result['report'] is not None
        }
        if not scored:
            print("No successful evaluations to visualize.")
            return

        # Accuracy comparison
        names = list(scored)
        accuracies = [scored[algo]['accuracy'] for algo in names]
        plt.figure(figsize=(10, 6))
        bars = plt.bar(names, accuracies, color='skyblue')
        plt.title('Accuracy Comparison')
        plt.ylabel('Accuracy')
        plt.ylim(0, 1)
        for bar in bars:
            yval = bar.get_height()
            plt.text(bar.get_x() + bar.get_width()/2, yval, f"{yval:.2f}", ha='center', va='bottom')
        plt.show()

        # Detailed metrics heatmap.
        # 'macro avg' is the unweighted mean of the per-class scores, i.e. the
        # value obtained by averaging every class entry of the report by hand.
        metrics = ['precision', 'recall', 'f1-score']
        data = [
            [algo, metric, result['report']['macro avg'][metric]]
            for algo, result in scored.items()
            for metric in metrics
        ]

        df = pd.DataFrame(data, columns=['Algorithm', 'Metric', 'Value'])
        pivot_df = df.pivot(index='Algorithm', columns='Metric', values='Value')

        plt.figure(figsize=(12, 8))
        sns.heatmap(pivot_df, annot=True, cmap='YlGnBu', fmt='.2f')
        plt.title('Performance Metrics Heatmap')
        plt.show()

    # Validate and reshape X if necessary
    if not isinstance(X, np.ndarray):
        raise TypeError("Input features X must be a NumPy array.")
    if X.ndim < 2:
        raise ValueError(f"Input features must be at least 2D, but got {X.ndim}D.")
    if X.ndim > 2:
        try:
            # Flatten all dimensions except the first (samples)
            X = X.reshape(X.shape[0], -1)
        except ValueError as e:
            raise ValueError(f"Error reshaping input features: {e}") from e
        print("Input features reshaped to 2D for processing.")

    # List of algorithms to evaluate
    algorithms = {
        'Logistic Regression': LogisticRegression(max_iter=2000),
        'Decision Tree': DecisionTreeClassifier(),
        'Support Vector Machine': SVC(),
        'Random Forest': RandomForestClassifier()
    }

    results = {}
    best_model = None
    best_accuracy = None

    # Train and evaluate each algorithm
    for name, model in algorithms.items():
        try:
            # make_pipeline takes bare estimators (it generates the step names
            # itself); passing (name, estimator) tuples makes every fit fail.
            if scale_data:
                pipeline = make_pipeline(StandardScaler(), model)
            else:
                pipeline = make_pipeline(model)

            # Use cross-validation to get out-of-fold predictions
            y_pred = cross_val_predict(pipeline, X, y, cv=cv)

            accuracy = accuracy_score(y, y_pred)
            report = classification_report(y, y_pred, output_dict=True, zero_division=0)
        except Exception as e:
            # Best-effort: one failing algorithm must not abort the comparison.
            print(f"Error evaluating {name}: {e}")
            results[name] = {'accuracy': None, 'report': None}
            continue

        results[name] = {'accuracy': accuracy, 'report': report}

        # Track the best model; the first success always counts, even at accuracy 0
        if best_accuracy is None or accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = model

    if print_results:
        visualize_results(results)

    if return_best_model:
        return results, best_model

    return results

In [None]:
#| hide
# Run nbdev's export step for this project (notebook housekeeping cell).
import nbdev

nbdev.nbdev_export()