In [1]:
import warnings
from sklearn.model_selection import train_test_split
import os
from sklearn.cluster import KMeans, DBSCAN, AffinityPropagation, Birch, OPTICS, MeanShift, AgglomerativeClustering
import time
from sklearn.model_selection import ParameterSampler
from typing import Union, Tuple
from os import PathLike
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.metrics import accuracy_score, confusion_matrix
from scipy.optimize import linear_sum_assignment
from sklearn import metrics
from enum import Enum, auto

In [2]:
# Major.minor part of the experiment version; the main loop appends a patch suffix to it
# and uses the result as a directory component of the results path.
MAJOR_MINOR_VERSION = '2.0'

In [3]:
def evaluate_clustering(X, labels_true, labels_pred, clus_algo_name, dataset_name, results_path, algorithm_details,
                        training_time, prediction_time, drift_type, drift_level):
    """
    Evaluates the clustering performance using various metrics and appends the results to a CSV file.

    The internal validity indices (Calinski-Harabasz, Davies-Bouldin, Silhouette) are only defined
    when the number of predicted clusters is at least 2 and smaller than the number of samples;
    outside that range they are logged as NaN instead of raising.

    :param X: Feature set the predictions were made on.
    :param labels_true: Ground truth labels.
    :param labels_pred: Predicted cluster labels.
    :param clus_algo_name: Name of the clustering algorithm.
    :param dataset_name: Name of the dataset.
    :param results_path: Path to the results CSV file (one row is appended per call).
    :param algorithm_details: Description of the model configuration, stored verbatim.
    :param training_time: Training duration, stored verbatim.
    :param prediction_time: Prediction duration, stored verbatim.
    :param drift_type: Name of the drift applied to the data, stored verbatim.
    :param drift_level: Level of the drift applied to the data, stored verbatim.
    """
    n_samples = len(labels_pred)
    n_clusters = len(np.unique(labels_pred))

    # Default values for the scores that cannot be computed for degenerate clusterings
    calinski_harabasz = np.nan
    davies_bouldin = np.nan
    silhouette = np.nan

    # scikit-learn rejects both a single cluster and "one cluster per sample" for these indices
    if 1 < n_clusters < n_samples:
        calinski_harabasz = metrics.calinski_harabasz_score(X, labels_pred)
        davies_bouldin = metrics.davies_bouldin_score(X, labels_pred)
        silhouette = metrics.silhouette_score(X, labels_pred)

    # Computed once; it is reported under two column names (see below)
    completeness = metrics.completeness_score(labels_true, labels_pred)

    results = {
        'Timestamp': datetime.now(),
        'Dataset': dataset_name,
        'Clustering Algorithm': clus_algo_name,
        'Algorithm Details': algorithm_details,
        'Drift Type': drift_type,
        'Drift Level': drift_level,
        'Training Time': training_time,
        'Prediction Time': prediction_time,
        'AMI': metrics.adjusted_mutual_info_score(labels_true, labels_pred),
        'ARI': metrics.adjusted_rand_score(labels_true, labels_pred),
        'Calinski-Harabasz Score': calinski_harabasz,
        'Davies-Bouldin Score': davies_bouldin,
        # 'Completeness Score' and 'Completeness' hold the same value; both columns are kept
        # so that rows stay aligned with CSV files written by earlier runs.
        'Completeness Score': completeness,
        'Fowlkes-Mallows Score': metrics.fowlkes_mallows_score(labels_true, labels_pred),
        'Homogeneity': metrics.homogeneity_score(labels_true, labels_pred),
        'Completeness': completeness,
        'V-Measure': metrics.v_measure_score(labels_true, labels_pred),
        'Mutual Information': metrics.mutual_info_score(labels_true, labels_pred),
        'Normalized Mutual Information': metrics.normalized_mutual_info_score(labels_true, labels_pred),
        'Silhouette Score': silhouette,
        'Accuracy': accuracy_score(labels_true, labels_pred)
    }

    # Append to the CSV; write the header when the file is new or exists but is still empty
    write_header = not os.path.exists(results_path) or os.path.getsize(results_path) == 0
    df = pd.DataFrame([results])
    df.to_csv(results_path, mode='a', header=write_header, index=False)


def map_clusters_to_ground_truth(labels_true, labels_pred):
    """
    Maps clustering algorithm output to ground truth labels using the Hungarian algorithm.

    A contingency table is built over the sorted union of all labels occurring in either input,
    and the assignment maximizing the total overlap is used to rename every predicted cluster.
    Labels may be arbitrary (negative, non-contiguous) values: table indices are translated back
    to the actual label values before renaming.

    :param labels_true: Ground truth labels.
    :param labels_pred: Predicted cluster labels.
    :return: Remapped predicted labels as a numpy array with the dtype of ``labels_pred``.
    :raises ValueError: If the two label sequences differ in length.
    """
    true_arr = np.asarray(labels_true)
    pred_arr = np.asarray(labels_pred)
    if len(true_arr) != len(pred_arr):
        raise ValueError(
            f"labels_true and labels_pred must have the same length, "
            f"got {len(true_arr)} and {len(pred_arr)}")

    # Sorted union of labels: position i in the table corresponds to label classes[i]
    classes = np.unique(np.concatenate([true_arr, pred_arr]))
    true_idx = np.searchsorted(classes, true_arr)
    pred_idx = np.searchsorted(classes, pred_arr)

    # Contingency table: rows are ground-truth labels, columns are predicted labels
    n_classes = classes.size
    cm = np.zeros((n_classes, n_classes), dtype=np.int64)
    np.add.at(cm, (true_idx, pred_idx), 1)

    # Apply the Hungarian algorithm to the negated table for maximum matching
    row_ind, col_ind = linear_sum_assignment(-cm)

    # The table is square, so every column (predicted label) receives exactly one row (true label)
    col_to_row = np.empty(n_classes, dtype=np.intp)
    col_to_row[col_ind] = row_ind

    remapped_labels_pred = classes[col_to_row[pred_idx]]
    return remapped_labels_pred.astype(pred_arr.dtype, copy=False)


def load_labels_from_file(file_path, labels_pred_len=None):
    """
    Loads clustering labels from a text file, ignoring the header and metadata.

    Everything up to and including the first separator line (a line made only of dashes) is
    skipped; every following non-blank line is parsed as one integer label.

    :param file_path: Path to the file containing the labels.
    :param labels_pred_len: Currently unused; accepted only for compatibility with existing callers.
    :return: List of labels as integers.
    :raises ValueError: If no separator line is found or a label line is not an integer.
    """
    with open(file_path, 'r') as file:
        lines = [line.strip() for line in file]

    # Locate the separator without depending on its exact length or on a trailing newline
    separator_index = next(
        (i for i, line in enumerate(lines) if len(line) >= 3 and set(line) == {'-'}),
        None)
    if separator_index is None:
        raise ValueError(f"No '-----' separator line found in {file_path}")

    # Blank lines (e.g. at the end of the file) carry no label and are skipped
    return [int(line) for line in lines[separator_index + 1:] if line]


def preprocess_data(df):
    """
    Function to preprocess data by min-max normalizing every column.

    Each column is shifted by its minimum and divided by its range. A constant column has a
    range of zero; it is mapped to 0.0 instead of being turned into NaN by a 0/0 division.
    The input frame is not modified.

    :param df: pandas DataFrame with raw data.
    :return: pandas DataFrame with processed (normalized) data.
    """
    processed_df = df.copy()

    for column in processed_df.columns:
        min_value = processed_df[column].min()
        value_range = processed_df[column].max() - min_value
        shifted = processed_df[column] - min_value
        if value_range == 0:
            # Constant column: every value equals the minimum, so the normalized value is 0
            processed_df[column] = shifted.astype(float)
        else:
            processed_df[column] = shifted / value_range

    return processed_df


def create_dirs_and_get_results_path(base_path: Union[str, PathLike],
                                     algorithm_name: str,
                                     version: str,
                                     dataset_dir: str,
                                     dataset_name: str,
                                     filename: str) -> str:
    """
    Creates the results directory for one dataset and returns the path of a file inside it.

    The directory is ``base_path/algorithm_name/version/dataset_dir/dataset_name``; it is
    created together with any missing parents. The file itself is not created.

    :return: Path of ``filename`` inside that directory.
    """
    path_components = (algorithm_name, version, dataset_dir, dataset_name)
    target_dir = os.path.join(base_path, *path_components)
    os.makedirs(target_dir, exist_ok=True)
    results_file = os.path.join(target_dir, filename)
    return results_file

In [4]:
def train_and_time(clustering_model, train_data):
    """
    Train the model and measure training time.

    :param clustering_model: Object exposing ``fit(data)``; it is fitted in place.
    :param train_data: Data passed to ``fit``.
    :return: Tuple of (the fitted model, elapsed seconds).
    """
    # perf_counter is monotonic and high-resolution, unlike the adjustable wall clock of time.time
    start_time = time.perf_counter()
    clustering_model.fit(train_data)
    elapsed = time.perf_counter() - start_time
    return clustering_model, elapsed


def predict_and_time(clustering_model, data):
    """
    Predict using the model and measure prediction time.

    Models that expose ``predict`` are asked to label ``data`` with their fitted state. Models
    without it (e.g. DBSCAN, OPTICS, AgglomerativeClustering) cannot label unseen data, so they
    are re-fitted on ``data`` via ``fit_predict``.

    :param clustering_model: Clustering model exposing ``predict`` or ``fit_predict``.
    :param data: Data to label.
    :return: Tuple of (predicted labels, elapsed seconds).
    """
    # Decide from the model itself rather than from a module-level algorithm name
    if hasattr(clustering_model, 'predict'):
        label_fn = clustering_model.predict
    else:
        label_fn = clustering_model.fit_predict

    start_time = time.perf_counter()
    labels_pred = label_fn(data)
    elapsed = time.perf_counter() - start_time
    return labels_pred, elapsed


def evaluate_and_log(clustering_model, X_train, X_validate, labels_true, results_path, drift_type, drift_level):
    """
    Fit the model on X_train, label X_validate, and append the evaluation row to results_path.

    The model parameters are captured as a string before fitting, predicted labels are aligned
    to the ground truth, and the algorithm and dataset names are taken from the module-level
    ALGORITHM_NAME and DATASET_FILE_NAME.
    """
    params_description = str(clustering_model.get_params())

    fit_seconds = train_and_time(clustering_model, X_train)[1]
    raw_labels, predict_seconds = predict_and_time(clustering_model, X_validate)
    aligned_labels = map_clusters_to_ground_truth(labels_true, raw_labels)

    evaluate_clustering(
        X=X_validate,
        labels_true=labels_true,
        labels_pred=aligned_labels,
        clus_algo_name=ALGORITHM_NAME,
        dataset_name=DATASET_FILE_NAME,
        results_path=results_path,
        algorithm_details=params_description,
        training_time=fit_seconds,
        prediction_time=predict_seconds,
        drift_type=drift_type,
        drift_level=drift_level,
    )

In [ ]:
def find_valid_target_class(test_labels, initial_target_class):
    """
    Finds a valid target class that exists in the test labels.

    Starting from ``initial_target_class``, the candidate is incremented by 1 until it matches
    one of the labels. The search stops with an error once the candidate exceeds the largest
    label, instead of looping forever.

    :param test_labels: Array of test labels.
    :param initial_target_class: The initial target class to start searching from.
    :return: A valid target class that exists in the test labels.
    :raises ValueError: If ``test_labels`` is empty or no label is reachable from the start value.
    """
    unique_classes = np.unique(test_labels)
    if unique_classes.size == 0:
        raise ValueError("test_labels is empty; there is no class to select")

    max_class = unique_classes.max()
    target_class = initial_target_class
    while target_class not in unique_classes:
        if target_class > max_class:
            raise ValueError(
                f"No class reachable from {initial_target_class} in steps of 1 "
                f"exists in the test labels")
        target_class += 1  # Increment target class
    return target_class


In [5]:
# Silence every warning for the remainder of the session.
warnings.filterwarnings("ignore")
# Root directory under which the data/ and results/ trees are resolved.
current_directory = os.path.join('/opt', 'home', 's3934056')
# Dataset families to process; each name is a sub-directory of the raw, processed and label
# data directories. Commented-out entries are skipped.
DATASET_DIRS = [
    'A-sets', 
    'Birch-sets', 
    'DIM-sets-high', 
    # 'G2-sets', 
    # 'S-sets',
    # 'Unbalance'
]
# Names of the clustering algorithms the main loop iterates over.
algorithms = ['KMeans']

# Base directory of all metric CSV files.
metrics_file_path = os.path.join(current_directory, 'results', 'metrics')

In [ ]:
class DriftTypes(Enum):
    """Kinds of data drift; each value is the display name written to the 'Drift Type' column."""
    KNOCKOUT = 'Knock-out'
    GAUSSIAN = 'Gaussian Shift'
    CONCEPT = 'Concept Drift'
    COVARIANT = 'Covariant Drift'

class DriftLevels(Enum):
    """Drift intensities; the main loop uses each value as the standard deviation of the Gaussian noise."""
    MILD = 0.01
    MODERATE = 0.05
    SEVERE = 0.1

# Drift type label recorded with every result row of this run.
drift_type = DriftTypes.GAUSSIAN.value

In [7]:
# Start timing the entire process
total_start_time = time.time()

# Experiment driver: for every algorithm and dataset family, normalize the raw files once,
# rebuild the model with the best logged hyperparameters, and evaluate it on test data
# perturbed with Gaussian noise at each drift level.
for algorithm_name in algorithms:
    ALGORITHM_NAME = algorithm_name
    VERSION = MAJOR_MINOR_VERSION + '.5'
    for DATASET_DIR in DATASET_DIRS:
        # Specify the raw and processed data directory paths
        raw_directory_path = os.path.join(current_directory, 'data', 'raw', DATASET_DIR)
        processed_directory_path = os.path.join(current_directory, 'data', 'processed', DATASET_DIR)

        os.makedirs(processed_directory_path, exist_ok=True)  # This creates the directory if it does not exist

        # Get a list of all files in the directory (excluding directories)
        files = [f for f in os.listdir(raw_directory_path) if os.path.isfile(os.path.join(raw_directory_path, f))]
        total_files = len(files)

        # Preprocessing
        # Run preprocessing step for all files in the specified directory
        for index, filename in enumerate(files, start=1):
            FILE_NAME = filename.split('.')[0]
            raw_file_path = os.path.join(raw_directory_path, f'{FILE_NAME}.txt')
            processed_file_path = os.path.join(processed_directory_path, f'{FILE_NAME}.txt')

            # Only normalize files that have not been processed by an earlier run
            if not os.path.isfile(processed_file_path):
                # The regular expression r'\s+' matches one or more whitespace characters
                # (raw string: '\s' is not a valid escape sequence in a normal string literal).
                # NOTE(review): names=['X', 'Y'] assumes two-column files -- confirm for 'DIM-sets-high'.
                data = pd.read_csv(raw_file_path, sep=r"\s+", header=None, names=['X', 'Y'])
                processed_data = preprocess_data(data)

                # Save the processed data to a CSV file
                processed_data.to_csv(processed_file_path, index=False)

        # Run clustering algorithm for all files in the specified directory
        for index, filename in enumerate(files, start=1):
            if os.path.isfile(os.path.join(raw_directory_path, filename)):
                DATASET_FILE_NAME = filename.split('.')[0]
                LABELS_FILE_NAME = f'{DATASET_FILE_NAME}-gt.pa'

                # Read labels
                labels_true = load_labels_from_file(
                    os.path.join(current_directory, 'data', 'label', DATASET_DIR, LABELS_FILE_NAME), 15)

                # Get the number of clusters from the ground truth
                N_CLUSTERS = len(set(labels_true))

                # Read processed data
                processed_file_path = os.path.join(current_directory, 'data', 'processed', DATASET_DIR,
                                                   f'{DATASET_FILE_NAME}.txt')
                processed_data = pd.read_csv(processed_file_path)

                # Split the data into train and temp (temp will contain both validate and test)
                train_data, temp_data, train_labels, temp_labels = train_test_split(
                    processed_data, labels_true, train_size=0.5, random_state=42)

                # Split the temp data into validate and test
                validate_data, test_data, validate_labels, test_labels = train_test_split(
                    temp_data, temp_labels, train_size=0.5, random_state=42)

                # For Hyperparameter Tuning Logs
                tuning_results_filename = 'hyperparameter_tuning_logs.csv'
                tuning_results_path = create_dirs_and_get_results_path(metrics_file_path, ALGORITHM_NAME, VERSION,
                                                                       DATASET_DIR, DATASET_FILE_NAME,
                                                                       tuning_results_filename)

                # For Final Results
                final_results_filename = 'drift_results_v4.csv'
                final_results_path = create_dirs_and_get_results_path(metrics_file_path, ALGORITHM_NAME, VERSION,
                                                                      DATASET_DIR, DATASET_FILE_NAME,
                                                                      final_results_filename)

                # Read hyperparameter tuning logs (must already exist from an earlier tuning run)
                csv_content = pd.read_csv(tuning_results_path)

                # Find the record with the highest accuracy
                max_accuracy_record = csv_content.loc[csv_content['Accuracy'].idxmax()]

                # The final model is trained on train + validate; only the test split is drifted
                combined_train_data = pd.concat([train_data, validate_data])
                combined_train_labels = np.concatenate([train_labels, validate_labels])

                # Initialize KMeans with the best hyperparameters
                best_params = max_accuracy_record['Algorithm Details']  # String form of the parameter dict

                # SECURITY NOTE(review): eval() executes whatever text is stored in the tuning CSV.
                # If that file can ever come from an untrusted source, switch to ast.literal_eval.
                model_final = KMeans(**eval(best_params))

                # Apply data shift on the test data

                # Gaussian drift start
                num_splits = 5  # Number of random splits
                portion_to_modify = 0.2  # Portion of the dataset to apply drift to

                for level in DriftLevels:
                    metric_sums = {
                        'Accuracy': 0,
                        'Silhouette Score': 0
                        # Add other metrics here
                    }

                    min_accuracy = float('inf')  # Initialize to the highest possible value

                    # To quantify the robustness of the findings, performance under drift is
                    # averaged over a total of num_splits random selections of drifted samples
                    for split in range(num_splits):
                        std_dev = level.value  # Standard deviation of the noise: 0.01, 0.05 or 0.1

                        # Seed per split so every run perturbs the same samples (reproducibility)
                        np.random.seed(42 + split)

                        # Calculate the number of samples to select based on the specified portion
                        num_samples_to_modify = int(len(test_data) * portion_to_modify)

                        # Randomly select indices to modify
                        indices_to_modify = np.random.choice(test_data.index, size=num_samples_to_modify, replace=False)

                        # Make a copy of the test_data to apply drift
                        test_data_modified = test_data.copy()

                        # Add zero-mean Gaussian noise to the selected rows of both the 'X' and 'Y' columns
                        test_data_modified.loc[indices_to_modify, 'X'] += np.random.normal(0, std_dev, size=num_samples_to_modify)
                        test_data_modified.loc[indices_to_modify, 'Y'] += np.random.normal(0, std_dev, size=num_samples_to_modify)

                        # Evaluate and log with the modified test data
                        evaluate_and_log(model_final, combined_train_data, test_data_modified, test_labels, final_results_path, drift_type=drift_type, drift_level=level.name)

                        # Read back the final results file; evaluate_and_log has just appended to it
                        csv_content = pd.read_csv(final_results_path)

                        # The last row holds the metrics of the evaluation that just ran
                        latest_record = csv_content.iloc[-1]

                        metric_sums['Accuracy'] += latest_record['Accuracy']
                        min_accuracy = min(min_accuracy, latest_record['Accuracy'])
                        silhouette_score = latest_record['Silhouette Score']
                        metric_sums['Silhouette Score'] += silhouette_score

                    average_metrics = {metric: sum_value / num_splits for metric, sum_value in metric_sums.items()}
                    # Display the average metrics and the minimum accuracy
                    print("Drift type: ", drift_type, "Drift level: ", level)
                    for metric, avg_value in average_metrics.items():
                        print(f"{metric}: {avg_value}")
                    print(f"Minimum Accuracy: {min_accuracy}")
                # Gaussian drift end




FileNotFoundError: [WinError 3] The system cannot find the path specified: '/opt\\home\\s3934056\\data\\raw\\S-sets'

In [ ]:
# End timing the entire process
total_end_time = time.time()

# Calculate total elapsed time (seconds since total_start_time was taken before the main loop)
total_elapsed_time = total_end_time - total_start_time

# Display the total time taken
print(f"The total process took {total_elapsed_time} seconds.")