# Data Preprocessing

## Load Data

In [36]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap.umap_ as umap
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

In [37]:
import os

# Define the HuggingFace repository and local path
ct_filename = "ct.nii.gz"
pancreas_segmentations_filename = "segmentations/pancreas.nii.gz"

data_folder = "./data"
encoded_images_dir = os.path.join(data_folder, "encoded_imgs")

os.path.join(encoded_images_dir, 'enc_healthy_cubes.pkl')
import pickle

with open(os.path.join(encoded_images_dir, 'enc_healthy_cubes.pkl'), 'rb') as f:
    enc_healthy_cubes = pickle.load(f)

with open(os.path.join(encoded_images_dir, 'enc_pancreatic_tumor_cubes.pkl'), 'rb') as f:
    enc_pancreatic_tumor_cubes = pickle.load(f)

In [38]:
len(enc_healthy_cubes.values())

42

## Unfortunately some of the CTs have less than 10 samples bcs. they ended up being out of range

- [ ] TODO Reduce to min. no. samples of all CTs

In [39]:
import torch

def process_and_stack_tensors(enc_pancreatic_tumor_cubes):
    """
    Process the tensors in the dictionary:
    1. Stack the tensors, ignoring empty lists.
    2. Take tensors with at least 5 entries along the first dimension.
    3. Threshold them to 5 entries, dropping those with fewer than 5 entries.
    4. Stack all the processed tensors along a new axis.

    Parameters:
    enc_pancreatic_tumor_cubes (dict): The input dictionary with lists of tensors.

    Returns:
    torch.Tensor: A tensor with all the processed tensors stacked along a new axis.
    """
    # Stack the tensors, ignoring empty lists
    stacked_inner_lists = [torch.stack(inner_list).squeeze(1) for inner_list in enc_pancreatic_tumor_cubes.values() if inner_list]

    # Filter and reduce the tensors to have exactly 5 entries along the first dimension
    filtered_and_reduced_tensors = [tensor[:5] for tensor in stacked_inner_lists if tensor.shape[0] >= 5]

    # Stack all the processed tensors along a new axis
    if filtered_and_reduced_tensors:
        final_tensor = torch.stack(filtered_and_reduced_tensors)
    else:
        final_tensor = torch.tensor([])  # Return an empty tensor if no tensors meet the criteria

    return final_tensor

In [40]:
enc_pancreatic_tensors = process_and_stack_tensors(enc_pancreatic_tumor_cubes)
enc_healthy_tensors = process_and_stack_tensors(enc_healthy_cubes)

print(enc_pancreatic_tensors.shape)
print(enc_healthy_tensors.shape)

torch.Size([13, 5, 8, 24, 24, 24])
torch.Size([42, 5, 8, 24, 24, 24])


In [41]:
final_tensor = torch.cat([enc_healthy_tensors, enc_pancreatic_tensors], dim=0)
final_tensor.shape

torch.Size([55, 5, 8, 24, 24, 24])

In [42]:
healthy_labels = torch.zeros(enc_healthy_tensors.shape[0])
pancreatic_labels = torch.ones(enc_pancreatic_tensors.shape[0])

final_labels = torch.cat([healthy_labels, pancreatic_labels], dim=0)
final_labels.shape

torch.Size([55])

In [43]:
features = final_tensor
labels = final_labels

## Apply scaling and Flattening

In [45]:
def flatten_and_scale_features(features):
    """
    Flatten and standardize the features.

    Parameters:
        features (numpy.ndarray): Original features of shape (n_samples, ...).

    Returns:
        numpy.ndarray: Flattened and standardized features of shape (n_samples, n_features_flat).
    """
    n_samples = features.shape[0]
    flattened_features = features.reshape(n_samples, -1)
    scaler = StandardScaler()
    flattened_features_std = scaler.fit_transform(flattened_features)
    return flattened_features_std

In [46]:
# Assuming 'features' is your data of shape (n_samples, 8, 24, 24, 24)
# and 'labels' contains the labels (0 for healthy, 1 for unhealthy)
# Load or generate your 'features' and 'labels' here

# Flatten and scale features
flattened_features_std = flatten_and_scale_features(features)

# Dimensionality Reduction and Clustering

## Function Definitions

### Dimensionality Reduction

In [55]:
def apply_pca(features, n_components):
    """
    Apply PCA to reduce dimensionality.

    Parameters:
        features (numpy.ndarray): Input features.
        n_components (int): Number of components to retain.

    Returns:
        numpy.ndarray: PCA-transformed features.
        float: Total explained variance ratio.
    """
    pca = PCA(n_components=n_components, random_state=42)
    features_pca = pca.fit_transform(features)
    explained_variance = np.sum(pca.explained_variance_ratio_)
    return features_pca, explained_variance

def apply_tsne(features, n_components, perplexity=30):
    """
    Apply t-SNE to reduce dimensionality.

    Parameters:
        features (numpy.ndarray): Input features.
        n_components (int): Target number of dimensions.
        perplexity (float): Perplexity parameter for t-SNE.

    Returns:
        numpy.ndarray: t-SNE-transformed features.
    """
    tsne = TSNE(n_components=n_components, perplexity=perplexity, random_state=42)
    features_tsne = tsne.fit_transform(features)
    return features_tsne

def apply_umap(features, n_components, n_neighbors=15, min_dist=0.1):
    """
    Apply UMAP to reduce dimensionality.

    Parameters:
        features (numpy.ndarray): Input features.
        n_components (int): Target number of dimensions.
        n_neighbors (int): Number of neighbors for UMAP.
        min_dist (float): Minimum distance parameter for UMAP.

    Returns:
        numpy.ndarray: UMAP-transformed features.
    """
    umap_reducer = umap.UMAP(
        n_components=n_components,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        random_state=42
    )
    features_umap = umap_reducer.fit_transform(features)
    return features_umap

### Clustering

In [57]:
def cluster_and_evaluate(features, method, n_clusters, **kwargs):
    """
    Perform clustering and evaluate performance.

    Parameters:
        features (numpy.ndarray): Input features for clustering.
        method (str): Clustering method ('kmeans', 'agglomerative', 'gmm').
        n_clusters (int): Number of clusters.
        **kwargs: Additional keyword arguments for clustering algorithms.

    Returns:
        dict: A dictionary containing evaluation metrics.
    """
    if method == 'kmeans':
        clustering = KMeans(n_clusters=n_clusters, random_state=42, **kwargs)
        clusters = clustering.fit_predict(features)
    elif method == 'agglomerative':
        clustering = AgglomerativeClustering(n_clusters=n_clusters, **kwargs)
        clusters = clustering.fit_predict(features)
    elif method == 'gmm':
        clustering = GaussianMixture(n_components=n_clusters, random_state=42, **kwargs)
        clusters = clustering.fit_predict(features)
    else:
        raise ValueError(f"Unsupported clustering method: {method}")

    # Evaluate clustering performance
    if len(set(clusters)) > 1 and len(set(clusters)) < len(features):
        silhouette_avg = silhouette_score(features, clusters)
        calinski_harabasz = calinski_harabasz_score(features, clusters)
        davies_bouldin = davies_bouldin_score(features, clusters)
    else:
        silhouette_avg = calinski_harabasz = davies_bouldin = np.nan

    evaluation = {
        'Silhouette Score': silhouette_avg,
        'Calinski-Harabasz Score': calinski_harabasz,
        'Davies-Bouldin Score': davies_bouldin
    }
    return evaluation


def evaluate_clustering(features, labels):
    """
    Evaluate clustering performance across different dimensionality reduction and clustering methods.

    Parameters:
        features (numpy.ndarray): Original high-dimensional features.
        labels (numpy.ndarray): True labels (if available, not used in clustering).

    Returns:
        dict: Nested dictionaries containing DataFrames of evaluation metrics.
    """
    # Define parameter ranges
    n_components_list = list(range(2, 56, 5))  # From 2 to 55 in steps of 5
    n_clusters_list = list(range(2, 8))        # From 2 to 7 in steps of 1

    dim_reduction_methods = ['PCA', 't-SNE', 'UMAP']
    clustering_methods = ['kmeans', 'agglomerative', 'gmm']

    # Initialize results dictionary
    results = {}

    for dim_name in dim_reduction_methods:
        print(f"\nDimensionality Reduction Method: {dim_name}")

        # Initialize a nested dictionary to store results
        results[dim_name] = {}

        for n_components in n_components_list:
            if dim_name == 't-SNE' and n_components > 3:
                continue

            print(f"  n_components = {n_components}")

            # Apply dimensionality reduction
            if dim_name == 'PCA':
                reduced_features, explained_variance = apply_pca(features, n_components)
                print(f"    Explained Variance Ratio: {explained_variance:.4f}")
            elif dim_name == 't-SNE':
                # Adjust perplexity if necessary based on n_components and sample size
                reduced_features = apply_tsne(features, n_components=n_components)
            elif dim_name == 'UMAP':
                reduced_features = apply_umap(features, n_components=n_components)

            # Initialize DataFrame for this n_components
            key = f"n_components_{n_components}"
            results[dim_name][key] = pd.DataFrame(
                index=n_clusters_list,
                columns=clustering_methods
            )

            for n_clusters in n_clusters_list:
                for method in clustering_methods:
                    # Perform clustering and evaluation
                    evaluation = cluster_and_evaluate(
                        reduced_features,
                        method=method,
                        n_clusters=n_clusters
                    )
                    # Store Silhouette Score in the DataFrame
                    results[dim_name][key].loc[n_clusters, method] = evaluation['Silhouette Score']

        print("  Evaluation complete.")

    return results

## Evaluation 🧪

In [58]:
# Run the evaluation
results = evaluate_clustering(flattened_features_std, labels)


Dimensionality Reduction Method: PCA
  n_components = 2
    Explained Variance Ratio: 0.0840
  n_components = 7
    Explained Variance Ratio: 0.2058
  n_components = 12
    Explained Variance Ratio: 0.3135
  n_components = 17
    Explained Variance Ratio: 0.4137
  n_components = 22
    Explained Variance Ratio: 0.5102
  n_components = 27
    Explained Variance Ratio: 0.6004
  n_components = 32
    Explained Variance Ratio: 0.6866
  n_components = 37
    Explained Variance Ratio: 0.7697
  n_components = 42
    Explained Variance Ratio: 0.8466
  n_components = 47
    Explained Variance Ratio: 0.9179
  n_components = 52
    Explained Variance Ratio: 0.9793
  Evaluation complete.

Dimensionality Reduction Method: t-SNE
  n_components = 2
  Evaluation complete.

Dimensionality Reduction Method: UMAP
  n_components = 2
  n_components = 7
  n_components = 12
  n_components = 17
  n_components = 22
  n_components = 27
  n_components = 32
  n_components = 37
  n_components = 42
  n_components 

In [60]:
# Display the results for each dimensionality reduction method and n_components
for dim_name, components_dict in results.items():
    print(f"\nResults for {dim_name}:\n")
    for n_components_key, df in components_dict.items():
        print(f"n_components = {n_components_key.split('_')[-1]}")
        print(df)
        print("\n")


Results for PCA:

n_components = 2
     kmeans agglomerative       gmm
2  0.415781      0.403015  0.412469
3  0.471768      0.435367  0.471768
4   0.42468      0.325491  0.207349
5  0.372024      0.324841  0.187038
6  0.369312      0.364407  0.176826
7  0.378918      0.365611  0.268334


n_components = 7
     kmeans agglomerative       gmm
2  0.197427      0.178406  0.247093
3  0.196978      0.186267  0.243406
4  0.207885      0.210924  0.215972
5  0.235804       0.23825  0.091066
6  0.222924      0.260684  0.091944
7  0.245807      0.264427  0.100665


n_components = 12
     kmeans agglomerative       gmm
2  0.128265      0.125084   0.18264
3   0.13986      0.129365 -0.040489
4  0.152924      0.130982 -0.005983
5  0.138891      0.133962  0.000402
6  0.143016      0.156038  0.006684
7  0.147451      0.154797  0.009358


n_components = 17
     kmeans agglomerative       gmm
2  0.101461      0.135077  0.112626
3    0.1046      0.135731  0.115402
4  0.136785      0.136128  0.083596
5  0.

### Results 📊

PCA Optimal Number of Clusters:
	•	For n_components = 2, n_clusters = 3 yields the highest Silhouette Score (0.4718) with K-Means, suggesting that three clusters may better represent the underlying data structure in this reduced space.


t-SNE Optimal Number of Clusters:

•	K-Means with n_clusters = 3 gives a Silhouette Score of 0.3532, indicating that three clusters might be a good choice in t-SNE reduced space.

UMAP:
Optimal Number of Clusters:
	•	With n_components = 2, K-Means with n_clusters = 3 gives a Silhouette Score of 0.3930.

# Supervised Clustering

In [73]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelBinarizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap.umap_ as umap
from sklearn.neighbors import NeighborhoodComponentsAnalysis
from sklearn.cross_decomposition import PLSRegression
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# Assuming 'features' and 'labels' are your data and labels
# Flatten and scale features if not already done
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

ValueError: Found array with dim 6. StandardScaler expected <= 2.

In [74]:
def apply_nca(features, labels, n_components):
    """
    Apply Neighborhood Components Analysis (NCA).

    Parameters:
        features (numpy.ndarray): Input features.
        labels (numpy.ndarray): Class labels.
        n_components (int): Number of dimensions to reduce to.

    Returns:
        numpy.ndarray: Transformed features.
    """
    nca = NeighborhoodComponentsAnalysis(n_components=n_components, random_state=42)
    nca.fit(features, labels)
    return nca.transform(features)


def apply_pls(features, labels, n_components):
    """
    Apply Partial Least Squares (PLS) regression.

    Parameters:
        features (numpy.ndarray): Input features.
        labels (numpy.ndarray): Class labels.
        n_components (int): Number of components to keep.

    Returns:
        numpy.ndarray: Transformed features.
    """
    lb = LabelBinarizer()
    labels_binarized = lb.fit_transform(labels)
    pls = PLSRegression(n_components=n_components)
    pls.fit(features, labels_binarized)
    return pls.transform(features)


def apply_supervised_umap(features, labels, n_components, n_neighbors=15, min_dist=0.1):
    """
    Apply Supervised UMAP.

    Parameters:
        features (numpy.ndarray): Input features.
        labels (numpy.ndarray): Class labels.
        n_components (int): Number of dimensions to reduce to.

    Returns:
        numpy.ndarray: Transformed features.
    """
    reducer = umap.UMAP(
        n_components=n_components,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        random_state=42
    )
    return reducer.fit_transform(features, y=labels)

In [69]:
def evaluate_supervised_methods(features, labels):
    """
    Evaluate clustering with supervised dimensionality reduction methods.

    Parameters:
        features (numpy.ndarray): Scaled features.
        labels (numpy.ndarray): Class labels.

    Returns:
        dict: Results for each method.
    """
    n_components_list = list(range(2, min(features.shape[1], 56), 5))
    n_clusters_list = list(range(2, 8))
    dim_reduction_methods = ['NCA', 'PLS', 'Supervised UMAP']
    clustering_methods = ['kmeans', 'agglomerative', 'gmm']
    
    results = {}
    
    for method in dim_reduction_methods:
        print(f"\nDimensionality Reduction Method: {method}")
        results[method] = {}
        for n_components in n_components_list:
            print(f"  n_components = {n_components}")
            if method == 'NCA':
                try:
                    transformed_features = apply_nca(features, labels, n_components)
                except Exception as e:
                    print(f"    NCA failed at n_components={n_components}: {e}")
                    continue
            elif method == 'PLS':
                try:
                    transformed_features = apply_pls(features, labels, n_components)
                except Exception as e:
                    print(f"    PLS failed at n_components={n_components}: {e}")
                    continue
            elif method == 'Supervised UMAP':
                transformed_features = apply_supervised_umap(features, labels, n_components)
            
            results_key = f"n_components_{n_components}"
            results[method][results_key] = pd.DataFrame(
                index=n_clusters_list,
                columns=clustering_methods
            )
            
            for n_clusters in n_clusters_list:
                for cluster_method in clustering_methods:
                    evaluation = cluster_and_evaluate(
                        transformed_features,
                        method=cluster_method,
                        n_clusters=n_clusters
                    )
                    # Store Silhouette Score
                    results[method][results_key].loc[n_clusters, cluster_method] = evaluation['Silhouette Score']
        print("  Evaluation complete.")
    return results

In [75]:
results = evaluate_supervised_methods(flattened_features_std, labels)


Dimensionality Reduction Method: NCA
  n_components = 2
  n_components = 7
  n_components = 12
  n_components = 17
  n_components = 22
  n_components = 27
  n_components = 32
  n_components = 37
  n_components = 42
  n_components = 47
  n_components = 52
  Evaluation complete.

Dimensionality Reduction Method: PLS
  n_components = 2
  n_components = 7
  n_components = 12
  n_components = 17
  n_components = 22
  n_components = 27
  n_components = 32
  n_components = 37
  n_components = 42
  n_components = 47
  n_components = 52
  Evaluation complete.

Dimensionality Reduction Method: Supervised UMAP
  n_components = 2
  n_components = 7
  n_components = 12
  n_components = 17
  n_components = 22
  n_components = 27
  n_components = 32
  n_components = 37
  n_components = 42
  n_components = 47
  n_components = 52
  Evaluation complete.


In [76]:
for method, n_components_dict in results.items():
    print(f"\nResults for {method}:\n")
    for n_components, df in n_components_dict.items():
        print(f"{n_components}")
        print(df)
        print("\n")


Results for NCA:

n_components_2
     kmeans agglomerative       gmm
2  0.415781      0.403015  0.412469
3  0.471768      0.435367  0.471768
4   0.42468      0.325491  0.207349
5  0.372024      0.324841  0.187038
6  0.369312      0.364407  0.176826
7  0.378918      0.365611  0.268334


n_components_7
     kmeans agglomerative       gmm
2  0.197427      0.178406  0.247093
3  0.196978      0.186267  0.243406
4  0.207885      0.210924  0.215972
5  0.235804       0.23825  0.091066
6  0.222924      0.260684  0.091944
7  0.245807      0.264427  0.100665


n_components_12
     kmeans agglomerative       gmm
2  0.128265      0.125084   0.18264
3   0.13986      0.129365 -0.040489
4  0.152924      0.130982 -0.005983
5  0.138891      0.133962  0.000402
6  0.143016      0.156038  0.006684
7  0.147451      0.154797  0.009358


n_components_17
     kmeans agglomerative       gmm
2  0.101461      0.135077  0.112626
3    0.1046      0.135731  0.115402
4  0.136785      0.136128  0.083596
5  0.088546  

Based on the Silhouette Scores, the best clustering result is:

	•	Dimensionality Reduction Method: Supervised UMAP
	•	Number of Components (n_components): 17
	•	Clustering Method: K-Means
	•	Number of Clusters (n_clusters): 2
	•	Silhouette Score: 0.959937


Saving that as a TSV for Projector.tensorflow.org

In [78]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import umap.umap_ as umap
from sklearn.cluster import KMeans

# Assume 'features' and 'labels' are your data and labels
# Flatten and scale features if not already done
scaler = StandardScaler()
features_scaled = scaler.fit_transform(flattened_features_std)

# Apply Supervised UMAP with n_components = 17
n_components = 17
reducer = umap.UMAP(
    n_components=n_components,
    n_neighbors=15,
    min_dist=0.1,
    random_state=42
)
embedding = reducer.fit_transform(features_scaled, y=labels)

# Perform K-Means clustering with n_clusters = 2
n_clusters = 2
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
cluster_assignments = kmeans.fit_predict(embedding)

# Prepare data for TSV files
# Create a DataFrame for the embedding
embedding_df = pd.DataFrame(embedding)

# Add cluster assignments and true labels
embedding_df['Cluster'] = cluster_assignments
embedding_df['Label'] = labels  # Ensure 'labels' is an array-like object

# Save the embedding vectors (without Cluster and Label columns)
embedding_df.drop(columns=['Cluster', 'Label']).to_csv('features.tsv', sep='\t', index=False, header=False)

# Save the metadata (Cluster assignments and Labels)
metadata_df = embedding_df[['Cluster', 'Label']]
metadata_df.to_csv('metadata.tsv', sep='\t', index=False)

## Plotting better than Projector.Tensorflow

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

for method, n_components_dict in results.items():
    for n_components_key, df in n_components_dict.items():
        n_components = n_components_key.split('_')[-1]
        plt.figure(figsize=(10, 6))
        sns.heatmap(df.astype(float), annot=True, fmt=".3f", cmap='viridis')
        plt.title(f'{method} with n_components = {n_components}')
        plt.xlabel('Clustering Method')
        plt.ylabel('Number of Clusters')
        plt.show()