# Load some modules and the data

In [1]:
# import numpy as np
# import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Load all the metrics

# One entry per metrics file: (CSV path, "Dataset" label, "Method" label).
# The order of the entries fixes the row order of the concatenated table.
METRIC_SOURCES = [
    # Metabolome, uncorrected
    ("cluster_metabolome_uncorrected/results/metrics_classic.csv",
     "Metabolome", "Classic"),
    ("cluster_metabolome_uncorrected/results/metrics_classic_featsel.csv",
     "Metabolome", "Classic (fet. sel.)"),
    ("cluster_metabolome_uncorrected/results/metrics_deepclust_mpl.csv",
     "Metabolome", "Deep clustering"),
    ("cluster_metabolome_uncorrected/results/metrics_deepclust_mpl_featsel.csv",
     "Metabolome", "Deep clustering (fet. sel.)"),
    ("cluster_metabolome_uncorrected/results/metrics_deepclust_conv.csv",
     "Metabolome", "Deep clustering (convolutional AE)"),
    # Metabolome, corrected
    ("cluster_metabolome_corrected/results/metrics_classic.csv",
     "Metabolome (corrected)", "Classic"),
    ("cluster_metabolome_corrected/results/metrics_deepclust_mpl.csv",
     "Metabolome (corrected)", "Deep clustering"),
    # Exposome, uncorrected
    ("cluster_exposome_uncorrected/results/metrics_classic.csv",
     "Exposome", "Classic"),
    ("cluster_exposome_uncorrected/results/metrics_deepclust.csv",
     "Exposome", "Deep clustering"),
    # Exposome, corrected
    ("cluster_exposome_corrected/results/metrics_classic.csv",
     "Exposome (corrected)", "Classic"),
    ("cluster_exposome_corrected/results/metrics_deepclust.csv",
     "Exposome (corrected)", "Deep clustering"),
]

# Column name in the CSV files -> label used in the summary tables.
# The key order is also the column order of `results`.
COLUMN_LABELS = {
    'Dataset': 'Dataset',
    'Method': 'Method',
    'rl_method': 'FL method',
    'clust_method': 'CL method',
    'n_clusters': 'num clusters',
    'variable': 'variable',
    'Acc': 'Acc.',
    'ARI': 'ARI.',
    'AMI': 'AMI.',
    'Sil': 'Sil.',
}

# Read every file and tag its rows with the dataset / method it came from.
# `assign` returns a new frame, so re-running this cell is idempotent.
# The row labels of each file are kept (no ignore_index), so index values
# repeat across files in the concatenated table.
results_all = pd.concat(
    [
        pd.read_csv(path).assign(Dataset=dataset, Method=method)
        for path, dataset, method in METRIC_SOURCES
    ],
    axis=0,
)

# Keep only the reported columns, in display order, under their display labels.
results = results_all[list(COLUMN_LABELS)].rename(columns=COLUMN_LABELS)

# Summary of the results

## Brief discussion

For each data set (exposome and metabolome),
I evaluated some classic clustering methods (K-means, GMM, agglomerative)
on both the raw data and the PCA-learned features.
I also evaluated some deep learning methods (DEC, VaDE).

The same methods were evaluated for both datasets
after correcting for the suspected batch effect.

Only for the (uncorrected) metabolomics data,
I also tried applying a data augmentation technique before the deep clustering methods,
and modifying both deep clustering models to use convolutional layers (1D and 2D) instead of MLP layers.

Here I show a summary of the results (mean for each combination of methods).
The full results for each method can be found at the end of its corresponding notebook.
Next, I show the top results for each metric.

It is evident that for the external validation metrics
(Accuracy, Adjusted Rand Index, Adjusted Mutual Information),
the best clustering quality is achieved with 6 clusters,
where all the methods almost perfectly matched the cohort classes.
This is the reason I suspected the presence of a batch effect.

For the internal validation metric (Silhouette),
the DEC model consistently achieves the best results,
especially with a small number of clusters.
However, the clusters found do not seem to have a biological interpretation.

## Summary of the metrics

In [3]:
# Mean of every metric for each (Dataset, Method, FL method, CL method) combination.
# The metric columns are selected by name before averaging: the string column
# 'variable' makes a bare groupby(...).mean() raise TypeError on pandas >= 2.0,
# and selecting by name replaces the positional iloc[:, 1:] that was used to
# drop 'num clusters'.
metric_cols = ['Acc.', 'ARI.', 'AMI.', 'Sil.']
results_mean = results.groupby(['Dataset', 'Method', 'FL method', 'CL method'])[metric_cols].mean()
results_mean.style.background_gradient(axis=0, cmap="cividis", text_color_threshold=0.3, subset=metric_cols).format('{:.2f}', subset=metric_cols)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Acc.,ARI.,AMI.,Sil.
Dataset,Method,FL method,CL method,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Exposome,Classic,PCA,Agglo.,0.6,0.17,0.18,0.13
Exposome,Classic,PCA,GMM,0.59,0.16,0.18,0.12
Exposome,Classic,PCA,K-Means,0.59,0.16,0.18,0.13
Exposome,Classic,Raw data,Agglo.,0.6,0.17,0.18,0.1
Exposome,Classic,Raw data,GMM,0.59,0.17,0.19,0.09
Exposome,Classic,Raw data,K-Means,0.59,0.16,0.18,0.1
Exposome,Deep clustering,DEC,K-Means,0.6,0.15,0.16,0.81
Exposome,Deep clustering,VaDE,GMM,0.6,0.14,0.15,0.52
Exposome (corrected),Classic,PCA,Agglo.,0.46,0.0,0.0,0.02
Exposome (corrected),Classic,PCA,GMM,0.46,0.0,0.0,0.02


In [4]:
# Markdown-like table:
# !pip install tabulate
# from tabulate import tabulate
# print(tabulate(results_mean.reset_index().round(2), headers='keys', tablefmt='pipe', showindex=False))

## Top results for Accuracy:

In [5]:
# Ten rows with the highest accuracy; metric columns rendered with two decimals.
(
    results
    .sort_values("Acc.", ascending=False)
    .iloc[:10]
    .style
    .format('{:.2f}', subset=['Acc.', 'ARI.', 'AMI.', 'Sil.'])
)

Unnamed: 0,Dataset,Method,FL method,CL method,num clusters,variable,Acc.,ARI.,AMI.,Sil.
28,Exposome,Classic,Raw data,Agglo.,6,cohort,0.99,0.98,0.97,0.12
8,Exposome,Classic,Raw data,K-Means,6,cohort,0.99,0.98,0.97,0.12
38,Exposome,Classic,PCA,K-Means,6,cohort,0.99,0.98,0.97,0.16
48,Exposome,Classic,PCA,GMM,6,cohort,0.99,0.97,0.96,0.16
58,Exposome,Classic,PCA,Agglo.,6,cohort,0.99,0.97,0.96,0.16
9,Exposome,Deep clustering,DEC,K-Means,6,cohort,0.99,0.97,0.96,0.73
20,Exposome,Deep clustering,VaDE,GMM,6,cohort,0.98,0.95,0.94,0.6
18,Exposome,Classic,Raw data,GMM,6,cohort,0.96,0.93,0.94,0.12
40,Metabolome (corrected),Classic,PCA,GMM,2,asthma,0.89,0.0,0.0,0.03
30,Metabolome,Classic (fet. sel.),PCA,K-Means,2,asthma,0.89,0.0,0.0,0.09


In [6]:
# Markdown-like table:
# print(tabulate(results.sort_values(by="Acc.", ascending=False).head(10).round(2), headers='keys', tablefmt='pipe', showindex=False))

## Top results for Adjusted Rand Index:

In [7]:
# Ten rows with the highest Adjusted Rand Index; metric columns rendered with two decimals.
(
    results
    .sort_values("ARI.", ascending=False)
    .iloc[:10]
    .style
    .format('{:.2f}', subset=['Acc.', 'ARI.', 'AMI.', 'Sil.'])
)

Unnamed: 0,Dataset,Method,FL method,CL method,num clusters,variable,Acc.,ARI.,AMI.,Sil.
28,Exposome,Classic,Raw data,Agglo.,6,cohort,0.99,0.98,0.97,0.12
38,Exposome,Classic,PCA,K-Means,6,cohort,0.99,0.98,0.97,0.16
8,Exposome,Classic,Raw data,K-Means,6,cohort,0.99,0.98,0.97,0.12
48,Exposome,Classic,PCA,GMM,6,cohort,0.99,0.97,0.96,0.16
58,Exposome,Classic,PCA,Agglo.,6,cohort,0.99,0.97,0.96,0.16
9,Exposome,Deep clustering,DEC,K-Means,6,cohort,0.99,0.97,0.96,0.73
20,Exposome,Deep clustering,VaDE,GMM,6,cohort,0.98,0.95,0.94,0.6
18,Exposome,Classic,Raw data,GMM,6,cohort,0.96,0.93,0.94,0.12
19,Exposome,Classic,Raw data,GMM,7,age,0.59,0.48,0.59,0.1
29,Exposome,Classic,Raw data,Agglo.,7,age,0.59,0.48,0.58,0.1


In [8]:
# Markdown-like table:
# print(tabulate(results.sort_values(by="ARI.", ascending=False).head(10).round(2), headers='keys', tablefmt='pipe', showindex=False))

## Top results for Adjusted Mutual Information:

In [9]:
# Ten rows with the highest Adjusted Mutual Information; metric columns rendered with two decimals.
(
    results
    .sort_values("AMI.", ascending=False)
    .iloc[:10]
    .style
    .format('{:.2f}', subset=['Acc.', 'ARI.', 'AMI.', 'Sil.'])
)

Unnamed: 0,Dataset,Method,FL method,CL method,num clusters,variable,Acc.,ARI.,AMI.,Sil.
28,Exposome,Classic,Raw data,Agglo.,6,cohort,0.99,0.98,0.97,0.12
38,Exposome,Classic,PCA,K-Means,6,cohort,0.99,0.98,0.97,0.16
8,Exposome,Classic,Raw data,K-Means,6,cohort,0.99,0.98,0.97,0.12
48,Exposome,Classic,PCA,GMM,6,cohort,0.99,0.97,0.96,0.16
9,Exposome,Deep clustering,DEC,K-Means,6,cohort,0.99,0.97,0.96,0.73
58,Exposome,Classic,PCA,Agglo.,6,cohort,0.99,0.97,0.96,0.16
20,Exposome,Deep clustering,VaDE,GMM,6,cohort,0.98,0.95,0.94,0.6
18,Exposome,Classic,Raw data,GMM,6,cohort,0.96,0.93,0.94,0.12
19,Exposome,Classic,Raw data,GMM,7,age,0.59,0.48,0.59,0.1
29,Exposome,Classic,Raw data,Agglo.,7,age,0.59,0.48,0.58,0.1


In [10]:
# Markdown-like table:
# print(tabulate(results.sort_values(by="AMI.", ascending=False).head(10).round(2), headers='keys', tablefmt='pipe', showindex=False))

## Top results for Silhouette:

In [11]:
# Ten rows with the highest Silhouette score; metric columns rendered with two decimals.
(
    results
    .sort_values("Sil.", ascending=False)
    .iloc[:10]
    .style
    .format('{:.2f}', subset=['Acc.', 'ARI.', 'AMI.', 'Sil.'])
)

Unnamed: 0,Dataset,Method,FL method,CL method,num clusters,variable,Acc.,ARI.,AMI.,Sil.
22,Metabolome,Deep clustering,DEC + DA,K-Means,2,asthma,0.89,0.0,0.0,0.95
23,Metabolome,Deep clustering,DEC + DA,K-Means,2,sex,0.53,0.0,0.0,0.95
23,Metabolome,Deep clustering (fet. sel.),DEC + DA,K-Means,2,sex,0.53,0.0,0.0,0.93
22,Metabolome,Deep clustering (fet. sel.),DEC + DA,K-Means,2,asthma,0.89,0.0,0.0,0.93
2,Exposome,Deep clustering,DEC,K-Means,3,education,0.52,0.05,0.02,0.91
3,Exposome,Deep clustering,DEC,K-Means,3,native,0.84,0.0,0.0,0.91
4,Exposome,Deep clustering,DEC,K-Means,3,parity,0.45,0.0,0.0,0.91
26,Metabolome,Deep clustering,DEC + DA,K-Means,3,parity,0.45,0.0,0.0,0.86
25,Metabolome,Deep clustering,DEC + DA,K-Means,3,native,0.84,0.0,0.0,0.86
24,Metabolome,Deep clustering,DEC + DA,K-Means,3,education,0.51,0.0,0.0,0.86


In [12]:
# Markdown-like table:
# print(tabulate(results.sort_values(by="Sil.", ascending=False).head(10).round(2), headers='keys', tablefmt='pipe', showindex=False))