In [1]:
# FIXME:
# - SANKEY PLOTS: colorare link? definire ordinamento migliore tra i tool per visualizzare meglio? fare plot due a due?
# - SILHOUETTE: calcolarla nel feature space della PCA?

In [2]:
import pandas as pd
import scanpy as sc
from sklearn.metrics.cluster import normalized_mutual_info_score, adjusted_rand_score
from sklearn.metrics import homogeneity_score, completeness_score, v_measure_score, fowlkes_mallows_score, silhouette_score
from utils import sankey_plot

In [4]:
DIR = './data/'
DATASET_NAMES = ['PBMC1', 'PBMC2', 'PBMC3', 'PBMC4']
TOOLS = ['COTAN', 'monocle', 'scanpy', 'scvi-tools', 'seurat']
PARAMS_TUNING = ['default', 'celltypist', 'antibody']

In [5]:
def compute_scores(dir, dataset, labels_df, labels_matched, ground_truth_labels):
    scores = {}
    scores['NMI'] = {}
    scores['ARI'] = {}
    scores['homogeneity'] = {}
    scores['completeness'] = {}
    scores['v_measure'] = {}
    scores['fowlkes_mallows'] = {}
    for tool in TOOLS:
        scores['NMI'][tool] = normalized_mutual_info_score(labels_pred=labels_df['cluster_'+tool], labels_true=labels_df[f'cluster_{ground_truth_labels}'], average_method='arithmetic')
        scores['ARI'][tool] = adjusted_rand_score(labels_pred=labels_df['cluster_'+tool], labels_true=labels_df[f'cluster_{ground_truth_labels}'])
        scores['homogeneity'][tool] = homogeneity_score(labels_pred=labels_df['cluster_'+tool], labels_true=labels_df[f'cluster_{ground_truth_labels}'])
        scores['completeness'][tool] = completeness_score(labels_pred=labels_df['cluster_'+tool], labels_true=labels_df[f'cluster_{ground_truth_labels}'])
        scores['v_measure'][tool] = v_measure_score(labels_pred=labels_df['cluster_'+tool], labels_true=labels_df[f'cluster_{ground_truth_labels}'])
        scores['fowlkes_mallows'][tool] = fowlkes_mallows_score(labels_pred=labels_df['cluster_'+tool], labels_true=labels_df[f'cluster_{ground_truth_labels}'])
    scores_df = pd.DataFrame(scores)
    scores_df.to_csv(f'{dir}{dataset}/scores_{labels_matched}_{ground_truth_labels}.csv')
    scores_df.to_latex(f'{dir}{dataset}/scores_{labels_matched}_{ground_truth_labels}.tex')
    display(scores_df)

In [23]:
for tuning in PARAMS_TUNING:
    for dataset in DATASET_NAMES:
        print('------------------------------')
        print(f'{dataset} - matching {tuning} labels' if tuning != 'default' else f'{dataset} - default labels')
        
        # concat tools labels
        labels_df = pd.read_csv(f'{DIR}{dataset}/{TOOLS[0]}/{tuning}/clustering_labels.csv', index_col=0)
        labels_df.rename(columns={"cluster": "cluster_COTAN"}, inplace=True)
        for tool in TOOLS[1:]:
            tool_labels_df = pd.read_csv(f'{DIR}{dataset}/{tool}/{tuning}/clustering_labels.csv', index_col=0)
            labels_df = labels_df.merge(tool_labels_df, how='inner', on='cell')
            labels_df.rename(columns={"cluster": f"cluster_{tool}"}, inplace=True)
        
        # load and concat celltypist labels
        celltypist_df = pd.read_csv(f'{DIR}{dataset}/celltypist/celltypist_labels.csv', index_col=0)
        celltypist_df.index = celltypist_df.index.str[:-2]
        celltypist_df = labels_df.merge(celltypist_df, how='inner', on='cell')
        celltypist_df.rename(columns={"cluster.ids": f"cluster_celltypist"}, inplace=True)
        celltypist_mapping_df = pd.read_csv(f'{DIR}{dataset}/celltypist/celltypist_mapping.csv', index_col=0)
        
        # load and concat protein surface labels
        antibody_df = pd.read_csv(f'{DIR}{dataset}/antibody_annotation/antibody_labels.csv', index_col=0)
        antibody_df = labels_df.merge(antibody_df, how='inner', on='cell')
        antibody_df.rename(columns={"cluster.ids": f"cluster_antibody"}, inplace=True)
        antibody_mapping_df = pd.read_csv(f'{DIR}{dataset}/antibody_annotation/antibody_mapping.csv', index_col=1)

        # read dataset
        adata = sc.read_10x_mtx(
            f'{DIR}{dataset}/filtered/10X/',
            var_names='gene_symbols',
            cache=False
        )
        # keep only labelled cells
        adata.var_names_make_unique()
        subset_cells = adata.obs_names.isin(labels_df.index)
        adata = adata[subset_cells, :]

        # compute silhouette score
        silhouette = {}
        for tool in TOOLS:
            silhouette[tool] = silhouette_score(adata.X, labels_df[f'cluster_{tool}'])
        silhouette['celltypist'] = silhouette_score(adata.X, celltypist_df[f'cluster_celltypist'])
        silhouette['antibody'] = silhouette_score(adata.X, antibody_df[f'cluster_antibody'])
        silhouette_df = pd.DataFrame(silhouette, index=[0])
        silhouette_df.to_csv(f'{DIR}{dataset}/silhouette.csv')
        silhouette_df.to_latex(f'{DIR}{dataset}/silhouette.tex')

        # compute scores comparing each tool labels with celltypist labels
        if tuning == 'celltypist' or tuning == 'default':
            compute_scores(DIR, dataset, celltypist_df, tuning, 'celltypist')
            labels = []
            labels_titles = []
            for tool in TOOLS:
                labels.append(celltypist_df[f'cluster_{tool}'].to_list())
                labels_titles.append(tool)
            labels.append(celltypist_df[f'cluster_celltypist'].map(celltypist_mapping_df['go'].to_dict()).to_list())
            labels_titles.append('celltypist')
            title = f'{dataset} - matching {tuning} labels' if tuning != 'default' else f'{dataset} - default labels'
            sankey_plot(labels=labels, labels_titles=labels_titles, title=title, path=f'{DIR}{dataset}/{tuning}_celltypist.html')
        
        # compute scores comparing each tool labels with protein labels
        if tuning == 'antibody' or tuning == 'default':
            compute_scores(DIR, dataset, antibody_df, tuning, 'antibody')
            labels = []
            labels_titles = []
            for tool in TOOLS:
                labels.append(antibody_df[f'cluster_{tool}'].to_list())
                labels_titles.append(tool)
            labels.append(antibody_df[f'cluster_antibody'].map(antibody_mapping_df['go'].to_dict()).to_list())
            labels_titles.append('antibody')
            title = f'{dataset} - matching {tuning} labels' if tuning != 'default' else f'{dataset} - default labels'
            sankey_plot(labels=labels, labels_titles=labels_titles, title=title, path=f'{DIR}{dataset}/{tuning}_antibody.html')

        break

------------------------------
PBMC1 - default labels



is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead



Unnamed: 0,NMI,ARI,homogeneity,completeness,v_measure,fowlkes_mallows
COTAN,0.707991,0.3637,0.821675,0.621941,0.707991,0.483365
monocle,0.586477,0.434154,0.4191,0.976441,0.586477,0.641586
scanpy,0.707991,0.3637,0.821675,0.621941,0.707991,0.483365
scvi-tools,0.728789,0.540777,0.781633,0.682637,0.728789,0.627901
seurat,0.770234,0.588503,0.770632,0.769836,0.770234,0.661907


Unnamed: 0,NMI,ARI,homogeneity,completeness,v_measure,fowlkes_mallows
COTAN,0.654901,0.379558,0.795575,0.556501,0.654901,0.497358
monocle,0.617039,0.429563,0.45187,0.972516,0.617039,0.637903
scanpy,0.654901,0.379558,0.795575,0.556501,0.654901,0.497358
scvi-tools,0.65685,0.498485,0.735053,0.593687,0.65685,0.588256
seurat,0.736892,0.624812,0.767517,0.708617,0.736892,0.691298


------------------------------
PBMC1 - matching celltypist labels



is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead



Unnamed: 0,NMI,ARI,homogeneity,completeness,v_measure,fowlkes_mallows
COTAN,0.711115,0.367247,0.849417,0.611544,0.711115,0.494212
monocle,0.636114,0.292627,0.763123,0.54535,0.636114,0.419519
scanpy,0.711115,0.367247,0.849417,0.611544,0.711115,0.494212
scvi-tools,0.75172,0.512773,0.824807,0.690532,0.75172,0.609958
seurat,0.717847,0.400335,0.861522,0.615243,0.717847,0.522358


------------------------------
PBMC1 - matching antibody labels



is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead



Unnamed: 0,NMI,ARI,homogeneity,completeness,v_measure,fowlkes_mallows
COTAN,0.654901,0.379558,0.795575,0.556501,0.654901,0.497358
monocle,0.667449,0.477968,0.747542,0.602858,0.667449,0.572296
scanpy,0.654901,0.379558,0.795575,0.556501,0.654901,0.497358
scvi-tools,0.753679,0.702577,0.750157,0.757234,0.753679,0.75727
seurat,0.744,0.630068,0.783025,0.70868,0.744,0.695895
