In [1]:
import pandas as pd
import scanpy as sc
from sklearn.metrics.cluster import normalized_mutual_info_score, adjusted_rand_score
from sklearn.metrics import homogeneity_score, completeness_score, v_measure_score, fowlkes_mallows_score, silhouette_score
from utils import sankey_plot

In [2]:
# Root folder for every input and output file. Paths are assembled by plain
# string concatenation, so the trailing slash is required.
DIR = './data/'

# Datasets to process.
DATASET_NAMES = [
    'PBMC1',
    'PBMC2',
    'PBMC3',
    'PBMC4',
]

# Clustering tools whose labels are compared.
TOOLS = [
    'monocle',
    'scanpy',
    'scvi-tools',
    'seurat',
    'COTAN',
]

# Parameter-tuning variants to evaluate.
PARAMS_TUNING = [
    'default',
    'celltypist',
    # 'antibody',  # currently disabled
]

In [3]:
def compute_scores(dir, dataset, labels_df, labels_matched, ground_truth_labels):
    """Score each tool's clustering against one reference labelling.

    For every tool in ``TOOLS`` the column ``cluster_<tool>`` of ``labels_df``
    is compared with the column ``cluster_<ground_truth_labels>`` using NMI
    (arithmetic averaging), ARI, homogeneity, completeness, V-measure and
    Fowlkes-Mallows. The resulting table (one row per tool, one column per
    metric) is written as CSV and LaTeX and then displayed.

    Parameters
    ----------
    dir : str
        Prefix of the output path; concatenated directly with ``dataset``.
    dataset : str
        Dataset name, used as the output sub-folder.
    labels_df : pandas.DataFrame
        Frame holding the ``cluster_<tool>`` columns and the reference column.
    labels_matched : str
        Tag inserted into the output file names.
    ground_truth_labels : str
        Suffix of the reference column; also inserted into the file names.

    Returns
    -------
    None
    """
    def nmi_arithmetic(labels_pred, labels_true):
        # NMI is the only metric here that takes an extra option.
        return normalized_mutual_info_score(
            labels_pred=labels_pred,
            labels_true=labels_true,
            average_method='arithmetic',
        )

    # Insertion order fixes the column order of the output table.
    metric_functions = {
        'NMI': nmi_arithmetic,
        'ARI': adjusted_rand_score,
        'homogeneity': homogeneity_score,
        'completeness': completeness_score,
        'v_measure': v_measure_score,
        'fowlkes_mallows': fowlkes_mallows_score,
    }

    scores = {
        metric: {
            tool: score_fn(
                labels_pred=labels_df['cluster_' + tool],
                labels_true=labels_df[f'cluster_{ground_truth_labels}'],
            )
            for tool in TOOLS
        }
        for metric, score_fn in metric_functions.items()
    }

    scores_table = pd.DataFrame(scores)
    output_stem = f'{dir}{dataset}/scores_{labels_matched}_{ground_truth_labels}'
    scores_table.to_csv(f'{output_stem}.csv')
    scores_table.to_latex(f'{output_stem}.tex')
    display(scores_table)

In [4]:
def _aligned_silhouette(adata, labels):
    """Return the silhouette score of ``labels`` over the matching rows of ``adata``.

    ``silhouette_score`` pairs samples and labels purely by position, so the
    labels are re-ordered by cell barcode to follow the row order of the
    expression matrix. Only cells present in both objects are scored, which
    also keeps the two lengths equal when ``labels`` covers fewer cells.

    Parameters
    ----------
    adata : AnnData
        Expression data whose ``obs_names`` are the cell barcodes.
    labels : pandas.Series
        Cluster labels indexed by cell barcode.
    """
    has_label = adata.obs_names.isin(labels.index)
    labelled = adata[has_label, :]
    return silhouette_score(labelled.X, labels.loc[labelled.obs_names])


def _score_and_plot(dataset, tuning, truth_df, truth_name, mapping_df, title):
    """Score every tool against one reference labelling and save its Sankey plot.

    ``truth_df`` must hold the ``cluster_<tool>`` columns together with
    ``cluster_<truth_name>``; ``mapping_df['go']`` translates the reference
    labels before plotting.
    """
    compute_scores(DIR, dataset, truth_df, tuning, truth_name)
    labels = [truth_df[f'cluster_{tool}'].to_list() for tool in TOOLS]
    labels.append(
        truth_df[f'cluster_{truth_name}'].map(mapping_df['go'].to_dict()).to_list()
    )
    sankey_plot(
        labels=labels,
        labels_titles=[*TOOLS, truth_name],
        title=title,
        path=f'{DIR}{dataset}/{tuning}_{truth_name}.html',
    )


for tuning in PARAMS_TUNING:
    for dataset in DATASET_NAMES:
        # Same text is used for the log line and for the plot titles.
        title = f'{dataset} - matching {tuning} labels' if tuning != 'default' else f'{dataset} - default labels'
        print('------------------------------')
        print(title)

        # concat tools labels: start from COTAN, then inner-join every other tool on the cell id
        labels_df = pd.read_csv(f'{DIR}{dataset}/COTAN/{tuning}/clustering_labels.csv', index_col=0)
        labels_df = labels_df.rename(columns={"cluster": "cluster_COTAN"})
        for tool in [t for t in TOOLS if t != 'COTAN']:
            tool_labels_df = pd.read_csv(f'{DIR}{dataset}/{tool}/{tuning}/clustering_labels.csv', index_col=0)
            labels_df = labels_df.merge(tool_labels_df, how='inner', on='cell')
            labels_df = labels_df.rename(columns={"cluster": f"cluster_{tool}"})

        # load and concat celltypist labels
        celltypist_df = pd.read_csv(f'{DIR}{dataset}/celltypist/celltypist_labels.csv', index_col=0)
        # Drops the last two characters of every cell id so that it matches the
        # tools' ids (presumably a '-1'-style barcode suffix -- TODO confirm).
        celltypist_df.index = celltypist_df.index.str[:-2]
        celltypist_df = labels_df.merge(celltypist_df, how='inner', on='cell')
        celltypist_df = celltypist_df.rename(columns={"cluster.ids": "cluster_celltypist"})
        celltypist_mapping_df = pd.read_csv(f'{DIR}{dataset}/celltypist/celltypist_mapping.csv', index_col=0)

        # load and concat protein surface labels
        antibody_df = pd.read_csv(f'{DIR}{dataset}/antibody_annotation/antibody_labels.csv', index_col=0)
        antibody_df = labels_df.merge(antibody_df, how='inner', on='cell')
        antibody_df = antibody_df.rename(columns={"cluster.ids": "cluster_antibody"})
        antibody_mapping_df = pd.read_csv(f'{DIR}{dataset}/antibody_annotation/antibody_mapping.csv', index_col=1)

        # read dataset
        adata = sc.read_10x_mtx(
            f'{DIR}{dataset}/filtered/10X/',
            var_names='gene_symbols',
            cache=False
        )
        adata.var_names_make_unique()
        # keep only labelled cells
        adata = adata[adata.obs_names.isin(labels_df.index), :]

        # compute silhouette score; labels are aligned to the matrix rows by
        # barcode because the label frames follow CSV/merge order, not matrix order
        silhouette = {
            tool: _aligned_silhouette(adata, labels_df[f'cluster_{tool}'])
            for tool in TOOLS
        }
        if tuning == 'celltypist':
            silhouette['celltypist'] = _aligned_silhouette(adata, celltypist_df['cluster_celltypist'])
        elif tuning == 'antibody':
            silhouette['antibody'] = _aligned_silhouette(adata, antibody_df['cluster_antibody'])
        silhouette_df = pd.DataFrame(silhouette, index=[0])
        silhouette_df.to_csv(f'{DIR}{dataset}/{tuning}_silhouette.csv')
        silhouette_df.to_latex(f'{DIR}{dataset}/{tuning}_silhouette.tex')

        # compute scores comparing each tool labels with celltypist labels
        if tuning in ('celltypist', 'default'):
            _score_and_plot(dataset, tuning, celltypist_df, 'celltypist', celltypist_mapping_df, title)

        # compute scores comparing each tool labels with protein labels
        if tuning in ('antibody', 'default'):
            _score_and_plot(dataset, tuning, antibody_df, 'antibody', antibody_mapping_df, title)

------------------------------
PBMC1 - default labels



is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead



Unnamed: 0,NMI,ARI,homogeneity,completeness,v_measure,fowlkes_mallows
monocle,0.586477,0.434154,0.4191,0.976441,0.586477,0.641586
scanpy,0.707991,0.3637,0.821675,0.621941,0.707991,0.483365
scvi-tools,0.740929,0.538885,0.776916,0.708127,0.740929,0.623053
seurat,0.770234,0.588503,0.770632,0.769836,0.770234,0.661907
COTAN,0.756537,0.564212,0.825744,0.698034,0.756537,0.646916


Unnamed: 0,NMI,ARI,homogeneity,completeness,v_measure,fowlkes_mallows
monocle,0.611988,0.425929,0.446299,0.973344,0.611988,0.635502
scanpy,0.659645,0.391203,0.795577,0.563386,0.659645,0.50773
scvi-tools,0.700834,0.56518,0.761397,0.649195,0.700834,0.643658
seurat,0.738344,0.643097,0.764146,0.714228,0.738344,0.706018
COTAN,0.714625,0.584684,0.809127,0.639888,0.714625,0.662378


------------------------------
PBMC2 - default labels



is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead



Unnamed: 0,NMI,ARI,homogeneity,completeness,v_measure,fowlkes_mallows
monocle,0.408008,0.245699,0.257439,0.98285,0.408008,0.563497
scanpy,0.71177,0.411761,0.814608,0.631986,0.71177,0.532994
scvi-tools,0.717161,0.432623,0.795019,0.653193,0.717161,0.544665
seurat,0.766101,0.515115,0.827245,0.713374,0.766101,0.614991
COTAN,0.72878,0.461602,0.862867,0.630761,0.72878,0.576264


Unnamed: 0,NMI,ARI,homogeneity,completeness,v_measure,fowlkes_mallows
monocle,0.277314,0.107135,0.165534,0.853977,0.277314,0.450594
scanpy,0.682604,0.524109,0.759388,0.619922,0.682604,0.602311
scvi-tools,0.667101,0.533758,0.720104,0.621366,0.667101,0.605379
seurat,0.745221,0.680424,0.779523,0.71381,0.745221,0.731101
COTAN,0.707707,0.634022,0.797479,0.636102,0.707707,0.694466


------------------------------
PBMC3 - default labels



is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead



Unnamed: 0,NMI,ARI,homogeneity,completeness,v_measure,fowlkes_mallows
monocle,0.512556,0.273175,0.350038,0.956777,0.512556,0.539835
scanpy,0.677829,0.412995,0.770393,0.605122,0.677829,0.510203
scvi-tools,0.734975,0.525048,0.776449,0.697707,0.734975,0.599726
seurat,0.75355,0.525128,0.819169,0.697665,0.75355,0.603686
COTAN,0.680463,0.41126,0.875689,0.556416,0.680463,0.530906


Unnamed: 0,NMI,ARI,homogeneity,completeness,v_measure,fowlkes_mallows
monocle,0.429744,0.168276,0.280823,0.914939,0.429744,0.437511
scanpy,0.664567,0.54263,0.702416,0.630588,0.664567,0.596647
scvi-tools,0.680967,0.584528,0.674752,0.687298,0.680967,0.62962
seurat,0.734744,0.661116,0.743884,0.725826,0.734744,0.698627
COTAN,0.657683,0.477137,0.779987,0.568536,0.657683,0.548422


------------------------------
PBMC4 - default labels



is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead



Unnamed: 0,NMI,ARI,homogeneity,completeness,v_measure,fowlkes_mallows
monocle,0.617714,0.497717,0.45662,0.954434,0.617714,0.667591
scanpy,0.690181,0.355238,0.815725,0.598126,0.690181,0.471439
scvi-tools,0.737086,0.471157,0.810623,0.675781,0.737086,0.563973
seurat,0.758058,0.477452,0.853752,0.681654,0.758058,0.576361
COTAN,0.697523,0.379661,0.81908,0.607383,0.697523,0.484246


Unnamed: 0,NMI,ARI,homogeneity,completeness,v_measure,fowlkes_mallows
monocle,0.536861,0.325029,0.372515,0.960701,0.536861,0.53281
scanpy,0.622945,0.371655,0.659143,0.590516,0.622945,0.439575
scvi-tools,0.671843,0.463489,0.666315,0.677464,0.671843,0.520987
seurat,0.669274,0.436706,0.676741,0.661971,0.669274,0.496402
COTAN,0.618976,0.351132,0.649094,0.591529,0.618976,0.418417


------------------------------
PBMC1 - matching celltypist labels



is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead



Unnamed: 0,NMI,ARI,homogeneity,completeness,v_measure,fowlkes_mallows
monocle,0.638152,0.292695,0.744839,0.558199,0.638152,0.413511
scanpy,0.707991,0.3637,0.821675,0.621941,0.707991,0.483365
scvi-tools,0.688328,0.32564,0.816697,0.594832,0.688328,0.451652
seurat,0.712972,0.37719,0.840982,0.618784,0.712972,0.497948
COTAN,0.731298,0.55819,0.711411,0.75233,0.731298,0.638123


------------------------------
PBMC2 - matching celltypist labels



is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead



Unnamed: 0,NMI,ARI,homogeneity,completeness,v_measure,fowlkes_mallows
monocle,0.558505,0.196684,0.68677,0.470611,0.558505,0.327606
scanpy,0.71177,0.411761,0.814608,0.631986,0.71177,0.532994
scvi-tools,0.716471,0.395908,0.815004,0.639193,0.716471,0.516451
seurat,0.724443,0.386488,0.849993,0.631209,0.724443,0.516762
COTAN,0.688973,0.444196,0.677858,0.700459,0.688973,0.554044


------------------------------
PBMC3 - matching celltypist labels



is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead



Unnamed: 0,NMI,ARI,homogeneity,completeness,v_measure,fowlkes_mallows
monocle,0.5607,0.257249,0.66637,0.483957,0.5607,0.364163
scanpy,0.677829,0.412995,0.770393,0.605122,0.677829,0.510203
scvi-tools,0.721456,0.491665,0.782506,0.669242,0.721456,0.57481
seurat,0.72778,0.434786,0.853486,0.63435,0.72778,0.54111
COTAN,0.690438,0.448282,0.721157,0.66223,0.690438,0.532344


------------------------------
PBMC4 - matching celltypist labels



is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead



Unnamed: 0,NMI,ARI,homogeneity,completeness,v_measure,fowlkes_mallows
monocle,0.614294,0.244972,0.770923,0.510563,0.614294,0.37298
scanpy,0.690181,0.355238,0.815725,0.598126,0.690181,0.471439
scvi-tools,0.772969,0.583246,0.80411,0.74415,0.772969,0.655423
seurat,0.729819,0.400663,0.862845,0.632332,0.729819,0.518765
COTAN,0.715158,0.437598,0.770769,0.667031,0.715158,0.530832
