In [5]:
import pandas as pd
import scanpy as sc
from sklearn.metrics.cluster import normalized_mutual_info_score, adjusted_rand_score
from sklearn.metrics import homogeneity_score, completeness_score, fowlkes_mallows_score, silhouette_score
from utils import sankey_plot

In [6]:
# Root folder; every dataset lives in its own sub-folder `{DIR}{dataset}/`.
DIR = './data/'

# Datasets evaluated by this notebook (used as sub-folder names under DIR).
DATASET_NAMES = [
    'PBMC1',
    'PBMC2',
    'PBMC3',
    'PBMC4',
]

# Clustering tools under comparison; each one is expected to provide
# `{DIR}{dataset}/{tool}/{tuning}/clustering_labels.csv`.
TOOLS = [
    'monocle',
    'scanpy',
    'scvi-tools',
    'seurat',
    'COTAN',
]

# Parameter-tuning variants: sub-folder names of each tool's results, also used
# to decide which reference labelling(s) a run is compared against.
PARAMS_TUNING = [
    'default',
    'celltypist',
    'antibody',
]

In [7]:
def compute_scores(dir, dataset, labels_df, labels_matched, ground_truth_labels):
    """Score every tool's clustering against one reference labelling.

    For each tool in the module-level ``TOOLS`` list, the column
    ``cluster_<tool>`` of ``labels_df`` is compared with the column
    ``cluster_<ground_truth_labels>`` using NMI (arithmetic averaging), ARI,
    homogeneity, completeness and the Fowlkes-Mallows index.

    Parameters
    ----------
    dir : str
        Output root; it is concatenated directly with ``dataset``, so it must
        already end with a path separator.
    dataset : str
        Dataset sub-folder name under ``dir``.
    labels_df : pandas.DataFrame
        Must contain ``cluster_<tool>`` for every tool and
        ``cluster_<ground_truth_labels>``.
    labels_matched : str
        Tag inserted in the output file names.
    ground_truth_labels : str
        Name of the reference labelling; selects the reference column and is
        also inserted in the output file names.

    Returns
    -------
    None
        The table (tools as rows, metrics as columns) is written to
        ``scores_<labels_matched>_<ground_truth_labels>.csv`` and ``.tex``
        inside ``<dir><dataset>/`` and then passed to ``display`` (not imported
        in this notebook; presumably the IPython built-in).
    """
    def _nmi(labels_true, labels_pred):
        # NMI is the only metric that needs an extra argument.
        return normalized_mutual_info_score(
            labels_true=labels_true,
            labels_pred=labels_pred,
            average_method='arithmetic',
        )

    # Column name -> metric callable; insertion order fixes the column order.
    metrics = {
        'NMI': _nmi,
        'ARI': adjusted_rand_score,
        'homogeneity': homogeneity_score,
        'completeness': completeness_score,
        'fowlkes_mallows': fowlkes_mallows_score,
    }

    scores = {metric_name: {} for metric_name in metrics}
    for tool in TOOLS:
        predicted = labels_df['cluster_'+tool]
        reference = labels_df[f'cluster_{ground_truth_labels}']
        for metric_name, metric in metrics.items():
            scores[metric_name][tool] = metric(labels_true=reference, labels_pred=predicted)

    # dict-of-dicts -> DataFrame: outer keys become columns, inner keys the index.
    scores_df = pd.DataFrame(scores)
    scores_df.to_csv(f'{dir}{dataset}/scores_{labels_matched}_{ground_truth_labels}.csv')
    scores_df.to_latex(f'{dir}{dataset}/scores_{labels_matched}_{ground_truth_labels}.tex')
    display(scores_df)

In [8]:
# For every (tuning, dataset) pair: gather the cluster labels of all tools,
# attach the celltypist and antibody reference labels, compute silhouette
# scores on the expression matrix, then score and plot each tool against the
# applicable reference labelling(s). Results are written under `{DIR}{dataset}/`.
for tuning in PARAMS_TUNING:
    for dataset in DATASET_NAMES:
        # One heading per run, reused for the console banner and the Sankey titles.
        title = f'{dataset} - matching {tuning} labels' if tuning != 'default' else f'{dataset} - default labels'
        print('------------------------------')
        print(title)

        # concat tools labels: start from COTAN and inner-join every other tool
        # on the cell id, so only cells labelled by all tools are kept
        labels_df = pd.read_csv(f'{DIR}{dataset}/COTAN/{tuning}/clustering_labels.csv', index_col=0)
        labels_df = labels_df.rename(columns={"cluster": "cluster_COTAN"})
        for tool in [t for t in TOOLS if t != 'COTAN']:
            tool_labels_df = pd.read_csv(f'{DIR}{dataset}/{tool}/{tuning}/clustering_labels.csv', index_col=0)
            labels_df = labels_df.merge(tool_labels_df, how='inner', on='cell')
            labels_df = labels_df.rename(columns={"cluster": f"cluster_{tool}"})

        # load and concat celltypist labels
        celltypist_df = pd.read_csv(f'{DIR}{dataset}/celltypist/celltypist_labels.csv', index_col=0)
        # Drop the last two characters of every cell id before joining
        # (presumably a suffix such as "-1" that the tool label files lack -- TODO confirm).
        celltypist_df.index = celltypist_df.index.str[:-2]
        celltypist_df = labels_df.merge(celltypist_df, how='inner', on='cell')
        celltypist_df = celltypist_df.rename(columns={"cluster.ids": "cluster_celltypist"})
        celltypist_mapping_df = pd.read_csv(f'{DIR}{dataset}/celltypist/celltypist_mapping.csv', index_col=0)

        # load and concat protein surface labels
        antibody_df = pd.read_csv(f'{DIR}{dataset}/antibody_annotation/antibody_labels.csv', index_col=0)
        antibody_df = labels_df.merge(antibody_df, how='inner', on='cell')
        antibody_df = antibody_df.rename(columns={"cluster.ids": "cluster_antibody"})
        # NOTE(review): this mapping file is indexed by its second column, unlike
        # the celltypist mapping (first column) -- confirm this is intended.
        antibody_mapping_df = pd.read_csv(f'{DIR}{dataset}/antibody_annotation/antibody_mapping.csv', index_col=1)

        # read dataset
        adata = sc.read_10x_mtx(
            f'{DIR}{dataset}/filtered/10X/',
            var_names='gene_symbols',
            cache=False
        )
        adata.var_names_make_unique()
        # Keep only labelled cells AND put them in the row order of labels_df.
        # silhouette_score pairs row i of the matrix with label i, so a boolean
        # `isin` mask (which keeps AnnData's own order) could silently misalign
        # cells and labels. Raises KeyError if a labelled cell is not in adata.
        adata = adata[labels_df.index, :]
        # Read the matrix once instead of re-slicing the view for every tool.
        expression = adata.X

        # compute silhouette score
        silhouette = {
            tool: silhouette_score(expression, labels_df[f'cluster_{tool}'])
            for tool in TOOLS
        }
        # The reference frames are inner joins and may hold fewer cells than
        # labels_df, so the matrix is subset by the frame's own index.
        if tuning == 'celltypist':
            silhouette['celltypist'] = silhouette_score(
                adata[celltypist_df.index, :].X, celltypist_df['cluster_celltypist'])
        elif tuning == 'antibody':
            silhouette['antibody'] = silhouette_score(
                adata[antibody_df.index, :].X, antibody_df['cluster_antibody'])
        silhouette_df = pd.DataFrame(silhouette, index=[0])
        silhouette_df.to_csv(f'{DIR}{dataset}/{tuning}_silhouette.csv')
        silhouette_df.to_latex(f'{DIR}{dataset}/{tuning}_silhouette.tex')

        # Compare each tool's labels with the reference labelling(s): the
        # 'default' tuning is compared with both references, a matched tuning
        # only with its own. Order (celltypist, then antibody) is preserved.
        references = [
            ('celltypist', celltypist_df, celltypist_mapping_df),
            ('antibody', antibody_df, antibody_mapping_df),
        ]
        for reference, reference_df, mapping_df in references:
            if tuning not in (reference, 'default'):
                continue
            compute_scores(DIR, dataset, reference_df, tuning, reference)
            # One label list per tool, followed by the reference labels
            # translated through the 'go' column of the mapping table.
            labels = [reference_df[f'cluster_{tool}'].to_list() for tool in TOOLS]
            labels.append(reference_df[f'cluster_{reference}'].map(mapping_df['go'].to_dict()).to_list())
            labels_titles = list(TOOLS) + [reference]
            sankey_plot(labels=labels, labels_titles=labels_titles, title=title, path=f'{DIR}{dataset}/{tuning}_{reference}.html')

------------------------------
PBMC1 - default labels



is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead



Unnamed: 0,NMI,ARI,homogeneity,completeness,fowlkes_mallows
monocle,0.586477,0.434154,0.4191,0.976441,0.641586
scanpy,0.707991,0.3637,0.821675,0.621941,0.483365
scvi-tools,0.759984,0.540722,0.803366,0.721048,0.626231
seurat,0.770234,0.588503,0.770632,0.769836,0.661907
COTAN,0.760531,0.573026,0.824545,0.705741,0.653886


Unnamed: 0,NMI,ARI,homogeneity,completeness,fowlkes_mallows
monocle,0.611988,0.425929,0.446299,0.973344,0.635502
scanpy,0.659645,0.391203,0.795577,0.563386,0.50773
scvi-tools,0.708581,0.551051,0.776228,0.65178,0.63275
seurat,0.738344,0.643097,0.764146,0.714228,0.706018
COTAN,0.711422,0.579912,0.799933,0.640547,0.657343


------------------------------
PBMC2 - default labels



is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead



Unnamed: 0,NMI,ARI,homogeneity,completeness,fowlkes_mallows
monocle,0.408008,0.245699,0.257439,0.98285,0.563497
scanpy,0.71177,0.411761,0.814608,0.631986,0.532994
scvi-tools,0.699062,0.390431,0.803404,0.618707,0.511413
seurat,0.765181,0.514174,0.826024,0.712686,0.614014
COTAN,0.735583,0.465385,0.866541,0.639011,0.579387


Unnamed: 0,NMI,ARI,homogeneity,completeness,fowlkes_mallows
monocle,0.277314,0.107135,0.165534,0.853977,0.450594
scanpy,0.682604,0.524109,0.759388,0.619922,0.602311
scvi-tools,0.652891,0.485961,0.734303,0.587729,0.567847
seurat,0.743681,0.679941,0.77765,0.712555,0.730603
COTAN,0.709931,0.632933,0.799781,0.63823,0.693633


------------------------------
PBMC3 - default labels



is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead



Unnamed: 0,NMI,ARI,homogeneity,completeness,fowlkes_mallows
monocle,0.512556,0.273175,0.350038,0.956777,0.539835
scanpy,0.677829,0.412995,0.770393,0.605122,0.510203
scvi-tools,0.737343,0.547919,0.770748,0.706712,0.617917
seurat,0.753801,0.524471,0.819448,0.697892,0.603038
COTAN,0.677317,0.394179,0.872352,0.553556,0.515965


Unnamed: 0,NMI,ARI,homogeneity,completeness,fowlkes_mallows
monocle,0.429744,0.168276,0.280823,0.914939,0.437511
scanpy,0.664567,0.54263,0.702416,0.630588,0.596647
scvi-tools,0.691391,0.620339,0.67793,0.705398,0.66258
seurat,0.735217,0.664188,0.744324,0.72633,0.701375
COTAN,0.657321,0.47634,0.780031,0.567972,0.550013


------------------------------
PBMC4 - default labels



is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead



Unnamed: 0,NMI,ARI,homogeneity,completeness,fowlkes_mallows
monocle,0.617714,0.497717,0.45662,0.954434,0.667591
scanpy,0.690181,0.355238,0.815725,0.598126,0.471439
scvi-tools,0.728772,0.469629,0.784776,0.680229,0.560344
seurat,0.758058,0.477452,0.853752,0.681654,0.576361
COTAN,0.694128,0.373214,0.843035,0.589928,0.482714


Unnamed: 0,NMI,ARI,homogeneity,completeness,fowlkes_mallows
monocle,0.536861,0.325029,0.372515,0.960701,0.53281
scanpy,0.622945,0.371655,0.659143,0.590516,0.439575
scvi-tools,0.65155,0.425369,0.634107,0.66998,0.487767
seurat,0.669274,0.436706,0.676741,0.661971,0.496402
COTAN,0.620969,0.354846,0.671867,0.577239,0.421695


------------------------------
PBMC1 - matching celltypist labels



is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead



Unnamed: 0,NMI,ARI,homogeneity,completeness,fowlkes_mallows
monocle,0.638152,0.292695,0.744839,0.558199,0.413511
scanpy,0.720739,0.412778,0.816894,0.644836,0.523473
scvi-tools,0.695184,0.334699,0.815078,0.606039,0.454645
seurat,0.712972,0.37719,0.840982,0.618784,0.497948
COTAN,0.743779,0.537549,0.793723,0.699747,0.620191


------------------------------
PBMC2 - matching celltypist labels



is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead



Unnamed: 0,NMI,ARI,homogeneity,completeness,fowlkes_mallows
monocle,0.589038,0.267611,0.696325,0.510398,0.398231
scanpy,0.688123,0.334803,0.817799,0.593944,0.469648
scvi-tools,0.713225,0.376925,0.814609,0.634284,0.498712
seurat,0.721508,0.365269,0.850946,0.626249,0.496233
COTAN,0.69953,0.457374,0.728557,0.672728,0.560419


------------------------------
PBMC3 - matching celltypist labels



is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead



Unnamed: 0,NMI,ARI,homogeneity,completeness,fowlkes_mallows
monocle,0.584046,0.317239,0.646407,0.532658,0.414759
scanpy,0.707606,0.499221,0.768091,0.655951,0.581001
scvi-tools,0.734542,0.532472,0.781902,0.692591,0.606452
seurat,0.754619,0.525413,0.820158,0.69878,0.6038
COTAN,0.680167,0.471733,0.691555,0.669149,0.553997


------------------------------
PBMC4 - matching celltypist labels



is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead



Unnamed: 0,NMI,ARI,homogeneity,completeness,fowlkes_mallows
monocle,0.672294,0.380324,0.739909,0.616002,0.482458
scanpy,0.712517,0.448522,0.798838,0.643032,0.547478
scvi-tools,0.740049,0.474791,0.825454,0.67066,0.571643
seurat,0.758228,0.476344,0.854145,0.681679,0.575419
COTAN,0.711669,0.458362,0.742837,0.683011,0.547341


------------------------------
PBMC1 - matching antibody labels



is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead



Unnamed: 0,NMI,ARI,homogeneity,completeness,fowlkes_mallows
monocle,0.715777,0.633217,0.707203,0.724563,0.698798
scanpy,0.736466,0.645073,0.769273,0.706342,0.707927
scvi-tools,0.746355,0.650567,0.757864,0.735189,0.712214
seurat,0.739813,0.640616,0.767621,0.713949,0.704048
COTAN,0.704411,0.599209,0.706965,0.701876,0.669953


------------------------------
PBMC2 - matching antibody labels



is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead



Unnamed: 0,NMI,ARI,homogeneity,completeness,fowlkes_mallows
monocle,0.585175,0.443933,0.599091,0.571891,0.53105
scanpy,0.739559,0.636764,0.742414,0.736726,0.69561
scvi-tools,0.666315,0.562966,0.694832,0.640047,0.632909
seurat,0.763984,0.764614,0.773986,0.754238,0.803575
COTAN,0.721591,0.637992,0.687205,0.7596,0.710348


------------------------------
PBMC3 - matching antibody labels



is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead



Unnamed: 0,NMI,ARI,homogeneity,completeness,fowlkes_mallows
monocle,0.633333,0.503352,0.609373,0.659256,0.567586
scanpy,0.737439,0.695008,0.736412,0.738468,0.732379
scvi-tools,0.712286,0.63831,0.692394,0.733355,0.684119
seurat,0.760367,0.695131,0.769539,0.751412,0.732928
COTAN,0.710483,0.621643,0.67424,0.750844,0.682058


------------------------------
PBMC4 - matching antibody labels



is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead



Unnamed: 0,NMI,ARI,homogeneity,completeness,fowlkes_mallows
monocle,0.606252,0.394823,0.583563,0.630776,0.467547
scanpy,0.684939,0.495584,0.645785,0.729147,0.560952
scvi-tools,0.673692,0.484543,0.629449,0.724624,0.551156
seurat,0.687849,0.514398,0.665613,0.711622,0.574629
COTAN,0.629502,0.410868,0.575363,0.694887,0.494384
