In [6]:
import pandas as pd
import scanpy as sc
import numpy as np
from sklearn.metrics.cluster import normalized_mutual_info_score, adjusted_rand_score
from sklearn.metrics import homogeneity_score, completeness_score, v_measure_score, fowlkes_mallows_score, silhouette_score
from utils import sankey_plot

In [9]:
# Root folder of all inputs/outputs; the trailing slash matters because
# paths below are built by plain string concatenation.
DIR = "./data/"
# Sub-folder of DIR holding this dataset's files.
DATASET_NAME = "E175"
# Clustering tools that are scored; each one is expected to provide a
# `<DIR><DATASET_NAME>/<tool>/<tuning>/clustering_labels.csv` file.
TOOLS = [
    "monocle",
    "seurat",
    "scanpy",
    "scvi-tools",
]
# Parameter-tuning variants to evaluate, one loop iteration each.
PARAMS_TUNING = [
    "default",
    "celltypist",
]

In [10]:
def compute_scores(dir, dataset, labels_df, labels_matched, ground_truth_labels):
    """Score each tool's clustering against a reference labelling and save the table.

    For every tool listed in the module-level ``TOOLS`` the column
    ``cluster_<tool>`` of ``labels_df`` is compared with the reference column
    ``cluster_<ground_truth_labels>`` using NMI, ARI, homogeneity,
    completeness, V-measure and Fowlkes-Mallows.  The resulting table (one row
    per tool, one column per metric) is written as CSV and LaTeX to
    ``<dir>/<dataset>/scores_<labels_matched>_<ground_truth_labels>.{csv,tex}``
    and shown with ``display``.

    Parameters
    ----------
    dir : str
        Root data directory (the name shadows the ``dir`` builtin, but it is
        kept because it is part of the signature callers use).
    dataset : str
        Dataset sub-folder inside ``dir``.
    labels_df : pandas.DataFrame
        Must contain ``cluster_<tool>`` for every tool in ``TOOLS`` and the
        ``cluster_<ground_truth_labels>`` reference column.
    labels_matched : str
        Tag used only in the output file name.
    ground_truth_labels : str
        Suffix of the reference column; also used in the output file name.

    Returns
    -------
    None

    Notes
    -----
    Relies on ``display`` being available in the session, as it is in
    IPython/Jupyter.
    """
    import os  # local import so this cell does not depend on the imports cell

    # (metric function, extra keyword arguments) keyed by output column name.
    # The insertion order fixes the column order of the saved table.
    metrics = {
        'NMI': (normalized_mutual_info_score, {'average_method': 'arithmetic'}),
        'ARI': (adjusted_rand_score, {}),
        'homogeneity': (homogeneity_score, {}),
        'completeness': (completeness_score, {}),
        'v_measure': (v_measure_score, {}),
        'fowlkes_mallows': (fowlkes_mallows_score, {}),
    }

    # Look the reference column up once instead of once per metric and tool.
    truth = labels_df[f'cluster_{ground_truth_labels}']

    scores = {
        metric_name: {
            tool: metric(labels_true=truth, labels_pred=labels_df['cluster_' + tool], **extra)
            for tool in TOOLS
        }
        for metric_name, (metric, extra) in metrics.items()
    }
    scores_df = pd.DataFrame(scores)

    # os.path.join gives the same path as the old string concatenation when
    # `dir` ends with a separator, and a valid one when it does not.
    out_stem = os.path.join(dir, dataset, f'scores_{labels_matched}_{ground_truth_labels}')
    scores_df.to_csv(out_stem + '.csv')
    scores_df.to_latex(out_stem + '.tex')
    display(scores_df)

In [16]:
# Read the expression matrix once: it does not depend on `tuning`, so there is
# no need to parse the CSV again on every iteration.
adata_all = sc.read_csv(
    f'{DIR}{DATASET_NAME}/CorticalCells_GSM2861514_E175_cleaned.csv',
    first_column_names=None
)
# Transposed so that `obs_names` can be matched against the cell index of the
# label tables below.
adata_all = adata_all.T
adata_all.var_names_make_unique()

for tuning in PARAMS_TUNING:
    title = f'{DATASET_NAME} - matching {tuning} labels' if tuning != 'default' else f'{DATASET_NAME} - default labels'
    print('------------------------------')
    print(title)

    # concat tools labels: start from the COTAN table and inner-join every
    # other tool on the cell identifier, one `cluster_<tool>` column per tool
    labels_df = pd.read_csv(f'{DIR}{DATASET_NAME}/COTAN/{tuning}/clustering_labels.csv', index_col=0)
    labels_df = labels_df.rename(columns={"cluster": "cluster_COTAN"})
    for tool in [t for t in TOOLS if t != 'COTAN']:
        tool_labels_df = pd.read_csv(f'{DIR}{DATASET_NAME}/{tool}/{tuning}/clustering_labels.csv', index_col=0)
        labels_df = labels_df.merge(tool_labels_df, how='inner', on='cell')
        labels_df = labels_df.rename(columns={"cluster": f"cluster_{tool}"})

    # load and concat celltypist labels
    celltypist_df = pd.read_csv(f'{DIR}{DATASET_NAME}/E17_5_Devel_Mouse_Brain_predicted_labels.csv', index_col=0)
    celltypist_df = labels_df.merge(celltypist_df, how='inner', on='cell')
    celltypist_df = celltypist_df.rename(columns={"predicted_labels": "cluster_celltypist"})

    display(labels_df)

    # keep only labelled cells
    labelled_mask = adata_all.obs_names.isin(labels_df.index)
    adata = adata_all[labelled_mask, :]
    # silhouette_score pairs row i of X with label i, so the labels have to be
    # put in the same cell order as the rows of `adata` before scoring.
    labels_aligned = labels_df.loc[adata.obs_names]

    # compute silhouette score
    silhouette = {}
    for tool in TOOLS:
        try:
            silhouette[tool] = silhouette_score(adata.X, labels_aligned[f'cluster_{tool}'].to_numpy())
        except Exception as err:
            # Best effort: e.g. a tool that produced a single cluster makes
            # silhouette_score raise. Record NaN, but say why.
            print(f'silhouette score not available for {tool} ({tuning}): {err}')
            silhouette[tool] = np.nan
    if tuning == 'celltypist':
        # Work on a Series indexed by cell whether the merge kept 'cell' as the
        # index or as a column, then score only cells that have a celltypist
        # label, in the row order of `adata`.
        celltypist_labels = celltypist_df.reset_index().set_index('cell')['cluster_celltypist']
        typed_mask = adata.obs_names.isin(celltypist_labels.index)
        silhouette['celltypist'] = silhouette_score(
            adata[typed_mask, :].X,
            celltypist_labels.loc[adata.obs_names[typed_mask]].to_numpy()
        )
    silhouette_df = pd.DataFrame(silhouette, index=[0])
    silhouette_df.to_csv(f'{DIR}{DATASET_NAME}/{tuning}_silhouette.csv')
    silhouette_df.to_latex(f'{DIR}{DATASET_NAME}/{tuning}_silhouette.tex')

    # compute scores comparing each tool labels with celltypist labels
    if tuning in ('celltypist', 'default'):
        compute_scores(DIR, DATASET_NAME, celltypist_df, tuning, 'celltypist')
        # one label list per Sankey column: every tool, then celltypist
        labels = [celltypist_df[f'cluster_{tool}'].to_list() for tool in TOOLS]
        labels.append(celltypist_df['cluster_celltypist'].to_list())
        labels_titles = list(TOOLS) + ['celltypist']
        sankey_plot(
            labels=labels,
            labels_titles=labels_titles,
            title=title, path=f'{DIR}{DATASET_NAME}/{tuning}_celltypist.html',
            height=550,
            width=1400
        )

------------------------------
E175 - default labels


Unnamed: 0_level_0,cluster_COTAN,cluster_monocle,cluster_seurat,cluster_scanpy,cluster_scvi-tools
cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CGTTTAGTTTAC,1,1,5,7,4
TCTAGAACAACG,3,1,4,4,9
ACCTTTGTTCGT,10,1,4,5,1
TAAAATATCGCC,5,1,4,8,5
GTACCCTATTTC,5,1,4,8,5
...,...,...,...,...,...
ACTAAGCCGCTT,7,1,5,7,4
CTTAGGGGGGCT,2,1,3,9,10
GCTATCTGGTTG,8,1,2,6,1
CACTATTGTCAA,2,1,3,3,5


Unnamed: 0,NMI,ARI,homogeneity,completeness,v_measure,fowlkes_mallows
monocle,0.0,0.0,0.0,1.0,0.0,0.811093
seurat,0.314587,0.102977,0.432556,0.247176,0.314587,0.448201
scanpy,0.287347,0.049438,0.497811,0.201962,0.287347,0.309711
scvi-tools,0.255927,0.03287,0.413885,0.185234,0.255927,0.319597


------------------------------
E175 - matching celltypist labels


Unnamed: 0_level_0,cluster_COTAN,cluster_monocle,cluster_seurat,cluster_scanpy,cluster_scvi-tools
cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CGTTTAGTTTAC,1,1,1,7,1
TCTAGAACAACG,3,2,7,4,22
ACCTTTGTTCGT,10,9,18,5,11
TAAAATATCGCC,5,2,10,8,17
GTACCCTATTTC,5,2,10,8,17
...,...,...,...,...,...
ACTAAGCCGCTT,7,1,1,7,1
CTTAGGGGGGCT,2,11,12,9,24
GCTATCTGGTTG,8,12,5,6,25
CACTATTGTCAA,2,2,14,3,17


Unnamed: 0,NMI,ARI,homogeneity,completeness,v_measure,fowlkes_mallows
monocle,0.233652,0.011822,0.469499,0.155526,0.233652,0.21131
seurat,0.247473,0.010727,0.513258,0.163043,0.247473,0.197565
scanpy,0.287347,0.049438,0.497811,0.201962,0.287347,0.309711
scvi-tools,0.231802,0.005674,0.497821,0.151073,0.231802,0.176757
