In [1]:
import pandas as pd
import numpy as np
import random as rn

In [2]:
# set the seed for reproducibility
# Seeds NumPy's legacy global RNG and the standard-library `random` module with the same value.
sd = 1234
np.random.seed(sd)
rn.seed(sd)
# NOTE(review): CPython reads PYTHONHASHSEED when the interpreter starts, so setting it
# from a running kernel presumably only reaches child processes and does not change
# str/bytes hashing in this session — confirm whether it must be exported before
# launching Jupyter.
%env PYTHONHASHSEED=0


env: PYTHONHASHSEED=0


In [3]:
def get_accuracy(annotation_df, correct_id_col='real_cellID', predicted_id_col='predicted_cellID'):
    """Return the fraction of rows whose predicted label equals the correct label.

    Parameters
    ----------
    annotation_df : pandas.DataFrame
        One row per annotated item; must contain both label columns.
    correct_id_col : str
        Name of the column holding the reference labels.
    predicted_id_col : str
        Name of the column holding the predicted labels.

    Returns
    -------
    float
        Number of matching rows divided by the total number of rows, or NaN
        when ``annotation_df`` has no rows (the ratio is undefined there).
    """
    num_cells = annotation_df.shape[0]
    if num_cells == 0:
        # An empty table has no defined accuracy; report NaN instead of raising
        # ZeroDivisionError so one empty subset does not abort a whole sweep.
        return float('nan')
    is_correct = annotation_df[correct_id_col] == annotation_df[predicted_id_col]
    # int(...) keeps the result a plain Python float rather than a NumPy scalar.
    return int(is_correct.sum()) / num_cells

In [4]:
def get_balanced_accuracy(annotation_df, correct_id_col='real_cellID', predicted_id_col='predicted_cellID'):
    """Return the unweighted mean of the per-class hit rates.

    For every distinct value in ``correct_id_col`` (visited in sorted order) the
    hit rate is the share of that class's rows whose ``predicted_id_col`` value
    equals the class label. The result is the sum of those rates divided by the
    number of classes, so every class contributes equally regardless of size.
    """
    truth = annotation_df[correct_id_col]

    def _class_hit_rate(label):
        # Rows that truly belong to `label`, then the subset predicted as `label`.
        members = annotation_df[truth == label]
        hits = members[members[predicted_id_col] == label]
        return hits.shape[0] / members.shape[0]

    hit_rates = [_class_hit_rate(label) for label in sorted(set(truth))]
    return np.sum(hit_rates) / len(hit_rates)

In [5]:
def get_cluster_accuracy(annotation_df, cluster_id_column ='clust',correct_id_col='real_cellID', predicted_id_col='predicted_cellID', size_threshold=10):
    """Return the fraction of clusters whose majority predicted label matches the majority correct label.

    Parameters
    ----------
    annotation_df : pandas.DataFrame
        One row per cell; must contain the cluster column and both label columns.
    cluster_id_column : str
        Column holding the cluster assignment of each row.
    correct_id_col : str
        Column holding the reference labels.
    predicted_id_col : str
        Column holding the predicted labels.
    size_threshold : int, default 10
        Only clusters with strictly more than this many rows are evaluated.

    Returns
    -------
    float
        Matching clusters divided by evaluated clusters, or NaN when no cluster
        exceeds ``size_threshold``.

    Notes
    -----
    Ties for the most frequent label are broken by taking the smallest label in
    sort order. A cluster in which either label column has no non-missing value
    is counted as a mismatch.
    """
    # Discard small clusters up front so they influence neither numerator nor denominator.
    kept = annotation_df.groupby(cluster_id_column).filter(lambda grp: len(grp) > size_threshold)
    cluster_ids = sorted(set(kept[cluster_id_column]))
    if not cluster_ids:
        # Nothing left to score: the ratio is undefined, so report NaN rather
        # than raising ZeroDivisionError.
        return float('nan')

    num_matching = 0
    for cluster_id in cluster_ids:
        cluster_sub = kept[kept[cluster_id_column] == cluster_id]
        real_id = _majority_label(cluster_sub[correct_id_col])
        predicted_id = _majority_label(cluster_sub[predicted_id_col])
        if real_id is not None and predicted_id is not None and real_id == predicted_id:
            num_matching += 1
    return num_matching / len(cluster_ids)


def _majority_label(labels):
    """Return the most frequent non-missing value of ``labels`` (ties: smallest label), or None if there is none."""
    counts = labels.value_counts()
    if counts.empty:
        return None
    # lexsort treats the LAST key as primary: highest count first, then label order for ties.
    order = np.lexsort((counts.index, -counts.values))
    return counts.index[order[0]]

In [6]:
# Folder (relative to the notebook's working directory) that the label-transfer
# metadata CSV is read from.
data_dir = "../../../results/Fig1_Fig2_Fig3_SFig1-FACS_BM_scATAC/"

# NOTE(review): leiden_key does not appear to be referenced by any other cell shown
# in this notebook — confirm whether it is still needed.
leiden_key = 'leiden_0.5'

In [7]:
# Load the label-transfer metadata, keep the three columns of interest, and use the
# cell identifier both as a regular column and as the (unnamed) row index.
labelTransfer_annotations = (
    pd.read_csv(data_dir+"Fig3-apply_seurat_label_transfer/buenrostro2018_Seurat4_labeltransfer_metadata.csv", header=0)
    [['Unnamed: 0','orig.ident', 'predicted.celltype.l2']]
    .rename(columns={'Unnamed: 0': 'cell_IDs'})
    .set_index('cell_IDs', drop=False)
)
labelTransfer_annotations.index.name = None
labelTransfer_annotations.head()

Unnamed: 0,cell_IDs,orig.ident,predicted.celltype.l2
CLP_0,CLP_0,CLP,gdT
CLP_1,CLP_1,CLP,Memory B
CMP_0,CMP_0,CMP,CD4 Naive
CMP_1,CMP_1,CMP,CD14 Mono
CMP_2,CMP_2,CMP,CD14 Mono


In [8]:
# Fine-grained label-transfer label -> simplified label used for the comparison.
# Each pair sits on one line so a label cannot drift out of step with its target.
seurat_l2_annotations_to_simplified_dict = {
    'CD4 Naive': 'CD4Tcell',
    'Prog_B 1': 'CLP',
    'CD4 Memory': 'CD4Tcell',
    'Prog_RBC': 'MEP',
    'CD8 Memory_2': 'CD8Tcell',
    'Memory B': 'Bcell',
    'NK': 'NKcell',
    'CD14 Mono': 'Mono',
    'CD8 Naive': 'CD8Tcell',
    'CD8 Effector_2': 'CD8Tcell',
    'Naive B': 'Bcell',
    'HSC': 'HSC',
    'MAIT': 'MAIT',
    'pDC': 'pDC',
    'GMP': 'GMP',
    'CD16 Mono': 'Mono',
    'gdT': 'gdT',
    'Prog_B 2': 'CLP',
    'LMPP': 'LMPP',
    'CD8 Effector_1': 'CD8Tcell',
    'Prog_DC': 'GMP',
    'Plasmablast': 'Plasmablast',
    'cDC2': 'cDC2',
    'Prog_Mk': 'MEP',
    'CD56 bright NK': 'NKcell',
    'CD8 Memory_1': 'CD8Tcell',
}

# Parallel lists kept under their original names, in the mapping's insertion order.
seurat_l2_annotations = list(seurat_l2_annotations_to_simplified_dict.keys())
seurat_l2_annotations_simplified = list(seurat_l2_annotations_to_simplified_dict.values())

print(seurat_l2_annotations_to_simplified_dict)



{'CD4 Naive': 'CD4Tcell', 'Prog_B 1': 'CLP', 'CD4 Memory': 'CD4Tcell', 'Prog_RBC': 'MEP', 'CD8 Memory_2': 'CD8Tcell', 'Memory B': 'Bcell', 'NK': 'NKcell', 'CD14 Mono': 'Mono', 'CD8 Naive': 'CD8Tcell', 'CD8 Effector_2': 'CD8Tcell', 'Naive B': 'Bcell', 'HSC': 'HSC', 'MAIT': 'MAIT', 'pDC': 'pDC', 'GMP': 'GMP', 'CD16 Mono': 'Mono', 'gdT': 'gdT', 'Prog_B 2': 'CLP', 'LMPP': 'LMPP', 'CD8 Effector_1': 'CD8Tcell', 'Prog_DC': 'GMP', 'Plasmablast': 'Plasmablast', 'cDC2': 'cDC2', 'Prog_Mk': 'MEP', 'CD56 bright NK': 'NKcell', 'CD8 Memory_1': 'CD8Tcell'}


In [9]:
# Translate each fine-grained prediction to its simplified label; labels missing from
# the mapping become NaN. The values are handed over as a plain list, so they are
# attached by position rather than by index alignment.
simplified_predictions = list(
    labelTransfer_annotations['predicted.celltype.l2'].map(seurat_l2_annotations_to_simplified_dict)
)
labelTransfer_annotations = labelTransfer_annotations.assign(
    labelTransfer_predicted_celltype_l2_simplified=simplified_predictions
)

In [10]:
# Display the label-transfer table to inspect its current columns.
labelTransfer_annotations

Unnamed: 0,cell_IDs,orig.ident,predicted.celltype.l2,labelTransfer_predicted_celltype_l2_simplified
CLP_0,CLP_0,CLP,gdT,gdT
CLP_1,CLP_1,CLP,Memory B,Bcell
CMP_0,CMP_0,CMP,CD4 Naive,CD4Tcell
CMP_1,CMP_1,CMP,CD14 Mono,Mono
CMP_2,CMP_2,CMP,CD14 Mono,Mono
...,...,...,...,...
LMPP_91,LMPP_91,LMPP,CD8 Naive,CD8Tcell
LMPP_92,LMPP_92,LMPP,CD14 Mono,Mono
LMPP_93,LMPP_93,LMPP,Naive B,Bcell
LMPP_94,LMPP_94,LMPP,Memory B,Bcell


In [11]:
# Relabel all four columns by position; the list must match the current column
# order and count (a length mismatch raises ValueError).
labelTransfer_annotations = labelTransfer_annotations.set_axis(
    ["cell_IDs","ground_truth", "labelTransfer_predicted", "labelTransfer_predicted_simplified"],
    axis=1,
)
labelTransfer_annotations.head()

Unnamed: 0,cell_IDs,ground_truth,labelTransfer_predicted,labelTransfer_predicted_simplified
CLP_0,CLP_0,CLP,gdT,gdT
CLP_1,CLP_1,CLP,Memory B,Bcell
CMP_0,CMP_0,CMP,CD4 Naive,CD4Tcell
CMP_1,CMP_1,CMP,CD14 Mono,Mono
CMP_2,CMP_2,CMP,CD14 Mono,Mono


In [12]:
# Number of cells per ground-truth label, most frequent first.
labelTransfer_annotations.ground_truth.value_counts()

ground_truth
CMP     661
HSC     448
GMP     274
MEP     208
MPP     184
CLP     146
LMPP     96
Name: count, dtype: int64

In [13]:
# Resolutions 0.1 through 2.5 in steps of 0.1, as one-decimal strings ('0.1', ..., '2.5').
leiden_resolutions = [f"{tenths / 10:.1f}" for tenths in range(1, 26)]

In [14]:
# Empty results table: one row per resolution, labelled 'leiden_<resolution>';
# columns are added later as values are assigned.
resolution_row_labels = [f"leiden_{resolution}" for resolution in leiden_resolutions]
acc_comparisons = pd.DataFrame(index=resolution_row_labels)

In [15]:
# (output-column prefix, prediction column) for the two annotation sources being compared.
prediction_sources = (
    ('scATAcat', 'scATAcat_annotation'),
    ('labelTransfer', 'labelTransfer_predicted_simplified'),
)

for res in leiden_resolutions:
    res_key = 'leiden_' + res

    # Per-resolution annotations; the three data columns are renamed by position.
    result_path = "../../../results/Supplementary_tables/ST3-FACS_BM_scATAC_accuracy_acros_clustering_res/res"+ res + "/outputs/scATAcat_annotations_leiden_" + res +".csv"
    scATAcat_annotations = pd.read_csv(result_path, index_col=0)
    scATAcat_annotations.columns = [res_key, 'scATAcat_annotation' ,'ground_truth_annotations']
    scATAcat_annotations['cell_IDs'] = scATAcat_annotations.index
    scATAcat_annotations.index.name = None

    # Restrict both tables to the cells they share, then join them on the cell identifier.
    common_cells = list(set(scATAcat_annotations.index)&set(labelTransfer_annotations.index))
    scATAcat_labelTransfer_groundTruth_commonCells = pd.merge(
        scATAcat_annotations.loc[common_cells,],
        labelTransfer_annotations.loc[common_cells,],
        on="cell_IDs",
    )
    merged = scATAcat_labelTransfer_groundTruth_commonCells

    # Keep only cells whose ground-truth label occurs in all three label sets.
    common_annotations = list(
        set(merged.scATAcat_annotation)
        & set(merged.labelTransfer_predicted_simplified)
        & set(merged.ground_truth_annotations)
    )
    scATAcat_labelTransfer_groundTruth_commonCells_commonAnnotations = merged[merged.ground_truth_annotations.isin(common_annotations)]
    evaluated = scATAcat_labelTransfer_groundTruth_commonCells_commonAnnotations

    # Largest cluster id plus one.
    # NOTE(review): equals the cluster count only if ids are consecutive integers starting at 0 — confirm.
    acc_comparisons.loc[res_key, 'num_clust'] = scATAcat_annotations[res_key].max()+1

    # Metrics outermost so result columns are created in the order Acc, BAcc, CAcc,
    # each with scATAcat before labelTransfer.
    for metric_suffix, metric_fn, extra_kwargs in (
        ('Acc', get_accuracy, {}),
        ('BAcc', get_balanced_accuracy, {}),
        ('CAcc', get_cluster_accuracy, {'cluster_id_column': res_key}),
    ):
        for source_prefix, predicted_col in prediction_sources:
            acc_comparisons.loc[res_key, source_prefix + '_' + metric_suffix] = metric_fn(
                evaluated,
                correct_id_col='ground_truth_annotations',
                predicted_id_col=predicted_col,
                **extra_kwargs,
            )

In [16]:
# Display the results table (one row per resolution, one column per recorded value).
acc_comparisons

Unnamed: 0,num_clust,scATAcat_Acc,labelTransfer_Acc,scATAcat_BAcc,labelTransfer_BAcc,scATAcat_CAcc,labelTransfer_CAcc
leiden_0.1,3.0,0.954853,0.327314,0.956543,0.37175,0.666667,0.333333
leiden_0.2,5.0,0.947862,0.173097,0.953281,0.231082,0.8,0.2
leiden_0.3,5.0,0.947862,0.173097,0.953281,0.231082,0.8,0.2
leiden_0.4,5.0,0.952813,0.297641,0.957817,0.306475,1.0,0.333333
leiden_0.5,7.0,0.947862,0.173097,0.953657,0.231082,0.833333,0.166667
leiden_0.6,8.0,0.916429,0.163343,0.914766,0.197631,0.857143,0.142857
leiden_0.7,8.0,0.916429,0.163343,0.914766,0.197631,0.857143,0.142857
leiden_0.8,9.0,0.91453,0.163343,0.913786,0.197631,0.857143,0.142857
leiden_0.9,10.0,0.91453,0.163343,0.913786,0.197631,0.875,0.25
leiden_1.0,10.0,0.868946,0.163343,0.889356,0.197631,0.777778,0.222222


In [17]:
# Write the results table, including its row labels, to the supplementary-table CSV.
supplementary_table_csv = '../../../results/Supplementary_tables/ST3-FACS_BM_scATAC_accuracy_acros_clustering_res/ST3-FACS_BM_scATAC_accuracy_acros_clustering_res.csv'
acc_comparisons.to_csv(supplementary_table_csv)