In [1]:
import pandas as pd
import numpy as np

In [2]:
def get_accuracy(annotation_df, correct_id_col='real_cellID', predicted_id_col='predicted_cellID'):
    num_correct_ann= annotation_df[annotation_df[correct_id_col]==annotation_df[predicted_id_col]].shape[0]
    acc=num_correct_ann/annotation_df.shape[0]
    print("accuracy:")
    return acc

In [3]:
def get_balanced_accuracy(annotation_df, correct_id_col='real_cellID', predicted_id_col='predicted_cellID'):
    ann=[]
    for cell_type in sorted(set(annotation_df[correct_id_col])):
        cell_type_sub = annotation_df[annotation_df[correct_id_col]==cell_type]
        print(cell_type)
        num_cells =cell_type_sub.shape[0]
        print("predicted cell ID")
        num_correct_ann = cell_type_sub[cell_type_sub[predicted_id_col]==cell_type].shape[0]
        #print(cell_type_sub.predicted_cellID.value_counts())
        perc=(num_correct_ann/num_cells)
        print(perc)
        ann.append(perc)
    balanced_accuracy= np.sum(ann)/len(ann)
    print("balanced accuracy:")
    print(balanced_accuracy)
    return balanced_accuracy

In [4]:
def get_cluster_accuracy(annotation_df, cluster_id_column ='clust',correct_id_col='real_cellID', predicted_id_col='predicted_cellID'):
    correc_ann_counter=0
    annotation_df = annotation_df.groupby(cluster_id_column).filter(lambda x: len(x) > 10)
    cluster_ids =sorted(set(annotation_df[cluster_id_column]))
    for cluster_id in cluster_ids:
        cluster_sub = annotation_df[annotation_df[cluster_id_column]==cluster_id]   
        clust_real_id_counts = cluster_sub[correct_id_col].value_counts()
        # sort fist by number then by index, then get the cell type with the highest occurance. This takes care of ties
        clust_real_id = clust_real_id_counts.iloc[np.lexsort((clust_real_id_counts.index, -clust_real_id_counts.values))].index[0]
        clust_predicted_id_counts = cluster_sub[predicted_id_col].value_counts()
        # sort fist by number then by index, then get the cell type with the highest occurance. This takes care of ties
        clust_predicted_id = clust_predicted_id_counts.iloc[np.lexsort((clust_predicted_id_counts.index, -clust_predicted_id_counts.values))].index[0]
        if clust_real_id == clust_predicted_id:
            correc_ann_counter += 1
        else:
            continue
    clust_accuracy= correc_ann_counter/len(cluster_ids)
    print("cluster accuracy:")
    return clust_accuracy

In [5]:
data_dir = "../../../results/Fig1_Fig2_Fig3_SFig1-FACS_BM_scATAC/"

leiden_key = 'leiden_0.6'

In [6]:
labelTransfer_annotations = pd.read_csv(data_dir+"Fig3-apply_seurat_label_transfer/buenrostro2018_Seurat4_labeltransfer_metadata.csv", header=0)
labelTransfer_annotations = labelTransfer_annotations[['Unnamed: 0','orig.ident', 'predicted.celltype.l2']]
labelTransfer_annotations.columns = ['cell_IDs','orig.ident', 'predicted.celltype.l2']
labelTransfer_annotations.index = labelTransfer_annotations.cell_IDs
labelTransfer_annotations.index.name = None
labelTransfer_annotations.head()

Unnamed: 0,cell_IDs,orig.ident,predicted.celltype.l2
CLP_0,CLP_0,CLP,gdT
CLP_1,CLP_1,CLP,Memory B
CMP_0,CMP_0,CMP,CD4 Naive
CMP_1,CMP_1,CMP,CD14 Mono
CMP_2,CMP_2,CMP,CD14 Mono


In [7]:
seurat_l2_annotations = ['CD4 Naive','Prog_B 1','CD4 Memory','Prog_RBC','CD8 Memory_2','Memory B','NK','CD14 Mono','CD8 Naive','CD8 Effector_2','Naive B','HSC','MAIT','pDC','GMP','CD16 Mono','gdT','Prog_B 2','LMPP','CD8 Effector_1','Prog_DC','Plasmablast','cDC2','Prog_Mk','CD56 bright NK','CD8 Memory_1']

seurat_l2_annotations_simplified  =['CD4Tcell' , "CLP", 'CD4Tcell', "MEP", "CD8Tcell", "Bcell", "NKcell", "Mono", "CD8Tcell", "CD8Tcell", "Bcell", 'HSC', 'MAIT','pDC','GMP', "Mono", 'gdT', 'CLP', 'LMPP', 'CD8Tcell', "GMP", 'Plasmablast', 'cDC2', "MEP", "NKcell", 'CD8Tcell']

seurat_l2_annotations_to_simplified_dict = dict(zip(seurat_l2_annotations, seurat_l2_annotations_simplified))

print(seurat_l2_annotations_to_simplified_dict)



{'CD4 Naive': 'CD4Tcell', 'Prog_B 1': 'CLP', 'CD4 Memory': 'CD4Tcell', 'Prog_RBC': 'MEP', 'CD8 Memory_2': 'CD8Tcell', 'Memory B': 'Bcell', 'NK': 'NKcell', 'CD14 Mono': 'Mono', 'CD8 Naive': 'CD8Tcell', 'CD8 Effector_2': 'CD8Tcell', 'Naive B': 'Bcell', 'HSC': 'HSC', 'MAIT': 'MAIT', 'pDC': 'pDC', 'GMP': 'GMP', 'CD16 Mono': 'Mono', 'gdT': 'gdT', 'Prog_B 2': 'CLP', 'LMPP': 'LMPP', 'CD8 Effector_1': 'CD8Tcell', 'Prog_DC': 'GMP', 'Plasmablast': 'Plasmablast', 'cDC2': 'cDC2', 'Prog_Mk': 'MEP', 'CD56 bright NK': 'NKcell', 'CD8 Memory_1': 'CD8Tcell'}


In [8]:
labelTransfer_annotations = labelTransfer_annotations.assign(labelTransfer_predicted_celltype_l2_simplified=list(labelTransfer_annotations['predicted.celltype.l2'].map(seurat_l2_annotations_to_simplified_dict)))

In [9]:
labelTransfer_annotations

Unnamed: 0,cell_IDs,orig.ident,predicted.celltype.l2,labelTransfer_predicted_celltype_l2_simplified
CLP_0,CLP_0,CLP,gdT,gdT
CLP_1,CLP_1,CLP,Memory B,Bcell
CMP_0,CMP_0,CMP,CD4 Naive,CD4Tcell
CMP_1,CMP_1,CMP,CD14 Mono,Mono
CMP_2,CMP_2,CMP,CD14 Mono,Mono
...,...,...,...,...
LMPP_91,LMPP_91,LMPP,CD8 Naive,CD8Tcell
LMPP_92,LMPP_92,LMPP,CD14 Mono,Mono
LMPP_93,LMPP_93,LMPP,Naive B,Bcell
LMPP_94,LMPP_94,LMPP,Memory B,Bcell


In [10]:
labelTransfer_annotations.columns=["cell_IDs","ground_truth", "labelTransfer_predicted", "labelTransfer_predicted_simplified"]
labelTransfer_annotations.head()

Unnamed: 0,cell_IDs,ground_truth,labelTransfer_predicted,labelTransfer_predicted_simplified
CLP_0,CLP_0,CLP,gdT,gdT
CLP_1,CLP_1,CLP,Memory B,Bcell
CMP_0,CMP_0,CMP,CD4 Naive,CD4Tcell
CMP_1,CMP_1,CMP,CD14 Mono,Mono
CMP_2,CMP_2,CMP,CD14 Mono,Mono


In [11]:
labelTransfer_annotations.ground_truth.value_counts()

ground_truth
CMP     661
HSC     448
GMP     274
MEP     208
MPP     184
CLP     146
LMPP     96
Name: count, dtype: int64

In [12]:
scATAcat_annotations = pd.read_csv(data_dir +"/Fig1_SFig1-apply_scATAcat/apply_scATAcat_with_Corces2016_prototypes/outputs/scATAcat_annotations.csv")
scATAcat_annotations.columns = ['cell_IDs',leiden_key, 'scATAcat_annotation' ,'ground_truth_annotations']
scATAcat_annotations.index = scATAcat_annotations.cell_IDs
scATAcat_annotations.index.name = None
scATAcat_annotations.head()

Unnamed: 0,cell_IDs,leiden_0.6,scATAcat_annotation,ground_truth_annotations
CLP_1,CLP_1,6,CLP,CLP
CMP_0,CMP_0,5,CMP,CMP
CMP_1,CMP_1,5,CMP,CMP
CMP_2,CMP_2,0,CMP,CMP
CMP_3,CMP_3,3,MEP,CMP


In [13]:
scATAcat_annotations.shape

(1872, 4)

## combine the annotations for the common cells:


In [14]:
common_cells = list(set(scATAcat_annotations.index)&set(labelTransfer_annotations.index))

In [15]:
scATAcat_labelTransfer_groundTruth_commonCells= pd.merge(scATAcat_annotations.loc[common_cells,], labelTransfer_annotations.loc[common_cells,], on="cell_IDs")


In [16]:
scATAcat_labelTransfer_groundTruth_commonCells

Unnamed: 0,cell_IDs,leiden_0.6,scATAcat_annotation,ground_truth_annotations,ground_truth,labelTransfer_predicted,labelTransfer_predicted_simplified
0,MPP_94,1,HSC,MPP,MPP,CD8 Naive,CD8Tcell
1,MPP_68,0,CMP,MPP,MPP,CD4 Memory,CD4Tcell
2,MPP_124,0,CMP,MPP,MPP,CD4 Naive,CD4Tcell
3,MPP_105,0,CMP,MPP,MPP,CD14 Mono,Mono
4,CMP_131,0,CMP,CMP,CMP,Memory B,Bcell
...,...,...,...,...,...,...,...
1867,CLP_12,6,CLP,CLP,CLP,Prog_B 2,CLP
1868,HSC_470,4,HSC,HSC,HSC,CD14 Mono,Mono
1869,MEP_183,3,MEP,MEP,MEP,Prog_RBC,MEP
1870,MEP_27,3,MEP,MEP,MEP,CD8 Naive,CD8Tcell


## subset the ground truth to common annotations


In [17]:
common_annotations = list(set(scATAcat_labelTransfer_groundTruth_commonCells.scATAcat_annotation)&set(scATAcat_labelTransfer_groundTruth_commonCells.labelTransfer_predicted_simplified)&set(scATAcat_labelTransfer_groundTruth_commonCells.ground_truth_annotations))
common_annotations

['HSC', 'GMP', 'CLP', 'LMPP', 'MEP']

In [18]:
scATAcat_labelTransfer_groundTruth_commonCells_commonAnnotations = scATAcat_labelTransfer_groundTruth_commonCells[scATAcat_labelTransfer_groundTruth_commonCells.ground_truth_annotations.isin(common_annotations)]

In [19]:
scATAcat_labelTransfer_groundTruth_commonCells_commonAnnotations

Unnamed: 0,cell_IDs,leiden_0.6,scATAcat_annotation,ground_truth_annotations,ground_truth,labelTransfer_predicted,labelTransfer_predicted_simplified
6,LMPP_33,7,LMPP,LMPP,LMPP,CD4 Memory,CD4Tcell
7,GMP_202,2,GMP,GMP,GMP,NK,NKcell
8,CLP_190,6,CLP,CLP,CLP,Prog_B 2,CLP
10,CLP_140,6,CLP,CLP,CLP,Naive B,Bcell
11,GMP_270,2,GMP,GMP,GMP,NK,NKcell
...,...,...,...,...,...,...,...
1867,CLP_12,6,CLP,CLP,CLP,Prog_B 2,CLP
1868,HSC_470,4,HSC,HSC,HSC,CD14 Mono,Mono
1869,MEP_183,3,MEP,MEP,MEP,Prog_RBC,MEP
1870,MEP_27,3,MEP,MEP,MEP,CD8 Naive,CD8Tcell


## accuracy metrics for scATAcat:

In [20]:
get_accuracy(scATAcat_labelTransfer_groundTruth_commonCells_commonAnnotations, correct_id_col='ground_truth_annotations', predicted_id_col='scATAcat_annotation')

accuracy:


0.9164292497625831

In [21]:
get_balanced_accuracy(scATAcat_labelTransfer_groundTruth_commonCells_commonAnnotations, correct_id_col='ground_truth_annotations', predicted_id_col='scATAcat_annotation')

CLP
predicted cell ID
0.9722222222222222
GMP
predicted cell ID
0.8458498023715415
HSC
predicted cell ID
0.9362745098039216
LMPP
predicted cell ID
0.851063829787234
MEP
predicted cell ID
0.968421052631579
balanced accuracy:
0.9147662833632996


0.9147662833632996

In [22]:
get_cluster_accuracy(scATAcat_labelTransfer_groundTruth_commonCells_commonAnnotations,cluster_id_column =leiden_key, correct_id_col='ground_truth_annotations', predicted_id_col='scATAcat_annotation')

cluster accuracy:


0.8571428571428571

## accuracy metrics for label transfer:

In [23]:
get_accuracy(scATAcat_labelTransfer_groundTruth_commonCells_commonAnnotations, correct_id_col='ground_truth_annotations', predicted_id_col='labelTransfer_predicted_simplified')

accuracy:


0.16334283000949668

In [24]:
get_balanced_accuracy(scATAcat_labelTransfer_groundTruth_commonCells_commonAnnotations, correct_id_col='ground_truth_annotations', predicted_id_col='labelTransfer_predicted_simplified')

CLP
predicted cell ID
0.17592592592592593
GMP
predicted cell ID
0.05928853754940711
HSC
predicted cell ID
0.004901960784313725
LMPP
predicted cell ID
0.06382978723404255
MEP
predicted cell ID
0.6842105263157895
balanced accuracy:
0.19763134756189576


0.19763134756189576

In [25]:
get_cluster_accuracy(scATAcat_labelTransfer_groundTruth_commonCells_commonAnnotations,cluster_id_column =leiden_key, correct_id_col='ground_truth_annotations', predicted_id_col='labelTransfer_predicted_simplified')

cluster accuracy:


0.14285714285714285