In [1]:
import pandas as pd
import numpy as np
import random as rn

In [2]:
# set the seed for reproducibility
sd = 1234
np.random.seed(sd)
rn.seed(sd)
%env PYTHONHASHSEED=0


env: PYTHONHASHSEED=0


In [3]:
def get_accuracy(annotation_df, correct_id_col='real_cellID', predicted_id_col='predicted_cellID'):
    num_correct_ann= annotation_df[annotation_df[correct_id_col]==annotation_df[predicted_id_col]].shape[0]
    acc=num_correct_ann/annotation_df.shape[0]
    print("accuracy:")
    return acc

In [4]:
def get_balanced_accuracy(annotation_df, correct_id_col='real_cellID', predicted_id_col='predicted_cellID'):
    ann=[]
    for cell_type in sorted(set(annotation_df[correct_id_col])):
        cell_type_sub = annotation_df[annotation_df[correct_id_col]==cell_type]
        print(cell_type)
        num_cells =cell_type_sub.shape[0]
        print("predicted cell ID")
        num_correct_ann = cell_type_sub[cell_type_sub[predicted_id_col]==cell_type].shape[0]
        #print(cell_type_sub.predicted_cellID.value_counts())
        perc=(num_correct_ann/num_cells)
        print(perc)
        ann.append(perc)
    balanced_accuracy= np.sum(ann)/len(ann)
    print("balanced accuracy:")
    print(balanced_accuracy)
    return balanced_accuracy

In [5]:
def get_cluster_accuracy(annotation_df, cluster_id_column ='clust',correct_id_col='real_cellID', predicted_id_col='predicted_cellID'):
    correc_ann_counter=0
    annotation_df = annotation_df.groupby(cluster_id_column).filter(lambda x: len(x) > 10)
    cluster_ids =sorted(set(annotation_df[cluster_id_column]))
    for cluster_id in cluster_ids:
        cluster_sub = annotation_df[annotation_df[cluster_id_column]==cluster_id]   
        clust_real_id_counts = cluster_sub[correct_id_col].value_counts()
        # sort fist by number then by index, then get the cell type with the highest occurance. This takes care of ties
        clust_real_id = clust_real_id_counts.iloc[np.lexsort((clust_real_id_counts.index, -clust_real_id_counts.values))].index[0]
        clust_predicted_id_counts = cluster_sub[predicted_id_col].value_counts()
        # sort fist by number then by index, then get the cell type with the highest occurance. This takes care of ties
        clust_predicted_id = clust_predicted_id_counts.iloc[np.lexsort((clust_predicted_id_counts.index, -clust_predicted_id_counts.values))].index[0]
        if clust_real_id == clust_predicted_id:
            correc_ann_counter += 1
        else:
            continue
    clust_accuracy= correc_ann_counter/len(cluster_ids)
    print(correc_ann_counter)
    print("cluster accuracy:")
    return clust_accuracy

### ground truth annotations

In [6]:
NeurIPS_BM_s1d1_metadata = pd.read_csv("../../../data/NeurIPS_BM_scmultiome/03_cell_IDs/NeurIPS_BM_s1d1_metadata.csv", index_col=0)

In [7]:
NeurIPS_BM_s1d1_metadata["cellIDs"] = NeurIPS_BM_s1d1_metadata.index.str.split("-").str[:-1].str.join('-')

In [8]:
NeurIPS_BM_s1d1_metadata[["cellIDs", "cell_type"]]

Unnamed: 0,cellIDs,cell_type
TAGTTGTCACCCTCAC-1-s1d1,TAGTTGTCACCCTCAC-1,Naive CD20+ B
CTATGGCCATAACGGG-1-s1d1,CTATGGCCATAACGGG-1,CD14+ Mono
CCGCACACAGGTTAAA-1-s1d1,CCGCACACAGGTTAAA-1,CD8+ T
TCATTTGGTAATGGAA-1-s1d1,TCATTTGGTAATGGAA-1,CD8+ T
ACCACATAGGTGTCCA-1-s1d1,ACCACATAGGTGTCCA-1,CD16+ Mono
...,...,...
AGACCCGGTTATCCTA-1-s1d1,AGACCCGGTTATCCTA-1,CD4+ T activated
GACCTAAGTGCCTCAC-1-s1d1,GACCTAAGTGCCTCAC-1,pDC
AGCTCATAGCTATATG-1-s1d1,AGCTCATAGCTATATG-1,CD4+ T activated
TACGTACAGGAAACTG-1-s1d1,TACGTACAGGAAACTG-1,CD4+ T naive


In [9]:
NeurIPS_BM_s1d1_metadata.index = NeurIPS_BM_s1d1_metadata.cellIDs

In [10]:
NeurIPS_BM_s1d1_metadata = NeurIPS_BM_s1d1_metadata[["cellIDs", "cell_type"]]

In [11]:
NeurIPS_ann_rename_dict = {'B1 B': 'Bcell',
 'CD14+ Mono': 'Mono',
 'CD16+ Mono': 'Mono',
 'CD4+ T activated':"CD4Tcell",
 'CD4+ T naive':"CD4Tcell",
 'CD8+ T':"CD8Tcell",
 'Erythroblast':"Ery",
 'G/M prog':"GMP",
 'HSC':"HSC",
 'ID2-hi myeloid prog':"CMP",
 'ILC':"ILC",
 'Lymph prog':"CLP",
 'MK/E prog':"MEP",
 'NK':"NKcell",
 'Naive CD20+ B':"Bcell",
 'Normoblast':"Ery", #
 'Plasma cell':"Plasma cell",
 'Proerythroblast':"Ery",
 'Transitional B':"Bcell",
 'cDC2':"cDC2",
 'pDC':'pDC'}

In [12]:
NeurIPS_BM_s1d1_metadata['NeurIPS_ann_simp'] =list(NeurIPS_BM_s1d1_metadata["cell_type"].map(NeurIPS_ann_rename_dict))
NeurIPS_BM_s1d1_metadata.index.name = None
NeurIPS_BM_s1d1_metadata.columns=["cell_IDs", "ground_truth_annotations","ground_truth_annotations_simplified" ]
NeurIPS_BM_s1d1_metadata

Unnamed: 0,cell_IDs,ground_truth_annotations,ground_truth_annotations_simplified
TAGTTGTCACCCTCAC-1,TAGTTGTCACCCTCAC-1,Naive CD20+ B,Bcell
CTATGGCCATAACGGG-1,CTATGGCCATAACGGG-1,CD14+ Mono,Mono
CCGCACACAGGTTAAA-1,CCGCACACAGGTTAAA-1,CD8+ T,CD8Tcell
TCATTTGGTAATGGAA-1,TCATTTGGTAATGGAA-1,CD8+ T,CD8Tcell
ACCACATAGGTGTCCA-1,ACCACATAGGTGTCCA-1,CD16+ Mono,Mono
...,...,...,...
AGACCCGGTTATCCTA-1,AGACCCGGTTATCCTA-1,CD4+ T activated,CD4Tcell
GACCTAAGTGCCTCAC-1,GACCTAAGTGCCTCAC-1,pDC,pDC
AGCTCATAGCTATATG-1,AGCTCATAGCTATATG-1,CD4+ T activated,CD4Tcell
TACGTACAGGAAACTG-1,TACGTACAGGAAACTG-1,CD4+ T naive,CD4Tcell


### seurat annotations

In [13]:
seurat_labelTransfer_annotations = pd.read_csv("/project/scATAC_analysis/NeurIPS2021_BM_scmultiome/analysis/04_annotate_via_Seurat3_label_transfer/outs/bm_neurips_labelTransfer_predicted_annotations.csv", index_col = 0)
seurat_labelTransfer_annotations.head()

Unnamed: 0,predicted.celltype.l2,cellID
AAACAGCCAATTAAGG-1,CD4 Naive,AAACAGCCAATTAAGG-1
AAACAGCCAGGCTGTT-1,Prog_B 1,AAACAGCCAGGCTGTT-1
AAACATGCAAAGCGCA-1,CD4 Memory,AAACATGCAAAGCGCA-1
AAACATGCAATAACCT-1,Prog_RBC,AAACATGCAATAACCT-1
AAACATGCAATAATGG-1,CD8 Memory_2,AAACATGCAATAATGG-1


In [14]:
seurat_annotations = ['CD4 Naive', 'Prog_B 1', 'CD4 Memory', 'Prog_RBC', 'CD8 Memory_2', 'Memory B', 'NK', 'CD14 Mono', 'CD8 Naive', 'CD8 Effector_2', 'Naive B', 'HSC', 'MAIT', 'pDC', 'GMP', 'CD16 Mono', 'gdT', 'Prog_B 2', 'LMPP', 'CD8 Effector_1', 'Prog_DC', 'Plasmablast', 'cDC2', 'Prog_Mk', 'CD56 bright NK', 'CD8 Memory_1']
seurat_annotations_simplified = ['CD4Tcell', "CLP", 'CD4Tcell', "MEP", "CD8Tcell", "Bcell", "NKcell", "Mono", "CD8Tcell", "CD8Tcell", "Bcell", 'HSC', 'MAIT', 'pDC', 'GMP', "Mono", 'gdT', 'CLP', 'LMPP', 'CD8Tcell', "CMP", 'Plasmablast', 'cDC2', "MEP", "NKcell", 'CD8Tcell']

seurat_annotations_and_simplified_dict = dict(zip(seurat_annotations, seurat_annotations_simplified))

print(seurat_annotations_and_simplified_dict)


{'CD4 Naive': 'CD4Tcell', 'Prog_B 1': 'CLP', 'CD4 Memory': 'CD4Tcell', 'Prog_RBC': 'MEP', 'CD8 Memory_2': 'CD8Tcell', 'Memory B': 'Bcell', 'NK': 'NKcell', 'CD14 Mono': 'Mono', 'CD8 Naive': 'CD8Tcell', 'CD8 Effector_2': 'CD8Tcell', 'Naive B': 'Bcell', 'HSC': 'HSC', 'MAIT': 'MAIT', 'pDC': 'pDC', 'GMP': 'GMP', 'CD16 Mono': 'Mono', 'gdT': 'gdT', 'Prog_B 2': 'CLP', 'LMPP': 'LMPP', 'CD8 Effector_1': 'CD8Tcell', 'Prog_DC': 'CMP', 'Plasmablast': 'Plasmablast', 'cDC2': 'cDC2', 'Prog_Mk': 'MEP', 'CD56 bright NK': 'NKcell', 'CD8 Memory_1': 'CD8Tcell'}


In [15]:
seurat_labelTransfer_annotations['seurat_annotations_simp'] =list(seurat_labelTransfer_annotations["predicted.celltype.l2"].map(seurat_annotations_and_simplified_dict))
seurat_labelTransfer_annotations.columns=["label_transfer_annotations", "cell_IDs", "label_transfer_annotations_simplified"]

In [16]:
seurat_labelTransfer_annotations

Unnamed: 0,label_transfer_annotations,cell_IDs,label_transfer_annotations_simplified
AAACAGCCAATTAAGG-1,CD4 Naive,AAACAGCCAATTAAGG-1,CD4Tcell
AAACAGCCAGGCTGTT-1,Prog_B 1,AAACAGCCAGGCTGTT-1,CLP
AAACATGCAAAGCGCA-1,CD4 Memory,AAACATGCAAAGCGCA-1,CD4Tcell
AAACATGCAATAACCT-1,Prog_RBC,AAACATGCAATAACCT-1,MEP
AAACATGCAATAATGG-1,CD8 Memory_2,AAACATGCAATAATGG-1,CD8Tcell
...,...,...,...
TTTGTGTTCATTTGCT-1,CD4 Naive,TTTGTGTTCATTTGCT-1,CD4Tcell
TTTGTGTTCGACCTGA-1,Prog_RBC,TTTGTGTTCGACCTGA-1,MEP
TTTGTGTTCGCCTAAG-1,CD4 Naive,TTTGTGTTCGCCTAAG-1,CD4Tcell
TTTGTTGGTACGGTAC-1,CD4 Naive,TTTGTTGGTACGGTAC-1,CD4Tcell


### scATAcat annotations

In [17]:
scATAcat_annotations = pd.read_csv("../../../results/Fig5_SFig3-NeurIPS_BM_scmultiome/Fig5_SFig3-apply_scATAcat/outputs/scATAcat_annotations.csv", index_col=0 )
scATAcat_annotations.index.name = None
scATAcat_annotations.columns = ['leiden_1.0', 'scATAcat_annotation','cell_IDs']
scATAcat_annotations

Unnamed: 0,leiden_1.0,scATAcat_annotation,cell_IDs
TAGTTGTCACCCTCAC-1,4,Bcell,TAGTTGTCACCCTCAC-1
CTATGGCCATAACGGG-1,0,Mono,CTATGGCCATAACGGG-1
CCGCACACAGGTTAAA-1,2,CD8Tcell,CCGCACACAGGTTAAA-1
TCATTTGGTAATGGAA-1,2,CD8Tcell,TCATTTGGTAATGGAA-1
ACCACATAGGTGTCCA-1,0,Mono,ACCACATAGGTGTCCA-1
...,...,...,...
AGACCCGGTTATCCTA-1,1,CD4Tcell,AGACCCGGTTATCCTA-1
GACCTAAGTGCCTCAC-1,9,Bcell,GACCTAAGTGCCTCAC-1
AGCTCATAGCTATATG-1,1,CD4Tcell,AGCTCATAGCTATATG-1
TACGTACAGGAAACTG-1,5,CD4Tcell,TACGTACAGGAAACTG-1


## combine the annotations for the common cells:


In [18]:
common_cells = list(set(scATAcat_annotations.index)&set(seurat_labelTransfer_annotations.index)&set(NeurIPS_BM_s1d1_metadata.index))

In [19]:
seurat_labelTransfer_annotations.loc[common_cells,]

Unnamed: 0,label_transfer_annotations,cell_IDs,label_transfer_annotations_simplified
TCATAACCATAATCCG-1,CD8 Effector_1,TCATAACCATAATCCG-1,CD8Tcell
AGGTACGCACCATATG-1,CD4 Memory,AGGTACGCACCATATG-1,CD4Tcell
AGGCGGATCCTGAATA-1,CD14 Mono,AGGCGGATCCTGAATA-1,Mono
GTTGCTGAGACTAAGG-1,CD14 Mono,GTTGCTGAGACTAAGG-1,Mono
ATTGAAGCAAATTCGT-1,CD4 Naive,ATTGAAGCAAATTCGT-1,CD4Tcell
...,...,...,...
TGCTTGCTCAATCTAG-1,CD14 Mono,TGCTTGCTCAATCTAG-1,Mono
GTTTCTAGTGTTGCTT-1,CD4 Memory,GTTTCTAGTGTTGCTT-1,CD4Tcell
ATCCTTAGTTTAGTCC-1,Memory B,ATCCTTAGTTTAGTCC-1,Bcell
TGCTCTCAGCTGTACG-1,Naive B,TGCTCTCAGCTGTACG-1,Bcell


In [20]:
scATAcat_annotations.loc[common_cells,]

Unnamed: 0,leiden_1.0,scATAcat_annotation,cell_IDs
TCATAACCATAATCCG-1,2,CD8Tcell,TCATAACCATAATCCG-1
AGGTACGCACCATATG-1,5,CD4Tcell,AGGTACGCACCATATG-1
AGGCGGATCCTGAATA-1,0,Mono,AGGCGGATCCTGAATA-1
GTTGCTGAGACTAAGG-1,0,Mono,GTTGCTGAGACTAAGG-1
ATTGAAGCAAATTCGT-1,5,CD4Tcell,ATTGAAGCAAATTCGT-1
...,...,...,...
TGCTTGCTCAATCTAG-1,0,Mono,TGCTTGCTCAATCTAG-1
GTTTCTAGTGTTGCTT-1,1,CD4Tcell,GTTTCTAGTGTTGCTT-1
ATCCTTAGTTTAGTCC-1,4,Bcell,ATCCTTAGTTTAGTCC-1
TGCTCTCAGCTGTACG-1,4,Bcell,TGCTCTCAGCTGTACG-1


In [21]:
 NeurIPS_BM_s1d1_metadata.loc[common_cells,]

Unnamed: 0,cell_IDs,ground_truth_annotations,ground_truth_annotations_simplified
TCATAACCATAATCCG-1,TCATAACCATAATCCG-1,CD8+ T,CD8Tcell
AGGTACGCACCATATG-1,AGGTACGCACCATATG-1,CD4+ T activated,CD4Tcell
AGGCGGATCCTGAATA-1,AGGCGGATCCTGAATA-1,CD14+ Mono,Mono
GTTGCTGAGACTAAGG-1,GTTGCTGAGACTAAGG-1,CD14+ Mono,Mono
ATTGAAGCAAATTCGT-1,ATTGAAGCAAATTCGT-1,CD4+ T naive,CD4Tcell
...,...,...,...
TGCTTGCTCAATCTAG-1,TGCTTGCTCAATCTAG-1,CD14+ Mono,Mono
GTTTCTAGTGTTGCTT-1,GTTTCTAGTGTTGCTT-1,CD4+ T naive,CD4Tcell
ATCCTTAGTTTAGTCC-1,ATCCTTAGTTTAGTCC-1,B1 B,Bcell
TGCTCTCAGCTGTACG-1,TGCTCTCAGCTGTACG-1,Naive CD20+ B,Bcell


In [22]:
scATAcat_labelTransfer_groundTruth_commonCells= pd.merge(scATAcat_annotations.loc[common_cells,], seurat_labelTransfer_annotations.loc[common_cells,], on="cell_IDs")
scATAcat_labelTransfer_groundTruth_commonCells= pd.merge(scATAcat_labelTransfer_groundTruth_commonCells, NeurIPS_BM_s1d1_metadata.loc[common_cells,],on="cell_IDs")

In [23]:
scATAcat_labelTransfer_groundTruth_commonCells.head()

Unnamed: 0,leiden_1.0,scATAcat_annotation,cell_IDs,label_transfer_annotations,label_transfer_annotations_simplified,ground_truth_annotations,ground_truth_annotations_simplified
0,2,CD8Tcell,TCATAACCATAATCCG-1,CD8 Effector_1,CD8Tcell,CD8+ T,CD8Tcell
1,5,CD4Tcell,AGGTACGCACCATATG-1,CD4 Memory,CD4Tcell,CD4+ T activated,CD4Tcell
2,0,Mono,AGGCGGATCCTGAATA-1,CD14 Mono,Mono,CD14+ Mono,Mono
3,0,Mono,GTTGCTGAGACTAAGG-1,CD14 Mono,Mono,CD14+ Mono,Mono
4,5,CD4Tcell,ATTGAAGCAAATTCGT-1,CD4 Naive,CD4Tcell,CD4+ T naive,CD4Tcell


In [24]:
scATAcat_labelTransfer_groundTruth_commonCells.columns

Index(['leiden_1.0', 'scATAcat_annotation', 'cell_IDs',
       'label_transfer_annotations', 'label_transfer_annotations_simplified',
       'ground_truth_annotations', 'ground_truth_annotations_simplified'],
      dtype='object')

## subset the ground truth to common annotations


In [25]:
common_annotations = list(set(scATAcat_labelTransfer_groundTruth_commonCells.scATAcat_annotation)&set(scATAcat_labelTransfer_groundTruth_commonCells.label_transfer_annotations_simplified)&set(scATAcat_labelTransfer_groundTruth_commonCells.ground_truth_annotations_simplified))
common_annotations

['CD4Tcell', 'Mono', 'Bcell', 'NKcell', 'CD8Tcell', 'GMP']

In [26]:
scATAcat_labelTransfer_groundTruth_commonCells_commonAnnotations = scATAcat_labelTransfer_groundTruth_commonCells[scATAcat_labelTransfer_groundTruth_commonCells.ground_truth_annotations_simplified.isin(common_annotations)]

In [27]:
scATAcat_labelTransfer_groundTruth_commonCells_commonAnnotations

Unnamed: 0,leiden_1.0,scATAcat_annotation,cell_IDs,label_transfer_annotations,label_transfer_annotations_simplified,ground_truth_annotations,ground_truth_annotations_simplified
0,2,CD8Tcell,TCATAACCATAATCCG-1,CD8 Effector_1,CD8Tcell,CD8+ T,CD8Tcell
1,5,CD4Tcell,AGGTACGCACCATATG-1,CD4 Memory,CD4Tcell,CD4+ T activated,CD4Tcell
2,0,Mono,AGGCGGATCCTGAATA-1,CD14 Mono,Mono,CD14+ Mono,Mono
3,0,Mono,GTTGCTGAGACTAAGG-1,CD14 Mono,Mono,CD14+ Mono,Mono
4,5,CD4Tcell,ATTGAAGCAAATTCGT-1,CD4 Naive,CD4Tcell,CD4+ T naive,CD4Tcell
...,...,...,...,...,...,...,...
5761,0,Mono,TGCTTGCTCAATCTAG-1,CD14 Mono,Mono,CD14+ Mono,Mono
5762,1,CD4Tcell,GTTTCTAGTGTTGCTT-1,CD4 Memory,CD4Tcell,CD4+ T naive,CD4Tcell
5763,4,Bcell,ATCCTTAGTTTAGTCC-1,Memory B,Bcell,B1 B,Bcell
5764,4,Bcell,TGCTCTCAGCTGTACG-1,Naive B,Bcell,Naive CD20+ B,Bcell


## accuracy metrics for scATAcat:

In [28]:
get_accuracy(scATAcat_labelTransfer_groundTruth_commonCells_commonAnnotations, correct_id_col='ground_truth_annotations_simplified', predicted_id_col='scATAcat_annotation')

accuracy:


0.961646398503274

In [29]:
get_balanced_accuracy(scATAcat_labelTransfer_groundTruth_commonCells_commonAnnotations, correct_id_col='ground_truth_annotations_simplified', predicted_id_col='scATAcat_annotation')

Bcell
predicted cell ID
0.9954407294832827
CD4Tcell
predicted cell ID
0.9818731117824774
CD8Tcell
predicted cell ID
0.9044117647058824
GMP
predicted cell ID
0.9245283018867925
Mono
predicted cell ID
0.9925531914893617
NKcell
predicted cell ID
0.8996478873239436
balanced accuracy:
0.9497424977786234


0.9497424977786234

In [30]:
get_cluster_accuracy(scATAcat_labelTransfer_groundTruth_commonCells_commonAnnotations,cluster_id_column ='leiden_1.0', correct_id_col='ground_truth_annotations_simplified', predicted_id_col='scATAcat_annotation')

8
cluster accuracy:


0.8888888888888888

## accuracy metrics for label transfer:

In [31]:
get_accuracy(scATAcat_labelTransfer_groundTruth_commonCells_commonAnnotations, correct_id_col='ground_truth_annotations_simplified', predicted_id_col='label_transfer_annotations_simplified')

accuracy:


0.7853133769878391

In [32]:
get_balanced_accuracy(scATAcat_labelTransfer_groundTruth_commonCells_commonAnnotations, correct_id_col='ground_truth_annotations_simplified', predicted_id_col='label_transfer_annotations_simplified')

Bcell
predicted cell ID
0.851063829787234
CD4Tcell
predicted cell ID
0.7658610271903323
CD8Tcell
predicted cell ID
0.5338235294117647
GMP
predicted cell ID
0.24528301886792453
Mono
predicted cell ID
0.9063829787234042
NKcell
predicted cell ID
0.9559859154929577
balanced accuracy:
0.7097333832456029


0.7097333832456029

In [33]:
get_cluster_accuracy(scATAcat_labelTransfer_groundTruth_commonCells_commonAnnotations,cluster_id_column ='leiden_1.0', correct_id_col='ground_truth_annotations_simplified', predicted_id_col='label_transfer_annotations_simplified')

7
cluster accuracy:


0.7777777777777778