In [1]:
import pandas as pd
import numpy as np
import random as rn

In [2]:
# Seed NumPy's global generator and the stdlib `random` module with the same
# value so that any stochastic step below is repeatable.
sd = 1234
np.random.seed(sd)
rn.seed(sd)
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so
# setting it from inside an already-running kernel presumably only affects
# child processes, not string hashing / set ordering in this session — confirm.
%env PYTHONHASHSEED=0


env: PYTHONHASHSEED=0


In [3]:
def get_accuracy(annotation_df, correct_id_col='real_cellID', predicted_id_col='predicted_cellID'):
    """Return the fraction of rows whose predicted label equals the reference label.

    Parameters
    ----------
    annotation_df : pandas.DataFrame
        Table containing both label columns, one row per annotated item.
    correct_id_col : str
        Name of the column holding the reference labels.
    predicted_id_col : str
        Name of the column holding the predicted labels.

    Returns
    -------
    float
        Number of rows where the two columns are equal divided by the total
        number of rows. ``nan`` if ``annotation_df`` has no rows (instead of
        raising ``ZeroDivisionError``).
    """
    # Compare first so that a missing column still raises KeyError, even for
    # an empty frame.
    is_correct = annotation_df[correct_id_col] == annotation_df[predicted_id_col]
    n_rows = annotation_df.shape[0]
    if n_rows == 0:
        return float('nan')
    # int(...) keeps the result a plain Python float, as before.
    return int(is_correct.sum()) / n_rows

In [4]:
def get_balanced_accuracy(annotation_df, correct_id_col='real_cellID', predicted_id_col='predicted_cellID'):
    """Return the unweighted mean of the per-class accuracies.

    For every distinct label in ``correct_id_col`` the share of that label's
    rows whose ``predicted_id_col`` carries the same label is computed; the
    function returns the plain average of those shares, so every class
    contributes equally regardless of its size.

    Parameters
    ----------
    annotation_df : pandas.DataFrame
        Table containing both label columns.
    correct_id_col : str
        Name of the column holding the reference labels.
    predicted_id_col : str
        Name of the column holding the predicted labels.

    Returns
    -------
    float
        Mean per-class fraction of correctly predicted rows.
    """
    truth = annotation_df[correct_id_col]
    predicted = annotation_df[predicted_id_col]

    per_class_fraction = []
    for label in sorted(set(truth)):
        in_class = truth == label
        class_size = int(in_class.sum())
        class_hits = int((in_class & (predicted == label)).sum())
        per_class_fraction.append(class_hits / class_size)

    return np.sum(per_class_fraction) / len(per_class_fraction)

In [5]:
def _majority_label(labels):
    """Return the most frequent value in ``labels``; ties go to the label that sorts first."""
    counts = labels.value_counts()
    # lexsort treats the LAST key as primary: order by descending count, then
    # by label, so equal counts are resolved deterministically.
    order = np.lexsort((counts.index, -counts.values))
    return counts.index[order[0]]


def get_cluster_accuracy(annotation_df, cluster_id_column ='clust',correct_id_col='real_cellID', predicted_id_col='predicted_cellID', size_cutoff=10):
    """Return the fraction of clusters whose majority predicted label matches the majority reference label.

    Parameters
    ----------
    annotation_df : pandas.DataFrame
        Table containing the cluster column and both label columns.
    cluster_id_column : str
        Name of the column holding the cluster assignment of each row.
    correct_id_col : str
        Name of the column holding the reference labels.
    predicted_id_col : str
        Name of the column holding the predicted labels.
    size_cutoff : int, optional
        Clusters with ``size_cutoff`` rows or fewer are ignored; only clusters
        with strictly more rows are scored. Defaults to 10, the value that
        was previously hard-coded.

    Returns
    -------
    float
        Number of agreeing clusters divided by the number of scored clusters.
        ``nan`` if no cluster exceeds ``size_cutoff`` (instead of raising
        ``ZeroDivisionError``).
    """
    scored = annotation_df.groupby(cluster_id_column).filter(lambda grp: len(grp) > size_cutoff)
    cluster_ids = sorted(set(scored[cluster_id_column]))
    if not cluster_ids:
        return float('nan')

    n_agreeing = 0
    for cluster_id in cluster_ids:
        members = scored[scored[cluster_id_column] == cluster_id]
        if _majority_label(members[correct_id_col]) == _majority_label(members[predicted_id_col]):
            n_agreeing += 1
    return n_agreeing / len(cluster_ids)

In [6]:
# Leiden resolutions 0.1, 0.2, ..., 2.5 as one-decimal strings (used in file paths and row labels).
leiden_resolutions = ["{:.1f}".format(tenths / 10) for tenths in range(1, 26)]

In [7]:
# Results table: one row per clustering resolution ("leiden_<res>"), no columns yet.
acc_comparisons = pd.DataFrame(
    index=["leiden_{}".format(resolution) for resolution in leiden_resolutions]
)

In [8]:
# Load the per-cell metadata; the first CSV column is used as the row index.
NeurIPS_BM_s1d1_metadata = pd.read_csv(
    filepath_or_buffer="../../../data/NeurIPS_BM_scmultiome/03_cell_IDs/NeurIPS_BM_s1d1_metadata.csv",
    index_col=0,
)

In [9]:
# Drop the last "-"-separated field of every index label
# (e.g. "TAGTTGTCACCCTCAC-1-s1d1" -> "TAGTTGTCACCCTCAC-1").
NeurIPS_BM_s1d1_metadata["cellIDs"] = (
    NeurIPS_BM_s1d1_metadata.index
    .str.split("-")
    .str[:-1]
    .str.join('-')
)

In [10]:
# Preview the derived cell IDs next to the original cell-type labels.
NeurIPS_BM_s1d1_metadata[["cellIDs", "cell_type"]]

Unnamed: 0,cellIDs,cell_type
TAGTTGTCACCCTCAC-1-s1d1,TAGTTGTCACCCTCAC-1,Naive CD20+ B
CTATGGCCATAACGGG-1-s1d1,CTATGGCCATAACGGG-1,CD14+ Mono
CCGCACACAGGTTAAA-1-s1d1,CCGCACACAGGTTAAA-1,CD8+ T
TCATTTGGTAATGGAA-1-s1d1,TCATTTGGTAATGGAA-1,CD8+ T
ACCACATAGGTGTCCA-1-s1d1,ACCACATAGGTGTCCA-1,CD16+ Mono
...,...,...
AGACCCGGTTATCCTA-1-s1d1,AGACCCGGTTATCCTA-1,CD4+ T activated
GACCTAAGTGCCTCAC-1-s1d1,GACCTAAGTGCCTCAC-1,pDC
AGCTCATAGCTATATG-1-s1d1,AGCTCATAGCTATATG-1,CD4+ T activated
TACGTACAGGAAACTG-1-s1d1,TACGTACAGGAAACTG-1,CD4+ T naive


In [11]:
# Re-key the table by the trimmed cell IDs (the column itself is kept).
NeurIPS_BM_s1d1_metadata.index = NeurIPS_BM_s1d1_metadata["cellIDs"]

In [12]:
# Keep only the two columns used below. `.copy()` makes the result an
# independent frame, so adding a column to it afterwards does not trigger
# pandas' SettingWithCopyWarning.
NeurIPS_BM_s1d1_metadata = NeurIPS_BM_s1d1_metadata[["cellIDs", "cell_type"]].copy()

In [13]:
# Maps each original `cell_type` label to the simplified label used for the comparison.
NeurIPS_ann_rename_dict = {
    "B1 B": "Bcell",
    "CD14+ Mono": "Mono",
    "CD16+ Mono": "Mono",
    "CD4+ T activated": "CD4Tcell",
    "CD4+ T naive": "CD4Tcell",
    "CD8+ T": "CD8Tcell",
    "Erythroblast": "Ery",
    "G/M prog": "GMP",
    "HSC": "HSC",
    "ID2-hi myeloid prog": "CMP",
    "ILC": "ILC",
    "Lymph prog": "CLP",
    "MK/E prog": "MEP",
    "NK": "NKcell",
    "Naive CD20+ B": "Bcell",
    "Normoblast": "Ery",
    "Plasma cell": "Plasma cell",
    "Proerythroblast": "Ery",
    "Transitional B": "Bcell",
    "cDC2": "cDC2",
    "pDC": "pDC",
}

In [14]:
# Add the simplified label for every cell (labels missing from the dict map to NaN),
# then clear the index name and give the three columns their final names.
# Note: the rename means this cell cannot be re-run without re-running the cells above it.
NeurIPS_BM_s1d1_metadata['NeurIPS_ann_simp'] = (
    NeurIPS_BM_s1d1_metadata["cell_type"].map(NeurIPS_ann_rename_dict).tolist()
)
NeurIPS_BM_s1d1_metadata.index.name = None
NeurIPS_BM_s1d1_metadata.columns = [
    "cell_IDs",
    "ground_truth_annotations",
    "ground_truth_annotations_simplified",
]
NeurIPS_BM_s1d1_metadata

Unnamed: 0,cell_IDs,ground_truth_annotations,ground_truth_annotations_simplified
TAGTTGTCACCCTCAC-1,TAGTTGTCACCCTCAC-1,Naive CD20+ B,Bcell
CTATGGCCATAACGGG-1,CTATGGCCATAACGGG-1,CD14+ Mono,Mono
CCGCACACAGGTTAAA-1,CCGCACACAGGTTAAA-1,CD8+ T,CD8Tcell
TCATTTGGTAATGGAA-1,TCATTTGGTAATGGAA-1,CD8+ T,CD8Tcell
ACCACATAGGTGTCCA-1,ACCACATAGGTGTCCA-1,CD16+ Mono,Mono
...,...,...,...
AGACCCGGTTATCCTA-1,AGACCCGGTTATCCTA-1,CD4+ T activated,CD4Tcell
GACCTAAGTGCCTCAC-1,GACCTAAGTGCCTCAC-1,pDC,pDC
AGCTCATAGCTATATG-1,AGCTCATAGCTATATG-1,CD4+ T activated,CD4Tcell
TACGTACAGGAAACTG-1,TACGTACAGGAAACTG-1,CD4+ T naive,CD4Tcell


### Seurat label-transfer annotations

In [15]:
# Load the label-transfer predictions; the first CSV column is used as the row index.
seurat_labelTransfer_annotations = pd.read_csv(
    filepath_or_buffer="../../../results/Fig5_SFig3-NeurIPS_BM_scmultiome/Fig5-apply_seurat_label_transfer/bm_neurips_labelTransfer_predicted_annotations.csv",
    index_col=0,
)
seurat_labelTransfer_annotations.head()

Unnamed: 0,predicted.celltype.l2,cellID
AAACAGCCAATTAAGG-1,CD4 Naive,AAACAGCCAATTAAGG-1
AAACAGCCAGGCTGTT-1,Prog_B 1,AAACAGCCAGGCTGTT-1
AAACATGCAAAGCGCA-1,CD4 Memory,AAACATGCAAAGCGCA-1
AAACATGCAATAACCT-1,Prog_RBC,AAACATGCAATAACCT-1
AAACATGCAATAATGG-1,CD8 Memory_2,AAACATGCAATAATGG-1


In [16]:
# Predicted label -> simplified label. The mapping is written once as a dict;
# the two parallel lists are derived from it so they cannot drift out of step.
seurat_annotations_and_simplified_dict = {
    'CD4 Naive': 'CD4Tcell',
    'Prog_B 1': 'CLP',
    'CD4 Memory': 'CD4Tcell',
    'Prog_RBC': 'MEP',
    'CD8 Memory_2': 'CD8Tcell',
    'Memory B': 'Bcell',
    'NK': 'NKcell',
    'CD14 Mono': 'Mono',
    'CD8 Naive': 'CD8Tcell',
    'CD8 Effector_2': 'CD8Tcell',
    'Naive B': 'Bcell',
    'HSC': 'HSC',
    'MAIT': 'MAIT',
    'pDC': 'pDC',
    'GMP': 'GMP',
    'CD16 Mono': 'Mono',
    'gdT': 'gdT',
    'Prog_B 2': 'CLP',
    'LMPP': 'LMPP',
    'CD8 Effector_1': 'CD8Tcell',
    'Prog_DC': 'CMP',
    'Plasmablast': 'Plasmablast',
    'cDC2': 'cDC2',
    'Prog_Mk': 'MEP',
    'CD56 bright NK': 'NKcell',
    'CD8 Memory_1': 'CD8Tcell',
}
seurat_annotations = list(seurat_annotations_and_simplified_dict.keys())
seurat_annotations_simplified = list(seurat_annotations_and_simplified_dict.values())

print(seurat_annotations_and_simplified_dict)


{'CD4 Naive': 'CD4Tcell', 'Prog_B 1': 'CLP', 'CD4 Memory': 'CD4Tcell', 'Prog_RBC': 'MEP', 'CD8 Memory_2': 'CD8Tcell', 'Memory B': 'Bcell', 'NK': 'NKcell', 'CD14 Mono': 'Mono', 'CD8 Naive': 'CD8Tcell', 'CD8 Effector_2': 'CD8Tcell', 'Naive B': 'Bcell', 'HSC': 'HSC', 'MAIT': 'MAIT', 'pDC': 'pDC', 'GMP': 'GMP', 'CD16 Mono': 'Mono', 'gdT': 'gdT', 'Prog_B 2': 'CLP', 'LMPP': 'LMPP', 'CD8 Effector_1': 'CD8Tcell', 'Prog_DC': 'CMP', 'Plasmablast': 'Plasmablast', 'cDC2': 'cDC2', 'Prog_Mk': 'MEP', 'CD56 bright NK': 'NKcell', 'CD8 Memory_1': 'CD8Tcell'}


In [17]:
# Add the simplified label for every prediction (labels missing from the dict map to NaN),
# then give the three columns their final names (assigned by position).
seurat_labelTransfer_annotations['seurat_annotations_simp'] = (
    seurat_labelTransfer_annotations["predicted.celltype.l2"]
    .map(seurat_annotations_and_simplified_dict)
    .tolist()
)
seurat_labelTransfer_annotations.columns = [
    "label_transfer_annotations",
    "cell_IDs",
    "label_transfer_annotations_simplified",
]

In [18]:
# Display the label-transfer table with its renamed columns.
seurat_labelTransfer_annotations

Unnamed: 0,label_transfer_annotations,cell_IDs,label_transfer_annotations_simplified
AAACAGCCAATTAAGG-1,CD4 Naive,AAACAGCCAATTAAGG-1,CD4Tcell
AAACAGCCAGGCTGTT-1,Prog_B 1,AAACAGCCAGGCTGTT-1,CLP
AAACATGCAAAGCGCA-1,CD4 Memory,AAACATGCAAAGCGCA-1,CD4Tcell
AAACATGCAATAACCT-1,Prog_RBC,AAACATGCAATAACCT-1,MEP
AAACATGCAATAATGG-1,CD8 Memory_2,AAACATGCAATAATGG-1,CD8Tcell
...,...,...,...
TTTGTGTTCATTTGCT-1,CD4 Naive,TTTGTGTTCATTTGCT-1,CD4Tcell
TTTGTGTTCGACCTGA-1,Prog_RBC,TTTGTGTTCGACCTGA-1,MEP
TTTGTGTTCGCCTAAG-1,CD4 Naive,TTTGTGTTCGCCTAAG-1,CD4Tcell
TTTGTTGGTACGGTAC-1,CD4 Naive,TTTGTTGGTACGGTAC-1,CD4Tcell


In [19]:
# For every Leiden resolution: load the scATAcat calls, align them with the
# label-transfer calls and the ground truth, and store the accuracy metrics
# as one row of `acc_comparisons`.
for res in leiden_resolutions:
    row_label = 'leiden_' + res
    result_path = "../../../results/Supplementary_tables/ST5-NeurIPS_BM_scmultiome_accuracy_acros_clustering_res/res{0}/outputs/scATAcat_annotations_leiden_{0}.csv".format(res)
    scATAcat_annotations = pd.read_csv(result_path, index_col=0)
    scATAcat_annotations.index.name = None
    scATAcat_annotations.columns = [row_label, 'scATAcat_annotation', 'cell_IDs']

    # Cells present in all three tables. sorted() fixes the row order; plain
    # set iteration order for strings can differ from one kernel session to the next.
    common_cells = sorted(
        set(scATAcat_annotations.index)
        & set(seurat_labelTransfer_annotations.index)
        & set(NeurIPS_BM_s1d1_metadata.index)
    )

    # Join the three annotation sources on the shared cell ID column.
    scATAcat_labelTransfer_groundTruth_commonCells = pd.merge(
        scATAcat_annotations.loc[common_cells],
        seurat_labelTransfer_annotations.loc[common_cells],
        on="cell_IDs",
    )
    scATAcat_labelTransfer_groundTruth_commonCells = pd.merge(
        scATAcat_labelTransfer_groundTruth_commonCells,
        NeurIPS_BM_s1d1_metadata.loc[common_cells],
        on="cell_IDs",
    )

    # Labels that occur in all three annotation columns; only cells whose
    # ground-truth label is among them are scored. (Kept as an unsorted list:
    # it is only used for membership tests.)
    common_annotations = list(
        set(scATAcat_labelTransfer_groundTruth_commonCells.scATAcat_annotation)
        & set(scATAcat_labelTransfer_groundTruth_commonCells.label_transfer_annotations_simplified)
        & set(scATAcat_labelTransfer_groundTruth_commonCells.ground_truth_annotations_simplified)
    )
    scATAcat_labelTransfer_groundTruth_commonCells_commonAnnotations = scATAcat_labelTransfer_groundTruth_commonCells[
        scATAcat_labelTransfer_groundTruth_commonCells.ground_truth_annotations_simplified.isin(common_annotations)
    ]

    # Largest cluster label + 1; presumably cluster labels are consecutive
    # integers starting at 0 — TODO confirm.
    acc_comparisons.loc[row_label, 'num_clust'] = scATAcat_annotations[row_label].max() + 1

    # Metric-major / method-minor order reproduces the original column order:
    # *_Acc, *_BAcc, *_CAcc, each with scATAcat first and labelTransfer second.
    for metric_suffix, metric_fn, extra_kwargs in (
        ('Acc', get_accuracy, {}),
        ('BAcc', get_balanced_accuracy, {}),
        ('CAcc', get_cluster_accuracy, {'cluster_id_column': row_label}),
    ):
        for method_name, prediction_col in (
            ('scATAcat', 'scATAcat_annotation'),
            ('labelTransfer', 'label_transfer_annotations_simplified'),
        ):
            acc_comparisons.loc[row_label, method_name + '_' + metric_suffix] = metric_fn(
                scATAcat_labelTransfer_groundTruth_commonCells_commonAnnotations,
                correct_id_col='ground_truth_annotations_simplified',
                predicted_id_col=prediction_col,
                **extra_kwargs
            )

In [22]:
# Cluster sizes among the scored cells.
# NOTE(review): both `res` and the frame are leftovers from the LAST iteration of
# the resolution loop above, so this only shows the final resolution; the
# execution count also suggests this cell was run out of order — confirm it
# still behaves as intended under Restart & Run All.
scATAcat_labelTransfer_groundTruth_commonCells_commonAnnotations['leiden_'+ res].value_counts()

leiden_2.5
2     521
1     518
0     492
3     468
4     444
6     399
5     398
10    282
11    253
12    158
13    136
16    102
17     61
18     23
14      2
8       1
15      1
Name: count, dtype: int64

In [20]:
# Display the accuracy table filled in by the resolution loop (one row per resolution).
acc_comparisons

Unnamed: 0,num_clust,scATAcat_Acc,labelTransfer_Acc,scATAcat_BAcc,labelTransfer_BAcc,scATAcat_CAcc,labelTransfer_CAcc
leiden_0.1,5.0,0.982808,0.850716,0.978007,0.869823,1.0,1.0
leiden_0.2,6.0,0.95356,0.83287,0.905612,0.744915,0.8,0.8
leiden_0.3,7.0,0.943639,0.785313,0.931212,0.709733,1.0,0.833333
leiden_0.4,7.0,0.946679,0.785313,0.934855,0.709733,1.0,0.833333
leiden_0.5,8.0,0.947848,0.785313,0.937448,0.709733,1.0,0.833333
leiden_0.6,8.0,0.922123,0.785313,0.924761,0.709733,1.0,0.833333
leiden_0.7,9.0,0.940599,0.785313,0.927301,0.709733,1.0,0.857143
leiden_0.8,12.0,0.961646,0.785313,0.949742,0.709733,0.888889,0.777778
leiden_0.9,11.0,0.95884,0.785313,0.948016,0.709733,0.888889,0.777778
leiden_1.0,12.0,0.961646,0.785313,0.949742,0.709733,0.888889,0.777778


In [21]:
# Write the accuracy table (including its "leiden_<res>" index) to the supplementary-table CSV.
acc_comparisons.to_csv(
    path_or_buf='../../../results/Supplementary_tables/ST5-NeurIPS_BM_scmultiome_accuracy_acros_clustering_res/ST5-NeurIPS_BM_scmultiome_accuracy_acros_clustering_res.csv'
)