In [1]:
import pandas as pd
import numpy as np

In [2]:
def get_accuracy(annotation_df, correct_id_col='real_cellID', predicted_id_col='predicted_cellID'):
    num_correct_ann= annotation_df[annotation_df[correct_id_col]==annotation_df[predicted_id_col]].shape[0]
    acc=num_correct_ann/annotation_df.shape[0]
    print("accuracy:")
    return acc

In [3]:
def get_balanced_accuracy(annotation_df, correct_id_col='real_cellID', predicted_id_col='predicted_cellID'):
    ann=[]
    for cell_type in sorted(set(annotation_df[correct_id_col])):
        cell_type_sub = annotation_df[annotation_df[correct_id_col]==cell_type]
        print(cell_type)
        num_cells =cell_type_sub.shape[0]
        print("predicted cell ID")
        num_correct_ann = cell_type_sub[cell_type_sub[predicted_id_col]==cell_type].shape[0]
        #print(cell_type_sub.predicted_cellID.value_counts())
        perc=(num_correct_ann/num_cells)
        print(perc)
        ann.append(perc)
    balanced_accuracy= np.sum(ann)/len(ann)
    print("balanced accuracy:")
    print(balanced_accuracy)
    return balanced_accuracy

In [4]:
def get_cluster_accuracy(annotation_df, cluster_id_column ='clust',correct_id_col='real_cellID', predicted_id_col='predicted_cellID'):
    correc_ann_counter=0
    annotation_df = annotation_df.groupby(cluster_id_column).filter(lambda x: len(x) > 10)
    cluster_ids =sorted(set(annotation_df[cluster_id_column]))
    for cluster_id in cluster_ids:
        cluster_sub = annotation_df[annotation_df[cluster_id_column]==cluster_id]   
        clust_real_id_counts = cluster_sub[correct_id_col].value_counts()
        # sort fist by number then by index, then get the cell type with the highest occurance. This takes care of ties
        clust_real_id = clust_real_id_counts.iloc[np.lexsort((clust_real_id_counts.index, -clust_real_id_counts.values))].index[0]
        clust_predicted_id_counts = cluster_sub[predicted_id_col].value_counts()
        # sort fist by number then by index, then get the cell type with the highest occurance. This takes care of ties
        clust_predicted_id = clust_predicted_id_counts.iloc[np.lexsort((clust_predicted_id_counts.index, -clust_predicted_id_counts.values))].index[0]
        if clust_real_id == clust_predicted_id:
            correc_ann_counter += 1
        else:
            continue
    clust_accuracy= correc_ann_counter/len(cluster_ids)
    print("cluster accuracy:")
    return clust_accuracy

In [5]:
seurat_integration_annotations = pd.read_csv("../../../results/Fig4_SFig2-10X_PBMC_scmultiome/Fig4_SFig2-apply_seurat_label_transfer/10XPBMC_seurat_label_transfer_annotations.csv")
seurat_integration_annotations.columns = ["cell_IDs", "label_transfer_annotations","ground_truth_annotations"]
seurat_integration_annotations.index = seurat_integration_annotations.cell_IDs
seurat_integration_annotations.index.name = None
seurat_integration_annotations

Unnamed: 0,cell_IDs,label_transfer_annotations,ground_truth_annotations
AAACAGCCAAGGAATC-1,AAACAGCCAAGGAATC-1,CD4 T,CD4 T
AAACAGCCAATCCCTT-1,AAACAGCCAATCCCTT-1,CD8 T,CD4 T
AAACAGCCAATGCGCT-1,AAACAGCCAATGCGCT-1,CD8 T,CD4 T
AAACAGCCACCAACCG-1,AAACAGCCACCAACCG-1,Mono,CD8 T
AAACAGCCAGGATAAC-1,AAACAGCCAGGATAAC-1,CD4 T,CD4 T
...,...,...,...
TTTGTTGGTGACATGC-1,TTTGTTGGTGACATGC-1,CD8 T,CD8 T
TTTGTTGGTGTTAAAC-1,TTTGTTGGTGTTAAAC-1,CD8 T,CD8 T
TTTGTTGGTTAGGATT-1,TTTGTTGGTTAGGATT-1,NK,NK
TTTGTTGGTTGGTTAG-1,TTTGTTGGTTGGTTAG-1,CD4 T,CD4 T


In [6]:
scATAcat_annotations = pd.read_csv("../../../results/Fig4_SFig2-10X_PBMC_scmultiome/Fig4_SFig2-apply_scATAcat/outputs/scATAcat_annotations.csv", index_col=0)
scATAcat_annotations.index.name = None
scATAcat_annotations.columns = ['leiden_1', 'scATAcat_annotation','cell_IDs']
scATAcat_annotations

Unnamed: 0,leiden_1,scATAcat_annotation,cell_IDs
AAACAGCCAATCCCTT-1,4,CD4Tcell,AAACAGCCAATCCCTT-1
AAACAGCCAATGCGCT-1,2,CD4Tcell,AAACAGCCAATGCGCT-1
AAACAGCCACCAACCG-1,1,CD4Tcell,AAACAGCCACCAACCG-1
AAACAGCCAGGATAAC-1,2,CD4Tcell,AAACAGCCAGGATAAC-1
AAACAGCCAGTTTACG-1,4,CD4Tcell,AAACAGCCAGTTTACG-1
...,...,...,...
TTTGTTGGTGACATGC-1,1,CD4Tcell,TTTGTTGGTGACATGC-1
TTTGTTGGTGTTAAAC-1,1,CD4Tcell,TTTGTTGGTGTTAAAC-1
TTTGTTGGTTAGGATT-1,9,NKcell,TTTGTTGGTTAGGATT-1
TTTGTTGGTTGGTTAG-1,6,CD4Tcell,TTTGTTGGTTGGTTAG-1


In [7]:
renaming_dict={'CD4Tcell':'CD4 T', 
            'CD8Tcell':'CD8 T',
            'pDC': 'DC',
            'NKcell' : 'NK',
            'Bcell':'B'}

scATAcat_annotations = scATAcat_annotations.replace({"scATAcat_annotation": renaming_dict})
scATAcat_annotations.head()

Unnamed: 0,leiden_1,scATAcat_annotation,cell_IDs
AAACAGCCAATCCCTT-1,4,CD4 T,AAACAGCCAATCCCTT-1
AAACAGCCAATGCGCT-1,2,CD4 T,AAACAGCCAATGCGCT-1
AAACAGCCACCAACCG-1,1,CD4 T,AAACAGCCACCAACCG-1
AAACAGCCAGGATAAC-1,2,CD4 T,AAACAGCCAGGATAAC-1
AAACAGCCAGTTTACG-1,4,CD4 T,AAACAGCCAGTTTACG-1


## combine the annotations for the common cells:


In [8]:
common_cells = list(set(scATAcat_annotations.index)&set(seurat_integration_annotations.index))

In [9]:
seurat_integration_annotations.loc[common_cells,]

Unnamed: 0,cell_IDs,label_transfer_annotations,ground_truth_annotations
GTTACGTAGCTAGAAG-1,GTTACGTAGCTAGAAG-1,CD8 T,CD8 T
GTGTCCAAGGTCTTGG-1,GTGTCCAAGGTCTTGG-1,NK,NK
CGATGTCCAACAGCCT-1,CGATGTCCAACAGCCT-1,Mono,Mono
AATCCATCACAGGAAT-1,AATCCATCACAGGAAT-1,CD8 T,CD8 T
ATGCGATTCCTCAGCT-1,ATGCGATTCCTCAGCT-1,Mono,Mono
...,...,...,...
GTCAATATCGCACACA-1,GTCAATATCGCACACA-1,CD8 T,CD8 T
CGACCTGCAAGACTCC-1,CGACCTGCAAGACTCC-1,Mono,Mono
GCGCTTAAGCCTGATG-1,GCGCTTAAGCCTGATG-1,CD8 T,CD8 T
GCTCACAAGTGTGATC-1,GCTCACAAGTGTGATC-1,B,B


In [10]:
scATAcat_annotations.loc[common_cells,]

Unnamed: 0,leiden_1,scATAcat_annotation,cell_IDs
GTTACGTAGCTAGAAG-1,5,CD8 T,GTTACGTAGCTAGAAG-1
GTGTCCAAGGTCTTGG-1,5,CD8 T,GTGTCCAAGGTCTTGG-1
CGATGTCCAACAGCCT-1,3,Mono,CGATGTCCAACAGCCT-1
AATCCATCACAGGAAT-1,1,CD4 T,AATCCATCACAGGAAT-1
ATGCGATTCCTCAGCT-1,0,Mono,ATGCGATTCCTCAGCT-1
...,...,...,...
GTCAATATCGCACACA-1,5,CD8 T,GTCAATATCGCACACA-1
CGACCTGCAAGACTCC-1,0,Mono,CGACCTGCAAGACTCC-1
GCGCTTAAGCCTGATG-1,5,CD8 T,GCGCTTAAGCCTGATG-1
GCTCACAAGTGTGATC-1,7,B,GCTCACAAGTGTGATC-1


In [11]:
seurat_integration_annotations.loc[common_cells,]

Unnamed: 0,cell_IDs,label_transfer_annotations,ground_truth_annotations
GTTACGTAGCTAGAAG-1,GTTACGTAGCTAGAAG-1,CD8 T,CD8 T
GTGTCCAAGGTCTTGG-1,GTGTCCAAGGTCTTGG-1,NK,NK
CGATGTCCAACAGCCT-1,CGATGTCCAACAGCCT-1,Mono,Mono
AATCCATCACAGGAAT-1,AATCCATCACAGGAAT-1,CD8 T,CD8 T
ATGCGATTCCTCAGCT-1,ATGCGATTCCTCAGCT-1,Mono,Mono
...,...,...,...
GTCAATATCGCACACA-1,GTCAATATCGCACACA-1,CD8 T,CD8 T
CGACCTGCAAGACTCC-1,CGACCTGCAAGACTCC-1,Mono,Mono
GCGCTTAAGCCTGATG-1,GCGCTTAAGCCTGATG-1,CD8 T,CD8 T
GCTCACAAGTGTGATC-1,GCTCACAAGTGTGATC-1,B,B


In [12]:
pd.merge(scATAcat_annotations.loc[common_cells,], seurat_integration_annotations.loc[common_cells,], on="cell_IDs")


Unnamed: 0,leiden_1,scATAcat_annotation,cell_IDs,label_transfer_annotations,ground_truth_annotations
0,5,CD8 T,GTTACGTAGCTAGAAG-1,CD8 T,CD8 T
1,5,CD8 T,GTGTCCAAGGTCTTGG-1,NK,NK
2,3,Mono,CGATGTCCAACAGCCT-1,Mono,Mono
3,1,CD4 T,AATCCATCACAGGAAT-1,CD8 T,CD8 T
4,0,Mono,ATGCGATTCCTCAGCT-1,Mono,Mono
...,...,...,...,...,...
9663,5,CD8 T,GTCAATATCGCACACA-1,CD8 T,CD8 T
9664,0,Mono,CGACCTGCAAGACTCC-1,Mono,Mono
9665,5,CD8 T,GCGCTTAAGCCTGATG-1,CD8 T,CD8 T
9666,7,B,GCTCACAAGTGTGATC-1,B,B


In [13]:
scATAcat_labelTransfer_groundTruth_commonCells= pd.merge(scATAcat_annotations.loc[common_cells,], seurat_integration_annotations.loc[common_cells,], on="cell_IDs")


## subset the ground truth to common annotations


In [14]:
common_annotations = list(set(scATAcat_labelTransfer_groundTruth_commonCells.scATAcat_annotation)&set(scATAcat_labelTransfer_groundTruth_commonCells.ground_truth_annotations)&set(scATAcat_labelTransfer_groundTruth_commonCells.label_transfer_annotations))
common_annotations

['Mono', 'NK', 'CD4 T', 'B', 'DC', 'CD8 T']

In [15]:
scATAcat_labelTransfer_groundTruth_commonCells_commonAnnotations = scATAcat_labelTransfer_groundTruth_commonCells[scATAcat_labelTransfer_groundTruth_commonCells.ground_truth_annotations.isin(common_annotations)]

In [16]:
scATAcat_labelTransfer_groundTruth_commonCells_commonAnnotations

Unnamed: 0,leiden_1,scATAcat_annotation,cell_IDs,label_transfer_annotations,ground_truth_annotations
0,5,CD8 T,GTTACGTAGCTAGAAG-1,CD8 T,CD8 T
1,5,CD8 T,GTGTCCAAGGTCTTGG-1,NK,NK
2,3,Mono,CGATGTCCAACAGCCT-1,Mono,Mono
3,1,CD4 T,AATCCATCACAGGAAT-1,CD8 T,CD8 T
4,0,Mono,ATGCGATTCCTCAGCT-1,Mono,Mono
...,...,...,...,...,...
9663,5,CD8 T,GTCAATATCGCACACA-1,CD8 T,CD8 T
9664,0,Mono,CGACCTGCAAGACTCC-1,Mono,Mono
9665,5,CD8 T,GCGCTTAAGCCTGATG-1,CD8 T,CD8 T
9666,7,B,GCTCACAAGTGTGATC-1,B,B


## accuracy metrics for scATAcat:

In [17]:
get_accuracy(scATAcat_labelTransfer_groundTruth_commonCells_commonAnnotations, correct_id_col='ground_truth_annotations', predicted_id_col='scATAcat_annotation')

accuracy:


0.837800614471872

In [18]:
get_balanced_accuracy(scATAcat_labelTransfer_groundTruth_commonCells_commonAnnotations, correct_id_col='ground_truth_annotations', predicted_id_col='scATAcat_annotation')

B
predicted cell ID
0.9959839357429718
CD4 T
predicted cell ID
0.9777038269550749
CD8 T
predicted cell ID
0.3305351521511018
DC
predicted cell ID
0.42592592592592593
Mono
predicted cell ID
0.998084902649218
NK
predicted cell ID
0.8726851851851852
balanced accuracy:
0.766819821434913


0.766819821434913

In [19]:
get_cluster_accuracy(scATAcat_labelTransfer_groundTruth_commonCells_commonAnnotations,cluster_id_column ='leiden_1', correct_id_col='ground_truth_annotations', predicted_id_col='scATAcat_annotation')

cluster accuracy:


0.8571428571428571

## accuracy metrics for label transfer:

In [20]:
get_accuracy(scATAcat_labelTransfer_groundTruth_commonCells_commonAnnotations, correct_id_col='ground_truth_annotations', predicted_id_col='label_transfer_annotations')

accuracy:


0.911537239114313

In [21]:
get_balanced_accuracy(scATAcat_labelTransfer_groundTruth_commonCells_commonAnnotations, correct_id_col='ground_truth_annotations', predicted_id_col='label_transfer_annotations')

B
predicted cell ID
0.9866131191432396
CD4 T
predicted cell ID
0.8489184692179701
CD8 T
predicted cell ID
0.8693599160545645
DC
predicted cell ID
0.7546296296296297
Mono
predicted cell ID
0.982444940951165
NK
predicted cell ID
0.9675925925925926
balanced accuracy:
0.9015931112648602


0.9015931112648602

In [22]:
get_cluster_accuracy(scATAcat_labelTransfer_groundTruth_commonCells_commonAnnotations,cluster_id_column ='leiden_1', correct_id_col='ground_truth_annotations', predicted_id_col='label_transfer_annotations')

cluster accuracy:


1.0