In [1]:
import pandas as pd
import numpy as np
import random as rn

In [2]:
# Seed NumPy's and Python's global random number generators for reproducibility.
sd = 1234  # single seed value shared by both generators below
np.random.seed(sd)  # NumPy legacy global RNG
rn.seed(sd)  # Python built-in `random` module
# NOTE(review): CPython reads PYTHONHASHSEED at interpreter start-up, so setting it
# from inside a running kernel presumably only affects child processes, not this
# kernel's own hash randomization — confirm whether it must be set before launch.
%env PYTHONHASHSEED=0


env: PYTHONHASHSEED=0


In [3]:
def get_accuracy(annotation_df, correct_id_col='real_cellID', predicted_id_col='predicted_cellID'):
    """Return the fraction of rows whose predicted label equals the reference label.

    Parameters
    ----------
    annotation_df : pandas.DataFrame
        Table containing both label columns.
    correct_id_col : str
        Name of the column holding the reference labels.
    predicted_id_col : str
        Name of the column holding the predicted labels.

    Returns
    -------
    float
        Number of matching rows divided by the total number of rows.

    Raises
    ------
    ZeroDivisionError
        If ``annotation_df`` has no rows.
    """
    # Row-wise comparison of the two label columns (missing values never match).
    is_match = annotation_df[correct_id_col] == annotation_df[predicted_id_col]
    n_match = int(is_match.sum())
    n_rows = len(annotation_df)
    return n_match / n_rows

In [4]:
def get_balanced_accuracy(annotation_df, correct_id_col='real_cellID', predicted_id_col='predicted_cellID'):
    """Return the unweighted mean of the per-class accuracies.

    For every distinct label in ``correct_id_col`` the share of its rows whose
    ``predicted_id_col`` carries the same label is computed; the result is the
    plain average of these shares, so every class counts equally regardless
    of how many rows it has.

    Parameters
    ----------
    annotation_df : pandas.DataFrame
        Table containing both label columns.
    correct_id_col : str
        Name of the column holding the reference labels.
    predicted_id_col : str
        Name of the column holding the predicted labels.

    Returns
    -------
    numpy.float64
        Sum of the per-class accuracies divided by the number of classes.
    """
    reference = annotation_df[correct_id_col]
    predicted = annotation_df[predicted_id_col]

    per_class_accuracy = []
    for label in sorted(set(reference)):
        in_class = reference == label
        n_in_class = int(in_class.sum())
        # Among the rows of this class, count those predicted as the same class.
        n_recovered = int((predicted[in_class] == label).sum())
        per_class_accuracy.append(n_recovered / n_in_class)

    return np.sum(per_class_accuracy) / len(per_class_accuracy)

In [5]:
def _majority_label(labels):
    """Return the most frequent value of ``labels``; ties go to the smallest label."""
    counts = labels.value_counts()
    # np.lexsort uses the LAST key as the primary key: sort by descending count
    # first, then by label, so ties are resolved deterministically.
    order = np.lexsort((counts.index, -counts.values))
    return counts.index[order[0]]


def get_cluster_accuracy(annotation_df, cluster_id_column ='clust',correct_id_col='real_cellID', predicted_id_col='predicted_cellID', size_threshold=10):
    """Return the fraction of clusters whose majority predicted label matches
    their majority reference label.

    Clusters with ``size_threshold`` rows or fewer are ignored. Within each
    remaining cluster the most frequent reference label and the most frequent
    predicted label are determined (ties resolved in favour of the smallest
    label) and compared.

    Parameters
    ----------
    annotation_df : pandas.DataFrame
        Table containing the cluster column and both label columns.
    cluster_id_column : str
        Name of the column holding the cluster assignment of each row.
    correct_id_col : str
        Name of the column holding the reference labels.
    predicted_id_col : str
        Name of the column holding the predicted labels.
    size_threshold : int, default 10
        Only clusters with strictly more rows than this value are evaluated.

    Returns
    -------
    float
        Number of correctly labelled clusters divided by the number of
        evaluated clusters, or NaN when no cluster exceeds ``size_threshold``.
    """
    # Drop clusters that are too small to be evaluated.
    annotation_df = annotation_df.groupby(cluster_id_column).filter(lambda x: len(x) > size_threshold)
    cluster_ids = sorted(set(annotation_df[cluster_id_column]))
    if len(cluster_ids) == 0:
        # Nothing left to evaluate: report "undefined" instead of dividing by zero.
        return float('nan')

    correct_clusters = 0
    for cluster_id in cluster_ids:
        cluster_sub = annotation_df[annotation_df[cluster_id_column] == cluster_id]
        if _majority_label(cluster_sub[correct_id_col]) == _majority_label(cluster_sub[predicted_id_col]):
            correct_clusters += 1
    return correct_clusters / len(cluster_ids)

In [6]:
# Leiden resolutions to evaluate: '0.1', '0.2', ..., '2.5'. They are kept as
# strings because they are concatenated into file paths and row labels.
leiden_resolutions = ['{:.1f}'.format(tenths / 10) for tenths in range(1, 26)]

In [7]:
# Empty results table with one row per Leiden resolution ("leiden_<resolution>");
# the metric columns are added when the individual resolutions are evaluated.
result_row_labels = ['leiden_' + resolution for resolution in leiden_resolutions]
acc_comparisons = pd.DataFrame(index=result_row_labels)

### Seurat label-transfer annotations

In [8]:
# Load the Seurat label-transfer table, give its three columns fixed names and
# index it by cell ID (the cell_IDs column is kept as a regular column as well).
seurat_integration_annotations = pd.read_csv(
    "../../../results/Fig4_SFig2-10X_PBMC_scmultiome/Fig4_SFig2-apply_seurat_label_transfer/10XPBMC_seurat_label_transfer_annotations.csv"
)
seurat_integration_annotations.columns = [
    "cell_IDs",
    "label_transfer_annotations",
    "ground_truth_annotations",
]
seurat_integration_annotations = seurat_integration_annotations.set_index("cell_IDs", drop=False)
seurat_integration_annotations.index.name = None
seurat_integration_annotations

Unnamed: 0,cell_IDs,label_transfer_annotations,ground_truth_annotations
AAACAGCCAAGGAATC-1,AAACAGCCAAGGAATC-1,CD4 T,CD4 T
AAACAGCCAATCCCTT-1,AAACAGCCAATCCCTT-1,CD8 T,CD4 T
AAACAGCCAATGCGCT-1,AAACAGCCAATGCGCT-1,CD8 T,CD4 T
AAACAGCCACCAACCG-1,AAACAGCCACCAACCG-1,Mono,CD8 T
AAACAGCCAGGATAAC-1,AAACAGCCAGGATAAC-1,CD4 T,CD4 T
...,...,...,...
TTTGTTGGTGACATGC-1,TTTGTTGGTGACATGC-1,CD8 T,CD8 T
TTTGTTGGTGTTAAAC-1,TTTGTTGGTGTTAAAC-1,CD8 T,CD8 T
TTTGTTGGTTAGGATT-1,TTTGTTGGTTAGGATT-1,NK,NK
TTTGTTGGTTGGTTAG-1,TTTGTTGGTTGGTTAG-1,CD4 T,CD4 T


In [9]:
# Label translation applied to the scATAcat_annotation column in the evaluation
# loop: keys are replaced by the corresponding values (note that 'pDC' is
# folded into 'DC').
renaming_dict = {
    'CD4Tcell': 'CD4 T',
    'CD8Tcell': 'CD8 T',
    'pDC': 'DC',
    'NKcell': 'NK',
    'Bcell': 'B',
}

In [10]:
# For every Leiden resolution: load the scATAcat annotations, join them with the
# Seurat label-transfer table on the shared cells, keep the rows whose ground
# truth label is known to all three annotation sources, and record the number
# of clusters plus accuracy / balanced accuracy / cluster accuracy of both
# methods in acc_comparisons.
for res in leiden_resolutions:
    row_label = 'leiden_' + res
    result_path = (
        "../../../results/Supplementary_tables/ST4-10X_PBMC_scmultiome_accuracy_acros_clustering_res/res"
        + res
        + "/outputs/scATAcat_annotations_leiden_"
        + res
        + ".csv"
    )

    scATAcat_annotations = pd.read_csv(result_path, index_col=0)
    scATAcat_annotations.index.name = None
    scATAcat_annotations.columns = [row_label, 'scATAcat_annotation', 'cell_IDs']
    # Translate scATAcat labels with renaming_dict.
    scATAcat_annotations = scATAcat_annotations.replace({"scATAcat_annotation": renaming_dict})

    # Restrict both tables to the cells they share, then join them on the cell ID.
    common_cells = list(set(scATAcat_annotations.index) & set(seurat_integration_annotations.index))
    scATAcat_labelTransfer_groundTruth_commonCells = pd.merge(
        scATAcat_annotations.loc[common_cells, :],
        seurat_integration_annotations.loc[common_cells, :],
        on="cell_IDs",
    )
    merged = scATAcat_labelTransfer_groundTruth_commonCells

    # Labels that occur in all three annotation columns; only rows whose ground
    # truth is one of them are evaluated.
    common_annotations = list(
        set(merged.scATAcat_annotation)
        & set(merged.ground_truth_annotations)
        & set(merged.label_transfer_annotations)
    )
    scATAcat_labelTransfer_groundTruth_commonCells_commonAnnotations = merged[
        merged.ground_truth_annotations.isin(common_annotations)
    ]
    evaluated = scATAcat_labelTransfer_groundTruth_commonCells_commonAnnotations

    # Cluster count is derived from the highest cluster id plus one.
    acc_comparisons.loc[row_label, 'num_clust'] = scATAcat_annotations[row_label].max() + 1

    # (column suffix, metric function, extra keyword arguments); the nesting
    # order below reproduces the column order Acc, BAcc, CAcc with scATAcat
    # before labelTransfer.
    metrics = (
        ('Acc', get_accuracy, {}),
        ('BAcc', get_balanced_accuracy, {}),
        ('CAcc', get_cluster_accuracy, {'cluster_id_column': row_label}),
    )
    methods = (
        ('scATAcat', 'scATAcat_annotation'),
        ('labelTransfer', 'label_transfer_annotations'),
    )
    for suffix, metric_fn, extra_kwargs in metrics:
        for method_name, prediction_col in methods:
            acc_comparisons.loc[row_label, method_name + '_' + suffix] = metric_fn(
                evaluated,
                correct_id_col='ground_truth_annotations',
                predicted_id_col=prediction_col,
                **extra_kwargs
            )

In [11]:
# Display the per-resolution accuracy table filled in by the evaluation loop.
# NOTE(review): the stored output lists only leiden_0.1 to leiden_1.0 although
# leiden_resolutions runs up to 2.5 — possibly stale output; re-run to confirm.
acc_comparisons

Unnamed: 0,num_clust,scATAcat_Acc,labelTransfer_Acc,scATAcat_BAcc,labelTransfer_BAcc,scATAcat_CAcc,labelTransfer_CAcc
leiden_0.1,7.0,0.837238,0.908849,0.746176,0.888393,0.857143,1.0
leiden_0.2,7.0,0.847785,0.908849,0.853023,0.888393,0.857143,1.0
leiden_0.3,10.0,0.838118,0.911537,0.766952,0.901593,0.8,1.0
leiden_0.4,11.0,0.838118,0.911537,0.766984,0.901593,0.818182,1.0
leiden_0.5,12.0,0.838013,0.911537,0.766897,0.901593,0.833333,1.0
leiden_0.6,12.0,0.837801,0.911537,0.766786,0.901593,0.833333,0.916667
leiden_0.7,13.0,0.837695,0.911537,0.766732,0.901593,0.846154,1.0
leiden_0.8,14.0,0.837801,0.911537,0.76682,0.901593,0.857143,1.0
leiden_0.9,15.0,0.837801,0.911537,0.76682,0.901593,0.857143,1.0
leiden_1.0,15.0,0.837801,0.911537,0.76682,0.901593,0.857143,1.0


In [12]:
# NOTE(review): this cell repeats the previous display of acc_comparisons and
# looks redundant — consider removing it.
acc_comparisons

Unnamed: 0,num_clust,scATAcat_Acc,labelTransfer_Acc,scATAcat_BAcc,labelTransfer_BAcc,scATAcat_CAcc,labelTransfer_CAcc
leiden_0.1,7.0,0.837238,0.908849,0.746176,0.888393,0.857143,1.0
leiden_0.2,7.0,0.847785,0.908849,0.853023,0.888393,0.857143,1.0
leiden_0.3,10.0,0.838118,0.911537,0.766952,0.901593,0.8,1.0
leiden_0.4,11.0,0.838118,0.911537,0.766984,0.901593,0.818182,1.0
leiden_0.5,12.0,0.838013,0.911537,0.766897,0.901593,0.833333,1.0
leiden_0.6,12.0,0.837801,0.911537,0.766786,0.901593,0.833333,0.916667
leiden_0.7,13.0,0.837695,0.911537,0.766732,0.901593,0.846154,1.0
leiden_0.8,14.0,0.837801,0.911537,0.76682,0.901593,0.857143,1.0
leiden_0.9,15.0,0.837801,0.911537,0.76682,0.901593,0.857143,1.0
leiden_1.0,15.0,0.837801,0.911537,0.76682,0.901593,0.857143,1.0


In [13]:
# Write the accuracy comparison table (index included) to the supplementary table CSV.
acc_comparisons.to_csv(
    '../../../results/Supplementary_tables/ST4-10X_PBMC_scmultiome_accuracy_acros_clustering_res/ST4-10X_PBMC_scmultiome_accuracy_acros_clustering_res.csv'
)