In [27]:
import pandas as pd
import numpy as np

In [28]:
def get_accuracy(annotation_df, correct_id_col='real_cellID', predicted_id_col='predicted_cellID'):
    """Return the fraction of rows whose predicted label equals the true label.

    Parameters
    ----------
    annotation_df : pandas.DataFrame
        One row per annotated item; must contain both label columns.
    correct_id_col : str
        Name of the column holding the reference labels.
    predicted_id_col : str
        Name of the column holding the predicted labels.

    Returns
    -------
    float
        Number of matching rows divided by the total number of rows.

    Notes
    -----
    Prints the line ``accuracy:`` before returning, so the value shown as the
    cell result is labelled in the notebook output.
    """
    # Row-wise comparison of the two label columns.
    is_match = annotation_df[correct_id_col] == annotation_df[predicted_id_col]
    n_matching = len(annotation_df[is_match])
    n_total = len(annotation_df)
    score = n_matching / n_total
    print("accuracy:")
    return score

In [29]:
def get_balanced_accuracy(annotation_df, correct_id_col='real_cellID', predicted_id_col='predicted_cellID'):
    """Return the unweighted mean of the per-class accuracies.

    For every distinct value in ``correct_id_col`` (visited in sorted order)
    the fraction of that class's rows whose ``predicted_id_col`` equals the
    class label is computed; the result is the plain average of those
    fractions, so every class counts equally regardless of its size.

    Parameters
    ----------
    annotation_df : pandas.DataFrame
        One row per annotated item; must contain both label columns.
    correct_id_col : str
        Name of the column holding the reference labels.
    predicted_id_col : str
        Name of the column holding the predicted labels.

    Returns
    -------
    float
        Mean of the per-class fractions.

    Notes
    -----
    Prints, for each class, the class label, the line ``predicted cell ID``
    and the class's fraction, followed by ``balanced accuracy:`` and the
    final value.
    """
    per_class_scores = []
    class_labels = list(set(annotation_df[correct_id_col]))
    class_labels.sort()
    for label in class_labels:
        # All rows whose reference label is the current class.
        members = annotation_df[annotation_df[correct_id_col] == label]
        print(label)
        class_size = len(members)
        print("predicted cell ID")
        hits = len(members[members[predicted_id_col] == label])
        class_score = hits / class_size
        print(class_score)
        per_class_scores.append(class_score)
    score_total = np.sum(per_class_scores)
    balanced_accuracy = score_total / len(per_class_scores)
    print("balanced accuracy:")
    print(balanced_accuracy)
    return balanced_accuracy

In [30]:
def _majority_label(labels):
    """Return the most frequent value in ``labels``; ties go to the smallest label.

    Missing values are not counted (``value_counts`` drops them by default).
    The explicit tie-break makes the result independent of row order.
    """
    counts = labels.value_counts()
    # np.lexsort uses the LAST key as the primary one: order by descending
    # count first, then by label, so position 0 is the deterministic winner.
    order = np.lexsort((counts.index, -counts.values))
    return counts.index[order[0]]


def get_cluster_accuracy(annotation_df, cluster_id_column='clust', correct_id_col='real_cellID', predicted_id_col='predicted_cellID', small_cluster_cutoff=10):
    """Return the fraction of clusters whose majority predicted label is correct.

    A cluster counts as correctly annotated when the most frequent value of
    ``predicted_id_col`` inside the cluster equals the most frequent value of
    ``correct_id_col`` inside the same cluster (ties are resolved towards the
    smallest label in both columns).

    Parameters
    ----------
    annotation_df : pandas.DataFrame
        One row per cell; must contain the cluster column and both label
        columns. The frame is not modified.
    cluster_id_column : str
        Name of the column holding the cluster assignment of each row.
    correct_id_col : str
        Name of the column holding the reference labels.
    predicted_id_col : str
        Name of the column holding the predicted labels.
    small_cluster_cutoff : int, default 10
        Clusters with this many rows or fewer are excluded from the score;
        only clusters with strictly more rows are evaluated.

    Returns
    -------
    float
        Number of correctly annotated clusters divided by the number of
        evaluated clusters.

    Raises
    ------
    ZeroDivisionError
        If no cluster has more than ``small_cluster_cutoff`` rows, in which
        case the score is undefined.

    Notes
    -----
    Prints the line ``cluster accuracy:`` before returning.
    """
    # Drop small clusters; their majority label is too noisy to be meaningful.
    kept = annotation_df.groupby(cluster_id_column).filter(lambda grp: len(grp) > small_cluster_cutoff)
    cluster_ids = sorted(set(kept[cluster_id_column]))
    if not cluster_ids:
        # Same exception type the bare division used to raise, but with a
        # message that says what went wrong.
        raise ZeroDivisionError(
            f"no cluster in column {cluster_id_column!r} has more than "
            f"{small_cluster_cutoff} rows; cluster accuracy is undefined"
        )

    n_correct = 0
    for cluster_id in cluster_ids:
        cluster_sub = kept[kept[cluster_id_column] == cluster_id]
        if _majority_label(cluster_sub[correct_id_col]) == _majority_label(cluster_sub[predicted_id_col]):
            n_correct += 1

    clust_accuracy = n_correct / len(cluster_ids)
    print("cluster accuracy:")
    return clust_accuracy

In [31]:
# Root folder of the results for this figure group; the annotation CSV path
# in the load cell is built by string-concatenating a sub-path onto it.
# NOTE(review): this value ends with "/" and the appended sub-path starts
# with "/", so the final path contains "//" — presumably harmless on POSIX
# file systems, confirm if this notebook is run elsewhere.
data_dir = "../../../results/Fig1_Fig2_Fig3_SFig1-FACS_BM_scATAC/"

# Column name given to the cluster-assignment column of the annotation table
# and later passed as `cluster_id_column`.
# Presumably Leiden clustering at resolution 0.65 — confirm against the upstream run.
leiden_key = 'leiden_0.65'

In [32]:
# Read the per-cell annotation table written by the scATAcat feasibility run.
scATAcat_annotations = pd.read_csv(
    data_dir
    + "/Fig1_SFig1-apply_scATAcat/apply_scATAcat_feasibility_study/outputs/scATAcat_annotations.csv"
)
# Columns are renamed positionally, so the CSV must have exactly these four
# columns in this order.
scATAcat_annotations.columns = ['cell_IDs', leiden_key, 'scATAcat_annotation', 'ground_truth_annotations']
# Index the rows by cell ID while keeping `cell_IDs` as a regular column,
# and leave the index unnamed.
scATAcat_annotations = scATAcat_annotations.set_index('cell_IDs', drop=False).rename_axis(None)
scATAcat_annotations.head()

Unnamed: 0,cell_IDs,leiden_0.65,scATAcat_annotation,ground_truth_annotations
CLP_1,CLP_1,6,CLP,CLP
CMP_0,CMP_0,1,CMP,CMP
CMP_1,CMP_1,1,CMP,CMP
CMP_2,CMP_2,2,CMP,CMP
CMP_3,CMP_3,4,MEP,CMP


In [33]:
# Sanity check: (number of annotated cells, number of columns).
scATAcat_annotations.shape

(1872, 4)

## Accuracy metrics for scATAcat

In [34]:
# Overall accuracy: fraction of cells whose scATAcat label equals the ground-truth label.
get_accuracy(scATAcat_annotations, correct_id_col='ground_truth_annotations', predicted_id_col='scATAcat_annotation')

accuracy:


0.811965811965812

In [35]:
# Balanced accuracy: per-ground-truth-type accuracies (printed one by one) averaged with equal weight per type.
get_balanced_accuracy(scATAcat_annotations, correct_id_col='ground_truth_annotations', predicted_id_col='scATAcat_annotation')

CLP
predicted cell ID
0.9629629629629629
CMP
predicted cell ID
0.8697674418604651
GMP
predicted cell ID
0.8181818181818182
HSC
predicted cell ID
0.9215686274509803
LMPP
predicted cell ID
0.925531914893617
MEP
predicted cell ID
0.9736842105263158
MPP
predicted cell ID
0.0
balanced accuracy:
0.7816709965537371


0.7816709965537371

In [36]:
# Cluster-level accuracy: for each cluster in `leiden_key` with more than 10 cells, compare the most
# frequent ground-truth label with the most frequent scATAcat label.
get_cluster_accuracy(scATAcat_annotations,cluster_id_column =leiden_key, correct_id_col='ground_truth_annotations', predicted_id_col='scATAcat_annotation')

cluster accuracy:


1.0