In [1]:
import pandas as pd
import numpy as np

In [2]:
def get_accuracy(annotation_df, correct_id_col='real_cellID', predicted_id_col='predicted_cellID'):
    num_correct_ann= annotation_df[annotation_df[correct_id_col]==annotation_df[predicted_id_col]].shape[0]
    acc=num_correct_ann/annotation_df.shape[0]
    print("accuracy:")
    return acc

In [3]:
def get_balanced_accuracy(annotation_df, correct_id_col='real_cellID', predicted_id_col='predicted_cellID'):
    ann=[]
    for cell_type in sorted(set(annotation_df[correct_id_col])):
        cell_type_sub = annotation_df[annotation_df[correct_id_col]==cell_type]
        print(cell_type)
        num_cells =cell_type_sub.shape[0]
        print("predicted cell ID")
        num_correct_ann = cell_type_sub[cell_type_sub[predicted_id_col]==cell_type].shape[0]
        #print(cell_type_sub.predicted_cellID.value_counts())
        perc=(num_correct_ann/num_cells)
        print(perc)
        ann.append(perc)
    balanced_accuracy= np.sum(ann)/len(ann)
    print("balanced accuracy:")
    print(balanced_accuracy)
    return balanced_accuracy

In [4]:
def get_cluster_accuracy(annotation_df, cluster_id_column ='clust',correct_id_col='real_cellID', predicted_id_col='predicted_cellID'):
    correc_ann_counter=0
    annotation_df = annotation_df.groupby(cluster_id_column).filter(lambda x: len(x) > 10)
    cluster_ids =sorted(set(annotation_df[cluster_id_column]))
    for cluster_id in cluster_ids:
        cluster_sub = annotation_df[annotation_df[cluster_id_column]==cluster_id]   
        clust_real_id_counts = cluster_sub[correct_id_col].value_counts()
        # sort fist by number then by index, then get the cell type with the highest occurance. This takes care of ties
        clust_real_id = clust_real_id_counts.iloc[np.lexsort((clust_real_id_counts.index, -clust_real_id_counts.values))].index[0]
        clust_predicted_id_counts = cluster_sub[predicted_id_col].value_counts()
        # sort fist by number then by index, then get the cell type with the highest occurance. This takes care of ties
        clust_predicted_id = clust_predicted_id_counts.iloc[np.lexsort((clust_predicted_id_counts.index, -clust_predicted_id_counts.values))].index[0]
        if clust_real_id == clust_predicted_id:
            correc_ann_counter += 1
        else:
            continue
    clust_accuracy= correc_ann_counter/len(cluster_ids)
    print("cluster accuracy:")
    return clust_accuracy

In [18]:
# reformat the data

In [59]:
Cellcano_annotations = pd.read_csv("/project/scATAC_analysis/scATAcat_review/benchmarking/Cellcano/apply_BM/output_Buenrostro2018_FACS_BM/predict_Buenrostro2018_FACS_BMcelltypes.csv")

In [60]:
Cellcano_annotations[['prefix', 'cell_IDs']] = Cellcano_annotations.iloc[:,0].str.split("#", n=1, expand=True)
Cellcano_annotations = Cellcano_annotations[["cell_IDs", "pred_celltype"]]
Cellcano_annotations.columns = ["cell_IDs", "Cellcano_annotations"]
Cellcano_annotations["ground_truth_annotations"] = Cellcano_annotations["cell_IDs"].str.split("_", n=1, expand=True)[0]
Cellcano_annotations.index =Cellcano_annotations["cell_IDs"]
Cellcano_annotations.index.name = None
Cellcano_annotations.head()

Unnamed: 0,cell_IDs,Cellcano_annotations,ground_truth_annotations
MEP_150,MEP_150,MEP,MEP
CMP_239,CMP_239,Mono1,CMP
HSC_284,HSC_284,MDP,HSC
GMP_285,GMP_285,MDP,GMP
CMP_164,CMP_164,MDP,CMP


In [61]:
Cellcano_annotations_ = ['Mono1', 'MDP', 'N-CD8T2', 'Mat-NK2', 'MEP', 'EM-CDT8', 'Naive-B','Pre-B', 'CM-CD8T', 'Mono2', 'Plasma-B', 'cDC', 'GDelta-T', 'pDC','LMPP', 'GMP', 'Memory-B', 'HSC/MPP', 'Pro-B', 'Mat-NK1']
Cellcano_annotations_simplified  =[ "Mono","GMP","CD8Tcell","NKcell","MEP","CD8Tcell","Bcell","CLP","CD8Tcell","Mono","Bcell","cDC","gdT","pDC","LMPP","GMP","Bcell","HSC/MPP","CLP","NKcell"]

Cellcano_annotations_simplified_dict = dict(zip(Cellcano_annotations_, Cellcano_annotations_simplified))

print(Cellcano_annotations_simplified_dict)


{'Mono1': 'Mono', 'MDP': 'GMP', 'N-CD8T2': 'CD8Tcell', 'Mat-NK2': 'NKcell', 'MEP': 'MEP', 'EM-CDT8': 'CD8Tcell', 'Naive-B': 'Bcell', 'Pre-B': 'CLP', 'CM-CD8T': 'CD8Tcell', 'Mono2': 'Mono', 'Plasma-B': 'Bcell', 'cDC': 'cDC', 'GDelta-T': 'gdT', 'pDC': 'pDC', 'LMPP': 'LMPP', 'GMP': 'GMP', 'Memory-B': 'Bcell', 'HSC/MPP': 'HSC/MPP', 'Pro-B': 'CLP', 'Mat-NK1': 'NKcell'}


In [63]:

Cellcano_annotations= Cellcano_annotations.assign(Cellcano_annotations_simplified=list(Cellcano_annotations['Cellcano_annotations'].map(Cellcano_annotations_simplified_dict)))
Cellcano_annotations.head()

Unnamed: 0,cell_IDs,Cellcano_annotations,ground_truth_annotations,Cellcano_annotations_simplified
MEP_150,MEP_150,MEP,MEP,MEP
CMP_239,CMP_239,Mono1,CMP,Mono
HSC_284,HSC_284,MDP,HSC,GMP
GMP_285,GMP_285,MDP,GMP,GMP
CMP_164,CMP_164,MDP,CMP,GMP


## accuracy metrics for scATAcat:

In [64]:
get_accuracy(Cellcano_annotations, correct_id_col='ground_truth_annotations', predicted_id_col='Cellcano_annotations_simplified')

accuracy:


0.09653333333333333

In [65]:
get_balanced_accuracy(Cellcano_annotations, correct_id_col='ground_truth_annotations', predicted_id_col='Cellcano_annotations_simplified')

CLP
predicted cell ID
0.1388888888888889
CMP
predicted cell ID
0.0
GMP
predicted cell ID
0.14566929133858267
HSC
predicted cell ID
0.0
LMPP
predicted cell ID
0.0
MEP
predicted cell ID
0.675392670157068
MPP
predicted cell ID
0.0
balanced accuracy:
0.13713583576921995


0.13713583576921995

In [58]:
get_cluster_accuracy(Cellcano_annotations,cluster_id_column =leiden_key, correct_id_col='ground_truth_annotations', predicted_id_col='Cellcano_annotations_simplified')

NameError: name 'leiden_key' is not defined