In [181]:
import pandas as pd
from sklearn import metrics

In [136]:
# Run configuration: dataset name, data transformation and clustering method.
# These three values select which result files are read further down.
dataset = "COVID"
datatype = "log2"  # other value noted by the author: "raw_counts"
clustertype = "gmm"  # other value noted by the author: "kmeans"

# Root folder with the input data.
datafolder = "C:\\Users\\amruk\\source\\enrichment-auc\\data\\"
# Results folder for the chosen dataset and data type (with trailing separator).
resfolder = "".join([
    "C:\\Users\\amruk\\source\\enrichment-auc\\results\\",
    dataset,
    "\\",
    datatype,
    "\\",
])
# Scoring method names; each one is used as a file-name stem inside resfolder.
scorenames = [
    "z",
    "gsva",
    "auc",
    "cerno",
    "aucell",
    "vision",
    "ratios",
    "svd",
    "sparse_pca",
]

## Get chosen pathways for cell types

In [91]:
# Tab-separated table of chosen pathways; the first column becomes the index.
chosen_paths_file = datafolder + "chosen_paths.txt"
paths = pd.read_csv(chosen_paths_file, sep="\t", index_col=0)
paths.head()

## Get dataset-specific pathways

In [93]:
# Gene-set table shipped with the selected dataset; first column is the index.
genesets_file = datafolder + dataset + "//genesets_modules.csv"
geneset_info = pd.read_csv(genesets_file, index_col=0)
geneset_info.head()

Unnamed: 0,ID,Title,source,DataBase
hsa04062,hsa04062,Chemokine signaling pathway,Organismal Systems; Immune system,KEGG
hsa04610,hsa04610,Complement and coagulation cascades,Organismal Systems; Immune system,KEGG
hsa04611,hsa04611,Platelet activation,Organismal Systems; Immune system,KEGG
hsa04612,hsa04612,Antigen processing and presentation,Organismal Systems; Immune system,KEGG
hsa04613,hsa04613,Neutrophil extracellular trap formation,Organismal Systems; Immune system,KEGG


## Keep only the pathways present in both lists

In [None]:
# Keep only the chosen pathways whose ID also appears in this dataset's gene sets.
# .copy() makes the filtered frame independent of `paths`, so the column
# assignment below modifies a real frame instead of a slice (which would
# raise SettingWithCopyWarning and may silently not stick).
dataset_specific = paths[paths["ID"].isin(geneset_info["ID"])].copy()
# Turn "A;B" into "A +B"-style labels; regex=False treats ';' as a literal
# character regardless of the pandas version's default.
dataset_specific["Celltype"] = dataset_specific["Celltype"].str.replace(
    ";", " +", regex=False
)

In [95]:
# Number of selected pathways per cell-type label.
celltype_groups = dataset_specific.groupby(['Celltype'])
celltype_groups.size()

Celltype
B cell              10
B cell + T cell      1
NK cell              1
NK cell + T cell     2
T cell              14
dtype: int64

In [186]:
# Result table that the scoring loop fills with one column per metric/method.
# Take an explicit copy: columns are later added with `to_save.loc[:, ...] = ...`,
# and writing into a bare column subset of `dataset_specific` triggers
# SettingWithCopyWarning.
to_save = dataset_specific[["ID", "Title", "Celltype"]].copy()

## Get true labels

In [164]:
# Per-cell reference annotations for the selected dataset; first column is the index.
true_labels_file = datafolder + dataset + "//true_labels.csv"
true_labels = pd.read_csv(true_labels_file, index_col=0)
true_labels.head()

Unnamed: 0,cell.ID,Cell.type.Ontology
SAMEA6979313-AAAGAACCACCTGCTT,SAMEA6979313-AAAGAACCACCTGCTT,T cell
SAMEA6979313-AAAGGATGTCCCTCAT,SAMEA6979313-AAAGGATGTCCCTCAT,monocyte
SAMEA6979313-AACAGGGAGATCACTC,SAMEA6979313-AACAGGGAGATCACTC,T cell
SAMEA6979313-AACCAACGTGTGAGCA,SAMEA6979313-AACCAACGTGTGAGCA,T cell
SAMEA6979313-AACCATGAGCAGCGAT,SAMEA6979313-AACCATGAGCAGCGAT,monocyte


Check the cell-type labels for `pro B cell` beforehand (ignore the index).

In [165]:
# Give the annotation column a name without dots, then list the labels present.
column_renames = {'Cell.type.Ontology': 'CellType'}
true_labels = true_labels.rename(columns=column_renames)
true_labels["CellType"].unique()

array(['T cell', 'monocyte', 'erythroid lineage cell', 'neutrophil',
       'platelet', 'B cell'], dtype=object)

In [99]:
# Collapse the annotations into the classes evaluated below:
# 'T cell', 'NK cell', 'B cell' and a catch-all 'other'.
t_cell_subtypes = ['CD4+ T cell', 'Cytotoxic T cell']
is_t_subtype = true_labels["CellType"].isin(t_cell_subtypes)
true_labels.loc[is_t_subtype, "CellType"] = 'T cell'

is_natural_killer = true_labels["CellType"].isin(['Natural killer cell'])
true_labels.loc[is_natural_killer, "CellType"] = 'NK cell'

# Must be computed after the two merges above, so merged labels are kept.
kept_classes = ['NK cell', 'T cell', 'B cell']
is_kept = true_labels["CellType"].isin(kept_classes)
true_labels.loc[~is_kept, "CellType"] = 'other'

true_labels["CellType"].unique()

array(['T cell', 'other', 'B cell'], dtype=object)

## Calculate classification scores

In [191]:
class Scores:
    """Accumulates several classification metrics over repeated evaluations.

    Attributes:
        scoring_methods: metric callables, each invoked as ``fn(y_true, y_pred)``.
        scores: one list per metric (same order as ``scoring_methods``);
            every call to ``get_classification_scores`` appends one value
            to each list.
        names: column-name prefix for each metric, in the same order.
    """

    def __init__(self):
        # (prefix, metric) pairs kept side by side so the three parallel
        # attributes below always share the same order.
        registry = (
            ("BAcc_", metrics.balanced_accuracy_score),
            ("ARI_", metrics.adjusted_rand_score),
            ("F1_", metrics.f1_score),
            ("Precision_", metrics.precision_score),
            ("Recall_", metrics.recall_score),
            ("Matthews_", metrics.matthews_corrcoef),
            ("Jaccard_", metrics.jaccard_score),
            ("Hamming_", metrics.hamming_loss),
        )
        self.scoring_methods = [metric_fn for _, metric_fn in registry]
        self.scores = [[] for _ in self.scoring_methods]
        self.names = [prefix for prefix, _ in registry]

    def get_classification_scores(self, y_true, y_pred):
        """Evaluate every metric on one (y_true, y_pred) pair.

        Each result is appended to the matching list in ``self.scores``;
        nothing is returned.
        """
        for slot in range(len(self.scoring_methods)):
            metric_fn = self.scoring_methods[slot]
            self.scores[slot].append(metric_fn(y_true, y_pred))
    
    

In [None]:
# For every scoring method: binarise each pathway's scores with its threshold,
# compare against the cell-type labels, and store one column per metric.
for scorename in scorenames:
    scores = pd.read_csv(resfolder + scorename + ".csv", index_col=0)
    thresholds = pd.read_csv(
        resfolder + scorename + "_" + clustertype + "_thr.csv",
        index_col=0)
    # Named `evaluator` (not `eval`) so the builtin eval() is not shadowed
    # for the rest of the kernel session.
    evaluator = Scores()
    for pathway_id, celltypes in zip(dataset_specific["ID"],
                                     dataset_specific["Celltype"]):
        gs_score = scores.loc[pathway_id]
        # Use the largest threshold stored for this pathway.
        thr = thresholds.loc[pathway_id].max()
        preds = (gs_score > thr).astype(int)
        # Positive class = cells whose type is one of the pathway's cell types.
        true_labels["label"] = true_labels.CellType.isin(
            celltypes.split(" + ")
            ).astype(int)
        # NOTE(review): the metrics compare the two sequences by position;
        # this assumes the score file lists cells in the same order as
        # true_labels - confirm.
        evaluator.get_classification_scores(true_labels["label"], preds)
    # One column per metric, named "<metric prefix><scoring method>".
    for name, cls_score in zip(evaluator.names, evaluator.scores):
        to_save.loc[:, name + scorename] = cls_score

to_save.to_csv(resfolder + "classification_scores.csv")