In [1]:
import numpy as np
import pandas as pd

In [2]:
from enrichment_auc.plot.plot_boxplots import visualize_methods, visualize_difference
from enrichment_auc.evaluate_classification import Scores

In [3]:
dataset = "Liver"
datatype = "log2" # "raw_counts"
clustertype = "gmm" # "kmeans"
plottype = clustertype
if clustertype != 'kmeans':
    plottype = "top1"

datafolder =  "C:\\Users\\amruk\\source\\enrichment-auc\\data\\"
resfolder =  "C:\\Users\\amruk\\source\\enrichment-auc\\results\\"+dataset+"\\"+datatype+"\\"
plotfolder =  "C:\\Users\\amruk\\source\\enrichment-auc\\plots\\"+dataset+"\\"+datatype+"\\"+plottype+"\\"+"classification\\"

In [None]:
scorenames = [
    "z",
    "gsva",
    "auc",
    "cerno",
    "aucell",
    "vision",
    "ratios",
    "svd",
    "sparse_pca",
    # "vae",
]


In [4]:
names = ["Balanced_accuracy_",
        "ARI_",
        "F1_",
        "Recall_",
        "Matthews_",
        "Jaccard_",
        "Hamming_",
        "Precision_",
        "FDR_"]

## Get chosen pathways for cell types

In [4]:
paths = pd.read_csv(datafolder+"chosen_paths.txt", sep='\t', index_col=0)
paths.head()

Unnamed: 0,ID,Title,Category,Database,GSsize,GenesInGS,Perc,Celltype,Celltype_unclear
hsa04062,hsa04062,KEGG: Chemokine signaling pathway,KEGG: Organismal Systems; Immune system,KEGG,192,139,72.395833,T cell,no
hsa04610,hsa04610,KEGG: Complement and coagulation cascades,KEGG: Organismal Systems; Immune system,KEGG,85,46,54.117647,B cell; T cell,no
hsa04612,hsa04612,KEGG: Antigen processing and presentation,KEGG: Organismal Systems; Immune system,KEGG,78,66,84.615385,NK cell; T cell,no
hsa04650,hsa04650,KEGG: Natural killer cell mediated cytotoxicity,KEGG: Organismal Systems; Immune system,KEGG,131,94,71.755725,NK cell,no
hsa04657,hsa04657,KEGG: IL-17 signaling pathway,KEGG: Organismal Systems; Immune system,KEGG,94,63,67.021277,NK cell; T cell,no


## Get dataset specific pathways

In [4]:
geneset_info = pd.read_csv(datafolder + dataset +"//genesets_modules.csv", index_col=0)
geneset_info.head()

Unnamed: 0,ID,Title,source,DataBase
hsa04062,hsa04062,Chemokine signaling pathway,Organismal Systems; Immune system,KEGG
hsa04610,hsa04610,Complement and coagulation cascades,Organismal Systems; Immune system,KEGG
hsa04611,hsa04611,Platelet activation,Organismal Systems; Immune system,KEGG
hsa04612,hsa04612,Antigen processing and presentation,Organismal Systems; Immune system,KEGG
hsa04613,hsa04613,Neutrophil extracellular trap formation,Organismal Systems; Immune system,KEGG


## Choose the ones in both

In [6]:
dataset_specific = paths[paths["ID"].isin(geneset_info["ID"])]
dataset_specific.Celltype = dataset_specific['Celltype'].str.replace(';',' +')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_specific.Celltype = dataset_specific['Celltype'].str.replace(';',' +')


In [7]:
to_save = dataset_specific[["ID", "Title", "Celltype"]]

## Get true labels

In [10]:
true_labels = pd.read_csv(datafolder + dataset +"//true_labels.csv", index_col=0)
true_labels.head()

Unnamed: 0,ID,Cell.type.authors
2,SAMEA11294524-AAACCTGAGTCTCCTC,hepatocyte
3,SAMEA11294524-AAACCTGCAATGTAAG,monocyte-derived macrophage
4,SAMEA11294524-AAACCTGCACCGATAT,T cell
6,SAMEA11294524-AAACCTGCAGTCAGAG,liver sinusoidal endothelial cell
7,SAMEA11294524-AAACCTGGTATCAGTC,T cell


In [None]:
true_labels.set_index('NAME', inplace=True)

In [11]:
true_labels.set_index('ID', inplace=True)

In [12]:
true_labels = true_labels.rename(columns={'Cell.type.Ontology': 'CellType'})
true_labels = true_labels.rename(columns={'Cell.type.authors': 'CellType'})
true_labels.CellType.unique()

array(['hepatocyte', 'monocyte-derived macrophage', 'T cell',
       'liver sinusoidal endothelial cell', 'natural killer cell',
       'cycling cell', 'plasma cell', 'B cell', 'cholangiocyte',
       'Kupffer cell', 'vascular endothelial cell 1',
       'vascular endothelial cell 2', 'activated HSC', 'quiescent HSC'],
      dtype=object)

In [13]:
not_pre_B = ~true_labels["CellType"].isin(['precursor B cell',
                                                         'pro-B cell'])
true_labels = true_labels[not_pre_B]

In [14]:
true_labels.loc[true_labels["CellType"].isin(['CD4+ T cell',
                                              'Cytotoxic T cell']), "CellType"] = 'T cell'
true_labels.loc[true_labels["CellType"].isin(['mature B cell']), "CellType"] = 'B cell'
true_labels.loc[true_labels["CellType"].isin(['Natural killer cell', 'natural killer cell']), "CellType"] = 'NK cell'
true_labels.loc[~true_labels["CellType"].isin(['NK cell',
                                               'T cell',
                                               'B cell']), "CellType"] = 'other'
true_labels.CellType.unique()

array(['other', 'T cell', 'NK cell', 'B cell'], dtype=object)

## Calculate classification scores

In [16]:
to_save = to_save[to_save.Celltype.str.contains('|'.join(true_labels.CellType.unique()))]

In [17]:
for scorename in scorenames:
    scores = pd.read_csv(resfolder+ scorename + ".csv", index_col=0)
    scores = scores.loc[:, not_pre_B]
    thresholds = pd.read_csv(
        resfolder+ scorename +"_" + clustertype + "_thr" ".csv",
        index_col=0)
    eval = Scores()
    for index, row in to_save.iterrows():
        gs_score = scores.loc[row["ID"]]
        thr = thresholds.loc[row["ID"]].max()
        preds = gs_score > thr
        preds = preds.astype(int)
        true_labels["label"] = true_labels.CellType.isin(
            row["Celltype"].split(" + ")
            ).astype(int)
        eval.get_classification_scores(true_labels["label"], preds)
        eval.save_confusion_matrix(true_labels["label"], preds,
                                   resfolder, plottype, scorename, row["ID"])
    for i, cls_score in enumerate(eval.scores):
        to_save.loc[:, eval.names[i]+scorename] = cls_score

to_save.to_csv(resfolder+"classification_scores_"+plottype+".csv")

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

## Plots

### COVID

In [5]:
plotfolder =  "C:\\Users\\amruk\\source\\enrichment-auc\\plots\\COVID\\"+datatype+"\\"+plottype+"\\"+"classification\\"
resfolder =  "C:\\Users\\amruk\\source\\enrichment-auc\\results\\COVID\\"+datatype+"\\"
covid_df = pd.read_csv(resfolder+"classification_scores_"+plottype+".csv", index_col=0)
groups = covid_df.groupby(["Celltype"])["Celltype"].count()
groups

Celltype
B cell              10
B cell + T cell      1
NK cell + T cell     2
T cell              14
Name: Celltype, dtype: int64

In [6]:
celltypes = groups[groups > 2].keys().tolist()
visualize_methods(covid_df, celltypes, names, plotfolder)

In [31]:
resfolder =  "C:\\Users\\amruk\\source\\enrichment-auc\\results\\COVID\\"+datatype+"\\"
covid_kmeans_df = pd.read_csv(resfolder+"classification_scores_kmeans.csv", index_col=0)
covid_top1_df = pd.read_csv(resfolder+"classification_scores_top1.csv", index_col=0)

cols = ["ID", "Title", "Celltype"]
covid_top1_df.drop(columns=cols, inplace=True)
covid_kmeans_df.drop(columns=cols, inplace=True)

### BM

In [10]:
plotfolder =  "C:\\Users\\amruk\\source\\enrichment-auc\\plots\\BM\\"+datatype+"\\"+plottype+"\\"+"classification\\"
resfolder =  "C:\\Users\\amruk\\source\\enrichment-auc\\results\\BM\\"+datatype+"\\"
bm_df = pd.read_csv(resfolder+"classification_scores_"+plottype+".csv", index_col=0)
groups = bm_df.groupby(["Celltype"])["Celltype"].count()
groups

Celltype
B cell              9
B cell + T cell     1
NK cell             6
NK cell + T cell    3
Name: Celltype, dtype: int64

In [11]:
celltypes = groups[groups > 2].keys().tolist()
visualize_methods(bm_df, celltypes, names, plotfolder)

In [32]:
resfolder =  "C:\\Users\\amruk\\source\\enrichment-auc\\results\\BM\\"+datatype+"\\"
bm_kmeans_df = pd.read_csv(resfolder+"classification_scores_kmeans.csv", index_col=0)
bm_top1_df = pd.read_csv(resfolder+"classification_scores_top1.csv", index_col=0)

cols = ["ID", "Title", "Celltype"]
bm_top1_df.drop(columns=cols, inplace=True)
bm_kmeans_df.drop(columns=cols, inplace=True)

### PBMC

In [12]:
plotfolder =  "C:\\Users\\amruk\\source\\enrichment-auc\\plots\\PBMC\\"+datatype+"\\"+plottype+"\\"+"classification\\"
resfolder =  "C:\\Users\\amruk\\source\\enrichment-auc\\results\\PBMC\\"+datatype+"\\"
pbmc_df = pd.read_csv(resfolder+"classification_scores_"+plottype+".csv", index_col=0)
groups = pbmc_df.groupby(["Celltype"])["Celltype"].count()
groups

Celltype
B cell              10
B cell + T cell      1
NK cell              7
NK cell + T cell     4
T cell              14
Name: Celltype, dtype: int64

In [13]:
celltypes = groups[groups > 2].keys().tolist()
visualize_methods(pbmc_df, celltypes, names, plotfolder)

In [33]:
resfolder =  "C:\\Users\\amruk\\source\\enrichment-auc\\results\\PBMC\\"+datatype+"\\"
pbmc_kmeans_df = pd.read_csv(resfolder+"classification_scores_kmeans.csv", index_col=0)
pbmc_top1_df = pd.read_csv(resfolder+"classification_scores_top1.csv", index_col=0)

cols = ["ID", "Title", "Celltype"]
pbmc_top1_df.drop(columns=cols, inplace=True)
pbmc_kmeans_df.drop(columns=cols, inplace=True)

### Liver

In [14]:
plotfolder =  "C:\\Users\\amruk\\source\\enrichment-auc\\plots\\Liver\\"+datatype+"\\"+plottype+"\\"+"classification\\"
resfolder =  "C:\\Users\\amruk\\source\\enrichment-auc\\results\\Liver\\"+datatype+"\\"
liver_df = pd.read_csv(resfolder+"classification_scores_"+plottype+".csv", index_col=0)
groups = liver_df.groupby(["Celltype"])["Celltype"].count()
groups

Celltype
B cell              10
B cell + T cell      1
NK cell              6
NK cell + T cell     4
T cell              12
Name: Celltype, dtype: int64

In [15]:
celltypes = groups[groups > 2].keys().tolist()
visualize_methods(liver_df, celltypes, names, plotfolder)

In [34]:
resfolder =  "C:\\Users\\amruk\\source\\enrichment-auc\\results\\Liver\\"+datatype+"\\"
liver_kmeans_df = pd.read_csv(resfolder+"classification_scores_kmeans.csv", index_col=0)
liver_top1_df = pd.read_csv(resfolder+"classification_scores_top1.csv", index_col=0)

cols = ["ID", "Title", "Celltype"]
liver_top1_df.drop(columns=cols, inplace=True)
liver_kmeans_df.drop(columns=cols, inplace=True)

### Merged

In [16]:
plotfolder =  "C:\\Users\\amruk\\source\\enrichment-auc\\plots\\merged\\"+datatype+"\\"+plottype+"\\"

merged = pd.concat([covid_df, pbmc_df, liver_df, bm_df])
groups = merged.groupby(["Celltype"])["Celltype"].count()
groups

Celltype
B cell              39
B cell + T cell      4
NK cell             19
NK cell + T cell    13
T cell              40
Name: Celltype, dtype: int64

In [17]:
celltypes = groups[groups > 2].keys().tolist()
visualize_methods(merged, celltypes, names, plotfolder)

In [35]:
plotfolder =  "C:\\Users\\amruk\\source\\enrichment-auc\\plots\\merged\\"+datatype+"\\"
merged_top1 = pd.concat([covid_top1_df, pbmc_top1_df, liver_top1_df, bm_top1_df])
merged_kmeans = pd.concat([covid_kmeans_df, pbmc_kmeans_df, liver_kmeans_df, bm_kmeans_df])
visualize_difference(merged_top1, merged_kmeans, names, plotfolder)