In [1]:
import pandas as pd
import numpy as np
from numpy import genfromtxt
from source import metrics
import source.model as m
import source.plot_distributed_data as dens
import source.plot_scatter as sc
from tqdm import tqdm

# Get data

## Load genesets

In [2]:
# Read the gene-list file as raw text lines; a later cell pairs line i with
# row i of genesets_modules.
with open('data/genesets_genes.txt') as file:
    lines = list(file)

In [3]:
# Gene-set metadata table; its "Title" column supplies the gene-set names used below.
genesets_modules = pd.read_csv('data/genesets_modules.csv')

In [4]:
# Preview the first rows of the metadata table.
genesets_modules.head()

Unnamed: 0.1,Unnamed: 0,ID,Title,Category,Database
0,hsa04062,hsa04062,KEGG: Chemokine signaling pathway,KEGG: Organismal Systems; Immune system,KEGG
1,hsa04610,hsa04610,KEGG: Complement and coagulation cascades,KEGG: Organismal Systems; Immune system,KEGG
2,hsa04611,hsa04611,KEGG: Platelet activation,KEGG: Organismal Systems; Immune system,KEGG
3,hsa04612,hsa04612,KEGG: Antigen processing and presentation,KEGG: Organismal Systems; Immune system,KEGG
4,hsa04613,hsa04613,KEGG: Neutrophil extracellular trap formation,KEGG: Organismal Systems; Immune system,KEGG


In [5]:
# Mapping from gene-set title to the list of fields parsed from its line in the
# genes file; filled by the loop in the next cell.
genesets = {}

In [6]:
# Line i of the genes file belongs to the gene set whose "Title" sits in row i
# of the metadata table, so the titles are pulled out once and indexed by position.
titles = genesets_modules["Title"].tolist()
for i, line in enumerate(lines):
    # NOTE(review): [:-1] discards the last comma-separated field — presumably the
    # newline left after a trailing comma; confirm against the file format.
    fields = line.split(",")
    genesets[titles[i]] = fields[:-1]

In [7]:
# Gene-set titles in insertion order; the plotting loop uses the position in this
# list as the row index into every score matrix.
gs_names = list(genesets.keys())

## Load gene expressions

In [8]:
# Expression table; the first CSV column becomes the row index. Per the preview
# below, rows are labelled with gene symbols and columns with cell barcodes.
gene_expressions = pd.read_csv('data/filtered_unique_data.csv', index_col=0)
gene_expressions.head()

Unnamed: 0,pbmc1_10x_v2_A_AAAGATGCAAAGTCAA,pbmc1_10x_v2_A_AAAGCAAGTAGGAGTC,pbmc1_10x_v2_A_AAAGCAATCGGTTCGG,pbmc1_10x_v2_A_AAAGTAGTCATTTGGG,pbmc1_10x_v2_A_AAAGTAGTCCGAGCCA,pbmc1_10x_v2_A_AAATGCCGTGGCAAAC,pbmc1_10x_v2_A_AACACGTCAGGTCCAC,pbmc1_10x_v2_A_AACACGTCATCACGTA,pbmc1_10x_v2_A_AACCATGTCATATCGG,pbmc1_10x_v2_A_AACTCAGAGTACGCCC,...,pbmc1_10x_v2_A_CGCTGGAAGTGAATTG,pbmc1_10x_v2_A_CTACCCAAGTGTACCT,pbmc1_10x_v2_A_GACCAATCAGTTCATG,pbmc1_10x_v2_A_GGGCACTAGCTGCGAA,pbmc1_10x_v2_A_GGTATTGTCACTCTTA,pbmc1_10x_v2_A_GTGTGCGAGCGCTCCA,pbmc1_10x_v2_A_TACTCATTCACATACG,pbmc1_10x_v2_A_TCCACACAGTACACCT,pbmc1_10x_v2_A_TGATTTCCAGACGCAA,pbmc1_10x_v2_A_TTAGTTCAGAGCTTCT
TSPAN6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
DPM1,0,0,0,1,0,0,0,0,0,0,...,1,0,1,0,0,0,1,1,0,1
SCYL3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
C1orf112,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
FGR,0,0,0,1,0,0,1,2,0,0,...,0,0,0,0,0,1,0,0,0,0


In [9]:
# Save the column labels now: the next cell replaces the table with a bare array.
patients_names = gene_expressions.columns.to_list()

In [10]:
# Rebind the name to a plain float array. Row/column labels are dropped, so this
# cell (and the one above) cannot be re-run without reloading the CSV first.
gene_expressions = gene_expressions.to_numpy().astype(float)

In [11]:
# Confirm the conversion produced a floating-point array.
gene_expressions.dtype

dtype('float64')

## Load gene names

In [12]:
# Table of gene names; the names themselves live in column "x" (see preview).
gene_names = pd.read_csv('data/gene_names.csv', index_col=0)
gene_names.head()

Unnamed: 0,x
1,TSPAN6
2,DPM1
3,SCYL3
4,C1orf112
5,FGR


In [13]:
# Column "x" as a plain list; passed to every metrics.* call alongside the
# expression data.
genes = gene_names['x'].tolist()

## Load t-SNE

In [14]:
# Precomputed 2-D t-SNE embedding (columns V1 and V2 in the preview below).
tsne = pd.read_csv('data/tsne.csv', index_col=0)
tsne.head()

Unnamed: 0,V1,V2
1,-21.497219,18.638739
2,-27.86457,19.0615
3,-8.49045,27.334528
4,-30.854176,11.695693
5,-19.002418,26.384748


In [15]:
# Drop the labels and transpose, so each row holds one embedding coordinate
# (the shape check below shows (2, n_points)). Rebinding `tsne` means this cell
# cannot be re-run without reloading the CSV.
tsne = tsne.to_numpy().T

In [16]:
# Sanity check: two coordinate rows, one column per embedded point.
tsne.shape

(2, 3222)

## Load true labels

In [17]:
# Reference annotations; the preview below shows a NAME and a CellType column.
true_labels = pd.read_csv('data/true_labels.csv', index_col=0)
true_labels.head()

Unnamed: 0,NAME,CellType
507,pbmc1_10x_v2_A_AAAGATGCAAAGTCAA,CD14+ monocyte
508,pbmc1_10x_v2_A_AAAGCAAGTAGGAGTC,CD14+ monocyte
509,pbmc1_10x_v2_A_AAAGCAATCGGTTCGG,CD14+ monocyte
510,pbmc1_10x_v2_A_AAAGTAGTCATTTGGG,CD14+ monocyte
511,pbmc1_10x_v2_A_AAAGTAGTCCGAGCCA,CD14+ monocyte


In [18]:
# Keep only the "CellType" column, as a plain list. Rebinding `true_labels` means
# this cell cannot be re-run without reloading the CSV.
true_labels = true_labels["CellType"].tolist()

# Get scores

## Ratio

In [19]:
# Score computed by metrics.calculate_ratios (implementation not shown here);
# per the shape check below, the result has one row per gene set.
ratio = metrics.calculate_ratios(genesets, gene_expressions, genes)

In [20]:
# Sanity check: (number of gene sets, number of expression columns).
ratio.shape

(103, 3222)

In [21]:
# Re-attach labels (rows: gene-set titles, columns: the saved column names) and
# write the ratio scores to disk.
df_ratio = pd.DataFrame(ratio, index=list(genesets), columns=patients_names)
df_ratio.to_csv("results/ratios.csv")

## CERNO

### Get ranks

In [22]:
# Ranks produced by metrics.rank_genes (implementation not shown here); both the
# CERNO and SVD cells below take these ranks instead of the raw expressions.
ranks = metrics.rank_genes(gene_expressions)

### Get CERNO

In [23]:
# metrics.CERNO returns two arrays; both are saved below with gene sets as rows.
cerno, auc = metrics.CERNO(genesets, ranks, genes)

In [24]:
# Persist both CERNO outputs, labelled with gene-set titles (rows) and the saved
# column names (columns).
df_cerno = pd.DataFrame(cerno, index=list(genesets), columns=patients_names)
df_cerno.to_csv("results/cerno.csv")

df_auc = pd.DataFrame(auc, index=list(genesets), columns=patients_names)
df_auc.to_csv("results/auc.csv")

## SVD

In [25]:
# Score from metrics.SVD (implementation not shown here), computed from the same
# ranks that were passed to CERNO.
svd = metrics.SVD(genesets, ranks, genes)

In [26]:
# Re-attach labels (rows: gene-set titles, columns: the saved column names) and
# write the SVD scores to disk.
df_svd = pd.DataFrame(svd, index=list(genesets), columns=patients_names)
df_svd.to_csv("results/svd.csv")

## AUCell scores and thresholds (loaded from data/R)

In [27]:
# Externally computed scores and per-gene-set thresholds, read from data/R.
# The plotting loop labels these "AUCell" and uses them as the reference score.
# Note: `AUC` (this table) is distinct from `auc` (returned by metrics.CERNO above).
AUC = pd.read_csv('data/R/AUC.csv', index_col=0)
thr = pd.read_csv('data/R/thr.csv', index_col=0)

In [28]:
# Preview: thresholds are indexed by gene-set title, with the value in column "x".
thr.head()

Unnamed: 0,x
KEGG: Chemokine signaling pathway,0.098921
KEGG: Complement and coagulation cascades,0.049145
KEGG: Platelet activation,0.146114
KEGG: Antigen processing and presentation,0.32721
KEGG: Neutrophil extracellular trap formation,0.138411


In [29]:
# Gene sets that have no row in the external AUC table.
list(set(gs_names)-set(AUC.index.tolist()))

['LI et al. TBA (source: memory B cells)',
 'LI et al. TBA (source: B cells)',
 'LI et al. TBA (source: naive B cells)']

In [30]:
# Spot-check one of the missing titles directly against the AUC index.
'LI et al. TBA (source: B cells)' in AUC.index

False

In [31]:
# The plotting loop locates a gene set's row in AUC via its position in thr.index,
# so both tables must list the same gene sets in the same order. Check that here,
# while AUC still carries its labels; after the conversion the mismatch would go
# unnoticed and the wrong rows would be plotted.
assert AUC.index.equals(thr.index), "AUC and thr rows are not aligned"

# Rebind the name to a bare array (labels are dropped; the cell cannot be re-run
# without reloading the CSV).
AUC = AUC.to_numpy()

# Plots

In [32]:
def pipeline_for_score(gs_name, score1, score2, tsne, true_labels, threshold, binary_score2, name1, name2, save_dir):
    """Model one score vector for a gene set and draw its plots.

    The score is passed through ``m.choose_distribution``, ``m.cluster_gmms`` and
    ``m.get_predictions`` to obtain per-sample labels and their binary version.
    What gets plotted next depends on ``threshold``:

    * ``threshold is None`` - only ``dens.plot_densities`` is drawn for ``score1``.
    * otherwise - ``score1`` is compared against the reference score with
      ``dens.compare_with_categorical``, ``sc.show_significance`` and
      ``sc.show_difference``.

    ``sc.plot_results`` is always called last.

    Parameters
    ----------
    gs_name : gene-set name, forwarded to every plotting helper.
    score1 : score vector to be modelled.
    score2 : reference score vector; only used when ``threshold`` is not None.
    tsne : embedding forwarded to the ``sc`` plotting helpers.
    true_labels : reference labels forwarded to the ``sc`` plotting helpers.
    threshold : cut-off for ``score2``, or None when there is no reference score.
    binary_score2 : thresholded ``score2``; only used when ``threshold`` is not None.
    name1, name2 : display names of ``score1`` and ``score2``.
    save_dir : output directory forwarded to every plotting helper.

    Returns
    -------
    None
    """
    mixture = m.choose_distribution(score1)
    groups = m.cluster_gmms(mixture)
    predicted, binary_predicted = m.get_predictions(mixture, groups, score1)

    if threshold is None:
        # No reference score available: show the fitted densities only.
        dens.plot_densities(score1, mixture, groups, name1, gs_name, save_dir=save_dir)
    else:
        dens.compare_with_categorical(
            score1, mixture, groups, name1, gs_name, score2, threshold, name2,
            save_dir=save_dir,
        )
        # Both scatter comparisons take exactly the same arguments.
        comparison_kwargs = dict(
            true_labels=true_labels,
            score_name1=name1,
            score_name2=name2,
            save_dir=save_dir,
        )
        sc.show_significance(binary_predicted, binary_score2, tsne, gs_name, **comparison_kwargs)
        sc.show_difference(binary_predicted, binary_score2, tsne, gs_name, **comparison_kwargs)

    sc.plot_results(
        binary_predicted,
        tsne,
        geneset_name=gs_name,
        predicted=predicted,
        true_labels=true_labels,
        save_dir=save_dir,
    )

In [None]:
# One entry per score: (score matrix, display name, output directory).
# Row i of every matrix belongs to gs_names[i].
score_sources = [
    (svd, "SVD", "plots/SVD/"),
    (auc, "AUC", "plots/AUC/"),
    (cerno, "CERNO", "plots/CERNO/"),
    (ratio, "Ratio", "plots/ratio/"),
]

for i, gs_name in tqdm(enumerate(gs_names), total=len(gs_names)):
    if gs_name in thr.index:
        # AUC is a bare array at this point, so its row is located through the
        # position of the gene set in thr.index. This assumes both tables share
        # the same row order.
        AUC_index = thr.index.get_loc(gs_name)
        score2 = AUC[AUC_index, :]
        threshold = thr.loc[gs_name, "x"]
        binary_score2 = score2 > threshold
    else:
        # No reference score for this gene set. Reset all three values so that
        # nothing leaks in from the previous iteration; binary_score2 used to be
        # left unassigned here, which raised NameError when the first gene set
        # had no threshold and otherwise passed on a stale mask.
        score2 = None
        threshold = None
        binary_score2 = None
        print(gs_name)

    # Same pipeline for every score type; only the data, label and folder differ.
    for scores, score_name, save_dir in score_sources:
        pipeline_for_score(gs_name, scores[i, :], score2, tsne, true_labels, threshold,
                           binary_score2, score_name, "AUCell", save_dir)
        

 22%|██▏       | 23/103 [09:13<36:33, 27.41s/it]