In [None]:
library(data.table)
library(dplyr)
library(readxl)
library(ggplot2)
library(purrr)
library(stringr)
library(tidyr)
library(limma)
library(granulator)

### The following analysis was ultimately carried out using the LM22 signature matrix

In [None]:
# Read the combined gene-level count table. Its row names are used further
# down as gene IDs when matching against the annotation.
genes.raw.counts <- read.table(
  file = "/grehawi/splice-reg-prj/new-data/RNA-seq/gene_counts_combined_noDex.matrix"
)
head(x = genes.raw.counts)

In [None]:
# Number of rows and columns of the count table.
dim(x = genes.raw.counts)

In [None]:
# Import the GTF annotation (the file name indicates Ensembl GRCh38, release 97).
gencode.annotation <- rtracklayer::import(
  "/grehawi/splice-reg-prj/data/Homo_sapiens.GRCh38.97.gtf"
)

# Build a gene_id / gene_name lookup table: keep only the "gene" records and
# only the two identifier columns, then restrict it to genes that are present
# in the count table.
gencode.annotation.df <- as.data.frame(gencode.annotation)
gene.names.ids <- gencode.annotation.df[
  gencode.annotation.df$type == "gene",
  colnames(gencode.annotation.df) %in% c("gene_id", "gene_name")
]
gene.names.ids <- gene.names.ids[
  gene.names.ids$gene_id %in% rownames(genes.raw.counts),
]

# Read the gene-length table, keep the genes found in the lookup table and
# attach the gene names to them.
gene.lenghts <- read.table("/grehawi/splice-reg-prj/data/gene-lenghts.txt")
head(gene.lenghts)
subsetted.gene.lengths <- gene.lenghts[
  gene.lenghts$gene_id %in% gene.names.ids$gene_id,
] %>%
  left_join(gene.names.ids, by = "gene_id")

# Drop genes whose effective length is 0, and drop the same genes from the
# count table so both objects describe the same gene set.
subsetted.gene.lengths <- subsetted.gene.lengths[
  subsetted.gene.lengths$effective_length != 0,
]
genes.raw.counts <- genes.raw.counts[
  rownames(genes.raw.counts) %in% subsetted.gene.lengths$gene_id,
]

# Reorder the length table so that its rows follow the row order of the
# count table.
idorder <- as.character(rownames(genes.raw.counts))
final.gene.lengths <- subsetted.gene.lengths[
  match(idorder, subsetted.gene.lengths$gene_id),
]
head(final.gene.lengths)
dim(final.gene.lengths)

In [None]:
# Number of distinct gene names in the length table.
sum(!duplicated(final.gene.lengths$gene_name))

In [None]:
# Number of distinct gene IDs in the length table.
sum(!duplicated(final.gene.lengths$gene_id))

In [None]:
# Keep only the first row seen for each gene name, so that gene names can
# later serve as unique row names.
final.gene.lengths <- final.gene.lengths[
  which(!duplicated(final.gene.lengths$gene_name)),
]

In [None]:
# Dimensions after de-duplication.
dim(x = final.gene.lengths)

In [None]:
# Distinct gene IDs remaining after de-duplication.
sum(!duplicated(final.gene.lengths$gene_id))

In [None]:
# Keep only the count rows whose gene ID survived the length filtering and the
# gene-name de-duplication. `drop = FALSE` keeps a data frame (with its row
# names) even when the count table has a single column.
genes.count.matrix.sub <- genes.raw.counts[
  rownames(genes.raw.counts) %in% final.gene.lengths$gene_id,
  ,
  drop = FALSE
]
genes.count.matrix.withNames <- as.matrix(genes.count.matrix.sub)

# Label the rows with gene names. The names are looked up by gene ID rather
# than assigned by position, so a difference in row order or row count between
# the count table and the length table cannot silently mislabel genes.
gene.idx <- match(rownames(genes.count.matrix.sub), final.gene.lengths$gene_id)
stopifnot(!anyNA(gene.idx))
row.names(genes.count.matrix.withNames) <- final.gene.lengths$gene_name[gene.idx]
head(genes.count.matrix.withNames)

In [None]:
dim(genes.count.matrix.withNames)

In [None]:
# Effective lengths, one per row of the count matrix and in the same order
# (again resolved through the gene ID rather than by position).
final.gene.lengths.vec <- final.gene.lengths$effective_length[
  match(rownames(genes.count.matrix.sub), final.gene.lengths$gene_id)
]

### Load reference profile matrix

In [None]:
# Read the LM22 reference profile table (tab-separated, with a header line).
LM22 <- read.table(
  file = "/grehawi/splice-reg-prj/data/LM22.txt",
  header = TRUE,
  sep = '\t'
)
# Use the gene-symbol column as row names, then remove it from the table.
rownames(LM22) <- LM22[["Gene.symbol"]]
LM22[["Gene.symbol"]] <- NULL
head(LM22)
dim(LM22)

In [None]:
# Number of row names shared by the count matrix and LM22.
rownames(genes.count.matrix.withNames) %>%
  intersect(rownames(LM22)) %>%
  length()

The performance of cell type deconvolution strongly depends on the choice and quality of the reference profile, and in particular on the degree of similarity between cell-type-specific expression profiles. It is therefore recommended to test multiple reference profile matrices generated at different cell type resolutions.

A useful metric to evaluate the quality of reference profile matrices is to compute the Condition Number k, which measures how sensitive the deconvolution is to variability in the input data. Generally, a matrix with low condition number (k close to 1) is well-conditioned, as it leads to a stable solution.

In [None]:
## Set the notebook figure size, then plot the similarity matrix of the
## LM22 signature matrix.
options(
  repr.plot.width = 12,
  repr.plot.height = 8
)
plot_similarity(sigMatrix = as.matrix(LM22))

In [None]:
# Deconvolute the get_TPM()-converted counts against LM22. No `methods`
# argument is given, so deconvolute() falls back to its default method set.
decon <- deconvolute(
  m = get_TPM(as.matrix(genes.count.matrix.withNames), final.gene.lengths.vec),
  sigMatrix = as.matrix(LM22)
)

In [None]:
# Per-method inspection of the estimated proportions: preview the first rows
# and count the negative estimates. head(x, 50) shows at most 50 rows, so the
# preview also works when fewer than 50 rows are available (x[1:50, ] would
# pad a data frame with NA rows or fail on a matrix).
head(decon$proportions$svr_sig1, 50)

In [None]:
sum(decon$proportions$svr_sig1 < 0)

In [None]:
sum(decon$proportions$rls_sig1 < 0)

In [None]:
head(decon$proportions$rls_sig1, 50)

In [None]:
sum(decon$proportions$qprogwc_sig1 < 0)

In [None]:
head(decon$proportions$qprogwc_sig1, 50)

In [None]:
sum(decon$proportions$nnls_sig1 < 0)

In [None]:
head(decon$proportions$nnls_sig1, 50)

In [None]:
sum(decon$proportions$ols_sig1 < 0)

In [None]:
sum(decon$proportions$qprog_sig1 < 0)

In [None]:
sum(decon$proportions$dtangle_sig1 < 0)

In [None]:
head(decon$proportions$dtangle_sig1, 50)

In [None]:
# Set the notebook figure size for the proportion plot.
options(
  repr.plot.width = 12,
  repr.plot.height = 9
)
# Plot the estimated cell type proportions stored in `decon`.
plot_deconvolute(
  deconvoluted = decon,
  scale = TRUE,
  labels = FALSE
)

When no ground truth data is available, we can assess the performance of the different deconvolution methods with the correlate() function, which computes the correlation between the cell type proportions estimated by each pair of methods. By default, estimated cell type proportions are scaled to standard scores to correct for differences in absolute estimated cell-type-specific proportions across algorithms.

In [None]:
# Re-run the deconvolution with a chosen subset of methods against LM22.
# Note: this overwrites the `decon` object produced by the earlier run.
methods <- c('dtangle', 'nnls', 'qprogwc')
decon <- deconvolute(
  m = get_TPM(as.matrix(genes.count.matrix.withNames), final.gene.lengths.vec),
  sigMatrix = as.matrix(LM22),
  methods = methods
)

In [None]:
# Correlate the proportions estimated by the selected methods.
correl <- correlate(deconvoluted = decon)
# Set the notebook figure size, then draw the correlation heatmap.
options(
  repr.plot.width = 9,
  repr.plot.height = 6
)
plot_correlate(
  correlated = correl,
  method = "heatmap",
  legend = TRUE
)

We observe that estimated cell type proportions are highly correlated between methods for all cell types, indicating that the deconvolution methods agree on the assignment of cell-type-specific signals.

In [None]:
# Show the complete object returned by correlate().
correl

In [None]:
# Average correlations across methods, by cell type.
correl[["summary"]]

In [None]:
# Ranking of the deconvolution methods.
correl[["rank"]]

In [None]:
# Final deconvolution run, restricted to the dtangle method.
methods <- 'dtangle'
decon.final <- deconvolute(
  m = get_TPM(as.matrix(genes.count.matrix.withNames), final.gene.lengths.vec),
  sigMatrix = as.matrix(LM22),
  methods = methods
)

In [None]:
# Preview the dtangle proportion estimates.
head(x = decon.final$proportions$dtangle_sig1)

In [None]:
# Write the dtangle proportion estimates to disk (write.table defaults).
write.table(
  x = decon.final$proportions$dtangle_sig1,
  file = "/grehawi/splice-reg-prj/new-data/batch_corr/dtangle-cellTypes-on-raw-LM22.txt"
)