In [90]:
library(Biobase) ; library(reshape2) ; library(Seurat) 
library(SingleCellExperiment) ;  library(scran) ; library(scater)

### Functions 

In [91]:
# METRICS FUNCTIONS
# Coerce `x` to numeric by way of character, so that a factor is converted
# from its labels rather than from its internal integer codes.
num <- function(x) {
    as_text <- as.character(x)
    as.numeric(as_text)
}
# Collapse a matrix-like object (matrix, data.frame, ...) into a plain vector:
# `x` is first converted with as.matrix(), then dimensions are dropped.
flatten <- function(x) {
    as_mat <- as.matrix(x)
    as.vector(as_mat)
}

### Load data from a pair of scRNA-seq datasets, filter genes and cells, integrate datasets and transfer cell-type labels

Here we show, step by step, how the genes were filtered in the in-vitro datasets.

__Explanations of the steps:__

We load the data stored as an `ExpressionSet` object, containing all the cells we consider and all the genes in a raw counts matrix, including the cell-type labels from the original publications. Cell-type labels are stored in `cellType`. We filter genes using a simple threshold on the raw counts: we keep only those with at least 3 counts in at least 3 cells.

In [92]:
# Return the row names (genes) shared by `dataset1` and `dataset2`.
#
# Progress is reported through message(). Note that, despite the wording of
# the second message, this function does not subset either dataset: it only
# returns the character vector of common row names, ordered as in `dataset1`.
gene_intersection <- function(dataset1, dataset2) {
    message('Finding gene intersect between two datasets')
    shared_genes <- intersect(rownames(dataset1), rownames(dataset2))
    n_shared <- length(shared_genes)
    message(n_shared, ' genes in common between two datasets. Subsetting datasets.')
    shared_genes
}
# Select genes whose biological variance component exceeds a cutoff.
#
# Arguments:
#   sc          object exposing raw counts at sc@assays$RNA@counts
#               (a Seurat object in this script).
#   cut.varbio  threshold on the `bio` column returned by modelGeneVar();
#               genes with bio strictly above it are kept (default 0).
#
# Returns the row names of `sc` for the selected genes.
select_variable_genes <- function(sc, cut.varbio = 0.0) {
    # Build a SingleCellExperiment from the dense raw count matrix
    raw_counts <- as.matrix(sc@assays$RNA@counts)
    sce <- SingleCellExperiment(assays = list(counts = raw_counts))

    # Cluster cells first, then compute size factors within those clusters
    cell_clusters <- quickCluster(sce, subset.row = NULL, assay.type = "counts")
    sce <- computeSumFactors(sce, clusters = cell_clusters)

    # Log-normalise with the size factors computed above
    sce <- scater::logNormCounts(sce, size.factors = sizeFactors(sce))

    # Model per-gene variance and keep genes above the `bio` cutoff
    var_fit <- modelGeneVar(sce)
    is_variable <- var_fit$bio > cut.varbio
    rownames(sc)[is_variable]
}

In [113]:
# Load the two unfiltered inputs: the single-cell object (used below as a
# Seurat object) and the bulk object (used below through exprs()).
sc.srt <- readRDS('data/invitro_climb/invitro_seurat_allGenes.RDS')
bulk.es <- readRDS('data/invitro_climb/invitro_bulk_es_allGens.RDS')

In [114]:
# Keep only the first six columns of the bulk object
# (presumably the samples described in true_prop.csv -- confirm).
bulk.es <- bulk.es[, seq_len(6)]

In [115]:
# Filter out lowly expressed genes: keep a gene only if it has at least
# `min_counts` raw counts in at least `min_cells` cells. The previous strict
# `>` comparisons required more than 3 counts in more than 3 cells, which did
# not match the documented "at least 3" thresholds.
min_counts <- 3
min_cells <- 3
sc_mat <- as.matrix(sc.srt@assays$RNA@counts)
# rowSums() on the logical matrix counts, per gene, the cells passing the
# count threshold (vectorised replacement for a row-wise apply()).
sel.genes <- rowSums(sc_mat >= min_counts) >= min_cells
# drop = FALSE keeps a matrix even if a single gene survives the filter
sc_mat <- sc_mat[sel.genes, , drop = FALSE]
dim(sc_mat)

In [116]:
# Keep genes that passed the low-expression filter (rows of `sc_mat`) and are
# also present in the bulk data, then restrict both objects to that gene set.
# Using rownames(sc.srt) here would silently discard the expression filter
# computed above, because `sc_mat` is not used anywhere else.
gene_select <- intersect(rownames(sc_mat), rownames(bulk.es))
sc.srt <- sc.srt[gene_select, ]
bulk.es <- bulk.es[gene_select, ]

In [117]:
# Convert the Seurat object to an ExpressionSet: the raw RNA counts become the
# expression matrix and the cell identities are stored as `cellType`.
sc.es <- ExpressionSet(assayData = as.matrix(sc.srt@assays$RNA@counts))
sc.es$cellType <- Idents(sc.srt)

In [118]:
# All cells come from the same sequencing run. Assign every cell at random to
# one of four simulated subjects so that the MuSiC method can be run.
# NOTE(review): no seed is set, so this assignment differs between runs.
sc.es$SubjectName <- sample(c(1, 2, 3, 4), ncol(sc.es), replace = TRUE)

In [119]:
# Read the table of true proportions; the CSV column `X` supplies row names.
true_prop <- read.csv(
    'data/invitro_climb/true_prop.csv',
    row.names = 'X'
)

In [120]:
# Rebuild the bulk ExpressionSet from its expression matrix, attaching the
# true proportions as sample-level phenotype data.
bulk.es <- ExpressionSet(
    assayData = exprs(bulk.es),
    phenoData = AnnotatedDataFrame(data.frame(true_prop))
)

In [112]:
# Write the processed single-cell and bulk ExpressionSets to disk
saveRDS(sc.es, file = 'data/invitro_climb/invitro_sc_es.RDS')
saveRDS(bulk.es, file = 'data/invitro_climb/invitro_bulk_es.RDS')