Filter the raw data from Law et al. to get a matrix of counts for the different cell types.

Data available from GEO under accession GSE124904: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE124904 (Law et al. 2019)


In [3]:
library(Matrix)
library("data.table")
library(edgeR)
library(SingleCellExperiment)
library(scran)
library(scuttle)

In [4]:
# Load the aggregated gene-by-cell count table (space-separated text).
# Rows are indexed by gene symbol and columns by cell name further down,
# e.g. cell_matrix["Nfix", ] and colnames(cell_matrix).
cell_matrix <- read.delim(
  "data/law/GSE124904_aggregate_gene_cell_matrix.txt",
  sep = " "
)

In [24]:
# Quick sanity check: show the counts of one gene (Nfix) across cells.
head(cell_matrix["Nfix", ])

# Build the per-cell annotation table. Column names look like
# "E16.1_AAACCTGAGAAACCAT"; the text before the first "." is kept as the
# sample label (here "E16").
# vapply() is used instead of sapply() so the result is guaranteed to be a
# character vector with exactly one label per cell.
sample_labels <- vapply(
  strsplit(colnames(cell_matrix), "[.]"),
  `[`,
  character(1),
  1
)

# One row per cell (row names = cell names), single column "Sample".
metadata <- data.frame(
  Sample = sample_labels,
  row.names = colnames(cell_matrix)
)
head(metadata)

Unnamed: 0_level_0,E16.1_AAACCTGAGAAACCAT,E16.1_AAACCTGGTAAGAGAG,E16.1_AAACCTGTCGCCAAAT,E16.1_AAAGATGAGAGACGAA,E16.1_AAAGATGCAAACCCAT,E16.1_AAAGATGCACAACTGT,E16.1_AAAGCAAGTCGCGAAA,E16.1_AAAGTAGAGTTCGCGC,E16.1_AAAGTAGCAGTGACAG,E16.1_AAAGTAGCATTGTGCA,⋯,P0.2_CATGCCTTCCGGGTGT,P0.2_CGTGTAAGTATTCTCT,P0.2_TGAGCCGAGATCTGCT,P0.3_TACTTGTCAAGACACG,P0.3_TCAGCAATCTATCCCG,P0.3_TGTCCCATCATGGTCA,P0.3_TTAGGCAGTTGTTTGG,P3.1_GGAATAATCGCTGATA,P3.1_GTGCAGCAGTCAAGGC,P3.1_TTCTCAAGTCACCCAG
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
Nfix,0,0,0,1,0,0,0,0,0,0,⋯,0,0,0,0,0,1,0,0,0,0


Unnamed: 0_level_0,Sample
Unnamed: 0_level_1,<chr>
E16.1_AAACCTGAGAAACCAT,E16
E16.1_AAACCTGGTAAGAGAG,E16
E16.1_AAACCTGTCGCCAAAT,E16
E16.1_AAAGATGAGAGACGAA,E16
E16.1_AAAGATGCAAACCCAT,E16
E16.1_AAAGATGCACAACTGT,E16


In [27]:
# Restrict the count table to the genes listed in prot_coding.csv
# (presumably protein-coding gene symbols, going by the file name).
# This filters rows (genes), not cells: a row is kept when its row name
# appears in column `x` of the gene list.
prot_coding_genes <- read.csv(
  file = "data/stevant/prot_coding.csv",
  row.names = 1
)
cell_matrix <- cell_matrix[
  rownames(cell_matrix) %in% as.character(prot_coding_genes$x),
  ,
  drop = FALSE  # stay a data.frame even if only one column were present
]

In [28]:
# Per-gene annotation table used as rowData below. The gene symbol serves
# as ID, as Symbol and as the row name.
genes <- data.frame(
  ID = row.names(cell_matrix),
  Symbol = row.names(cell_matrix),
  row.names = row.names(cell_matrix)
)
head(genes)

Unnamed: 0_level_0,ID,Symbol
Unnamed: 0_level_1,<chr>,<chr>
Xkr4,Xkr4,Xkr4
Rp1,Rp1,Rp1
Sox17,Sox17,Sox17
Mrpl15,Mrpl15,Mrpl15
Lypla1,Lypla1,Lypla1
Tcea1,Tcea1,Tcea1


In [30]:
# Assemble the SingleCellExperiment: counts as the only assay, per-cell
# sample labels as colData and per-gene annotation as rowData.
# cell_matrix is a data.frame (it comes from read.delim); it is converted
# to a sparse numeric matrix so the assay is a real matrix-like object and
# the mostly-zero count table is stored compactly. Row and column names
# are carried over by as.matrix().
sce <- SingleCellExperiment(
  assays = list(counts = Matrix(as.matrix(cell_matrix), sparse = TRUE)),
  colData = metadata,
  rowData = genes
)
sce

class: SingleCellExperiment 
dim: 21557 10140 
metadata(0):
assays(1): counts
rownames(21557): Xkr4 Rp1 ... mt-Nd6 mt-Cytb
rowData names(2): ID Symbol
colnames(10140): E16.1_AAACCTGAGAAACCAT E16.1_AAACCTGGTAAGAGAG ...
  P3.1_GTGCAGCAGTCAAGGC P3.1_TTCTCAAGTCACCCAG
colData names(1): Sample
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):

In [31]:
# Per-cell QC metrics from scuttle. NOTE(review): qcstats is not used again
# in the visible part of this notebook; kept for interactive inspection.
qcstats <- scuttle::perCellQCMetrics(sce)


In [32]:

# Compute size factors with scran and store them in the object (they show
# up as the `sizeFactor` column of colData), then print their distribution.
sce <- scran::computeSumFactors(sce)
summary(sizeFactors(sce))

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.1351  0.7316  0.9212  1.0000  1.2054  3.7176 

In [33]:
# Add the `logcounts` assay (log-normalized counts) to the object.
sce <- scuttle::logNormCounts(sce)

In [34]:
# Number of cells per sample label.
table(colData(sce)$Sample)


 E16   P0   P3   P6 
3845 2197 2427 1671 

In [35]:
# Keep only the cells whose sample label is "E16" (all genes retained).
GermcellsE16 <- sce[, colData(sce)$Sample == "E16"]
GermcellsE16

class: SingleCellExperiment 
dim: 21557 3845 
metadata(0):
assays(2): counts logcounts
rownames(21557): Xkr4 Rp1 ... mt-Nd6 mt-Cytb
rowData names(2): ID Symbol
colnames(3845): E16.1_AAACCTGAGAAACCAT E16.1_AAACCTGGTAAGAGAG ...
  E16.3_TGGTTCCGTGCAGGTA E16.1_CCCAGTTAGCTAACAA
colData names(2): Sample sizeFactor
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):

Save the data in files

In [36]:
# Write the full object and the E16-only subset to .Rdata files. Each file
# restores its object under the original variable name when load()ed.
save(list = "sce", file = "data/stevant/GermcellsLaw.Rdata")
save(list = "GermcellsE16", file = "data/stevant/GermcellsE16Law.Rdata")