Filter the raw data from Stévant et al. to obtain count matrices for the different cell types.

Data available at https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE97519 (Stévant et al. 2019)


In [2]:
library(Matrix)
library("data.table")
library(edgeR)
library(SingleCellExperiment)
library(scran)
library(scuttle)

In [70]:
# One cell passed the QC but is actually an artefact and was removed from the
# analysis; it is dropped from both the RPKM and the read-count matrices below.
artefact_cell <- "E16.5_XY_20150202_C94_150331_8"

# Load male RPKM data (the .Robj file is expected to define `male_rpkm`)
load(file = "data/stevant/male_rpkm.Robj")
# Keep only the genes listed in column `x` of the protein-coding gene table
prot_coding_genes <- read.csv(file = "data/stevant/prot_coding.csv", row.names = 1)
males <- male_rpkm[rownames(male_rpkm) %in% as.vector(prot_coding_genes$x), ]
males <- males[, !colnames(males) %in% artefact_cell]

# Load male read count data (the .Robj file is expected to define `male_count`)
load(file = "data/stevant/male_count.Robj")
male_count <- male_count[rownames(male_count) %in% rownames(males), ]
# Cell names are copied positionally from the RPKM matrix, so both matrices
# must have the same number of columns (same column order is assumed).
stopifnot(ncol(male_count) == ncol(male_rpkm))
colnames(male_count) <- colnames(male_rpkm)
male_count <- male_count[, !colnames(male_count) %in% artefact_cell]

# Load male clustering results
male_clustering <- read.csv(file = "data/stevant/male_clustering.csv", row.names = 1)$x
#male_clustering <- paste("XY", male_clustering, sep="_")
# `names<-` silently pads with NA when fewer names than elements are supplied,
# so check that there is exactly one cluster label per retained cell.
stopifnot(length(male_clustering) == ncol(males))
names(male_clustering) <- colnames(males)

# Get cell embryonic stages: the first "_"-separated field of each cell name.
# vapply() guarantees a character vector with one stage per cell.
male_stages <- vapply(strsplit(colnames(males), "_"), `[`, character(1), 1)
names(male_stages) <- colnames(males)
male_stages <- as.data.frame(male_stages)
head(male_stages)

# Gene annotation table: ID and Symbol are both the gene row names of `males`
genes <- data.frame(ID = row.names(males), Symbol = row.names(males))
row.names(genes) <- genes$Symbol
head(genes)

Unnamed: 0_level_0,male_stages
Unnamed: 0_level_1,<chr>
E10.5_XY_20140428_C01_140729_1,E10.5
E10.5_XY_20140428_C02_140729_1,E10.5
E10.5_XY_20140428_C03_140729_1,E10.5
E10.5_XY_20140428_C04_140729_2,E10.5
E10.5_XY_20140428_C05_140729_2,E10.5
E10.5_XY_20140428_C07_140729_3,E10.5


Unnamed: 0_level_0,ID,Symbol
Unnamed: 0_level_1,<chr>,<chr>
eGFP,eGFP,eGFP
Gnai3,Gnai3,Gnai3
Pbsn,Pbsn,Pbsn
Cdc45,Cdc45,Cdc45
Scml2,Scml2,Scml2
Apoh,Apoh,Apoh


In [71]:
#################
# Translate the cluster IDs (C1..C6) into cell-type names.
male_clusteringFrame <- as.data.frame(male_clustering)
names(male_clusteringFrame) <- "cluster"

# Parallel vectors: cluster[k] is renamed to cellCluster[k]
cluster <- c("C1", "C2", "C3", "C4", "C5", "C6")
cellCluster <- c("Endothelial", "Early Progenitor", "Int. Progenitor",
                 "Pre-Sertoli", "Fetal Leydig", "Sertoli")
stopifnot(length(cluster) == length(cellCluster))

# Vectorised lookup instead of a per-cluster loop. Work on a character copy so
# the replacement also works if the column was read as a factor, and leave any
# label that is not one of C1..C6 (or is NA) unchanged.
cluster_label <- as.character(male_clusteringFrame$cluster)
lookup_idx <- match(cluster_label, cluster)
is_known <- !is.na(lookup_idx)
cluster_label[is_known] <- cellCluster[lookup_idx[is_known]]

male_clusteringFrame$cluster <- as.factor(cluster_label)
table(male_clusteringFrame)

head(male_clusteringFrame)

#sertoliCells = male_count[ , male_clusteringFrame["cluster",]=="Sertoli"]
#fetalLeydig = male_count[ , male_clusteringFrame["cluster",]=="Fetal Leydig"]



male_clusteringFrame
Early Progenitor      Endothelial     Fetal Leydig  Int. Progenitor 
             183                3                7              106 
     Pre-Sertoli          Sertoli 
              31               70 

Unnamed: 0_level_0,cluster
Unnamed: 0_level_1,<fct>
E10.5_XY_20140428_C01_140729_1,Early Progenitor
E10.5_XY_20140428_C02_140729_1,Early Progenitor
E10.5_XY_20140428_C03_140729_1,Early Progenitor
E10.5_XY_20140428_C04_140729_2,Early Progenitor
E10.5_XY_20140428_C05_140729_2,Early Progenitor
E10.5_XY_20140428_C07_140729_3,Early Progenitor


In the raw counts matrix, rows are genes and columns are cells.

In [72]:
# Combine cluster labels and embryonic stages into one per-cell table, joined
# on the row names (cell IDs).
metadata <- merge(male_clusteringFrame, male_stages, by = 0, all = TRUE)

# merge() sorts its result by the join key and returns automatic row names, so
# the rows are no longer guaranteed to follow the column order of `male_count`.
# Reorder them explicitly and label them with the cell IDs so that colData is
# attached to the right cells by name rather than by position.
metadata <- metadata[match(colnames(male_count), metadata$Row.names), , drop = FALSE]
stopifnot(
  !anyNA(metadata$Row.names),
  identical(as.character(metadata$Row.names), colnames(male_count))
)
rownames(metadata) <- as.character(metadata$Row.names)

# Likewise align the gene annotation to the rows of the count matrix by name,
# so rowData cannot be attached to the wrong genes if the two are ordered
# differently.
sce <- SingleCellExperiment(
  list(counts = male_count),
  colData = metadata,
  rowData = genes[match(rownames(male_count), rownames(genes)), , drop = FALSE]
)
sce

class: SingleCellExperiment 
dim: 21083 400 
metadata(0):
assays(1): counts
rownames(21083): eGFP Gnai3 ... TTC14 Rhbg
rowData names(2): ID Symbol
colnames(400): E10.5_XY_20140428_C01_140729_1
  E10.5_XY_20140428_C02_140729_1 ... E16.5_XY_20150223_C88_150331_6
  E16.5_XY_20150223_C94_150331_7
colData names(3): Row.names cluster male_stages
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):

In [73]:
# Compute per-cell QC metrics for `sce` and keep them in `qcstats`.
# NOTE(review): `qcstats` is not used by any later cell shown in this notebook.
qcstats <- perCellQCMetrics(sce)


In [74]:

# Compute size factors and store them on `sce` (a `sizeFactor` column shows up
# in colData in the later printed output), then summarise their distribution.
sce <- computeSumFactors(sce)
summary(sizeFactors(sce))

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
0.09937 0.53197 0.77512 1.00000 1.54305 2.82170 

In [75]:
# Add log-normalised expression values to `sce` (a `logcounts` assay appears
# next to `counts` in the later printed output).
sce <- logNormCounts(sce)

In [76]:
# Number of cells per cell type stored in the experiment's `cluster` column
table(sce$cluster)


Early Progenitor      Endothelial     Fetal Leydig  Int. Progenitor 
             183                3                7              106 
     Pre-Sertoli          Sertoli 
              31               70 

In [77]:
# Keep only the cells whose cluster label is "Sertoli", then display the subset
Sertoli <- sce[, colData(sce)$cluster == "Sertoli"]
Sertoli

class: SingleCellExperiment 
dim: 21083 70 
metadata(0):
assays(2): counts logcounts
rownames(21083): eGFP Gnai3 ... TTC14 Rhbg
rowData names(2): ID Symbol
colnames(70): E12.5_XY_20140526_C70_150331_4
  E12.5_XY_20140526_C80_150331_8 ... E16.5_XY_20150223_C75_150331_7
  E16.5_XY_20150223_C88_150331_6
colData names(4): Row.names cluster male_stages sizeFactor
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):

Save the data in files

In [78]:
# Write the Sertoli subset to disk; load() on this file restores an object
# named `Sertoli`.
save(Sertoli, file="data/stevant/SertoliStevant.Rdata")

In [79]:
# Keep only the cells whose cluster label is "Fetal Leydig", display the
# subset, and write it to disk as an object named `Leydig`
Leydig <- sce[, colData(sce)$cluster == "Fetal Leydig"]
Leydig
save(list = "Leydig", file = "data/stevant/LeydigStevant.Rdata")

class: SingleCellExperiment 
dim: 21083 7 
metadata(0):
assays(2): counts logcounts
rownames(21083): eGFP Gnai3 ... TTC14 Rhbg
rowData names(2): ID Symbol
colnames(7): E12.5_XY_20140526_C89_150331_5
  E13.5_XY_20130918_C28_140729_2 ... E16.5_XY_20150202_C57_150331_3
  E16.5_XY_20150223_C30_150331_1
colData names(4): Row.names cluster male_stages sizeFactor
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):