### MPRA RNA counts normalization by DESeq2 in R

In [1]:
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(DESeq2))

In [10]:
# Locate the project folders relative to the notebook's working directory:
#   base_dir    - the parent of the current working directory
#   outbase_dir - the "DESeq2" folder that sits next to base_dir
base_dir <- dirname(getwd())
outbase_dir <- file.path(dirname(base_dir), "DESeq2")

In [4]:
# optionally restore the workspace written by the save.image() cell at the end of this notebook
#load(file=file.path(outbase_dir, "030523_mpraDESeq2.RData"))

#### read CRE activity matrix

In [11]:
# Read the compiled RNA count table (tab-separated, with a header row). The
# "annots" column supplies the row names; the remaining columns are treated as
# samples by the cells below.
count.matrix <- read.table(
  file.path(base_dir, "compiled_counts", "030523_rna_compile_avg_count.tsv"),
  sep = "\t",
  header = TRUE, # spelled out: `T` is an ordinary variable and can be reassigned
  row.names = "annots"
)

#### generate metadata table for DESeq2 input

In [20]:
# Derive per-sample labels from the raw column names by fixed position:
# character 4 is the replicate number and characters 5-8 are the genotype
# (substr() simply stops at the end of shorter names).
# NOTE(review): assumes a 3-character prefix and single-digit replicate
# numbers - confirm against the header of the count table.
# The labels are kept as plain character vectors (one entry per column);
# substr() is already vectorised, so wrapping the result in a list only forces
# paste0() and factor() to coerce it back later.
rep_num <- substr(colnames(count.matrix), 4, 4)
genotypes <- substr(colnames(count.matrix), 5, 8)

In [12]:
# Relabel every sample column as <genotype><replicate>.
count.matrix <- setNames(count.matrix, paste0(genotypes, rep_num))

In [23]:
# Build the sample metadata table used as colData for DESeq2: one row per
# column of count.matrix, keyed by sample name.
# Drop any stale copy first; the guard keeps a fresh session from emitting an
# "object not found" warning.
if (exists("metaTable", inherits = FALSE)) {
  rm(metaTable)
}
samples <- colnames(count.matrix)
mut.all <- c("ehet", "ehom", "khet", "khom", "rhom")
# "wt" is listed first, which makes it the first (reference) level.
genotype <- factor(genotypes, levels = c("wt", mut.all))
replicate <- factor(rep_num, levels = c("1", "2", "3", "4"))
# factor() silently turns labels outside the declared levels into NA; stop
# here instead of handing incomplete metadata to the next step.
stopifnot(!anyNA(genotype), !anyNA(replicate))
metaTable <- data.frame(samples, genotype, replicate, row.names = "samples")
# take a look at the metaTable
print(metaTable)

      genotype replicate
wt1         wt         1
wt2         wt         2
wt3         wt         3
wt4         wt         4
ehet1     ehet         1
ehet2     ehet         2
ehet3     ehet         3
ehom1     ehom         1
ehom2     ehom         2
ehom3     ehom         3
ehom4     ehom         4
khet1     khet         1
khet2     khet         2
khet3     khet         3
khet4     khet         4
khom1     khom         1
khom2     khom         2
khom3     khom         3
khom4     khom         4
rhom1     rhom         1
rhom2     rhom         2
rhom3     rhom         3


In [24]:
# Build the DESeqDataSet from the count table and the sample metadata, with
# genotype as the only term in the design formula.
deObj <- DESeqDataSetFromMatrix(
  countData = count.matrix,
  colData = metaTable,
  design = ~genotype
)

In [25]:
# Run the DESeq2 pipeline on the dataset; the log printed below lists its
# stages (size factors, dispersions, model fitting and testing).
deObj <- DESeq(object = deObj)

estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

-- note: fitType='parametric', but the dispersion trend was not well captured by the
   function: y = a/x + b, and a local regression fit was automatically substituted.
   specify fitType='local' or 'mean' to avoid this message next time.

final dispersion estimates

fitting model and testing



In [26]:
# Retrieve normalized counts as fragments per million; robust = TRUE scales by
# the DESeq2 size factors rather than by the raw column sums.
normalized_count <- fpm(object = deObj, robust = TRUE)


In [None]:
# Save the scaled count matrix as a tab-separated file, keeping row and column
# names and leaving strings unquoted.
write.table(
  normalized_count,
  file = file.path(outbase_dir, "030523_rna_scaled_counts.tsv"),
  sep = "\t",
  quote = FALSE,
  row.names = TRUE,
  col.names = TRUE
)

In [30]:
# Save the whole workspace so the session can be restored later with load().
save.image(file.path(outbase_dir, "030523_mpraDESeq2.RData"))

In [31]:
# Record the R version and the attached package versions used for this run.
sessionInfo()

R version 4.2.3 (2023-03-15)
Platform: x86_64-apple-darwin13.4.0 (64-bit)
Running under: macOS Mojave 10.14.6

Matrix products: default
BLAS/LAPACK: /Users/yiqiao/opt/anaconda3/envs/mpra-r/lib/libopenblasp-r0.3.21.dylib

locale:
[1] C/UTF-8/C/C/C/C

attached base packages:
[1] stats4    stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] DESeq2_1.38.3               SummarizedExperiment_1.28.0
 [3] Biobase_2.58.0              MatrixGenerics_1.10.0      
 [5] matrixStats_0.63.0          GenomicRanges_1.50.2       
 [7] GenomeInfoDb_1.34.9         IRanges_2.32.0             
 [9] S4Vectors_0.36.2            BiocGenerics_0.44.0        
[11] lubridate_1.9.2             forcats_1.0.0              
[13] stringr_1.5.0               dplyr_1.1.1                
[15] purrr_1.0.1                 readr_2.1.4                
[17] tidyr_1.3.0                 tibble_3.2.1               
[19] ggplot2_3.4.2               tidyverse_2.0.0            

l