## edgeR-TMM normalization with miRNA counts table

https://www.bioconductor.org/packages/release/bioc/vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf

In [None]:
# Set the global stringsAsFactors option to FALSE for the whole session.
options(list(stringsAsFactors = FALSE))

Load required libraries (install packages if required)

In [None]:
#Load packages
library(limma)
library(edgeR)
library(data.table)
library(RColorBrewer)
library(gplots)

### Creating DGE object for edgeR

Read in counts file `data/Serum_ExoR_Nor_miRNA_counts.csv` and view head of file

In [None]:
# Read the counts table; the first column of the CSV supplies the row names.
# Replace FILE with the path to the counts CSV and VARIABLE with the object
# to preview.
counts <- read.csv("FILE", stringsAsFactors = FALSE, row.names = 1)
head(VARIABLE)

Define groups and design and create `dge` using `DGEList()`.

In [None]:
# Group label per sample, in column order: four ExoR followed by three Nor.
group <- factor(rep(c("ExoR", "Nor"), times = c(4, 3)))
group

# Design matrix without an intercept: one indicator column per group level.
design <- model.matrix(~ 0 + group)
design

# Bundle the counts and the group labels into a DGEList
# (fill in both VARIABLE placeholders).
dge <- DGEList(counts = VARIABLE, group = VARIABLE)

Plot library sizes

In [None]:
# Bar chart of the library size of each sample, one vertical bar per column
# of the count matrix.
# Wide bottom margin leaves room for the rotated (las = 2) sample names.
par(mar = c(10, 5, 5, 5))
# Bias axis labels towards fixed rather than scientific notation.
options(scipen = 5)
barplot(
  dge$samples$lib.size,
  horiz = FALSE,
  names.arg = colnames(dge$counts),
  las = 2,
  cex.names = 0.5,  # spelled out in full; `cex.name` relied on partial matching
  cex.axis = 0.5,
  main = "Library Size"
)

In [None]:
# Sanity checks on the DGEList: show its class, its dimensions, and the
# per-sample table stored in `dge$samples`.
class(dge)
dim(dge)
dge$samples

### Filtering based on cpm cutoff

Keep only miRNAs with more than 12 cpm in at least 2 samples

In [None]:
# Tabulate miRNAs with zero counts in every sample (TRUE) against the rest.
# ncol(dge) replaces a hard-coded sample count so the check follows the data.
table(rowSums(dge$counts == 0) == ncol(dge))

# Keep miRNAs whose cpm exceeds the cutoff in enough samples: replace the
# first NUMBER with the cpm cutoff and the second with the minimum number of
# samples.
keep <- rowSums(cpm(dge) > NUMBER) >= NUMBER

# Subset the rows; keep.lib.sizes = FALSE asks edgeR not to carry over the
# original library sizes.
dge.f <- dge[keep, , keep.lib.sizes = FALSE]
dim(dge.f)

### Estimate dispersion
Estimate the dispersion: the square root of the common dispersion gives the biological coefficient of variation (BCV).

Generate the estimate dispersion `d` with `estimateDisp()` using the filtered dge (`dge.f`).

In [None]:
# Estimate dispersions: replace VARIABLE with the filtered DGEList and
# PARAMETER with the design matrix.
d <- estimateDisp(
  VARIABLE,
  design = PARAMETER
)

Here the common dispersion is found to be 0.92, so the coefficient of biological variation (BCV) is around 0.96. 

In [None]:
# Common dispersion, then its square root (the BCV). The element name is
# written out in full instead of relying on `$` partial matching.
d$common.dispersion
sqrt(d$common.dispersion)

### Normalization using TMM
“TMM (weighted trimmed mean of log expression) determines scaling factor calculated after double trimming values at the two extremes based on log-intensity ratios (M-values) and log-intensity averages (A-values)” (Dillies et al. Briefings in Bioinformatics, Vol. 14 (6): 671–683, 2013)

To normalize using TMM, 
- Calculate the normalization factors (`calcNormFactors()`) using `d`
- Maximize the negative binomial conditional common likelihood to estimate a common dispersion value across all genes (`estimateCommonDisp()`)
- Compute genewise exact tests for differences in the means between two groups of negative-binomially distributed counts (`exactTest()`)

In [None]:
# TMM normalization, common dispersion, then an exact test between the groups.
# Replace VARIABLE with the object to normalize.
TMM <- calcNormFactors(VARIABLE, method = "TMM")
TMM <- estimateCommonDisp(TMM)
TMM <- exactTest(TMM)

# How many miRNAs pass (TRUE) or fail (FALSE) the 5% BH-adjusted cutoff.
table(p.adjust(TMM$table$PValue, method = "BH") < 0.05)

# Collect every miRNA with a BH-adjusted p-value at or below 0.05. The
# p.value cutoff replaces a hard-coded `n = 61`, which silently exported the
# wrong rows whenever the number of significant miRNAs changed (for example
# after altering the filtering thresholds).
TMM.table <- data.frame(
  topTags(TMM, n = Inf, adjust.method = "BH", p.value = 0.05)
)

# Make sure the output directory exists before writing the table.
dir.create("output", showWarnings = FALSE)
write.table(TMM.table, file = "output/edgeR_TMM_p0.05.txt", sep = "\t")

## Plots

### Raw and unfiltered data

First calculate cpm and log cpm using the unfiltered data (`dge`)

In [None]:
# cpm and log cpm of the raw data (fill in the placeholders).
# The first result is stored under the name `cpm`; R still resolves later
# `cpm(...)` calls to the function, because calls skip non-function objects.
cpm <- cpm(VARIABLE)
lcpm <- cpm(
  VARIABLE,
  log = PARAMETER
)

Now calculate the log cpm for the filtered data (`dge.f`)

In [None]:
# Log cpm of the filtered data (fill in the placeholders).
lcpm.f <- cpm(
  VARIABLE,
  log = PARAMETER
)

In [None]:
# Density plots of log-cpm before and after filtering, one curve per sample.

# One colour per sample. The sample count is taken from `dge`, which already
# exists at this point in the notebook; `dge.norm` is only created in a later
# cell, and `dge.f` keeps every column of `dge` (only rows are filtered).
nsamples <- ncol(dge)
col <- brewer.pal(nsamples, "Paired")
samplenames <- c("ExoR1", "ExoR2", "ExoR3", "ExoR4", "Nor1", "Nor2", "Nor3")

# Two panels side by side: unfiltered (A) and filtered (B).
par(mfrow = c(1, 2))

# Panel A: unfiltered data (replace VARIABLE with the unfiltered log-cpm
# matrix). The first curve uses col[1]: writing col(...) would call
# base::col() and yield a matrix of column indices instead of a colour.
plot(density(VARIABLE[, 1]), col = col[1], lwd = 2, ylim = c(0, 2.5), las = 2,
     main = "", xlab = "")
title(main = "A. Raw data", xlab = "Log-cpm")
abline(v = 0, lty = 3)
# Overlay the remaining samples; seq_len()[-1] is empty for a single sample.
for (i in seq_len(nsamples)[-1]) {
  den <- density(lcpm[, i])
  lines(den$x, den$y, col = col[i], lwd = 2)
}
legend("topright", samplenames, text.col = col, bty = "n", cex = 0.8,
       pt.cex = 0.8)

# Panel B: filtered data (replace VARIABLE with the filtered log-cpm matrix).
plot(density(VARIABLE[, 1]), col = col[1], lwd = 2, ylim = c(0, 0.5), las = 2,
     main = "", xlab = "")
title(main = "B. Filtered data", xlab = "Log-cpm")
abline(v = 0, lty = 3)
for (i in seq_len(nsamples)[-1]) {
  den <- density(lcpm.f[, i])
  lines(den$x, den$y, col = col[i], lwd = 2)
}
legend("topright", samplenames, text.col = col, bty = "n", cex = 0.8,
       pt.cex = 0.8)

### Boxplots of TMM-Normalized vs. unnormalized data

In [None]:
# Log cpm computed from `dge`, i.e. before any TMM scaling is applied.
lcpm <- cpm(
  dge,
  log = TRUE
)

In [None]:
# Compute TMM normalization factors on the filtered data and display them.
dge.norm <- calcNormFactors(dge.f, method = "TMM")
dge.norm$samples$norm.factors

# Log cpm from the normalized object (fill in both placeholders).
lcpm.norm <- cpm(
  VARIABLE,
  log = PARAMETER
)

In [None]:
# Side-by-side boxplots of log-cpm per sample: unnormalized (A) versus
# TMM normalized (B). Both panels share the same y-axis range and labels.
par(mfrow = c(1, 2))

# Panel A: replace VARIABLE with the unnormalized log-cpm matrix.
boxplot(
  VARIABLE,
  las = 2,
  col = col,
  main = "",
  ylim = c(2, 20),
  names = c(paste0("ExoR", 1:4), paste0("Nor", 1:3))
)
title(main = "A. Unnormalized data", ylab = "Log-cpm")

# Panel B: replace VARIABLE with the TMM-normalized log-cpm matrix.
boxplot(
  VARIABLE,
  las = 2,
  col = col,
  main = "",
  ylim = c(2, 20),
  names = c(paste0("ExoR", 1:4), paste0("Nor", 1:3))
)
title(main = "B. TMM Normalized data", ylab = "Log-cpm")

### PCA plot
Make an MDS plot (`plotMDS()`, a PCA-like view of sample similarity) using the log cpm normalized data (`lcpm.norm`)

In [None]:
# MDS plot of the samples, labelled and coloured by group.
par(mfrow = c(1, 2))

# Map each group level to a "Set1" colour. brewer.pal() warns when asked for
# fewer than 3 colours and returns 3 anyway, which would add a spurious third
# level here; request at least 3 and keep only as many as there are levels.
col.group <- group
levels(col.group) <- brewer.pal(
  max(3, nlevels(col.group)), "Set1"
)[seq_len(nlevels(col.group))]
col.group <- as.character(col.group)

# Replace VARIABLE with the normalized log-cpm matrix.
plotMDS(VARIABLE, labels = group, col = col.group)

title(main = "Sample groups")

### Heatmap of miRNAs significantly different between groups

<div class="alert alert-block alert-success">
    <p><b>Exercise:</b> Generate the heatmap for significantly different miRNAs</p>
</div>

Hint: First subset the miRNAs in the log cpm TMM-normalized data (`lcpm.norm`) using the miRNAs in the `TMM.table`

In [None]:
# subset miRNAs from lcpm normalized data


In [None]:
# Colour ramp for the heatmap: interpolate the 11-colour "RdYlBu" palette.
mypalette <- brewer.pal(11, "RdYlBu")
morecols <- colorRampPalette(mypalette)

# Column side colours by sample group: first factor level purple, second
# orange (a factor index selects by its integer codes).
col.cell <- c("purple", "orange")[as.integer(group)]

# Row-scaled heatmap with the ramp reversed; fill in VARIABLE, PARAMETER and
# TITLE.
heatmap.2(
  VARIABLE,
  col = rev(morecols(50)),
  trace = "PARAMETER",
  main = "TITLE",
  ColSideColors = col.cell,
  scale = "row",
  margins = c(9, 9),
  cexCol = 0.8
)
