analysis/removeUnwantedVariation_bulkRNA.Rmd

---
title: "removeUnwantedVariation_bulkRNA"
author: "Anthony Hung"
date: "2021-01-19"
output: workflowr::wflow_html
editor_options:
  chunk_output_type: console
---

---
title: "Analysis of Technical factors"
author: "Anthony Hung"
date: "2019-12-16"
output: workflowr::wflow_html
editor_options:
  chunk_output_type: console
---

# Examine normalized/filtered data and see if any PCs correlate with technical factors

```{r load data libraries}
library("gplots")
library("ggplot2")
library("reshape")
library("edgeR")
library("RColorBrewer")  
library("scales")
library("cowplot")
library("DT")
library("tidyr")
library("RUVSeq")
library("dplyr")

# Colour resources used by the plots in this document.
# `colors`: 100-step gradient interpolated between the first and the ninth
# shade of the ColorBrewer "Blues" palette (used as the heatmap colour scale).
colors <- colorRampPalette(brewer.pal(9, "Blues")[c(1, 9)])(100)
# `pal`: qualitative palette built by concatenating Set1, Set2 and Set3
# (used to colour samples by annotation).
pal <- c(
  brewer.pal(9, "Set1"),
  brewer.pal(8, "Set2"),
  brewer.pal(12, "Set3")
)

# Input data, all read from RDS files under data/.
# Normalized and filtered expression values.
filt_norm_counts <- readRDS("data/norm_filtered_counts.rds")
# Filtered counts (the `$counts` element is used in the RUV chunk below).
filt_counts <- readRDS("data/filtered_counts.rds")

# Reordered sample information (one row per sample).
sampleinfo <- readRDS("data/Sample.info.RNAseq.reordered.rds")
```

# PCA and hierarchical clustering of normalized and filtered data

```{r clustering norm_filt_data}
# Load the PCA plotting helper script (presumably defines plot_scores(), which
# the PCA chunks call -- confirm in code/PCA_fn.R).
source("code/PCA_fn.R")

# Column labels of the form "<Individual> <treatment>", shared by both heatmaps.
labels <- paste(sampleinfo$Individual, sampleinfo$treatment, sep = " ")

# ---- Spearman correlation between samples ----
cors <- cor(filt_norm_counts, use = "pairwise.complete.obs", method = "spearman")
heatmap.2(
  cors,
  scale = "none",
  col = colors,
  margins = c(12, 12),
  trace = "none",
  denscol = "white",
  labCol = labels,
  # Column side bar is coloured by individual; row side bar by treatment
  # (indexing `pal` with an offset of 3).
  ColSideColors = pal[as.integer(as.factor(sampleinfo$Individual))],
  RowSideColors = pal[as.integer(as.factor(sampleinfo$treatment)) + 3],
  cexCol = 0.2 + 1 / log10(15),
  cexRow = 0.2 + 1 / log10(15)
)

# Extract correlations between technical replicates.
# Each row is a (row, column) index pair into `cors`; the pairs are hard-coded
# and therefore depend on the sample order of the input data.
coords <- matrix(
  c(
     1,  5,
     1, 10,
     2,  9,
     2, 12,
     3,  4,
     3, 16,
     4, 16,
     5, 10,
     6, 11,
     6, 15,
     7, 13,
     8, 14,
     8, 17,
     9, 12,
    11, 15,
    14, 17
  ),
  ncol = 2,
  byrow = TRUE,
  dimnames = list(NULL, c("index1", "index2"))
)
coords
# Indexing a matrix with a two-column matrix picks one element per row of `coords`.
cors_technical <- cors[coords]
median(cors_technical)
min(cors_technical)

# ---- Pearson correlation between samples ----
cors <- cor(filt_norm_counts, use = "pairwise.complete.obs", method = "pearson")
heatmap.2(
  cors,
  scale = "none",
  col = colors,
  margins = c(12, 12),
  trace = "none",
  denscol = "white",
  labCol = labels,
  ColSideColors = pal[as.integer(as.factor(sampleinfo$Individual))],
  RowSideColors = pal[as.integer(as.factor(sampleinfo$treatment)) + 3],
  cexCol = 0.2 + 1 / log10(15),
  cexRow = 0.2 + 1 / log10(15)
)
```

# Perform PCA and color by factors to look for any potential correlations

```{r PCA}
# Perform PCA on the samples (rows after transposing), scaling each gene to
# unit variance. `scale.` is spelled out in full and TRUE is used instead of
# the reassignable shorthand T.
pca_genes <- prcomp(t(filt_norm_counts), scale. = TRUE)
scores <- pca_genes$x

# Proportion of the total variance carried by each PC.
variances <- pca_genes$sdev^2
explained <- variances / sum(variances)
plot(pca_genes, main = "Variance per PC")


# Make PCA plots with the samples colored by Individual

### PCA norm+filt Data
# The colour vector does not depend on the loop index, so build it once.
# Going through as.factor() gives valid 1..k palette indices even when
# `Individual` is stored as character or as arbitrary numeric IDs, and matches
# the indexing used for the heatmap side bars.
col.v <- pal[as.integer(as.factor(sampleinfo$Individual))]
# Plot consecutive PC pairs: PC1 vs PC2, PC2 vs PC3, ..., PC5 vs PC6.
for (n in 1:5) {
  plot_scores(pca_genes, scores, n, n + 1, col.v)
}


```


# Check that technical factors do not correlate with PCs

```{r corr PC}
# ggfortify supplies the autoplot() method for prcomp objects used below.
library(ggfortify)

# Calculate the relationship between each recorded covariate and the top 5 PCs.
p_comps <- 1:5
# Subset sample info to the technical/biological variables of interest.
info <- sampleinfo %>%
  dplyr::select(c(Individual, Sex, Replicate, treatment, RIN, LibraryPrepBatch, LibSize))

pc_names <- colnames(pca_genes$x)[p_comps]

# R^2 of each PC regressed on each covariate (covariates x PCs).
pc_cov_cor <- matrix(
  NA_real_,
  nrow = ncol(info), ncol = length(p_comps),
  dimnames = list(colnames(info), pc_names)
)

# Overall F-test p-value of the same regressions (PCs x covariates).
# Dimnames are derived from the data so they cannot drift out of sync with
# the selected columns.
PC_pvalues <- matrix(
  NA_real_,
  nrow = length(p_comps), ncol = ncol(info),
  dimnames = list(pc_names, colnames(info))
)

for (i in seq_along(p_comps)) {
  pc <- p_comps[i]
  for (covariate in seq_len(ncol(info))) {
    # `[[` extracts the column as a plain vector for data.frames and tibbles alike.
    lm_result <- lm(pca_genes$x[, pc] ~ info[[covariate]])
    fit_summary <- summary(lm_result)
    r2 <- fit_summary$r.squared
    # fstatistic is c(value, numdf, dendf).
    fstat <- fit_summary$fstatistic
    # Upper-tail probability computed directly; `1 - pf(...)` rounds very
    # small p-values to exactly 0.
    p_fstat <- pf(fstat[[1]], fstat[[2]], fstat[[3]], lower.tail = FALSE)
    PC_pvalues[i, covariate] <- p_fstat
    pc_cov_cor[covariate, i] <- r2
  }
}
datatable(pc_cov_cor)

# Make PCA plots with symbols as treatment status and colors as individuals for figure
# PC1 vs PC2, no legend
autoplot(pca_genes, data = sampleinfo, colour = "Individual", shape = "treatment", size = 3) +
  theme_cowplot() +
  theme(legend.position = "none")
# PC1 vs PC2, with legend
autoplot(pca_genes, data = sampleinfo, colour = "Individual", shape = "treatment") +
  theme_cowplot()
# PC3 vs PC4
autoplot(pca_genes, data = sampleinfo, colour = "Individual", shape = "treatment", size = 3, x = 3, y = 4) +
  theme_cowplot() +
  theme(legend.position = "none")
# PC5 vs PC6
autoplot(pca_genes, data = sampleinfo, colour = "Individual", shape = "treatment", size = 3, x = 5, y = 6) +
  theme_cowplot() +
  theme(legend.position = "none")

# BH adjust for multiple testing for the p-values for correlation
# Distribution of p-values adjusted by FDR
fdr_val <- p.adjust(PC_pvalues, method = "fdr")
fdr_val_order <- fdr_val[order(fdr_val)]
# The adjusted p-values are on the x axis; the y axis is the bin count.
hist(fdr_val_order,
     xlab = "BH-adjusted p-values", ylab = "Frequency",
     main = "Distribution of Benjamini and Hochberg adjusted p-values", breaks = 10)
# p.adjust() returns a plain vector; restore the PCs x covariates layout.
fdr_val <- matrix(fdr_val, nrow = nrow(PC_pvalues), ncol = ncol(PC_pvalues))
matrix_fdr_val <- matrix(fdr_val, nrow = nrow(PC_pvalues), ncol = ncol(PC_pvalues),
                         dimnames = dimnames(PC_pvalues))

# Get the coordinates of which variables/PC combinations are significant at FDR 5%
TorF_matrix_fdr <- matrix_fdr_val <= 0.05
coor_to_check <- which(TorF_matrix_fdr, arr.ind = TRUE)
coor_to_check <- as.data.frame(coor_to_check)
matrix_fdr_val
coor_to_check # Individual has most significant correlation with pc1 and 2, and sex correlates with pc2 (probably due to the individual effect)

# Convert to long format to plot in ggplot2
pc_cov_cor_2 <- as.data.frame(pc_cov_cor)
pc_cov_cor_2$variable <- rownames(pc_cov_cor)
pc_cov_cor_2 <- pivot_longer(pc_cov_cor_2, cols = -variable,
                             names_to = "pc", values_to = "cor")
head(pc_cov_cor_2)

# Plot heatmap
d_heatmap <- pc_cov_cor_2
# Fix the row order of the heatmap to the column order of `info`.
d_heatmap$variable <- factor(d_heatmap$variable, levels = colnames(info))
pca_heat <- ggplot(d_heatmap, aes(x = pc, y = variable)) +
  geom_tile(aes(fill = cor), colour = "white") +
  scale_fill_gradient(low = "white", high = "red", limits = c(0, 1)) +
  labs(x = "Principal Component", y = "",
       title = "Correlation between principal components and experimental variables")
pca_heat
```

# Remove unwanted variation using RUVSeq

```{r RUV}
# The RUVSeq vignette loads raw counts and uses the RUVSeq package to filter and normalize data (upper quantile normalization) before performing RUVs.

# Use RUVs (replicates) and adjust k until corr heatmap looks right
# One group label per sample: "<Individual><treatment>". Samples sharing a
# label are treated as replicates of each other by makeGroups().
x <- paste0(sampleinfo$Individual, sampleinfo$treatment)
replicates <- makeGroups(x)
# Per-sample colour for the RLE/PCA plots, one palette entry per group.
# `x` is a character vector, so it cannot be used directly as an index into an
# unnamed colour vector (that yields NA for every sample and invisible
# plotPCA labels); convert it to integer group codes first.
group_col <- pal[as.integer(as.factor(x))]

# Shared annotation for all correlation heatmaps in this chunk.
labels <- paste(sampleinfo$Individual, sampleinfo$treatment, sep = " ")

# Load data into a SeqExpressionSet, attaching the sample information as phenoData
set <- newSeqExpressionSet(as.matrix(filt_counts$counts),
                           phenoData = data.frame(sampleinfo, row.names = colnames(filt_norm_counts)))
set
# Upper-quartile between-lane normalization
set <- betweenLaneNormalization(x = set, which = "upper", round = TRUE)

# Spearman correlation between samples before RUVs
cors <- cor(normCounts(set), method = "spearman", use = "pairwise.complete.obs")
heatmap.2(cors, scale = "none", col = colors, margins = c(12, 12), trace = "none",
          denscol = "white", labCol = labels,
          ColSideColors = pal[as.integer(as.factor(sampleinfo$Individual))],
          RowSideColors = pal[as.integer(as.factor(sampleinfo$treatment)) + 3],
          cexCol = 0.2 + 1 / log10(15), cexRow = 0.2 + 1 / log10(15))

# Pearson correlation between samples before RUVs
cors <- cor(normCounts(set), method = "pearson", use = "pairwise.complete.obs")
heatmap.2(cors, scale = "none", col = colors, margins = c(12, 12), trace = "none",
          denscol = "white", labCol = labels,
          ColSideColors = pal[as.integer(as.factor(sampleinfo$Individual))],
          RowSideColors = pal[as.integer(as.factor(sampleinfo$treatment)) + 3],
          cexCol = 0.2 + 1 / log10(15), cexRow = 0.2 + 1 / log10(15))

# Run RUVs with k = 2 factors of unwanted variation, passing every gene in
# filt_counts as a control gene (cIdx) and the replicate groups as scIdx.
set1 <- RUVs(x = set, cIdx = rownames(filt_counts), k = 2, scIdx = replicates, round = FALSE)
pData(set1)

# Plots after correcting RUV
plotRLE(set1, outline = FALSE, ylim = c(-4, 4), col = group_col)
plotPCA(set1, col = group_col, cex = 1.2)

# Spearman correlation between samples after RUVs
cors <- cor(normCounts(set1), method = "spearman", use = "pairwise.complete.obs")
heatmap.2(cors, scale = "none", col = colors, margins = c(12, 12), trace = "none",
          denscol = "white", labCol = labels,
          ColSideColors = pal[as.integer(as.factor(sampleinfo$Individual))],
          RowSideColors = pal[as.integer(as.factor(sampleinfo$treatment)) + 3],
          cexCol = 0.2 + 1 / log10(15), cexRow = 0.2 + 1 / log10(15))

# Extract correlations between technical replicates.
# Each row is a hard-coded (row, column) index pair into `cors`, so the pairs
# depend on the sample order of the input data.
coords <- rbind(c(1, 5), c(1, 10), c(2, 9), c(2, 12), c(3, 4), c(3, 16), c(4, 16), c(5, 10),
                c(6, 11), c(6, 15), c(7, 13), c(8, 14), c(8, 17), c(9, 12), c(11, 15), c(14, 17))
colnames(coords) <- c("index1", "index2")
coords
cors_technical <- cors[coords]
median(cors_technical)
min(cors_technical)

# Pearson correlation between samples after RUVs
cors <- cor(normCounts(set1), method = "pearson", use = "pairwise.complete.obs")
heatmap.2(cors, scale = "none", col = colors, margins = c(12, 12), trace = "none",
          denscol = "white", labCol = labels,
          ColSideColors = pal[as.integer(as.factor(sampleinfo$Individual))],
          RowSideColors = pal[as.integer(as.factor(sampleinfo$treatment)) + 3],
          cexCol = 0.2 + 1 / log10(15), cexRow = 0.2 + 1 / log10(15))
```

"The RUVg [RUVs] function returns two pieces of information: the estimated factors of unwanted
variation (added as columns to the phenoData slot of set) and the normalized counts
obtained by regressing the original counts on the unwanted factors. The normalized
values are stored in the normalizedCounts slot of set and can be accessed with the
normCounts method. These counts should be used only for exploration. It is important
that subsequent DE analysis be done on the original counts (accessible through the
counts method), as removing the unwanted factors from the counts can also remove
part of a factor of interest [6]."


# Check that technical factors do not correlate with PCs after RUVs

```{r corr PC after RUVs}

# ggfortify supplies the autoplot() method for prcomp objects used below.
library(ggfortify)

# Perform PCA on the RUVs-normalized counts (samples as rows after
# transposing), scaling each gene to unit variance.
pca_genes <- prcomp(t(normCounts(set1)), scale. = TRUE)
scores <- pca_genes$x

# Proportion of the total variance carried by each PC.
variances <- pca_genes$sdev^2
explained <- variances / sum(variances)
plot(pca_genes, main = "Variance per PC")


# Make PCA plots with the samples colored by Individual

### PCA of RUVs-normalized data
# The colour vector does not depend on the loop index, so build it once.
# Going through as.factor() gives valid 1..k palette indices even when
# `Individual` is stored as character or as arbitrary numeric IDs.
col.v <- pal[as.integer(as.factor(sampleinfo$Individual))]
# Plot consecutive PC pairs: PC1 vs PC2, PC2 vs PC3, ..., PC5 vs PC6.
for (n in 1:5) {
  plot_scores(pca_genes, scores, n, n + 1, col.v)
}

# Make PCA plots with symbols as treatment status and colors as individuals for figure
# PC1 vs PC2, no legend
autoplot(pca_genes, data = sampleinfo, colour = "Individual", shape = "treatment", size = 3) +
  theme_cowplot() +
  theme(legend.position = "none")
# PC1 vs PC2, with legend
autoplot(pca_genes, data = sampleinfo, colour = "Individual", shape = "treatment") +
  theme_cowplot()
# PC3 vs PC4
autoplot(pca_genes, data = sampleinfo, colour = "Individual", shape = "treatment", size = 3, x = 3, y = 4) +
  theme_cowplot() +
  theme(legend.position = "none")
# PC5 vs PC6
autoplot(pca_genes, data = sampleinfo, colour = "Individual", shape = "treatment", size = 3, x = 5, y = 6) +
  theme_cowplot() +
  theme(legend.position = "none")

# Calculate the relationship between each recorded covariate and the top 5 PCs.
p_comps <- 1:5
# Subset sample info to the technical/biological variables of interest.
info <- sampleinfo %>%
  dplyr::select(c(Individual, Sex, Replicate, treatment, RIN, LibraryPrepBatch, LibSize))

pc_names <- colnames(pca_genes$x)[p_comps]

# R^2 of each PC regressed on each covariate (covariates x PCs).
pc_cov_cor <- matrix(
  NA_real_,
  nrow = ncol(info), ncol = length(p_comps),
  dimnames = list(colnames(info), pc_names)
)

# Overall F-test p-value of the same regressions (PCs x covariates).
# Dimnames are derived from the data so they cannot drift out of sync with
# the selected columns.
PC_pvalues <- matrix(
  NA_real_,
  nrow = length(p_comps), ncol = ncol(info),
  dimnames = list(pc_names, colnames(info))
)

for (i in seq_along(p_comps)) {
  pc <- p_comps[i]
  for (covariate in seq_len(ncol(info))) {
    # `[[` extracts the column as a plain vector for data.frames and tibbles alike.
    lm_result <- lm(pca_genes$x[, pc] ~ info[[covariate]])
    fit_summary <- summary(lm_result)
    r2 <- fit_summary$r.squared
    # fstatistic is c(value, numdf, dendf).
    fstat <- fit_summary$fstatistic
    # Upper-tail probability computed directly; `1 - pf(...)` rounds very
    # small p-values to exactly 0.
    p_fstat <- pf(fstat[[1]], fstat[[2]], fstat[[3]], lower.tail = FALSE)
    PC_pvalues[i, covariate] <- p_fstat
    pc_cov_cor[covariate, i] <- r2
  }
}
datatable(pc_cov_cor)

# BH adjust for multiple testing for the p-values for correlation
# Distribution of p-values adjusted by FDR
fdr_val <- p.adjust(PC_pvalues, method = "fdr")
fdr_val_order <- fdr_val[order(fdr_val)]
# The adjusted p-values are on the x axis; the y axis is the bin count.
hist(fdr_val_order,
     xlab = "BH-adjusted p-values", ylab = "Frequency",
     main = "Distribution of Benjamini and Hochberg adjusted p-values", breaks = 10)
# p.adjust() returns a plain vector; restore the PCs x covariates layout.
fdr_val <- matrix(fdr_val, nrow = nrow(PC_pvalues), ncol = ncol(PC_pvalues))
matrix_fdr_val <- matrix(fdr_val, nrow = nrow(PC_pvalues), ncol = ncol(PC_pvalues),
                         dimnames = dimnames(PC_pvalues))

# Get the coordinates of which variables/PC combinations are significant at FDR 5%
TorF_matrix_fdr <- matrix_fdr_val <= 0.05
coor_to_check <- which(TorF_matrix_fdr, arr.ind = TRUE)
coor_to_check <- as.data.frame(coor_to_check)
matrix_fdr_val
# NOTE(review): the interpretation previously attached to this line was copied
# verbatim from the pre-RUVs chunk ("Individual has most significant correlation
# with pc1 and 2, and sex correlates with pc2 (probably due to the individual
# effect)"); confirm it still holds for the RUVs-normalized data.
coor_to_check

# Convert to long format to plot in ggplot2
pc_cov_cor_2 <- as.data.frame(pc_cov_cor)
pc_cov_cor_2$variable <- rownames(pc_cov_cor)
pc_cov_cor_2 <- pivot_longer(pc_cov_cor_2, cols = -variable,
                             names_to = "pc", values_to = "cor")
head(pc_cov_cor_2)

# Plot heatmap
d_heatmap <- pc_cov_cor_2
# Fix the row order of the heatmap to the column order of `info`.
d_heatmap$variable <- factor(d_heatmap$variable, levels = colnames(info))
pca_heat <- ggplot(d_heatmap, aes(x = pc, y = variable)) +
  geom_tile(aes(fill = cor), colour = "white") +
  scale_fill_gradient(low = "white", high = "red", limits = c(0, 1)) +
  labs(x = "Principal Component", y = "",
       title = "Correlation between principal components and experimental variables")
pca_heat
```


# Save data and RUVs output for unwanted variation (contains the same cpm data as before as well as pheno data, W values from RUVs)

```{r}
# Persist the RUVs result object (`set1`) for use by downstream analyses.
saveRDS(object = set1, file = "data/RUVsOut.rds")
```