-
Notifications
You must be signed in to change notification settings - Fork 0
/
removeUnwantedVariation_bulkRNA.Rmd
272 lines (218 loc) · 11.4 KB
/
removeUnwantedVariation_bulkRNA.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
---
title: "Modeling and removing unwanted technical variation"
author: "Anthony Hung"
date: "2021-01-19"
output: workflowr::wflow_html
editor_options:
chunk_output_type: console
---
# Introduction
This code takes in the normalized and filtered count data. It first performs a PCA on the data. It then looks for correlations between the first 5 PCs and several experimental variables before and after fitting two factors of unwanted variation in the data using RUVs (taking advantage of the technical replicates in our study design). We then save the RUVs output for use in later files.
# Examine normalized/filtered data before adjusting with RUVs
Load data and libraries
```{r load data libraries}
library("gplots")
library("ggplot2")
library("reshape")
library("edgeR")
library("RColorBrewer")
library("scales")
library("cowplot")
library("DT")
library("tidyr")
library("RUVSeq")
library("dplyr")
# Load colors
# 100-step blue gradient (for heatmaps) and a large qualitative palette
# (Set1+Set2+Set3) used to color samples by individual in PCA plots.
colors <- colorRampPalette(c(brewer.pal(9, "Blues")[1],brewer.pal(9, "Blues")[9]))(100)
pal <- c(brewer.pal(9, "Set1"), brewer.pal(8, "Set2"), brewer.pal(12, "Set3"))
#load in normalized/filtered data
filt_norm_counts <- readRDS("data/norm_filtered_counts.rds")
#load in filtered counts (raw counts; presumably an edgeR DGEList given the
# $counts access later — TODO confirm against the file that produced it)
filt_counts <- readRDS("data/filtered_counts.rds")
# load in reordered sample information (one row per sample, ordered to match
# the count-matrix columns)
sampleinfo <- readRDS("data/Sample.info.RNAseq.reordered.rds")
#Load PCA plotting Function (provides plot_scores(), used below)
source("code/PCA_fn.R")
```
# PCA of normalized and filtered data
Here, we plot the samples represented by their sample IDs (more informative) and by symbols (for a more aesthetic presentation in figures).
```{r clustering norm_filt_data}
library(ggfortify) # provides autoplot() for prcomp objects

# PCA on samples (genes as variables); each gene is centered and scaled to
# unit variance. Use TRUE, not the reassignable shorthand T.
pca_genes <- prcomp(t(filt_norm_counts), scale = TRUE)
scores <- pca_genes$x
variances <- pca_genes$sdev^2
explained <- variances / sum(variances) # proportion of variance per PC
plot(pca_genes, main = "Variance per PC")

# PCA scatterplots of consecutive PC pairs (PC n vs PC n+1), colored by
# Individual. The color vector is loop-invariant, so compute it once.
col.v <- pal[as.integer(sampleinfo$Individual)]
for (n in 1:5) {
  plot_scores(pca_genes, scores, n, n + 1, col.v)
}

# Figure-quality PCA plots: color = individual, shape = treatment status
autoplot(pca_genes, data = sampleinfo, colour = "Individual", shape = "treatment", size = 3) +
  theme_cowplot() +
  theme(legend.position = "none")
autoplot(pca_genes, data = sampleinfo, colour = "Individual", shape = "treatment") +
  theme_cowplot()
autoplot(pca_genes, data = sampleinfo, colour = "Individual", shape = "treatment", size = 3, x = 3, y = 4) +
  theme_cowplot() +
  theme(legend.position = "none")
autoplot(pca_genes, data = sampleinfo, colour = "Individual", shape = "treatment", size = 3, x = 5, y = 6) +
  theme_cowplot() +
  theme(legend.position = "none")
```
# Examine correlations between top PCs and experimental variables
After performing our PCA, we calculate the correlation between the top 5 PCs and several recorded experimental variables, looking for significant correlations. We present them in a table as well as in a heatmap. We also find significant correlations after correction for multiple testing using a BH procedure.
```{r corr PC}
# Test the association between each recorded covariate and each of the top 5
# PCs with a one-way linear model; record R^2 and the overall F-test p-value.
p_comps <- 1:5
info <- sampleinfo %>%
  dplyr::select(c(Individual, Sex, Replicate, treatment, RIN, LibraryPrepBatch, LibSize)) #subset sample info for technical/biological variables

# R^2 matrix: covariates (rows) x PCs (columns)
pc_cov_cor <- matrix(nrow = ncol(info), ncol = length(p_comps),
                     dimnames = list(colnames(info), colnames(pca_genes$x)[p_comps]))
# F-test p-value matrix: PCs (rows) x covariates (columns). Labels are taken
# from info so they stay consistent with pc_cov_cor (e.g. "treatment").
PC_pvalues <- matrix(NA, nrow = length(p_comps), ncol = ncol(info),
                     dimnames = list(paste0("PC", p_comps), colnames(info)))

for (pc in p_comps) {
  for (covariate in seq_len(ncol(info))) {
    lm_result <- lm(pca_genes$x[, pc] ~ info[, covariate])
    lm_summary <- summary(lm_result)
    fstat <- lm_summary$fstatistic
    # Upper-tail F probability; lower.tail = FALSE avoids the round-off of 1 - pf()
    PC_pvalues[pc, covariate] <- pf(fstat[1], fstat[2], fstat[3], lower.tail = FALSE)
    pc_cov_cor[covariate, pc] <- lm_summary$r.squared
  }
}
datatable(pc_cov_cor)

# BH (FDR) adjustment across all PC-by-covariate tests; n defaults to the
# number of p-values, so it need not be passed explicitly.
fdr_val <- p.adjust(PC_pvalues, method = "fdr")
hist(sort(fdr_val), ylab = "BH-adjusted p-values",
     main = "Distribution of Benjamini and Hochberg adjusted p-values", breaks = 10)
# Reshape the adjusted p-values back into the PCs x covariates layout
# (p.adjust flattened the matrix column-major; matrix() refills the same way).
matrix_fdr_val <- matrix(fdr_val, nrow = length(p_comps), ncol = ncol(info),
                         dimnames = dimnames(PC_pvalues))

#Get the coordinates of which variables/PC combinations are significant at FDR 5%
TorF_matrix_fdr <- matrix_fdr_val <= 0.05
coor_to_check <- as.data.frame(which(matrix_fdr_val <= 0.05, arr.ind = TRUE))
matrix_fdr_val
coor_to_check # Individual has most significant correlation with pc1 and 2, and sex correlates with pc2 (probably due to the individual effect)

# Convert to long format to plot in ggplot2 (pivot_longer supersedes gather)
pc_cov_cor_2 <- as.data.frame(pc_cov_cor)
pc_cov_cor_2$variable <- rownames(pc_cov_cor)
pc_cov_cor_2 <- pivot_longer(pc_cov_cor_2, cols = -variable,
                             names_to = "pc", values_to = "cor")
head(pc_cov_cor_2)

# Heatmap of R^2 between PCs and experimental variables; factor() fixes the
# row order (labels defaulted to levels, so the duplicate arg is dropped).
d_heatmap <- pc_cov_cor_2
d_heatmap$variable <- factor(d_heatmap$variable,
                             levels = c("Individual", "Sex", "Replicate",
                                        "treatment", "RIN", "LibraryPrepBatch", "LibSize"))
pca_heat <- ggplot(d_heatmap, aes(x = pc, y = variable)) +
  geom_tile(aes(fill = cor), colour = "white") +
  scale_fill_gradient(low = "white", high = "red", limits = c(0, 1)) +
  labs(x = "Principal Component", y = "",
       title = "Correlation between principal components and experimental variables")
pca_heat
```
# Remove unwanted variation using RUVSeq
After examining the correlation structure between major axes of variation and experimental variables in the unadjusted data, we are interested in modeling two factors of unwanted variation in the data. Here, we take advantage of the technical replicate structure of our study design and employ RUVs to fit these two unwanted sources of variation.
```{r RUV}
# RUVs exploits the technical-replicate structure: samples sharing the same
# individual + treatment combination are declared replicates of one another.
replicates <- makeGroups(paste0(sampleinfo$Individual, sampleinfo$treatment))

# Load the raw filtered counts into a SeqExpressionSet together with the
# sample metadata; row names are aligned to the count-matrix columns.
set <- newSeqExpressionSet(as.matrix(filt_counts$counts),
                           phenoData = data.frame(sampleinfo, row.names = colnames(filt_norm_counts)))
set

# Upper-quartile between-lane normalization (rounded to integer counts)
set <- betweenLaneNormalization(x = set, which = "upper", round = TRUE)

# Fit k = 2 factors of unwanted variation, using all genes as the negative-
# control set (cIdx) and the replicate groups defined above (scIdx).
set1 <- RUVs(x = set, cIdx = rownames(filt_counts), k = 2, scIdx = replicates, round = FALSE)
pData(set1)
```
# Examine correlations between top PCs and experimental variables after RUVs
As above, we examine the correlations between top 5 PCs and experimental variables, but with the data adjusted for the two RUVs factors of unwanted variation.
```{r corr PC after RUVs}
library(ggfortify) # provides autoplot() for prcomp objects

# PCA on the RUVs-adjusted counts. Use the normCounts() accessor instead of
# reaching into the @assayData slot directly.
pca_genes <- prcomp(t(normCounts(set1)), scale = TRUE)
scores <- pca_genes$x
variances <- pca_genes$sdev^2
explained <- variances / sum(variances) # proportion of variance per PC
plot(pca_genes, main = "Variance per PC")

# PCA scatterplots of consecutive PC pairs, colored by Individual; the color
# vector is loop-invariant, so compute it once.
col.v <- pal[as.integer(sampleinfo$Individual)]
for (n in 1:5) {
  plot_scores(pca_genes, scores, n, n + 1, col.v)
}

# Figure-quality PCA plots: color = individual, shape = treatment status
autoplot(pca_genes, data = sampleinfo, colour = "Individual", shape = "treatment", size = 3) +
  theme_cowplot() +
  theme(legend.position = "none")
autoplot(pca_genes, data = sampleinfo, colour = "Individual", shape = "treatment") +
  theme_cowplot()
autoplot(pca_genes, data = sampleinfo, colour = "Individual", shape = "treatment", size = 3, x = 3, y = 4) +
  theme_cowplot() +
  theme(legend.position = "none")
autoplot(pca_genes, data = sampleinfo, colour = "Individual", shape = "treatment", size = 3, x = 5, y = 6) +
  theme_cowplot() +
  theme(legend.position = "none")

# Re-test covariate/PC associations on the adjusted data (same procedure as
# before RUVs): one-way linear model per covariate per PC.
p_comps <- 1:5
info <- sampleinfo %>%
  dplyr::select(c(Individual, Sex, Replicate, treatment, RIN, LibraryPrepBatch, LibSize)) #subset sample info for technical/biological variables

# R^2 matrix: covariates (rows) x PCs (columns)
pc_cov_cor <- matrix(nrow = ncol(info), ncol = length(p_comps),
                     dimnames = list(colnames(info), colnames(pca_genes$x)[p_comps]))
# F-test p-value matrix: PCs (rows) x covariates (columns); labels from info
PC_pvalues <- matrix(NA, nrow = length(p_comps), ncol = ncol(info),
                     dimnames = list(paste0("PC", p_comps), colnames(info)))

for (pc in p_comps) {
  for (covariate in seq_len(ncol(info))) {
    lm_result <- lm(pca_genes$x[, pc] ~ info[, covariate])
    lm_summary <- summary(lm_result)
    fstat <- lm_summary$fstatistic
    # Upper-tail F probability; lower.tail = FALSE avoids the round-off of 1 - pf()
    PC_pvalues[pc, covariate] <- pf(fstat[1], fstat[2], fstat[3], lower.tail = FALSE)
    pc_cov_cor[covariate, pc] <- lm_summary$r.squared
  }
}
datatable(pc_cov_cor)

# BH (FDR) adjustment across all PC-by-covariate tests
fdr_val <- p.adjust(PC_pvalues, method = "fdr")
hist(sort(fdr_val), ylab = "BH-adjusted p-values",
     main = "Distribution of Benjamini and Hochberg adjusted p-values", breaks = 10)
matrix_fdr_val <- matrix(fdr_val, nrow = length(p_comps), ncol = ncol(info),
                         dimnames = dimnames(PC_pvalues))

#Get the coordinates of which variables/PC combinations are significant at FDR 5%
TorF_matrix_fdr <- matrix_fdr_val <= 0.05
coor_to_check <- as.data.frame(which(matrix_fdr_val <= 0.05, arr.ind = TRUE))
matrix_fdr_val
coor_to_check # significant covariate/PC pairs remaining AFTER RUVs adjustment

# Convert to long format to plot in ggplot2 (pivot_longer supersedes gather)
pc_cov_cor_2 <- as.data.frame(pc_cov_cor)
pc_cov_cor_2$variable <- rownames(pc_cov_cor)
pc_cov_cor_2 <- pivot_longer(pc_cov_cor_2, cols = -variable,
                             names_to = "pc", values_to = "cor")
head(pc_cov_cor_2)

# Heatmap of R^2 between PCs and experimental variables after RUVs
d_heatmap <- pc_cov_cor_2
d_heatmap$variable <- factor(d_heatmap$variable,
                             levels = c("Individual", "Sex", "Replicate",
                                        "treatment", "RIN", "LibraryPrepBatch", "LibSize"))
pca_heat <- ggplot(d_heatmap, aes(x = pc, y = variable)) +
  geom_tile(aes(fill = cor), colour = "white") +
  scale_fill_gradient(low = "white", high = "red", limits = c(0, 1)) +
  labs(x = "Principal Component", y = "",
       title = "Correlation between principal components and experimental variables")
pca_heat
```
# Save data and RUVs output for unwanted variation (contains the normalized/filtered count data as well as sample information data and W1+W2 values from RUVs)
```{r}
# Persist the RUVs output object (contains the normalized counts, the sample
# metadata in pData, and the fitted W_1/W_2 unwanted-variation factors) for
# use by downstream analysis files.
saveRDS(set1, "data/RUVsOut.rds")
```