-
Notifications
You must be signed in to change notification settings - Fork 0
/
removeUnwantedVariation_bulkRNA.Rmd
272 lines (218 loc) · 11.4 KB
/
removeUnwantedVariation_bulkRNA.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
---
title: "Modeling and removing unwanted technical variation"
author: "Anthony Hung"
date: "2021-01-19"
output: workflowr::wflow_html
editor_options:
chunk_output_type: console
---
# Introduction
This code takes in the normalized and filtered count data. It first performs a PCA on the data. It then looks for correlations between the first 5 PCs and several experimental variables before and after fitting two factors of unwanted variation in the data using RUVs (taking advantage of the technical replicates in our study design). We then save the RUVs output for use in later files.
# Examine normalized/filtered data before adjusting with RUVs
Load data and libraries
```{r load data libraries}
library("gplots")
library("ggplot2")
library("reshape")
library("edgeR")
library("RColorBrewer")
library("scales")
library("cowplot")
library("DT")
library("tidyr")
library("RUVSeq")
library("dplyr")
# Load colors
# 100-step blue gradient (for heatmaps) and a large qualitative palette
# (Set1+Set2+Set3) used to color samples by individual in PCA plots.
colors <- colorRampPalette(c(brewer.pal(9, "Blues")[1],brewer.pal(9, "Blues")[9]))(100)
pal <- c(brewer.pal(9, "Set1"), brewer.pal(8, "Set2"), brewer.pal(12, "Set3"))
#load in normalized/filtered data
filt_norm_counts <- readRDS("data/norm_filtered_counts.rds")
#load in filtered counts (raw counts; presumably an edgeR DGEList given the
# $counts access later — TODO confirm against the file that produced it)
filt_counts <- readRDS("data/filtered_counts.rds")
# load in reordered sample information (one row per sample, ordered to match
# the count-matrix columns)
sampleinfo <- readRDS("data/Sample.info.RNAseq.reordered.rds")
#Load PCA plotting Function (provides plot_scores(), used below)
source("code/PCA_fn.R")
```
# PCA of normalized and filtered data
Here, we plot the samples represented by their sample IDs (more informative) and by symbols (for a more aesthetic presentation in figures).
```{r clustering norm_filt_data}
library(ggfortify) # provides autoplot() for prcomp objects

# PCA on samples (genes as variables); each gene is centered and scaled to
# unit variance. Use TRUE, not the reassignable shorthand T.
pca_genes <- prcomp(t(filt_norm_counts), scale = TRUE)
scores <- pca_genes$x
variances <- pca_genes$sdev^2
explained <- variances / sum(variances) # proportion of variance per PC
plot(pca_genes, main = "Variance per PC")

# PCA scatterplots of consecutive PC pairs (PC n vs PC n+1), colored by
# Individual. The color vector is loop-invariant, so compute it once.
col.v <- pal[as.integer(sampleinfo$Individual)]
for (n in 1:5) {
  plot_scores(pca_genes, scores, n, n + 1, col.v)
}

# Figure-quality PCA plots: color = individual, shape = treatment status
autoplot(pca_genes, data = sampleinfo, colour = "Individual", shape = "treatment", size = 3) +
  theme_cowplot() +
  theme(legend.position = "none")
autoplot(pca_genes, data = sampleinfo, colour = "Individual", shape = "treatment") +
  theme_cowplot()
autoplot(pca_genes, data = sampleinfo, colour = "Individual", shape = "treatment", size = 3, x = 3, y = 4) +
  theme_cowplot() +
  theme(legend.position = "none")
autoplot(pca_genes, data = sampleinfo, colour = "Individual", shape = "treatment", size = 3, x = 5, y = 6) +
  theme_cowplot() +
  theme(legend.position = "none")
```
# Examine correlations between top PCs and experimental variables
After performing our PCA, we calculate the correlation between the top 5 PCs and several recorded experimental variables, looking for significant correlations. We present them in a table as well as in a heatmap. We also find significant correlations after correction for multiple testing using a BH procedure.
```{r corr PC}
# Test the association between each recorded covariate and each of the top 5
# PCs with a one-way linear model; record R^2 and the overall F-test p-value.
p_comps <- 1:5
info <- sampleinfo %>%
  dplyr::select(c(Individual, Sex, Replicate, treatment, RIN, LibraryPrepBatch, LibSize)) #subset sample info for technical/biological variables

# R^2 matrix: covariates (rows) x PCs (columns)
pc_cov_cor <- matrix(nrow = ncol(info), ncol = length(p_comps),
                     dimnames = list(colnames(info), colnames(pca_genes$x)[p_comps]))
# F-test p-value matrix: PCs (rows) x covariates (columns). Labels are taken
# from info so they stay consistent with pc_cov_cor (e.g. "treatment").
PC_pvalues <- matrix(NA, nrow = length(p_comps), ncol = ncol(info),
                     dimnames = list(paste0("PC", p_comps), colnames(info)))

for (pc in p_comps) {
  for (covariate in seq_len(ncol(info))) {
    lm_result <- lm(pca_genes$x[, pc] ~ info[, covariate])
    lm_summary <- summary(lm_result)
    fstat <- lm_summary$fstatistic
    # Upper-tail F probability; lower.tail = FALSE avoids the round-off of 1 - pf()
    PC_pvalues[pc, covariate] <- pf(fstat[1], fstat[2], fstat[3], lower.tail = FALSE)
    pc_cov_cor[covariate, pc] <- lm_summary$r.squared
  }
}
datatable(pc_cov_cor)

# BH (FDR) adjustment across all PC-by-covariate tests; n defaults to the
# number of p-values, so it need not be passed explicitly.
fdr_val <- p.adjust(PC_pvalues, method = "fdr")
hist(sort(fdr_val), ylab = "BH-adjusted p-values",
     main = "Distribution of Benjamini and Hochberg adjusted p-values", breaks = 10)
# Reshape the adjusted p-values back into the PCs x covariates layout
# (p.adjust flattened the matrix column-major; matrix() refills the same way).
matrix_fdr_val <- matrix(fdr_val, nrow = length(p_comps), ncol = ncol(info),
                         dimnames = dimnames(PC_pvalues))

#Get the coordinates of which variables/PC combinations are significant at FDR 5%
TorF_matrix_fdr <- matrix_fdr_val <= 0.05
coor_to_check <- as.data.frame(which(matrix_fdr_val <= 0.05, arr.ind = TRUE))
matrix_fdr_val
coor_to_check # Individual has most significant correlation with pc1 and 2, and sex correlates with pc2 (probably due to the individual effect)

# Convert to long format to plot in ggplot2 (pivot_longer supersedes gather)
pc_cov_cor_2 <- as.data.frame(pc_cov_cor)
pc_cov_cor_2$variable <- rownames(pc_cov_cor)
pc_cov_cor_2 <- pivot_longer(pc_cov_cor_2, cols = -variable,
                             names_to = "pc", values_to = "cor")
head(pc_cov_cor_2)

# Heatmap of R^2 between PCs and experimental variables; factor() fixes the
# row order (labels defaulted to levels, so the duplicate arg is dropped).
d_heatmap <- pc_cov_cor_2
d_heatmap$variable <- factor(d_heatmap$variable,
                             levels = c("Individual", "Sex", "Replicate",
                                        "treatment", "RIN", "LibraryPrepBatch", "LibSize"))
pca_heat <- ggplot(d_heatmap, aes(x = pc, y = variable)) +
  geom_tile(aes(fill = cor), colour = "white") +
  scale_fill_gradient(low = "white", high = "red", limits = c(0, 1)) +
  labs(x = "Principal Component", y = "",
       title = "Correlation between principal components and experimental variables")
pca_heat
```
# Remove unwanted variation using RUVSeq
After examining the correlation structure between major axes of variation and experimental variables in the unadjusted data, we are interested in modeling two factors of unwanted variation in the data. Here, we take advantage of the technical replicate structure of our study design and employ RUVs to fit these two unwanted sources of variation.
```{r RUV}
# RUVs exploits the technical-replicate structure: samples sharing the same
# individual + treatment combination are declared replicates of one another.
replicates <- makeGroups(paste0(sampleinfo$Individual, sampleinfo$treatment))

# Load the raw filtered counts into a SeqExpressionSet together with the
# sample metadata; row names are aligned to the count-matrix columns.
set <- newSeqExpressionSet(as.matrix(filt_counts$counts),
                           phenoData = data.frame(sampleinfo, row.names = colnames(filt_norm_counts)))
set

# Upper-quartile between-lane normalization (rounded to integer counts)
set <- betweenLaneNormalization(x = set, which = "upper", round = TRUE)

# Fit k = 2 factors of unwanted variation, using all genes as the negative-
# control set (cIdx) and the replicate groups defined above (scIdx).
set1 <- RUVs(x = set, cIdx = rownames(filt_counts), k = 2, scIdx = replicates, round = FALSE)
pData(set1)
```
# Examine correlations between top PCs and experimental variables after RUVs
As above, we examine the correlations between top 5 PCs and experimental variables, but with the data adjusted for the two RUVs factors of unwanted variation.
```{r corr PC after RUVs}
library(ggfortify) # provides autoplot() for prcomp objects

# PCA on the RUVs-adjusted counts. Use the normCounts() accessor instead of
# reaching into the @assayData slot directly.
pca_genes <- prcomp(t(normCounts(set1)), scale = TRUE)
scores <- pca_genes$x
variances <- pca_genes$sdev^2
explained <- variances / sum(variances) # proportion of variance per PC
plot(pca_genes, main = "Variance per PC")

# PCA scatterplots of consecutive PC pairs, colored by Individual; the color
# vector is loop-invariant, so compute it once.
col.v <- pal[as.integer(sampleinfo$Individual)]
for (n in 1:5) {
  plot_scores(pca_genes, scores, n, n + 1, col.v)
}

# Figure-quality PCA plots: color = individual, shape = treatment status
autoplot(pca_genes, data = sampleinfo, colour = "Individual", shape = "treatment", size = 3) +
  theme_cowplot() +
  theme(legend.position = "none")
autoplot(pca_genes, data = sampleinfo, colour = "Individual", shape = "treatment") +
  theme_cowplot()
autoplot(pca_genes, data = sampleinfo, colour = "Individual", shape = "treatment", size = 3, x = 3, y = 4) +
  theme_cowplot() +
  theme(legend.position = "none")
autoplot(pca_genes, data = sampleinfo, colour = "Individual", shape = "treatment", size = 3, x = 5, y = 6) +
  theme_cowplot() +
  theme(legend.position = "none")

# Re-test covariate/PC associations on the adjusted data (same procedure as
# before RUVs): one-way linear model per covariate per PC.
p_comps <- 1:5
info <- sampleinfo %>%
  dplyr::select(c(Individual, Sex, Replicate, treatment, RIN, LibraryPrepBatch, LibSize)) #subset sample info for technical/biological variables

# R^2 matrix: covariates (rows) x PCs (columns)
pc_cov_cor <- matrix(nrow = ncol(info), ncol = length(p_comps),
                     dimnames = list(colnames(info), colnames(pca_genes$x)[p_comps]))
# F-test p-value matrix: PCs (rows) x covariates (columns); labels from info
PC_pvalues <- matrix(NA, nrow = length(p_comps), ncol = ncol(info),
                     dimnames = list(paste0("PC", p_comps), colnames(info)))

for (pc in p_comps) {
  for (covariate in seq_len(ncol(info))) {
    lm_result <- lm(pca_genes$x[, pc] ~ info[, covariate])
    lm_summary <- summary(lm_result)
    fstat <- lm_summary$fstatistic
    # Upper-tail F probability; lower.tail = FALSE avoids the round-off of 1 - pf()
    PC_pvalues[pc, covariate] <- pf(fstat[1], fstat[2], fstat[3], lower.tail = FALSE)
    pc_cov_cor[covariate, pc] <- lm_summary$r.squared
  }
}
datatable(pc_cov_cor)

# BH (FDR) adjustment across all PC-by-covariate tests
fdr_val <- p.adjust(PC_pvalues, method = "fdr")
hist(sort(fdr_val), ylab = "BH-adjusted p-values",
     main = "Distribution of Benjamini and Hochberg adjusted p-values", breaks = 10)
matrix_fdr_val <- matrix(fdr_val, nrow = length(p_comps), ncol = ncol(info),
                         dimnames = dimnames(PC_pvalues))

#Get the coordinates of which variables/PC combinations are significant at FDR 5%
TorF_matrix_fdr <- matrix_fdr_val <= 0.05
coor_to_check <- as.data.frame(which(matrix_fdr_val <= 0.05, arr.ind = TRUE))
matrix_fdr_val
coor_to_check # significant covariate/PC pairs remaining AFTER RUVs adjustment

# Convert to long format to plot in ggplot2 (pivot_longer supersedes gather)
pc_cov_cor_2 <- as.data.frame(pc_cov_cor)
pc_cov_cor_2$variable <- rownames(pc_cov_cor)
pc_cov_cor_2 <- pivot_longer(pc_cov_cor_2, cols = -variable,
                             names_to = "pc", values_to = "cor")
head(pc_cov_cor_2)

# Heatmap of R^2 between PCs and experimental variables after RUVs
d_heatmap <- pc_cov_cor_2
d_heatmap$variable <- factor(d_heatmap$variable,
                             levels = c("Individual", "Sex", "Replicate",
                                        "treatment", "RIN", "LibraryPrepBatch", "LibSize"))
pca_heat <- ggplot(d_heatmap, aes(x = pc, y = variable)) +
  geom_tile(aes(fill = cor), colour = "white") +
  scale_fill_gradient(low = "white", high = "red", limits = c(0, 1)) +
  labs(x = "Principal Component", y = "",
       title = "Correlation between principal components and experimental variables")
pca_heat
```
# Save data and RUVs output for unwanted variation (contains the normalized/filtered count data as well as sample information data and W1+W2 values from RUVs)
```{r}
# Persist the RUVs output object (contains the normalized counts, the sample
# metadata in pData, and the fitted W_1/W_2 unwanted-variation factors) for
# use by downstream analysis files.
saveRDS(set1, "data/RUVsOut.rds")
```