-
Notifications
You must be signed in to change notification settings - Fork 0
/
removeUnwantedVariation_bulkRNA.Rmd
348 lines (272 loc) · 14.6 KB
/
removeUnwantedVariation_bulkRNA.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
---
title: "removeUnwantedVariation_bulkRNA"
author: "Anthony Hung"
date: "2021-01-19"
output: workflowr::wflow_html
editor_options:
chunk_output_type: console
---
---
title: "Analysis of Technical factors"
author: "Anthony Hung"
date: "2019-12-16"
output: workflowr::wflow_html
editor_options:
chunk_output_type: console
---
# Examine normalized/filtered data and see if any PCs correlate with technical factors
```{r load data libraries}
library("gplots")
library("ggplot2")
library("reshape")
library("edgeR")
library("RColorBrewer")
library("scales")
library("cowplot")
library("DT")
library("tidyr")
library("RUVSeq")
library("dplyr")
# Load colors
colors <- colorRampPalette(c(brewer.pal(9, "Blues")[1],brewer.pal(9, "Blues")[9]))(100)
pal <- c(brewer.pal(9, "Set1"), brewer.pal(8, "Set2"), brewer.pal(12, "Set3"))
#load in normalized/filtered data
filt_norm_counts <- readRDS("data/norm_filtered_counts.rds")
#load in filtered counts
filt_counts <- readRDS("data/filtered_counts.rds")
# load in reordered sample information
sampleinfo <- readRDS("data/Sample.info.RNAseq.reordered.rds")
```
# PCA and hierarchical clustering of normalized and filtered data
```{r clustering norm_filt_data}
#Load PCA plotting Function
source("code/PCA_fn.R")
#spearman
cors <- cor(filt_norm_counts, method="spearman", use="pairwise.complete.obs")
labels <- paste(sampleinfo$Individual, sampleinfo$treatment, sep=" ")
heatmap.2( cors, scale="none", col = colors, margins = c(12, 12), trace='none', denscol="white", labCol=labels, ColSideColors=pal[as.integer(as.factor(sampleinfo$Individual))], RowSideColors=pal[as.integer(as.factor(sampleinfo$treatment))+3], cexCol = 0.2 + 1/log10(15), cexRow = 0.2 + 1/log10(15))
#extract correlations between technical replicates
coords <- cbind(rbind(c(1,5),c(1,10), c(2,9), c(2,12), c(3,4), c(3,16), c(4,16), c(5,10), c(6,11), c(6,15), c(7,13), c(8,14), c(8,17), c(9,12), c(11,15), c(14,17)))
colnames(coords) <- c("index1","index2")
coords
cors_technical <- cors[coords]
median(cors_technical)
min(cors_technical)
#pearson
cors <- cor(filt_norm_counts, method="pearson", use="pairwise.complete.obs")
labels <- paste(sampleinfo$Individual, sampleinfo$treatment, sep=" ")
heatmap.2( cors, scale="none", col = colors, margins = c(12, 12), trace='none', denscol="white", labCol=labels, ColSideColors=pal[as.integer(as.factor(sampleinfo$Individual))], RowSideColors=pal[as.integer(as.factor(sampleinfo$treatment))+3], cexCol = 0.2 + 1/log10(15), cexRow = 0.2 + 1/log10(15))
```
#Perform PCA and color by factors to look for any potential correlations
```{r PCA}
# Perform PCA
pca_genes <- prcomp(t(filt_norm_counts), scale = T)
scores <- pca_genes$x
variances <- pca_genes$sdev^2
explained <- variances / sum(variances)
plot(pca_genes, main = "Variance per PC")
#Make PCA plots with the factors colored by Individual
### PCA norm+filt Data
for (n in 1:5){
col.v <- pal[as.integer(sampleinfo$Individual)]
plot_scores(pca_genes, scores, n, n+1, col.v)
}
```
# Check that technical factors do not correlate with PCs
```{r corr PC}
# Calculate the relationship between each recorded covariate and the top 5 PCs.
p_comps <- 1:5
info <- sampleinfo %>%
dplyr::select(c(Individual, Sex, Replicate, treatment, RIN, LibraryPrepBatch, LibSize)) #subset sample info for technical/biological variables
#Calculate correlations
pc_cov_cor <- matrix(nrow = ncol(info), ncol = length(p_comps),
dimnames = list(colnames(info), colnames(pca_genes$x)[p_comps]))
PC_pvalues <- matrix(data = NA, nrow = 5, ncol = 7, dimnames = list(c("PC1", "PC2", "PC3", "PC4", "PC5"), c("Individual", "Sex", "Replicate", "Treatment", "RIN", "LibraryPrepBatch", "LibSize")))
for (pc in p_comps) {
for (covariate in 1:ncol(info)) {
lm_result <- lm(pca_genes$x[, pc] ~ info[, covariate])
r2 <- summary(lm_result)$r.squared
fstat <- as.data.frame(summary(lm_result)$fstatistic)
p_fstat <- 1-pf(fstat[1,], fstat[2,], fstat[3,])
PC_pvalues[pc, covariate] <- p_fstat
pc_cov_cor[covariate, pc] <- r2
}
}
datatable(pc_cov_cor)
#make PCA plots with symbols as treatment status and colors as individuals for figure
library(ggfortify)
autoplot(pca_genes, data = sampleinfo, colour = "Individual", shape = "treatment", size = 3) +
theme_cowplot() +
theme(legend.position = "none")
autoplot(pca_genes, data = sampleinfo, colour = "Individual", shape = "treatment") +
theme_cowplot()
autoplot(pca_genes, data = sampleinfo, colour = "Individual", shape = "treatment", size = 3, x = 3, y = 4) +
theme_cowplot() +
theme(legend.position = "none")
autoplot(pca_genes, data = sampleinfo, colour = "Individual", shape = "treatment", size = 3, x = 5, y = 6) +
theme_cowplot() +
theme(legend.position = "none")
#BH adjust for multiple testing for the p-values for correlation
#Distribution of p-values adjusted by FDR
fdr_val <- p.adjust(PC_pvalues, method = "fdr", n = length(PC_pvalues))
fdr_val_order <- fdr_val[order(fdr_val)]
hist(fdr_val_order, ylab = "BH-adjusted p-values", main = "Distribution of Benjamini and Hochberg adjusted p-values", breaks = 10)
fdr_val <- matrix(fdr_val, nrow = 5, ncol = 7)
matrix_fdr_val <- matrix(fdr_val, nrow = 5, ncol = 7, dimnames = list(c("PC1", "PC2", "PC3", "PC4", "PC5"), c("Individual", "Sex", "Replicate", "Treatment", "RIN", "LibraryPrepBatch", "LibSize")))
#Get the coordinates of which variables/PC combinations are significant at FDR 5%
TorF_matrix_fdr <- matrix_fdr_val <=0.05
coor_to_check <- which(matrix_fdr_val <= 0.05, arr.ind=T)
coor_to_check <- as.data.frame(coor_to_check)
matrix_fdr_val
coor_to_check # Individual has most significant correlation with pc1 and 2, and sex correlates with pc2 (probably due to the individual effect)
#Convert to long format to plot in ggplot2
pc_cov_cor_2 <- as.data.frame(pc_cov_cor)
pc_cov_cor_2$variable <- rownames(pc_cov_cor)
pc_cov_cor_2 <- gather(pc_cov_cor_2, key = "pc", value = "cor", -variable)
head(pc_cov_cor_2)
#Plot heatmap
d_heatmap <- pc_cov_cor_2
d_heatmap$variable <- factor(d_heatmap$variable,
levels = c("Individual", "Sex", "Replicate",
"treatment", "RIN", "LibraryPrepBatch", "LibSize"),
labels = c("Individual", "Sex", "Replicate",
"treatment", "RIN", "LibraryPrepBatch", "LibSize"))
pca_heat <- ggplot(d_heatmap, aes(x = pc, y = variable)) +
geom_tile(aes(fill = cor), colour = "white") +
scale_fill_gradient(low = "white", high = "red", limits = c(0, 1)) +
labs(x = "Principal Component", y = "",
title = "Correlation between principal components and experimental variables")
pca_heat
```
# Remove unwanted variation using RUVSeq
```{r RUV}
#The RUVSeq vignette loads raw counts and uses the RUVSeq package to filter and normalize data (upper quantile normalization) before performing RUVs.
#Use RUVs (replicates) and adjust k until corr heatmap looks right
replicates <- makeGroups(paste0(sampleinfo$Individual, sampleinfo$treatment))
x <- paste0(sampleinfo$Individual, sampleinfo$treatment)
#load data into expressionset
set <- newSeqExpressionSet(as.matrix(filt_counts$counts),
phenoData = data.frame(sampleinfo, row.names=colnames(filt_norm_counts)))
set
#normalization
set <- betweenLaneNormalization(x = set, which = "upper", round = T)
#Spearman
cors <- cor(set@assayData$normalizedCounts, method="spearman", use="pairwise.complete.obs")
labels <- paste(sampleinfo$Individual, sampleinfo$treatment, sep=" ")
heatmap.2( cors, scale="none", col = colors, margins = c(12, 12), trace='none', denscol="white", labCol=labels, ColSideColors=pal[as.integer(as.factor(sampleinfo$Individual))], RowSideColors=pal[as.integer(as.factor(sampleinfo$treatment))+3], cexCol = 0.2 + 1/log10(15), cexRow = 0.2 + 1/log10(15))
#Pearson
cors <- cor(set@assayData$normalizedCounts, method="pearson", use="pairwise.complete.obs")
labels <- paste(sampleinfo$Individual, sampleinfo$treatment, sep=" ")
heatmap.2( cors, scale="none", col = colors, margins = c(12, 12), trace='none', denscol="white", labCol=labels, ColSideColors=pal[as.integer(as.factor(sampleinfo$Individual))], RowSideColors=pal[as.integer(as.factor(sampleinfo$treatment))+3], cexCol = 0.2 + 1/log10(15), cexRow = 0.2 + 1/log10(15))
#
set1 <- RUVs(x=set, cIdx = rownames(filt_counts), k=2, scIdx = replicates, round = F)
pData(set1)
#Plots after correcting RUV
plotRLE(set1, outline=FALSE, ylim=c(-4, 4), col=colors[x])
plotPCA(set1, col=colors[x], cex=1.2)
#Spearman
cors <- cor(set1@assayData$normalizedCounts, method="spearman", use="pairwise.complete.obs")
labels <- paste(sampleinfo$Individual, sampleinfo$treatment, sep=" ")
heatmap.2( cors, scale="none", col = colors, margins = c(12, 12), trace='none', denscol="white", labCol=labels, ColSideColors=pal[as.integer(as.factor(sampleinfo$Individual))], RowSideColors=pal[as.integer(as.factor(sampleinfo$treatment))+3], cexCol = 0.2 + 1/log10(15), cexRow = 0.2 + 1/log10(15))
#extract correlations between technical replicates
coords <- cbind(rbind(c(1,5),c(1,10), c(2,9), c(2,12), c(3,4), c(3,16), c(4,16), c(5,10), c(6,11), c(6,15), c(7,13), c(8,14), c(8,17), c(9,12), c(11,15), c(14,17)))
colnames(coords) <- c("index1","index2")
coords
cors_technical <- cors[coords]
median(cors_technical)
min(cors_technical)
#Pearson
cors <- cor(set1@assayData$normalizedCounts, method="pearson", use="pairwise.complete.obs")
labels <- paste(sampleinfo$Individual, sampleinfo$treatment, sep=" ")
heatmap.2( cors, scale="none", col = colors, margins = c(12, 12), trace='none', denscol="white", labCol=labels, ColSideColors=pal[as.integer(as.factor(sampleinfo$Individual))], RowSideColors=pal[as.integer(as.factor(sampleinfo$treatment))+3], cexCol = 0.2 + 1/log10(15), cexRow = 0.2 + 1/log10(15))
```
"The RUVg [RUVs] function returns two pieces of information: the estimated factors of unwanted
variation (added as columns to the phenoData slot of set) and the normalized counts
obtained by regressing the original counts on the unwanted factors. The normalized
values are stored in the normalizedCounts slot of set and can be accessed with the
normCounts method. These counts should be used only for exploration. It is important
that subsequent DE analysis be done on the original counts (accessible through the
counts method), as removing the unwanted factors from the counts can also remove
part of a factor of interest [6]."
# Check that technical factors do not correlate with PCs after RUVs
```{r corr PC after RUVs}
# Perform PCA
pca_genes <- prcomp(t(set1@assayData$normalizedCounts), scale = T)
scores <- pca_genes$x
variances <- pca_genes$sdev^2
explained <- variances / sum(variances)
plot(pca_genes, main = "Variance per PC")
#Make PCA plots with the factors colored by Individual
### PCA norm+filt Data
for (n in 1:5){
col.v <- pal[as.integer(sampleinfo$Individual)]
plot_scores(pca_genes, scores, n, n+1, col.v)
}
#make PCA plots with symbols as treatment status and colors as individuals for figure
library(ggfortify)
autoplot(pca_genes, data = sampleinfo, colour = "Individual", shape = "treatment", size = 3) +
theme_cowplot() +
theme(legend.position = "none")
autoplot(pca_genes, data = sampleinfo, colour = "Individual", shape = "treatment") +
theme_cowplot()
autoplot(pca_genes, data = sampleinfo, colour = "Individual", shape = "treatment", size = 3, x = 3, y = 4) +
theme_cowplot() +
theme(legend.position = "none")
autoplot(pca_genes, data = sampleinfo, colour = "Individual", shape = "treatment", size = 3, x = 5, y = 6) +
theme_cowplot() +
theme(legend.position = "none")
# Calculate the relationship between each recorded covariate and the top 5 PCs.
p_comps <- 1:5
info <- sampleinfo %>%
dplyr::select(c(Individual, Sex, Replicate, treatment, RIN, LibraryPrepBatch, LibSize)) #subset sample info for technical/biological variables
#Calculate correlations
pc_cov_cor <- matrix(nrow = ncol(info), ncol = length(p_comps),
dimnames = list(colnames(info), colnames(pca_genes$x)[p_comps]))
PC_pvalues <- matrix(data = NA, nrow = 5, ncol = 7, dimnames = list(c("PC1", "PC2", "PC3", "PC4", "PC5"), c("Individual", "Sex", "Replicate", "Treatment", "RIN", "LibraryPrepBatch", "LibSize")))
for (pc in p_comps) {
for (covariate in 1:ncol(info)) {
lm_result <- lm(pca_genes$x[, pc] ~ info[, covariate])
r2 <- summary(lm_result)$r.squared
fstat <- as.data.frame(summary(lm_result)$fstatistic)
p_fstat <- 1-pf(fstat[1,], fstat[2,], fstat[3,])
PC_pvalues[pc, covariate] <- p_fstat
pc_cov_cor[covariate, pc] <- r2
}
}
datatable(pc_cov_cor)
#BH adjust for multiple testing for the p-values for correlation
#Distribution of p-values adjusted by FDR
fdr_val <- p.adjust(PC_pvalues, method = "fdr", n = length(PC_pvalues))
fdr_val_order <- fdr_val[order(fdr_val)]
hist(fdr_val_order, ylab = "BH-adjusted p-values", main = "Distribution of Benjamini and Hochberg adjusted p-values", breaks = 10)
fdr_val <- matrix(fdr_val, nrow = 5, ncol = 7)
matrix_fdr_val <- matrix(fdr_val, nrow = 5, ncol = 7, dimnames = list(c("PC1", "PC2", "PC3", "PC4", "PC5"), c("Individual", "Sex", "Replicate", "Treatment", "RIN", "LibraryPrepBatch", "LibSize")))
#Get the coordinates of which variables/PC combinations are significant at FDR 5%
TorF_matrix_fdr <- matrix_fdr_val <=0.05
coor_to_check <- which(matrix_fdr_val <= 0.05, arr.ind=T)
coor_to_check <- as.data.frame(coor_to_check)
matrix_fdr_val
coor_to_check # Individual has most significant correlation with pc1 and 2, and sex correlates with pc2 (probably due to the individual effect)
#Convert to long format to plot in ggplot2
pc_cov_cor_2 <- as.data.frame(pc_cov_cor)
pc_cov_cor_2$variable <- rownames(pc_cov_cor)
pc_cov_cor_2 <- gather(pc_cov_cor_2, key = "pc", value = "cor", -variable)
head(pc_cov_cor_2)
#Plot heatmap
d_heatmap <- pc_cov_cor_2
d_heatmap$variable <- factor(d_heatmap$variable,
levels = c("Individual", "Sex", "Replicate",
"treatment", "RIN", "LibraryPrepBatch", "LibSize"),
labels = c("Individual", "Sex", "Replicate",
"treatment", "RIN", "LibraryPrepBatch", "LibSize"))
pca_heat <- ggplot(d_heatmap, aes(x = pc, y = variable)) +
geom_tile(aes(fill = cor), colour = "white") +
scale_fill_gradient(low = "white", high = "red", limits = c(0, 1)) +
labs(x = "Principal Component", y = "",
title = "Correlation between principal components and experimental variables")
pca_heat
```
# Save data and RUVs output for unwanted variation (contains the same cpm data as before as well as pheno data, W values from RUVs)
```{r}
saveRDS(set1, "data/RUVsOut.rds")
```