-
Notifications
You must be signed in to change notification settings - Fork 0
/
variation_bulkRNA.Rmd
204 lines (155 loc) · 6.38 KB
/
variation_bulkRNA.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
---
title: "variation_bulkRNA"
author: "Anthony Hung"
date: "2021-01-19"
output: workflowr::wflow_html
editor_options:
chunk_output_type: console
---
---
title: "Partition Variance"
author: "Anthony Hung"
date: "2020-04-26"
output: workflowr::wflow_html
editor_options:
chunk_output_type: console
---
# Use the variancePartition package to quantify the percent of variance in the data explained by biological/technical factors
# Load packages and normalized data
```{r load}
library(variancePartition)
library(RUVSeq)
library(edgeR)
library(dplyr)
#raw(filtered)
counts_upperquartile <- readRDS("data/filtered_counts.rds")$counts
#normalized
filtered_upperquartile <- readRDS("data/norm_filtered_counts.rds")
filtered_RLE <- readRDS("data/norm_filtered_counts_RLE.rds")
#metadata
sampleinfo <- readRDS("data/Sample.info.RNAseq.reordered.rds")
```
# Additional RUVseq analysis: Using top 100 (~1% of the genes considered) LEAST variable genes as control genes to get an output of W variables
```{r identify least variable genes}
#input data consists of raw filtered data (filtered for lowly expressed genes)
#compute CV (stdev/mean) and rank from least to most; pick 100 least variable
cv <- apply(counts_upperquartile, 1, function(x) sd(x)/mean(x))
least_var_genes <- names(head(sort(cv), 100))
```
```{r RUVSeq least variable genes}
#The RUVSeq vignette loads raw counts and uses the RUVSeq package to filter and normalize data (upper quantile normalization) before performing RUVg.
#Use RUVg
#load data into expressionset
set <- newSeqExpressionSet(as.matrix(counts_upperquartile),phenoData = data.frame(sampleinfo, row.names=colnames(counts_upperquartile)))
set
#normalization
set <- betweenLaneNormalization(x = set, which = "upper")
#run RUVg
set1 <- RUVg(set, least_var_genes, k=1)
sample_info <- pData(set1)
sample_info$Replicate <- as.factor(sample_info$Replicate)
sample_info$LibraryPrepBatch <- as.factor(sample_info$LibraryPrepBatch)
#fit with W1 and W2
set1_W2 <- RUVg(set, least_var_genes, k=2)
sample_info_W2 <- pData(set1_W2)
sample_info_W2$Replicate <- as.factor(sample_info_W2$Replicate)
sample_info_W2$LibraryPrepBatch <- as.factor(sample_info_W2$LibraryPrepBatch)
```
# Specify variables to consider
```{r variables_to_consider}
# Specify variables to consider
form <- ~ (1|Individual) + (1|treatment) + (1|Replicate) + (1|LibraryPrepBatch) + (1|Individual:treatment) + W_1
form_no_interaction <- ~ (1|Individual) + (1|treatment) + (1|Replicate) + (1|LibraryPrepBatch) + W_1
# W2
form_W2 <- ~ (1|Individual) + (1|treatment) + (1|Replicate) + (1|LibraryPrepBatch) + (1|Individual:treatment) + W_1 + W_2
form_no_interaction_W2 <- ~ (1|Individual) + (1|treatment) + (1|Replicate) + (1|LibraryPrepBatch) + W_1 + W_2
# No ruv
form_no_ruv <- ~ (1|Individual) + (1|treatment) + (1|Replicate) + (1|LibraryPrepBatch)
```
# Run function
```{r function_1}
# Fit model and extract results
# 1) fit linear mixed model on gene expression
# If categorical variables are specified,
# a linear mixed model is used
# If all variables are modeled as fixed effects,
# a linear model is used
# each entry in results is a regression model fit on a single gene
# 2) extract variance fractions from each model fit
# for each gene, returns fraction of variation attributable
# to each variable
# Interpretation: the variance explained by each variables
# after correcting for all other variables
varPart_1_int <- fitExtractVarPartModel( filtered_RLE, form, sample_info )
# sort variables (i.e. columns) by median fraction
# of variance explained
vp <- sortCols(varPart_1_int)
# Bar plot of variance fractions for the first 10 genes
plotPercentBars( vp[1:10,] )
# violin plot of contribution of each variable to total variance
plotVarPart( vp )
#no interaction
varPart_1 <- fitExtractVarPartModel( filtered_RLE, form_no_interaction, sample_info )
vp <- sortCols(varPart_1)
plotVarPart( vp )
#W2
varPart <- fitExtractVarPartModel( filtered_RLE, form_W2, sample_info_W2 )
vp <- sortCols(varPart)
plotVarPart( vp )
#no interaction
varPart <- fitExtractVarPartModel( filtered_RLE, form_no_interaction_W2, sample_info_W2 )
vp <- sortCols(varPart)
plotVarPart( vp )
#noRuv
VarPart_noRUV <- fitExtractVarPartModel( filtered_RLE, form_no_ruv, sample_info )
vp_noRUV <- sortCols(VarPart_noRUV)
plotVarPart( vp_noRUV )
```
# Looking at treatment
```{r}
# load gene annotations
gene_anno <- read.delim("data/gene-annotation.txt",
sep = "\t")
#for W1, interaction term included
# looking at genes that have majority var explained by treatment or interaction term
#treatment
sorted_treatment <- varPart_1_int[order(varPart_1_int$treatment, decreasing=TRUE),]
plotPercentBars(sorted_treatment[1:10,])
treatment_genes <- rownames(sorted_treatment)
treatment_symbols <- gene_anno$external_gene_name[match(treatment_genes, gene_anno$ensembl_gene_id)]
head(treatment_symbols)
treatment_anno <- gene_anno[match(treatment_genes, gene_anno$ensembl_gene_id),]
head(treatment_anno)
#int term
sorted_treatment <- varPart_1_int[order(varPart_1_int$`Individual:treatment`, decreasing=TRUE),]
plotPercentBars(sorted_treatment[1:10,])
treatment_genes <- rownames(sorted_treatment)
#genes with interaction term explaining at least 60% of the variation
interaction_genes <- rownames(sorted_treatment)[sorted_treatment$`Individual:treatment` > 0.6]
saveRDS(interaction_genes, "output/interaction_genes.rds")
treatment_symbols <- gene_anno$external_gene_name[match(treatment_genes, gene_anno$ensembl_gene_id)]
head(treatment_symbols)
treatment_anno <- gene_anno[match(treatment_genes, gene_anno$ensembl_gene_id),]
head(treatment_anno)
#for W1, no interaction term
# looking at genes that have majority var explained by treatment
sorted_treatment <- varPart_1[order(varPart$treatment, decreasing=TRUE),]
plotPercentBars(sorted_treatment[1:10,])
treatment_genes <- rownames(sorted_treatment)
treatment_symbols <- gene_anno$external_gene_name[match(treatment_genes, gene_anno$ensembl_gene_id)]
head(treatment_symbols)
treatment_anno <- gene_anno[match(treatment_genes, gene_anno$ensembl_gene_id),]
head(treatment_anno, 100)
```
# Raw data analysis
# Specify variables to consider
```{r variables}
# Specify variables to consider
form_no_interaction <- ~ (1|Individual) + (1|treatment) + (1|Replicate) + (1|LibraryPrepBatch)
```
# Run function
```{r function}
varPart_1 <- fitExtractVarPartModel( counts_upperquartile, form_no_interaction, sample_info )
vp <- sortCols(varPart_1)
plotVarPart( vp )
```