/
08_defining_reservoirs.Rmd
449 lines (369 loc) · 16.9 KB
/
08_defining_reservoirs.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
---
title: "Defining Reservoirs"
author: "JR"
date: "5/27/2020"
output: html_document
editor_options:
chunk_output_type: console
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(tidyverse)
library(GenomicRanges)
library(GenomicFeatures)
library(rtracklayer)
library(liftOver)
library(ggpubr)
source("../util/intersect_functions.R")
source("../util/_setup.R")
```
# Defining Reservoirs
##### Goal: to characterize the hundreds of promoters that have numerous
DBPs bound, but do not result in expression.
Here we want to find a cut off of binding events at promoters
that are expressed less than 0.001 TPM.
```{r import}
# Read in reservoir annotations combined with promoter peak occurence df.
base_path <- "../07_binding_versus_expression/results"
peak_occurrence_df <- read.csv(file.path(base_path,
"peak_occurrence_df_exp_added.csv"))
```
Density plots of binding events for lncRNAs and mRNAs expressed above 0.001 TPM
We see similar distributions with a slight shift of lncRNA promoters having
more promoters with > 60 DBPs bound
```{r dbp-tpm-density-plots, message=FALSE}
# Plotting the density distribution of number of dbps bound for all promoters.
g <- ggplot(peak_occurrence_df, aes(x = number_of_dbp))
g + geom_density(alpha = 0.2, fill = "#a8404c") +
ggtitle("Distribution of promoter DBP binding",
subtitle = "All genes") +
geom_vline(xintercept = 7, lty = 2)
ggsave("figures/all_genes_binding_distribution.pdf")
# Comparing the density distribution above for non-expressed
# promoters seperately.
# Interestingly, mRNA and lncRNA promoters that are not expressed
# have similar DBP density distributions.
# Unlike that was seen 07_binding_versus_expression
off_genes <- peak_occurrence_df %>% filter(tpm < 0.001)
g <- ggplot(off_genes, aes(x = number_of_dbp, fill = gene_type))
g + geom_density(alpha = 0.2) +
scale_fill_manual(values = c("#424242","#a8404c")) +
ggtitle("Distribution of promoter DBP binding",
subtitle = "Zero expression RNA-seq")
ggsave("figures/off_genes_binding_distribution.pdf")
# Density distribution of expressed genes with perhaps enrichment of
# lncNRAs in the 30:60 range of dbp binding events
g <- ggplot(peak_occurrence_df %>% filter(tpm > 0.001),
aes(x = number_of_dbp, fill = gene_type))
g + geom_density(alpha = 0.5) +
scale_fill_manual(values = c("#424242","#a8404c")) +
ggtitle("Distribution of promoter DBP binding",
subtitle = "TPM > 0.001 RNA-seq")
ggsave("figures/expressed_genes_binding_distribution.pdf")
```
```{r ecdf-promoter-binding, message=FALSE}
#### FIGURE: Supplemental 4B
num_peaks_threshold <- 7
g <- ggplot(peak_occurrence_df, aes(x = number_of_dbp))
g + stat_ecdf() +
geom_vline(xintercept = num_peaks_threshold, lty = 2, color = "#a8404c") +
ggtitle("ECDF of promoter binding events")
ggsave("figures/ecdf_promoter_binding_events.pdf")
# Percentile of cutoff
round(ecdf(peak_occurrence_df$number_of_dbp)(num_peaks_threshold)*100,1)
```
We defined 7 DBPs as 54% percentile and now we will define "reservoirs"
as having 7 binding events and expression less than 0.001 TPM
#### define reservoirs and write out to .csv
```{r define-reservoirs}
# Let's set a new column defining reservoirs in our peak_occurrence data frame
peak_occurrence_df$reservoir <-
as.numeric(peak_occurrence_df$number_of_dbp > 7 &
peak_occurrence_df$tpm < 0.001)
write_csv(peak_occurrence_df, "results/08_peak_occurence_df_resvoirs.csv")
```
```{r percent-lncrna, message=FALSE}
#### FIGURE: Supplemental 4C
reservoirs_by_gene_type <- peak_occurrence_df %>%
group_by(gene_type, reservoir) %>%
summarize(count = n())
g <- ggplot(reservoirs_by_gene_type,
aes(x = factor(reservoir, labels = c("Non-reservoir", "Reservoir")),
y = count, fill = gene_type))
g + geom_bar(position = "fill", stat = "identity") +
scale_fill_manual(values = c("#a8404c", "#424242")) +
ggtitle("Reservoir status by gene type") +
ylab("Fraction") +
xlab("")
ggsave("figures/reservoir_status_by_gene_type.pdf")
```
We still have two concerns: I. Overlap with Super Enhancer annotations
that have some similar proterties (e.g. many DBPs bound)
II. Neighboring gene-expression
# I. Overlap with super enhancers
Goal: to determine if reservoir promoters are comprised of
Super-Enhancer (SE) annotations
We used SE annotations from the Super-Enhancer DB (Sedb) from HG19
SEdb: https://asntech.org/dbsuper/
We will further lift over these regions to HG38
```{r import-sedb}
if (!file.exists("results/k562_superenhancers_hg38.bed")) {
# download super enhancer annotations
url <- "http://asntech.org/dbsuper/data/bed/hg19/K562.bed"
system(paste0("cd data; wget ", url))
se_hg19 <- rtracklayer::import("data/K562.bed")
# Dowloading liftover chain-file #
url <- "http://hgdownload.cse.ucsc.edu/goldenpath/hg19/liftOver/hg19ToHg38.over.chain.gz"
system(paste0("cd data; wget ",
url, "; gunzip hg19ToHg38.over.chain.gz"))
ch <- import.chain("data/hg19ToHg38.over.chain")
seqlevelsStyle(se_hg19) <- "UCSC" # this is required for liftover
se_hg38 <- liftOver(se_hg19, ch)
# Subset just to contiguously liftedOver regions
se_hg38 <- se_hg38[sapply(width(se_hg38), length) == 1]
# Exporting .bed file of se_hg38 (super-enhancers with
# contigious overlap in hg38)
rtracklayer::export(se_hg38, "results/k562_superenhancers_hg38.bed")
}
se_hg38 <- rtracklayer::import("results/k562_superenhancers_hg38.bed")
```
```{r overlap-ses-reservoirs, message=FALSE}
# Reading in all promoters
base_path <- "../01_consensus_peaks/results"
promoters <- rtracklayer::import(file.path(base_path,
"lncrna_mrna_promoters.gtf"))
se_promoters <- subsetByOverlaps(promoters, se_hg38)
# Annotate in data.frame
peak_occurrence_df$annotated_superenhancer <- FALSE
peak_occurrence_df[peak_occurrence_df$gene_id %in% se_promoters$gene_id,
"annotated_superenhancer"] <- TRUE
# et's write this out as a specific .csv here per our goal of
# adding new variables (SE in this case) to peak_occurence_df
# (observations == promoters)
write_csv(peak_occurrence_df, "results/08_peak_occurrence_df_super_enhancers")
# Let's count how many reservoirs overlap super-enhancers.
table(peak_occurrence_df$annotated_superenhancer)
# Plotting
se_res_summary <- peak_occurrence_df %>%
group_by(reservoir, annotated_superenhancer) %>%
summarize(count = n())
g <- ggplot(se_res_summary,
aes(x = reservoir, y = count, fill = annotated_superenhancer))
g + geom_bar(position = "fill", stat = "identity") +
scale_fill_manual(values = c("#a8404c", "#424242")) +
ggtitle("Reservoir status by Superenhancer annotation") +
ylab("Fraction")
ggsave("figures/reservoir_status_by_SE_annotation.pdf")
```
#### Hypergeometric test to see if superenhacers are enriched in reservoirs
```{r se-enriched}
t <- length(which(peak_occurrence_df$reservoir == T &
peak_occurrence_df$annotated_superenhancer == T))
num_promoters_w_gt7_dbps <- nrow(peak_occurrence_df %>%
filter(number_of_dbp > 7))
n <- num_promoters_w_gt7_dbps
number_of_reservoirs <- length(which(peak_occurrence_df$reservoir == T))
a <- number_of_reservoirs
number_of_superenhancers <- length(se_hg38)
b <- number_of_superenhancers
# Hypergeometric test p-value
sum(dhyper(t:b, a, n - a, b))
```
II. Bidirectional promoters.
Goal: To determine if reservoir promoters are due to regulation of a
neighboring or "bidirectional gene"
We will define "Bidirectional promoters if they overlap a gene on the
opposite strand within 1000 bp (PMID: 17447839 for definition)".
Where the gene has to be on the opposite strand within 1000bp.
```{r shared-promoters, message=FALSE}
# Load in Gencode annotations
gencode_gr <- rtracklayer::import(
"../../../genomes/human/gencode/v32/gencode.v32.annotation.gtf")
# Define that the potential "bidirectional promoter is on the
# opposite strand and less than 1000bp in distance"
# Note this is a newly defined function in 'overlapping_promoters'
# ../util/intersect_functions.R
shared_promoters <- overlapping_promoters(gencode_gr,
upstream = 1000,
downstream = 0)
# Merge in shared promoters to the peak_occurrence_df
peak_occurrence_df <- merge(peak_occurrence_df, shared_promoters, all.x = TRUE)
# Annotate non overlapping promoters
peak_occurrence_df[is.na(peak_occurrence_df$shared_promoter_type),
"shared_promoter_type"] <- "none"
# Plotting percentage of promoters that are "bidirectional",
# "mutliple_nearby_promoters", "nearby and on same strand"
reservoir_status_by_shared_promoters <- peak_occurrence_df %>%
group_by(reservoir, shared_promoter_type) %>%
summarize(count = n())
knitr::kable(reservoir_status_by_shared_promoters)
g <- ggplot(reservoir_status_by_shared_promoters,
aes(x = reservoir, y = count, fill = shared_promoter_type))
g + geom_bar(position = "fill", stat = "identity") +
scale_fill_manual(values = c("#a8404c","#024059","#71969F","#424242")) +
ggtitle("Reservoir status by shared promoter status") +
ylab("Fraction")
ggsave("figures/reservoir_status_by_shared_promoter_status.pdf")
# Plotting the number of dbps at each type of promoter
g <- ggplot(peak_occurrence_df %>% filter(reservoir == T),
aes(x = shared_promoter_type, y = number_of_dbp))
g + geom_boxplot() +
ggtitle("Number of binding events vs. shared promoter types",
subtitle = "In reservoirs")
ggsave("figures/num_binding_events_vs_shared_promoter_type_reservoirs.pdf")
prop.table(table(peak_occurrence_df$reservoir,
peak_occurrence_df$shared_promoter_type), margin = 1)*100
chisq.test(peak_occurrence_df$shared_promoter_type,
peak_occurrence_df$reservoir)
# TODO: ANOVA or some multi group test -- I think the stat above is
# saying that there is a difference in means but not for each promoter type
# Per usual we are going to now add the promoter_properties variable(s)
# bidirectional, closeby etc . Importantly this is what will be read
# into 09_reservoir_binding_properties
write_csv(peak_occurrence_df,
"results/08_peak_occurrence_df_promoter_types.csv")
```
#### Number of shared promoters with neighboring gene expressed
```{r}
shared_promoter_expression <- peak_occurrence_df %>%
filter(reservoir == 1, num_nearby_promoters > 0) %>%
dplyr::select(gene_id, nearby_promoter_gene_ids, shared_promoter_type) %>%
separate_rows(nearby_promoter_gene_ids, sep = ";") %>%
rename(reservoir_gene = gene_id,
gene_id = nearby_promoter_gene_ids) %>%
merge(peak_occurrence_df %>% dplyr::select(gene_id,tpm), all.x = T) %>%
group_by(reservoir_gene, shared_promoter_type) %>%
summarize(n_expressed = length(which(tpm > 0.1))) %>%
mutate(at_least_one_expressed = n_expressed > 0) %>%
group_by(shared_promoter_type, at_least_one_expressed) %>%
summarize(count = n())
knitr::kable(shared_promoter_expression)
sum(shared_promoter_expression$count[shared_promoter_expression$at_least_one_expressed == TRUE])
sum(shared_promoter_expression$count[shared_promoter_expression$at_least_one_expressed == FALSE])
g <- ggplot(shared_promoter_expression, aes(x = at_least_one_expressed,
y = count))
g + geom_bar(stat = "identity") + facet_grid(~shared_promoter_type) +
geom_text(aes(label = count, y = count + 5),
size = 3)
```
Goal: to test gene-expression levels in 5 genes that surround a
reservoir promoter We will using a sliding window approach of windows
containing 5 genes. We will compute the median TMP of each window.
We will then slide the window one gene ... until all the gene windows
of a given chromosome are measured.
We then ask what the differences are when a reservoir is or
is not these gene windows.
```{r expression-neighborhood}
# Get the TSS position for ordering genes
promoter_tss <- resize(promoters, width = 1, fix = "center") %>%
as.data.frame() %>%
mutate(chr = as.character(seqnames)) %>%
dplyr::select(gene_id, chr, start) %>%
dplyr::rename(tss = start)
peak_occurrence_df <- merge(peak_occurrence_df, promoter_tss)
# Window assignment function
window_size <- 5
assign_windows <- function(ngenes, window_size) {
starts <- c(rep(1,(window_size - 1)), 1:(ngenes - (window_size - 1)))
ends <- 1:ngenes
sapply(1:length(starts), function(x) {
paste(seq(starts[x], ends[x]), collapse = ";")
})
}
# TODO: perhaps this should all be compared to random windows..
# Or potentially for some analyses below, taking out the reservoir
# to just focus on the properties of the neighborhood.
# Calculate the mean tpm per expression window
# We will label the neighborhoods by whether the center gene
# is a reservoir or superenhancer
expression_window_df <- peak_occurrence_df %>%
arrange(chr, tss) %>%
group_by(chr) %>%
mutate(window_5 = assign_windows(n(), window_size)) %>%
dplyr::select(number_of_dbp, tss, tpm, reservoir,
annotated_superenhancer, shared_promoter_type,
chr, window_5) %>%
separate_rows(window_5, sep = ";") %>%
mutate(window_5 = as.numeric(window_5)) %>%
group_by(chr, window_5) %>%
arrange(tss) %>%
mutate(index = 1:n(),
center = index == 3) %>%
# Remove the center gene in the calculation of window tpm.
summarize(mean_tpm = mean(tpm[center == F]),
res_window = reservoir[center == T],
se_window = annotated_superenhancer[center == T],
n_reservoirs = sum(as.numeric(reservoir)),
n_se = sum(as.numeric(annotated_superenhancer)),
total_dbps_bound = sum(number_of_dbp[center == F]),
mean_dbps_bound = mean(number_of_dbp[center == F]),
n_occupied_promoters = length(which(number_of_dbp[center == F] > 0)),
dbps_bound = paste(number_of_dbp, collapse = ";"),
shared_promoter_types = paste(shared_promoter_type, collapse = ";"))
# Plotting the summary of expression window measurements to see
# if there is a difference between reservoir and non-res windows.
data_summary <- function(x) {
m <- mean(x)
ymin <- m - sd(x)
ymax <- m + sd(x)
return(c(y = m, ymin = ymin, ymax = ymax))
}
# Lets format this data long per window type so that we can plot it all together
expression_window_df$window_type <- "none"
expression_window_df[expression_window_df$se_window == TRUE, "window_type"] <-
"superenhancer"
# Instead of making the 35 reservoir and SE containing windows a separate
# Category, we will include them with reservoirs
expression_window_df[expression_window_df$res_window == 1, "window_type"] <-
"reservoir"
expression_window_df[expression_window_df$res_window == 1 &
expression_window_df$se_window == TRUE, "window_type"] <-
"res + se"
window_type_summary <- expression_window_df %>%
group_by(window_type) %>%
summarize(mean = mean(mean_tpm),
count = n()) %>%
arrange(mean)
knitr::kable(window_type_summary)
# Fold changes
resm <- window_type_summary$mean[window_type_summary$window_type == "reservoir"]
nonem <- window_type_summary$mean[window_type_summary$window_type == "none"]
sem <- window_type_summary$mean[window_type_summary$window_type == "superenhancer"]
log2(nonem/resm)
log2(sem/nonem)
# Arrange window types by mean
expression_window_df$window_type <- factor(expression_window_df$window_type,
levels = window_type_summary$window_type)
#### FIGURE: Figure 4D
g <- ggplot(expression_window_df %>% filter(window_type != "res + se"),
aes(x = window_type, y = log10(mean_tpm + 0.01)))
g + geom_violin() +
stat_summary(fun.data = data_summary) +
stat_compare_means(comparisons = list(c("reservoir", "none"),
c("none", "superenhancer"))) +
ylim(-2,5) +
ggtitle("Mean expression in five gene windows")
ggsave("figures/mean_expression_in_5_gene_windows_res_vs_nonres.pdf")
res_exp_windows <- expression_window_df %>%
mutate(contains_reservoir = n_reservoirs > 0) %>%
group_by(contains_reservoir) %>%
summarize(mean_tpm = mean(mean_tpm))
knitr::kable(res_exp_windows)
# We find that reservoirs tend to be in less highly
# expressed gene-windows. However the effect is small but significant.
g <- ggplot(expression_window_df,
aes(x = factor(n_reservoirs > 0), y = mean_dbps_bound))
g + geom_violin() +
stat_compare_means()
# When windowed does expression correlate bettwer with binding
# Cool finding that number of DBPs bound in five gene
# windows also correlates with expression in 5 gene window.
g <- ggplot(expression_window_df, aes(x = total_dbps_bound,
y = log10(mean_tpm)))
g + geom_point() + stat_cor() +
geom_smooth() +
ggtitle("Five gene windows",
subtitle = "mean RNA-seq TPM vs DPBs bound in window")
ggsave("figures/five_gene_window_expression_vs_dbps.pdf")
```