/
methods_tutorial_wgcna_cflo_ophio_22Oct21.Rmd
1395 lines (995 loc) · 50.9 KB
/
methods_tutorial_wgcna_cflo_ophio_22Oct21.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
---
title: "Host-parasite-clocks"
author: Biplabendu Das
date: '`r format(Sys.time(), "%d %B, %Y")`'
output:
word_document:
html_document:
toc: true
toc_depth: 3
toc_float: true
theme: united
keep_md: no
code_folding: hide
editor_options:
chunk_output_type: console
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
knitr::opts_chunk$set(warning = F)
knitr::opts_chunk$set(message = F)
## For more inspiration on customizing the html output, refer to the following:
# https://bookdown.org/yihui/rmarkdown/html-document.html#table-of-contents
```
```{r housekeeping, include=FALSE}
set.seed(420)
rm(list = ls())
#' Load the libraries
pacman::p_load(pheatmap, dendextend, tidyverse, viridis)
pacman::p_load(RSQLite, tidyverse, dbplyr, DT, conflicted, WGCNA, igraph)
pacman::p_load(patchwork)
#' set conflict preference
conflict_prefer("select", "dplyr")
conflict_prefer("filter", "dplyr")
conflict_prefer("layout", "plotly")
conflict_prefer("hclust", "flashClust")
conflict_prefer("union", "dplyr")
#' set path to your working directory
path_to_repo = "/Users/biplabendudas/Documents/GitHub/deBekker_and_Das_2021"
#' load functions
# customized theme for publication quality figures
source(paste0(path_to_repo,"/functions/theme_publication.R"))
# function to perform enrichment analysis
source(paste0(path_to_repo,"/functions/enrichment_analysis.R"))
# function to plot z-scores (Cflo genes only)
source(paste0(path_to_repo,"/functions/plot_zscores.R"))
```
## Overview/Goals
This document provides a step-by-step tutorial on how to:
- build a circadian gene co-expression network (GCN),
- how to annotate the network using published data,
- infer functions of your gene-clusters-of-interest.
## Step 1: Build circadian GCN
### 1.1 Load data
We will build a circadian GCN for the ant, Camponotus floridanus, using time-course RNASeq data collected in Das and de Bekker (2021; bioRxiv). The raw data is deposited in NCBI under BioProject PRJNA704762.
Description of the dataset: Three forager and three nurse ant brains were sampled and pooled for RNA extraction and Illumina sequencing, every 2h over a 24h period. This resulted in 24 RNASeq datasets for ant brains (12 forager and 12 nurse datasets over the course of a 24h LD 12:12 day).
One would need to perform the usual steps – trimming the reads, mapping the reads to the genome, and quantifying normalized gene counts – to obtain normalized gene expression data from the raw reads. At the end, for each gene in the genome, we should have the normalized expression for each time point, throughout the 24h day.
For the purpose of this tutorial, we assume that you have organized the processed data into a (gene-expr X time-point) format, in a chronological order, as shown below.
> figure goes here
X2F = forager brain sampled at ZT2 (2h after lights were turned on), X4F = forager brain sampled at ZT4, and so on.
Now we read the data into R.
```{r load_data}
# loading database which contains data for Das and de Bekker 2021 (bioRxiv)
db <- dbConnect(RSQLite::SQLite(), paste0(path_to_repo,"/data/databases/TC5_data.db"))
# extract the (gene-expr X time-point) data
dat <-
db %>%
tbl(., "annot_fpkm") %>%
select(gene_name, X2F:X24N) %>%
collect()
dim(dat)
```
### 1.2 Clean data
The above dataset contains all genes (n=13,813) in the ant genome. However, not all of these genes are expressed in the ant brain, and some are expressed at very low levels that are not biologically meaningful.
Therefore, we will only keep the genes that are “expressed” (≥1 FPKM) in the ant brain, for at least half of all the sampled time points.
```{r clean_data}
# Which genes are expressed throughout the day in both forager and nurses brains?
daily.exp.genes <-
tbl(db, "expressed_genes") %>% # note, the information is already available in the database
filter(exp_half_for == "yes" & exp_half_nur == "yes") %>%
collect() %>%
pull(gene_name)
# Subset the gene-expr X time-point file
dat <- dat %>% filter(gene_name %in% daily.exp.genes)
dim(dat)
```
This is our cleaned, input data file.
The daily expression for these 9139 genes will be used to create the circadian GCN of Camponotus floridanus.
### 1.3 Format data
To create the ant GCN, we will need to calculate the expression similarity (co-expression) of different gene pairs. Therefore, we would like to normalize the gene expression data by log2-transformation. Let’s do that and visualize the result.
```{r format_data}
datExpr = as.data.frame(t(log2(dat[-c(1)]+1)))
names(datExpr) = dat$gene_name
rownames(datExpr) = names(dat)[-c(1)]
# ----------------------------------------------------------- #
# USE THE FOLLOWING CODE TO CHECK IF YOU HAVE ANY BAD SAMPLES #
# ----------------------------------------------------------- #
# gsg = goodSamplesGenes(datExpr0, verbose = 3);
# gsg$allOK
#
# sampleTree = hclust(dist(datExpr0), method = "average");
# # Plot the sample tree: Open a graphic output window of size 12 by 9 inches
# # The user should change the dimensions if the window is too large or too small.
# sizeGrWindow(12,9)
# #pdf(file = "Plots/sampleClustering.pdf", width = 12, height = 9);
# par(cex = 1);
# par(mar = c(0,4,2,0))
# plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 1.5,
# cex.axis = 1.5, cex.main = 2)
# ----------------------------------------------------------- #
# save the number of genes and samples
# that will be used to create the circadian GCN
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)
# visualize the log-transformed data
x = reshape2::melt(as.matrix(t(datExpr)))
colnames(x) = c('gene_id', 'sample', 'value')
ggplot(x, aes(x=value, color=sample)) + geom_density() + theme_Publication()
```
> Figure 1. Normalized gene expression: The density plot above shows the distribution of log2-transformed gene expression values for each sample.
### 1.4 Calculate gene-gene similarity
Now, we can calculate the pairwise gene expression similarity for each of the 9139 genes and save it to a matrix.
I calculated expression similarity for all gene pairs in a dataset using Kendall’s tau, which measures the ordinal relationship between two variables and is used in rhythmicity detection algorithms [1].
```{r gene_sim_matrix}
## Calculate Kendall's tau-b correlation for each gene-gene pair
#
# sim_matrix <- cor((datExpr), method = "kendall") # this step takes time
# save(sim_matrix, file = paste0(path_to_repo, "/results/temp_files/sim_matrix_for_nur_TC5.RData")) # might be useful to save the sim_matrix and
load(paste0(path_to_repo, "/results/temp_files/sim_matrix_for_nur_TC5.RData")) # load it up
## Let's display a chunk of the matrix (code from Hughitt 2016; github)
heatmap_indices <- sample(nrow(sim_matrix), 500)
gplots::heatmap.2(t(sim_matrix[heatmap_indices, heatmap_indices]),
col=inferno(100),
labRow=NA, labCol=NA,
trace='none', dendrogram='row',
xlab='Gene', ylab='Gene',
main='Similarity matrix \n correlation method = "kendall" \n (500 random genes)',
density.info='none', revC=TRUE)
```
> Figure 2. Similarity matrix: The heatmap shows the pairwise Kendall’s tau correlation for a set of 500 genes randomly pulled from the 9139 genes expressed in the ant brain.
### 1.5 Create adjacency matrix
From the above similarity matrix, we then need to create the adjacency matrix needed for constructing a gene co-expression network.
To create the adjacency matrix, we need to first identify the soft-thresholding power by calling the network topology analysis function from the WGCNA package [2].
```{r soft_thresholding_power}
# Choose a set of soft-thresholding powers
powers = c(c(1:10), seq(from = 12, to=20, by=2))
# # Call the network topology analysis function
sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)
# Plot the results:
# sizeGrWindow(9, 5)
par(mfrow = c(1,2));
cex1 = 0.9;
# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,cex=cex1,col="red");
# this line corresponds to using an R^2 cut-off of h
abline(h=0.90,col="red")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
```
> Figure 3. Soft-thresholding power: The above plots show the effect of soft-thresholding power on the topology and the mean connectivity of the transformed similarity matrix (network).
NOTE: The scale-free topology fit index reaches ~0.9 at a soft-thresholding power of 9 and it does not improve drastically beyond that.
So, we will set our soft thresholding power to 9 for creating the adjacency matrix.
```{r adjacency_matrix}
## Specify the soft-thresholding-power
soft.power = 9
## Construct adjacency matrix
# adj_matrix <- adjacency.fromSimilarity(sim_matrix,
# power=soft.power,
# type='signed'
# )
# save(adj_matrix, file = paste0(path_to_repo, "/results/temp_files/adj_matrix_for_nur_TC5.RData")) # might be useful to save the sim_matrix and
load(paste0(path_to_repo, "/results/temp_files/adj_matrix_for_nur_TC5.RData")) # load it up
# Convert adj_matrix to matrix
gene_ids <- rownames(adj_matrix)
adj_matrix <- matrix(adj_matrix, nrow=nrow(adj_matrix))
rownames(adj_matrix) <- gene_ids
colnames(adj_matrix) <- gene_ids
## Same heatmap as before, but now with the power-transformed adjacency matrix
gplots::heatmap.2(t(adj_matrix[heatmap_indices, heatmap_indices]),
col=inferno(100),
labRow=NA, labCol=NA,
trace='none', dendrogram='row',
xlab='Gene', ylab='Gene',
main='Adjacency matrix',
density.info='none', revC=TRUE)
```
Figure 4. Adjacency matrix: The heatmap shows the result of the power-transformation on the similarity of the 500 random genes shown previously in Figure 2. As you can see, only the highest pair-wise correlations are retained whereas the weak correlations tend to zero.
***
## Step 2: Identify gene clusters
### 2.1 Create topological overalp matrix
```{r adj_to_TOM}
# Turn adjacency into topological overlap
# TOM = TOMsimilarity(adj_matrix);
# dissTOM = 1-TOM
# save(dissTOM, file = paste0(path_to_repo, "/results/temp_files/dissTOM_for_nur_TC5.RData")) # might be useful to save the sim_matrix and
load(paste0(path_to_repo, "/results/temp_files/dissTOM_for_nur_TC5.RData")) # load it up
# Call the hierarchical clustering function
geneTree = hclust(as.dist(dissTOM), method = "average")
# Plot the resulting clustering tree (dendrogram)
# sizeGrWindow(12,9)
plot(geneTree, xlab="", sub="", main = "Gene clustering on TOM-based dissimilarity",
labels = FALSE, hang = 0.04)
```
> Figure 5. Clustering tree: The figure shows the clustering tree (dendrogram) that results from hierarchical clustering of the TOM-based dissimilarity matrix and will be used for identifying modules of highly similar genes in the co-expression network.
### 2.2 Identify clusters
To cluster genes with similar daily expression pattern, we use the cutreeDynamic() function from the WGCNA package.
We need to provide a minimum size for the identified clusters or modules. This can set depending on the user’s question. In our case, we want to identify fairly large modules that are biologically meaningful (i.e., enriched in different GO/PFAM terms). As such, we set the minimum module size to 30. However, as you will see later, we will refine our cluster identification by merging very similar modules. As such, the choice of minimum module size should not affect cluster identification drastically.
```{r identify_clusters}
# We like large modules, so we set the minimum module size relatively high:
minModuleSize = 30;
# Module identification using dynamic tree cut:
dynamicMods= cutreeDynamic(dendro = geneTree,
distM = dissTOM,
method = "hybrid",
verbose = 4,
deepSplit = 3, # see WGCNA for more info on tuning parameters
pamRespectsDendro = FALSE,
minClusterSize = minModuleSize);
# view number of genes in each module
table(dynamicMods)
# Convert numeric lables into colors
dynamicColors = labels2colors(dynamicMods)
table(dynamicColors)
```
**In the initial cluster (module) identification step, WGCNA finds 30 modules. However, some of the identified modules might have very similar expression pattern and we would rather merge this closely related modules into one.**
We do that in the next step.
### 2.3 Merge similar modules
```{r refine_cluster_part1}
# Calculate eigengenes
MEList = moduleEigengenes(datExpr, colors = dynamicColors)
MEs = MEList$eigengenes
# Calculate dissimilarity of module eigengenes
MEDiss = 1-cor(MEs, method = "kendall");
# Cluster module eigengenes
METree = hclust(as.dist(MEDiss), method = "average");
# Plot the result
# sizeGrWindow(7, 8)
plot(METree, main = "Clustering of module eigengenes",
xlab = "", sub = "MEDiss = 1-cor(MEs, method = 'kendall')")
# We choose a height cut of 0.2, corresponding to correlation of 0.8, to merge
MEDissThres = 0.2 # user-specified parameter value; see WGCNA manual for more info
# Plot the cut line into the dendrogram
abline(h=MEDissThres, col = "red")
```
> Figure 6. Merging similar modules: The above figure shows the similarity of the different gene modules using hierarchical clustering of the module’s eigenvalue (eigengene expression). The horizontal red line shows the cutoff used to merge similar modules.
We choose a cut height of 0.2, corresponding to correlation of 0.8, to merge similar modules. Although arbitrary, the cutoff was motivated by the number of modules we would like to retain in the GCN; in our case, a 0.2 threshold resulted in a total of 12 modules in the GCN (see below).
In the following code, we merge the similar modules and visualize the module assignments before and after merging.
```{r refine_cluster_part2}
# Call an automatic merging function
merge = mergeCloseModules(datExpr, dynamicColors, cutHeight = MEDissThres, verbose = 3)
# The merged module colors
mergedColors = merge$colors;
# Eigengenes of the new merged modules:
mergedMEs = merge$newMEs;
# sizeGrWindow(12, 9)
plotDendroAndColors(geneTree,
cbind(dynamicColors, mergedColors),
c("Dynamic Tree Cut", "Merged dynamic"),
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)
# Rename to moduleColors
moduleColors = mergedColors
# Construct numerical labels corresponding to the colors
colorOrder = c("grey", standardColors(50));
moduleLabels = match(moduleColors, colorOrder)-1
```
> Figure 7. Modules of highly co-expressed genes: The above plot shows the results of module identification before (Dyanamic Tree Cut) and after (Merged dyanamic) similar modules were merged.
**We identified 12 modules in the ant GCN, the size of each of these modules are shown below.**
> figure goes here
Note, WGCNA names the different modules as colors (see above), and the colors have no meaning. Therefore, it might be useful to rename the modules. In the next step we rename all the modules according to the following convention:
> figure goes here
### 2.4 Calculate module-module similarity
Thus far, we have created the ant GCN (adjacency matrix) and identified 12 modules of highly co-expressed genes in the network.
Next, we investigate how the different modules are connected to each other in the GCN. To do so, we calculate the module-module similarity (Kendall’s tau-b correlation for pairwise module-eigengene expression) and then use the similarity matrix to create the module adjacency matrix.
The following code calculates the module adjacency matrix and visualizes it as a heatmap.
```{r module_sim_matrix}
# Calculate similarity of the eigen-genes
sim_matrix_ME <- cor(mergedMEs, method = "kendall")
# calculate adj_matrix
adj_matrix_ME <- adjacency.fromSimilarity(sim_matrix_ME,
power=1, # DO NOT power transform
type='signed'
)
# coerce into a matrix
## GET THE NAMES OF THE MODULES
# module_ids <- rownames(adj_matrix_ME)
## CHANGE THE NAMES OF THE MODULES
module_ids <- data.frame(old_labels = rownames(adj_matrix_ME),
new_labels = paste0("module-", 1:nrow(adj_matrix_ME)))
adj_matrix_ME <- matrix(adj_matrix_ME, nrow=nrow(adj_matrix_ME))
rownames(adj_matrix_ME) <- module_ids$new_labels
colnames(adj_matrix_ME) <- module_ids$new_labels
# png(paste0(path_to_repo, "/results/figures/ME_adjacency.png"),
# width = 18, height = 18, units = "cm", res = 400)
gplots::heatmap.2(t(adj_matrix_ME),
col=inferno(100),
# labRow=NA, labCol=NA,
trace='none', dendrogram='row',
xlab='', ylab='',
# main='Similarity matrix - MEs \n correlation method = "kendall")',
main='Adjacency matrix - MEs',
density.info='none', revC=TRUE)
# trash <- dev.off()
```
> Figure 8. Module-module relationships: The heatmap above shows the pairwise Kendall’s tau correlation (similarity) of the twelve modules identified in the ant GCN. Darker shades indicate low correlations and brighter shades indicate high correlations, as shown in the Color Key.
### 2.5 Visualize the network
To better visualize the global network – how the modules are connected to each other – we can simplify the network. That is, we remove most of the weak edges of the network and retain only the strong module-module correlations.
For example, to remove weak edges, we can set all correlations less than 0.6 to be zero. This will help us obtain a fairly clean network for visualization. To simplify further, we can assign the same edge weight for all correlations between 0.6 and 0.8 (e.g., 0.5), and a different edge weight for correlations ≥ 0.8 (e.g., 1).
The following code uses the igraph package in R to simplify and visualize the module-module relationships in the network.
```{r visualize_network}
pacman::p_load(igraph)
# get rid of low correlations (0.6 & 0.8 are arbitrary)
adj_matrix_ME[adj_matrix_ME < 0.6] <- 0
adj_matrix_ME[adj_matrix_ME < 0.8 & adj_matrix_ME>0] <- 0.5
adj_matrix_ME[adj_matrix_ME >= 0.8] <- 1
# build_network
network <- graph.adjacency(adj_matrix_ME,
mode = "upper",
weighted = T,
diag = F)
# simplify network
network <- igraph::simplify(network) # removes self-loops
# E(network)$width <- E(network)$weight + min(E(network)$weight) + 1 # offset=1
colors <- mergedMEs %>% names() %>% str_split("ME", 2) %>% sapply("[", 2)
V(network)$color <- colors
genes_ME <- factor(moduleColors, levels=colors) %>% summary()
V(network)$size <- log2(genes_ME)*2
V(network)$label.color <- "black"
V(network)$frame.color <- "white"
E(network)$width <- E(network)$weight^2*4
E(network)$edge.color <- "gray80"
# par(mar=c(0,0,0,0))
# remove unconnected nodes
# network <- delete.vertices(network, degree(network)==0)
# plot(network,
# layout=layout.fruchterman.reingold
# # layout = layout.kamada.kawai
# # layout = layout.kamada.kawai
# )
## Circular layout
plot(network,
layout=layout.kamada.kawai,
# layout=layout.fruchterman.reingold
# layout=layout.graphopt
# layout=layout_in_circle,
# vertex.label=NA
# vertex.size=hub.score(network)$vector*30
vertex.shape="none"
)
```
> Figure 9. Visualizing the ant GCN: A simplified view of the connectivity patterns between the different gene modules of the ant GCN are shown. In our case, thick edges between two modules indicate correlations ≥ 0.8, thinner edges indicate correlations between (0.6, 0.8), and no edges indicate correlations < 0.6.
## Step 3: Annotate the network
Now that we have created the ant GCN, we can functionally annotate the network by identifying which modules contain our genes of interest. To do so, we will check for significant overlap between a module in the network and our genes of interest using Fisher’s exact test.
### 3.1 Define your genes of interest
For example, we want to identify the GCN modules that contain our 24h oscillating genes (for.rhy = 24h-rhythmic genes in forager brains, nur.rhy = 24h-rhythmic genes in nurses).
```{r define_genes_of_interest}
# DEFINE GENES OF INTEREST (PART 1)
# load the data table that contains the results from the rhythmicity analysis (eJTK-Cycle output)
rhy.trait.24 <- tbl(db, "ejtk_all") %>% select(gene_name:rhy) %>% collect()
# pull the gene names that are rhythmic in forager brains
for.rhy <- rhy.trait.24 %>% filter(caste=="for" & rhy=="yes") %>% pull(gene_name)
# pull the gene names that are rhythmic in forager brains
nur.rhy <- rhy.trait.24 %>% filter(caste=="nur" & rhy=="yes") %>% pull(gene_name)
# DEFINE GENES OF INTEREST (PART 2)
# Make a list that contains all gene names for a given cluster
module_color = colors
module = names(mergedMEs)
module_colors <-
data.frame(module_label=module) %>%
mutate(module_color = str_replace(module_label, "ME", ""))
module_genes <- list()
module_color <- module_colors$module_color
# Get the genes from each of the modules
for (i in 1:length(module_color)) {
module_genes[[i]] <- names(datExpr)[which(moduleColors==module_color[[i]])]
names(module_genes)[[i]] <- module_color[[i]]
}
names(module_genes) <- module_ids$new_labels
# For example, we can obtain names of all the genes in module-1 using the following code:
# module_genes["module-1"]
```
Now that we have defined our genes of interest (rhythmic genes), we can use a Fisher’s exact test to check for significant overlap between our genes of interest (e.g., for.rhy) and modules in the GCN (e.g., module-7) in a pairwise manner.
For example, to check if the 24h-rhythmic genes in forager brains (for.rhy) are overrepresented in module-7, we first create the contingency table. To do so, we need to define the background number of genes, i.e., the size of the set that contains all possible genes from which module-7 and for.rhy genesets are drawn. In our case, that would be the 9139 genes that was used to build the GCN. Using this information, we can create the contingency table as shown below.
```{r fishers_exact_part1}
set.1 <- for.rhy
set.2 <- module_genes[["module-7"]]
# To create the contingency table for module-7 (set-1) and for.rhy (set-2), we will need the following information:
# genes that are in both sets
overlapping.genes <- intersect(set.1, set.2) %>% length()
# genes in set-1 but not in set-2
set.1.not.set.2 <- setdiff(set.1, set.2) %>% length()
# genes in set-2 but not in set-1
set.2.not.set.1 <- setdiff(set.2, set.1) %>% length()
# background genes not in set-1 or set-2
not.set.1.set.2 <- nGenes - (union(set.1, set.2) %>% unique() %>% length())
# for.rhy and module-7
test.table <-
data.frame(
in.module = c(overlapping.genes, set.2.not.set.1),
not.module = c(set.1.not.set.2, not.set.1.set.2)
)
rownames(test.table) <- c("in.for.rhy", "not.for.rhy")
# take a look at the contingency table
test.table
```
The above table shows that 477 genes are found in both, for.rhy and module-7. However, there are 3092 genes that are found in for.rhy but not in module-7, whereas 187 genes occur only in module-7 but not in for.rhy. Finally, we have 5383 genes that are in the background geneset but neither in rhy.for nor in module-7.
Now, we can run the Fisher’s exact test using the fisher.test() function in R. The results of which are shown below:
```{r fishers_exact_part2}
# run the fisher's exact test
fisher.test(test.table)
```
The output shows that the odds-ratio is approximately 4, which is significantly higher than 1 (p-value < 2e-16). In other words, the genes that show 24h-rhythms in forager brains are significantly overrepresented in module-7 and vice-versa, or that the two sets show significant overlap.
### 3.2 Where are my genes of interest located?
Since we need to perform multiple Fisher’s exact test for our comparisons, we will make use of the [GeneOverlap]( https://www.bioconductor.org/packages/devel/bioc/vignettes/GeneOverlap/inst/doc/GeneOverlap.pdf) package in R [3].
```{r find_genes_in_network_1}
# Load the GeneOverlap library
pacman::p_load(GeneOverlap)
#
# You can find more about the package here:
# https://www.bioconductor.org/packages/devel/bioc/vignettes/GeneOverlap/inst/doc/GeneOverlap.pdf
## DEFINE YOUR LIST OF GENES FOR PAIRWISE TEST OF OVERLAP ##
# LIST ONE - WGCNA modules
list1 <- module_genes
sapply(list1, length)
## LIST TWO - rhythmic genes
list2 <- list(for.rhy, nur.rhy)
names(list2) <- c("for24", "nur24")
sapply(list2, length)
## CHECK FOR OVERLAP
## make a GOM object
gom.1v2 <- newGOM(list1, list2,
genome.size = nGenes)
png(paste0(path_to_repo, "/results/figures/gom_1v2.png"),
width = 18, height = 18, units = "cm", res = 300)
drawHeatmap(gom.1v2,
adj.p=T,
cutoff=0.05,
what="odds.ratio",
# what="Jaccard",
log.scale = T,
note.col = "grey80")
trash <- dev.off()
```
```{r plot_gom_1v2, echo = FALSE, fig.align='center', fig.cap='Gene-clusters with 24h-rhythmic genes'}
knitr::include_graphics(paste0(path_to_repo, "/results/figures/gom_1v2.png"))
```
> Figure 10. Gene-clusters with rhythmic genes: The above plot shows the result of the Fisher’s exact test performed for each module-geneset pair. The color of the boxes represents the odds-ratio (darker the green, higher is the odds-ratio) and the p-values are shown. The p-values shown were corrected for multiple-hypothesis testing using Benjamini-Hochberg method. Non-significant overlaps between module and geneset are indicated with a N.S. inside the box.
From the plot, we can see that the 24h-rhythmic genes are located in five of the twelve modules of the ant GCN (module-1, module-4, module-7, module-11, and module-12).
We can further annotate the rhythmic modules by identifying which of these five modules peak during the day and which peak at night in ant brains. For the purpose of this study, we focus on the daily expression patterns only in the forager brain because the Ophiocordyceps are most likely to infect forager ants that are perform bulk of the outside-nest tasks. To identify day- and night-peaking modules, we can visualize the daily expression of all genes in these rhythmic modules as well as their module eigengene expression (shown below).
```{r rhythmic_modules_zplots}
# NOTE: The black line denotes median expression, not ME eigengene expression.
# par(mfrow = c(3,2))
# module-4 plot
# module-11 plot
# module-12 plot
# module-1 plot
# module-7 plot
```
> Figure 11. Daily expression patterns of genes in rhythmic modules: The daily expression pattern of all genes in a given module as well as the module’s eigengene expression are shown. For a module, each red line represents the expression of one gene and the black line represents the eigengene expression. The x-axis shows the time-of-day or Zeitgeber Time whereas the y-axis shows normalized gene expression (z-scores calculated from log2-transformed expression data). The 12h:12h light-dark cycles during which the samples were collected are also shown; white background indicates the light phase (lights on at ZT24/ZT0) and grey background indicates the dark phase (lights turned off at ZT12).
**We found that module-4, module-11, and module-12 were day-peaking modules, whereas module-1 and module-7 were night-peaking modules.**
Using the same approach as above, we can identify the ant modules that putatively underlie behavioral plasticity, as well as the modules that are affected during Ophiocordyceps-induced behavioral manipulation.
```{r find_genes_in_network_2}
## Genes underlying behavioral plasticity, i.e., DEGS (foragers v. nurses)
# genes higher expressed in forager brains (v. nurse brains)
for.up <- tbl(db, "TC5_DEGs_all") %>% filter(upregulation=="for") %>% collect() %>% pull(gene_name)
# genes lower expressed in for. brains (v. nurse brains)
for.down <- tbl(db, "TC5_DEGs_all") %>% filter(upregulation=="nur") %>% collect() %>% pull(gene_name)
## Genes underlying parasite-induced behavioral manipulation, i.e., DEGs (ophio-ant v. control-ant)
# load the data table
ophio.dat <-
tbl(db, "ophio_biting_control") %>%
collect() %>%
select(gene, value_1, value_2, q_value:logFC) %>%
filter(abs(logFC) >= 1 & significant=="yes" & q_value < 0.05) %>%
mutate(ophio = ifelse(logFC > 0, "down", "up"))
# genes higher expressed in ant heads during Ophio-manipulated biting (v. controls)
ophio.up <- ophio.dat %>% filter(ophio=="up") %>% pull(gene)
# genes lower expressed in ant heads during manipulated biting (v. controls)
ophio.down <- ophio.dat %>% filter(ophio=="down") %>% pull(gene)
## LIST THREE - genes underlying behavioral plasticity and parasitic behavioral manipulation
list3 <- list(for.up, for.down, # same as list three
ophio.up, ophio.down)
names(list3) <- c("for-UP", "for-DOWN",
"ophio-UP", "ophio-DOWN")
## CHECK FOR OVERLAP
## make a GOM object
gom.1v3 <- newGOM(list1, list3,
genome.size = nGenes)
## visualize the overlaps
png(paste0(path_to_repo, "/results/figures/gom_1v3.png"),
width = 14, height = 18, units = "cm", res = 400)
drawHeatmap(gom.1v3,
adj.p=T,
cutoff=0.05,
what="odds.ratio",
# what="Jaccard",
log.scale = T,
note.col = "grey80")
trash <- dev.off()
# # OR, do all comparisons in one:
# list4 <- list(for.rhy, nur.rhy,
# for.up, for.down,
# ophio.up, ophio.down)
# names(list4) <- c("for-24h", "nur-24h",
# "for-UP", "for-DOWN",
# "ophio-UP", "ophio-DOWN")
# ## CHECK FOR OVERLAP
# ## make a GOM object
# gom.1v4 <- newGOM(list1, list4, genome.size = nGenes)
# ## visualize the overlaps
# png(paste0(path_to_repo, "/results/figures/gom_1v4.png"),
# width = 18, height = 18, units = "cm", res = 400)
# drawHeatmap(gom.1v4,
# adj.p=T,
# cutoff=0.05,
# what="odds.ratio",
# grid.col = "Greens",
# # what="Jaccard",
# log.scale = T,
# note.col = "black")
# trash <- dev.off()
```
```{r plot_gom_1v3, echo = FALSE, fig.align='center', fig.cap='Gene-clusters underlying behavioral plasiticity and parasitic behavioral manipulation'}
knitr::include_graphics(paste0(path_to_repo, "/results/figures/gom_1v3.png"))
```
> Figure 12. Gene-clusters behavioral plasiticity and parasitic behavioral manipulation: The heatmap identifies the different gene modules that show an overrepresentation of the genes previously found to underlie behavioral plasticity (genes differentially expressed between foragers and nurses) and parasitic behavioral manipulation (genes differentially expressed in foragers during manipulated biting behavior).
The figure above shows that the gene-modules that putatively underlie behavioral plasticity (forager-nurse differentiation) and the ones that are affected during Ophiocordycpes-induced behavioral manipulation, are the same.
**In other words, to induce the characteristic manipulated biting behavior, the manipulating fungal parasite seems to be targeting the same genes and processes that otherwise allow ants to display behavioral plasticity (shown in the summary figure below).**
```{r plot_annotated_network, echo = FALSE, fig.align='center', fig.cap='Annotated circadian GCN', out.width="85%"}
knitr::include_graphics("./../results/illustrations/01_figures_das_and_deBekker_2021_draft5.png")
```
```{r define_all_rhythmic_genes, include=F}
rhy.trait.8 <- tbl(db, "ejtk_8h_all") %>% select(gene_name:rhy) %>% collect()
for.rhy.8 <- rhy.trait.8 %>% filter(caste=="for" & rhy=="yes") %>% pull(gene_name)
nur.rhy.8 <- rhy.trait.8 %>% filter(caste=="nur" & rhy=="yes") %>% pull(gene_name)
rhy.trait.12 <- tbl(db, "ejtk_12h_all") %>% select(gene_name:rhy) %>% collect()
for.rhy.12 <- rhy.trait.12 %>% filter(caste=="for" & rhy=="yes") %>% pull(gene_name)
nur.rhy.12 <- rhy.trait.12 %>% filter(caste=="nur" & rhy=="yes") %>% pull(gene_name)
```
## Step 4: Explore your clusters-of-interest
### 4.1 Cluster: module-9
#### 4.1.1 What are these overlapping genes?
- Let's focus on the cluster module-9/module-9 that contains most Cflo genes that:
- are sig. higher expressed in foragers (v. nurses) and
- are sig. up-regulated in forager heads during behavioral manipulation (v. uninfected foragers)
```{r explore_module-9_part1}
# specify our cluster of interest (coi)
coi.1 <- "module-9"
# How many genes are there in the cluster?
module_genes[[coi.1]] %>% length() # n = 209 genes
# specify our genes of interest (goi)
goi.1 <- ophio.up
# how many genes are there in the gene-set?
goi.1 %>% length() # n = 232 genes
# Identify overlapping genes
overlapping.genes.1 <- intersect(module_genes[[coi.1]], goi.1) # n = 32 genes
# what are these genes?
overlapping.genes.1.annot <-
db %>%
tbl(., "annot_fpkm") %>%
filter(gene_name %in% overlapping.genes.1) %>%
select(gene_name,
blast_annot=old_annotation,
GOs, pfams, signalP, TMHMM) %>%
collect() %>%
# add a column that indicates if the gene is rhythmic or not
## in forager brains
mutate(rhy_foragers = ifelse(gene_name %in% for.rhy, "24h",
ifelse(gene_name %in% for.rhy.12, "12h",
ifelse(gene_name %in% for.rhy.8, "8h", "not_rhy")))) %>%
## in nurse brains
mutate(rhy_nurses = ifelse(gene_name %in% nur.rhy, "24h",
ifelse(gene_name %in% nur.rhy.12, "12h",
ifelse(gene_name %in% nur.rhy.8, "8h", "not_rhy"))))
# Visualize the results
DT::datatable(overlapping.genes.1.annot, options = list(
pageLength = 5,
lengthMenu = c(5, 10, 15, 20)
))
```
Which genes in module-9 show overlap with both genes of interests?
```{r explore_module-9_part2}
# define the second gene-set of interst
goi.2 <- for.up
# how many genes are up-regulated during manipulation?
goi.2 %>% length() # n = 34 genes
# Identify overlapping.genes
overlapping.genes.2 <- intersect(overlapping.genes.1, goi.2) # n = 10 genes
# what are these genes?
overlapping.genes.2.annot <-
db %>%
tbl(., "annot_fpkm") %>%
filter(gene_name %in% overlapping.genes.2) %>%
select(gene_name,
blast_annot=old_annotation,
GOs, pfams, signalP, TMHMM) %>%
collect() %>%
# add a column that indicates if the gene is rhythmic or not
## in forager brains
mutate(rhy_foragers = ifelse(gene_name %in% for.rhy, "24h",
ifelse(gene_name %in% for.rhy.12, "12h",
ifelse(gene_name %in% for.rhy.8, "8h", "not_rhy")))) %>%
## in nurse brains
mutate(rhy_nurses = ifelse(gene_name %in% nur.rhy, "24h",
ifelse(gene_name %in% nur.rhy.12, "12h",
ifelse(gene_name %in% nur.rhy.8, "8h", "not_rhy"))))
# Visualize the results
DT::datatable(overlapping.genes.2.annot, options = list(
pageLength = 5,
lengthMenu = c(5, 10, 15, 20)
))
```
#### 4.1.2 What's special about my cluster?
Now that we know that the module **module-9** contains most of our genes of interest, we can infer its function (enriched GOs and PFAMs) and also identify the genes that are important for the cluster to be functional (i.e., hub genes).
#### 4.1.3 Enriched GO terms
First up, let's see which processes are overrepresented in the cluster.
```{r explore_module-9_v3}
# To run a functional enrichment analyis, we first need to define the set of background genes; for our purpose, we will use the 9139 genes that we used to build our circadian GCN
bg.genes <- dat %>% pull(gene_name)
# Run the enrichment function (note, GO HERE TO READ MORE ABOUT THIS FUNCTION)
# png(paste0(path_to_repo, "/results/figures/module_9_enrichments.png"),
# width = 16, height = 10, units = "cm", res = 400)
go_enrichment(geneset = module_genes[[coi.1]],
function.dir = path_to_repo,
org = "cflo",
bg = bg.genes) %>% #view()
# visualize the results
go_enrichment_plot(function.dir = path_to_repo, clean = "no")
# trash <- dev.off()
```
#### 4.1.4 Daily rhythms?
Second, let's plot the daily expression patterns of all genes in the cluster, for nurses and foragers.
```{r explore_module-9_v4}
# Obtain the stacked z-plots for nurses (blue) and foragers (red)
zplots.module <-
module_genes[[coi.1]] %>%
stacked.zplot()
# Plot them side by side
zplots.module[[1]] / zplots.module[[2]]
```
> LEGEND: **RED** = Forager brains, **BLUE** = Nurse brains
#### 4.1.5 HUB genes?
Need to:
- identify the hub genes in the module-9 cluster
- other genes of interest based on their location in the network?
### 4.2 Cluster: module-6
#### 4.2.1 Overlapping genes
```{r explore_module-6_v1, echo=F}
# specify our cluster of interest (coi)
coi.2 <- "module-6"
# specify our genes of interest (goi)
goi.3 <- ophio.down
# Identify overlapping genes
overlapping.genes.3 <- intersect(module_genes[[coi.2]], goi.3)
# define the second gene-set of interst
goi.4 <- for.down
# Identify overlapping.genes
overlapping.genes.4 <- intersect(overlapping.genes.3, goi.4)
# what are these genes?
overlapping.genes.4.annot <-
db %>%
tbl(., "annot_fpkm") %>%
filter(gene_name %in% overlapping.genes.4) %>%
select(gene_name,
blast_annot=old_annotation,
GOs, pfams, signalP, TMHMM) %>%
collect()
# Visualize the results
DT::datatable(overlapping.genes.4.annot, options = list(
pageLength = 5,
lengthMenu = c(5, 10, 15, 20)
))
```
#### 4.2.2 Enriched GO terms
```{r explore_module-6_v2, echo=F}
# png(paste0(path_to_repo, "/results/figures/module_6_enrichments.png"),
# width = 16, height = 12, units = "cm", res = 400)
# which genes?
module_genes[[coi.2]] %>%
# run enrichment
go_enrichment(.,
function.dir = path_to_repo,
org = "cflo",
bg = bg.genes) %>% #view()
# visualize the results
go_enrichment_plot(function.dir = path_to_repo, clean = "no")
# trash <- dev.off()
```
#### 4.2.3 Daily rhythms?
```{r explore_module-6_v3, echo=F}
# Obtain the stacked z-plots for
# nurses (blue) and foragers (red)
zplots.module <-
module_genes[[coi.2]] %>%
stacked.zplot()
# Plot them side by side
zplots.module[[1]] / zplots.module[[2]]
```
LEGEND: **RED** = Forager brains, **BLUE** = Nurse brains
#### 4.3.4 HUB genes?
coming soon...
### 4.3 Cluster: module-12
> Connected to the forager-cluster (module-9)
#### 4.3.1 Explore cluster
```{r explore_module-12_v1, echo=F}
# specify our cluster of interest (coi)
coi.3 <- "module-12"
writeLines(paste0("How many genes are there in the ", coi.3, " cluster?"))
module_genes[[coi.3]] %>% length() # n = 95 genes
```
#### 4.3.2 Enriched GO terms