/
netclu_beckett.R
235 lines (215 loc) · 8.68 KB
/
netclu_beckett.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
#' Community structure detection in weighted bipartite network via modularity
#' optimization
#'
#' This function takes a bipartite weighted graph and computes modules by
#' applying Newman’s modularity measure in a bipartite weighted version to it.
#'
#' @param net a `data.frame` representing a bipartite network with the two
#' first columns as undirected links between pair of nodes and and the next
#' column(s) are the weight of the links.
#'
#' @param weight a `boolean` indicating if the weights should be considered
#' if there are more than two columns (see Note).
#'
#' @param cut_weight a minimal weight value. If `weight` is TRUE, the links
#' between sites with a weight strictly lower than this value will not be
#' considered (O by default).
#'
#' @param index name or number of the column to use as weight. By default,
#' the third column name of `net` is used.
#'
#' @param seed for the random number generator (NULL for random by default).
#'
#' @param forceLPA a `boolean` indicating if the even faster pure
#' LPA-algorithm of Beckett should be used? DIRT-LPA, the default, is less
#' likely to get trapped in a local minimum, but is slightly slower. Defaults
#' to FALSE.
#'
#' @param site_col name or number for the column of site nodes (i.e. primary
#' nodes).
#'
#' @param species_col name or number for the column of species nodes (i.e.
#' feature nodes).
#'
#' @param return_node_type a `character` indicating what types of nodes
#' (`site`, `species` or `both`) should be returned in the output
#' (`return_node_type = "both"` by default).
#'
#' @param algorithm_in_output a `boolean` indicating if the original output
#' of [computeModules][bipartite::computeModules] should be returned in the
#' output (`TRUE` by default, see Value).
#'
#' @details
#' This function is based on the modularity optimization algorithm provided by
#' Stephen Beckett \insertCite{Beckett2016}{bioregion} as implemented in the
#' [bipartite](https://cran.r-project.org/package=bipartite)
#' package ([computeModules][bipartite::computeModules]).
#'
#' @note
#' Beckett has been designed to deal with weighted bipartite networks. Note
#' that if `weight = FALSE`, a weight of 1 will be assigned to each pair of
#' nodes. Do not forget to indicate which of the first two columns is
#' dedicated to the site nodes (i.e. primary nodes) and species nodes (i.e.
#' feature nodes) using the arguments `site_col` and `species_col`. The type of
#' nodes returned in the output can be chosen with the argument
#' `return_node_type` equal to `both` to keep both types of nodes,`sites`
#' to preserve only the sites nodes and `species` to preserve only the
#' species nodes.
#'
#' @return
#' A `list` of class `bioregion.clusters` with five slots:
#' \enumerate{
#' \item{**name**: `character` containing the name of the algorithm}
#' \item{**args**: `list` of input arguments as provided by the user}
#' \item{**inputs**: `list` of characteristics of the clustering process}
#' \item{**algorithm**: `list` of all objects associated with the
#' clustering procedure, such as original cluster objects (only if
#' `algorithm_in_output = TRUE`)}
#' \item{**clusters**: `data.frame` containing the clustering results}}
#'
#' In the `algorithm` slot, if `algorithm_in_output = TRUE`, users can find the
#' output of [computeModules][bipartite::computeModules].
#'
#' @author
#' Maxime Lenormand (\email{maxime.lenormand@inrae.fr}),
#' Pierre Denelle (\email{pierre.denelle@gmail.com}) and
#' Boris Leroy (\email{leroy.boris@gmail.com})
#'
#' @seealso [netclu_infomap], [netclu_oslom]
#'
#' @examples
#' net <- data.frame(
#' Site = c(rep("A", 2), rep("B", 3), rep("C", 2)),
#' Species = c("a", "b", "a", "c", "d", "b", "d"),
#' Weight = c(10, 100, 1, 20, 50, 10, 20))
#'
#' com <- netclu_beckett(net)
#'
#' @references
#' \insertRef{Beckett2016}{bioregion}
#'
#' @importFrom bipartite computeModules
#'
#' @export
netclu_beckett <- function(net,
weight = TRUE,
cut_weight = 0,
index = names(net)[3],
seed = NULL,
forceLPA = FALSE,
site_col = 1,
species_col = 2,
return_node_type = "both",
algorithm_in_output = TRUE){
# Controls inputs
if(!is.null(seed)){
controls(args = seed, data = NULL, type = "strict_positive_integer")
}
controls(args = forceLPA, data = NULL, type = "boolean")
controls(args = algorithm_in_output, data = NULL, type = "boolean")
controls(args = NULL, data = net, type = "input_net")
controls(args = NULL, data = net, type = "input_net_bip")
if(site_col == species_col){
stop("site_col and species_col should not be the same.", call. = FALSE)
}
controls(args = site_col, data = net, type = "input_net_bip_col")
controls(args = species_col, data = net, type = "input_net_bip_col")
controls(args = return_node_type, data = NULL, type = "character")
if(!(return_node_type %in% c("both", "sites", "species"))) {
stop("Please choose return_node_type among the followings values:
both, sites or species", call. = FALSE)}
controls(args = weight, data = net, type = "input_net_weight")
if (weight) {
controls(args = cut_weight, data = net, type = "positive_numeric")
controls(args = index, data = net, type = "input_net_index")
net[, 3] <- net[, index]
net <- net[, 1:3]
controls(args = NULL, data = net, type = "input_net_index_positive_value")
}
# Prepare input
idprim <- as.character(net[, site_col])
idprim <- idprim[!duplicated(idprim)]
nbsites <- length(idprim)
idfeat <- as.character(net[, species_col])
idfeat <- idfeat[!duplicated(idfeat)]
idnode <- c(idprim, idfeat)
idnode <- data.frame(ID = 1:length(idnode), ID_NODE = idnode)
netemp <- data.frame(
node1 = idnode[match(net[, site_col], idnode[, 2]), 1],
node2 = idnode[match(net[, species_col], idnode[, 2]), 1])
if(weight) {
netemp <- cbind(netemp, net[, 3])
netemp <- netemp[netemp[, 3] > cut_weight,]
colnames(netemp)[3] <- "weight"
} else {
netemp$weight <- 1
}
# Class preparation
outputs <- list(name = "netclu_beckett")
outputs$args <- list(weight = weight,
cut_weight = cut_weight,
index = index,
seed = seed,
forceLPA = forceLPA,
site_col = site_col,
species_col = species_col,
return_node_type = return_node_type,
algorithm_in_output = algorithm_in_output)
outputs$inputs <- list(
bipartite = TRUE,
weight = weight,
pairwise = FALSE,
pairwise_metric = NA,
dissimilarity = FALSE,
nb_sites = nbsites,
hierarchical = FALSE)
outputs$algorithm <- list()
# Transform netemp into a contingency table
comat <- net_to_mat(netemp, weight = weight)
if(dim(comat)[1]<2 | dim(comat)[2]<2){
stop("At least two species and two sites are needed to run this algorithm.
Please check your data or choose an appropriate cut_weight value.",
.call = FALSE)
}
# Run algo (with seed)
if(is.null(seed)){
outalg <- bipartite::computeModules(comat, forceLPA = forceLPA)
}else{
set.seed(seed)
outalg <- bipartite::computeModules(comat, forceLPA = forceLPA)
rm(.Random.seed, envir=globalenv())
}
comtemp <- outalg@modules[-1, -c(1, 2)]
comtemp <- cbind(c(as.numeric(rownames(comat)),
as.numeric(colnames(comat))),
apply(comtemp, 2, function(x) which(x > 0)))
com <- data.frame(ID = idnode[, 2], Com = NA)
com[match(comtemp[, 1], idnode[, 1]), 2] <- comtemp[, 2]
# Rename and reorder columns
com <- knbclu(com)
# Add attributes and return_node_type
attr(com, "node_type") <- rep("site", dim(com)[1])
attributes(com)$node_type[!is.na(match(com[, 1], idfeat))] <- "species"
if (return_node_type == "sites") {
com <- com[attributes(com)$node_type == "site", ]
}
if (return_node_type == "species") {
com <- com[attributes(com)$node_type == "species", ]
}
# Set algorithm in outputs
if (!algorithm_in_output) {
outalg <- NA
}
outputs$algorithm <- outalg
# Set clusters and cluster_info in output
outputs$clusters <- com
outputs$cluster_info <- data.frame(
partition_name = names(outputs$clusters)[2:length(outputs$clusters),
drop = FALSE],
n_clust = apply(
outputs$clusters[, 2:length(outputs$clusters), drop = FALSE],
2, function(x) length(unique(x[!is.na(x)]))))
# Return outputs
class(outputs) <- append("bioregion.clusters", class(outputs))
return(outputs)
}