Skip to content

Commit

Permalink
Make serialization of slendr models optional
Browse files Browse the repository at this point in the history
  • Loading branch information
bodkan committed Sep 5, 2022
1 parent b0bc2a0 commit 0757b6e
Show file tree
Hide file tree
Showing 10 changed files with 432 additions and 273 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ inst/doc
.vscode/
build/
inst/pylib/__pycache__/
inst/scripts/__pycache__/
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@

- Relative paths are now expanded in `ts_save()` ([#9521dfb](https://github.com/bodkan/slendr/commit/9521dfb)).

- _slendr_ models can now be optionally compiled without serialization to disk. This only works with the `msprime()` coalescent back end but will be much faster in cases where a huge number of simulations needs to be run because for non-serialized models, `msprime()` now calls the back end engine directly through the R-Python interface (rather than on the command line) and output tree sequences are not saved to disk, rather than passed through the Python-R interface directly in memory ([#6bfda57](https://github.com/bodkan/slendr/commit/6bfda57)).

- Deprecated argument `sampling = ` of the functions `slim()` and `msprime()` has now been permanently removed ([#6bfda57](https://github.com/bodkan/slendr/commit/6bfda57)).

# slendr 0.3.0

- SLiM 4.0 is now required for running simulations with the `slim()` engine. If you want to run _slendr_ simulations with SLiM (spatial or non-spatial), you will need to upgrade you SLiM installation. SLiM 3.7.1 version is no longer supported as the upcoming new _slendr_ spatial features will depend on SLiM 4.x and maintaining two functionally identical yet syntactically different back ends is not feasible (PR [#104](https://github.com/bodkan/slendr/pull/104)).
Expand Down
133 changes: 85 additions & 48 deletions R/compilation.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@
#' default run to "the present time")
#' @param direction Intended direction of time. Under normal circumstances this
#' parameter is inferred from the model and does not need to be set manually.
#' @param serialize Should model files be serialized to disk? If not, only an
#' R model object will be returned but no files will be created. This speeds
#' up simulation with msprime but prevents using the SLiM back end.
#' @param slim_script Path to a SLiM script to be used for executing the model
#' (by default, a bundled backend script will be used). If \code{NULL}, the
#' SLiM script bundled with slendr will be used.
Expand All @@ -50,6 +53,7 @@ compile_model <- function(populations, generation_time, path = NULL, resolution
competition = NULL, mating = NULL, dispersal = NULL,
gene_flow = list(), overwrite = FALSE, force = FALSE,
simulation_length = NULL, direction = NULL,
serialize = TRUE,
slim_script = NULL, description = "", sim_length = NULL) {
if (is.null(simulation_length) && !is.null(sim_length)) {
warning("Argument `sim_length` will soon be deprecated in favor of `simulation_length`.", call. = FALSE)
Expand All @@ -58,15 +62,20 @@ compile_model <- function(populations, generation_time, path = NULL, resolution

if (inherits(populations, "slendr_pop")) populations <- list(populations)

if (is.null(path)) path <- tempfile()

if (is.null(slim_script))
slim_script <- system.file("scripts", "script.slim", package = "slendr")

map <- get_map(populations[[1]])
if (!is.null(map) && is.null(resolution))
stop("A map resolution must be specified for spatial models", call. = FALSE)

if (!serialize && is.null(path) && inherits(map, "slendr_map")) {
stop("Spatial models must be serialized to disk for SLiM to simulate data from",
call. = FALSE)
}
if (serialize && is.null(path))
path <- tempfile()

# make sure that all parents are present
pop_names <- purrr::map_chr(populations, ~ .x$pop[1])
parent_names <- unique(purrr::map_chr(populations, function(pop) {
Expand All @@ -83,29 +92,31 @@ compile_model <- function(populations, generation_time, path = NULL, resolution
stop("All populations must have unique names", call. = FALSE)

# prepare the model output directory
if (dir.exists(path)) {
if (!overwrite)
stop("Directory '", path, "' already exists. Either delete it\nmanually ",
"or set 'overwrite = TRUE' to delete it from R.", call. = FALSE)
else {
if (interactive() && !force) {
answer <- utils::menu(c("Yes", "No"),
title = paste0("Are you ABSOLUTELY SURE you want to delete '", path,
"'?\nThere is no going back.")
)
force <- answer == 1
if (serialize) {
if (dir.exists(path)) {
if (!overwrite)
stop("Directory '", path, "' already exists. Either delete it\nmanually ",
"or set 'overwrite = TRUE' to delete it from R.", call. = FALSE)
else {
if (interactive() && !force) {
answer <- utils::menu(c("Yes", "No"),
title = paste0("Are you ABSOLUTELY SURE you want to delete '", path,
"'?\nThere is no going back.")
)
force <- answer == 1
}
if (force)
unlink(path, recursive = TRUE)
else
stop("Compilation aborted because the specified model path directory\ncould ",
"not have been created. If you're running this in a non-interactive\n",
"mode in a script and want to overwrite an already existing model\n",
"directory, you must set `force = TRUE`.", call. = FALSE)
}
if (force)
unlink(path, recursive = TRUE)
else
stop("Compilation aborted because the specified model path directory\ncould ",
"not have been created. If you're running this in a non-interactive\n",
"mode in a script and want to overwrite an already existing model\n",
"directory, you must set `force = TRUE`.", call. = FALSE)
}

}
dir.create(path)
}
dir.create(path)

if (is.data.frame(gene_flow)) gene_flow <- list(gene_flow)

Expand Down Expand Up @@ -163,11 +174,14 @@ setting `direction = 'backward'.`", call. = FALSE)

simulation_length <- if (is.null(simulation_length)) end_time else simulation_length

checksums <- write_model(
path, populations, admix_table, map_table, split_table, resize_table,
dispersal_table, generation_time, resolution, simulation_length, time_dir, slim_script,
description, map
)
if (serialize)
checksums <- write_model(
path, populations, admix_table, map_table, split_table, resize_table,
dispersal_table, generation_time, resolution, simulation_length, time_dir, slim_script,
description, map
)
else
checksums <- NULL

names(populations) <- pop_names

Expand All @@ -177,6 +191,7 @@ setting `direction = 'backward'.`", call. = FALSE)
world = map,
populations = populations,
splits = split_table,
resizes = resize_table,
geneflow = admix_table,
maps = return_maps,
dispersals = dispersal_table,
Expand All @@ -185,6 +200,7 @@ setting `direction = 'backward'.`", call. = FALSE)
length = round(simulation_length / generation_time),
orig_length = simulation_length,
direction = time_dir,
description = description,
checksums = checksums
)
class(result) <- set_class(result, "model")
Expand Down Expand Up @@ -219,13 +235,15 @@ read_model <- function(path) {
# for running the backend script using the run() function
path_populations <- file.path(path, "ranges.rds")
path_splits <- file.path(path, "populations.tsv")
path_resizes <- file.path(path, "resizes.tsv")
path_geneflow <- file.path(path, "geneflow.tsv")
path_maps <- file.path(path, "maps.tsv")
path_generation_time <- file.path(path, "generation_time.txt")
path_resolution <- file.path(path, "resolution.txt")
path_length <- file.path(path, "length.txt")
path_orig_length <- file.path(path, "orig_length.txt")
path_direction <- file.path(path, "direction.txt")
path_description <- file.path(path, "description.txt")

if (!dir.exists(path))
stop(sprintf("Model directory '%s' does not exist", path), call. = FALSE)
Expand All @@ -237,8 +255,10 @@ read_model <- function(path) {
generation_time <- scan(path_generation_time, what = integer(), quiet = TRUE)
length <- as.integer(scan(path_length, what = numeric(), quiet = TRUE))
orig_length <- as.integer(scan(path_orig_length, what = numeric(), quiet = TRUE))
description <- scan(path_description, what = character(), quiet = TRUE)

split_table <- utils::read.table(path_splits, header = TRUE, stringsAsFactors = FALSE)
resize_table <- utils::read.table(path_resizes, header = TRUE, stringsAsFactors = FALSE)

admix_table <- NULL
if (file.exists(path_geneflow)) {
Expand All @@ -263,6 +283,7 @@ read_model <- function(path) {
world = world,
populations = populations,
splits = split_table,
resizes = resize_table,
geneflow = admix_table,
maps = maps,
generation_time = generation_time,
Expand Down Expand Up @@ -360,10 +381,13 @@ slim <- function(
model, sequence_length, recombination_rate, samples = NULL, output = NULL,
burnin = 0, max_attempts = 1, spatial = !is.null(model$world), coalescent_only = TRUE,
method = c("batch", "gui"), random_seed = NULL, verbose = FALSE, load = TRUE,
locations = NULL, slim_path = NULL, sampling = NULL
locations = NULL, slim_path = NULL
) {
method <- match.arg(method)

if (is.null(model$path))
stop("It is not possible to simulate non-serialized models in SLiM", call. = FALSE)

if (is.null(output) & !load)
warning("No custom tree-sequence output path is given but loading a tree sequence from\n",
"a temporary file after the simulation has been prevented", call. = FALSE)
Expand All @@ -383,11 +407,6 @@ slim <- function(
if (!is.numeric(recombination_rate) | recombination_rate < 0)
stop("Recombination rate must be a numeric value", call. = FALSE)

if (!is.null(sampling) && is.null(samples)) {
warning("Argument `sampling` will soon be deprecated in favor of `samples`.", call. = FALSE)
samples <- sampling
}

# verify checksums of serialized model configuration files
checksums <- readr::read_tsv(file.path(model_dir, "checksums.tsv"), progress = FALSE,
col_types = "cc")
Expand Down Expand Up @@ -529,7 +548,6 @@ slim <- function(
#' file is written to a custom location to be loaded at a later point.
#' @param verbose Write the output log to the console (default \code{FALSE})?
#' @param debug Write msprime's debug log to the console (default \code{FALSE})?
#' @param sampling Deprecated in favor of \code{samples}.
#'
#' @return A tree-sequence object loaded via Python-R reticulate interface function \code{ts_load}
#' (internally represented by the Python object \code{tskit.trees.TreeSequence})
Expand Down Expand Up @@ -570,14 +588,46 @@ slim <- function(
#' @export
msprime <- function(model, sequence_length, recombination_rate, samples = NULL,
output = NULL, random_seed = NULL,
load = TRUE, verbose = FALSE, debug = FALSE, sampling = NULL) {
load = TRUE, verbose = FALSE, debug = FALSE) {
if (sum(sapply(model$populations, attr, "parent") == "ancestor") > 1)
stop("Multiple ancestral populations without a common ancestor would lead to\n",
"an infinitely deep history without coalescence. Please make sure that all\n",
"populations trace their ancestry to a single ancestral population.\n",
"(This restriction only applies to coalescent simulations with msprime().)",
call. = FALSE)

if (!is.null(samples)) {
samples <- process_sampling(samples, model, verbose)
if (!is.null(model$path)) {
sampling_path <- tempfile()
readr::write_tsv(samples, sampling_path)
}
sampling <- paste("--sampling-schedule", sampling_path)
} else {
samples <- NULL
sampling <- ""
}

# call msprime back-end code directly for non-serialized models
if (is.null(model$path)) {
script <- reticulate::import_from_path("script", path = system.file("scripts", package = "slendr"))
ts_msprime <- script$simulate(
sequence_length = sequence_length,
recombination_rate = recombination_rate,
seed = random_seed,
populations = reticulate::r_to_py(model$splits),
resizes = reticulate::r_to_py(model$resizes),
geneflows = reticulate::r_to_py(model$geneflow),
length = as.integer(model$length),
direction = model$direction,
description = model$description,
samples = reticulate::r_to_py(samples),
debug = debug
)
ts <- ts_load(ts_msprime, model = model)
return(ts)
}

if (is.null(output) & !load)
warning("No custom tree-sequence output path is given but loading a tree sequence from\n",
"a temporary file after the simulation has been prevented", call. = FALSE)
Expand All @@ -594,26 +644,13 @@ msprime <- function(model, sequence_length, recombination_rate, samples = NULL,
if (!is.numeric(recombination_rate) | recombination_rate < 0)
stop("Recombination rate must be a numeric value", call. = FALSE)

if (!is.null(sampling) && is.null(samples)) {
warning("Argument `sampling` will soon be deprecated in favor of `samples`.", call. = FALSE)
samples <- sampling
}

# verify checksums of serialized model configuration files
checksums <- readr::read_tsv(file.path(model_dir, "checksums.tsv"), progress = FALSE,
col_types = "cc")
verify_checksums(file.path(model_dir, checksums$file), checksums$hash)

script_path <- path.expand(file.path(model_dir, "script.py"))

if (!is.null(samples)) {
sampling_path <- tempfile()
sampling_df <- process_sampling(samples, model, verbose)
readr::write_tsv(sampling_df, sampling_path)
sampling <- paste("--sampling-schedule", sampling_path)
} else
sampling <- ""

msprime_command <- sprintf(
"%s %s %s --model %s --output %s --sequence-length %d --recombination-rate %s %s %s %s",
reticulate::py_exe(),
Expand Down
5 changes: 4 additions & 1 deletion R/print.R
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,10 @@ print.slendr_model <- function(x, ...) {
} else
cat("non-spatial\n\n")

cat("configuration files in:", normalizePath(x$path), "\n")
if (is.null(x$path))
cat("non-serialized slendr model\n")
else
cat("configuration files in:", normalizePath(x$path), "\n")
}


Expand Down
4 changes: 2 additions & 2 deletions R/tree-sequences.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
#' \code{\link{ts_simplify}}.
#'
#' @param file A path to the tree-sequence file (either originating from a
#' slendr model or a standard non-slendr tree sequence)
#' slendr model or a standard non-slendr tree sequence).
#' @param model Optional \code{slendr_model} object which produced the
#' tree-sequence \code{file}. Used for adding various annotation data and
#' metadata to the standard tskit tree-sequence object.
Expand Down Expand Up @@ -89,7 +89,7 @@ ts_load <- function(file, model = NULL,
call. = FALSE)

# load the tree sequence, converting it to a SLiM tree sequence if necessary
ts <- tskit$load(path.expand(file))
if (is.character(file)) ts <- tskit$load(path.expand(file))

if (length(ts$metadata) == 0 || is.null(ts$metadata$SLiM))
type <- "generic"
Expand Down
Loading

0 comments on commit 0757b6e

Please sign in to comment.