Make serialization of slendr models optional

bodkan · Sep 5, 2022 · 0757b6e · 0757b6e
1 parent b0bc2a0
commit 0757b6e
Show file tree

Hide file tree

Showing 10 changed files with 432 additions and 273 deletions.
diff --git a/.gitignore b/.gitignore
@@ -5,3 +5,4 @@ inst/doc
 .vscode/
 build/
 inst/pylib/__pycache__/
+inst/scripts/__pycache__/
diff --git a/NEWS.md b/NEWS.md
@@ -10,6 +10,10 @@
 
 - Relative paths are now expanded in `ts_save()` ([#9521dfb](https://github.com/bodkan/slendr/commit/9521dfb)).
 
+- _slendr_ models can now be optionally compiled without serialization to disk. This only works with the `msprime()` coalescent back end but will be much faster in cases where a huge number of simulations needs to be run because for non-serialized models, `msprime()` now calls the back end engine directly through the R-Python interface (rather than on the command line) and output tree sequences are not saved to disk, rather than passed through the Python-R interface directly in memory ([#6bfda57](https://github.com/bodkan/slendr/commit/6bfda57)).
+
+- Deprecated argument `sampling = ` of the functions `slim()` and `msprime()` has now been permanently removed ([#6bfda57](https://github.com/bodkan/slendr/commit/6bfda57)).
+
 # slendr 0.3.0
 
 -   SLiM 4.0 is now required for running simulations with the `slim()` engine. If you want to run _slendr_ simulations with SLiM (spatial or non-spatial), you will need to upgrade you SLiM installation. SLiM 3.7.1 version is no longer supported as the upcoming new _slendr_ spatial features will depend on SLiM 4.x and maintaining two functionally identical yet syntactically different back ends is not feasible (PR [#104](https://github.com/bodkan/slendr/pull/104)).

diff --git a/R/compilation.R b/R/compilation.R
@@ -28,6 +28,9 @@
 #'   default run to "the present time")
 #' @param direction Intended direction of time. Under normal circumstances this
 #'   parameter is inferred from the model and does not need to be set manually.
+#' @param serialize Should model files be serialized to disk? If not, only an
+#'   R model object will be returned but no files will be created. This speeds
+#'   up simulation with msprime but prevents using the SLiM back end.
 #' @param slim_script Path to a SLiM script to be used for executing the model
 #'   (by default, a bundled backend script will be used). If \code{NULL}, the
 #'   SLiM script bundled with slendr will be used.
@@ -50,6 +53,7 @@ compile_model <- function(populations, generation_time, path = NULL, resolution
                           competition = NULL, mating = NULL, dispersal = NULL,
                           gene_flow = list(), overwrite = FALSE, force = FALSE,
                           simulation_length = NULL, direction = NULL,
+                          serialize = TRUE,
                           slim_script = NULL, description = "", sim_length = NULL) {
   if (is.null(simulation_length) && !is.null(sim_length)) {
     warning("Argument `sim_length` will soon be deprecated in favor of `simulation_length`.", call. = FALSE)
@@ -58,15 +62,20 @@ compile_model <- function(populations, generation_time, path = NULL, resolution
 
   if (inherits(populations, "slendr_pop"))  populations <- list(populations)
 
-  if (is.null(path)) path <- tempfile()
-
   if (is.null(slim_script))
     slim_script <- system.file("scripts", "script.slim", package = "slendr")
 
   map <- get_map(populations[[1]])
   if (!is.null(map) && is.null(resolution))
     stop("A map resolution must be specified for spatial models", call. = FALSE)
 
+  if (!serialize && is.null(path) && inherits(map, "slendr_map")) {
+    stop("Spatial models must be serialized to disk for SLiM to simulate data from",
+         call. = FALSE)
+  }
+  if (serialize && is.null(path))
+    path <- tempfile()
+
   # make sure that all parents are present
   pop_names <- purrr::map_chr(populations, ~ .x$pop[1])
   parent_names <- unique(purrr::map_chr(populations, function(pop) {
@@ -83,29 +92,31 @@ compile_model <- function(populations, generation_time, path = NULL, resolution
     stop("All populations must have unique names", call. = FALSE)
 
   # prepare the model output directory
-  if (dir.exists(path)) {
-    if (!overwrite)
-      stop("Directory '", path, "' already exists. Either delete it\nmanually ",
-           "or set 'overwrite = TRUE' to delete it from R.", call. = FALSE)
-    else {
-      if (interactive() && !force) {
-        answer <- utils::menu(c("Yes", "No"),
-          title = paste0("Are you ABSOLUTELY SURE you want to delete '", path,
-                         "'?\nThere is no going back.")
-        )
-        force <- answer == 1
+  if (serialize) {
+    if (dir.exists(path)) {
+      if (!overwrite)
+        stop("Directory '", path, "' already exists. Either delete it\nmanually ",
+             "or set 'overwrite = TRUE' to delete it from R.", call. = FALSE)
+      else {
+        if (interactive() && !force) {
+          answer <- utils::menu(c("Yes", "No"),
+            title = paste0("Are you ABSOLUTELY SURE you want to delete '", path,
+                           "'?\nThere is no going back.")
+          )
+          force <- answer == 1
+        }
+        if (force)
+          unlink(path, recursive = TRUE)
+        else
+          stop("Compilation aborted because the specified model path directory\ncould ",
+               "not have been created. If you're running this in a non-interactive\n",
+               "mode in a script and want to overwrite an already existing model\n",
+               "directory, you must set `force = TRUE`.", call. = FALSE)
       }
-      if (force)
-        unlink(path, recursive = TRUE)
-      else
-        stop("Compilation aborted because the specified model path directory\ncould ",
-             "not have been created. If you're running this in a non-interactive\n",
-             "mode in a script and want to overwrite an already existing model\n",
-             "directory, you must set `force = TRUE`.", call. = FALSE)
-    }
 
+    }
+    dir.create(path)
   }
-  dir.create(path)
 
   if (is.data.frame(gene_flow)) gene_flow <- list(gene_flow)
 
@@ -163,11 +174,14 @@ setting `direction = 'backward'.`", call. = FALSE)
 
   simulation_length <- if (is.null(simulation_length)) end_time else simulation_length
 
-  checksums <- write_model(
-    path, populations, admix_table, map_table, split_table, resize_table,
-    dispersal_table, generation_time, resolution, simulation_length, time_dir, slim_script,
-    description, map
-  )
+  if (serialize)
+    checksums <- write_model(
+      path, populations, admix_table, map_table, split_table, resize_table,
+      dispersal_table, generation_time, resolution, simulation_length, time_dir, slim_script,
+      description, map
+    )
+  else
+    checksums <- NULL
 
   names(populations) <- pop_names
 
@@ -177,6 +191,7 @@ setting `direction = 'backward'.`", call. = FALSE)
     world = map,
     populations = populations,
     splits = split_table,
+    resizes = resize_table,
     geneflow = admix_table,
     maps = return_maps,
     dispersals = dispersal_table,
@@ -185,6 +200,7 @@ setting `direction = 'backward'.`", call. = FALSE)
     length = round(simulation_length / generation_time),
     orig_length = simulation_length,
     direction = time_dir,
+    description = description,
     checksums = checksums
   )
   class(result) <- set_class(result, "model")
@@ -219,13 +235,15 @@ read_model <- function(path) {
   # for running the backend script using the run() function
   path_populations <- file.path(path, "ranges.rds")
   path_splits <- file.path(path, "populations.tsv")
+  path_resizes <- file.path(path, "resizes.tsv")
   path_geneflow <- file.path(path, "geneflow.tsv")
   path_maps <- file.path(path, "maps.tsv")
   path_generation_time <- file.path(path, "generation_time.txt")
   path_resolution <- file.path(path, "resolution.txt")
   path_length <- file.path(path, "length.txt")
   path_orig_length <- file.path(path, "orig_length.txt")
   path_direction <- file.path(path, "direction.txt")
+  path_description <- file.path(path, "description.txt")
 
   if (!dir.exists(path))
     stop(sprintf("Model directory '%s' does not exist", path), call. = FALSE)
@@ -237,8 +255,10 @@ read_model <- function(path) {
   generation_time <- scan(path_generation_time, what = integer(), quiet = TRUE)
   length <- as.integer(scan(path_length, what = numeric(), quiet = TRUE))
   orig_length <- as.integer(scan(path_orig_length, what = numeric(), quiet = TRUE))
+  description <- scan(path_description, what = character(), quiet = TRUE)
 
   split_table <- utils::read.table(path_splits, header = TRUE, stringsAsFactors = FALSE)
+  resize_table <- utils::read.table(path_resizes, header = TRUE, stringsAsFactors = FALSE)
 
   admix_table <- NULL
   if (file.exists(path_geneflow)) {
@@ -263,6 +283,7 @@ read_model <- function(path) {
     world = world,
     populations = populations,
     splits = split_table,
+    resizes = resize_table,
     geneflow = admix_table,
     maps = maps,
     generation_time = generation_time,
@@ -360,10 +381,13 @@ slim <- function(
   model, sequence_length, recombination_rate, samples = NULL, output = NULL,
   burnin = 0, max_attempts = 1, spatial = !is.null(model$world), coalescent_only = TRUE,
   method = c("batch", "gui"), random_seed = NULL, verbose = FALSE, load = TRUE,
-  locations = NULL, slim_path = NULL, sampling = NULL
+  locations = NULL, slim_path = NULL
 ) {
   method <- match.arg(method)
 
+  if (is.null(model$path))
+    stop("It is not possible to simulate non-serialized models in SLiM", call. = FALSE)
+
   if (is.null(output) & !load)
     warning("No custom tree-sequence output path is given but loading a tree sequence from\n",
             "a temporary file after the simulation has been prevented", call. = FALSE)
@@ -383,11 +407,6 @@ slim <- function(
   if (!is.numeric(recombination_rate) | recombination_rate < 0)
     stop("Recombination rate must be a numeric value", call. = FALSE)
 
-  if (!is.null(sampling) && is.null(samples)) {
-    warning("Argument `sampling` will soon be deprecated in favor of `samples`.", call. = FALSE)
-    samples <- sampling
-  }
-
   # verify checksums of serialized model configuration files
   checksums <- readr::read_tsv(file.path(model_dir, "checksums.tsv"), progress = FALSE,
                                col_types = "cc")
@@ -529,7 +548,6 @@ slim <- function(
 #'   file is written to a custom location to be loaded at a later point.
 #' @param verbose Write the output log to the console (default \code{FALSE})?
 #' @param debug Write msprime's debug log to the console (default \code{FALSE})?
-#' @param sampling Deprecated in favor of \code{samples}.
 #'
 #' @return A tree-sequence object loaded via Python-R reticulate interface function \code{ts_load}
 #'   (internally represented by the Python object \code{tskit.trees.TreeSequence})
@@ -570,14 +588,46 @@ slim <- function(
 #' @export
 msprime <- function(model, sequence_length, recombination_rate, samples = NULL,
                     output = NULL, random_seed = NULL,
-                    load = TRUE, verbose = FALSE, debug = FALSE, sampling = NULL) {
+                    load = TRUE, verbose = FALSE, debug = FALSE) {
   if (sum(sapply(model$populations, attr, "parent") == "ancestor") > 1)
     stop("Multiple ancestral populations without a common ancestor would lead to\n",
          "an infinitely deep history without coalescence. Please make sure that all\n",
          "populations trace their ancestry to a single ancestral population.\n",
          "(This restriction only applies to coalescent simulations with msprime().)",
          call. = FALSE)
 
+  if (!is.null(samples)) {
+    samples <- process_sampling(samples, model, verbose)
+    if (!is.null(model$path)) {
+      sampling_path <- tempfile()
+      readr::write_tsv(samples, sampling_path)
+    }
+    sampling <- paste("--sampling-schedule", sampling_path)
+  } else {
+    samples <- NULL
+    sampling <- ""
+  }
+
+  # call msprime back-end code directly for non-serialized models
+  if (is.null(model$path)) {
+    script <- reticulate::import_from_path("script", path = system.file("scripts", package = "slendr"))
+    ts_msprime <- script$simulate(
+      sequence_length = sequence_length,
+      recombination_rate = recombination_rate,
+      seed = random_seed,
+      populations = reticulate::r_to_py(model$splits),
+      resizes = reticulate::r_to_py(model$resizes),
+      geneflows = reticulate::r_to_py(model$geneflow),
+      length = as.integer(model$length),
+      direction = model$direction,
+      description = model$description,
+      samples = reticulate::r_to_py(samples),
+      debug = debug
+    )
+    ts <- ts_load(ts_msprime, model = model)
+    return(ts)
+  }
+
   if (is.null(output) & !load)
     warning("No custom tree-sequence output path is given but loading a tree sequence from\n",
             "a temporary file after the simulation has been prevented", call. = FALSE)
@@ -594,26 +644,13 @@ msprime <- function(model, sequence_length, recombination_rate, samples = NULL,
   if (!is.numeric(recombination_rate) | recombination_rate < 0)
     stop("Recombination rate must be a numeric value", call. = FALSE)
 
-  if (!is.null(sampling) && is.null(samples)) {
-    warning("Argument `sampling` will soon be deprecated in favor of `samples`.", call. = FALSE)
-    samples <- sampling
-  }
-
   # verify checksums of serialized model configuration files
   checksums <- readr::read_tsv(file.path(model_dir, "checksums.tsv"), progress = FALSE,
                                col_types = "cc")
   verify_checksums(file.path(model_dir, checksums$file), checksums$hash)
 
   script_path <- path.expand(file.path(model_dir, "script.py"))
 
-  if (!is.null(samples)) {
-    sampling_path <- tempfile()
-    sampling_df <- process_sampling(samples, model, verbose)
-    readr::write_tsv(sampling_df, sampling_path)
-    sampling <- paste("--sampling-schedule", sampling_path)
-  } else
-    sampling <- ""
-
   msprime_command <- sprintf(
     "%s %s %s --model %s --output %s --sequence-length %d --recombination-rate %s %s %s %s",
     reticulate::py_exe(),

diff --git a/R/print.R b/R/print.R
@@ -88,7 +88,10 @@ print.slendr_model <- function(x, ...) {
   } else
     cat("non-spatial\n\n")
 
-  cat("configuration files in:", normalizePath(x$path), "\n")
+  if (is.null(x$path))
+    cat("non-serialized slendr model\n")
+  else
+    cat("configuration files in:", normalizePath(x$path), "\n")
 }
 
 

diff --git a/R/tree-sequences.R b/R/tree-sequences.R
@@ -16,7 +16,7 @@
 #' \code{\link{ts_simplify}}.
 #'
 #' @param file A path to the tree-sequence file (either originating from a
-#'   slendr model or a standard non-slendr tree sequence)
+#'   slendr model or a standard non-slendr tree sequence).
 #' @param model Optional \code{slendr_model} object which produced the
 #'   tree-sequence \code{file}. Used for adding various annotation data and
 #'   metadata to the standard tskit tree-sequence object.
@@ -89,7 +89,7 @@ ts_load <- function(file, model = NULL,
          call. = FALSE)
 
   # load the tree sequence, converting it to a SLiM tree sequence if necessary
-  ts <- tskit$load(path.expand(file))
+  if (is.character(file)) ts <- tskit$load(path.expand(file))
 
   if (length(ts$metadata) == 0 || is.null(ts$metadata$SLiM))
     type <- "generic"