bodkan · bodkan · May 11, 2023 · May 8, 2023 · May 8, 2023 · May 8, 2023
diff --git a/NAMESPACE b/NAMESPACE
@@ -6,6 +6,7 @@ S3method(print,slendr_model)
 S3method(print,slendr_pop)
 S3method(print,slendr_region)
 S3method(print,slendr_ts)
+S3method(summary,slendr_nodes)
 export("%>%")
 export(animate_model)
 export(area)

diff --git a/NEWS.md b/NEWS.md
@@ -32,6 +32,10 @@ Users have to call `init_env()` to manually activate the Python environment of s
 
 - `ts_simplify()` now accepts optional arguments `keep_unary` and `keep_unary_in_individuals` (see the official [tskit docs](https://tskit.dev/tskit/docs/stable/python-api.html#tskit.TreeSequence.simplify) for more detail) ([#1b2112](https://github.com/bodkan/slendr/commit/1b2112))
 
+- Fix for `ts_load()` failing to load _slendr_-produced tree sequences after they were simplified down to a smaller set of sampled individuals (reported [here](https://github.com/bodkan/slendr/issues/136)). The issue was caused by incompatible sizes of the sampling table (always in the same form as used during simulation) and the table of individuals stored in the tree sequence after simplification (potentially containing a smaller set of individuals than in the original sampling table). To fix this, _slendr_ tree sequence objects now track information about which individuals are regarded as "samples" (i.e. those with symbolic names) which is maintained through simplification, serialization and loading, and used by _slendr_'s internal machinery during join operations. (PR [#137](https://github.com/bodkan/slendr/pull/137))
+
+- Metadata summary of `ts_nodes()` results is no longer printed whenever typed into the R console. Instead, summary can be obtained by explicit call to `summary()` on the `ts_nodes()` tables.  ([#01af51](https://github.com/bodkan/slendr/commit/01af51) [#8176b5](https://github.com/bodkan/slendr/commit/8176b5))
+
 # slendr 0.5.1
 
 - This minor release implements an emergency fix for a CRAN warning which suddenly popped up in latest CRAN checks. ([#5600a4](https://github.com/bodkan/slendr/commit/5600a4))

diff --git a/R/interface.R b/R/interface.R
@@ -1216,6 +1216,9 @@ schedule_sampling <- function(model, times, ..., locations = NULL, strict = FALS
   sample_pops <- purrr::map(samples, 1)
   sample_counts <- purrr::map(samples, 2)
 
+  if (is.null(model$world) && !is.null(locations))
+    stop("Sampling locations may only be specified for a spatial model", call. = FALSE)
+
   if (length(sample_pops) != length(sample_counts))
     stop("Samples must be represented by pairs of <slendr_pop>-<n>", call. = FALSE)
 

diff --git a/R/print.R b/R/print.R
@@ -95,15 +95,18 @@ print.slendr_model <- function(x, ...) {
 }
 
 
-# @rdname print.slendr_pop
-# @export
-print.slendr_nodes <- function(x, ...) {
-  model <- attr(x, "model")
-  type <- attr(x, "type")
+#' Summarise the contents of a \code{ts_nodes} result
+#' @param object Data frame produced by the function \code{ts_nodes}
+#' @param ... Additional formal arguments to the \code{summary} method (unused here)
+#' @return Used for its output to the terminal
+#' @export
+summary.slendr_nodes <- function(object, ...) {
+  model <- attr(object, "model")
+  type <- attr(object, "type")
 
   from_slendr <- !is.null(model)
 
-  sep <- print_header_info(x)
+  sep <- print_header_info(object)
 
   if (from_slendr)
     direction <- model$direction
@@ -112,7 +115,7 @@ print.slendr_nodes <- function(x, ...) {
 
   cat("times are expressed in a", direction, "time direction\n")
 
-  individuals <- as.data.frame(x) %>%
+  individuals <- as.data.frame(object) %>%
     dplyr::filter(!is.na(ind_id)) %>%
     dplyr::distinct(ind_id, .keep_all = TRUE)
 
@@ -187,18 +190,18 @@ print.slendr_nodes <- function(x, ...) {
       cat("youngest sampled individual:", funs[[2]](.$time), "time units", direction, "\n")
     }
 
-    cat("\noldest node:", funs[[1]](x$time), "time units", direction, "\n")
-    cat("youngest node:", funs[[2]](x$time), "time units", direction, "\n")
+    cat("\noldest node:", funs[[1]](object$time), "time units", direction, "\n")
+    cat("youngest node:", funs[[2]](object$time), "time units", direction, "\n")
 
     cat(sep)
   }
 
-  if (inherits(x, "sf"))
+  if (inherits(object, "sf"))
     cat("overview of the underlying sf object:\n\n")
   else
     cat("overview of the underlying table object:\n\n")
 
-  dplyr::as_tibble(x) %>% print()
+  dplyr::as_tibble(object) %>% print()
 }
 
 

diff --git a/R/tree-sequences.R b/R/tree-sequences.R
@@ -159,6 +159,27 @@ ts_load <- function(file, model = NULL,
 #' @export
 ts_save <- function(ts, file) {
   check_ts_class(ts)
+  type <- attr(ts, "type")
+  from_slendr <- !is.null(attr(ts, "model"))
+
+  # overwrite the original list of sample names (if the tree sequence was simplified
+  # down to a smaller number of individuals than originally sampled)
+  if (from_slendr && nrow(ts_samples(ts)) != nrow(attr(ts, "metadata")$sampling)) {
+    tables <- ts$dump_tables()
+    tables$metadata_schema = tskit$MetadataSchema(list("codec" = "json"))
+
+    sample_names <- attr(ts, "metadata")$sample_names
+    pedigree_ids <- attr(ts, "metadata")$sample_ids
+    if (type == "SLiM") {
+      tables$metadata$SLiM$user_metadata$slendr[[1]]$sample_names <- sample_names
+      tables$metadata$SLiM$user_metadata$slendr[[1]]$sample_ids <- pedigree_ids
+    } else
+      tables$metadata$slendr$sample_names <- sample_names
+
+    # put the tree sequence object back together
+    ts <- tables$tree_sequence()
+  }
+
   ts$dump(path.expand(file))
 }
 
@@ -248,6 +269,7 @@ ts_recapitate <- function(ts, recombination_rate, Ne = NULL, demography = NULL,
     # inherit the information about which individuals should be marked as
     # explicitly "sampled" from the previous tree sequence object (if that
     # was specified) -- this is only necessary for a SLiM sequence
+    # TODO: no longer necessary
     old_individuals <- attr(ts, "raw_individuals")
     sampled_ids <- old_individuals[old_individuals$sampled, ]$pedigree_id
     attr(ts_new, "raw_individuals") <- attr(ts_new, "raw_individuals") %>%
@@ -439,6 +461,14 @@ ts_simplify <- function(ts, simplify_to = NULL, keep_input_roots = FALSE,
   } else
     attr(ts_new, "nodes") <- get_tskit_table_data(ts_new, simplify_to)
 
+  # replace the names of sampled individuals (if simplification led to subsetting)
+  if (from_slendr) {
+    sampled_nodes <- attr(ts_new, "nodes") %>% dplyr::filter(sampled)
+    attr(ts_new, "metadata")$sample_names <-  unique(sampled_nodes$name)
+    if (type == "SLiM")
+      attr(ts_new, "metadata")$sample_ids <- unique(sampled_nodes$pedigree_id)
+  }
+
   class(ts_new) <- c("slendr_ts", class(ts_new))
 
   ts_new
@@ -1140,9 +1170,8 @@ ts_samples <- function(ts) {
          "from a slendr model. To access information about times and\nlocations ",
          "of nodes and individuals from non-slendr tree sequences,\nuse the ",
          "function ts_nodes().\n", call. = FALSE)
-  data <- ts_nodes(ts) %>% dplyr::filter(sampled, !is.na(name))
   samples <- attr(ts, "metadata")$sampling %>%
-    dplyr::filter(name %in% data$name)
+    dplyr::filter(name %in% attr(ts, "metadata")$sample_names)
 
   samples
 }
@@ -2314,7 +2343,11 @@ get_ts_raw_individuals <- function(ts) {
     if (from_slendr) {
       ind_table$time <- time_fun(ts)(ts$individual_times, model)
       # for slendr SLiM models, "sampled" nodes are those were explicitly scheduled for sampling
-      ind_table$sampled <- ind_table$remembered
+      # - for "original" SLiM/slendr tree sequences, sampled nodes are those that are remembered
+      if (is.null(attr(ts, "metadata")$sample_ids))
+        ind_table$sampled <- ind_table$remembered
+      else # - for previously simplified tree sequences, use information from pedigree IDs
+        ind_table$sampled <- ind_table$pedigree_id %in% attr(ts, "metadata")$sample_ids
     } else {
       ind_table$time <- ts$individual_times
       # for pure SLiM tree sequences, simply use the sampling information encoded in the data
@@ -2411,7 +2444,14 @@ get_pyslim_table_data <- function(ts, simplify_to = NULL) {
 
   # load information about samples at times and from populations of remembered individuals
   if (from_slendr) {
-    samples <- attr(ts, "metadata")$sampling %>% dplyr::arrange(-time, pop)
+    # when a tree sequence is being loaded from a file where it was saved in its
+    # simplified form, the metadata table saved along side it won't correspond to the
+    # subset of sampled nodes -- in these situations, subset the original sampling table
+    # using the names of sampled individuals that are propagated through serialization
+    # (this is achieved by the filter() step right below)
+    samples <- attr(ts, "metadata")$sampling %>%
+      dplyr::arrange(-time, pop) %>%
+      dplyr::filter(name %in% attr(ts, "metadata")$sample_names)
     if (!is.null(simplify_to))
       samples <- samples %>% dplyr::filter(name %in% simplify_to)
   } else
@@ -2502,7 +2542,12 @@ get_tskit_table_data <- function(ts, simplify_to = NULL) {
   # load information about samples at times and from populations of remembered
   # individuals
   if (from_slendr) {
-    samples <- attr(ts, "metadata")$sampling
+    # when a tree sequence is being loaded from a file where it was saved in its
+    # simplified form, the metadata table saved along side it won't correspond to the
+    # subset of sampled nodes -- in these situations, subset the original sampling table
+    # using the names of sampled individuals that are propagated through serialization
+    samples <- attr(ts, "metadata")$sampling %>%
+      dplyr::filter(name %in% attr(ts, "metadata")$sample_names)
     if (!is.null(simplify_to))
       samples <- dplyr::filter(samples, name %in% simplify_to)
     samples <- dplyr::arrange(samples, -time, pop)
@@ -2791,7 +2836,7 @@ get_sampling <- function(metadata) {
   } else
     sampling <- dplyr::as_tibble(metadata$sampling)
 
-  sampling %>%
+  df <- sampling %>%
     dplyr::select(-time_gen) %>%
     {
       rbind(
@@ -2805,6 +2850,10 @@ get_sampling <- function(metadata) {
     dplyr::arrange(-time_orig, pop) %>%
     dplyr::rename(time = time_orig) %>%
     dplyr::select(name, time, pop)
+
+  # if needed (i.e. after simplification to a smaller set of sampled individuals), subset
+  # the full original sampling schedule table to only individuals of interest
+  df %>% dplyr::filter(name %in% metadata$sample_names)
 }
 
 # Extract list with slendr metadata (created as Eidos Dictionaries by SLiM and Python
@@ -2827,6 +2876,8 @@ get_slendr_metadata <- function(ts) {
     version = metadata$version,
     description = metadata$description,
     sampling = get_sampling(metadata),
+    sample_names = metadata$sample_names,
+    sample_ids = metadata$sample_ids,
     map = metadata$map[[1]],
     arguments = arguments
   )

diff --git a/R/utils.R b/R/utils.R
@@ -554,6 +554,18 @@ process_sampling <- function(samples, model, verbose = FALSE) {
     dplyr::summarise(n = sum(n), .groups = "drop") %>%
     dplyr::arrange(time)
 
+  # samples must either all have a sampling location defined, or none
+  pop_consistent <- split(df, df$pop) %>% vapply(function(pop) {
+    all(is.na(pop$x_orig) | is.na(pop$y_orig)) ||
+    all(!is.na(pop$x_orig) | !is.na(pop$y_orig))
+  }, logical(1))
+  if (!all(pop_consistent)) {
+    stop("For each population, samples must be all spatial or all non-spatial.\n",
+         "This is not true for the following populations: ",
+         paste(names(pop_consistent)[!pop_consistent], collapse = ", "),
+         call. = FALSE)
+  }
+
   # in case a backwards time model is not to be simulated all the way to the
   # present (i.e. time 0), the conversion of sampling times from absolute model
   # time units into SLiM's forward time units needs to be adjusted relative

diff --git a/_pkgdown.yml b/_pkgdown.yml
@@ -105,8 +105,8 @@ reference:
     - print.slendr_region
     - print.slendr_pop
     - print.slendr_model
-    - print.slendr_nodes
     - print.slendr_ts
+    - summary.slendr_nodes
 
   - title: Tree sequence loading and processing
     contents:

diff --git a/docs/articles/vignette-01-tutorial.html b/docs/articles/vignette-01-tutorial.html