Implement loading of 'pure' manual tree sequences (also closes #107)

bodkan · Sep 3, 2022 · 79adf14 · bhaller · Sep 30, 2022 · bodkan
1 parent 52f5880
commit 79adf14
Show file tree

Hide file tree

Showing 10 changed files with 315 additions and 5 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -4,6 +4,10 @@
 
 - The initial size of a population which emerges from a split from another population is now printed in a population history summary in the R console. ([#6525bf3](https://github.com/bodkan/slendr/commit/6525bf3))
 
+- A couple of fixes to support loading, processing, and plotting of "manually" created tree sequences have been implemented (see [this](https://tskit.dev/tutorials/tables_and_editing.html#constructing-a-tree-sequence)). Not sure how practically useful, but it's important to be able to load even "pure" tree sequences which are not from simulators such as SLiM and msprime. A set of [unit tests](https://github.com/bodkan/slendr/blob/9611437554bbb171f3df6374651acc3d73c63426/tests/testthat/test-manual-ts.R) has been added, making sure that a minimalist nodes & edges table can be loaded, as well as nodes & edges & individuals, plus tables of populations and sites & mutations. PRs with more extensive unit tests and bug reports of tree sequences which are failing to load would be appreciated! The code for handling cases of "manually-created" tree sequences which have missing individual table, missing populations table, etc. seems especially brittle at the moment ([#2f5fc32](https://github.com/bodkan/slendr/commit/2f5fc32)).
+
+- The `-1` value as a missing value indicator used in tskit is now replaced with the more R-like `NA` in various tree-sequence tables (annotated by _slendr_ or original through tskit itself) ([#2f5fc32](https://github.com/bodkan/slendr/commit/2f5fc32)).
+
 # slendr 0.3.0
 
 -   SLiM 4.0 is now required for running simulations with the `slim()` engine. If you want to run _slendr_ simulations with SLiM (spatial or non-spatial), you will need to upgrade you SLiM installation. SLiM 3.7.1 version is no longer supported as the upcoming new _slendr_ spatial features will depend on SLiM 4.x and maintaining two functionally identical yet syntactically different back ends is not feasible (PR [#104](https://github.com/bodkan/slendr/pull/104)).

diff --git a/R/tree-sequences.R b/R/tree-sequences.R
@@ -2046,6 +2046,10 @@ get_ts_raw_nodes <- function(ts) {
 
   node_table$time_tskit <- table$time
 
+  # -1 as a missing value in tskit is not very R like, so let's replace it with
+  # a proper NA
+  node_table$pop_id[node_table$pop_id == -1] <- NA
+
   node_table
 }
 
@@ -2197,8 +2201,9 @@ get_pyslim_table_data <- function(ts, simplify_to = NULL) {
     dplyr::bind_rows(sampled, not_sampled) %>%
     dplyr::right_join(nodes, by = "ind_id") %>%
     dplyr::mutate(time = ifelse(is.na(ind_id), time.y, time.x),
+                  time_tskit = time_tskit.y,
                   sampled = ifelse(is.na(ind_id), FALSE, sampled)) %>%
-    dplyr::rename(pop = pop.y, pop_id = pop_id.y, time_tskit = time_tskit.x)
+    dplyr::rename(pop = pop.y, pop_id = pop_id.y)
 
   if (spatial) {
     combined <- convert_to_sf(combined, model)
@@ -2251,9 +2256,17 @@ get_tskit_table_data <- function(ts, simplify_to = NULL) {
     }))
     individuals$pop <- pop_names[individuals$pop_id + 1]
     nodes$pop <- pop_names[nodes$pop_id + 1]
+    if (is.null(nodes[["pop"]])) nodes$pop <- NA
   }
 
-  individuals <- dplyr::arrange(individuals, -time, pop)
+  if (nrow(individuals)) {
+    if (length(ts$tables$populations)) {
+      individuals <- dplyr::arrange(individuals, -time, pop)
+    } else {
+      individuals <- dplyr::arrange(individuals, -time)
+      individuals$pop <- NA
+    }
+  }
 
   # load information about samples at times and from populations of remembered
   # individuals
@@ -2277,13 +2290,20 @@ get_tskit_table_data <- function(ts, simplify_to = NULL) {
     # on which nodes are actually sampled)
     nodes$sampled <- FALSE
     nodes$sampled[ts$samples() + 1] <- TRUE
-    nodes$time_tskit.x <- nodes$time_tskit; nodes$time_tskit <- NULL
+
+    nodes$time_tskit.y <- nodes$time_tskit
+    nodes$time_tskit <- NULL
+
+    nodes$pop.y <- nodes$pop
+    nodes$pop <- individuals$pop <- NULL
+
+    nodes$sampled <- nodes$node_id %in% as.vector(ts$samples())
   }
 
   # add numeric node IDs to each individual
   combined <- dplyr::select(individuals, -time, -pop_id) %>%
     dplyr::right_join(nodes, by = "ind_id") %>%
-    dplyr::rename(pop = pop.y, time_tskit = time_tskit.x)
+    dplyr::rename(pop = pop.y, time_tskit = time_tskit.y)
 
   # for tree sequences which have some individuals/nodes marked as 'sampled', mark the
   # nodes which do not have individuals assigned to them as FALSE

diff --git a/docs/news/index.html b/docs/news/index.html
diff --git a/tests/testthat/manual_ts_nodes+edges+inds+pops+muts.py b/tests/testthat/manual_ts_nodes+edges+inds+pops+muts.py
@@ -0,0 +1,49 @@
+import tskit
+import numpy
+import tempfile
+
+tables = tskit.TableCollection(sequence_length=1e5)
+
+node_table = tables.nodes
+node_table.add_row(flags=tskit.NODE_IS_SAMPLE, individual=0, population=0)  # node 0
+node_table.add_row(flags=tskit.NODE_IS_SAMPLE, individual=0, population=0)  # node 1
+node_table.add_row(time=3, population=1)                      # node 2
+node_table.add_row(flags=tskit.NODE_IS_SAMPLE, individual=1, population=2)  # node 3
+node_table.add_row(flags=tskit.NODE_IS_SAMPLE, individual=1, population=2)  # node 4
+node_table.add_row(time=7, population=2)                      # node 5
+node_table.add_row(time=10, population=1)                     # node 6
+node_table
+
+edge_table = tables.edges
+edge_table.set_columns(
+    left=numpy.array([0, 0, 0, 0, 0, 0]),
+    right=numpy.array([1e5, 1e5, 1e5, 1e5, 1e5, 1e5]),
+    parent=numpy.array([2, 2, 5, 5, 6, 6], dtype=numpy.int32),
+    child=numpy.array([0, 1, 3, 4, 2, 5], dtype=numpy.int32)
+)
+edge_table
+
+ind_table = tables.individuals
+ind_table.add_row()
+ind_table.add_row()
+ind_table.add_row()
+ind_table.add_row()
+
+pop_table = tables.populations
+pop_table.add_row()
+pop_table.add_row()
+pop_table.add_row()
+
+sites_table = tables.sites
+sites_table.add_row(position=42, ancestral_state="0")
+sites_table.add_row(position=123, ancestral_state="0")
+
+mutations_table = tables.mutations
+mutations_table.add_row(site=0, node=2, derived_state="1")
+mutations_table.add_row(site=1, node=5, derived_state="1")
+
+tseq = tables.tree_sequence()
+
+filename = tempfile.NamedTemporaryFile().name
+
+tseq.dump(filename)
diff --git a/tests/testthat/manual_ts_nodes+edges+inds+pops.py b/tests/testthat/manual_ts_nodes+edges+inds+pops.py
@@ -0,0 +1,41 @@
+import tskit
+import numpy
+import tempfile
+
+tables = tskit.TableCollection(sequence_length=1e5)
+
+node_table = tables.nodes
+node_table.add_row(flags=tskit.NODE_IS_SAMPLE, individual=0, population=0)  # node 0
+node_table.add_row(flags=tskit.NODE_IS_SAMPLE, individual=0, population=0)  # node 1
+node_table.add_row(time=3, population=1)                      # node 2
+node_table.add_row(flags=tskit.NODE_IS_SAMPLE, individual=1, population=2)  # node 3
+node_table.add_row(flags=tskit.NODE_IS_SAMPLE, individual=1, population=2)  # node 4
+node_table.add_row(time=7, population=2)                      # node 5
+node_table.add_row(time=10, population=1)                     # node 6
+node_table
+
+edge_table = tables.edges
+edge_table.set_columns(
+    left=numpy.array([0, 0, 0, 0, 0, 0]),
+    right=numpy.array([1e5, 1e5, 1e5, 1e5, 1e5, 1e5]),
+    parent=numpy.array([2, 2, 5, 5, 6, 6], dtype=numpy.int32),
+    child=numpy.array([0, 1, 3, 4, 2, 5], dtype=numpy.int32)
+)
+edge_table
+
+ind_table = tables.individuals
+ind_table.add_row()
+ind_table.add_row()
+ind_table.add_row()
+ind_table.add_row()
+
+pop_table = tables.populations
+pop_table.add_row()
+pop_table.add_row()
+pop_table.add_row()
+
+tseq = tables.tree_sequence()
+
+filename = tempfile.NamedTemporaryFile().name
+
+tseq.dump(filename)
diff --git a/tests/testthat/manual_ts_nodes+edges+inds.py b/tests/testthat/manual_ts_nodes+edges+inds.py
@@ -0,0 +1,36 @@
+import tskit
+import numpy
+import tempfile
+
+tables = tskit.TableCollection(sequence_length=1e5)
+
+node_table = tables.nodes
+node_table.add_row(flags=tskit.NODE_IS_SAMPLE, individual=0)  # node 0
+node_table.add_row(flags=tskit.NODE_IS_SAMPLE, individual=0)  # node 1
+node_table.add_row(time=3)                      # node 2
+node_table.add_row(flags=tskit.NODE_IS_SAMPLE, individual=1)  # node 3
+node_table.add_row(flags=tskit.NODE_IS_SAMPLE, individual=1)  # node 4
+node_table.add_row(time=7)                      # node 5
+node_table.add_row(time=10)                     # node 6
+node_table
+
+edge_table = tables.edges
+edge_table.set_columns(
+    left=numpy.array([0, 0, 0, 0, 0, 0]),
+    right=numpy.array([1e5, 1e5, 1e5, 1e5, 1e5, 1e5]),
+    parent=numpy.array([2, 2, 5, 5, 6, 6], dtype=numpy.int32),
+    child=numpy.array([0, 1, 3, 4, 2, 5], dtype=numpy.int32)
+)
+edge_table
+
+ind_table = tables.individuals
+ind_table.add_row()
+ind_table.add_row()
+ind_table.add_row()
+ind_table.add_row()
+
+tseq = tables.tree_sequence()
+
+filename = tempfile.NamedTemporaryFile().name
+
+tseq.dump(filename)
diff --git a/tests/testthat/manual_ts_nodes+edges.py b/tests/testthat/manual_ts_nodes+edges.py
@@ -0,0 +1,30 @@
+import tskit
+import numpy
+import tempfile
+
+tables = tskit.TableCollection(sequence_length=1e5)
+
+node_table = tables.nodes
+node_table.add_row(flags=tskit.NODE_IS_SAMPLE)  # node 0
+node_table.add_row(flags=tskit.NODE_IS_SAMPLE)  # node 1
+node_table.add_row(time=3)                      # node 2
+node_table.add_row(flags=tskit.NODE_IS_SAMPLE)  # node 3
+node_table.add_row(flags=tskit.NODE_IS_SAMPLE)  # node 4
+node_table.add_row(time=7)                      # node 5
+node_table.add_row(time=10)                     # node 6
+node_table
+
+edge_table = tables.edges
+edge_table.set_columns(
+    left=numpy.array([0, 0, 0, 0, 0, 0]),
+    right=numpy.array([1e5, 1e5, 1e5, 1e5, 1e5, 1e5]),
+    parent=numpy.array([2, 2, 5, 5, 6, 6], dtype=numpy.int32),
+    child=numpy.array([0, 1, 3, 4, 2, 5], dtype=numpy.int32)
+)
+edge_table
+
+tseq = tables.tree_sequence()
+
+filename = tempfile.NamedTemporaryFile().name
+
+tseq.dump(filename)
diff --git a/tests/testthat/test-manual-ts.R b/tests/testthat/test-manual-ts.R
@@ -0,0 +1,109 @@
+test_that("minimal tree sequence (nodes+edges) is correctly loaded", {
+  reticulate::py_run_file("manual_ts_nodes+edges.py")
+
+  path <- reticulate::py$filename
+  ts <- ts_load(path)
+
+  # ts_tree(ts, 1) %>% ts_draw()
+
+  edges <- ts_table(ts, "edges")
+  expect_true(all(edges$child == c(0, 1, 3, 4, 2, 5)))
+  expect_true(all(edges$parent == c(2, 2, 5, 5, 6, 6)))
+
+  expect_true(all(ts_table(ts, "nodes")$node_id == seq(0, 6)))
+  expect_true(all(ts_table(ts, "nodes")$time_tskit == c(0, 0, 3, 0, 0, 7, 10)))
+
+  expect_true(all(is.na(ts_table(ts, "nodes")$pop_id)))
+  expect_true(all(is.na(ts_nodes(ts)$pop_id)))
+
+  # check the annotated nodes table
+  expect_true(all(ts_nodes(ts)$node_id == seq(0, 6)))
+  expect_true(all(ts_nodes(ts)$time_tskit == c(0, 0, 3, 0, 0, 7, 10)))
+
+  # check the annotated edges table
+  expect_true(all(ts_edges(ts)$child_node_id == c(0, 1, 2, 3, 4, 5)))
+  expect_true(all(ts_edges(ts)$parent_node_id == c(2, 2, 6, 5, 5, 6)))
+})
+
+test_that("minimal tree sequence (nodes+edges+inds) is correctly loaded", {
+  reticulate::py_run_file("manual_ts_nodes+edges+inds.py")
+
+  path <- reticulate::py$filename
+  ts <- ts_load(path)
+
+  # ts_tree(ts, 1) %>% ts_draw()
+
+  edges <- ts_table(ts, "edges")
+  expect_true(all(edges$child == c(0, 1, 3, 4, 2, 5)))
+  expect_true(all(edges$parent == c(2, 2, 5, 5, 6, 6)))
+
+  expect_true(all(ts_table(ts, "nodes")$node_id == seq(0, 6)))
+  expect_true(all(ts_table(ts, "nodes")$time_tskit == c(0, 0, 3, 0, 0, 7, 10)))
+
+  expect_true(all(is.na(ts_table(ts, "nodes")$pop_id)))
+  expect_true(all(is.na(ts_nodes(ts)$pop_id)))
+
+  # check the annotated nodes table
+  expect_true(all(ts_nodes(ts)$node_id == c(0, 1, 3, 4, 2, 5, 6)))
+  expect_true(all(ts_nodes(ts)$time_tskit == c(0, 0, 0, 0, 3, 7, 10)))
+
+  # check the annotated edges table
+  expect_true(all(ts_edges(ts)$child_node_id == c(0, 1, 3, 4, 2, 5)))
+  expect_true(all(ts_edges(ts)$parent_node_id == c(2, 2, 5, 5, 6, 6)))
+})
+
+test_that("minimal tree sequence (nodes+edges+inds+pops) is correctly loaded", {
+  reticulate::py_run_file("manual_ts_nodes+edges+inds+pops.py")
+
+  path <- reticulate::py$filename
+  ts <- ts_load(path)
+
+  # ts_tree(ts, 1) %>% ts_draw()
+
+  edges <- ts_table(ts, "edges")
+  expect_true(all(edges$child == c(0, 1, 3, 4, 2, 5)))
+  expect_true(all(edges$parent == c(2, 2, 5, 5, 6, 6)))
+
+  expect_true(all(ts_table(ts, "nodes")$node_id == seq(0, 6)))
+  expect_true(all(ts_table(ts, "nodes")$time_tskit == c(0, 0, 3, 0, 0, 7, 10)))
+
+  # check the annotated nodes table
+  expect_true(all(ts_nodes(ts)$node_id == c(0, 1, 3, 4, 2, 5, 6)))
+  expect_true(all(ts_nodes(ts)$time_tskit == c(0, 0, 0, 0, 3, 7, 10)))
+  expect_true(all(ts_nodes(ts)$pop_id == c(0, 0, 2, 2, 1, 2, 1)))
+
+  # check the annotated edges table
+  expect_true(all(ts_edges(ts)$child_node_id == c(0, 1, 3, 4, 2, 5)))
+  expect_true(all(ts_edges(ts)$parent_node_id == c(2, 2, 5, 5, 6, 6)))
+  expect_true(all(ts_edges(ts)$child_pop == c(0, 0, 2, 2, 1, 2)))
+  expect_true(all(ts_edges(ts)$parent_pop == c(1, 1, 2, 2, 1, 1)))
+})
+
+test_that("minimal tree sequence (nodes+edges+inds+pops+muts) is correctly loaded", {
+  reticulate::py_run_file("manual_ts_nodes+edges+inds+pops+muts.py")
+
+  path <- reticulate::py$filename
+  ts <- ts_load(path)
+
+  # ts_tree(ts, 1) %>% ts_draw()
+
+  edges <- ts_table(ts, "edges")
+  expect_true(all(edges$child == c(0, 1, 3, 4, 2, 5)))
+  expect_true(all(edges$parent == c(2, 2, 5, 5, 6, 6)))
+
+  expect_true(all(ts_table(ts, "nodes")$node_id == seq(0, 6)))
+  expect_true(all(ts_table(ts, "nodes")$time_tskit == c(0, 0, 3, 0, 0, 7, 10)))
+
+  expect_true(all(ts_table(ts, "mutations")$node == c(2, 5)))
+
+  # check the annotated nodes table
+  expect_true(all(ts_nodes(ts)$node_id == c(0, 1, 3, 4, 2, 5, 6)))
+  expect_true(all(ts_nodes(ts)$time_tskit == c(0, 0, 0, 0, 3, 7, 10)))
+  expect_true(all(ts_nodes(ts)$pop_id == c(0, 0, 2, 2, 1, 2, 1)))
+
+  # check the annotated edges table
+  expect_true(all(ts_edges(ts)$child_node_id == c(0, 1, 3, 4, 2, 5)))
+  expect_true(all(ts_edges(ts)$parent_node_id == c(2, 2, 5, 5, 6, 6)))
+  expect_true(all(ts_edges(ts)$child_pop == c(0, 0, 2, 2, 1, 2)))
+  expect_true(all(ts_edges(ts)$parent_pop == c(1, 1, 2, 2, 1, 1)))
+})
diff --git a/tests/testthat/test-trees.R b/tests/testthat/test-trees.R
@@ -95,6 +95,10 @@ test_that("ape phylo and tskit.Tree objects are equivalent (SLiM)", {
   t2_nodes <- unique(as.vector(t2$edge))
 
   expect_true(length(t1_nodes) == length(t2_nodes))
+
+  # even the recapitated nodes should have numerical time_tskit values
+  # (after the fix of the join operation which caused NA to be introduced)
+  expect_true(all(!is.na(ts_nodes(t2)$time_tskit)))
 })
 
 

diff --git a/tests/testthat/test-ts.R b/tests/testthat/test-ts.R
@@ -231,7 +231,7 @@ test_that("simplification retains only specified samples (SLiM)", {
   expect_true(all(df_data$time == df_samples$time))
   expect_true(all(df_data$pop == df_samples$pop))
 
-  suppressWarnings(ts2 <- ts_load(model, file = msprime_ts, recapitate = TRUE, recombination_rate = 0, Ne = 1))
+  suppressWarnings(ts2 <- ts_load(model, file = slim_ts, recapitate = TRUE, recombination_rate = 0, Ne = 1))
   simplify_to <- sample(ts_samples(ts2)$name, 10)
 
   ts2 <- ts_simplify(ts2, simplify_to = simplify_to)