# 1. Parameters

In [1]:
simulation_dir <- "simulations"

simulation_dirs <- list.files(pattern='^(cov|alpha)', simulation_dir)

# Remove this case since it failed to build trees
simulation_dirs <- simulation_dirs[sapply(simulation_dirs, function(x){x != 'alpha-10-cov-30'})]

simulation_paths <- sapply(simulation_dirs, function(x){file.path(simulation_dir, x)})
simulation_read_trees <- sapply(simulation_paths, function(x){file.path(x, "index-reads", "reads.tre")})
simulation_assembly_trees <- sapply(simulation_paths, function(x){file.path(x, "index-assemblies", "assemblies.tre")})

names(simulation_read_trees) <- simulation_dirs
names(simulation_assembly_trees) <- simulation_dirs

initial_tree_file <- "input/salmonella.tre"
trees_table_file <- file.path(simulation_dir, "tree-comparisons.tsv")

In [2]:
simulation_read_trees

# 2. Compare trees

## 2.1. Load trees

In [3]:
library(ape)

read_and_root_tree <- function(file, keep.multi) {
    tree <- ape::read.tree(file=file, keep.multi=keep.multi)
    tree <- ape::root(tree, "reference", resolve.root=TRUE)
    
    return(tree)
}

t_orig <- read_and_root_tree(file=initial_tree_file, keep.multi=FALSE)

t_reads <- lapply(simulation_read_trees, function(x){read_and_root_tree(file=x, keep.multi=TRUE)})
t_reads <- do.call(c, unlist(t_reads, recursive=FALSE))
t_assemblies <- lapply(simulation_assembly_trees, function(x){read_and_root_tree(file=x, keep.multi=TRUE)})
t_assemblies <- do.call(c, unlist(t_assemblies, recursive=FALSE))

## 2.2. Distances between trees

In [4]:
library(treespace)

reference_other_distances <- function(reference_tree, other_trees, dist_func) {
    distances <- c()
    labels <- names(other_trees)
    for(label in labels) {
        tree <- other_trees[[label]]
        distance <- dist_func(reference_tree, tree, normalize=TRUE, rooted=TRUE)
        distances <- c(distances, distance)
    }
    
    return(distances)
}

tree_distances_reference <- function(reference_tree, read_trees, assembly_trees) {
    types <- c(rep("reads", each=length(read_trees)), rep("assembly", each=length(assembly_trees)))
    labels <- c(names(read_trees), names(assembly_trees))
    lambda <- 0.5
    distances = c(treespace::refTreeDist(reference_tree, read_trees, lambda=lambda),
                 treespace::refTreeDist(reference_tree, assembly_trees, lambda=lambda))
    
    rfn_distance_reads <- reference_other_distances(t_orig, read_trees, phangorn::RF.dist)
    rfn_distance_assemblies <- reference_other_distances(t_orig, assembly_trees, phangorn::RF.dist)
    rfn_distances <- c(rfn_distance_reads, rfn_distance_assemblies)
    
    wrfn_distance_reads <- reference_other_distances(t_orig, read_trees, phangorn::wRF.dist)
    wrfn_distance_assemblies <- reference_other_distances(t_orig, assembly_trees, phangorn::wRF.dist)
    wrfn_distances <- c(rfn_distance_reads, rfn_distance_assemblies)
    
    df <- data.frame(
               Type = types,
               Simulation = labels,
               RFN_Distance = rfn_distances,
               wRFN_Distance = wrfn_distances,
               KC_Distance = distances)
    
    return(df)
}

tree_distances_df <- tree_distances_reference(t_orig, t_reads, t_assemblies)
# tree_distances_df["Coverage"] <- as.numeric(tree_distances_df$Coverage)
tree_distances_df

Loading required package: ade4

code for methods in class “Rcpp_SpExtent” was not checked for suspicious field assignments (recommended package ‘codetools’ not available?)

code for methods in class “Rcpp_SpExtent” was not checked for suspicious field assignments (recommended package ‘codetools’ not available?)

code for methods in class “Rcpp_SpPoly” was not checked for suspicious field assignments (recommended package ‘codetools’ not available?)

code for methods in class “Rcpp_SpPoly” was not checked for suspicious field assignments (recommended package ‘codetools’ not available?)

code for methods in class “Rcpp_SpPolyPart” was not checked for suspicious field assignments (recommended package ‘codetools’ not available?)

code for methods in class “Rcpp_SpPolyPart” was not checked for suspicious field assignments (recommended package ‘codetools’ not available?)

code for methods in class “Rcpp_SpPolygons” was not checked for suspicious field assignments (recommended package ‘codetoo

Type,Simulation,RFN_Distance,wRFN_Distance,KC_Distance
<chr>,<chr>,<dbl>,<dbl>,<dbl>
reads,alpha-0.5-cov-30,0.7241379,0.7241379,82.28627
reads,alpha-1.0-cov-30,0.7413793,0.7413793,86.4074
reads,alpha-2.0-cov-30,0.7586207,0.7586207,71.60713
reads,alpha-5.0-cov-30,0.7413793,0.7413793,85.3172
reads,cov-10,0.7931034,0.7931034,79.86547
reads,cov-20,0.7413793,0.7413793,83.07946
reads,cov-30,0.7413793,0.7413793,72.87955
reads,cov-40,0.7413793,0.7413793,92.0451
reads,cov-5,0.9827586,0.9827586,144.96241
reads,cov-50,0.7241379,0.7241379,74.60191


In [5]:
write.table(tree_distances_df, trees_table_file, append = FALSE, sep = "\t", dec = ".",
            row.names = FALSE, col.names = TRUE)

# 3. Visualize trees

In [6]:
names(t_reads)

In [7]:
library(phytools)

simulation <- "cov-30"
label <- sprintf("cov=30, alpha=0.2")
rfn_distance <- tree_distances_df[tree_distances_df["Simulation"] == simulation & tree_distances_df["Type"] == "reads", "RFN_Distance"]
kc_distance <- tree_distances_df[tree_distances_df["Simulation"] == simulation & tree_distances_df["Type"] == "reads", "KC_Distance"]

pdf(file = sprintf("figures/original-reads-tree-%s.pdf", label))
options(repr.plot.width=8, repr.plot.height=10)
par(mar = c(1,1,5,1), oma=c(1,2,3,2))
plot(phytools::cophylo(t_orig, t_reads[[simulation]], rotate=TRUE),
     fsize=0.5, scale.bar=c(0.1,0.01),
     link.type="curved", link.col="#777777", link.lwd=1.5, link.lty=1)
title(main=sprintf("Comparison of original tree to tree constructed from reads\n (%s, NRF=%0.2f, KC=%0.1f)",
                   label, rfn_distance, kc_distance), outer=TRUE)
mtext("Original tree", side=2, cex=1.3)
mtext(sprintf("Constructed tree (reads, %s)", label), side=4, cex=1.3)

dev.off()

Loading required package: maps



Rotating nodes to optimize matching...
Done.


In [8]:
# There is no applicable coverage value for assemblies, but I did re-build the assemblies trees
# For every coverage value for the reads. So I'm just picking the best-scoring (NRF) tree
simulation <- "cov-5"
label <- sprintf("alpha=0.2")

rfn_distance <- tree_distances_df[tree_distances_df["Simulation"] == simulation & tree_distances_df["Type"] == "assembly", "RFN_Distance"]
kc_distance <- tree_distances_df[tree_distances_df["Simulation"] == simulation & tree_distances_df["Type"] == "assembly", "KC_Distance"]

pdf(file = sprintf("figures/original-assemblies-tree-%s.pdf", label))
options(repr.plot.width=8, repr.plot.height=10)
par(mar = c(1,1,5,1), oma=c(1,2,3,2))
plot(phytools::cophylo(t_orig, t_assemblies[[simulation]], rotate=TRUE),
     fsize=0.5, scale.bar=c(0.1,0.01), link.lty=1,
     link.type="curved", link.col="#777777", link.lwd=1.5)
title(main=sprintf("Comparison of original tree to tree constructed from an assembly\n (%s, NRF=%0.2f, KC=%0.1f)",
                   label, rfn_distance, kc_distance), outer=TRUE)
mtext("Original tree", side=2, cex=1.3)
mtext(sprintf("Constructed tree (assembly, %s)", label), side=4, cex=1.3)

dev.off()

Rotating nodes to optimize matching...
Done.


In [9]:
simulation <- "alpha-5.0-cov-30"
label <- sprintf("cov=30, alpha=5.0")
rfn_distance <- tree_distances_df[tree_distances_df["Simulation"] == simulation & tree_distances_df["Type"] == "reads", "RFN_Distance"]
kc_distance <- tree_distances_df[tree_distances_df["Simulation"] == simulation & tree_distances_df["Type"] == "reads", "KC_Distance"]

pdf(file = sprintf("figures/original-reads-tree-%s.pdf", label))
options(repr.plot.width=8, repr.plot.height=10)
par(mar = c(1,1,5,1), oma=c(1,2,3,2))
plot(phytools::cophylo(t_orig, t_reads[[simulation]], rotate=TRUE),
     fsize=0.5, scale.bar=c(0.1,0.01),
     link.type="curved", link.col="#777777", link.lwd=1.5, link.lty=1)
title(main=sprintf("Comparison of original tree to tree constructed from reads\n (%s, NRF=%0.2f, KC=%0.1f)",
                   label, rfn_distance, kc_distance), outer=TRUE)
mtext("Original tree", side=2, cex=1.3)
mtext(sprintf("Constructed tree (reads, %s)", label), side=4, cex=1.3)

dev.off()

Rotating nodes to optimize matching...
Done.
