# 1. Parameters

In [1]:
simulation_dir <- "simulations"

coverage_cases <- c(5, 10, 20, 30, 40, 50)
coverage_paths <- sapply(coverage_cases, function(x){file.path(simulation_dir, sprintf("cov-%s", x))})
coverage_read_trees <- sapply(coverage_paths, function(x){file.path(x, "index-reads", "reads.tre")})
coverage_assembly_trees <- sapply(coverage_paths, function(x){file.path(x, "index-assemblies", "assemblies.tre")})

names(coverage_read_trees) <- coverage_cases
names(coverage_assembly_trees) <- coverage_cases

initial_tree_file <- "input/salmonella.tre"
trees_table_file <- file.path(simulation_dir, "tree-comparisons.tsv")

# 2. Compare trees

## 2.1. Load trees

In [2]:
library(ape)

read_and_root_tree <- function(file, keep.multi) {
    tree <- ape::read.tree(file=file, keep.multi=keep.multi)
    tree <- ape::root(tree, "reference", resolve.root=TRUE)
    
    return(tree)
}

t_orig <- read_and_root_tree(file=initial_tree_file, keep.multi=FALSE)

t_reads <- lapply(coverage_read_trees, function(x){read_and_root_tree(file=x, keep.multi=TRUE)})
t_reads <- do.call(c, unlist(t_reads, recursive=FALSE))
t_assemblies <- lapply(coverage_assembly_trees, function(x){read_and_root_tree(file=x, keep.multi=TRUE)})
t_assemblies <- do.call(c, unlist(t_assemblies, recursive=FALSE))

## 2.2. Distances between trees

In [3]:
library(treespace)

reference_other_distances <- function(reference_tree, other_trees, dist_func) {
    distances <- c()
    labels <- names(other_trees)
    for(label in labels) {
        tree <- other_trees[[label]]
        distance <- dist_func(reference_tree, tree, normalize=TRUE, rooted=TRUE)
        distances <- c(distances, distance)
    }
    
    return(distances)
}

tree_distances_reference <- function(reference_tree, read_trees, assembly_trees, label_name) {
    types <- c(rep("reads", each=length(read_trees)), rep("assembly", each=length(assembly_trees)))
    labels <- c(names(read_trees), names(assembly_trees))
    lambda <- 0.5
    distances = c(treespace::refTreeDist(reference_tree, read_trees, lambda=lambda),
                 treespace::refTreeDist(reference_tree, assembly_trees, lambda=lambda))
    
    rfn_distance_reads <- reference_other_distances(t_orig, read_trees, phangorn::RF.dist)
    rfn_distance_assemblies <- reference_other_distances(t_orig, assembly_trees, phangorn::RF.dist)
    rfn_distances <- c(rfn_distance_reads, rfn_distance_assemblies)
    
    wrfn_distance_reads <- reference_other_distances(t_orig, read_trees, phangorn::wRF.dist)
    wrfn_distance_assemblies <- reference_other_distances(t_orig, assembly_trees, phangorn::wRF.dist)
    wrfn_distances <- c(rfn_distance_reads, rfn_distance_assemblies)
    
    df <- data.frame(
               Types = types,
               Coverage = labels,
               RFN_Distance = rfn_distances,
               wRFN_Distance = wrfn_distances,
               KC_Distance = distances)
    
    return(df)
}

tree_distances_df <- tree_distances_reference(t_orig, t_reads, t_assemblies, "Coverage")
tree_distances_df["Coverage"] <- as.numeric(tree_distances_df$Coverage)
tree_distances_df

Loading required package: ade4

code for methods in class “Rcpp_SpExtent” was not checked for suspicious field assignments (recommended package ‘codetools’ not available?)

code for methods in class “Rcpp_SpExtent” was not checked for suspicious field assignments (recommended package ‘codetools’ not available?)

code for methods in class “Rcpp_SpPoly” was not checked for suspicious field assignments (recommended package ‘codetools’ not available?)

code for methods in class “Rcpp_SpPoly” was not checked for suspicious field assignments (recommended package ‘codetools’ not available?)

code for methods in class “Rcpp_SpPolyPart” was not checked for suspicious field assignments (recommended package ‘codetools’ not available?)

code for methods in class “Rcpp_SpPolyPart” was not checked for suspicious field assignments (recommended package ‘codetools’ not available?)

code for methods in class “Rcpp_SpPolygons” was not checked for suspicious field assignments (recommended package ‘codetoo

Types,Coverage,RFN_Distance,wRFN_Distance,KC_Distance
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
reads,5,0.9655172,0.9655172,249.91242
reads,10,0.7241379,0.7241379,79.78536
reads,20,0.6724138,0.6724138,86.85927
reads,30,0.7758621,0.7758621,86.04379
reads,40,0.7586207,0.7586207,101.09099
reads,50,0.7758621,0.7758621,94.77731
assembly,5,0.6724138,0.6724138,74.5591
assembly,10,0.7068966,0.7068966,71.47704
assembly,20,0.7241379,0.7241379,77.11151
assembly,30,0.7068966,0.7068966,62.74741


In [4]:
write.table(tree_distances_df, trees_table_file, append = FALSE, sep = "\t", dec = ".",
            row.names = FALSE, col.names = TRUE)

# 3. Visualize trees

In [5]:
names(t_reads)

In [6]:
library(phytools)

cov <- "20"
rfn_distance <- tree_distances_df[tree_distances_df["Coverage"] == strtoi(cov) & tree_distances_df["Types"] == "reads", "RFN_Distance"]
kc_distance <- tree_distances_df[tree_distances_df["Coverage"] == strtoi(cov) & tree_distances_df["Types"] == "reads", "KC_Distance"]

pdf(file = "figures/original-reads-tree-20.pdf")
options(repr.plot.width=8, repr.plot.height=10)
par(mar = c(1,1,5,1), oma=c(1,2,3,2))
plot(phytools::cophylo(t_orig, t_reads[[cov]], rotate=TRUE),
     fsize=0.5, scale.bar=c(0.1,0.01),
     link.type="curved", link.col="#777777", link.lwd=1.5, link.lty=1)
title(main=sprintf("Comparison of original tree to tree constructed from reads\n (cov=%s, NRF=%0.2f, KC=%0.1f)",
                   cov, rfn_distance, kc_distance), outer=TRUE)
mtext("Original tree", side=2, cex=1.3)
mtext("Constructed tree (reads)", side=4, cex=1.3)

dev.off()

Loading required package: maps



Rotating nodes to optimize matching...
Done.


In [7]:
# There is no applicable coverage value for assemblies, but I did re-build the assemblies trees
# For every coverage value for the reads. So I'm just picking the best-scoring (NRF) tree
cov <- "5"

rfn_distance <- tree_distances_df[tree_distances_df["Coverage"] == strtoi(cov) & tree_distances_df["Types"] == "assembly", "RFN_Distance"]
kc_distance <- tree_distances_df[tree_distances_df["Coverage"] == strtoi(cov) & tree_distances_df["Types"] == "assembly", "KC_Distance"]

pdf(file = "figures/original-assemblies-tree.pdf")
options(repr.plot.width=8, repr.plot.height=10)
par(mar = c(1,1,5,1), oma=c(1,2,3,2))
plot(phytools::cophylo(t_orig, t_assemblies[[cov]], rotate=TRUE),
     fsize=0.5, scale.bar=c(0.1,0.01), link.lty=1,
     link.type="curved", link.col="#777777", link.lwd=1.5)
title(main=sprintf("Comparison of original tree to tree constructed from an assembly\n (NRF=%0.2f, KC=%0.1f)",
                   rfn_distance, kc_distance), outer=TRUE)
mtext("Original tree", side=2, cex=1.3)
mtext("Constructed tree (assembly)", side=4, cex=1.3)

dev.off()

Rotating nodes to optimize matching...
Done.
