# 1. Parameters

In [1]:
# Defaults
#
# Default values for every tunable input of this notebook. All later cells
# read these variables by name, so the names must stay as they are.

## Random seed
# Passed to set.seed() before any simulation step is run.
random_seed <- 25524

## Directories and input files
# Root directory for this simulation run; all outputs are written beneath
# file.path(simulation_dir, "simulated_data").
simulation_dir <- "simulations/unset"
# Reference genome (FASTA), loaded with read_fasta().
reference_file <- "simulations/reference/reference.fa.gz"
# Tree loaded with read.tree(); it is later rooted on the tip named "reference".
initial_tree_file <- "input/salmonella.tre"

## Simulation parameters
# NOTE(review): sub_lambda is not referenced by any visible cell of this
# notebook -- confirm whether it is still needed.
sub_lambda <- 1e-2
# The sub_* values below are forwarded to sub_HKY85() as pi_tcag, alpha, beta,
# mu and invariant respectively.
# Presumably nucleotide frequencies in T, C, A, G order (going by the argument
# name) -- confirm against the jackalope documentation.
sub_pi_tcag <- c(0.1, 0.2, 0.3, 0.4)
sub_alpha <- 0.2
# beta is tied to alpha: changing sub_alpha above also changes sub_beta.
sub_beta <- sub_alpha/2
sub_mu <- 1
sub_invariant <- 0.3
# Insertion model: forwarded to indels() as rate, max_length and a.
ins_rate <- 1e-4
ins_max_length <- 60
ins_a <- 1.6
# Deletion model: forwarded to indels() as rate, max_length and a.
del_rate <- 1e-4
del_max_length <- 60
del_a <- 1.6

## Read simulation information
# Used together to compute the total number of reads:
#   reference length * read_coverage * number of samples / read_length
read_coverage <- 30
# Also passed to illumina() as read_length and used to derive the mean
# fragment size (read_length * 2 + 50).
read_length <- 250

## Other
# Passed as n_threads to write_fasta() and illumina().
ncores <- 48

In [2]:
# Seed the RNG first so every later stochastic step is reproducible.
set.seed(random_seed)

# All outputs live under <simulation_dir>/simulated_data.
output_dir <- file.path(simulation_dir, "simulated_data")

# Prefixes (not complete file names) handed to the writer functions below.
output_vcf_prefix <- file.path(output_dir, "haplotypes")
reads_data_initial_prefix <- file.path(output_dir, "reads_initial", "data")

# Echo the resolved locations into the notebook output.
print(output_dir)
print(output_vcf_prefix)

[1] "simulations/unset/simulated_data"
[1] "simulations/unset/simulated_data/haplotypes"


# 2. Generate simulated data

This simulates *Salmonella* data using a reference genome and a tree.

In [3]:
library(jackalope)

# Make sure we've compiled with openmp.
# using_openmp() is reached through `:::`, i.e. it is an internal (non-exported)
# jackalope function; its result is displayed as the cell output.
# NOTE(review): presumably this matters because later cells request multiple
# threads via n_threads -- confirm.
jackalope:::using_openmp()

In [4]:
# Load the reference genome from the FASTA file named in the parameters cell.
reference <- read_fasta(reference_file)
# Total reference length: sum of the per-chromosome sizes. Used later to work
# out how many reads are needed for the requested coverage.
reference_len <- sum(reference$sizes())
# Display the reference object as the cell output.
reference

< Set of 2 chromosomes >
# Total size: 19,699 bp
  name                             chromosome                             length
chrom0     GTATTGCGAGGGTGGGGGAGTACGGCAG...CAAGAATGCAACCGACTCTGTCGGGACT     10834
chrom1     CACCTCGAGCAGATCCTAGTTTGCGCGA...TGCGAATTTATCGCCTAATCCAATGTTA      8865

In [5]:
library(ape)

# Read the input tree from initial_tree_file.
tree <- read.tree(initial_tree_file)
# Re-root the tree on the tip labelled "reference"; resolve.root=TRUE is passed
# so that the root is resolved (see ape::root for the exact semantics).
tree <- root(tree, "reference", resolve.root=TRUE)
# Display the tree summary as the cell output.
tree


Phylogenetic tree with 60 tips and 59 internal nodes.

Tip labels:
  reference, SH12-013, SH12-014, SH10-015, SH10-014, SH11-002, ...
Node labels:
  Root, 1.000000, 0.000000, 0.000000, 1.000000, 0.000000, ...

Rooted; includes branch lengths.

In [6]:
# Substitution model (HKY85), configured from the sub_* parameters.
# gamma_shape and gamma_k are fixed here rather than exposed as parameters.
sub <- sub_HKY85(
    pi_tcag = sub_pi_tcag,
    alpha = sub_alpha,
    beta = sub_beta,
    mu = sub_mu,
    gamma_shape = 1,
    gamma_k = 5,
    invariant = sub_invariant
)

# Insertion and deletion models share the same constructor; only the
# parameter set differs.
ins <- indels(
    rate = ins_rate,
    max_length = ins_max_length,
    a = ins_a
)
del <- indels(
    rate = del_rate,
    max_length = del_max_length,
    a = del_a
)

# Evolve the reference along the rooted tree with the three models above.
ref_haplotypes <- create_haplotypes(
    reference,
    haps_phylo(tree),
    sub = sub,
    ins = ins,
    del = del
)

# Display the haplotypes summary as the cell output.
ref_haplotypes

                              << haplotypes object >>
# Haplotypes: 60
# Mutations: 16,932

                          << Reference genome info: >>
< Set of 2 chromosomes >
# Total size: 19,699 bp
  name                             chromosome                             length
chrom0     GTATTGCGAGGGTGGGGGAGTACGGCAG...CAAGAATGCAACCGACTCTGTCGGGACT     10834
chrom1     CACCTCGAGCAGATCCTAGTTTGCGCGA...TGCGAATTTATCGCCTAATCCAATGTTA      8865

# 3. Write simulated data

In [7]:
# Write the simulated haplotypes as a VCF using output_vcf_prefix as the file
# prefix; compress=TRUE requests compressed output.
# NOTE(review): no overwrite argument is passed here, unlike the write_fasta()
# call later in this notebook -- confirm how a rerun over existing output
# should behave.
write_vcf(ref_haplotypes, out_prefix=output_vcf_prefix, compress=TRUE)

In [8]:
# Prefix for the per-haplotype assembly FASTA files.
assemblies_prefix <- file.path(output_dir, "assemblies", "data")

# Write the haplotype sequences as compressed FASTA, replacing any files left
# over from a previous run.
write_fasta(
    ref_haplotypes,
    out_prefix = assemblies_prefix,
    compress = TRUE,
    n_threads = ncores,
    overwrite = TRUE
)

In [9]:
# Number of samples = number of tips in the tree. The element is spelled out
# in full (tip.label) instead of relying on `$` partial matching of `tip`.
n_samples <- length(tree$tip.label)

# Total number of reads, summed over all samples, needed to reach
# read_coverage with reads of read_length bases:
#   reference length * coverage * samples / read length
n_reads <- round((reference_len * read_coverage * n_samples) / read_length)
print(sprintf("Number of reads for coverage %sX and read length %s over %s samples with respect to reference with length %s: %s", 
              read_coverage, read_length, n_samples, reference_len, n_reads))

# Simulate paired-end Illumina reads, one set of files per haplotype
# (sep_files=TRUE), written as bgzip-compressed FASTQ under
# reads_data_initial_prefix. The mean fragment size is two read lengths plus
# 50 bases, with a standard deviation of 100.
# NOTE(review): n_reads is the total passed for all haplotypes; how illumina()
# splits it between haplotypes is not visible here -- confirm.
illumina(ref_haplotypes, out_prefix = reads_data_initial_prefix, sep_files=TRUE, n_reads = n_reads,
         frag_mean = read_length * 2 + 50, frag_sd = 100,
         compress=TRUE, comp_method="bgzip", n_threads=ncores,
         paired=TRUE, read_length = read_length)

[1] "Number of reads for coverage 30X and read length 250 over 60 samples with respect to reference with length 19699: 141833"


In [10]:
# Rename the simulated reads for the reference genome.
#
# The read files generated for the tree tip named "reference"
# (<prefix>_reference_R{1,2}.fq.gz) are renamed to
# <prefix>_reference-tree_R{1,2}.fq.gz.
# NOTE(review): presumably this keeps them from being confused with the real
# reference genome downstream -- confirm.
ref1 <- paste0(reads_data_initial_prefix, "_reference_R1.fq.gz")
ref2 <- paste0(reads_data_initial_prefix, "_reference_R2.fq.gz")

newref1 <- paste0(reads_data_initial_prefix, "_reference-tree_R1.fq.gz")
newref2 <- paste0(reads_data_initial_prefix, "_reference-tree_R2.fq.gz")

# Rename `from` to `to` if `from` exists.
# Reports the rename only when it actually succeeded and warns when it did
# not; a missing source file is silently skipped, as before.
# Returns (invisibly) TRUE if the file was renamed, FALSE otherwise.
rename_reads <- function(from, to) {
    if (!file.exists(from)) {
        return(invisible(FALSE))
    }
    renamed <- file.rename(from, to)
    if (renamed) {
        print(sprintf("Renaming: %s to %s", from, to))
    } else {
        warning(sprintf("Failed to rename %s to %s", from, to), call. = FALSE)
    }
    invisible(renamed)
}

rename_reads(ref1, newref1)
rename_reads(ref2, newref2)

[1] "Renaming: simulations/unset/simulated_data/reads_initial/data_reference_R1.fq.gz to simulations/unset/simulated_data/reads_initial/data_reference-tree_R1.fq.gz"
[1] "Renaming: simulations/unset/simulated_data/reads_initial/data_reference_R2.fq.gz to simulations/unset/simulated_data/reads_initial/data_reference-tree_R2.fq.gz"
