# 1. Initialize indexes

In [1]:
from pathlib import Path
from os import mkdir
from shutil import rmtree

# Project directories for the two indexes used throughout this notebook:
# one for the sequencing reads and one for the assembled genomes.
index_reads_path, index_assemblies_path = map(Path, ('index-reads', 'index-assemblies'))

In [2]:
if index_reads_path.exists():
    rmtree(index_reads_path)
    
if index_assemblies_path.exists():
    rmtree(index_assemblies_path)

!gdi --version
!gdi init {index_reads_path}
!gdi init {index_assemblies_path}

gdi, version 0.3.0.dev3
Initializing empty project in [index-reads]
Initializing empty project in [index-assemblies]


# 2. Index genomes

## 2.1. Index reads

In [3]:
import glob

# Directory holding the read files; the shell glob in the command below
# passes every *.fq.gz file in it to gdi.
reads_dir = 'simulated_data/reads'
# Reference genome handed to gdi through --reference-file.
reference_file = 'input/S_HeidelbergSL476.fasta.gz'
# Value passed to gdi's --ncores option for this step.
ncores = 48

# Run `gdi analysis` on the read files against the reads project directory.
!gdi --project-dir {index_reads_path} --ncores {ncores} analysis \
    --use-conda --reference-file {reference_file} {reads_dir}/*.fq.gz

[32m2021-08-18 13:30:58[0m [1;30mINFO:[0m Automatically structuring 118 input files into assemblies/reads
[32m2021-08-18 13:30:58[0m [1;30mINFO:[0m Processing 59 genomes to identify mutations
[32m2021-08-18 13:30:58[0m [1;30mINFO:[0m Cannot use snpeff for reference file [input/S_HeidelbergSL476.fasta.gz], no snpeff annotations are included
[32m2021-08-18 13:30:58[0m [1;30mINFO:[0m Running Snakemake for rule all
[32m2021-08-18 13:43:02[0m [1;30mINFO:[0m Finished running snakemake.
[32m2021-08-18 13:43:02[0m [1;30mINFO:[0m Indexing processed VCF files defined in [/home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/simulation/snakemake-assemblies.1629311458.4737868/gdi-input.fofn]
[32m2021-08-18 13:43:02[0m [1;30mINFO:[0m Attempting to load reference genome=[input/S_HeidelbergSL476.fasta.gz]
[32m2021-08-18 13:43:03[0m [1;30mINFO:[0m Sample batch 1/1: Stage 1/2 (Insert): Processed 0% (0/59) samples
[32m2021-08-18 13:43:07[0m [

## 2.2. Index assemblies

In [4]:
# Directory holding the assembly files; the shell glob in the command below
# passes every *.fa.gz file in it to gdi.
assemblies_dir = 'simulated_data/assemblies'
# Reference genome handed to gdi through --reference-file.
reference_file = 'input/S_HeidelbergSL476.fasta.gz'
# Value passed to gdi's --ncores option for this step.
ncores = 32

# Run `gdi analysis` on the assembly files against the assemblies project directory.
!gdi --project-dir {index_assemblies_path} --ncores {ncores} analysis \
    --use-conda --reference-file {reference_file} {assemblies_dir}/*.fa.gz

[32m2021-08-18 13:45:54[0m [1;30mINFO:[0m Automatically structuring 59 input files into assemblies/reads
[32m2021-08-18 13:45:54[0m [1;30mINFO:[0m Processing 59 genomes to identify mutations
[32m2021-08-18 13:45:54[0m [1;30mINFO:[0m Cannot use snpeff for reference file [input/S_HeidelbergSL476.fasta.gz], no snpeff annotations are included
[32m2021-08-18 13:45:54[0m [1;30mINFO:[0m Running Snakemake for rule all
[32m2021-08-18 13:47:14[0m [1;30mINFO:[0m Finished running snakemake.
[32m2021-08-18 13:47:14[0m [1;30mINFO:[0m Indexing processed VCF files defined in [/home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/simulation/snakemake-assemblies.1629312354.5644708/gdi-input.fofn]
[32m2021-08-18 13:47:14[0m [1;30mINFO:[0m Attempting to load reference genome=[input/S_HeidelbergSL476.fasta.gz]
[32m2021-08-18 13:47:15[0m [1;30mINFO:[0m Sample batch 1/1: Stage 1/2 (Insert): Processed 0% (0/59) samples
[32m2021-08-18 13:47:18[0m [1

# 3. Create phylogenetic tree

## 3.1. Reads

### 3.1.1. Reads with only SNPs

In [5]:
ncores = 32

output_tree = index_reads_path / 'reads.snp-tree.tre'

!gdi --project-dir index-reads/ --ncores 32 build tree \
     --align-type full --include-variants SNP \
     --extra-params '--fast -m GTR+F+R4' --output-file {output_tree} --reference-name S_HeidelbergSL476
!sed -i.bak -e 's/S_HeidelbergSL476/reference/' {output_tree}

[32m2021-08-18 13:48:09[0m [1;30mINFO:[0m Building build_tree using "iqtree" for 0 samples
[32m2021-08-18 13:48:09[0m [1;30mINFO:[0m Started building alignment for 59 samples with include_variants=['SNP']
[32m2021-08-18 13:48:23[0m [1;30mINFO:[0m Finished building alignment for 59 samples. Took 13.53 seconds
Wrote tree to [index-reads/reads.snp-tree.tre]


### 3.1.2. Reads with SNPs and MNPs

In [6]:
ncores = 32

output_tree = index_reads_path / 'reads.snp-mnp-tree.tre'

!gdi --project-dir index-reads/ --ncores 32 build tree \
     --align-type full --include-variants SNP --include-variants MNP \
     --extra-params '--fast -m GTR+F+R4' --output-file {output_tree} --reference-name S_HeidelbergSL476
!sed -i.bak -e 's/S_HeidelbergSL476/reference/' {output_tree}

[32m2021-08-18 13:53:38[0m [1;30mINFO:[0m Building build_tree using "iqtree" for 0 samples
[32m2021-08-18 13:53:38[0m [1;30mINFO:[0m Started building alignment for 59 samples with include_variants=['SNP', 'MNP']
[32m2021-08-18 13:53:52[0m [1;30mINFO:[0m Finished building alignment for 59 samples. Took 13.56 seconds
Wrote tree to [index-reads/reads.snp-mnp-tree.tre]


## 3.2. Assemblies

### 3.2.1. Assemblies with only SNPs

In [7]:
ncores = 32

output_tree = index_assemblies_path / 'assemblies.snp-tree.tre'

!gdi --project-dir index-assemblies/ --ncores 32 build tree \
     --align-type full --include-variants SNP \
     --extra-params '--fast -m GTR+F+R4' --output-file {output_tree} --reference-name S_HeidelbergSL476
!sed -i.bak -e 's/S_HeidelbergSL476/reference/' {output_tree}

[32m2021-08-18 13:56:48[0m [1;30mINFO:[0m Building build_tree using "iqtree" for 0 samples
[32m2021-08-18 13:56:48[0m [1;30mINFO:[0m Started building alignment for 59 samples with include_variants=['SNP']
[32m2021-08-18 13:57:01[0m [1;30mINFO:[0m Finished building alignment for 59 samples. Took 13.22 seconds
Wrote tree to [index-assemblies/assemblies.snp-tree.tre]


### 3.2.2. Assemblies with SNPs and MNPs

In [8]:
ncores = 32

output_tree = index_assemblies_path / 'assemblies.snp-mnp-tree.tre'

!gdi --project-dir index-assemblies/ --ncores 32 build tree \
     --align-type full --include-variants SNP --include-variants MNP \
     --extra-params '--fast -m GTR+F+R4' --output-file {output_tree} --reference-name S_HeidelbergSL476
!sed -i.bak -e 's/S_HeidelbergSL476/reference/' {output_tree}

[32m2021-08-18 13:58:48[0m [1;30mINFO:[0m Building build_tree using "iqtree" for 0 samples
[32m2021-08-18 13:58:48[0m [1;30mINFO:[0m Started building alignment for 59 samples with include_variants=['SNP', 'MNP']
[32m2021-08-18 13:59:01[0m [1;30mINFO:[0m Finished building alignment for 59 samples. Took 13.10 seconds
Wrote tree to [index-assemblies/assemblies.snp-mnp-tree.tre]


# 4. Build k-mer based trees

## 4.1. Build k-mer reads tree