# 1. Initialize indexes

In [2]:
from pathlib import Path
from os import mkdir
from shutil import rmtree

index_reads_path = Path('index-reads')
index_assemblies_path = Path('index-assemblies')

if index_reads_path.exists():
    rmtree(index_reads_path)
    
if index_assemblies_path.exists():
    rmtree(index_assemblies_path)

!gdi --version
!gdi init {index_reads_path}
!gdi init {index_assemblies_path}

gdi, version 0.2.0
Initializing empty project in [index-reads]
Initializing empty project in [index-assemblies]


# 2. Index genomes

## 2.1. Index reads

In [3]:
import glob

# Inputs for the reads-based index. Every `*.fq.gz` file under `reads_dir` is
# passed to `gdi analysis`; the glob is expanded by the shell, not by Python.
reads_dir = 'simulated_data/reads'
# Reference genome handed to gdi through --reference-file.
reference_file = 'input/S_HeidelbergSL476.fasta.gz'
# Value passed to gdi's --ncores option.
# NOTE(review): 48 is hard-coded — adjust to the cores available on your machine.
ncores = 48

# Run the gdi analysis step against the reads index project
# (`index_reads_path`, initialized in the first code cell).
# NOTE(review): --use-conda presumably lets the underlying Snakemake workflow
# resolve its tools through conda — confirm against the gdi documentation.
!gdi --project-dir {index_reads_path} --ncores {ncores} analysis \
    --use-conda --reference-file {reference_file} {reads_dir}/*.fq.gz

[32m2021-08-17 19:25:20[0m [1;30mINFO:[0m Automatically structuring 118 input files into assemblies/reads
[32m2021-08-17 19:25:20[0m [1;30mINFO:[0m Processing 59 genomes to identify mutations
[32m2021-08-17 19:25:20[0m [1;30mINFO:[0m Cannot use snpeff for reference file [input/S_HeidelbergSL476.fasta.gz], no snpeff annotations are included
[32m2021-08-17 19:25:20[0m [1;30mINFO:[0m Running Snakemake for rule all
[32m2021-08-17 19:38:10[0m [1;30mINFO:[0m Finished running snakemake.
[32m2021-08-17 19:38:10[0m [1;30mINFO:[0m Indexing processed VCF files defined in [/home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/simulation/snakemake-assemblies.1629246320.477752/gdi-input.fofn]
[32m2021-08-17 19:38:10[0m [1;30mINFO:[0m Attempting to load reference genome=[input/S_HeidelbergSL476.fasta.gz]
[32m2021-08-17 19:38:11[0m [1;30mINFO:[0m Sample batch 1/1: Stage 1/2 (Insert): Processed 0% (0/59) samples
[32m2021-08-17 19:38:16[0m [1

## 2.2. Index assemblies

In [7]:
# Inputs for the assemblies-based index. Every `*.fa.gz` file under
# `assemblies_dir` is passed to `gdi analysis`; the glob is expanded by the
# shell, not by Python.
assemblies_dir = 'simulated_data/assemblies'
# Reference genome handed to gdi through --reference-file (same value as the
# one assigned in the reads indexing cell).
reference_file = 'input/S_HeidelbergSL476.fasta.gz'
# Value passed to gdi's --ncores option.
# NOTE(review): 8 is hard-coded and differs from the 48 used for reads — confirm
# this difference is intentional.
ncores = 8

# Run the gdi analysis step against the assemblies index project
# (`index_assemblies_path`, initialized in the first code cell).
# NOTE(review): --use-conda presumably lets the underlying Snakemake workflow
# resolve its tools through conda — confirm against the gdi documentation.
!gdi --project-dir {index_assemblies_path} --ncores {ncores} analysis \
    --use-conda --reference-file {reference_file} {assemblies_dir}/*.fa.gz

[32m2021-08-17 19:53:38[0m [1;30mINFO:[0m Automatically structuring 59 input files into assemblies/reads
[32m2021-08-17 19:53:38[0m [1;30mINFO:[0m Processing 59 genomes to identify mutations
[32m2021-08-17 19:53:38[0m [1;30mINFO:[0m Cannot use snpeff for reference file [input/S_HeidelbergSL476.fasta.gz], no snpeff annotations are included
[32m2021-08-17 19:53:38[0m [1;30mINFO:[0m Running Snakemake for rule all
[32m2021-08-17 19:56:41[0m [1;30mINFO:[0m Finished running snakemake.
[32m2021-08-17 19:56:41[0m [1;30mINFO:[0m Indexing processed VCF files defined in [/home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/simulation/snakemake-assemblies.1629248018.6606922/gdi-input.fofn]
[32m2021-08-17 19:56:41[0m [1;30mINFO:[0m Attempting to load reference genome=[input/S_HeidelbergSL476.fasta.gz]
[32m2021-08-17 19:56:42[0m [1;30mINFO:[0m Sample batch 1/1: Stage 1/2 (Insert): Processed 0% (0/59) samples
[32m2021-08-17 19:56:58[0m [1

# 3. Create phylogenetic tree

## 3.1. Reads

In [4]:
# Value passed to gdi's --ncores option for the tree rebuild.
ncores = 32

# Rebuild the tree in the reads index for the reference genome
# S_HeidelbergSL476.
# NOTE(review): `--align-type full` and the quoted --extra-params string
# ('--fast -m GTR+F+R4') are presumably forwarded to the tree-building tool
# that gdi wraps — confirm their exact meaning in the gdi documentation.
!gdi --project-dir {index_reads_path} --ncores {ncores} rebuild tree \
     --align-type full --extra-params '--fast -m GTR+F+R4' S_HeidelbergSL476

[32m2021-08-17 19:41:07[0m [1;30mINFO:[0m Started rebuilding tree for reference genome [S_HeidelbergSL476]
[32m2021-08-17 19:46:33[0m [1;30mINFO:[0m Finished rebuilding tree


## 3.2. Assemblies

In [8]:
# Value passed to gdi's --ncores option for the tree rebuild.
ncores = 32

# Rebuild the tree in the assemblies index for the reference genome
# S_HeidelbergSL476, using the same options as the reads tree rebuild.
# NOTE(review): `--align-type full` and the quoted --extra-params string
# ('--fast -m GTR+F+R4') are presumably forwarded to the tree-building tool
# that gdi wraps — confirm their exact meaning in the gdi documentation.
!gdi --project-dir {index_assemblies_path} --ncores {ncores} rebuild tree \
     --align-type full --extra-params '--fast -m GTR+F+R4' S_HeidelbergSL476

[32m2021-08-17 19:58:07[0m [1;30mINFO:[0m Started rebuilding tree for reference genome [S_HeidelbergSL476]
[32m2021-08-17 20:00:20[0m [1;30mINFO:[0m Finished rebuilding tree


# 4. Export tree for comparison to original tree

## 4.1. Export reads tree

In [5]:
from os import mkdir
from pathlib import Path
    
!gdi --project-dir index-reads export tree S_HeidelbergSL476 \
    | sed -e 's/S_HeidelbergSL476/reference/' \
    > index-reads/reads-tree.tre

## 4.2. Export assemblies tree

In [9]:
from os import mkdir
from pathlib import Path
    
!gdi --project-dir index-assemblies export tree S_HeidelbergSL476 \
    | sed -e 's/S_HeidelbergSL476/reference/' \
    > index-assemblies/assemblies-tree.tre