# Fix reads

In [1]:
import os

# Fix warning about locale unset
os.environ['LANG'] = 'en_US.UTF-8'

!pushd output/reads; prename 's/initial_//' *.fq.gz; popd

~/workspace/thesis-data-simulation/jackalope/small/output/reads ~/workspace/thesis-data-simulation/jackalope/small
~/workspace/thesis-data-simulation/jackalope/small


Jackalope produces reads with non-standard identifiers: the two reads of a pair do not share a matching identifier. For example:

* Pair 1: `@SH08-001-NC_011083-3048632-R/1`
* Pair 2: `@SH08-001-NC_011083-3048396-F/2`

In order to run snippy, these paired identifiers need to match (except for the `/1` and `/2` suffix).

So I have to replace every identifier with one that is unique within a file but identical for the two reads of a pair. I do this by replacing the trailing field (the position, I think) and its `-R`/`-F` suffix with the read number (the order in which the read appears in the file). So the above identifiers become:

* Pair 1: `@SH08-001-NC_011083-1/1`
* Pair 2: `@SH08-001-NC_011083-1/2`

In [2]:
import glob
import os

files = [os.path.basename(f) for f in glob.glob('output/reads/*.fq.gz')]
!parallel -j 24 -I% 'gzip -d --stdout output/reads/% | perl ../scripts/replace-fastq-header.pl | gzip > output/%' \
    ::: {' '.join(files)}

# Create input file for snippy

In [3]:
import os
import glob

reference_file = 'input/genome.fasta.gz'

# snippy only runs with uncompressed reference
!gunzip -f -k {reference_file}

# Absolute path of the uncompressed reference genome.
reference_file_abs = os.path.abspath('input/genome.fasta')

# Directory that receives the snippy input list (and, later, the snippy output).
snippy_out = os.path.abspath('phylogeny')

# exist_ok avoids the check-then-create race and makes the cell safely re-runnable.
os.makedirs(snippy_out, exist_ok=True)

directory = 'output'
r1_suffix = '_R1.fq.gz'

# Build every row before touching the output file, so a missing mate file
# cannot leave a half-written snippy.fofn behind.
fofn_rows = []
# glob returns paths in arbitrary order; sort so the file (and therefore the
# sample order handed to snippy) is the same on every run.
for r1_path in sorted(glob.glob(f'{directory}/*{r1_suffix}')):
    # Sample name = file name with exactly one trailing `_R1.fq.gz` removed.
    sample = os.path.basename(r1_path)[:-len(r1_suffix)]

    r2_path = f'{directory}/{sample}_R2.fq.gz'
    if not os.path.exists(r2_path):
        raise FileNotFoundError(
            f'Missing R2 reads for sample {sample!r}: expected {r2_path}')

    # One tab-separated row per sample: name, absolute R1 path, absolute R2 path.
    values = [sample, os.path.abspath(r1_path), os.path.abspath(r2_path)]
    fofn_rows.append('\t'.join(values) + '\n')

with open(f'{snippy_out}/snippy.fofn', 'w') as snippy_fofn:
    snippy_fofn.writelines(fofn_rows)

In [4]:
!head -n 1 {snippy_out}/snippy.fofn

SampleA	/home/CSCScience.ca/apetkau/workspace/thesis-data-simulation/jackalope/small/output/SampleA_R1.fq.gz	/home/CSCScience.ca/apetkau/workspace/thesis-data-simulation/jackalope/small/output/SampleA_R2.fq.gz


# Run snippy

In [5]:
!conda run --name snippy snippy-multi {snippy_out}/snippy.fofn \
    --reference {reference_file_abs} --cpus 6 > {snippy_out}/snippy-commands-all.sh
!head -n-2 {snippy_out}/snippy-commands-all.sh > {snippy_out}/snippy-commands-variant.sh
!tail -n 2 {snippy_out}/snippy-commands-all.sh > {snippy_out}/snippy-commands-core.sh

Reading: /home/CSCScience.ca/apetkau/workspace/thesis-data-simulation/jackalope/small/phylogeny/snippy.fofn
Generating output commands for 3 isolates
Done.



In [6]:
!tail -n 2 {snippy_out}/snippy-commands-variant.sh
!echo '****'
!tail {snippy_out}/snippy-commands-core.sh

snippy --outdir 'SampleB' --R1 '/home/CSCScience.ca/apetkau/workspace/thesis-data-simulation/jackalope/small/output/SampleB_R1.fq.gz' --R2 '/home/CSCScience.ca/apetkau/workspace/thesis-data-simulation/jackalope/small/output/SampleB_R2.fq.gz' --reference /home/CSCScience.ca/apetkau/workspace/thesis-data-simulation/jackalope/small/input/genome.fasta --cpus 6
snippy --outdir 'SampleC' --R1 '/home/CSCScience.ca/apetkau/workspace/thesis-data-simulation/jackalope/small/output/SampleC_R1.fq.gz' --R2 '/home/CSCScience.ca/apetkau/workspace/thesis-data-simulation/jackalope/small/output/SampleC_R2.fq.gz' --reference /home/CSCScience.ca/apetkau/workspace/thesis-data-simulation/jackalope/small/input/genome.fasta --cpus 6
****
snippy-core --ref 'SampleA/ref.fa' SampleA SampleB SampleC



In [7]:
# Run variant calling in parallel
!(pushd {snippy_out} && conda run --name snippy \
  parallel -j 12 -a {snippy_out}/snippy-commands-variant.sh && popd) > {snippy_out}/snippy-variant.log 2>&1

In [8]:
# Run core in serial
!(pushd {snippy_out} && conda run --name snippy \
  bash {snippy_out}/snippy-commands-core.sh && popd) > {snippy_out}/snippy-core.log 2>&1

In [9]:
!column -s$'\t' -t phylogeny/core.txt

ID         LENGTH  ALIGNED  UNALIGNED  VARIANT  HET  MASKED  LOWCOV
SampleA    5180    4741     49         26       3    0       387
SampleB    5180    4904     102        26       2    0       172
SampleC    5180    4851     52         18       3    0       274
Reference  5180    5180     0          0        0    0       0


# Tree

In [10]:
!iqtree -redo -s phylogeny/core.aln -T 24 | tail -n 30

UPDATE BEST LOG-LIKELIHOOD: -226.209
Iteration 100 / LogL: -234.124 / Time: 0h:0m:0s (0h:0m:0s left)
TREE SEARCH COMPLETED AFTER 101 ITERATIONS / Time: 0h:0m:0s

--------------------------------------------------------------------
|                    FINALIZING TREE SEARCH                        |
--------------------------------------------------------------------
Performs final model parameters optimization
Estimate model parameters (epsilon = 0.010)
1. Initial log-likelihood: -226.209
Optimal log-likelihood: -226.206
Rate parameters:  A-C: 1.00000  A-G: 1.00000  A-T: 1.00000  C-G: 1.00000  C-T: 1.00000  G-T: 1.00000
Base frequencies:  A: 0.250  C: 0.250  G: 0.250  T: 0.250
Parameters optimization took 1 rounds (0.000 sec)
BEST SCORE FOUND : -226.206
Total tree length: 0.009

Total number of iterations: 101
CPU time used for tree search: 0.372 sec (0h:0m:0s)
Wall-clock time used for tree search: 0.098 sec (0h:0m:0s)
Total CPU time used: 0.440 sec (0h:0m:0s)
Total wall-clock time use

In [11]:
!sed -i.bak 's/Reference/reference/' phylogeny/core.aln.treefile