# Fix reads

In [1]:
import os

# Fix warning about locale unset
os.environ['LANG'] = 'en_US.UTF-8'

!pushd output/reads; prename 's/initial_//' *.fq.gz; popd

~/workspace/thesis-data-simulation/jackalope/output/reads ~/workspace/thesis-data-simulation/jackalope
~/workspace/thesis-data-simulation/jackalope


Jackalope produces reads with non-standard identifiers: the two reads of a pair don't have matching identifiers. For example:

* Pair 1: `@SH08-001-NC_011083-3048632-R/1`
* Pair 2: `@SH08-001-NC_011083-3048396-F/2`

In order to run snippy, these paired identifiers need to match (except for the `/1` and `/2` suffix).

So, I have to replace them all with identifiers that are unique within a file but identical between the two files of a pair. I do this by replacing the position (I think) and the trailing `-R`/`-F` field with the read number (the order in which the read appears in the file). So the above identifiers become:

* Pair 1: `@SH08-001-NC_011083-1/1`
* Pair 2: `@SH08-001-NC_011083-1/2`

In [2]:
import glob
import os

files = [os.path.basename(f) for f in glob.glob('output/reads/*.fq.gz')]
!parallel -j 24 -I% 'gzip -d --stdout output/reads/% | perl replace-fastq-header.pl | gzip > output/%' \
    ::: {' '.join(files)}

# Create input file for snippy

In [3]:
import os
import glob

reference_file = 'input/S_HeidelbergSL476.fasta.gz'

# snippy only runs with uncompressed reference
!gunzip -f -k {reference_file}

reference_file_abs = os.path.abspath('input/S_HeidelbergSL476.fasta')

snippy_out = os.path.abspath('phylogeny')

if not os.path.exists(snippy_out):
    os.mkdir(snippy_out)

with open(f'{snippy_out}/snippy.fofn', 'w') as snippy_fofn:
    directory = 'output'
    for file in glob.glob(f'{directory}/*_R1.fq.gz'):
        sample = os.path.basename(file).rsplit('_R1.fq.gz')[0]
        
        files = [f'{directory}/{sample}_R1.fq.gz', f'{directory}/{sample}_R2.fq.gz']
        files = [os.path.abspath(f) for f in files]
        values = [sample]
        values.extend(files)
        snippy_fofn.write('\t'.join(values)+'\n')

In [4]:
# Sanity check: print the first line of the fofn (sample name, R1 path, R2 path).
!head -n 1 {snippy_out}/snippy.fofn

SH14-013	/home/CSCScience.ca/apetkau/workspace/thesis-data-simulation/jackalope/output/SH14-013_R1.fq.gz	/home/CSCScience.ca/apetkau/workspace/thesis-data-simulation/jackalope/output/SH14-013_R2.fq.gz


# Run snippy

In [5]:
# Run snippy-multi (inside the "snippy" conda environment) on the fofn and
# redirect its stdout, the generated shell commands, to snippy-commands-all.sh.
!conda run --name snippy snippy-multi {snippy_out}/snippy.fofn \
    --reference {reference_file_abs} --cpus 6 > {snippy_out}/snippy-commands-all.sh
# Split the generated script in two: everything except the last 2 lines goes to
# the variant script, and the last 2 lines go to the core script.
# NOTE(review): this relies on snippy-multi emitting the snippy-core command
# within its final 2 lines -- confirm if the snippy version changes.
!head -n-2 {snippy_out}/snippy-commands-all.sh > {snippy_out}/snippy-commands-variant.sh
!tail -n 2 {snippy_out}/snippy-commands-all.sh > {snippy_out}/snippy-commands-core.sh

Reading: /home/CSCScience.ca/apetkau/workspace/thesis-data-simulation/jackalope/phylogeny/snippy.fofn
Generating output commands for 59 isolates
Done.



In [6]:
# Sanity check of the split: show the last 2 lines of the variant script, a
# "****" separator, then the tail (up to 10 lines) of the core script.
!tail -n 2 {snippy_out}/snippy-commands-variant.sh
!echo '****'
!tail {snippy_out}/snippy-commands-core.sh

snippy --outdir 'SH14-023' --R1 '/home/CSCScience.ca/apetkau/workspace/thesis-data-simulation/jackalope/output/SH14-023_R1.fq.gz' --R2 '/home/CSCScience.ca/apetkau/workspace/thesis-data-simulation/jackalope/output/SH14-023_R2.fq.gz' --reference /home/CSCScience.ca/apetkau/workspace/thesis-data-simulation/jackalope/input/S_HeidelbergSL476.fasta --cpus 6
snippy --outdir 'SH13-005' --R1 '/home/CSCScience.ca/apetkau/workspace/thesis-data-simulation/jackalope/output/SH13-005_R1.fq.gz' --R2 '/home/CSCScience.ca/apetkau/workspace/thesis-data-simulation/jackalope/output/SH13-005_R2.fq.gz' --reference /home/CSCScience.ca/apetkau/workspace/thesis-data-simulation/jackalope/input/S_HeidelbergSL476.fasta --cpus 6
****
snippy-core --ref 'SH08-001/ref.fa' SH08-001 SH09-29 SH10-001 SH10-002 SH10-014 SH10-015 SH10-30 SH11-001 SH11-002 SH12-001 SH12-002 SH12-003 SH12-004 SH12-005 SH12-006 SH12-007 SH12-008 SH12-009 SH12-010 SH12-011 SH12-012 SH12-013 SH12-014 SH13-001 SH13-002 SH13-003 SH13-004 SH13-005

In [7]:
# Run variant calling in parallel
# Works from inside snippy_out, running the lines of snippy-commands-variant.sh
# as jobs, up to 12 at a time, in the "snippy" conda environment. stdout and
# stderr of the whole subshell are captured in snippy-variant.log.
# NOTE(review): each generated command carries `--cpus 6`, so 12 concurrent
# jobs may request up to 72 threads -- confirm this suits the machine.
!(pushd {snippy_out} && conda run --name snippy \
  parallel -j 12 -a {snippy_out}/snippy-commands-variant.sh && popd) > {snippy_out}/snippy-variant.log 2>&1

In [8]:
# Run core in serial
# Works from inside snippy_out, executing snippy-commands-core.sh with bash in
# the "snippy" conda environment. stdout and stderr of the whole subshell are
# captured in snippy-core.log.
!(pushd {snippy_out} && conda run --name snippy \
  bash {snippy_out}/snippy-commands-core.sh && popd) > {snippy_out}/snippy-core.log 2>&1

In [9]:
# Display phylogeny/core.txt as an aligned table, splitting columns on tabs.
!column -s$'\t' -t phylogeny/core.txt

ID         LENGTH   ALIGNED  UNALIGNED  VARIANT  HET   MASKED  LOWCOV
SH08-001   4888768  4830026  48062      18570    2369  0       8311
SH09-29    4888768  4829419  47976      15564    2354  0       9019
SH10-001   4888768  4830067  48022      16618    2360  0       8319
SH10-002   4888768  4829571  48240      17199    2449  0       8508
SH10-014   4888768  4830290  47996      20802    2405  0       8077
SH10-015   4888768  4829888  48237      20319    2363  0       8280
SH10-30    4888768  4829838  48110      16040    2380  0       8440
SH11-001   4888768  4829682  48015      16028    2405  0       8666
SH11-002   4888768  4829894  48238      20798    2515  0       8121
SH12-001   4888768  4829610  47966      17032    2356  0       8836
SH12-002   4888768  4829756  47924      17289    2442  0       8646
SH12-003   4888768  4830008  48006      17025    2308  0       8446
SH12-004   4888768  4830474  47853      17021    2466  0       7975
SH12-005   4888768  4830199  48023      17020 

# Tree

In [10]:
# Build a tree from phylogeny/core.aln with IQ-TREE using 24 threads; -redo
# re-runs the analysis even if earlier results exist. Only the last 30 lines
# of the program output are shown.
# NOTE(review): no seed option is passed, so the search may not be exactly
# reproducible between runs -- consider fixing a seed.
!iqtree -redo -s phylogeny/core.aln -T 24 | tail -n 30

Iteration 100 / LogL: -398851.548 / Time: 0h:1m:57s (0h:0m:1s left)
TREE SEARCH COMPLETED AFTER 102 ITERATIONS / Time: 0h:1m:59s

--------------------------------------------------------------------
|                    FINALIZING TREE SEARCH                        |
--------------------------------------------------------------------
Performs final model parameters optimization
Estimate model parameters (epsilon = 0.010)
1. Initial log-likelihood: -398799.986
Optimal log-likelihood: -398799.986
Rate parameters:  A-C: 1.00000  A-G: 1.00000  A-T: 1.00000  C-G: 1.00000  C-T: 1.00000  G-T: 1.00000
Base frequencies:  A: 0.242  C: 0.257  G: 0.258  T: 0.243
Gamma shape alpha: 4.759
Parameters optimization took 1 rounds (0.014 sec)
BEST SCORE FOUND : -398799.986
Total tree length: 0.029

Total number of iterations: 102
CPU time used for tree search: 2556.735 sec (0h:42m:36s)
Wall-clock time used for tree search: 118.956 sec (0h:1m:58s)
Total CPU time used: 2571.310 sec (0h:42m:51s)
Total wall

In [11]:
# Rewrite the tree file in place, replacing the first "Reference" on each line
# with "reference"; the unmodified file is kept as core.aln.treefile.bak.
!sed -i.bak -e 's/Reference/reference/' phylogeny/core.aln.treefile