# Fix reads

In [1]:
import os

# Fix warning about locale unset
os.environ['LANG'] = 'en_US.UTF-8'

!pushd output/reads; prename 's/initial_//' *.fq.gz; popd

~/workspace/thesis-data-simulation/jackalope/salmonella/output/reads ~/workspace/thesis-data-simulation/jackalope/salmonella
~/workspace/thesis-data-simulation/jackalope/salmonella


Jackalope produces reads with non-standard identifiers where pairs of reads don't have matching identifiers. For example:

* Pair 1: `@SH08-001-NC_011083-3048632-R/1`
* Pair 2: `@SH08-001-NC_011083-3048396-F/2`

In order to run snippy, these paired identifiers need to match (except for the `/1` and `/2` suffix).

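So I have to replace every identifier with something that is unique within a file but identical between the two reads of a pair. I do this by replacing the trailing field (e.g. `3048632-R` — the position, I think, plus an `R`/`F` flag) with the read number (the order in which the read appears in the file). So the above identifiers become: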

* Pair 1: `@SH08-001-NC_011083-1/1`
* Pair 2: `@SH08-001-NC_011083-1/2`

In [3]:
import glob
import os

files = [os.path.basename(f) for f in glob.glob('output/reads/*.fq.gz')]
!parallel -j 24 -I% 'gzip -d --stdout output/reads/% | perl ../scripts/replace-fastq-header.pl | gzip > output/%' \
    ::: {' '.join(files)}

# Create input file for snippy

In [4]:
import os
import glob

reference_file = 'input/S_HeidelbergSL476.fasta.gz'

# snippy only runs with uncompressed reference
!gunzip -f -k {reference_file}

reference_file_abs = os.path.abspath('input/S_HeidelbergSL476.fasta')

snippy_out = os.path.abspath('phylogeny')

if not os.path.exists(snippy_out):
    os.mkdir(snippy_out)

with open(f'{snippy_out}/snippy.fofn', 'w') as snippy_fofn:
    directory = 'output'
    for file in glob.glob(f'{directory}/*_R1.fq.gz'):
        sample = os.path.basename(file).rsplit('_R1.fq.gz')[0]
        
        files = [f'{directory}/{sample}_R1.fq.gz', f'{directory}/{sample}_R2.fq.gz']
        files = [os.path.abspath(f) for f in files]
        values = [sample]
        values.extend(files)
        snippy_fofn.write('\t'.join(values)+'\n')

In [5]:
# Sanity check: print only the first line of snippy.fofn.
!head -n 1 {snippy_out}/snippy.fofn

SH14-013	/home/CSCScience.ca/apetkau/workspace/thesis-data-simulation/jackalope/salmonella/output/SH14-013_R1.fq.gz	/home/CSCScience.ca/apetkau/workspace/thesis-data-simulation/jackalope/salmonella/output/SH14-013_R2.fq.gz


# Run snippy

In [6]:
# Run snippy-multi inside the 'snippy' conda environment on snippy.fofn and
# save what it prints to stdout as snippy-commands-all.sh (nothing is executed
# yet). Each generated command is given the reference and --cpus 6.
!conda run --name snippy snippy-multi {snippy_out}/snippy.fofn \
    --reference {reference_file_abs} --cpus 6 > {snippy_out}/snippy-commands-all.sh
# Split that script in two: everything except its last two lines ...
!head -n-2 {snippy_out}/snippy-commands-all.sh > {snippy_out}/snippy-commands-variant.sh
# ... and only its last two lines.
# NOTE(review): this assumes the snippy-core step is always emitted within the
# final two lines of snippy-multi's output - confirm if the snippy version changes.
!tail -n 2 {snippy_out}/snippy-commands-all.sh > {snippy_out}/snippy-commands-core.sh

Reading: /home/CSCScience.ca/apetkau/workspace/thesis-data-simulation/jackalope/salmonella/phylogeny/snippy.fofn
Generating output commands for 59 isolates
Done.



In [7]:
# Peek at both generated scripts: the last two lines of the variant script,
# a '****' separator, then the end of the core script (tail prints up to its
# default of 10 lines).
!tail -n 2 {snippy_out}/snippy-commands-variant.sh
!echo '****'
!tail {snippy_out}/snippy-commands-core.sh

snippy --outdir 'SH14-023' --R1 '/home/CSCScience.ca/apetkau/workspace/thesis-data-simulation/jackalope/salmonella/output/SH14-023_R1.fq.gz' --R2 '/home/CSCScience.ca/apetkau/workspace/thesis-data-simulation/jackalope/salmonella/output/SH14-023_R2.fq.gz' --reference /home/CSCScience.ca/apetkau/workspace/thesis-data-simulation/jackalope/salmonella/input/S_HeidelbergSL476.fasta --cpus 6
snippy --outdir 'SH13-005' --R1 '/home/CSCScience.ca/apetkau/workspace/thesis-data-simulation/jackalope/salmonella/output/SH13-005_R1.fq.gz' --R2 '/home/CSCScience.ca/apetkau/workspace/thesis-data-simulation/jackalope/salmonella/output/SH13-005_R2.fq.gz' --reference /home/CSCScience.ca/apetkau/workspace/thesis-data-simulation/jackalope/salmonella/input/S_HeidelbergSL476.fasta --cpus 6
****
snippy-core --ref 'SH08-001/ref.fa' SH08-001 SH09-29 SH10-001 SH10-002 SH10-014 SH10-015 SH10-30 SH11-001 SH11-002 SH12-001 SH12-002 SH12-003 SH12-004 SH12-005 SH12-006 SH12-007 SH12-008 SH12-009 SH12-010 SH12-011 SH12-

In [8]:
# Run variant calling in parallel
# From inside the snippy_out directory, GNU parallel (run in the 'snippy' conda
# environment) reads one command per line from snippy-commands-variant.sh (-a)
# and runs up to 12 of them at a time (-j 12). stdout and stderr of the whole
# subshell go to snippy-variant.log, so the notebook shows no output here.
# NOTE(review): the commands were generated with --cpus 6, so up to
# 12 x 6 = 72 threads may be requested at once - confirm the host can take that.
!(pushd {snippy_out} && conda run --name snippy \
  parallel -j 12 -a {snippy_out}/snippy-commands-variant.sh && popd) > {snippy_out}/snippy-variant.log 2>&1

In [9]:
# Run core in serial
# From inside the snippy_out directory, execute snippy-commands-core.sh with
# bash in the 'snippy' conda environment. stdout and stderr of the whole
# subshell go to snippy-core.log, so the notebook shows no output here.
!(pushd {snippy_out} && conda run --name snippy \
  bash {snippy_out}/snippy-commands-core.sh && popd) > {snippy_out}/snippy-core.log 2>&1

In [10]:
# Print the tab-separated phylogeny/core.txt as an aligned table.
# NOTE(review): the path is relative to the notebook's working directory,
# unlike the cells that go through snippy_out - confirm both stay the same place.
!column -s$'\t' -t phylogeny/core.txt

ID         LENGTH   ALIGNED  UNALIGNED  VARIANT  HET   MASKED  LOWCOV
SH08-001   4888768  4829501  48095      18425    2385  0       8787
SH09-29    4888768  4829610  48255      15585    2378  0       8525
SH10-001   4888768  4829370  47886      16770    2459  0       9053
SH10-002   4888768  4829251  48106      17282    2425  0       8986
SH10-014   4888768  4829822  48272      20812    2467  0       8207
SH10-015   4888768  4829414  48035      20341    2486  0       8833
SH10-30    4888768  4829590  48155      16013    2495  0       8528
SH11-001   4888768  4829479  47904      16234    2446  0       8939
SH11-002   4888768  4829426  48204      20843    2539  0       8599
SH12-001   4888768  4828792  48042      17178    2429  0       9505
SH12-002   4888768  4829176  48203      17406    2448  0       8941
SH12-003   4888768  4829802  48159      17171    2397  0       8410
SH12-004   4888768  4829943  48005      17169    2493  0       8327
SH12-005   4888768  4829544  48008      17165 

# Tree

In [11]:
!iqtree -redo -s phylogeny/core.aln -T 24 | tail -n 30

Iteration 100 / LogL: -406778.060 / Time: 0h:2m:31s (0h:0m:1s left)
TREE SEARCH COMPLETED AFTER 102 ITERATIONS / Time: 0h:2m:33s

--------------------------------------------------------------------
|                    FINALIZING TREE SEARCH                        |
--------------------------------------------------------------------
Performs final model parameters optimization
Estimate model parameters (epsilon = 0.010)
1. Initial log-likelihood: -398085.579
Optimal log-likelihood: -398085.579
Rate parameters:  A-C: 1.00000  A-G: 1.00000  A-T: 1.00000  C-G: 1.00000  C-T: 1.00000  G-T: 1.00000
Base frequencies:  A: 0.250  C: 0.250  G: 0.250  T: 0.250
Gamma shape alpha: 2.819
Parameters optimization took 1 rounds (0.017 sec)
BEST SCORE FOUND : -398085.579
Total tree length: 0.025

Total number of iterations: 102
CPU time used for tree search: 3366.220 sec (0h:56m:6s)
Wall-clock time used for tree search: 153.031 sec (0h:2m:33s)
Total CPU time used: 3389.903 sec (0h:56m:29s)
Total wall-

In [12]:
# Replace the first 'Reference' on each line of the tree file with lowercase
# 'reference'. The file is edited in place; sed keeps the unmodified original
# as phylogeny/core.aln.treefile.bak.
!sed -i.bak -e 's/Reference/reference/' phylogeny/core.aln.treefile