# Fix reads

In [1]:
import os

# Fix warning about locale unset
os.environ['LANG'] = 'en_US.UTF-8'

!pushd output/reads; prename 's/initial_//' *.fq.gz; popd

~/workspace/thesis-data-simulation/jackalope/output/reads ~/workspace/thesis-data-simulation/jackalope
~/workspace/thesis-data-simulation/jackalope


Jackalope produces reads with non-standard identifiers: the two reads of a pair do not share the same identifier. For example:

* Read 1: `@SH08-001-NC_011083-3048632-R/1`
* Read 2: `@SH08-001-NC_011083-3048396-F/2`

In order to run snippy, the identifiers of paired reads need to match (except for the `/1` and `/2` suffixes).

So, I have to replace them all with something unique that is identical for both reads of a pair. I do this by replacing the position and strand fields (I think that is what they are) with the read number (the order in which the read appears in the file). So the above identifiers become:

* Read 1: `@SH08-001-NC_011083-1/1`
* Read 2: `@SH08-001-NC_011083-1/2`

In [2]:
import glob
import os

files = [os.path.basename(f) for f in glob.glob('output/reads/*.fq.gz')]
!parallel -j 24 -I% 'gzip -d --stdout output/reads/% | perl replace-fastq-header.pl | gzip > output/%' \
    ::: {' '.join(files)}

# Create input file for snippy

In [3]:
import os
import glob

reference_file = 'input/S_HeidelbergSL476.fasta.gz'

# snippy only runs with uncompressed reference
!gunzip -f -k {reference_file}

reference_file_abs = os.path.abspath('input/S_HeidelbergSL476.fasta')

snippy_out = os.path.abspath('phylogeny')

if not os.path.exists(snippy_out):
    os.mkdir(snippy_out)

with open(f'{snippy_out}/snippy.fofn', 'w') as snippy_fofn:
    directory = 'output'
    for file in glob.glob(f'{directory}/*_R1.fq.gz'):
        sample = os.path.basename(file).rsplit('_R1.fq.gz')[0]
        
        files = [f'{directory}/{sample}_R1.fq.gz', f'{directory}/{sample}_R2.fq.gz']
        files = [os.path.abspath(f) for f in files]
        values = [sample]
        values.extend(files)
        snippy_fofn.write('\t'.join(values)+'\n')

In [4]:
# Sanity check: print the first line of the snippy input file written above
# (expected layout: sample name, R1 path, R2 path, separated by tabs).
!head -n 1 {snippy_out}/snippy.fofn

SH14-013	/home/CSCScience.ca/apetkau/workspace/thesis-data-simulation/jackalope/output/SH14-013_R1.fq.gz	/home/CSCScience.ca/apetkau/workspace/thesis-data-simulation/jackalope/output/SH14-013_R2.fq.gz


# Run snippy

In [5]:
# Inside the `snippy` conda environment, let snippy-multi turn the fofn into a
# list of shell commands. Nothing is executed here; the commands are only
# written to snippy-commands-all.sh.
!conda run --name snippy snippy-multi {snippy_out}/snippy.fofn \
    --reference {reference_file_abs} --cpus 6 > {snippy_out}/snippy-commands-all.sh
# Everything except the last two lines goes to the "variant" script ...
!head -n-2 {snippy_out}/snippy-commands-all.sh > {snippy_out}/snippy-commands-variant.sh
# ... and the last two lines go to the "core" script.
# NOTE(review): this split assumes the final snippy-core command sits within
# the last two lines of snippy-multi's output -- confirm if snippy is upgraded.
!tail -n 2 {snippy_out}/snippy-commands-all.sh > {snippy_out}/snippy-commands-core.sh

Reading: /home/CSCScience.ca/apetkau/workspace/thesis-data-simulation/jackalope/phylogeny/snippy.fofn
Generating output commands for 59 isolates
Done.



In [6]:
# Inspect the two generated scripts: the last two per-sample commands,
# a visual separator, then the contents of the core script.
!tail -n 2 {snippy_out}/snippy-commands-variant.sh
!echo '****'
!tail {snippy_out}/snippy-commands-core.sh

snippy --outdir 'SH14-023' --R1 '/home/CSCScience.ca/apetkau/workspace/thesis-data-simulation/jackalope/output/SH14-023_R1.fq.gz' --R2 '/home/CSCScience.ca/apetkau/workspace/thesis-data-simulation/jackalope/output/SH14-023_R2.fq.gz' --reference /home/CSCScience.ca/apetkau/workspace/thesis-data-simulation/jackalope/input/S_HeidelbergSL476.fasta --cpus 6
snippy --outdir 'SH13-005' --R1 '/home/CSCScience.ca/apetkau/workspace/thesis-data-simulation/jackalope/output/SH13-005_R1.fq.gz' --R2 '/home/CSCScience.ca/apetkau/workspace/thesis-data-simulation/jackalope/output/SH13-005_R2.fq.gz' --reference /home/CSCScience.ca/apetkau/workspace/thesis-data-simulation/jackalope/input/S_HeidelbergSL476.fasta --cpus 6
****
snippy-core --ref 'SH08-001/ref.fa' SH08-001 SH09-29 SH10-001 SH10-002 SH10-014 SH10-015 SH10-30 SH11-001 SH11-002 SH12-001 SH12-002 SH12-003 SH12-004 SH12-005 SH12-006 SH12-007 SH12-008 SH12-009 SH12-010 SH12-011 SH12-012 SH12-013 SH12-014 SH13-001 SH13-002 SH13-003 SH13-004 SH13-005

In [7]:
# Run variant calling in parallel
# The commands are executed from within snippy_out, inside the `snippy` conda
# environment; GNU parallel reads them from the file given to -a and runs up
# to 12 at once. stdout and stderr are both written to snippy-variant.log.
# NOTE(review): each command was generated with --cpus 6, so up to
# 12 x 6 = 72 threads may be requested -- confirm the host can sustain that.
!(pushd {snippy_out} && conda run --name snippy \
  parallel -j 12 -a {snippy_out}/snippy-commands-variant.sh && popd) > {snippy_out}/snippy-variant.log 2>&1

In [8]:
# Run core in serial
# The core script is executed with bash from within snippy_out, inside the
# `snippy` conda environment. stdout and stderr are both written to
# snippy-core.log, so nothing is shown in the notebook.
!(pushd {snippy_out} && conda run --name snippy \
  bash {snippy_out}/snippy-commands-core.sh && popd) > {snippy_out}/snippy-core.log 2>&1

In [9]:
# Display the tab-separated summary table phylogeny/core.txt as aligned
# columns (presumably written by the snippy-core step -- verify).
!column -s$'\t' -t phylogeny/core.txt

ID         LENGTH   ALIGNED  UNALIGNED  VARIANT  HET   MASKED  LOWCOV
SH08-001   4888768  4830225  47746      165      2236  0       8561
SH09-29    4888768  4830336  47599      150      2138  0       8695
SH10-001   4888768  4830166  47831      162      2206  0       8565
SH10-002   4888768  4830518  47547      164      2201  0       8502
SH10-014   4888768  4830877  47692      210      2219  0       7980
SH10-015   4888768  4830401  47712      209      2369  0       8286
SH10-30    4888768  4830626  47567      157      2211  0       8364
SH11-001   4888768  4830607  47592      159      2245  0       8324
SH11-002   4888768  4830360  47622      211      2164  0       8622
SH12-001   4888768  4830668  47687      168      2144  0       8269
SH12-002   4888768  4829797  47563      170      2193  0       9215
SH12-003   4888768  4830531  47609      168      2189  0       8439
SH12-004   4888768  4830556  47601      168      2202  0       8409
SH12-005   4888768  4830682  47685      168   

# Tree

In [10]:
!iqtree -redo -s phylogeny/core.aln | tail -n 30

Iteration 200 / LogL: -3791.625 / Time: 0h:0m:16s (0h:0m:1s left)
Iteration 210 / LogL: -3791.301 / Time: 0h:0m:16s (0h:0m:0s left)
TREE SEARCH COMPLETED AFTER 215 ITERATIONS / Time: 0h:0m:17s

--------------------------------------------------------------------
|                    FINALIZING TREE SEARCH                        |
--------------------------------------------------------------------
Performs final model parameters optimization
Estimate model parameters (epsilon = 0.010)
1. Initial log-likelihood: -3790.801
Optimal log-likelihood: -3790.793
Rate parameters:  A-C: 1.00000  A-G: 1.00000  A-T: 1.00000  C-G: 1.00000  C-T: 1.00000  G-T: 1.00000
Base frequencies:  A: 0.250  C: 0.250  G: 0.250  T: 0.250
Parameters optimization took 1 rounds (0.001 sec)
BEST SCORE FOUND : -3790.793
Total tree length: 0.010

Total number of iterations: 215
CPU time used for tree search: 15.038 sec (0h:0m:15s)
Wall-clock time used for tree search: 16.941 sec (0h:0m:16s)
Total CPU time used: 15.100 

In [12]:
# Replace the first occurrence of `Reference` on each line of the tree file
# with lower-case `reference` (presumably the label of the reference genome's
# leaf -- verify). The file is edited in place and the previous content is
# kept as core.aln.treefile.bak.
# NOTE(review): re-running this cell overwrites the .bak with the already
# edited file.
!sed -i.bak -e 's/Reference/reference/' phylogeny/core.aln.treefile