# 1. Parameters

In [1]:
# Defaults
# NOTE(review): presumably these are overridden when the notebook is run with injected parameters — confirm.
# Root directory of the datasets; reads are looked up under <root>/<dataset name>/reads.
datasets_root_dir = 'data/datasets'
# Directory that receives the FASTQ files after their headers have been rewritten.
fastq_out_dir = 'data/fastq'
# Number of concurrent jobs handed to `parallel -j` when rewriting the reads.
ncores=32

In [2]:
from pathlib import Path
import os

# Turn the configured directory names into Path objects.
datasets_root = Path(datasets_root_dir)
fastq_out = Path(fastq_out_dir)

# Names of the dataset directories whose reads are processed.
read_dir_names = ['Campylobacter_jejuni_0810PADBR-1', 'Escherichia_coli_1405WAEXK-1',
                 'Listeria_monocytogenes_1408MLGX6-3WGS', 'Salmonella_enterica_1203NYJAP-1']

# Each dataset keeps its reads in a `reads` subdirectory.
input_reads_paths = [datasets_root / x / 'reads' for x in read_dir_names]

# Create the output directory together with any missing parents (a plain
# os.mkdir fails when e.g. `data/` does not exist yet); an already existing
# directory is left as it is, so the cell can be re-run safely.
fastq_out.mkdir(parents=True, exist_ok=True)

# Display the resolved input directories as the cell output.
input_reads_paths

[PosixPath('data/datasets/Campylobacter_jejuni_0810PADBR-1/reads'),
 PosixPath('data/datasets/Escherichia_coli_1405WAEXK-1/reads'),
 PosixPath('data/datasets/Listeria_monocytogenes_1408MLGX6-3WGS/reads'),
 PosixPath('data/datasets/Salmonella_enterica_1203NYJAP-1/reads')]

# 2. Fix reads

Fix read file names and data so they can be indexed.

In [3]:
import os

# Export an explicit UTF-8 locale through the process environment so the
# "locale unset" warning goes away.
# NOTE(review): presumably the warning comes from the shell tools launched in the cells below — confirm.
os.environ.update(LANG='en_US.UTF-8')

Some of the downloaded reads have different identifiers in the two files of a pair:

* Pair 1: `@SRR1993270_HWI-D00290:61:HA99HADXX:1:1101:2142:2223_forward/1`
* Pair 2: `@SRR1993270_HWI-D00290:61:HA99HADXX:1:1101:2142:2223_reverse/2`

In order to run snippy, these paired identifiers need to match (except for the `/1` and `/2` suffix).

So I have to replace them all with identifiers that are still unique for each read but identical in both files of a pair. I do this by removing `_forward` and `_reverse` from the identifiers:

* Pair 1: `@SRR1993270_HWI-D00290:61:HA99HADXX:1:1101:2142:2223/1`
* Pair 2: `@SRR1993270_HWI-D00290:61:HA99HADXX:1:1101:2142:2223/2`

In [4]:
import glob
import os

# For every dataset, pipe each gzipped FASTQ file through the header-rewriting
# script and store the result under the same file name in `fastq_out`.
for reads_dir in input_reads_paths:
    # Keep only the base names: the shell command below prepends the input
    # directory and the output directory itself.
    files = [os.path.basename(f) for f in glob.glob(f'{reads_dir}/*.fastq.gz')]
    print(f'Working on {len(files)} files in {reads_dir}')
    # One `parallel` job per file name listed after `:::`, at most `ncores` at a
    # time; `%` is the placeholder for the file name (set with -I%). Each job
    # decompresses the file, runs scripts/replace-fastq-header.pl on the stream
    # and recompresses the output into `fastq_out`.
    # NOTE(review): the script presumably strips `_forward`/`_reverse` from the
    # read identifiers as described in the text above — verify against the script.
    # NOTE(review): file names are joined with spaces and interpolated unquoted,
    # so names containing spaces or shell metacharacters would break the command.
    # NOTE(review): the exit status of the pipeline is not checked here, so a
    # failing job may go unnoticed — confirm outputs after running.
    !parallel -j {ncores} -I% 'gzip -d --stdout {reads_dir}/% | perl scripts/replace-fastq-header.pl | gzip > {fastq_out}/%' \
        ::: {' '.join(files)}

Working on 44 files in data/datasets/Campylobacter_jejuni_0810PADBR-1/reads
Working on 18 files in data/datasets/Escherichia_coli_1405WAEXK-1/reads
Working on 62 files in data/datasets/Listeria_monocytogenes_1408MLGX6-3WGS/reads
Working on 46 files in data/datasets/Salmonella_enterica_1203NYJAP-1/reads
