# 1. Parameters

In [1]:
# Defaults
# Placeholder simulation directory; the "Parameters" cell that follows reassigns it.
simulation_dir = 'simulations/unset'
# Job count handed to GNU parallel (`-j`) by the read-fixing and assembly cells.
ncores = 48

In [2]:
# Parameters
# NOTE(review): presumably injected by a notebook-parameterizing tool (e.g. papermill) — confirm.
# read_coverage, mincov, iterations and sub_alpha are not referenced by the cells
# reviewed here; presumably they are consumed elsewhere — TODO confirm.
read_coverage = 30
mincov = 10
# Replaces the placeholder value assigned in the "Defaults" cell.
simulation_dir = "simulations/alpha-0.5-cov-30"
iterations = 1
sub_alpha = 0.5


In [3]:
from pathlib import Path
import os

# Directory layout of the simulated data beneath the chosen simulation directory.
simulation_data_dir = Path(simulation_dir) / 'simulated_data'
initial_reads_dir = simulation_data_dir / 'reads_initial'
reads_dir = simulation_data_dir / 'reads'
assemblies_dir = simulation_data_dir / 'assemblies'
assembled_reads_dir = simulation_data_dir / 'assembled_reads'

# Create the two output directories if they are missing.
# `exist_ok=True` replaces the separate exists()/mkdir() steps (no check-then-create
# race) and still raises FileExistsError if the path exists but is not a directory.
# `parents=True` is deliberately NOT used: a missing `simulated_data` directory means
# `simulation_dir` is wrong, and that should fail loudly here rather than be created.
reads_dir.mkdir(exist_ok=True)
assembled_reads_dir.mkdir(exist_ok=True)

# 2. Fix reads

Fix read file names and data so they can be indexed.

In [4]:
import os

# Fix warning about locale unset
os.environ['LANG'] = 'en_US.UTF-8'

!pushd {initial_reads_dir}; prename 's/data_//' *.fq.gz; popd

~/workspace/genomics-data-index-evaluation-manuscript/evaluations/simulation/simulations/alpha-0.5-cov-30/simulated_data/reads_initial ~/workspace/genomics-data-index-evaluation-manuscript/evaluations/simulation


~/workspace/genomics-data-index-evaluation-manuscript/evaluations/simulation


Jackalope produces reads with non-standard identifiers where pairs of reads don't have matching identifiers. For example:

* Pair 1: `@SH08-001-NC_011083-3048632-R/1`
* Pair 2: `@SH08-001-NC_011083-3048396-F/2`

In order to run snippy, these paired identifiers need to match (except for the `/1` and `/2` suffix).

So every identifier has to be replaced with one that is unique within a file but identical between the two files of a pair. I do this by replacing what I think is the read position (e.g. `3048632-R`) with the read's number (its order of appearance in the file). The identifiers above therefore become:

* Pair 1: `@SH08-001-NC_011083-1/1`
* Pair 2: `@SH08-001-NC_011083-1/2`

In [5]:
import glob
import os

files = [os.path.basename(f) for f in glob.glob(f'{initial_reads_dir}/*.fq.gz')]
!parallel -j {ncores} -I% 'gzip -d --stdout {initial_reads_dir}/% | perl scripts/replace-fastq-header.pl | gzip > {reads_dir}/%' \
    ::: {' '.join(files)}

In [6]:
import gzip
import shutil


def _has_reads(path):
    """Return True if `path` is a readable gzip file holding at least one byte of data.

    A missing file, a file that is not valid gzip, or a truncated/empty gzip stream
    all count as "no reads" and yield False.
    """
    try:
        with gzip.open(path, 'rb') as handle:
            return bool(handle.read(1))
    except (OSError, EOFError):
        return False


# Deleting the initial reads is irreversible, and a failed `!` shell command does not
# stop "Run All". So first confirm that every initial read file has a non-empty
# counterpart of the same name in the reads directory, and refuse to delete otherwise.
unconverted = sorted(
    initial.name
    for initial in initial_reads_dir.glob('*.fq.gz')
    if not _has_reads(reads_dir / initial.name)
)
if unconverted:
    raise RuntimeError(
        f'Refusing to delete {initial_reads_dir}: {len(unconverted)} read file(s) have no '
        f'non-empty converted copy in {reads_dir}, e.g. {unconverted[:5]}'
    )

shutil.rmtree(initial_reads_dir)

# 3. Fix assemblies

Fix the assembly genome file names by stripping the `data__` prefix.

In [7]:
!pushd {assemblies_dir}; prename 's/data__//' *.fa.gz; popd

~/workspace/genomics-data-index-evaluation-manuscript/evaluations/simulation/simulations/alpha-0.5-cov-30/simulated_data/assemblies ~/workspace/genomics-data-index-evaluation-manuscript/evaluations/simulation


~/workspace/genomics-data-index-evaluation-manuscript/evaluations/simulation


# 4. Assemble reads

Assemble the reads with SKESA to provide an additional scenario with imperfect assemblies.

In [8]:
# Print the version of skesa found in the `skesa` conda environment,
# so the assembler version is recorded in the notebook output.
!conda run -n skesa skesa --version

SKESA 2.4.0

skesa --version 




In [9]:
import glob
import re

fastq_names = [os.path.basename(f) for f in glob.glob(f'{reads_dir}/*_R1.fq.gz')]
sample_names = [re.sub(r'_R\d.fq.gz', '', f) for f in fastq_names]
!parallel -j {ncores} -I% 'conda run -n skesa skesa --reads {reads_dir}/%_R1.fq.gz,{reads_dir}/%_R2.fq.gz --contigs_out {assembled_reads_dir}/%.fasta > {assembled_reads_dir}/log.% 2>&1' \
    ::: {' '.join(sample_names)}