Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update to Innuca's recipe modules #191

Open
wants to merge 20 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,13 @@
### New components

- `Kraken2`: Taxonomic identification on FastQ files
- `insert_size`: calculates the insert size of a sample by mapping the reads
cimendes marked this conversation as resolved.
Show resolved Hide resolved
back to an assembly

### Bug fixes

- Fix bug in `momps`component related to added in the introduction of the clear input parameter
- Fix bug in the `momps` component related to the introduction of the clear
cimendes marked this conversation as resolved.
Show resolved Hide resolved
input parameter
- Fixed bug with the `-ft` parameters not retrieving the dockerhub tags for
all the components.
- Fixed bug in the `megahit` process where the fastg mode would break the process
Expand All @@ -25,7 +28,12 @@ position in the `nextflow run` command inside the .nextflow.log file.

### Minor/Other changes

- Added option to `dengue_typing` to retrive closest referece sequence and link it
- `integrity_coverage` now checks the integrity of the compressed read files with the
appropriate software.
- `mlst` component now has its own process template
cimendes marked this conversation as resolved.
Show resolved Hide resolved
- `assembly_mapping` now verifies the percentage of mapped reads, issuing a quality
control warning when it falls below 95%
- Added option to `dengue_typing` to retrieve closest reference sequence and link it
with a secondary channel into `mafft`
- New version of DEN-IM recipe
- Now prints an ordered list of components
Expand Down
71 changes: 67 additions & 4 deletions flowcraft/generator/components/mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,16 @@


class Bowtie(Process):
"""bowtie2 to align short paired-end sequencing reads to long reference sequences
"""
bowtie2 to align short paired-end sequencing reads to long reference sequences
cimendes marked this conversation as resolved.
Show resolved Hide resolved

This process is set with:

- ``input_type``: fastq
- ``output_type``: bam
- ``ptype``: mapping

"""
"""

def __init__(self, **kwargs):

Expand Down Expand Up @@ -65,7 +66,8 @@ def __init__(self, **kwargs):


class RetrieveMapped(Process):
"""Samtools process to to align short paired-end sequencing reads to
"""
Samtools process to align short paired-end sequencing reads to
long reference sequences

This process is set with:
Expand All @@ -74,7 +76,7 @@ class RetrieveMapped(Process):
- ``output_type``: fastq
- ``ptype``: mapping

"""
"""

def __init__(self, **kwargs):

Expand Down Expand Up @@ -108,3 +110,64 @@ def __init__(self, **kwargs):
self.status_channels = [
"retrieve_mapped"
]


class InsertSize(Process):
"""
Determines the sequencing insert size by mapping the reads back
to an assembly file

This process is set with:

- ``input_type``: fasta
- ``output_type``: None
cimendes marked this conversation as resolved.
Show resolved Hide resolved
- ``ptype``: mapping

It contains one **secondary channel link end**:

- ``__fastq`` (alias: ``_LAST_fastq``): Receives the FastQ files
from the last process with ``fastq`` output type.

"""

def __init__(self, **kwargs):

super().__init__(**kwargs)

self.input_type = "fasta"
self.output_type = None

# Secondary channel carrying the FastQ reads; the insert_size.nf template
# joins it (as _LAST_fastq_<pid>) with the main assembly channel.
self.link_end.append({"link": "__fastq", "alias": "_LAST_fastq"})

self.params = {
"distribution_plot": {
"default": "false",
"description": "Produces a distribution plot of the insert sizes."
},
"clearInput": {
"default": "false",
"description":
"Permanently removes temporary input files. This option "
"is only useful to remove temporary files in large "
"workflows and prevents nextflow's resume functionality. "
"Use with caution."
}
}

# Directive keys match the two process names declared in
# templates/insert_size.nf (assembly_mapping_statistics and insert_size).
self.directives = {
"assembly_mapping_statistics": {
"container": "flowcraft/bowtie2_samtools",
"version": "1.0.0-1",
"memory": "{1.Gb*task.cpus*task.attempt}",
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As discussed with @miguelpmachado, to me this seems like enforcing an intrusive behavior that will easily fail in some setups. However in this case it seems better because default cpus is 1.

"cpus": 1
},
"insert_size": {
"container": "flowcraft/plotly",
"version": "3.5.0-1"
}
}

# Same two process names as in self.directives above.
self.status_channels = [
"assembly_mapping_statistics",
"insert_size"
]
3 changes: 2 additions & 1 deletion flowcraft/generator/components/mlst.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ def __init__(self, **kwargs):
self.output_type = "fasta"

self.directives = {"mlst": {
"container": "ummidock/mlst",
"container": "flowcraft/mlst",
"version": "2.15.1-1"
}}

self.params = {
Expand Down
7 changes: 6 additions & 1 deletion flowcraft/generator/components/reads_quality_control.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,11 @@ def __init__(self, **kwargs):

self.link_start.extend(["SIDE_phred", "SIDE_max_len"])

self.directives = {"integrity_coverage": {
"container": "flowcraft/integrity_coverage",
"version": "1.0-1"
}}


class CheckCoverage(Process):
"""Process template interface for additional integrity_coverage process
Expand Down Expand Up @@ -112,7 +117,7 @@ def __init__(self, **kwargs):
"cpus": 4,
"memory": "'1GB'",
"container": "flowcraft/true_coverage",
"version": "3.2-1"
"version": "3.3-1"
}
}

Expand Down
72 changes: 72 additions & 0 deletions flowcraft/generator/templates/insert_size.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
// Map the truthiness of the distribution_plot parameter onto the string
// "True" or "False"; the value is consumed as `plot` by the insert_size process.
IN_plot{{ pid }} = params.distribution_plot{{ param_id }} ? "True" : "False"


// Builds a bowtie2 index from the assembly, maps the paired FastQ reads to it
// and writes the `samtools stats` output of the alignment to samtools_stats.txt.
process assembly_mapping_statistics_{{ pid }} {

// Send POST request to platform
{% include "post.txt" ignore missing %}

tag { sample_id }

input:
// The main (assembly) channel is joined with the _LAST_fastq secondary channel
// so each task receives an assembly together with its reads
// (presumably matched on sample_id — confirm against the channel contents).
set sample_id, file(assembly), file(fastq) from {{ input_channel }}.join(_LAST_fastq_{{ pid }})

output:
// The stats file is forwarded to the insert_size process via IN_insert_size.
set sample_id, file("samtools_stats.txt") into IN_insert_size_{{ pid }}
{% with task_name="assembly_mapping_statistics" %}
{%- include "compiler_channels.txt" ignore missing -%}
{% endwith %}

// The script writes pass/fail to .status depending on whether alignment.sam and
// samtools_stats.txt exist, and creates empty .report.json and .versions files.
// NOTE(review): the `|| { echo fail > .status }` fallback depends on the exit
// status of the first brace group — confirm that an intermediate bowtie2 or
// samtools failure is actually caught. The third [DEBUG] echo is also not
// redirected to .command.log, unlike the first two.
script:
"""
{
echo [DEBUG] BUILDING BOWTIE INDEX FOR ASSEMBLY: $assembly >> .command.log 2>&1
bowtie2-build --threads ${task.cpus} $assembly genome_index >> .command.log 2>&1

echo [DEBUG] MAPPING READS FROM $fastq >> .command.log 2>&1
bowtie2 -q --very-fast --threads ${task.cpus} -x genome_index -1 ${fastq[0]} -2 ${fastq[1]} \
--fr -I 0 -X 2000 --no-discordant --no-mixed --no-unal -S alignment.sam >> .command.log 2>&1

echo [DEBUG] GET STATISTICS FROM SAM: alignment.sam
samtools stats alignment.sam > samtools_stats.txt

if [ -f "alignment.sam" ] && [ -f "samtools_stats.txt" ]
then
echo pass > .status
else
echo fail > .status
fi

echo -n "" > .report.json
echo -n "" > .versions
} || {
echo fail > .status
}
"""
}


// Runs the insert_size.py template on each sample's samtools stats file and
// publishes the matching outputs under results/assembly/insert_size_<pid>/.
process insert_size_{{ pid }} {

// Send POST request to platform
{% include "post.txt" ignore missing %}

tag { sample_id }

publishDir "results/assembly/insert_size_{{ pid }}/"

input:
set sample_id, file(sam_stats) from IN_insert_size_{{ pid }}
// "True"/"False" string derived from params.distribution_plot.
val plot from IN_plot{{ pid }}

output:
file ("*insert_size_report.tab")
// The HTML plot is declared optional (presumably only written when `plot`
// is "True" — confirm in insert_size.py).
file ("*insert_size_distribution.html") optional true
{% with task_name="insert_size" %}
{%- include "compiler_channels.txt" ignore missing -%}
{% endwith %}

script:
template "insert_size.py"
cimendes marked this conversation as resolved.
Show resolved Hide resolved

}
33 changes: 9 additions & 24 deletions flowcraft/generator/templates/mlst.nf
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
// If a species is not provided, it bypasses the species verification
if (params.mlstSpecies{{ param_id }} == null){
IN_expected_species_{{ pid }} = Channel.value("PASS")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PASS? 🔨

} else {
IN_expected_species_{{ pid }} = Channel.value(params.mlstSpecies{{ param_id }})
}

process mlst_{{ pid }} {

Expand All @@ -10,6 +16,7 @@ process mlst_{{ pid }} {

input:
set sample_id, file(assembly) from {{ input_channel }}
val expected_species from IN_expected_species_{{ pid }}

output:
file '*.mlst.txt' into LOG_mlst_{{ pid }}
Expand All @@ -19,30 +26,8 @@ process mlst_{{ pid }} {
{% endwith %}

script:
"""
{
expectedSpecies=${params.mlstSpecies{{ param_id }}}
mlst $assembly >> ${sample_id}.mlst.txt
mlstSpecies=\$(cat *.mlst.txt | cut -f2)
json_str="{'expectedSpecies':\'\$expectedSpecies\',\
'species':'\$mlstSpecies',\
'st':'\$(cat *.mlst.txt | cut -f3)',\
'tableRow':[{'sample':'${sample_id}','data':[\
{'header':'MLST species','value':'\$mlstSpecies','table':'typing'},\
{'header':'MLST ST','value':'\$(cat *.mlst.txt | cut -f3)','table':'typing'}]}]}"
echo \$json_str > .report.json

if [ ! \$mlstSpecies = \$expectedSpecies ];
then
printf fail > .status
else
printf pass > .status
fi

} || {
printf fail > .status
}
"""
template "run_mlst.py"
cimendes marked this conversation as resolved.
Show resolved Hide resolved

}

process compile_mlst_{{ pid }} {
Expand Down
Loading