Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update to Innuca's recipe modules #191

Open
wants to merge 20 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,13 @@
### New components

- `Kraken2`: Taxonomic identification on FastQ files
- `insert_size`: calculates the insert size of a sample by mapping the reads
back to an assembly

### Bug fixes

- Fix bug in `momps`component related to added in the introduction of the clear input parameter
- Fix bug in the `momps` component introduced by the addition of the clear
input option
- Fixed bug with the `-ft` parameters not retrieving the dockerhub tags for
all the components.
- Fixed bug in the `megahit` process where the fastg mode would break the process
Expand All @@ -25,7 +28,13 @@ position in the `nextflow run` command inside the .nextflow.log file.

### Minor/Other changes

- Added option to `dengue_typing` to retrieve closest reference sequence and link it
- Added `insert_size` to `innuca` recipe
- `integrity_coverage` now checks the integrity of the compressed read files with the
appropriate software.
- `mlst` component now has its own process template
- `assembly_mapping` now verifies the percentage of mapped reads, issuing a quality
control warning when it falls below 95%
- Added option to `dengue_typing` to retrieve closest reference sequence and link it
with a secondary channel into `mafft`
- New version of DEN-IM recipe
- Now prints an ordered list of components
Expand Down
71 changes: 67 additions & 4 deletions flowcraft/generator/components/mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,16 @@


class Bowtie(Process):
"""bowtie2 to align short paired-end sequencing reads to long reference sequences
"""
bowtie2 process to align short paired-end sequencing reads to long reference sequences

This process is set with:

- ``input_type``: fastq
- ``output_type``: bam
- ``ptype``: mapping

"""
"""

def __init__(self, **kwargs):

Expand Down Expand Up @@ -65,7 +66,8 @@ def __init__(self, **kwargs):


class RetrieveMapped(Process):
"""Samtools process to to align short paired-end sequencing reads to
"""
Samtools process to to align short paired-end sequencing reads to
long reference sequences

This process is set with:
Expand All @@ -74,7 +76,7 @@ class RetrieveMapped(Process):
- ``output_type``: fastq
- ``ptype``: mapping

"""
"""

def __init__(self, **kwargs):

Expand Down Expand Up @@ -108,3 +110,64 @@ def __init__(self, **kwargs):
self.status_channels = [
"retrieve_mapped"
]


class InsertSize(Process):
    """
    Determines the sequencing insert size by mapping the reads
    back to an assembly file.

    This process is set with:

        - ``input_type``: fasta
        - ``output_type``: None
        - ``ptype``: mapping

    It contains one **secondary channel link end**:

        - ``__fastq`` (alias: ``_LAST_fastq``): Receives the FastQ files
          from the last process with ``fastq`` output type.

    """

    def __init__(self, **kwargs):

        super().__init__(**kwargs)

        self.input_type = "fasta"
        self.output_type = None

        # Link to the FastQ files emitted by the most recent component with
        # fastq output, so the reads can be mapped back to the assembly.
        self.link_end.append({"link": "__fastq", "alias": "_LAST_fastq"})

        self.params = {
            "distribution_plot": {
                "default": "false",
                "description": "Produces a distribution plot of the insert sizes."
            },
            "clearInput": {
                "default": "false",
                "description":
                    "Permanently removes temporary input files. This option "
                    "is only useful to remove temporary files in large "
                    "workflows and prevents nextflow's resume functionality. "
                    "Use with caution."
            }
        }

        self.directives = {
            "assembly_mapping_statistics": {
                "container": "flowcraft/bowtie2_samtools",
                "version": "1.0.0-1",
                # Memory scales with the allocated cpus and retry attempt.
                # NOTE(review): nextflow memory units are usually spelled
                # '1.GB' — confirm '1.Gb' is accepted by the target version.
                "memory": "{1.Gb*task.cpus*task.attempt}",
                "cpus": 1
            },
            "insert_size": {
                "container": "flowcraft/plotly",
                "version": "3.5.0-1"
            }
        }

        # Both processes in the insert_size.nf template report status.
        self.status_channels = [
            "assembly_mapping_statistics",
            "insert_size"
        ]
3 changes: 2 additions & 1 deletion flowcraft/generator/components/mlst.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ def __init__(self, **kwargs):
self.output_type = "fasta"

self.directives = {"mlst": {
"container": "ummidock/mlst",
"container": "flowcraft/mlst",
"version": "2.15.1-1"
}}

self.params = {
Expand Down
7 changes: 6 additions & 1 deletion flowcraft/generator/components/reads_quality_control.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,11 @@ def __init__(self, **kwargs):

self.link_start.extend(["SIDE_phred", "SIDE_max_len"])

self.directives = {"integrity_coverage": {
"container": "flowcraft/integrity_coverage",
"version": "1.0-1"
}}


class CheckCoverage(Process):
"""Process template interface for additional integrity_coverage process
Expand Down Expand Up @@ -112,7 +117,7 @@ def __init__(self, **kwargs):
"cpus": 4,
"memory": "'1GB'",
"container": "flowcraft/true_coverage",
"version": "3.2-1"
"version": "3.3-1"
}
}

Expand Down
3 changes: 2 additions & 1 deletion flowcraft/generator/recipes/innuca.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ def __init__(self):
"spades " \
"process_spades " \
"pilon " \
"mlst "
"mlst " \
"insert_size"

# Recipe parameters and directives
self.directives = {
Expand Down
72 changes: 72 additions & 0 deletions flowcraft/generator/templates/insert_size.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
// Translate the boolean pipeline parameter into the "True"/"False" string
// expected by the downstream insert_size.py template.
IN_plot{{ pid }} = params.distribution_plot{{ param_id }} ? "True" : "False"


// Maps the sample reads back to the assembly with bowtie2 and collects
// alignment statistics with samtools stats for insert size estimation.
process assembly_mapping_statistics_{{ pid }} {

    // Send POST request to platform
    {% include "post.txt" ignore missing %}

    tag { sample_id }

    input:
    // Join the assembly channel with the FastQ secondary channel by sample id.
    set sample_id, file(assembly), file(fastq) from {{ input_channel }}.join(_LAST_fastq_{{ pid }})

    output:
    set sample_id, file("samtools_stats.txt") into IN_insert_size_{{ pid }}
    {% with task_name="assembly_mapping_statistics" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    """
    {
        echo [DEBUG] BUILDING BOWTIE INDEX FOR ASSEMBLY: $assembly >> .command.log 2>&1
        bowtie2-build --threads ${task.cpus} $assembly genome_index >> .command.log 2>&1

        echo [DEBUG] MAPPING READS FROM $fastq >> .command.log 2>&1
        bowtie2 -q --very-fast --threads ${task.cpus} -x genome_index -1 ${fastq[0]} -2 ${fastq[1]} \
--fr -I 0 -X 2000 --no-discordant --no-mixed --no-unal -S alignment.sam >> .command.log 2>&1

        echo [DEBUG] GET STATISTICS FROM SAM: alignment.sam
        samtools stats alignment.sam > samtools_stats.txt

        if [ -f "alignment.sam" ] && [ -f "samtools_stats.txt" ]
        then
            echo pass > .status
        else
            echo fail > .status
        fi

        echo -n "" > .report.json
        echo -n "" > .versions
    } || {
        echo fail > .status
    }
    """
}


// Parses the samtools stats report and produces the insert size report
// (and, optionally, a plotly distribution plot).
// NOTE: removed review-comment text that the page scrape had injected
// after the script block; it was not part of the template.
process insert_size_{{ pid }} {

    // Send POST request to platform
    {% include "post.txt" ignore missing %}

    tag { sample_id }

    publishDir "results/assembly/insert_size_{{ pid }}/"

    input:
    set sample_id, file(sam_stats) from IN_insert_size_{{ pid }}
    // "True"/"False" flag controlling the distribution plot output.
    val plot from IN_plot{{ pid }}

    output:
    file ("*insert_size_report.tab")
    // Only produced when the distribution plot is enabled.
    file ("*insert_size_distribution.html") optional true
    {% with task_name="insert_size" %}
    {%- include "compiler_channels.txt" ignore missing -%}
    {% endwith %}

    script:
    template "insert_size.py"

}
33 changes: 9 additions & 24 deletions flowcraft/generator/templates/mlst.nf
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
// If a species is not provided, bypass the species verification by sending
// the sentinel value "PASS" through the expected-species channel.
// NOTE: removed review-comment text that the page scrape had injected
// inside the if body; it was not part of the template.
if (params.mlstSpecies{{ param_id }} == null){
    IN_expected_species_{{ pid }} = Channel.value("PASS")
} else {
    IN_expected_species_{{ pid }} = Channel.value(params.mlstSpecies{{ param_id }})
}

process mlst_{{ pid }} {

Expand All @@ -10,6 +16,7 @@ process mlst_{{ pid }} {

input:
set sample_id, file(assembly) from {{ input_channel }}
val expected_species from IN_expected_species_{{ pid }}

output:
file '*.mlst.txt' into LOG_mlst_{{ pid }}
Expand All @@ -19,30 +26,8 @@ process mlst_{{ pid }} {
{% endwith %}

script:
"""
{
expectedSpecies=${params.mlstSpecies{{ param_id }}}
mlst $assembly >> ${sample_id}.mlst.txt
mlstSpecies=\$(cat *.mlst.txt | cut -f2)
json_str="{'expectedSpecies':\'\$expectedSpecies\',\
'species':'\$mlstSpecies',\
'st':'\$(cat *.mlst.txt | cut -f3)',\
'tableRow':[{'sample':'${sample_id}','data':[\
{'header':'MLST species','value':'\$mlstSpecies','table':'typing'},\
{'header':'MLST ST','value':'\$(cat *.mlst.txt | cut -f3)','table':'typing'}]}]}"
echo \$json_str > .report.json

if [ ! \$mlstSpecies = \$expectedSpecies ];
then
printf fail > .status
else
printf pass > .status
fi

} || {
printf fail > .status
}
"""
template "run_mlst.py"
cimendes marked this conversation as resolved.
Show resolved Hide resolved

}

process compile_mlst_{{ pid }} {
Expand Down
34 changes: 17 additions & 17 deletions flowcraft/templates/fastqc_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -446,13 +446,13 @@ def check_summary_health(summary_file, **kwargs):

# Store the summary categories that cannot fail. If they fail, do not
# proceed with this sample
fail_sensitive = kwargs.get("fail_sensitive", [
fail_if_fail = kwargs.get("fail_if_fail", [
"Per base sequence quality",
"Overrepresented sequences",
"Sequence Length Distribution",
"Per sequence GC content"
])
logger.debug("Fail sensitive categories: {}".format(fail_sensitive))
logger.debug("Must not fail categories: {}".format(fail_if_fail))

# Store summary categories that must pass. If they do not, do not proceed
# with that sample
Expand All @@ -462,15 +462,17 @@ def check_summary_health(summary_file, **kwargs):
])
logger.debug("Must pass categories: {}".format(must_pass))

warning_fail_sensitive = kwargs.get("warning_fail_sensitive", [
warning_if_warning = kwargs.get("warning_if_warning", [
"Per base sequence quality",
"Overrepresented sequences",

])
logger.debug("Warninf categories: {}".format(warning_if_warning))

warning_must_pass = kwargs.get("warning_must_pass", [
warning_if_fail = kwargs.get("warning_if_fail", [
"Per base sequence content"
])
logger.debug("Warning if fail categories: {}".format(warning_if_fail))

# Get summary dictionary
summary_info = get_summary(summary_file)
Expand All @@ -486,31 +488,29 @@ def check_summary_health(summary_file, **kwargs):

logger.debug("Assessing category {} with result {}".format(cat, test))

# FAILURES
# Check for fail sensitive
if cat in fail_sensitive and test == "FAIL":
# Check for must not fail
if cat in fail_if_fail and test == "FAIL":
health = False
failed.append("{}:{}".format(cat, test))
logger.error("Category {} failed a fail sensitive "
failed.append("{}: {}".format(cat, test))
logger.error("Category {} failed a must not fail "
"category".format(cat))

# Check for must pass
if cat in must_pass and test != "PASS":
health = False
failed.append("{}:{}".format(cat, test))
failed.append("{}: {}".format(cat, test))
logger.error("Category {} failed a must pass category".format(
cat))

# WARNINGS
# Check for fail sensitive
if cat in warning_fail_sensitive and test == "FAIL":
warning.append("Failed category: {}".format(cat))
logger.warning("Category {} flagged at a fail sensitive "
if cat in warning_if_warning and test == "WARN":
warning.append("{}: {}".format(cat, test))
logger.warning("Category {} flagged at a warning "
"category".format(cat))

if cat in warning_must_pass and test != "PASS":
warning.append("Did not pass category: {}".format(cat))
logger.warning("Category {} flagged at a must pass "
if cat in warning_if_fail and test == "FAIL":
warning.append("{}: {}".format(cat, test))
logger.warning("Category {} flagged at warning if fail "
"category".format(cat))

# Passed all tests
Expand Down
Loading