diff --git a/changelog.md b/changelog.md index 38f10aa9..a63add25 100644 --- a/changelog.md +++ b/changelog.md @@ -1,5 +1,45 @@ # Changelog +## 1.3.0 + +### Features +- Added `report` run mode to Flowcraft that displays the report of any given +pipeline in the Flowcraft's web application. The `report` mode can be executed +after a pipeline ended or during the pipeline execution using the `--watch` +option. +- Added standalone report HTML at the end of the pipeline execution. +- Components with support for the new report system: + - `abricate` + - `assembly_mapping` + - `check_coverage` + - `chewbbaca` + - `dengue_typing` + - `fastqc` + - `fastqc_trimmomatic` + - `integrity_coverage` + - `mlst` + - `patho_typing` + - `pilon` + - `process_mapping` + - `process_newick` + - `process_skesa` + - `process_spades` + - `process_viral_assembly` + - `seq_typing` + - `trimmomatic` + - `true_coverage` + +### Minor/Other changes + +- Refactored report json for components `mash_dist`, `mash_screen` and +`mapping_patlas` + +### Bug fixes +- Fixed issue where `seq_typing` and `patho_typing` processes were not feeding +report data to report compiler. +- Fixed fail messages for `process_assembly` and `process_viral_assembly` +components + ## 1.2.2 ### Components changes @@ -9,6 +49,8 @@ sam and bam files and added data to .report.json. Updated databases to pATLAS version 1.5.2. - `mash_screen` and `mash_dist`: added data to .report.json. Updated databases to pATLAS version 1.5.2. +- Added new options to `abricate` componente. Users can now provide custom database +directories, minimum coverage and minimum identity parameters. ### New components diff --git a/docs/_static/custom.css b/docs/_static/custom.css index a3f77c1b..4f8e51c7 100644 --- a/docs/_static/custom.css +++ b/docs/_static/custom.css @@ -4,4 +4,8 @@ div.wy-side-nav-search, div.wy-nav-top { .wy-menu > .caption > .caption-text { color: #5c6bc0; +} + +.wy-nav-content { + max-width: 100% } \ No newline at end of file diff --git a/docs/dev/create_process.rst b/docs/dev/create_process.rst index 29600c1c..9f3a99ca 100644 --- a/docs/dev/create_process.rst +++ b/docs/dev/create_process.rst @@ -116,6 +116,8 @@ must be used **only once**. Like in the input channel, this channel should be defined with a two element tuple with the sample ID and the data. The sample ID must match the one specified in the ``input_channel``. +.. _compiler: + {% include "compiler_channels.txt %} ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/docs/dev/pipeline_reporting.rst b/docs/dev/pipeline_reporting.rst new file mode 100644 index 00000000..661ae1a2 --- /dev/null +++ b/docs/dev/pipeline_reporting.rst @@ -0,0 +1,90 @@ +Pipeline reporting +================== + +This section describes how the reports of a FlowCraft pipeline are generated +and collected at the end of a run. These reports can then be sent to the +`FlowCraft web application `_ +where the results are visualized. + +.. important:: + Note that if the nextflow process reports add new types of data, one or + more React components need to be added to the web application for them + to be rendered. + +Data collection +--------------- + +The data for the pipeline reports is collected from three dotfiles in each nextflow +process (they should be present in each work sub directory): + +- **.report.json**: Contains report data (See :ref:`report-json` for more information). +- **.versions**: Contains information about the versions of the software used + (See :ref:`versions` for more information). 
+- **.command.trace**: Contains resource usage information. + +The **.command.trace** file is generated by nextflow when the **trace** scope +is active. The **.report.json** and **.version** files are specific to +FlowCraft pipelines. + +Generation of dotfiles +^^^^^^^^^^^^^^^^^^^^^^ + +Both **report.json** and **.versions** empty dotfiles are automatically generated +by the ``{% include "post.txt" ignore missing %}`` placeholder, specified in the +:ref:`create-process` section. Using this placeholder in your processes is all +that is needed. + +Collection of dotfiles +^^^^^^^^^^^^^^^^^^^^^^ + +The **.report.json**, **.versions** and **.command.trace** files are automatically +collected and sent to dedicated report channels in the pipeline by the +``{%- include "compiler_channels.txt" ignore missing -%}`` placeholder, specified +in the :ref:`process creation ` section. Placing this placeholder in your +processes will generate the following line in the output channel specification:: + + set {{ sample_id|default("sample_id") }}, val("{{ task_name }}_{{ pid }}"), val("{{ pid }}"), file(".report.json"), file(".versions"), file(".command.trace") into REPORT_{{task_name}}_{{ pid }} + +This line collects several metadata associated with the process along with the three +dotfiles. + +Compilation of dotfiles +^^^^^^^^^^^^^^^^^^^^^^^ + +As mentioned in the previous section, the dotfiles and other relevant metadata +for are sent through special report channels to a FlowCraft component that is +responsible for compiling all the information and generate a single report +file at the end of each pipeline run. + +This component is specified in ``flowcraft.generator.templates.report_compiler.nf`` +and it consists of two nextflow processes: + +- First, the **report** process receives the data from each executed process that + sends report data and runs the ``flowcraft/bin/prepare_reports.py`` script + on that data. This script will simply merge metadata and dotfiles information + in a single JSON file. This file contains the following keys: + + - ``reportJson``: The data in **.report.json** file. + - ``versions``: The data in **.versions** file. + - ``trace``: The data in **.command.trace** file. + - ``processId``: The process ID + - ``pipelineId``: The pipeline ID that defaults to one, unless specified in + the parameters. + - ``projectid``: The project ID that defaults to one, unless specified in + the parameters. + - ``userId``: The user ID that defaults to one, unless specified in + the parameters. + - ``username``: The user name that defaults to *user*, unless specified in + the parameters + - ``processName``: The name of the flowcraft component. + - ``workdir``: The work directory where the process was executed. + +- Second, all JSON files created in the process above are merged + and a single reports JSON file is created. This file will contains the + following structure:: + + reportJSON = { + "data": { + "results": [] + } + } diff --git a/docs/dev/process_dotfiles.rst b/docs/dev/process_dotfiles.rst index 4e5fb23a..db5fc551 100644 --- a/docs/dev/process_dotfiles.rst +++ b/docs/dev/process_dotfiles.rst @@ -44,15 +44,22 @@ execution of the process. When this occurs, the ``.status`` channel must have the ``fail`` string as well. As in the warning dotfile, there is no particular format for the fail message. +.. _report-json: + Report JSON ----------- +.. important:: + The general specification of the report JSON changed in version 1.2.2. + See the `issue tracker `_ + for details. 
+ The ``.report.json`` file stores any information from a given process that is deemed worthy of being reported and displayed at the end of the pipeline. Any information can be stored in this file, as long as it is in JSON format, but there are a couple of recommendations that are necessary to follow for them to be processed by a reporting web app (Currently hosted at -`report-nf `_). However, if +`flowcraft-webapp `_). However, if data processing will be performed with custom scripts, feel free to specify your own format. @@ -63,18 +70,33 @@ Information meant to be displayed in tables should be in the following format:: json_dic = { - "tableRow": [ - {"header": "Raw BP", - "value": chars, - "table": "assembly", - "columnBar": True}, + "tableRow": [{ + "sample": "A", + "data": [{ + "header": "Raw BP", + "value": 123, + "table": "qc" + }, { + "header": "Coverage", + "value": 32, + "table": "qc" + }] + }, { + "sample": "B", + "data": [{ + "header": "Coverage", + "value": 35, + "table": "qc" + }] + }] } -This means that the ``chars`` variable that is created during the execution -of the process should appear as a table entry with the specified ``header`` -and ``value``. The ``table`` key specifies in which table of the reports -it will appear and the ``columnBar`` key informs the report generator to -create a bar column in that particular cell. +This provides table information for multiple samples in the same process. In +this case, data for two samples is provided. For each sample, values for +one or more headers can be provided. For instance, this report provides +information about the **Raw BP** and **Coverage** for sample **A** and this +information should go to the **qc** table. If any other information is relevant +to build the table, feel free to add more elements to the JSON. Information for plots ^^^^^^^^^^^^^^^^^^^^^ @@ -82,14 +104,19 @@ Information for plots Information meant to be displayed in plots should be in the following format:: json_dic = { - "plotData": { - "size_dist": size_dist - } + "plotData": [{ + "sample": "strainA", + "data": { + "sparkline": 23123, + "otherplot": [1,2,3] + } + }], } -This is a simple key:value pair, where the key is the ID of the plot in the -reports and the ``size_dist`` contains the plot data that was gathered -for a particular process. +As in the table JSON, *plotData* should be an array with an entry for each +sample. The data for each sample should be another JSON where the keys are +the *plot signatures*, so that we know to which plot the data belongs. The +corresponding values are whatever data object you need. Other information ^^^^^^^^^^^^^^^^^ @@ -99,6 +126,8 @@ is not particular format for other information. They will simply store the data of interest to report and it will be the job of a downstream report app to process that data into an actual visual report. +.. _versions: + Versions -------- diff --git a/docs/dev/reports.rst b/docs/dev/reports.rst new file mode 100644 index 00000000..500fb6cc --- /dev/null +++ b/docs/dev/reports.rst @@ -0,0 +1,182 @@ +Reports +======= + +Report JSON specification +------------------------- + +The report JSON is quite flexibly on the information it can contain. Here are +some guidelines to promote consistency on the reports generated by each component. +In general, the reports file is an array of JSON objects that contain relevant +information for each executed process in the pipeline:: + + reportFile = [{}, {}, ... 
] + + +Nextflow metadata +^^^^^^^^^^^^^^^^^ + +The nextflow metada is automatically added to the reportFile as a single JSON entry +with the ``nfMetadata`` key that contains the following information:: + + "nfMetadata": { + "scriptId": "${workflow.scriptId}", + "scriptName": "${workflow.scriptId}", + "profile": "${workflow.profile}", + "container": "${workflow.container}", + "containerEngine": "${workflow.containerEngine}", + "commandLine": "${workflow.commandLine}", + "runName": "${workflow.runName}", + "sessionId": "${workflow.sessionId}", + "projectDir": "${workflow.projectDir}", + "launchDir": "${workflow.launchDir}", + "start_time": "${workflow.start}" + } + +.. note:: + Unlike the remaining JSON entries in the report file, which are generated for + each process execution, the ``nfMetadata`` entry is generated only once per + project execution. + +Root +^^^^ + +The reports contained in the ``reports.json`` file for each process execution +are added to the root object:: + + { + "pipelineId": 1, + "processId": pid, + "processName": task_name, + "projectid": RUN_NAME, + "reportJson": reports, + "runName": RUN_NAME, + "scriptId": SCRIPT_ID, + "versions": versions, + "trace": trace, + "userId": 1, + "username": "user", + "workdir": dirname(abspath(report_json)) + } + +The other key:values are added automatically when the reports are compiled for each +process execution. + +Versions +^^^^^^^^ + +Inside the root, the signature key for software version information is ``versions``:: + + "versions": [{ + "program": "progA", + "version": "1.0.0", + "build": "1" + }, { + "program": "progB", + "version": "2.1" + }] + +Only the ``program`` and ``version`` keys are mandatory. + +ReportJson +^^^^^^^^^^ + +Table data +~~~~~~~~~~ + +Inside ``reportJson``, the signature key for table data is ``tableRow``:: + + "reportJson": { + "tableRow": [{ + "sample": "strainA", + "data": [{ + "header": "Raw BP", + "value": 123, + "table": "qc", + }, { + "header": "Coverage", + "value": 32, + "table": "qc" + }], + "sample": "strainB", + "data": [{ + "header": "Raw BP", + "value": 321, + "table": "qc", + }, { + "header": "Coverage", + "value": 22, + "table": "qc" + }] + }] + } + +``tableRow`` should contain an array of JSON for each sample with two key:value pairs: + + - ``sample``: Sample name + - ``data``: Table data (see below). + +``data`` should be an array of JSON with at least three key:value pairs: + + - ``header``: Column header + - ``value``: The data value + - ``table``: Informs to which table this data should go. + +.. note:: + Available ``table`` keys: ``typing``, ``qc``, ``assembly``, ``abricate``, + ``chewbbaca``. + + +Plot data +~~~~~~~~~ + +Inside ``reportJson``, the signature key for plot data is ``plotData``:: + + "reportJson": { + "plotData": [{ + "sample": "strainA", + "data": { + "sparkline": 23123, + "otherplot": [1,2,3] + } + }], + } + +``plotData`` should contain an array of JSON for each sample with two key:value pairs: + + - ``sample``: Sample name + - ``data``: Plot data (see below). + +``data`` should contain a JSON object with the plot signatures as keys, and the relevant +plot data as value. This data can be any object (integer, float, array, JSON, etc). 
+**It will be up to the components in the flowcraft web application to parse this data +and generate the appropriate chart.** + +Warnings and fails +~~~~~~~~~~~~~~~~~~ + +Inside ``reportJson``, the signature key for warnings is ``warnings`` and for +failures is ``fail``:: + + "reportJson": { + "warnings": [{ + "sample": "strainA", + "table": "qc", + "value": ["message 1", "message 2"] + }], + "fail": [{ + "sample": "strainA", + "table": "assembly", + "value": ["message 1"] + }] + } + + +``warnings``/``fail`` should contain an array of JSON for each sample with +two key:value pairs: + + - ``sample``: Sample name + - ``value``: An array with one or more string messages. + - ``table`` **[optional]**: If a table signature is provided, the warning/fail + messages information will appear on that table. Otherwise, it will appear as + a general warning/error that is associated to the sample but not to any particular + table. diff --git a/docs/index.rst b/docs/index.rst index 2f991402..5df3e88b 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -32,6 +32,7 @@ A NextFlow pipeline assembler for genomics. user/pipeline_building user/pipeline_configuration user/pipeline_inspect + user/pipeline_reports user/available_components .. _Developer Guide: @@ -45,6 +46,8 @@ A NextFlow pipeline assembler for genomics. dev/create_template dev/containers dev/process_dotfiles + dev/pipeline_reporting + dev/reports .. _Source API: diff --git a/docs/resources/flowcraft_report.png b/docs/resources/flowcraft_report.png new file mode 100644 index 00000000..9e09fe79 Binary files /dev/null and b/docs/resources/flowcraft_report.png differ diff --git a/docs/resources/flowcraft_report_watch.png b/docs/resources/flowcraft_report_watch.png new file mode 100644 index 00000000..4c17d406 Binary files /dev/null and b/docs/resources/flowcraft_report_watch.png differ diff --git a/docs/resources/reports/abricate_table.png b/docs/resources/reports/abricate_table.png new file mode 100644 index 00000000..e0522eb8 Binary files /dev/null and b/docs/resources/reports/abricate_table.png differ diff --git a/docs/resources/reports/assembly_table_skesa.png b/docs/resources/reports/assembly_table_skesa.png new file mode 100644 index 00000000..a8bf0898 Binary files /dev/null and b/docs/resources/reports/assembly_table_skesa.png differ diff --git a/docs/resources/reports/assembly_table_spades.png b/docs/resources/reports/assembly_table_spades.png new file mode 100644 index 00000000..2b619ef5 Binary files /dev/null and b/docs/resources/reports/assembly_table_spades.png differ diff --git a/docs/resources/reports/assembly_table_viral_assembly.png b/docs/resources/reports/assembly_table_viral_assembly.png new file mode 100644 index 00000000..56044707 Binary files /dev/null and b/docs/resources/reports/assembly_table_viral_assembly.png differ diff --git a/docs/resources/reports/chewbbaca_table.png b/docs/resources/reports/chewbbaca_table.png new file mode 100644 index 00000000..04e48508 Binary files /dev/null and b/docs/resources/reports/chewbbaca_table.png differ diff --git a/docs/resources/reports/contig_size_distribution.png b/docs/resources/reports/contig_size_distribution.png new file mode 100644 index 00000000..91779f47 Binary files /dev/null and b/docs/resources/reports/contig_size_distribution.png differ diff --git a/docs/resources/reports/fastqc_base_gc_content.png b/docs/resources/reports/fastqc_base_gc_content.png new file mode 100644 index 00000000..014ca584 Binary files /dev/null and b/docs/resources/reports/fastqc_base_gc_content.png 
differ diff --git a/docs/resources/reports/fastqc_base_sequence_quality.png b/docs/resources/reports/fastqc_base_sequence_quality.png new file mode 100644 index 00000000..c726f7db Binary files /dev/null and b/docs/resources/reports/fastqc_base_sequence_quality.png differ diff --git a/docs/resources/reports/fastqc_missing_data.png b/docs/resources/reports/fastqc_missing_data.png new file mode 100644 index 00000000..d850f4c9 Binary files /dev/null and b/docs/resources/reports/fastqc_missing_data.png differ diff --git a/docs/resources/reports/fastqc_per_base_sequence_quality.png b/docs/resources/reports/fastqc_per_base_sequence_quality.png new file mode 100644 index 00000000..34376170 Binary files /dev/null and b/docs/resources/reports/fastqc_per_base_sequence_quality.png differ diff --git a/docs/resources/reports/fastqc_sequence_length.png b/docs/resources/reports/fastqc_sequence_length.png new file mode 100644 index 00000000..ce3c018e Binary files /dev/null and b/docs/resources/reports/fastqc_sequence_length.png differ diff --git a/docs/resources/reports/phylogenetic_tree.png b/docs/resources/reports/phylogenetic_tree.png new file mode 100644 index 00000000..fda6a8e8 Binary files /dev/null and b/docs/resources/reports/phylogenetic_tree.png differ diff --git a/docs/resources/reports/quality_control_table.png b/docs/resources/reports/quality_control_table.png new file mode 100644 index 00000000..0a51cbdf Binary files /dev/null and b/docs/resources/reports/quality_control_table.png differ diff --git a/docs/resources/reports/read_mapping_remove_host.png b/docs/resources/reports/read_mapping_remove_host.png new file mode 100644 index 00000000..f4636913 Binary files /dev/null and b/docs/resources/reports/read_mapping_remove_host.png differ diff --git a/docs/resources/reports/sliding_window_amr.png b/docs/resources/reports/sliding_window_amr.png new file mode 100644 index 00000000..c721a40f Binary files /dev/null and b/docs/resources/reports/sliding_window_amr.png differ diff --git a/docs/resources/reports/sparkline.png b/docs/resources/reports/sparkline.png new file mode 100644 index 00000000..3932ef0b Binary files /dev/null and b/docs/resources/reports/sparkline.png differ diff --git a/docs/resources/reports/typing_table.png b/docs/resources/reports/typing_table.png new file mode 100644 index 00000000..ace526ae Binary files /dev/null and b/docs/resources/reports/typing_table.png differ diff --git a/docs/resources/reports/typing_table_dengue.png b/docs/resources/reports/typing_table_dengue.png new file mode 100644 index 00000000..fd243660 Binary files /dev/null and b/docs/resources/reports/typing_table_dengue.png differ diff --git a/docs/user/available_components.rst b/docs/user/available_components.rst index dab996ac..85034e62 100644 --- a/docs/user/available_components.rst +++ b/docs/user/available_components.rst @@ -38,7 +38,7 @@ Reads Quality Control - :doc:`components/trimmomatic`: Runs Trimmomatic on paired-end FastQ files. -- :doc:`components/sample_fastq`: Subsamples fastq files up to a target coverage +- :doc:`components/downsample_fastq`: Subsamples fastq files up to a target coverage depth. 
diff --git a/docs/user/basic_usage.rst b/docs/user/basic_usage.rst index 77776e1f..18f4e944 100644 --- a/docs/user/basic_usage.rst +++ b/docs/user/basic_usage.rst @@ -11,7 +11,7 @@ Build Assembling a pipeline ::::::::::::::::::::: -Pipelines can be generated using the ``build`` execution mode of FlowCraft +Pipelines are generated using the ``build`` mode of FlowCraft and the ``-t`` parameter to specify the :ref:`components ` inside quotes:: flowcraft build -t "trimmomatic fastqc spades" -o my_pipe.nf @@ -19,16 +19,14 @@ and the ``-t`` parameter to specify the :ref:`components ` inside qu All components should be written inside quotes and be space separated. This command will generate a linear pipeline with three components on the current working directory (for more features and tips on how pipelines can be -built, see the :doc:`pipeline building ` section). A linear pipeline means that +built, see the :doc:`pipeline building ` section). **A linear pipeline means that there are no bifurcations between components, and the input data will flow -linearly. In this particular case, the input data of the -pipeline will be paired-end fastq files, since that is the input data type -of the first component, :doc:`trimmomatic `. +linearly.** The rationale of how the data flows across the pipeline is simple and intuitive. Data enters a component and is processed in some way, which may result on the -creation of results (stored in the ``results`` directory) and reports (stored -in the ``reports`` directory) (see `Results and reports`_ below). If that +creation of result files (stored in the ``results`` directory) and reports +files (stored in the ``reports`` directory) (see `Results and reports`_ below). If that component has an ``output_type``, it will feed the processed data into the next component (or components) and this will repeated until the end of the pipeline. @@ -44,8 +42,8 @@ in any browser. The ``integrity_coverage`` component is a dependency of ``trimmomatic``, so it was automatically added to the pipeline. -.. note:: - Not all pipeline variations will work. **You always need to ensure +.. important:: + Not all pipeline configurations will work. **You always need to ensure that the output type of a component matches the input type of the next component**, otherwise FlowCraft will exit with an error. @@ -124,21 +122,39 @@ with ``nextflow`` and using the ``--help`` option:: All these parameters are specific to the components of the pipeline. However, the main input parameter (or parameters) of the pipeline is always available. -In this case, since the pipeline started with fastq paired-end files as the -main input, the ``--fastq`` parameter is available. If the pipeline started +**In this case, since the pipeline started with fastq paired-end files as the +main input, the** ``--fastq`` **parameter is available.** If the pipeline started with any other input type or with more than one input type, the appropriate -parameters would appear. These parameters can be provided on run-time or -edited in the ``params.config`` file. +parameters will appear (more information in the :ref:`raw input types` section). -The parameters are composed by its name (`adapters`) followed by the ID of -the process it refers to (`_1_2`). The IDs can be consulted in the DAG tree -(See `Assembling a pipeline`_). As such, all parameters will be independent -between different components, **even if the parameter name is the same**. 
This +The parameters are composed by their name (``adapters``) followed by the ID of +the process it refers to (``_1_2``). The IDs can be consulted in the DAG tree +(See `Assembling a pipeline`_). This is done to prevent issues when duplicating +components and, as such, **all parameters will be independent between different +components**. This behaviour can be changed when building the pipeline by using the ``--merge-params`` option (See :ref:`mergeParams`). -Executing the pipeline -:::::::::::::::::::::: +.. note:: + The ``--merge-params`` option of the ``build`` mode will merge all parameters + with identical names (`e.g.:` ``--genomeSize_1_1`` and ``--genomeSize_1_5`` + become simply ``--genomeSize``) . This is usually more appropriate and useful + in linear pipelines without component duplication. + + +Providing/modifying parameters +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +These parameters can be provided on run-time:: + + nextflow run my_pipe.nf --genomeSize_1_1 5 --adapters_1_2 "/path/to/adapters" + +or edited in the ``params.config`` file:: + + params { + genomeSize_1_1 = 5 + adapters_1_2 = "path/to/adapters" + } Most parameters in FlowCraft's components already come with sensible defaults, which means that usually you'll only need to provide a small number @@ -152,7 +168,7 @@ We'll need to provide the pattern to the fastq files. This pattern is perhaps a bit confusing at first, but it's necessary for the correct inference of the paired:: - nextflow run my_pipe.nf --fastq "data/*_{1,2}.*" + --fastq "data/*_{1,2}.*" In this case, the pairs are separated by the "_1." or "_2." substring, which leads to the pattern ``*_{1,2}.*``. Another common nomenclature for paired fastq @@ -165,8 +181,16 @@ acceptable pattern would be ``*_R{1,2}_*``. to allow nextflow to resolve the pattern, otherwise your shell might try to resolve it and provide the wrong input to nextflow. +Execution +--------- + +Once you build your pipeline with Flowcraft you have a standard nextflow pipeline +ready to run. Therefore, all you need to do is:: + + nextflow run my_pipe.nf --fastq "data/*_{1,2}.* + Changing executor and container engine -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +:::::::::::::::::::::::::::::::::::::: The default run mode of an FlowCraft pipeline is to be executed locally and using the singularity container engine. In nextflow terms, this is @@ -196,7 +220,7 @@ Other container engines are: .. _supported by nextflow: https://www.nextflow.io/docs/latest/executor.html Docker images -^^^^^^^^^^^^^ +::::::::::::: All components of FlowCraft are executed in containers, which means that the first time they are executed in a machine, **the corresponding image will have @@ -302,7 +326,7 @@ should be run in the folder where the pipeline is running**:: flowcraft inspect -m broadcast -This will output a url to the terminal that can be opened in a browser. +This will output an URL to the terminal that can be opened in a browser. This is an example of the screen that is displayed once the url is opened: .. image:: ../resources/flowcraft_inspect_broadcast.png @@ -325,4 +349,55 @@ Want to know more? Reports ------- -Coming soon... +The reporting of a FlowCraft pipeline is saved on a JSON file that is stored +in ``pipeline_reports/pipeline_report.json``. To visualize the reports you'll just +need to execute the following command in the folder where the pipeline was executed:: + + flowcraft report + +This will output an URL to the terminal that can be opened in a browser. 
+This is an example of the screen that is displayed once the url is opened: + +.. image:: ../resources/flowcraft_report.png + :align: center + +**The actual layout and content of the reports will depend on the pipeline you +build and it will only provide the information that is directly related to +your pipeline components.** + +.. important:: + This pipeline report will be available for **anyone** via the provided URL, + which means that the URL can be shared with anyone and/or any device with + a browser. **However, the report section will only be available while + the** ``flowcraft report`` **command is running. Once this command + is cancelled, the data will be erased from the service and the URL will + no longer be available**. + +Real time reports +::::::::::::::::: + +The reports of any FlowCraft pipeline can be monitored in real-time using the +``--watch`` option:: + + flowcraft report --watch + +This will output an URL exactly as in the previous section and will render the +same reports page with a small addition. In the top right of the screen in the +navigation bar, there will be a new icon that informs the user when new +reports are available: + +.. image:: ../resources/flowcraft_report_watch.png + :align: center + +Local visualization +::::::::::::::::::: + +The FlowCraft report JSON file can also be visualized locally by drag and dropping +it into the FlowCraft web application page, currently hosted at http://192.92.149.169/reports + +Offline visualization +::::::::::::::::::::: + +The complete FlowCraft report is also available as a standalone HTML file that +can be visualized offline. This HTML file, stored in +``pipeline_reports/pipeline_report.html``, can be opened in any modern browser. \ No newline at end of file diff --git a/docs/user/components/assembly_mapping.rst b/docs/user/components/assembly_mapping.rst index 447e8e7e..feb936fa 100644 --- a/docs/user/components/assembly_mapping.rst +++ b/docs/user/components/assembly_mapping.rst @@ -68,5 +68,9 @@ Template Reports JSON ^^^^^^^^^^^^ -``warnings``: Message with execution warnings -``fail``: Messages with execution failures \ No newline at end of file +``plotData``: + - ``sparkline``: Total number of base pairs. +``warnings``: + - When the number of contigs exceeds a provided threshold. +``fail``: + - When the genome size is below 80% or above 150% of the expected genome size. \ No newline at end of file diff --git a/docs/user/components/check_coverage.rst b/docs/user/components/check_coverage.rst index 8edf55cd..45fc36a7 100644 --- a/docs/user/components/check_coverage.rst +++ b/docs/user/components/check_coverage.rst @@ -62,5 +62,6 @@ Reports JSON ^^^^^^^^^^^^ ``tableRow``: - - ``Coverage (2nd)``: Estimated coverage -``minCoverage``: Minimum coverage specified for the module \ No newline at end of file + - ``Coverage``: Estimated coverage. +``fail``: + - When estimated coverage is below the provided threshold. 
\ No newline at end of file diff --git a/docs/user/components/sample_fastq.rst b/docs/user/components/downsample_fastq.rst similarity index 83% rename from docs/user/components/sample_fastq.rst rename to docs/user/components/downsample_fastq.rst index 29de2826..4ad7bd67 100644 --- a/docs/user/components/sample_fastq.rst +++ b/docs/user/components/downsample_fastq.rst @@ -1,10 +1,10 @@ -sample_fastq -========== +downsample_fastq +================ Purpose ------- -sample_fastq uses seqtk to subsample fastq read data to a target coverage depth +downsample_fastq uses seqtk to subsample fastq read data to a target coverage depth if the estimated coverage is higher than the provided target depth. When no subsample is required, it outputs the original FastQ files. @@ -49,4 +49,4 @@ Reports JSON ^^^^^^^^^^^^ ``tableRow``: - - ``Coverage``: \ No newline at end of file + - ``Coverage``: Estimated coverage. \ No newline at end of file diff --git a/docs/user/components/fastqc.rst b/docs/user/components/fastqc.rst index 4b0cb25f..6635a50e 100644 --- a/docs/user/components/fastqc.rst +++ b/docs/user/components/fastqc.rst @@ -57,10 +57,17 @@ Template Reports JSON ^^^^^^^^^^^^ -``tableRow``: - - ``Contigs``: Number of contigs - - ``Assembled BP``: Number of assembled base pairs ``plotData``: - - ``size_dist``: Distribution of contig size. - - ``gcSliding``: Sliding window of the GC content along the genome - - ``covSliding``: Sliding window of the coverage along the genome + - ``base_sequence_quality``: Per base sequence quality data + - (This structure is repeated for the other entries) + - ``status``: Status of the category (PASS, WARN, etc) + - ``data``: Plot data + - ``sequence_quality``: Per sequence quality data + - ``base_gc_content``: GC content distribution + - ``base_n_content``: Per base N content + - ``sequence_length_dist``: Distribution of sequence read length + - ``per_base_sequence_content``: Per base sequence content +``warnings``: + - List of failures or warnings for some non-sensitive FastQC categories +``fail``: + - Failure message when sensitive FastQC categories fail or do not pass. diff --git a/docs/user/components/integrity_coverage.rst b/docs/user/components/integrity_coverage.rst index 17ea1d7f..1b3d0254 100644 --- a/docs/user/components/integrity_coverage.rst +++ b/docs/user/components/integrity_coverage.rst @@ -82,9 +82,12 @@ Reports JSON ^^^^^^^^^^^^ ``tableRow``: - - ``Raw BP``: Number of nucleotides - - ``Reads``: Number of reads - - ``Coverage (1st)``: Estimated coverage + - ``Raw BP``: Number of nucleotides. + - ``Reads``: Number of reads. + - ``Coverage``: Estimated coverage. ``plotData``: - - ``sparkline``: Number of nucleotides -``minCoverage``: Minimum coverage specified for the module \ No newline at end of file + - ``sparkline``: Number of nucleotides. +``warnings``: + - When the enconding and/or phred score cannot be inferred from FastQ files. +``fail``: + - When estimated coverage is below the provided threshold. \ No newline at end of file diff --git a/docs/user/components/mlst.rst b/docs/user/components/mlst.rst index a0f4b432..3ffdc892 100644 --- a/docs/user/components/mlst.rst +++ b/docs/user/components/mlst.rst @@ -45,5 +45,9 @@ Advanced Reports JSON ^^^^^^^^^^^^ -``expectedSpecies``: Name of the expected species -``species``: Name of inferred species +``tableRow``: + - ``mlst``: Predicted species. +``expectedSpecies``: Name of the expected species. + +``species``: Name of inferred species. 
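The ``Reports JSON`` sections above all describe component output in terms of the signature keys (``tableRow``, ``plotData``, ``warnings`` and ``fail``) defined in the report JSON specification. As a purely illustrative sketch (the sample name, header, values and threshold below are hypothetical and not taken from any real component), a template script could assemble and write such a ``.report.json`` roughly as follows::

    import json

    def write_report(sample_id, coverage, threshold):
        # Illustrative only: build the report entry for a single sample using
        # the tableRow/plotData/fail conventions described above.
        report = {
            "tableRow": [{
                "sample": sample_id,
                "data": [
                    {"header": "Coverage", "value": coverage, "table": "qc"}
                ]
            }],
            "plotData": [{
                "sample": sample_id,
                "data": {"sparkline": 4123456}  # hypothetical plot payload
            }]
        }
        if coverage < threshold:
            report["fail"] = [{
                "sample": sample_id,
                "table": "qc",
                "value": ["Estimated coverage ({}) is below the threshold "
                          "({})".format(coverage, threshold)]
            }]
        with open(".report.json", "w") as fh:
            fh.write(json.dumps(report, separators=(",", ":")))

    write_report("sampleA", 12.5, 30)

A dotfile written this way is then collected by the compiler channels from the process work directory and merged into the final pipeline report, as described in the developer documentation.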
+ diff --git a/docs/user/components/pilon.rst b/docs/user/components/pilon.rst index e2441101..d9c611ba 100644 --- a/docs/user/components/pilon.rst +++ b/docs/user/components/pilon.rst @@ -61,9 +61,20 @@ Template Reports JSON ^^^^^^^^^^^^ - ``tableRow``: - ``Trimmed (%)``: Percentage of trimmed nucleotides + - ``Contigs``: Number of contigs. + - ``Assembled BP``: Number of assembled base pairs. ``plotData``: - ``sparkline``: Number of nucleotides after trimming -``badReads``: Number of discarded reads + - ``size_dist``: Distribution of contig size. + - ``sparkline``: Number of assembled base pairs. + - ``genomeSliding``: + - ``gcData``: Genome sliding window of GC content. + - ``covData``: Genome sliding window of read coverage depth. + - ``window``: Size of sliding window + - ``xbars``: Position of contigs along the genome sliding window. + - ``assemblyFile``: Name of the input assembly file. +``warnings``: + - When the number of contigs exceeds a given threshold. +``fail``: + - When the genome size is below 80% or above 150% of the expected genome size. + diff --git a/docs/user/components/process_skesa.rst b/docs/user/components/process_skesa.rst index c61051b2..24e9a652 100644 --- a/docs/user/components/process_skesa.rst +++ b/docs/user/components/process_skesa.rst @@ -56,7 +56,9 @@ Reports JSON ^^^^^^^^^^^^ ``tableRow``: - - ``Contigs ()``: Number of contigs - - ``Assembled BP ()``: Number of assembled base pairs + - ``Contigs ()``: Number of contigs. + - ``Assembled BP ()``: Number of assembled base pairs. ``warnings``: - - ``process_assembly``: Failure messages + - When the number of contigs exceeds a given threshold. +``fail``: + - When the genome size is below 80% or above 150% of the expected genome size. diff --git a/docs/user/components/process_spades.rst b/docs/user/components/process_spades.rst index 82bc021b..9c4d71c5 100644 --- a/docs/user/components/process_spades.rst +++ b/docs/user/components/process_spades.rst @@ -57,7 +57,10 @@ Reports JSON ^^^^^^^^^^^^ ``tableRow``: - - ``Contigs ()``: Number of contigs - - ``Assembled BP ()``: Number of assembled base pairs + - ``Contigs ()``: Number of contigs. + - ``Assembled BP ()``: Number of assembled base pairs. ``warnings``: - - ``process_assembly``: Failure messages \ No newline at end of file + - When the number of contigs exceeds a given threshold. +``fail``: + - When the genome size is below 80% or above 150% of the expected genome size. + ``process_assembly``: Failure messages \ No newline at end of file diff --git a/docs/user/pipeline_building.rst b/docs/user/pipeline_building.rst index b5fc49cd..c65373e9 100644 --- a/docs/user/pipeline_building.rst +++ b/docs/user/pipeline_building.rst @@ -4,6 +4,8 @@ Pipeline building FlowCraft offers a few extra features when building pipelines using the ``build`` execution mode. +.. _rawInput: + Raw input types --------------- diff --git a/docs/user/pipeline_reports.rst b/docs/user/pipeline_reports.rst new file mode 100644 index 00000000..3a563f4b --- /dev/null +++ b/docs/user/pipeline_reports.rst @@ -0,0 +1,23 @@ +Pipeline reports +================ + +.. include:: reports/abricate.rst +.. include:: reports/assembly_mapping.rst +.. include:: reports/check_coverage.rst +.. include:: reports/chewbbaca.rst +.. include:: reports/dengue_typing.rst +.. include:: reports/fastqc.rst +.. include:: reports/fastqc_trimmomatic.rst +.. include:: reports/integrity_coverage.rst +.. include:: reports/mlst.rst +.. include:: reports/patho_typing.rst +.. include:: reports/pilon.rst +.. 
include:: reports/process_mapping.rst +.. include:: reports/process_newick.rst +.. include:: reports/process_skesa.rst +.. include:: reports/process_spades.rst +.. include:: reports/process_viral_assembly.rst +.. include:: reports/seq_typing.rst +.. include:: reports/trimmomatic.rst +.. include:: reports/true_coverage.rst + diff --git a/docs/user/reports/abricate.rst b/docs/user/reports/abricate.rst new file mode 100644 index 00000000..34175a79 --- /dev/null +++ b/docs/user/reports/abricate.rst @@ -0,0 +1,20 @@ +abricate +-------- + +Table data +^^^^^^^^^^ + +AMR table: + - ****: Number of hits for a given database. + +.. image:: ../resources/reports/abricate_table.png + :align: center + +Plot data +^^^^^^^^^ + +- **Sliding window AMR annotation**: Provides annotation of Abricate hits for + each database along the genome. This report component is only available when + the ``pilon`` component was used downstream of ``abricate``. + +.. image:: ../resources/reports/sliding_window_amr.png \ No newline at end of file diff --git a/docs/user/reports/assembly_mapping.rst b/docs/user/reports/assembly_mapping.rst new file mode 100644 index 00000000..bd24ab75 --- /dev/null +++ b/docs/user/reports/assembly_mapping.rst @@ -0,0 +1,23 @@ +assembly_mapping +---------------- + +Plot data +^^^^^^^^^ + +- **Data loss chart**: Gives a trend of the data loss + (in total number of base pairs) across components that may filter this data. + +.. image:: ../resources/reports/sparkline.png + +Warnings +^^^^^^^^ + +Assembly table: + - When the number of contigs exceeds the threshold of 100 contigs per 1.5Mb. + +Fails +^^^^^ + +Assembly table: + - When the assembly size is smaller than 80% or larger than 150% of the + expected genome size. \ No newline at end of file diff --git a/docs/user/reports/check_coverage.rst b/docs/user/reports/check_coverage.rst new file mode 100644 index 00000000..f3e816a0 --- /dev/null +++ b/docs/user/reports/check_coverage.rst @@ -0,0 +1,24 @@ +check_coverage +-------------- + +Table data +^^^^^^^^^^ + +Quality control table: + - **Coverage**: Estimated coverage based on the number of base pairs and the expected + genome size. + +.. image:: ../resources/reports/quality_control_table.png + :align: center + +Warnings +^^^^^^^^ + +Quality control table: + - When the encoding and phred score cannot be guessed from the FastQ file(s). + +Fails +^^^^^ + +Quality control table: + - When the sample has lower estimated coverage than the provided coverage threshold. \ No newline at end of file diff --git a/docs/user/reports/chewbbaca.rst b/docs/user/reports/chewbbaca.rst new file mode 100644 index 00000000..86935457 --- /dev/null +++ b/docs/user/reports/chewbbaca.rst @@ -0,0 +1,12 @@ +chewbbaca +--------- + +Table data +^^^^^^^^^^ + +Chewbbaca table: + - Table with the summary statistics of ChewBBACA allele calling, including + the number of exact matches, inferred loci, loci not found, etc. + +.. image:: ../resources/reports/chewbbaca_table.png + :align: center \ No newline at end of file diff --git a/docs/user/reports/dengue_typing.rst b/docs/user/reports/dengue_typing.rst new file mode 100644 index 00000000..d1175696 --- /dev/null +++ b/docs/user/reports/dengue_typing.rst @@ -0,0 +1,11 @@ +dengue_typing +------------- + +Table data +^^^^^^^^^^ + +Typing table: + - **seqtyping**: The sequence typing result (serotype-genotype). + +..
image:: ../resources/reports/typing_table_dengue.png + :align: center \ No newline at end of file diff --git a/docs/user/reports/fastqc.rst b/docs/user/reports/fastqc.rst new file mode 100644 index 00000000..a008642d --- /dev/null +++ b/docs/user/reports/fastqc.rst @@ -0,0 +1,49 @@ +fastqc +------ + +Plot data +^^^^^^^^^ + +- **Base sequence quality**: The average quality score across the read length. + +.. image:: ../resources/reports/fastqc_base_sequence_quality.png + +- **Sequence quality**: Distribution of the mean sequence quality score. + +.. image:: ../resources/reports/fastqc_per_base_sequence_quality.png + +- **Base GC content**: Distribution of the GC content of each sequence. + +.. image:: ../resources/reports/fastqc_base_gc_content.png + +- **Sequence length**: Distribution of the read sequence length. + +.. image:: ../resources/reports/fastqc_sequence_length.png + +- **Missing data**: Normalized count of missing data across the read length. + +.. image:: ../resources/reports/fastqc_missing_data.png + + +Warnings +^^^^^^^^ + +The following FastQC categories will issue a warning when they have a ``WARN`` flag: + - Per base sequence quality. + - Overrepresented sequences. + +The following FastQC categories will issue a warning when they do not have a ``PASS`` flag: + - Per base sequence content. + +Fails +^^^^^ + +The following FastQC categories will issue a fail when they have a ``FAIL`` flag: + - Per base sequence quality. + - Overrepresented sequences. + - Sequence length distribution. + - Per sequence GC content. + +The following FastQC categories will issue a fail when they do not have a ``PASS`` flag: + - Per base N content. + - Adapter content. diff --git a/docs/user/reports/fastqc_trimmomatic.rst b/docs/user/reports/fastqc_trimmomatic.rst new file mode 100644 index 00000000..c723478c --- /dev/null +++ b/docs/user/reports/fastqc_trimmomatic.rst @@ -0,0 +1,21 @@ +fastqc_trimmomatic +------------------ + +Table data +^^^^^^^^^^ + +Quality control table: + - **Trimmed (%)**: Percentage of trimmed base pairs. + +.. image:: ../resources/reports/quality_control_table.png + :scale: 80 % + :align: center + +Plot data +^^^^^^^^^ + +- **Data loss chart**: Gives a trend of the data loss + (in total number of base pairs) across components that may filter this data. + +.. image:: ../resources/reports/sparkline.png + diff --git a/docs/user/reports/integrity_coverage.rst b/docs/user/reports/integrity_coverage.rst new file mode 100644 index 00000000..537b40ec --- /dev/null +++ b/docs/user/reports/integrity_coverage.rst @@ -0,0 +1,34 @@ +integrity_coverage +------------------ + +Table data +^^^^^^^^^^ + +Quality control table: + - **Raw BP**: Number of raw base pairs from the FastQ file(s). + - **Reads**: Number of reads in the FastQ file(s). + - **Coverage**: Estimated coverage based on the number of base pairs and the expected + genome size. + +.. image:: ../resources/reports/quality_control_table.png + :align: center + +Plot data +^^^^^^^^^ + +- **Data loss chart**: Gives a trend of the data loss + (in total number of base pairs) across components that may filter this data. + +.. image:: ../resources/reports/sparkline.png + +Warnings +^^^^^^^^ + +Quality control table: + - When the encoding and phred score cannot be guessed from the FastQ file(s). + +Fails +^^^^^ + +Quality control table: + - When the sample has lower estimated coverage than the provided coverage threshold.
\ No newline at end of file diff --git a/docs/user/reports/mlst.rst b/docs/user/reports/mlst.rst new file mode 100644 index 00000000..e274b23f --- /dev/null +++ b/docs/user/reports/mlst.rst @@ -0,0 +1,13 @@ +mlst +---- + +Table data +^^^^^^^^^^ + +Typing table: + - **MLST species**: The inferred species name. + - **MLST ST**: The inferred sequence type. + +.. image:: ../resources/reports/typing_table.png + :scale: 80 % + :align: center \ No newline at end of file diff --git a/docs/user/reports/patho_typing.rst b/docs/user/reports/patho_typing.rst new file mode 100644 index 00000000..1103d201 --- /dev/null +++ b/docs/user/reports/patho_typing.rst @@ -0,0 +1,12 @@ +patho_typing +------------ + +Table data +^^^^^^^^^^ + +Typing table: + - **Patho_typing**: The pathotyping result. + +.. image:: ../resources/reports/typing_table.png + :scale: 80 % + :align: center \ No newline at end of file diff --git a/docs/user/reports/pilon.rst b/docs/user/reports/pilon.rst new file mode 100644 index 00000000..b7312d53 --- /dev/null +++ b/docs/user/reports/pilon.rst @@ -0,0 +1,38 @@ +pilon +----- + +Table data +^^^^^^^^^^ + +Quality control table: + - **Contigs**: Number of assembled contigs. + - **Assembled BP**: Total number of assembled base pairs. + +.. image:: ../resources/reports/assembly_table_skesa.png + :scale: 80 % + :align: center + +Plot data +^^^^^^^^^ + +- **Contig size distribution**: Distribution of the size of each assembled contig. + +.. image:: ../resources/reports/contig_size_distribution.png + +- **Sliding window coverage and GC content**: Provides coverage and GC content + metrics along the genome using a sliding window approach and two synchronised + charts. + +.. image:: ../resources/reports/sliding_window_amr.png + +Warnings +^^^^^^^^ + +Assembly table: + - When the number of contigs exceeds the threshold of 100 contigs per 1.5Mb. + +Fails +^^^^^ + +Assembly table: + - When the assembly size is smaller than 80% or larger than 150% of the expected genome size. \ No newline at end of file diff --git a/docs/user/reports/process_mapping.rst b/docs/user/reports/process_mapping.rst new file mode 100644 index 00000000..a1b84cd3 --- /dev/null +++ b/docs/user/reports/process_mapping.rst @@ -0,0 +1,15 @@ +process_mapping +--------------- + +Table data +^^^^^^^^^^ + +Read mapping table: + - **Reads**: Number of reads in the FastQ file(s). + - **Unmapped**: Number of unmapped reads. + - **Mapped 1x**: Number of reads that aligned, concordantly or discordantly, exactly 1 time. + - **Mapped >1x**: Number of reads that aligned, concordantly or discordantly, more than 1 time. + - **Overall alignment rate (%)**: Overall alignment rate. + +.. image:: ../resources/reports/read_mapping_remove_host.png + :align: center diff --git a/docs/user/reports/process_newick.rst b/docs/user/reports/process_newick.rst new file mode 100644 index 00000000..630b6e84 --- /dev/null +++ b/docs/user/reports/process_newick.rst @@ -0,0 +1,11 @@ +process_newick +-------------- + +Tree data +^^^^^^^^^^ + +Phylogenetic reconstruction with bootstrap values for the provided tree. + + +.. image:: ../resources/reports/phylogenetic_tree.png + :align: center \ No newline at end of file diff --git a/docs/user/reports/process_skesa.rst b/docs/user/reports/process_skesa.rst new file mode 100644 index 00000000..ea067a28 --- /dev/null +++ b/docs/user/reports/process_skesa.rst @@ -0,0 +1,26 @@ +process_skesa +------------- + +Table data +^^^^^^^^^^ + +Quality control table: + - **Contigs (skesa)**: Number of assembled contigs.
+ - **Assembled BP**: Total number of assembled base pairs. + +.. image:: ../resources/reports/assembly_table_skesa.png + :scale: 80 % + :align: center + +Warnings +^^^^^^^^ + +Assembly table: + - When the number of contigs exceeds the threshold of 100 contigs per 1.5Mb. + +Fails +^^^^^ + +Assembly table: + - When the assembly size if smaller than 80% or larger than 150% of the + expected genome size. diff --git a/docs/user/reports/process_spades.rst b/docs/user/reports/process_spades.rst new file mode 100644 index 00000000..0c1f2524 --- /dev/null +++ b/docs/user/reports/process_spades.rst @@ -0,0 +1,26 @@ +process_spades +------------- + +Table data +^^^^^^^^^^ + +Quality control table: + - **Contigs (spades)**: Number of assembled contigs. + - **Assembled BP**: Total number of assembled base pairs. + +.. image:: ../resources/reports/assembly_table_spades.png + :scale: 80 % + :align: center + +Warnings +^^^^^^^^ + +Assembly table: + - When the number of contigs exceeds the threshold of 100 contigs per 1.5Mb. + +Fails +^^^^^ + +Assembly table: + - When the assembly size if smaller than 80% or larger than 150% of the + expected genome size. diff --git a/docs/user/reports/process_viral_assembly.rst b/docs/user/reports/process_viral_assembly.rst new file mode 100644 index 00000000..a1fad4bc --- /dev/null +++ b/docs/user/reports/process_viral_assembly.rst @@ -0,0 +1,23 @@ +process_viral_assembly +---------------------- + +Table data +^^^^^^^^^^ + +Quality control table: + - **Contigs (SPAdes)**: Number of assembled contigs. + - **Assembled BP (SPAdes)**: Total number of assembled base pairs. + - **ORFs**: Number of complete ORFs in the assembly. + - **Contigs (MEGAHIT)**: Number of assembled contigs. + - **Assembled BP (MEGAHIT)**: Total number of assembled base pairs. + + +.. image:: ../resources/reports/assembly_table_viral_assembly.png + :align: center + +Fails +^^^^^ + +Assembly table: + - When the assembly size if smaller than 80% or larger than 150% of the + expected genome size. diff --git a/docs/user/reports/seq_typing.rst b/docs/user/reports/seq_typing.rst new file mode 100644 index 00000000..4437e9d6 --- /dev/null +++ b/docs/user/reports/seq_typing.rst @@ -0,0 +1,11 @@ +seq_typing +---------- + +Table data +^^^^^^^^^^ + +Typing table: + - **seqtyping**: The sequence typing result. + +.. image:: ../resources/reports/typing_table.png + :align: center \ No newline at end of file diff --git a/docs/user/reports/trimmomatic.rst b/docs/user/reports/trimmomatic.rst new file mode 100644 index 00000000..275b556f --- /dev/null +++ b/docs/user/reports/trimmomatic.rst @@ -0,0 +1,20 @@ +trimmomatic +----------- + +Table data +^^^^^^^^^^ + +Quality control table: + - **Trimmed (%)**: Percentage of trimmed base pairs. + +.. image:: ../resources/reports/quality_control_table.png + :align: center + +Plot data +^^^^^^^^^ + +- **Data loss chart**: Gives a trend of the data loss + (in total number of base pairs) across components that may filter this data. + +.. image:: ../resources/reports/sparkline.png + diff --git a/docs/user/reports/true_coverage.rst b/docs/user/reports/true_coverage.rst new file mode 100644 index 00000000..502bfc88 --- /dev/null +++ b/docs/user/reports/true_coverage.rst @@ -0,0 +1,17 @@ +true_coverage +------------- + +Table data +^^^^^^^^^^ + +Quality control table: + - **True Coverage**: Estimated coverage based on read mapping on MLST genes. + +.. 
image:: ../resources/reports/quality_control_table.png + :align: center + +Fails +^^^^^ + +Quality control table: + - When the sample has lower estimated coverage than the provided coverage threshold. \ No newline at end of file diff --git a/flowcraft/__init__.py b/flowcraft/__init__.py index 01943223..c341f0a7 100644 --- a/flowcraft/__init__.py +++ b/flowcraft/__init__.py @@ -1,6 +1,6 @@ -__version__ = "1.2.2" -__build__ = "29082018" +__version__ = "1.3.0" +__build__ = "21092018" __author__ = "Diogo N. Silva, Tiago F. Jesus, Ines Mendes, Bruno Ribeiro-Goncalves" __copyright__ = "Diogo N. Silva" __license__ = "GPL3" diff --git a/flowcraft/bin/merge_json.py b/flowcraft/bin/merge_json.py index 8754af5d..71212acd 100755 --- a/flowcraft/bin/merge_json.py +++ b/flowcraft/bin/merge_json.py @@ -3,7 +3,12 @@ import sys import json -core_file, f1, f2 = sys.argv[1:] +core_file, f1, f2 = sys.argv[1:4] + +try: + sample_id = sys.argv[4] +except IndexError: + sample_id = None def get_core_genes(core_file): @@ -49,6 +54,32 @@ def assess_quality(core_array, core_genes): return status, perc +def get_table_data(data_obj, sample_id=None): + + header_map = dict((p, h) for p, h in enumerate(data_obj["header"])) + table_data = [] + + for sample, data in data_obj.items(): + + if sample == "header": + continue + + cur_data = [] + for pos, d in enumerate(data): + cur_data.append({ + "header": header_map[pos], + "value": d, + "table": "chewbbaca" + }) + + table_data.append({ + "sample": sample_id if sample_id else sample, + "data": cur_data + }) + + return table_data + + def main(): core_genes = get_core_genes(core_file) @@ -57,14 +88,24 @@ def main(): j1 = json.load(f1h) j2 = json.load(f2h) - current_result = [v for k, v in j1.items() - if "polished.fasta" in k][0] + sample_info = [(k, v) for k, v in j1.items() if "header" not in k] current_array = j1["header"] - core_results = filter_core_genes(current_result, current_array, - core_genes) - status, perc = assess_quality(core_results, core_genes) - - res = {"cagao": [j1, j2], "status": status, 'lnfPercentage': perc} + status_info = [] + for sample, info in sample_info: + + sample_name = sample_id if sample_id else sample + + core_results = filter_core_genes(info, current_array, core_genes) + status, perc = assess_quality(core_results, core_genes) + status_info.append({ + "sample": sample_name, + "status": status, + "lnfPercentage": perc + }) + + table_data = get_table_data(j2, sample_name) + res = {"cagao": [j1, j2], "status": status_info, + "tableRow": table_data} with open(".report.json", "w") as fh: fh.write(json.dumps(res, separators=(",", ":"))) diff --git a/flowcraft/bin/metadata_POST.sh b/flowcraft/bin/metadata_POST.sh new file mode 100644 index 00000000..227180ae --- /dev/null +++ b/flowcraft/bin/metadata_POST.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env sh + +set -ex + +projectid=$1 +pipelineid=$2 +processid=$3 +sample=$4 +url=$5 +username=$6 +userid=$7 +task=$8 +species=$9 + +metadata_str="{}" + +# If a .report.json file was populated, set the json_str variable +if [ -s .metadata.json ]; +then + metadata_str=$(cat $(pwd)/.metadata.json | sed 's/ /%20/g' | sed s/\"/\'/g) +fi + +# If a .versions OR .report.json file was populated send the request +if [ ! 
"$metadata_str" = "{}" ]; +then + workdir=$(pwd) + json="{'projectid':'$projectid','pipelineId':'$pipelineid','processId':'nfMetadata','sample_name':'$sample','nfMetadata':$metadata_str,'username':'$username','userId':'$userid','workdir':'$workdir','task':'nfMetadata','processName':'nfMetadata','species':'$species','overwrite':'false'}" + echo \"${json}\" > .final.json + { + cat .final.json | curl -H "Content-Type: application/json" -k -L -X POST -d @- $url > /dev/null + } || { + echo Curl request failed + } + +fi diff --git a/flowcraft/bin/prepare_reports.py b/flowcraft/bin/prepare_reports.py index c2af93cb..f1217da5 100755 --- a/flowcraft/bin/prepare_reports.py +++ b/flowcraft/bin/prepare_reports.py @@ -2,29 +2,59 @@ import sys import json +import logging from os.path import dirname, abspath +logger = logging.getLogger("main.{}".format(__name__)) -def write_json(report_json, task_name, sample_name, pid): - with open(report_json) as fh: - res = json.load(fh) +def write_json(report_json, version_json, trace_file, task_name, + project_name, sample_name, pid, script_id, run_name): - res["task"] = task_name - del res["task"] + logging.info("Parsing report JSON") + try: + with open(report_json) as fh: + _reports = fh.read().replace("'", '"') + reports = json.loads(_reports) + if "task" in reports: + del reports["task"] + except json.JSONDecodeError: + logging.warning("Could not parse report JSON: {}".format(report_json)) + reports = {} + + logging.info("Parsing versions JSON") + try: + with open(version_json) as fh: + _version = fh.read().replace("'", '"') + versions = json.loads(_version) + except json.JSONDecodeError: + logging.warning("Could not parse versions JSON: {}".format( + version_json)) + versions = [] + + logging.info("Parsing trace file") + with open(trace_file) as fh: + trace = fh.readlines() report = { - "reportJson": res, + "pipelineId": run_name, "processId": pid, - "pipelineId": 1, - "projectid": 1, + "processName": task_name, + "projectid": run_name, + "reportJson": reports, + "runName": run_name, + "scriptId": script_id, + "versions": versions, + "sampleName": sample_name, + "trace": trace, "userId": 1, "username": "user", - "processName": task_name, "workdir": dirname(abspath(report_json)) } + logging.info("Dumping final report JSON file") + logging.debug("Final JSON file: {}".format(report)) with open("{}_{}_report.json".format(task_name, sample_name), "w") \ as report_fh: report_fh.write(json.dumps(report, separators=(",", ":"))) @@ -32,24 +62,30 @@ def write_json(report_json, task_name, sample_name, pid): def main(): + # Fetch arguments args = sys.argv[1:] report_json = args[0] - sample_name = args[1] - task_name = args[2] - project_name = args[3] - pid = args[4] + version_json = args[1] + trace = args[2] + sample_name = args[3] + task_name = args[4] + project_name = args[5] + pid = args[6] + script_id = args[7] + run_name = args[8] + logging.debug("Report JSON: {}".format(report_json)) + logging.debug("Version JSON: {}".format(version_json)) + logging.debug("Trace file: {}".format(trace)) + logging.debug("Sample name: {}".format(sample_name)) + logging.debug("Task name: {}".format(task_name)) + logging.debug("Project name: {}".format(project_name)) + logging.debug("Process ID: {}".format(pid)) + logging.debug("Script ID: {}".format(script_id)) + logging.debug("Run name: {}".format(run_name)) - print(report_json) - print(sample_name) - print(task_name) - print(project_name) - print(pid) - - try: - write_json(report_json, task_name, sample_name, pid) - except 
json.decoder.JSONDecodeError: - print("Could not parse JSON output from {}, sample name {} and " - "pid {}".format(report_json, sample_name, pid)) + # Write the final report JSON that compiles all information + write_json(report_json, version_json, trace, task_name, + project_name, sample_name, pid, script_id, run_name) main() diff --git a/flowcraft/bin/report_POST.sh b/flowcraft/bin/report_POST.sh index 059924f6..d2519e3c 100755 --- a/flowcraft/bin/report_POST.sh +++ b/flowcraft/bin/report_POST.sh @@ -27,14 +27,14 @@ then then json_str=$(cat $(pwd)/.report.json | sed 's/ //g' | sed s/\"/\'/g) else - json_str=$(cat $(pwd)/.report.json | sed 's/ /_/g' | sed s/\"/\'/g) + json_str=$(cat $(pwd)/.report.json | sed 's/ /%20/g' | sed s/\"/\'/g) fi fi # If a .versions file was populated, set the version_str variable if [ -s .versions ]; then - version_str=$(< $(pwd)/.versions sed 's/ /_/g' | sed s/\"/\'/g) + version_str=$(< $(pwd)/.versions sed 's/ /%20/g' | sed s/\"/\'/g) fi if [ -s .command.trace ]; @@ -46,7 +46,7 @@ fi if [ ! "$json_str" = "{}" ] || [ ! "$version_str" = "[]" ] || [ ! "$trace_str" = "" ]; then workdir=$(pwd) - json="{'project_id':'$projectid','pipeline_id':'$pipelineid','process_id':'$processid','sample_name':'$sample','report_json':$json_str,'current_user_name':'$username','current_user_id':'$userid','workdir':'$workdir','task':'$task','species':'$species','versions':$version_str,'trace':'$trace_str', 'overwrite': '$overwrite'}" + json="{'projectid':'$projectid','pipelineId':'$pipelineid','processId':'$processid','sample_name':'$sample','reportJson':$json_str,'username':'$username','userId':'$userid','workdir':'$workdir','task':'$task','processName':'$task','species':'$species','versions':$version_str,'trace':'$trace_str', 'overwrite': '$overwrite'}" echo \"${json}\" > .final.json { cat .final.json | curl -H "Content-Type: application/json" -k -L -X POST -d @- $url > /dev/null diff --git a/flowcraft/flowcraft.py b/flowcraft/flowcraft.py index 98d54a41..32745e0f 100755 --- a/flowcraft/flowcraft.py +++ b/flowcraft/flowcraft.py @@ -15,6 +15,7 @@ from __init__ import __version__, __build__ from generator.engine import NextflowGenerator, process_map from generator.inspect import NextflowInspector + from generator.report import FlowcraftReport from generator.recipe import brew_recipe, available_recipes from generator.pipeline_parser import parse_pipeline, SanityError from generator.process_details import proc_collector, colored_print @@ -23,6 +24,7 @@ from flowcraft import __version__, __build__ from flowcraft.generator.engine import NextflowGenerator, process_map from flowcraft.generator.inspect import NextflowInspector + from flowcraft.generator.report import FlowcraftReport from flowcraft.generator.recipe import brew_recipe, available_recipes from flowcraft.generator.pipeline_parser import parse_pipeline, \ SanityError @@ -130,6 +132,36 @@ def get_args(args=None): help="Pretty inspection mode that removes usual reporting processes." ) + # REPORT MODE + reports_parser = subparsers.add_parser("report", + help="Broadcast the report of " + "a pipeline") + reports_parser.add_argument( + "-i", dest="report_file", + default="pipeline_report/pipeline_report.json", + help="Specify the path to the pipeline report JSON file." 
+ ) + reports_parser.add_argument( + "-u", "--url", dest="url", default="http://192.92.149.169:80/", + help="Specify the URL to where the data should be broadcast" + ) + reports_parser.add_argument( + "--trace-file", dest="trace_file", default="pipeline_stats.txt", + help="Specify the nextflow trace file. Only applicable in combination " + "with --watch option." + ) + reports_parser.add_argument( + "--log-file", dest="log_file", default=".nextflow.log", + help="Specify the nextflow log file. Only applicable in combination " + "with --watch option." + ) + reports_parser.add_argument( + "-w", "--watch", dest="watch", action="store_const", const=True, + help="Run the report in watch mode. This option will track the " + "generation of reports during the execution of the pipeline, " + "allowing for the visualization of the reports in real-time" + ) + if len(sys.argv) == 1: parser.print_help() sys.exit(1) @@ -217,6 +249,9 @@ def copy_project(path): # Copy Helper scripts copy_tree(join(repo_dir, "lib"), join(target_dir, "lib")) + # Copy resources dir + copy_tree(join(repo_dir, "resources"), join(target_dir, "resources")) + # Copy bin scripts copy_tree(join(repo_dir, "bin"), join(target_dir, "bin")) @@ -322,6 +357,22 @@ def inspect(args): nf_inspect.broadcast_status() +def report(args): + + try: + fc_report = FlowcraftReport( + report_file=args.report_file, + trace_file=args.trace_file, + log_file=args.log_file, + watch=args.watch, + ip_addr=args.url) + except eh.ReportError as e: + logger.error(colored_print(e.value, "red_bold")) + sys.exit(1) + + fc_report.broadcast_report() + + def main(): args = get_args() @@ -356,6 +407,9 @@ def main(): if args.main_op == "inspect": inspect(args) + if args.main_op == "report": + report(args) + if __name__ == '__main__': diff --git a/flowcraft/generator/components/annotation.py b/flowcraft/generator/components/annotation.py index 8566ab0e..ccaec335 100644 --- a/flowcraft/generator/components/annotation.py +++ b/flowcraft/generator/components/annotation.py @@ -36,6 +36,19 @@ def __init__(self, **kwargs): "default": '["resfinder", "card", "vfdb", "plasmidfinder", ' '"virulencefinder"]', "description": "Specify the databases for abricate." + }, + "abricateDataDir": { + "default": 'null', + "description": "Specify the full path location of the database " + "folders." + }, + "abricateMinId": { + "default": '75', + "description": "Minimum DNA %identity." + }, + "abricateMinCov": { + "default": '0', + "description": "Minimum DNA %coverage." 
} } diff --git a/flowcraft/generator/components/assembly.py b/flowcraft/generator/components/assembly.py index 7e6594ab..72749ee8 100644 --- a/flowcraft/generator/components/assembly.py +++ b/flowcraft/generator/components/assembly.py @@ -95,7 +95,7 @@ def __init__(self, **kwargs): self.dependencies = ["integrity_coverage"] - self.status_channels = ["va_spades" , "va_megahit"] + self.status_channels = ["va_spades" , "va_megahit", "report_viral_assembly"] self.link_end.append({"link": "SIDE_max_len", "alias": "SIDE_max_len"}) diff --git a/flowcraft/generator/components/mapping.py b/flowcraft/generator/components/mapping.py index 26d9492c..53c3577f 100644 --- a/flowcraft/generator/components/mapping.py +++ b/flowcraft/generator/components/mapping.py @@ -51,7 +51,8 @@ def __init__(self, **kwargs): } self.status_channels = [ - "bowtie" + "bowtie", + "report_bowtie" ] class Retrieve_mapped(Process): diff --git a/flowcraft/generator/components/metagenomics.py b/flowcraft/generator/components/metagenomics.py index 753587e2..07102fe5 100644 --- a/flowcraft/generator/components/metagenomics.py +++ b/flowcraft/generator/components/metagenomics.py @@ -258,7 +258,8 @@ def __init__(self, **kwargs): } self.status_channels = [ - "remove_host" + "remove_host", + "report_remove_host" ] class MetaProb(Process): diff --git a/flowcraft/generator/components/phylogeny.py b/flowcraft/generator/components/phylogeny.py index af25d749..e8f1b679 100644 --- a/flowcraft/generator/components/phylogeny.py +++ b/flowcraft/generator/components/phylogeny.py @@ -40,14 +40,19 @@ def __init__(self, **kwargs): self.directives = { "raxml": { "container": "flowcraft/raxml", - "version": "8.2.11-1", + "version": "8.2.11-2", "cpus": 4, "memory": "{ 4.GB * task.attempt }" + }, + "report_raxml": { + "container": "flowcraft/raxml", + "version": "8.2.11-2" } } self.status_channels = [ - "raxml" + "raxml", + "report_raxml" ] diff --git a/flowcraft/generator/components/typing.py b/flowcraft/generator/components/typing.py index 7bf78e05..5b7932cd 100644 --- a/flowcraft/generator/components/typing.py +++ b/flowcraft/generator/components/typing.py @@ -16,8 +16,6 @@ def __init__(self, **kwargs): self.input_type = "fastq" self.output_type = None - self.status_channels = [] - self.link_start = None self.directives = {"seq_typing": { @@ -59,8 +57,6 @@ def __init__(self, **kwargs): self.ignore_type = True - self.status_channels = [] - self.params = { "species": { "default": "null", @@ -132,8 +128,6 @@ def __init__(self, **kwargs): self.input_type = "fasta" self.output_type = None - self.status_channels = [] - self.link_start = None self.directives = {"dengue_typing": { diff --git a/flowcraft/generator/engine.py b/flowcraft/generator/engine.py index 14f6e5aa..4082ab69 100644 --- a/flowcraft/generator/engine.py +++ b/flowcraft/generator/engine.py @@ -1148,6 +1148,61 @@ def _get_params_string(self): return params_str + def _get_merged_params_string(self): + """Returns the merged nextflow params string from a dictionary object. + + The params dict should be a set of key:value pairs with the + parameter name, and the default parameter value:: + + self.params = { + "genomeSize": 2.1, + "minCoverage": 15 + } + + The values are then added to the string as they are. For instance, + a ``2.1`` float will appear as ``param = 2.1`` and a + ``"'teste'" string will appear as ``param = 'teste'`` (Note the + string). + + Identical parameters in multiple processes will be merged into the same + param. 
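+
+        As an illustrative sketch (the component name and ``pid`` are
+        hypothetical; the parameter values are taken from the
+        ``self.params`` example above), a component rendered from the
+        ``integrity_coverage`` template with ``pid`` ``2`` would contribute
+        a block along these lines::
+
+            /*
+            Component 'integrity_coverage_2'
+            -------------------------------
+            */
+            genomeSize_2 = 2.1
+            minCoverage_2 = 15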
+ + Returns + ------- + str + Nextflow params configuration string + """ + + params_str = "" + + for p in self.processes: + + logger.debug("[{}] Adding parameters: {}\n".format( + p.template, p.params) + ) + + # Add an header with the template name to structure the params + # configuration + if p.params and p.template != "init": + + p.set_param_id("_{}".format(p.pid)) + params_str += "\n\t/*" + params_str += "\n\tComponent '{}_{}'\n".format(p.template, + p.pid) + params_str += "\t{}\n".format("-" * (len(p.template) + len(p.pid) + 12)) + params_str += "\t*/\n" + + for param, val in p.params.items(): + + if p.template == "init": + param_id = param + else: + param_id = "{}_{}".format(param, p.pid) + + params_str += "\t{} = {}\n".format(param_id, val["default"]) + + return params_str + def _get_merged_params_string(self): """Returns the merged nextflow params string from a dictionary object. @@ -1311,7 +1366,7 @@ def _set_configurations(self): }) self.user_config = self._render_config("user.config", {}) - def dag_to_file(self, dict_viz): + def dag_to_file(self, dict_viz, output_file=".treeDag.json"): """Writes dag to output file Parameters @@ -1322,7 +1377,7 @@ def dag_to_file(self, dict_viz): """ - outfile_dag = open(os.path.join(dirname(self.nf_file), ".treeDag.json") + outfile_dag = open(os.path.join(dirname(self.nf_file), output_file) , "w") outfile_dag.write(json.dumps(dict_viz)) outfile_dag.close() @@ -1371,11 +1426,13 @@ def render_pipeline(self): dir_var = "" for k2, v2 in p.directives.items(): - dir_var += " {}:
".format(k2) + dir_var += k2 for d in v2: try: - dir_var += "  {}: {}
".\ - format(d, v2[d]) + # Remove quotes from string directives + directive = v2[d].replace("'", "").replace('"', '') \ + if isinstance(v2[d], str) else v2[d] + dir_var += "{}: {}".format(d, directive) except KeyError: pass @@ -1391,6 +1448,11 @@ def render_pipeline(self): # write to file dict_viz self.dag_to_file(dict_viz) + # Write tree forking information for dotfile + with open(os.path.join(dirname(self.nf_file), + ".forkTree.json"), "w") as fh: + fh.write(json.dumps(self._fork_tree)) + # send with jinja to html resource return self._render_config("pipeline_graph.html", {"data": dict_viz}) diff --git a/flowcraft/generator/error_handling.py b/flowcraft/generator/error_handling.py index fb99c7e5..19e31c8e 100644 --- a/flowcraft/generator/error_handling.py +++ b/flowcraft/generator/error_handling.py @@ -21,5 +21,10 @@ class InspectionError(Exception): def __init__(self, value): self.value = "Inspection ERROR: {}".format(value) + +class ReportError(Exception): + def __init__(self, value): + self.value = "Reports ERROR: {}".format(value) + # def __str__(self): # return repr(self.value) diff --git a/flowcraft/generator/header_skeleton.py b/flowcraft/generator/header_skeleton.py index c4898d54..d90697e4 100644 --- a/flowcraft/generator/header_skeleton.py +++ b/flowcraft/generator/header_skeleton.py @@ -1,6 +1,7 @@ header = """#!/usr/bin/env nextflow import Helper +import CollectInitialMetadata // Pipeline version if (workflow.commitId){ @@ -38,4 +39,5 @@ def infoMap = [:] } Help.start_info(infoMap, "$workflow.start", "$workflow.profile") +CollectInitialMetadata.print_metadata(workflow) """ \ No newline at end of file diff --git a/flowcraft/generator/report.py b/flowcraft/generator/report.py new file mode 100644 index 00000000..43031b79 --- /dev/null +++ b/flowcraft/generator/report.py @@ -0,0 +1,526 @@ +import os +import re +import sys +import json +import signal +import socket +import hashlib +import logging +import requests + +from os.path import join, abspath +from time import sleep +from pympler.asizeof import asizeof + +try: + import generator.error_handling as eh + from generator.process_details import colored_print +except ImportError: + import flowcraft.generator.error_handling as eh + from flowcraft.generator.process_details import colored_print + +logger = logging.getLogger("main.{}".format(__name__)) + + +def signal_handler(): + """This function is bound to the SIGINT signal (like ctrl+c) to graciously + exit the program and reset the curses options. + """ + + print("Exiting flowcraft report brodcast... Bye") + sys.exit(0) + + +class FlowcraftReport: + + def __init__(self, report_file, trace_file=None, log_file=None, + watch=False, ip_addr=None): + + self.report_file = report_file + """ + str: Path to Report JSON file. + """ + + if not ip_addr: + self.app_address = "http://192.92.149.169:80/" + else: + self.app_address = ip_addr + """ + str: Address of flowcraft web app + """ + + self.broadcast_address = "{}reports/broadcast/api/reports".format( + self.app_address) + + self.refresh_rate = 1 + + self.send = True + """ + boolean: This attribute is used when the report mode is used with the + --watch option. It will be set to False after sending a request, and + set to True when there is a change in the pipeline reports. + """ + + self.watch = watch + """ + boolean: When False, the reports mode will try to open the provided + report JSON file and send it to the flowcraft service. 
When True, + it will try to open the nextflow trace file instead and continuously + compile the report JSON files from the `report` processes as they + are created. + """ + + self.log_file = log_file + """ + str: Path to .nextflow.log file. + """ + + self.log_sizestamp = None + """ + str: Stores the sizestamp of the last modification of the trace file. + This is used to parse the file only when it has changed. + """ + + self.status_info = None + """ + str: Status of the pipeline execution. Used in the watch report mode + and varies between 'running', 'aborted', 'complete'. + """ + + self.trace_file = trace_file + """ + str: Path to nextflow trace file. + """ + + self.trace_sizestamp = None + """ + str: Stores the sizestamp of the last modification of the trace file. + This is used to parse the file only when it has changed. + """ + + self.trace_retry = 0 + """ + int: Each time the log file is not found, this counter is + increased. Only when it matches the :attr:`MAX_RETRIES` attribute + does it raises a FileNotFoundError. + """ + + self.stored_ids = [] + """ + list: Stores the task_ids that have already been parsed. It is used + to skip them when parsing the trace files multiple times. + """ + + self.report_queue = [] + """ + list: Stores the paths of the report JSON files that are on queue to + be sent to the flowcraft service. This list will be emptied when these + JSONs are sent. + """ + + # Checks if report file is available + self._check_required_files() + + signal.signal(signal.SIGINT, lambda *x: signal_handler()) + + def _check_required_files(self): + + if not os.path.exists(self.report_file) and not self.watch: + raise eh.ReportError("The provided report JSON file could not be" + " opened: {}".format(self.report_file)) + + @staticmethod + def _header_mapping(header): + """Parses the trace file header and retrieves the positions of each + column key. 
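+
+        As an illustration (assuming the tab-separated trace columns this
+        class reads elsewhere, e.g. ``task_id``, ``hash`` and ``process``),
+        a header line ``task_id\thash\tprocess\ttag`` would yield
+        ``{"task_id": 0, "hash": 1, "process": 2, "tag": 3}``.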
+ + Parameters + ---------- + header : str + The header line of nextflow's trace file + + Returns + ------- + dict + Mapping the column ID to its position (e.g.: {"tag":2}) + """ + + return dict( + (x.strip(), pos) for pos, x in enumerate(header.split("\t")) + ) + + @staticmethod + def _expand_path(hash_str): + """Expands the hash string of a process (ae/1dasjdm) into a full + working directory + + Parameters + ---------- + hash_str : str + Nextflow process hash with the beggining of the work directory + + Returns + ------- + str + Path to working directory of the hash string + """ + + try: + first_hash, second_hash = hash_str.split("/") + first_hash_path = join(abspath("work"), first_hash) + + for l in os.listdir(first_hash_path): + if l.startswith(second_hash): + return join(first_hash_path, l) + except FileNotFoundError: + return None + + def _get_report_id(self): + """Returns a hash of the reports JSON file + """ + + if self.watch: + + with open(self.log_file) as fh: + header = fh.readline() + + pipeline_path = re.match( + ".*nextflow run ([^\s]+).*", header).group(1) + + # Get hash from the entire pipeline file + pipeline_hash = hashlib.md5() + with open(pipeline_path, "rb") as fh: + for chunk in iter(lambda: fh.read(4096), b""): + pipeline_hash.update(chunk) + # Get hash from the current working dir and hostname + workdir = os.getcwd().encode("utf8") + hostname = socket.gethostname().encode("utf8") + dir_hash = hashlib.md5(workdir + hostname) + + return pipeline_hash.hexdigest() + dir_hash.hexdigest() + + else: + with open(self.report_file) as fh: + report_json = json.loads(fh.read()) + + metadata = report_json["data"]["results"][0]["nfMetadata"] + + try: + report_id = metadata["scriptId"] + metadata["sessionId"] + except KeyError: + raise eh.ReportError("Incomplete or corrupt report JSON file " + "missing the 'scriptId' and/or 'sessionId' " + "metadata information") + + return report_id + + def _update_pipeline_status(self): + """ + Parses the .nextflow.log file for signatures of pipeline status and sets + the :attr:`status_info` attribute. + """ + + prev_status = self.status_info + + with open(self.log_file) as fh: + + for line in fh: + + if "Session aborted" in line: + self.status_info = "aborted" + self.send = True if prev_status != self.status_info \ + else self.send + return + + if "Execution complete -- Goodbye" in line: + self.status_info = "complete" + self.send = True if prev_status != self.status_info \ + else self.send + return + + self.status_info = "running" + self.send = True if prev_status != self.status_info \ + else self.send + + def update_trace_watch(self): + """Parses the nextflow trace file and retrieves the path of report JSON + files that have not been sent to the service yet. + """ + + # Check the size stamp of the tracefile. Only proceed with the parsing + # if it changed from the previous size. 
+ size_stamp = os.path.getsize(self.trace_file) + self.trace_retry = 0 + if size_stamp and size_stamp == self.trace_sizestamp: + return + else: + logger.debug("Updating trace size stamp to: {}".format(size_stamp)) + self.trace_sizestamp = size_stamp + + with open(self.trace_file) as fh: + + # Skip potential empty lines at the start of file + header = next(fh).strip() + while not header: + header = next(fh).strip() + + # Get header mappings before parsing the file + hm = self._header_mapping(header) + + for line in fh: + # Skip empty lines + if line.strip() == "": + continue + + fields = line.strip().split("\t") + + # Skip if task ID was already processes + if fields[hm["task_id"]] in self.stored_ids: + continue + + if fields[hm["process"]] == "report": + self.report_queue.append( + self._expand_path(fields[hm["hash"]]) + ) + self.send = True + + # Add the processed trace line to the stored ids. It will be + # skipped in future parsers + self.stored_ids.append(fields[hm["task_id"]]) + + def update_log_watch(self): + """Parses nextflow log file and updates the run status + """ + + # Check the size stamp of the tracefile. Only proceed with the parsing + # if it changed from the previous size. + size_stamp = os.path.getsize(self.log_file) + self.trace_retry = 0 + if size_stamp and size_stamp == self.log_sizestamp: + return + else: + logger.debug("Updating log size stamp to: {}".format(size_stamp)) + self.log_sizestamp = size_stamp + + self._update_pipeline_status() + + def _send_live_report(self, report_id): + """Sends a PUT request with the report JSON files currently in the + report_queue attribute. + + Parameters + ---------- + report_id : str + Hash of the report JSON as retrieved from :func:`~_get_report_hash` + """ + + # Determines the maximum number of reports sent at the same time in + # the same payload + buffer_size = 100 + logger.debug("Report buffer size set to: {}".format(buffer_size)) + + for i in range(0, len(self.report_queue), buffer_size): + + # Reset the report compilation batch + reports_compilation = [] + + # Iterate over report JSON batches determined by buffer_size + for report in self.report_queue[i: i + buffer_size]: + try: + report_file = [x for x in os.listdir(report) + if x.endswith(".json")][0] + except IndexError: + continue + with open(join(report, report_file)) as fh: + reports_compilation.append(json.loads(fh.read())) + + logger.debug("Payload sent with size: {}".format( + asizeof(json.dumps(reports_compilation)) + )) + logger.debug("status: {}".format(self.status_info)) + + try: + requests.put( + self.broadcast_address, + json={"run_id": report_id, + "report_json": reports_compilation, + "status": self.status_info} + ) + except requests.exceptions.ConnectionError: + logger.error(colored_print( + "ERROR: Could not establish connection with server. The server" + " may be down or there is a problem with your internet " + "connection.", "red_bold")) + sys.exit(1) + + # When there is no change in the report queue, but there is a change + # in the run status of the pipeline + if not self.report_queue: + + logger.debug("status: {}".format(self.status_info)) + + try: + requests.put( + self.broadcast_address, + json={"run_id": report_id, + "report_json": [], + "status": self.status_info} + ) + except requests.exceptions.ConnectionError: + logger.error(colored_print( + "ERROR: Could not establish connection with server. 
The" + " server may be down or there is a problem with your " + "internet connection.", "red_bold")) + sys.exit(1) + + # Reset the report queue after sending the request + self.report_queue = [] + + def _init_live_reports(self, report_id): + """Sends a POST request to initialize the live reports + + Parameters + ---------- + report_id : str + Hash of the report JSON as retrieved from :func:`~_get_report_hash` + """ + + logger.debug("Sending initial POST request to {} to start report live" + " update".format(self.broadcast_address)) + + try: + with open(".metadata.json") as fh: + metadata = [json.load(fh)] + except: + metadata = [] + + start_json = { + "data": {"results": metadata} + } + + try: + requests.post( + self.broadcast_address, + json={"run_id": report_id, "report_json": start_json, + "status": self.status_info} + ) + except requests.exceptions.ConnectionError: + logger.error(colored_print( + "ERROR: Could not establish connection with server. The server" + " may be down or there is a problem with your internet " + "connection.", "red_bold")) + sys.exit(1) + + def _close_connection(self, report_id): + """Sends a delete request for the report JSON hash + + Parameters + ---------- + report_id : str + Hash of the report JSON as retrieved from :func:`~_get_report_hash` + """ + + logger.debug( + "Closing connection and sending DELETE request to {}".format( + self.broadcast_address)) + + try: + r = requests.delete(self.broadcast_address, + json={"run_id": report_id}) + if r.status_code != 202: + logger.error(colored_print( + "ERROR: There was a problem sending data to the server" + "with reason: {}".format(r.reason))) + except requests.exceptions.ConnectionError: + logger.error(colored_print( + "ERROR: Could not establish connection with server. The server" + " may be down or there is a problem with your internet " + "connection.", "red_bold")) + sys.exit(1) + + def _send_report(self, report_id): + + with open(self.report_file) as fh: + report_json = json.loads(fh.read()) + + logger.debug("Unique payload sent with size: {}".format( + asizeof(json.dumps(report_json)) + )) + + try: + requests.post( + self.broadcast_address, + json={"run_id": report_id, "report_json": report_json} + ) + except requests.exceptions.ConnectionError: + logger.error(colored_print( + "ERROR: Could not establish connection with server. 
The server" + " may be down or there is a problem with your internet " + "connection.", "red_bold")) + sys.exit(1) + + def _print_msg(self, run_id): + + report_address = "{}reports/broadcast/{}".format(self.app_address, + run_id) + logger.info(colored_print( + "The pipeline reports are available in the following link:", + "green_bold")) + logger.info("{}".format(report_address)) + + def broadcast_report(self): + + logger.info(colored_print("Preparing to broacast reports...", + "green_bold")) + + report_hash = self._get_report_id() + + # When in watch mode, + if self.watch: + logger.info(colored_print("\tFetching pipeline run status", + "green_bold")) + self._update_pipeline_status() + logger.info(colored_print( + "\tSending initial request to test service", "green_bold")) + self._init_live_reports(report_hash) + logger.info(colored_print("\tInitial parsing of trace file", + "green_bold")) + self.update_trace_watch() + + self._print_msg(report_hash) + + logger.debug("Establishing connection...") + + stay_alive = True + _broadcast_sent = False + try: + while stay_alive: + + # When not in watch mode, send the report JSON once + if not _broadcast_sent and not self.watch: + self._send_report(report_hash) + self._print_msg(report_hash) + _broadcast_sent = True + + # When in watch mode, continuously monitor the trace file for + # updates + if self.watch: + self.update_trace_watch() + self.update_log_watch() + # When new report JSON files are available, send then + # via a PUT request + if self.send: + self._send_live_report(report_hash) + self.send = False + + sleep(self.refresh_rate) + + except FileNotFoundError as e: + print(e) + logger.error(colored_print( + "ERROR: Report JSON file is not reachable!", "red_bold")) + except Exception as e: + logger.exception("ERROR: " + e) + finally: + logger.info("Closing connection") + self._close_connection(report_hash) diff --git a/flowcraft/generator/templates/Helper.groovy b/flowcraft/generator/templates/Helper.groovy index 14343814..a0890263 100644 --- a/flowcraft/generator/templates/Helper.groovy +++ b/flowcraft/generator/templates/Helper.groovy @@ -59,4 +59,32 @@ class Help { {% endfor %} } +} + +class CollectInitialMetadata { + + public static void print_metadata(nextflow.script.WorkflowMetadata workflow){ + + def treeDag = new File(".treeDag.json").text + def forkTree = new File(".forkTree.json").text + + def metadataJson = "{'nfMetadata':{'scriptId':'${workflow.scriptId}',\ +'scriptName':'${workflow.scriptName}',\ +'profile':'${workflow.profile}',\ +'container':'${workflow.container}',\ +'containerEngine':'${workflow.containerEngine}',\ +'commandLine':'${workflow.commandLine}',\ +'runName':'${workflow.runName}',\ +'sessionId':'${workflow.sessionId}',\ +'projectDir':'${workflow.projectDir}',\ +'launchDir':'${workflow.launchDir}',\ +'startTime':'${workflow.start}',\ +'dag':${treeDag},\ +'forks':${forkTree}}}" + + def json = metadataJson.replaceAll("'", '"') + + def jsonFile = new File(".metadata.json") + jsonFile.write json + } } \ No newline at end of file diff --git a/flowcraft/generator/templates/abricate.nf b/flowcraft/generator/templates/abricate.nf index ec67af56..6c050d1e 100644 --- a/flowcraft/generator/templates/abricate.nf +++ b/flowcraft/generator/templates/abricate.nf @@ -1,3 +1,20 @@ +if ( params.abricateDataDir{{ param_id }} ){ + if ( !file(params.abricateDataDir{{ param_id }}).exists() ){ + exit 1, "'abricateDataDir{{ param_id }}' data directory was not found: '${params.abricateDatabases{{ param_id }}}'" + } + dataDirOpt = "--datadir 
${params.abricateDataDir{{ param_id }}}" +} else { + dataDirOpt = "" +} + +if ( !params.abricateMinId{{ param_id }}.toString().isNumber() ){ + exit 1, "'abricateMinId{{ param_id }}' parameter must be a number. Provide value: '${params.abricateMinId{{ param_id }}}'" +} + +if ( !params.abricateMinCov{{ param_id }}.toString().isNumber() ){ + exit 1, "'abricateMinCov{{ param_id }}' parameter must be a number. Provide value: '${params.abricateMinCov{{ param_id }}}'" +} + process abricate_{{ pid }} { @@ -10,6 +27,8 @@ process abricate_{{ pid }} { input: set sample_id, file(assembly) from {{ input_channel }} each db from params.abricateDatabases{{ param_id }} + val min_id from Channel.value(params.abricateMinId{{ param_id }}) + val min_cov from Channel.value(params.abricateMinCov{{ param_id }}) output: file '*.tsv' into abricate_out_{{ pid }} @@ -21,7 +40,7 @@ process abricate_{{ pid }} { """ { # Run abricate - abricate --db $db $assembly > ${sample_id}_abr_${db}.tsv + abricate $dataDirOpt --minid $min_id --mincov $min_cov --db $db $assembly > ${sample_id}_abr_${db}.tsv echo pass > .status } || { echo fail > .status diff --git a/flowcraft/generator/templates/assembly_mapping.nf b/flowcraft/generator/templates/assembly_mapping.nf index 23ee1065..1f24d53a 100644 --- a/flowcraft/generator/templates/assembly_mapping.nf +++ b/flowcraft/generator/templates/assembly_mapping.nf @@ -40,7 +40,7 @@ process assembly_mapping_{{ pid }} { echo [DEBUG] CREATING BAM INDEX >> .command.log 2>&1 samtools index sorted.bam >> .command.log 2>&1 echo [DEBUG] ESTIMATING READ DEPTH >> .command.log 2>&1 - parallel -j ${task.cpus} samtools depth -ar {} sorted.bam \\> {}.tab ::: \$(grep ">" $assembly | cut -c 2-) + parallel -j ${task.cpus} samtools depth -ar {} sorted.bam \\> {}.tab ::: \$(grep ">" $assembly | cut -c 2- | tr " " "_") # Insert 0 coverage count in empty files. See Issue #2 echo [DEBUG] REMOVING EMPTY FILES >> .command.log 2>&1 find . 
-size 0 -print0 | xargs -0 -I{} sh -c 'echo -e 0"\t"0"\t"0 > "{}"' diff --git a/flowcraft/generator/templates/bowtie.nf b/flowcraft/generator/templates/bowtie.nf index 8e057e97..7c9526b8 100644 --- a/flowcraft/generator/templates/bowtie.nf +++ b/flowcraft/generator/templates/bowtie.nf @@ -54,7 +54,7 @@ process bowtie_{{ pid }} { output: set sample_id , file("*.bam") into {{ output_channel }} - file "*_bowtie2.log" + set sample_id, file("*_bowtie2.log") into into_json_{{ pid }} {% with task_name="bowtie" %} {%- include "compiler_channels.txt" ignore missing -%} {% endwith %} @@ -65,4 +65,24 @@ process bowtie_{{ pid }} { """ } + +process report_bowtie_{{ pid }} { + + {% include "post.txt" ignore missing %} + + tag { sample_id } + + input: + set sample_id, file(bowtie_log) from into_json_{{ pid }} + + output: + {% with task_name="report_bowtie" %} + {%- include "compiler_channels.txt" ignore missing -%} + {% endwith %} + + script: + template "process_mapping.py" + +} + {{ forks }} \ No newline at end of file diff --git a/flowcraft/generator/templates/chewbbaca.nf b/flowcraft/generator/templates/chewbbaca.nf index c37782a4..fa51450d 100644 --- a/flowcraft/generator/templates/chewbbaca.nf +++ b/flowcraft/generator/templates/chewbbaca.nf @@ -76,7 +76,7 @@ if (params.chewbbacaBatch{{ param_id }}) { echo $assembly | tr " " "\n" >> input_file.txt chewBBACA.py AlleleCall -i input_file.txt -g \$inputGenomes -o chew_results $jsonOpt --cpu $task.cpus $training if [ "$jsonOpt" = "--json" ]; then - merge_json.py ${params.schemaCore{{ param_id }}} chew_results_*/*/results* + merge_json.py ${params.schemaCore{{ param_id }}} chew_results/*/results* else cp chew_results*/*/results_alleles.tsv cgMLST.tsv fi @@ -130,7 +130,7 @@ if (params.chewbbacaBatch{{ param_id }}) { echo $assembly >> input_file.txt chewBBACA.py AlleleCall -i input_file.txt -g \$inputGenomes -o chew_results_${sample_id} $jsonOpt --cpu $task.cpus $training --fc if [ "$jsonOpt" = "--json" ]; then - merge_json.py ${params.schemaCore{{ param_id }}} chew_results_*/*/results* + merge_json.py ${params.schemaCore{{ param_id }}} chew_results_*/*/results* ${sample_id} else mv chew_results_*/*/results_alleles.tsv ${sample_id}_cgMLST.tsv fi diff --git a/flowcraft/generator/templates/compiler_channels.txt b/flowcraft/generator/templates/compiler_channels.txt index 1c017611..238c883b 100644 --- a/flowcraft/generator/templates/compiler_channels.txt +++ b/flowcraft/generator/templates/compiler_channels.txt @@ -1,3 +1,3 @@ set {{ sample_id|default("sample_id") }}, val("{{ pid }}_{{ task_name }}{{ suffix }}"), file(".status"), file(".warning"), file(".fail"), file(".command.log") into STATUS_{{task_name}}_{{ pid }} -set {{ sample_id|default("sample_id") }}, val("{{ task_name }}_{{ pid }}"), val("{{ pid }}"), file(".report.json") into REPORT_{{task_name}}_{{ pid }} +set {{ sample_id|default("sample_id") }}, val("{{ task_name }}_{{ pid }}{{ suffix }}"), val("{{ pid }}"), file(".report.json"), file(".versions"), file(".command.trace") into REPORT_{{task_name}}_{{ pid }} file ".versions" \ No newline at end of file diff --git a/flowcraft/generator/templates/dengue_typing.nf b/flowcraft/generator/templates/dengue_typing.nf index ff8f85f9..ba04e005 100644 --- a/flowcraft/generator/templates/dengue_typing.nf +++ b/flowcraft/generator/templates/dengue_typing.nf @@ -28,6 +28,12 @@ process dengue_typing_{{ pid }} { seq_typing.py assembly -f ${assembly} -b ${ params.BD_sequence_file{{ param_id }} } -o ./ -j $task.cpus -t nucl + # Add information to dotfiles + 
json_str="{'tableRow':[{'sample':'${sample_id}','data':[{'header':'seqtyping','value':'\$(cat seq_typing.report.txt)','table':'typing'}]}],'metadata':[{'sample':'${sample_id}','treeData':'\$(cat seq_typing.report.txt)','column':'typing'}]}" + echo \$json_str > .report.json + version_str="[{'program':'seq_typing.py','version':'0.1'}]" + echo \$version_str > .versions + rm -r rematch_temp if [ -s seq_typing.report.txt ]; @@ -38,6 +44,8 @@ process dengue_typing_{{ pid }} { fi } || { echo fail > .status + json_str="{'tableRow':[{'sample':'${sample_id}','data':[{'header':'seqtyping','value':'NA','table':'typing'}]}]}" + echo \$json_str > .report.json } """ diff --git a/flowcraft/generator/templates/filter_poly.nf b/flowcraft/generator/templates/filter_poly.nf index 43510db8..9f45d8a9 100644 --- a/flowcraft/generator/templates/filter_poly.nf +++ b/flowcraft/generator/templates/filter_poly.nf @@ -36,6 +36,8 @@ process filter_poly_{{ pid }} { gzip ${sample_id}_filtered_*.fastq + rm *.fq *.fastq + """ } diff --git a/flowcraft/generator/templates/mafft.nf b/flowcraft/generator/templates/mafft.nf index 5f50d6de..178c7786 100644 --- a/flowcraft/generator/templates/mafft.nf +++ b/flowcraft/generator/templates/mafft.nf @@ -20,7 +20,7 @@ process mafft_{{ pid }} { cat ${assembly} > all_assemblies.fasta - mafft --thread $task.cpus --auto all_assemblies.fasta > ${workflow.scriptName}.align + mafft --adjustdirection --thread $task.cpus --auto all_assemblies.fasta > ${workflow.scriptName}.align """ } diff --git a/flowcraft/generator/templates/mlst.nf b/flowcraft/generator/templates/mlst.nf index f8105b72..efe0b72e 100644 --- a/flowcraft/generator/templates/mlst.nf +++ b/flowcraft/generator/templates/mlst.nf @@ -24,7 +24,12 @@ process mlst_{{ pid }} { expectedSpecies=${params.mlstSpecies{{ param_id }}} mlst $assembly >> ${sample_id}.mlst.txt mlstSpecies=\$(cat *.mlst.txt | cut -f2) - json_str="{'expectedSpecies':\'\$expectedSpecies\','species':'\$mlstSpecies','st':'\$(cat *.mlst.txt | cut -f3)','tableRow':[{'sample':'${sample_id}','data':[{'header':'mlst','value':'\$mlstSpecies','table':'typing'}]}]}" + json_str="{'expectedSpecies':\'\$expectedSpecies\',\ + 'species':'\$mlstSpecies',\ + 'st':'\$(cat *.mlst.txt | cut -f3)',\ + 'tableRow':[{'sample':'${sample_id}','data':[\ + {'header':'MLST species','value':'\$mlstSpecies','table':'typing'},\ + {'header':'MLST ST','value':'\$(cat *.mlst.txt | cut -f3)','table':'typing'}]}]}" echo \$json_str > .report.json if [ ! 
\$mlstSpecies = \$expectedSpecies ]; diff --git a/flowcraft/generator/templates/patho_typing.nf b/flowcraft/generator/templates/patho_typing.nf index 678f0a51..513be5da 100644 --- a/flowcraft/generator/templates/patho_typing.nf +++ b/flowcraft/generator/templates/patho_typing.nf @@ -7,6 +7,8 @@ IN_pathoSpecies_{{ pid }} = Channel.value(params.species{{ param_id }}) process patho_typing_{{ pid }} { + validExitStatus 0, 2 + // Send POST request to platform {% include "post.txt" ignore missing %} @@ -19,7 +21,7 @@ process patho_typing_{{ pid }} { val species from IN_pathoSpecies_{{ pid }} output: - file "patho_typing*" + file "patho_typing*" optional true {% with task_name="patho_typing" %} {%- include "compiler_channels.txt" ignore missing -%} {% endwith %} @@ -33,20 +35,28 @@ process patho_typing_{{ pid }} { export PATH="\$(pwd)/rematch_temp/ReMatCh:\$PATH" patho_typing.py -f \$(pwd)/${fastq_pair[0]} \$(pwd)/${fastq_pair[1]} -o \$(pwd) -j $task.cpus --trueCoverage --species $species - json_str="{'tableRow':[{'sample':'${sample_id}','data':[{'header':'pathotyping','value':'\$(cat patho_typing.report.txt)','table':'typing'}]}]}" - echo \$json_str > .report.json + + # Add information to dotfiles + version_str="[{'program':'patho_typing.py','version':'0.4'}]" + echo \$version_str > .versions rm -r rematch_temp echo pass > .status if [ -s patho_typing.report.txt ]; then + json_str="{'tableRow':[{'sample':'${sample_id}','data':[{'header':'pathotyping','value':'\$(cat patho_typing.report.txt)','table':'typing'}]}]}" + echo \$json_str > .report.json echo pass > .status else + json_str="{'tableRow':[{'sample':'${sample_id}','data':[{'header':'pathotyping','value':'NA','table':'typing'}]}]}" + echo \$json_str > .report.json echo fail > .status fi } || { echo fail > .status + json_str="{'tableRow':[{'sample':'${sample_id}','data':[{'header':'pathotyping','value':'NA','table':'typing'}]}]}" + echo \$json_str > .report.json } """ diff --git a/flowcraft/generator/templates/raxml.nf b/flowcraft/generator/templates/raxml.nf index 4fbffd4e..e51eee0a 100644 --- a/flowcraft/generator/templates/raxml.nf +++ b/flowcraft/generator/templates/raxml.nf @@ -18,6 +18,7 @@ process raxml_{{ pid }} { output: file ("RAxML_*") into {{ output_channel }} + file ("RAxML_bipartitions.*.nf") into into_json_{{ pid }} {% with task_name="raxml", sample_id="val('single')" %} {%- include "compiler_channels.txt" ignore missing -%} {% endwith %} @@ -25,8 +26,32 @@ process raxml_{{ pid }} { script: """ raxmlHPC -s ${alignment} -p 12345 -m ${substitution_model} -T $task.cpus -n $workflow.scriptName -f a -x ${seednumber} -N ${bootstrapnumber} + + # Add information to dotfiles + version_str="[{'program':'raxmlHPC','version':'8.2.11'}]" + echo \$version_str > .versions """ } +process report_raxml_{{ pid }} { + + {% include "post.txt" ignore missing %} + + tag { 'raxml' } + + input: + file(newick) from into_json_{{ pid }} + + output: + {% with task_name="report_raxml", sample_id="val('single')" %} + {%- include "compiler_channels.txt" ignore missing -%} + {% endwith %} + + script: + template "process_newick.py" + +} + + {{ forks }} \ No newline at end of file diff --git a/flowcraft/generator/templates/reads_download.nf b/flowcraft/generator/templates/reads_download.nf index 8fa619e7..9292cffa 100644 --- a/flowcraft/generator/templates/reads_download.nf +++ b/flowcraft/generator/templates/reads_download.nf @@ -17,11 +17,11 @@ process reads_download_{{ pid }} { maxRetries 1 input: - val accession_id from {{ input_channel }}.splitText(){ 
it.trim() }.filter{ it.trim() != "" } + set val(accession_id), val(name) from reads_download_in_1_0.splitText(){ it.trim() }.filter{ it != "" }.map{ it.split().length > 1 ? ["accession": it.split()[0], "name": it.split()[1]] : [it.split()[0], null] } each file(aspera_key) from IN_asperaKey_{{ pid }} output: - set accession_id, file("${accession_id}/*fq.gz") optional true into {{ output_channel }} + set val({ "$name" != "null" ? "$name" : "$accession_id" }), file("${accession_id}/*fq.gz") optional true into {{ output_channel }} {% with task_name="reads_download", sample_id="accession_id" %} {%- include "compiler_channels.txt" ignore missing -%} {% endwith %} @@ -39,6 +39,11 @@ process reads_download_{{ pid }} { fi getSeqENA.py -l accession_file.txt \$asperaOpt -o ./ --SRAopt --downloadCramBam + + if [ $name != null ]; + then + echo renaming pattern '${accession_id}' to '${name}' && cd ${accession_id} && rename "s/${accession_id}/${name}/" *.gz + fi } || { # If exit code other than 0 if [ \$? -eq 0 ] @@ -49,6 +54,8 @@ process reads_download_{{ pid }} { echo "Could not download accession $accession_id" > .fail fi } + version_str="{'version':[{'program':'getSeqENA.py','version':'1.3'}]}" + echo \$version_str > .versions """ } diff --git a/flowcraft/generator/templates/remove_host.nf b/flowcraft/generator/templates/remove_host.nf index 3bf1dbf1..aee89c84 100644 --- a/flowcraft/generator/templates/remove_host.nf +++ b/flowcraft/generator/templates/remove_host.nf @@ -15,7 +15,7 @@ process remove_host_{{ pid }} { output: set sample_id , file("${sample_id}*.headersRenamed_*.fq.gz") into {{ output_channel }} - file "*_bowtie2.log" + set sample_id, file("*_bowtie2.log") into into_json_{{ pid }} {% with task_name="remove_host" %} {%- include "compiler_channels.txt" ignore missing -%} {% endwith %} @@ -26,12 +26,39 @@ process remove_host_{{ pid }} { samtools view -buh -f 12 -o ${sample_id}_samtools.bam -@ $task.cpus ${sample_id}.bam + rm ${sample_id}.bam + samtools fastq -1 ${sample_id}_unmapped_1.fq -2 ${sample_id}_unmapped_2.fq ${sample_id}_samtools.bam + rm ${sample_id}_samtools.bam + renamePE_samtoolsFASTQ.py -1 ${sample_id}_unmapped_1.fq -2 ${sample_id}_unmapped_2.fq gzip *.headersRenamed_*.fq + + rm *.fq """ } + + +process report_remove_host_{{ pid }} { + + {% include "post.txt" ignore missing %} + + tag { sample_id } + + input: + set sample_id, file(bowtie_log) from into_json_{{ pid }} + + output: + {% with task_name="report_remove_host" %} + {%- include "compiler_channels.txt" ignore missing -%} + {% endwith %} + + script: + template "process_mapping.py" + +} + {{ forks }} \ No newline at end of file diff --git a/flowcraft/generator/templates/report_compiler.nf b/flowcraft/generator/templates/report_compiler.nf index b7f91ae0..8ac5c0cb 100644 --- a/flowcraft/generator/templates/report_compiler.nf +++ b/flowcraft/generator/templates/report_compiler.nf @@ -7,13 +7,18 @@ process report { tag { sample_id } input: - set sample_id, task_name, pid, report_json from {{ compile_channels }} + set sample_id, + task_name, + pid, + report_json, + version_json, + trace from {{ compile_channels }} output: - file "*.json" optional true into master_report + file "*" optional true into master_report """ - prepare_reports.py $report_json $sample_id $task_name 1 $pid + prepare_reports.py $report_json $version_json $trace $sample_id $task_name 1 $pid $workflow.scriptId $workflow.runName """ } @@ -21,30 +26,25 @@ process report { process compile_reports { - publishDir "pipeline_report/" + publishDir 
"pipeline_report/", mode: "copy" + + if ( params.reportHTTP != null ){ + beforeScript "PATH=${workflow.projectDir}/bin:\$PATH; export PATH;" + afterScript "metadata_POST.sh $params.projectId $params.pipelineId 0 $params.sampleName $params.reportHTTP $params.currentUserName $params.currentUserId 0 \"$params.platformSpecies\"" + } input: file report from master_report.collect() + file forks from Channel.fromPath(".forkTree.json") + file dag from Channel.fromPath(".treeDag.json") + file js from Channel.fromPath("${workflow.projectDir}/resources/main.js.zip") output: file "pipeline_report.json" + file "pipeline_report.html" + file "src/main.js" - """ - #!/usr/bin/env python3 - import json - - reports = '${report}'.split() - - storage = [] - for r in reports: - with open(r) as fh: - rjson = json.load(fh) - storage.append(rjson) - - with open("pipeline_report.json", "w") as rep_fh: - rep_fh.write(json.dumps({"data": {"results": storage}}, - separators=(",", ":"))) - """ - + script: + template "compile_reports.py" } diff --git a/flowcraft/generator/templates/retrieve_mapped.nf b/flowcraft/generator/templates/retrieve_mapped.nf index 73ead02d..4a906c11 100644 --- a/flowcraft/generator/templates/retrieve_mapped.nf +++ b/flowcraft/generator/templates/retrieve_mapped.nf @@ -19,11 +19,17 @@ process retrieve_mapped_{{ pid }} { """ samtools view -buh -F 12 -o ${sample_id}_samtools.bam -@ $task.cpus ${bam} + rm ${bam} + samtools fastq -1 ${sample_id}_mapped_1.fq -2 ${sample_id}_mapped_2.fq ${sample_id}_samtools.bam + rm ${sample_id}_samtools.bam + renamePE_samtoolsFASTQ.py -1 ${sample_id}_mapped_1.fq -2 ${sample_id}_mapped_2.fq gzip *.headersRenamed_*.fq + + rm *.fq """ } diff --git a/flowcraft/generator/templates/seq_typing.nf b/flowcraft/generator/templates/seq_typing.nf index 61596bee..59990dd4 100644 --- a/flowcraft/generator/templates/seq_typing.nf +++ b/flowcraft/generator/templates/seq_typing.nf @@ -17,8 +17,8 @@ process seq_typing_{{ pid }} { input: set sample_id, file(fastq_pair) from {{ input_channel }} - file refO from IN_refO_{{ pid }} - file refH from IN_refH_{{ pid }} + each file(refO) from IN_refO_{{ pid }} + each file(refH) from IN_refH_{{ pid }} output: file "seq_typing*" @@ -35,8 +35,12 @@ process seq_typing_{{ pid }} { export PATH="\$(pwd)/rematch_temp/ReMatCh:\$PATH" seq_typing.py -f ${fastq_pair[0]} ${fastq_pair[1]} -r \$(pwd)/$refO \$(pwd)/$refH -o ./ -j $task.cpus --extraSeq 0 --mapRefTogether --minGeneCoverage 60 + + # Add information to dotfiles json_str="{'tableRow':[{'sample':'${sample_id}','data':[{'header':'seqtyping','value':'\$(cat seq_typing.report.txt)','table':'typing'}]}]}" echo \$json_str > .report.json + version_str="[{'program':'seq_typing.py','version':'0.1'}]" + echo \$version_str > .versions rm -r rematch_temp @@ -48,6 +52,8 @@ process seq_typing_{{ pid }} { fi } || { echo fail > .status + json_str="{'tableRow':[{'sample':'${sample_id}','data':[{'header':'seqtyping','value':'NA','table':'typing'}]}]}" + echo \$json_str > .report.json } """ diff --git a/flowcraft/generator/templates/sistr.nf b/flowcraft/generator/templates/sistr.nf index d54f58e6..c0f611fd 100644 --- a/flowcraft/generator/templates/sistr.nf +++ b/flowcraft/generator/templates/sistr.nf @@ -19,8 +19,11 @@ process sistr_{{ pid }} { """ { sistr --qc -vv -t $task.cpus -f tab -o ${sample_id}_sistr.tab ${assembly} - json_str="{'typing':{'sistr':'\$(awk \"FNR == 2\" *.tab | cut -f14)'}}" + json_str="{'tableRow':[{'sample':'${sample_id}','data':[{'header':'sistr','value':'\$(awk \"FNR == 2\" *.tab | cut 
-f14)','table':'typing'}]}]}" echo \$json_str > .report.json + sistr_version=\$(sistr --version | cut -d" " -f2) + version_str="[{'program':'sistr','version':'\$sistr_version'}]" + echo \$version_str > .versions if [ -s ${sample_id}_sistr.tab ]; then @@ -33,7 +36,6 @@ process sistr_{{ pid }} { echo fail > .status } """ - } {{ forks }} diff --git a/flowcraft/generator/templates/split_assembly.nf b/flowcraft/generator/templates/split_assembly.nf index 0876ca27..1a03b53b 100644 --- a/flowcraft/generator/templates/split_assembly.nf +++ b/flowcraft/generator/templates/split_assembly.nf @@ -18,7 +18,7 @@ process split_assembly_{{ pid }} { val min_contig_size from IN_min_contig_size_{{ pid }} output: - file '*split.fasta' into splitCh_{{ pid }} optional true + file '*.fasta' into splitCh_{{ pid }} optional true {% with task_name="split_assembly" %} {%- include "compiler_channels.txt" ignore missing -%} {% endwith %} diff --git a/flowcraft/generator/templates/viral_assembly.nf b/flowcraft/generator/templates/viral_assembly.nf index ca581b46..0a86be23 100644 --- a/flowcraft/generator/templates/viral_assembly.nf +++ b/flowcraft/generator/templates/viral_assembly.nf @@ -128,7 +128,29 @@ process va_megahit_{{ pid }} { } -good_assembly.mix(megahit_assembly).set{ {{ output_channel }} } +good_assembly.mix(megahit_assembly).into{ to_report_{{ pid }} ; {{ output_channel }} } +orf_size = Channel.value(params.minimumContigSize{{ param_id }}) + + +process report_viral_assembly_{{ pid }} { + + {% include "post.txt" ignore missing %} + + tag { sample_id } + + input: + set sample_id, file(assembly) from to_report_{{ pid }} + val min_size from orf_size + + output: + {% with task_name="report_viral_assembly" %} + {%- include "compiler_channels.txt" ignore missing -%} + {% endwith %} + + script: + template "process_viral_assembly.py" + +} {{ forks }} \ No newline at end of file diff --git a/flowcraft/nextflow.config b/flowcraft/nextflow.config index aeacb41b..70780f97 100644 --- a/flowcraft/nextflow.config +++ b/flowcraft/nextflow.config @@ -1,5 +1,6 @@ params { platformHTTP = null + reportHTTP = null // Settings this option to true, will trigger the removal of temporary // data (usually fastq reads) at particular checkpoint processes that diff --git a/flowcraft/resources/main.js.zip b/flowcraft/resources/main.js.zip new file mode 100644 index 00000000..f3e81e3c Binary files /dev/null and b/flowcraft/resources/main.js.zip differ diff --git a/flowcraft/templates/assembly_report.py b/flowcraft/templates/assembly_report.py index 84b214d5..9f790ef0 100644 --- a/flowcraft/templates/assembly_report.py +++ b/flowcraft/templates/assembly_report.py @@ -54,7 +54,7 @@ def __get_version_pilon(): try: - cli = ["java", "-jar", pilon_path , "--version"] + cli = ["java", "-jar", pilon_path, "--version"] p = subprocess.Popen(cli, stdout=PIPE, stderr=PIPE) stdout, _ = p.communicate() @@ -310,24 +310,9 @@ def _get_window_labels(self, window): contig_id = self._get_contig_id(contig) self.contig_boundaries[contig_id] = [c, c + len(seq)] c += len(seq) - xbars.append( - { - "contig": contig_id, - "position": c / window, - "absPosition": c, - "window": window - } - ) - - # Get label contig for each window - labels = [] - for i in range(0, self.summary_info["total_len"], window): - for contig, rg in self.contig_boundaries.items(): - if rg[0] <= i < rg[1]: - labels.append("{}_{}".format(contig, i)) - break + xbars.append((contig_id, c, contig)) - return labels, xbars + return xbars @staticmethod def _gc_prop(s, length): @@ -348,7 +333,7 @@ 
def _gc_prop(s, length): return gc / length - def get_gc_sliding(self, window=500): + def get_gc_sliding(self, window=2000): """Calculates a sliding window of the GC content for the assembly @@ -357,18 +342,10 @@ def get_gc_sliding(self, window=500): gc_res : list List of GC proportion floats for each data point in the sliding window - labels: list - List of labels for each data point - xbars : list - List of the ending position of each contig in the genome - """ gc_res = [] - # Get contigID for each window position - labels, xbars = self._get_window_labels(window) - # Get complete sequence to calculate sliding window values complete_seq = "".join(self.contigs.values()).lower() @@ -377,9 +354,9 @@ def get_gc_sliding(self, window=500): seq_window = complete_seq[i:i + window] # Get GC proportion - gc_res.append(self._gc_prop(seq_window, len(seq_window))) + gc_res.append(round(self._gc_prop(seq_window, len(seq_window)), 2)) - return gc_res, labels, xbars + return gc_res def _get_coverage_from_file(self, coverage_file): """ @@ -408,7 +385,7 @@ def _get_coverage_from_file(self, coverage_file): else: self.contig_coverage[header].append(coverage) - def get_coverage_sliding(self, coverage_file, window=500): + def get_coverage_sliding(self, coverage_file, window=2000): """ Parameters @@ -427,9 +404,6 @@ def get_coverage_sliding(self, coverage_file, window=500): if not self.contig_coverage: self._get_coverage_from_file(coverage_file) - # Get contigID for each window position - labels, xbars = self._get_window_labels(window) - # Stores the coverage results cov_res = [] @@ -440,9 +414,9 @@ def get_coverage_sliding(self, coverage_file, window=500): # Get coverage values for current window cov_window = complete_cov[i:i + window] # Get mean coverage - cov_res.append(sum(cov_window) / len(cov_window)) + cov_res.append(int(sum(cov_window) / len(cov_window))) - return cov_res, labels, xbars + return cov_res @MainWrapper @@ -489,21 +463,26 @@ def main(sample_id, assembly_file, coverage_bp_file=None): if coverage_bp_file: try: - gc_sliding_data, gc_label, gc_xbars = assembly_obj.get_gc_sliding() - cov_sliding_data, cov_label, cov_xbars = \ - assembly_obj.get_coverage_sliding(coverage_bp_file) + window = 2000 + gc_sliding_data = assembly_obj.get_gc_sliding(window=window) + cov_sliding_data = \ + assembly_obj.get_coverage_sliding(coverage_bp_file, + window=window) # Get total basepairs based on the individual coverage of each - # contig bp + # contig bpx total_bp = sum( [sum(x) for x in assembly_obj.contig_coverage.values()] ) # Add data to json report - json_dic["plotData"][0]["data"]["gcSliding"] = \ - [gc_sliding_data, gc_label, gc_xbars] - json_dic["plotData"][0]["data"]["covSliding"] = \ - [cov_sliding_data, cov_label, cov_xbars] + json_dic["plotData"][0]["data"]["genomeSliding"] = { + "gcData": gc_sliding_data, + "covData": cov_sliding_data, + "window": window, + "xbars": assembly_obj._get_window_labels(window), + "assemblyFile": os.path.basename(assembly_file) + } json_dic["plotData"][0]["data"]["sparkline"] = total_bp except: diff --git a/flowcraft/templates/compile_reports.py b/flowcraft/templates/compile_reports.py new file mode 100644 index 00000000..9f28f790 --- /dev/null +++ b/flowcraft/templates/compile_reports.py @@ -0,0 +1,98 @@ +#!/usr/bin/python3 +import os +import sys +import json +import zipfile +import logging + +REPORTS = "${report}".split() +FORKS = "${forks}" +DAG = "${dag}" +MAIN_JS = "${js}" + + +html_template = """ + + + + + + + FlowCraft App + + +
+ + + + +""" + + +def main(reports, forks, dag, main_js): + + metadata = { + "nfMetadata": { + "scriptId": "${workflow.scriptId}", + "scriptName": "${workflow.scriptName}", + "profile": "${workflow.profile}", + "container": "${workflow.container}", + "containerEngine": "${workflow.containerEngine}", + "commandLine": "${workflow.commandLine}", + "runName": "${workflow.runName}", + "sessionId": "${workflow.sessionId}", + "projectDir": "${workflow.projectDir}", + "launchDir": "${workflow.launchDir}", + "startTime": "${workflow.start}" + } + } + + # Add nextflow metadata + storage = [] + + # Add forks dictionary + try: + with open(forks) as fh: + forks = json.load(fh) + metadata["nfMetadata"]["forks"] = forks + except json.JSONDecodeError: + logging.warning("Could not parse versions JSON: {}".format( + dag)) + + # Add tree DAG in JSON format + try: + with open(dag) as fh: + dag = json.load(fh) + metadata["nfMetadata"]["dag"] = dag + except json.JSONDecodeError: + logging.warning("Could not parse versions JSON: {}".format( + dag)) + + storage.append(metadata) + # Write metadata information to dotfile. This dotfile is then sent to the + # ReportHTTP, when available in the afterScript process directive. + with open(".metadata.json", "w") as fh: + fh.write(json.dumps(metadata, separators=(",", ":"))) + + for r in reports: + with open(r) as fh: + rjson = json.load(fh) + storage.append(rjson) + print("{}: {}".format(rjson["processName"], + sys.getsizeof(json.dumps(rjson)))) + + with open("pipeline_report.html", "w") as html_fh: + html_fh.write(html_template.format( + json.dumps({"data": {"results": storage}}, separators=(",", ":")))) + + with zipfile.ZipFile(MAIN_JS) as zf: + os.mkdir("src") + zf.extractall("./src") + + with open("pipeline_report.json", "w") as rep_fh: + rep_fh.write(json.dumps({"data": {"results": storage}}, + separators=(",", ":"))) + + +if __name__ == "__main__": + main(REPORTS, FORKS, DAG, MAIN_JS) diff --git a/flowcraft/templates/fastqc.py b/flowcraft/templates/fastqc.py index 461f139f..30144b0d 100644 --- a/flowcraft/templates/fastqc.py +++ b/flowcraft/templates/fastqc.py @@ -47,7 +47,7 @@ logger = get_logger(__file__) -def __set_version_fastqc(): +def __get_version_fastqc(): try: diff --git a/flowcraft/templates/fastqc_report.py b/flowcraft/templates/fastqc_report.py index 79e10b9c..debe03d1 100644 --- a/flowcraft/templates/fastqc_report.py +++ b/flowcraft/templates/fastqc_report.py @@ -87,6 +87,7 @@ def _get_quality_stats(d, start_str, field_start=1, field_end=2): """ + min_parsed = False parse = False report = [] start_str = start_str @@ -106,9 +107,25 @@ def _get_quality_stats(d, start_str, field_start=1, field_end=2): return report, status elif parse: + fields = line.strip().split() - report.append((str(fields[0]), - ";".join(fields[field_start: field_end]))) + + # This is triggered when the first value of a line series is + # not 1. 
If the starting point of the series is a number + # different from 1, fill the report with 0 until that point + if not min_parsed: + if fields[0] != "1": + try: + blank_points = int(fields[0]) - 1 + report.extend([0] * blank_points) + except ValueError: + pass + min_parsed = True + + report.append(";".join([ + str(round(float(x), 2)) for x in + fields[field_start: field_end] + ])) def write_json_report(sample_id, data1, data2): @@ -487,12 +504,12 @@ def check_summary_health(summary_file, **kwargs): # WARNINGS # Check for fail sensitive if cat in warning_fail_sensitive and test == "FAIL": - warning.append("{}:low".format(cat)) + warning.append("Failed category: {}".format(cat)) logger.warning("Category {} flagged at a fail sensitive " "category".format(cat)) if cat in warning_must_pass and test != "PASS": - warning.append("{}:low".format(cat)) + warning.append("Did not pass category: {}".format(cat)) logger.warning("Category {} flagged at a must pass " "category".format(cat)) @@ -589,7 +606,7 @@ def main(sample_id, result_p1, result_p2, opts): json_dic["fail"] = [{ "sample": sample_id, "table": "qc", - "value": fail_msg + "value": [fail_msg] }] report_fh.write( json.dumps(json_dic, separators=(",", ":"))) diff --git a/flowcraft/templates/integrity_coverage.py b/flowcraft/templates/integrity_coverage.py index 3714ff65..71007ba6 100755 --- a/flowcraft/templates/integrity_coverage.py +++ b/flowcraft/templates/integrity_coverage.py @@ -74,8 +74,8 @@ """ -__version__ = "1.0.0" -__build__ = "16012018" +__version__ = "1.0.1" +__build__ = "03082018" __template__ = "integrity_coverage-nf" import os @@ -370,10 +370,12 @@ def main(sample_id, fastq_pair, gsize, minimum_coverage, opts): "value": nreads, "table": "qc", "columnBar": True}, - {"header": "Coverage (1st)", + {"header": "Coverage", "value": exp_coverage, "table": "qc", - "columnBar": True} + "columnBar": True, + "failThreshold": minimum_coverage + } ] }], "plotData": [{ @@ -382,24 +384,24 @@ def main(sample_id, fastq_pair, gsize, minimum_coverage, opts): "sparkline": chars } }], - "minCoverage": minimum_coverage } else: json_dic = { "tableRow": [{ "sample": sample_id, "data": [ - {"header": "Coverage (2nd)", + {"header": "Coverage", "value": exp_coverage, "table": "qc", - "columnBar": True} + "columnBar": True, + "failThreshold": minimum_coverage + } ], }], - "minCoverage": minimum_coverage } # Get encoding - if len(encoding) > 1: + if len(encoding) > 0: encoding = set(encoding) phred = set(phred) # Get encoding and phred as strings @@ -414,10 +416,17 @@ def main(sample_id, fastq_pair, gsize, minimum_coverage, opts): phred_fh.write(phred) # Encoding not found else: - logger.warning("Could not guess encoding and phred from " - "FastQ") - enc_fh.write("None") - phred_fh.write("None") + if not skip_encoding: + encoding_msg = "Could not guess encoding and phred from " \ + "FastQ" + logger.warning(encoding_msg) + json_dic["warnings"] = [{ + "sample": sample_id, + "table": "qc", + "value": [encoding_msg] + }] + enc_fh.write("None") + phred_fh.write("None") # Estimate coverage logger.info("Estimating coverage based on a genome size of " @@ -442,7 +451,7 @@ def main(sample_id, fastq_pair, gsize, minimum_coverage, opts): json_dic["fail"] = [{ "sample": sample_id, "table": "qc", - "value": fail_msg + "value": [fail_msg] }] json_report.write(json.dumps(json_dic, separators=(",", ":"))) diff --git a/flowcraft/templates/mapping2json.py b/flowcraft/templates/mapping2json.py index 83bf5eea..cf9c220f 100755 --- a/flowcraft/templates/mapping2json.py +++ 
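The ``min_parsed`` block above pads the per-base series with zeros whenever FastQC starts reporting at a position greater than 1, so that every series is aligned to position 1 in the report plots. A self-contained sketch of that padding, using a made-up two-column series instead of a real ``fastqc_data.txt``::

    # Sketch of the leading-zero padding applied to a FastQC per-base series.
    # The input lines are invented; the real template reads fastqc_data.txt.

    def pad_series(lines):
        report = []
        min_parsed = False
        for line in lines:
            fields = line.strip().split()
            if not min_parsed:
                # If the series starts at a position other than 1, fill the
                # missing leading positions with 0 so plots stay aligned.
                if fields[0] != "1":
                    try:
                        report.extend([0] * (int(fields[0]) - 1))
                    except ValueError:
                        # Ranges such as "10-14" are left unpadded
                        pass
                min_parsed = True
            report.append(round(float(fields[1]), 2))
        return report

    # FastQC sometimes starts reporting at e.g. position 3
    print(pad_series(["3\t32.1", "4\t33.9"]))  # [0, 0, 32.1, 33.9]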
b/flowcraft/templates/mapping2json.py @@ -262,11 +262,23 @@ def main(depth_file, json_dict, cutoff, sample_id): output_json.write(json.dumps(percentage_bases_covered)) json_dic = { + "tableRow": [{ + "sample": sample_id, + "data": [{ + "header": "Mapping", + "table": "plasmids", + "patlas_mapping": percentage_bases_covered, + "value": len(percentage_bases_covered) + }] + }], "sample": sample_id, "patlas_mapping": percentage_bases_covered, - "plotData": { - "mappingPlasmids": dict_cov, - } + "plotData": [{ + "sample": sample_id, + "data": { + "patlasMappingSliding": dict_cov + }, + }] } logger.debug("Size of dict_cov: {} kb".format(asizeof(json_dic)/1024)) diff --git a/flowcraft/templates/mashdist2json.py b/flowcraft/templates/mashdist2json.py index dd4ce603..491d02fc 100644 --- a/flowcraft/templates/mashdist2json.py +++ b/flowcraft/templates/mashdist2json.py @@ -22,12 +22,12 @@ """ -__version__ = "1.3.0" -__build__ = "04072018" +__version__ = "1.4.0" +__build__ = "04092018" __template__ = "mashsdist2json-nf" -import os import json +import os from flowcraft_utils.flowcraft_base import get_logger, MainWrapper @@ -37,14 +37,16 @@ MASH_TXT = '$mashtxt' HASH_CUTOFF = '$shared_hashes' SAMPLE_ID = '$sample_id' + ASSEMBLY_IN = '$fasta' logger.debug("Running {} with parameters:".format( os.path.basename(__file__))) logger.debug("MASH_TXT: {}".format(MASH_TXT)) logger.debug("HASH_CUTOFF: {}".format(HASH_CUTOFF)) logger.debug("SAMPLE_ID: {}".format(SAMPLE_ID)) + logger.debug("ASSEMBLY_IN: {}".format(ASSEMBLY_IN)) -def send_to_output(master_dict, mash_output, sample_id): +def send_to_output(master_dict, mash_output, sample_id, assembly_file): """Send dictionary to output json file This function sends master_dict dictionary to a json file if master_dict is populated with entries, otherwise it won't create the file @@ -68,6 +70,9 @@ def send_to_output(master_dict, mash_output, sample_id): ------- """ + + plot_dict = {} + # create a new file only if master_dict is populated if master_dict: out_file = open("{}.json".format( @@ -75,18 +80,43 @@ def send_to_output(master_dict, mash_output, sample_id): out_file.write(json.dumps(master_dict)) out_file.close() - json_dic = { + # iterate through master_dict in order to make contigs the keys + for k,v in master_dict.items(): + if not v[2] in plot_dict: + plot_dict[v[2]] = [k] + else: + plot_dict[v[2]].append(k) + + number_hits = len(master_dict) + else: + number_hits = 0 + + json_dic = { + "tableRow": [{ + "sample": sample_id, + "data": [{ + "header": "Mash Dist", + "table": "plasmids", + "patlas_mashdist": master_dict, + "value": number_hits + }] + }], + "plotData": [{ "sample": sample_id, - "patlas_mashdist": master_dict - } + "data": { + "patlasMashDistXrange": plot_dict + }, + "assemblyFile": assembly_file + }] + } - with open(".report.json", "w") as json_report: - json_report.write(json.dumps(json_dic, separators=(",", ":"))) + with open(".report.json", "w") as json_report: + json_report.write(json.dumps(json_dic, separators=(",", ":"))) @MainWrapper -def main(mash_output, hash_cutoff, sample_id): - ''' +def main(mash_output, hash_cutoff, sample_id, assembly_file): + """ Main function that allows to dump a mash dist txt file to a json file Parameters @@ -99,7 +129,7 @@ def main(mash_output, hash_cutoff, sample_id): to the results outputs sample_id: str The name of the sample. 
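In ``send_to_output`` above, ``master_dict`` (keyed by pATLAS accession) is inverted into ``plot_dict`` so that the query contigs become the keys of ``patlasMashDistXrange``. A small sketch of that inversion, assuming, as in the template, that the third element of each value is the matching contig name (the accessions and values below are invented)::

    # Sketch of the master_dict -> plot_dict inversion in send_to_output.
    # Keys are pATLAS accessions; the third element of each value is assumed
    # to be the query contig that produced the hit.

    master_dict = {
        "ACC_1": [0.99, "980/1000", "contig_5"],
        "ACC_2": [0.97, "950/1000", "contig_5"],
        "ACC_3": [0.95, "900/1000", "contig_12"],
    }

    plot_dict = {}
    for accession, value in master_dict.items():
        contig = value[2]
        # Group accessions by the contig they matched against
        plot_dict.setdefault(contig, []).append(accession)

    print(plot_dict)
    # {'contig_5': ['ACC_1', 'ACC_2'], 'contig_12': ['ACC_3']}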
- ''' + """ input_f = open(mash_output, "r") @@ -133,8 +163,8 @@ def main(mash_output, hash_cutoff, sample_id): ] # assures that file is closed in last iteration of the loop - send_to_output(master_dict, mash_output, sample_id) + send_to_output(master_dict, mash_output, sample_id, assembly_file) if __name__ == "__main__": - main(MASH_TXT, HASH_CUTOFF, SAMPLE_ID) + main(MASH_TXT, HASH_CUTOFF, SAMPLE_ID, ASSEMBLY_IN) diff --git a/flowcraft/templates/mashscreen2json.py b/flowcraft/templates/mashscreen2json.py index 3558f06e..75d10148 100644 --- a/flowcraft/templates/mashscreen2json.py +++ b/flowcraft/templates/mashscreen2json.py @@ -67,11 +67,11 @@ def main(mash_output, sample_id): for line in read_mash_output: tab_split = line.split("\t") identity = tab_split[0] - #shared_hashes = tab_split[1] + # shared_hashes = tab_split[1] median_multiplicity = tab_split[2] - #p_value = tab_split[3] + # p_value = tab_split[3] query_id = tab_split[4] - #query-comment should not exist here and it is irrelevant + # query-comment should not exist here and it is irrelevant # here identity is what in fact interests to report to json but # median_multiplicity also is important since it gives an rough @@ -109,14 +109,21 @@ def main(mash_output, sample_id): output_json.close() json_dic = { - "sample_id": sample_id, - "patlas_mashscreen": filtered_dic - # TODO add information for report webapp + "tableRow": [{ + "sample": sample_id, + "data": [{ + "header": "Mash Screen", + "table": "plasmids", + "patlas_mashscreen": filtered_dic, + "value": len(filtered_dic) + }] + }], } with open(".report.json", "w") as json_report: json_report.write(json.dumps(json_dic, separators=(",", ":"))) + if __name__ == "__main__": main(MASH_TXT, SAMPLE_ID) diff --git a/flowcraft/templates/process_abricate.py b/flowcraft/templates/process_abricate.py index c86575e9..605d0b49 100755 --- a/flowcraft/templates/process_abricate.py +++ b/flowcraft/templates/process_abricate.py @@ -430,24 +430,45 @@ def get_plot_data(self): List of JSON/dict objects with the report data. 
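``mashscreen2json.py`` above parses the tab-separated mash screen output and reports the retained hits as a ``plasmids`` table row whose ``value`` is the number of hits. A reduced sketch of that flow with two invented mash screen lines (the per-hit value layout in ``filtered_dic`` is an assumption, not the template's exact structure)::

    # Sketch of mash screen output parsing, reduced to the fields kept by the
    # template. The lines below are invented examples of
    # "identity  shared-hashes  median-multiplicity  p-value  query-ID".
    import json

    mash_lines = [
        "0.998\t990/1000\t25\t0.0\tACC_1",
        "0.951\t700/1000\t3\t0.0\tACC_2",
    ]

    filtered_dic = {}
    for line in mash_lines:
        fields = line.split("\t")
        identity, median_multiplicity, query_id = fields[0], fields[2], fields[4]
        # Assumed value layout for illustration only
        filtered_dic[query_id] = [float(identity), int(median_multiplicity)]

    json_dic = {
        "tableRow": [{
            "sample": "sample_A",
            "data": [{
                "header": "Mash Screen",
                "table": "plasmids",
                "patlas_mashscreen": filtered_dic,
                "value": len(filtered_dic),
            }]
        }],
    }
    print(json.dumps(json_dic, separators=(",", ":")))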
""" - json_dic = {"plotData": {}} + json_dic = {"plotData": []} + sample_dic = {} + sample_assembly_map = {} for entry in self.storage.values(): + + sample_id = re.match("(.*)_abr", entry["log_file"]).groups()[0] + if sample_id not in sample_dic: + sample_dic[sample_id] = {} + # Get contig ID using the same regex as in `assembly_report.py` # template contig_id = self._get_contig_id(entry["reference"]) # Get database database = entry["database"] - if database not in json_dic["plotData"]: - json_dic["plotData"][database] = [] + if database not in sample_dic[sample_id]: + sample_dic[sample_id][database] = [] + + # Update the sample-assembly correspondence dict + if sample_id not in sample_assembly_map: + sample_assembly_map[sample_id] = entry["infile"] - json_dic["plotData"][database].append( + sample_dic[sample_id][database].append( {"contig": contig_id, "seqRange": entry["seq_range"], "gene": entry["gene"].replace("'", ""), "accession": entry["accession"], "coverage": entry["coverage"], - "identity": entry["identity"]} + "identity": entry["identity"], + }, + ) + + for sample, data in sample_dic.items(): + json_dic["plotData"].append( + { + "sample": sample, + "data": {"abricateXrange": data}, + "assemblyFile": sample_assembly_map[sample] + } ) return json_dic diff --git a/flowcraft/templates/process_assembly.py b/flowcraft/templates/process_assembly.py index fb6209e7..23aab3e5 100644 --- a/flowcraft/templates/process_assembly.py +++ b/flowcraft/templates/process_assembly.py @@ -505,16 +505,16 @@ def main(sample_id, assembly_file, gsize, opts, assembler): assembly_len) logger.warning(warn_msg) warn_fh.write(warn_msg) - fails = "Small_genome_size_({})".format(assembly_len) + fails = warn_msg if assembly_len > t_150: - warn_msg = "Assembly size ({}) smaller than the maximum" \ + warn_msg = "Assembly size ({}) larger than the maximum" \ " threshold of 150% of expected genome size.".format( assembly_len) logger.warning(warn_msg) warn_fh.write(warn_msg) - fails = "Large_genome_size_({})".format(assembly_len) + fails = warn_msg logger.debug("Checking number of contigs: {}".format( len(assembly_obj.contigs))) @@ -522,12 +522,14 @@ def main(sample_id, assembly_file, gsize, opts, assembler): if len(assembly_obj.contigs) > contig_threshold: warn_msg = "The number of contigs ({}) exceeds the threshold of " \ - "100 contigs per 1.5Mb ({})".format( - assembly_obj.contigs, contig_threshold) + "{} contigs per 1.5Mb ({})".format( + len(assembly_obj.contigs), + max_contigs, + round(contig_threshold, 1)) logger.warning(warn_msg) warn_fh.write(warn_msg) - warnings.append("excessive_contigs:moderate") + warnings.append(warn_msg) # Write filtered assembly logger.debug("Renaming old assembly file to: {}".format( @@ -553,18 +555,20 @@ def main(sample_id, assembly_file, gsize, opts, assembler): "columnBar": True} ] }], - "warnings": [{ + } + + if warnings: + json_dic["warnings"] = [{ "sample": sample_id, "table": "assembly", "value": warnings }] - } if fails: json_dic["fail"] = [{ "sample": sample_id, "table": "assembly", - "value": fails + "value": [fails] }] json_report.write(json.dumps(json_dic, separators=(",", ":"))) diff --git a/flowcraft/templates/process_assembly_mapping.py b/flowcraft/templates/process_assembly_mapping.py index 435cdd57..6e892341 100644 --- a/flowcraft/templates/process_assembly_mapping.py +++ b/flowcraft/templates/process_assembly_mapping.py @@ -362,7 +362,7 @@ def check_filtered_assembly(coverage_info, coverage_bp, minimum_coverage, "{}".format(total_assembled_bp)) warnings = [] - fails 
= "" + fails = [] health = True with open(".warnings", "w") as warn_fh, \ @@ -379,7 +379,7 @@ def check_filtered_assembly(coverage_info, coverage_bp, minimum_coverage, assembly_len) logger.warning(warn_msg) warn_fh.write(warn_msg) - fails = "Large_genome_size_({})".format(assembly_len) + fails.append("Large_genome_size_({})".format(assembly_len)) # If the number of contigs in the filtered assembly size crosses the # max_contigs threshold, issue a warning @@ -388,11 +388,11 @@ def check_filtered_assembly(coverage_info, coverage_bp, minimum_coverage, contig_threshold = max_contigs * genome_size / 1.5 if ncontigs > contig_threshold: warn_msg = "The number of contigs ({}) exceeds the threshold of " \ - "100 contigs per 1.5Mb: {}".format( - ncontigs, contig_threshold) + "100 contigs per 1.5Mb ({})".format( + ncontigs, round(contig_threshold, 1)) logger.warning(warn_msg) warn_fh.write(warn_msg) - warnings.append("excessive_contigs:high") + warnings.append(warn_msg) # If the filtered assembly size falls below the 80% genome size # threshold, fail this check and return False @@ -402,7 +402,7 @@ def check_filtered_assembly(coverage_info, coverage_bp, minimum_coverage, assembly_len) logger.warning(warn_msg) warn_fh.write(warn_msg) - fails = "Small_genome_size_({})".format(assembly_len) + fails.append("Small_genome_size_({})".format(assembly_len)) assembly_len = sum([v for v in contig_size.values()]) total_assembled_bp = sum( [sum(coverage_bp[x]) for x in coverage_info if x in @@ -417,8 +417,8 @@ def check_filtered_assembly(coverage_info, coverage_bp, minimum_coverage, json_dic = { "plotData": [{ "sample": sample_id, - "data:": {"sparkline": total_assembled_bp, - "coverageDist": [x["cov"] for x in coverage_info.values()] + "data": { + "sparkline": total_assembled_bp } }] } @@ -433,7 +433,7 @@ def check_filtered_assembly(coverage_info, coverage_bp, minimum_coverage, json_dic["fail"] = [{ "sample": sample_id, "table": "assembly", - "value": fails + "value": [fails] }] json_report.write(json.dumps(json_dic, separators=(",", ":"))) diff --git a/flowcraft/templates/process_mapping.py b/flowcraft/templates/process_mapping.py new file mode 100644 index 00000000..9e97d990 --- /dev/null +++ b/flowcraft/templates/process_mapping.py @@ -0,0 +1,268 @@ +#!/usr/bin/env python3 + +import re +import os +import json + +from flowcraft_utils.flowcraft_base import get_logger, MainWrapper + + + +""" +Purpose +------- + +This module is intended to process the output of mapping proces from a single +sample from the program Bowtie for the report component. +The main input is an log file produced by the mapper. + +Expected input +-------------- + +The following variables are expected whether using NextFlow or the +:py:func:`main` executor. + +- ``sample_id``: Sample Identification string. + - e.g.: ``'SampleA'`` +- ``bowtie_log``: Log file from the mapper. + - e.g.: ``'bowtie.log'`` + +Generated output +---------------- +- ``.report.jason``: Data structure for the report + +Code documentation +------------------ + +""" + +__version__ = "1.0.1" +__build__ = "10.09.2018" +__template__ = "remove_host-nf" + +logger = get_logger(__file__) + + +if __file__.endswith(".command.sh"): + SAMPLE_ID = '$sample_id' + BOWTIE_LOG = '$bowtie_log' + logger.debug("Running {} with parameters:".format( + os.path.basename(__file__))) + logger.debug("SAMPLE_ID: {}".format(SAMPLE_ID)) + logger.debug("BOWTIE_LOG: {}".format(BOWTIE_LOG)) + + + +class Bowtie: + """ + Class to parse and store the info in the bowtie log file. 
+ + """ + + def __init__(self, sample_id, bowtie_log): + + self.sample = sample_id + """ + str: The name of the sample for the assembly. + """ + + self.n_reads = 0 + + self.align_0x = 0 + + self.align_1x = 0 + + self.align_mt1x = 0 + + self.overall_rate = 0.0 + + # Parse assembly and populate self.n_reads, self.align_0x, self.align_1x, self.align_mt1x and self.overall_rate + self.parse_log(bowtie_log) + + + def set_n_reads(self, n_reads): + self.n_reads = int(n_reads) + + + def set_align_0x(self,align_0x): + self.align_0x = align_0x + + + def set_align_1x(self,align_1x): + self.align_1x = align_1x + + + def set_align_mt1x(self,align_mt1x): + self.align_mt1x = align_mt1x + + + def set_overall_rate(self,overall_rate): + self.overall_rate = overall_rate + + + def parse_log(self, bowtie_log): + """Parse a bowtie log file. + + This is a bowtie log parsing method that populates the + :py:attr:`self.n_reads, self.align_0x, self.align_1x, self.align_mt1x and self.overall_rate` attributes with + data from the log file. + + Disclamer: THIS METHOD IS HORRIBLE BECAUSE THE BOWTIE LOG IS HORRIBLE. + + The insertion of data on the attribytes is done by the + :py:meth:`set_attribute method. + + Parameters + ---------- + bowtie_log : str + Path to the boetie log file. + + """ + + print("is here!") + + # Regexes - thanks to https://github.com/ewels/MultiQC/blob/master/multiqc/modules/bowtie2/bowtie2.py + regexes = { + 'unpaired': { + 'unpaired_aligned_none': r"(\\d+) \\([\\d\\.]+%\\) aligned 0 times", + 'unpaired_aligned_one': r"(\\d+) \\([\\d\\.]+%\\) aligned exactly 1 time", + 'unpaired_aligned_multi': r"(\\d+) \\([\\d\\.]+%\\) aligned >1 times" + }, + 'paired': { + 'paired_aligned_none': r"(\\d+) \\([\\d\\.]+%\\) aligned concordantly 0 times", + 'paired_aligned_one': r"(\\d+) \\([\\d\\.]+%\\) aligned concordantly exactly 1 time", + 'paired_aligned_multi': r"(\\d+) \\([\\d\\.]+%\\) aligned concordantly >1 times", + 'paired_aligned_discord_one': r"(\\d+) \\([\\d\\.]+%\\) aligned discordantly 1 time", + 'paired_aligned_discord_multi': r"(\\d+) \\([\\d\\.]+%\\) aligned discordantly >1 times", + 'paired_aligned_mate_one': r"(\\d+) \\([\\d\\.]+%\\) aligned exactly 1 time", + 'paired_aligned_mate_multi': r"(\\d+) \\([\\d\\.]+%\\) aligned >1 times", + 'paired_aligned_mate_none': r"(\\d+) \\([\\d\\.]+%\\) aligned 0 times" + } + } + + #Missing parser for unpaired (not implemented in flowcraft yet) + + with open(bowtie_log, "r") as f: + #Go through log file line by line + for l in f: + + print(l) + + #total reads + total = re.search(r"(\\d+) reads; of these:", l) + print(total) + if total: + print(total) + self.set_n_reads(total.group(1)) + + + # Paired end reads aka the pain + paired = re.search(r"(\\d+) \\([\\d\\.]+%\\) were paired; of these:", l) + if paired: + paired_total = int(paired.group(1)) + + paired_numbers = {} + + # Do nested loop whilst we have this level of indentation + l = f.readline() + while l.startswith(' '): + for k, r in regexes['paired'].items(): + match = re.search(r, l) + if match: + paired_numbers[k] = int(match.group(1)) + l = f.readline() + + + align_zero_times = paired_numbers['paired_aligned_none'] + paired_numbers['paired_aligned_mate_none'] + if align_zero_times: + self.set_align_0x(align_zero_times) + + align_one_time = paired_numbers['paired_aligned_one'] + paired_numbers['paired_aligned_mate_one'] + if align_one_time: + self.set_align_1x(align_one_time) + + align_more_than_one_time = paired_numbers['paired_aligned_multi'] + paired_numbers['paired_aligned_mate_multi'] + if 
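The ``parse_log`` method above walks the bowtie2 log line by line and aggregates the paired and mate counts with regular expressions borrowed from the MultiQC bowtie2 module. A reduced, self-contained sketch that pulls only the headline numbers out of an invented log (the real template also sums the concordant, discordant and mate categories)::

    # Minimal sketch of extracting the headline numbers from a bowtie2 log
    # with regular expressions. The log text below is an invented example.
    import re

    log_text = """10000 reads; of these:
      10000 (100.00%) were paired; of these:
        500 (5.00%) aligned concordantly 0 times
        9000 (90.00%) aligned concordantly exactly 1 time
        500 (5.00%) aligned concordantly >1 times
    98.20% overall alignment rate
    """

    n_reads = int(re.search(r"(\d+) reads; of these:", log_text).group(1))
    aligned_none = int(re.search(
        r"(\d+) \([\d\.]+%\) aligned concordantly 0 times", log_text).group(1))
    overall = float(re.search(
        r"([\d\.]+)% overall alignment rate", log_text).group(1))

    print(n_reads, aligned_none, overall)  # 10000 500 98.2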
align_more_than_one_time: + self.set_align_mt1x(align_more_than_one_time) + + + # Overall alignment rate + overall = re.search(r"([\\d\\.]+)% overall alignment rate", l) + if overall: + self.overall_rate = float(overall.group(1)) + + +@MainWrapper +def main(sample_id, bowite_log): + """Main executor of the process_mapping template. + + Parameters + ---------- + sample_id : str + Sample Identification string. + boetie_log: str + Path to the log file generated by bowtie. + + """ + + logger.info("Starting mapping file processing") + warnings = [] + fails = "" + + bowtie_info = Bowtie(sample_id, bowite_log) + + print(bowtie_info.overall_rate) + + + with open(".report.json", "w") as json_report: + json_dic = { + "tableRow": [{ + "sample": sample_id, + "data": [ + {"header": "Reads", + "value": int(bowtie_info.n_reads), + "table": "mapping", + "columnBar": False}, + {"header": "Unmapped", + "value": int(bowtie_info.align_0x), + "table": "mapping", + "columnBar": False}, + {"header": "Mapped 1x", + "value": int(bowtie_info.align_1x), + "table": "mapping", + "columnBar": False}, + {"header": "Mapped >1x", + "value": int(bowtie_info.align_mt1x), + "table": "mapping", + "columnBar": False}, + {"header": "Overall alignment rate (%)", + "value": float(bowtie_info.overall_rate), + "table": "mapping", + "columnBar": False} + ] + }], + } + + if warnings: + json_dic["warnings"] = [{ + "sample": sample_id, + "table": "mapping", + "value": warnings + }] + + if fails: + json_dic["fail"] = [{ + "sample": sample_id, + "table": "mapping", + "value": [fails] + }] + + json_report.write(json.dumps(json_dic, separators=(",", ":"))) + + with open(".status", "w") as status_fh: + status_fh.write("pass") + + +if __name__ == '__main__': + + main(SAMPLE_ID, BOWTIE_LOG) \ No newline at end of file diff --git a/flowcraft/templates/process_newick.py b/flowcraft/templates/process_newick.py new file mode 100644 index 00000000..942cbe72 --- /dev/null +++ b/flowcraft/templates/process_newick.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python3 + +import os +import json +import dendropy + +from flowcraft_utils.flowcraft_base import get_logger, MainWrapper + + + +""" +Purpose +------- + +This module is intended to process the newick generated by + a proces to generate a report. The newick tree will be + rooted (midpoint). + + +Expected input +-------------- + +The following variables are expected whether using NextFlow or the +:py:func:`main` executor. + +- ``newick``: phylogenetic tree in newick format. + +Generated output +---------------- +- ``.report.jason``: Data structure for the report + +Code documentation +------------------ + +""" + +__version__ = "1.0.1" +__build__ = "20.09.2018" +__template__ = "raxml-nf" + +logger = get_logger(__file__) + + +if __file__.endswith(".command.sh"): + NEWICK = '$newick' + logger.debug("Running {} with parameters:".format( + os.path.basename(__file__))) + logger.debug("NEWICK: {}".format(NEWICK)) + + + +@MainWrapper +def main(newick): + """Main executor of the process_newick template. + + Parameters + ---------- + newick : str + path to the newick file. 
+ + """ + + logger.info("Starting newick file processing") + + print(newick) + + tree = dendropy.Tree.get(file=open(newick, 'r'), schema="newick") + + tree.reroot_at_midpoint() + + to_write=tree.as_string("newick").strip().replace("[&R] ", '').replace(' ', '_').replace("'", "") + + with open(".report.json", "w") as json_report: + json_dic = { + "treeData": [{ + "trees": [ + to_write + ] + }], + } + + json_report.write(json.dumps(json_dic, separators=(",", ":"))) + + with open(".status", "w") as status_fh: + status_fh.write("pass") + + +if __name__ == '__main__': + + main(NEWICK) + diff --git a/flowcraft/templates/process_viral_assembly.py b/flowcraft/templates/process_viral_assembly.py new file mode 100644 index 00000000..334728c6 --- /dev/null +++ b/flowcraft/templates/process_viral_assembly.py @@ -0,0 +1,564 @@ +#!/usr/bin/env python3 + +import os +import json +import operator +from itertools import groupby + +from flowcraft_utils.flowcraft_base import get_logger, MainWrapper + + + +""" +Purpose +------- + +This module is intended to process the output of assembly process from a single +sample from the program Spades or Megahit for the report component. +The main input is an fasta file produced by the assembler. + +Expected input +-------------- + +The following variables are expected whether using NextFlow or the +:py:func:`main` executor. + +- ``sample_id``: Sample Identification string. + - e.g.: ``'SampleA'`` +- ``assembly``: fasta file from the assembler. + - e.g.: ``'spades.fasta'`` +- ``orfSize``: minimum contig size to be considered a complete ORF + +Generated output +---------------- +- ``.report.jason``: Data structure for the report + +Code documentation +------------------ + +""" + +__version__ = "1.0.1" +__build__ = "11.09.2018" +__template__ = "viral_assembly-nf" + +logger = get_logger(__file__) + +if __file__.endswith(".command.sh"): + SAMPLE_ID = '$sample_id' + ASSEMBLY = '$assembly' + MINSIZE = '$min_size' + logger.debug("Running {} with parameters:".format( + os.path.basename(__file__))) + logger.debug("SAMPLE_ID: {}".format(SAMPLE_ID)) + logger.debug("ASSEMBLY: {}".format(ASSEMBLY)) + logger.debug("MINSIZE: {}".format(MINSIZE)) + + +class Assembly: + """Class that parses and filters a Fasta assembly file + + This class parses an assembly fasta file, collects a number + of summary statistics and metadata from the contigs, filters + contigs based on user-defined metrics and writes filtered assemblies + and reports. + + Parameters + ---------- + assembly_file : str + Path to assembly file. + min_contig_len : int + Minimum contig length when applying the initial assembly filter. + min_kmer_cov : int + Minimum k-mer coverage when applying the initial assembly. + filter. + sample_id : str + Name of the sample for the current assembly. + """ + + def __init__(self, assembly_file, min_contig_len, min_kmer_cov, + sample_id, min_size): + + self.contigs = {} + """ + dict: Dictionary storing data for each contig. + """ + + self.filtered_ids = [] + """ + list: List of filtered contig_ids. + """ + + self.min_gc = 0.05 + """ + float: Sets the minimum GC content on a contig. + """ + + self.sample = sample_id + """ + str: The name of the sample for the assembly. + """ + + self.nORFs = 0 + """ + int: number of complete ORFs in the assembly. + """ + + self.report = {} + """ + dict: Will contain the filtering results for each contig. 
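``process_newick.py`` above midpoint-roots the tree with dendropy and strips the ``[&R]`` rooting token before placing the tree in ``treeData``. A reduced example of that rerooting step, reading from an in-memory newick string instead of a file::

    # Sketch of the midpoint rooting done in process_newick, using an
    # in-memory newick string. Branch lengths are required for midpoint
    # rooting; the tree below is invented.
    import dendropy

    newick_str = "((A:0.1,B:0.2):0.05,(C:0.3,D:0.4):0.1);"

    tree = dendropy.Tree.get(data=newick_str, schema="newick")
    tree.reroot_at_midpoint()

    # Strip the "[&R] " rooting token, as the template does
    rooted = tree.as_string(schema="newick").strip().replace("[&R] ", "")
    print(rooted)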
+ """ + + self.filters = [ + ["length", ">=", min_contig_len], + ["kmer_cov", ">=", min_kmer_cov] + ] + """ + list: Setting initial filters to check when parsing the assembly file. + This can be later changed using the 'filter_contigs' method. + """ + + # Parse assembly and populate self.contigs + self._parse_assembly(assembly_file) + + #Gets the number of ORFs + self.getORFs(assembly_file, min_size) + + def getORFs(self, assembly, min_size): + + f_open = open(assembly, "rU") + + entry = (x[1] for x in groupby(f_open, lambda line: line[0] == ">")) + + ORF = 0 + + for header in entry: + seq = "".join(s.strip() for s in entry.__next__()) + if len(seq) >= int(min_size): + ORF += 1 + + self.nORFs = ORF + + + @staticmethod + def _parse_coverage(header_str): + """Attempts to retrieve the coverage value from the header string. + + It splits the header by "_" and then screens the list backwards in + search of the first float value. This will be interpreted as the + coverage value. If it cannot find a float value, it returns None. + This search methodology is based on the strings of assemblers + like spades and skesa that put the mean kmer coverage for each + contig in its corresponding fasta header. + + Parameters + ---------- + header_str : str + String + + Returns + ------- + float or None + The coverage value for the contig. None if it cannot find the + value in the provide string. + """ + + cov = None + for i in header_str.split("_")[::-1]: + try: + cov = float(i) + break + except ValueError: + continue + + return cov + + def _parse_assembly(self, assembly_file): + """Parse an assembly fasta file. + + This is a Fasta parsing method that populates the + :py:attr:`~Assembly.contigs` attribute with data for each contig in the + assembly. + + The insertion of data on the self.contigs is done by the + :py:meth:`Assembly._populate_contigs` method, which also calculates + GC content and proportions. + + Parameters + ---------- + assembly_file : str + Path to the assembly fasta file. + + """ + + # Temporary storage of sequence data + seq_temp = [] + # Id counter for contig that will serve as key in self.contigs + contig_id = 0 + # Initialize kmer coverage and header + cov, header = None, None + + with open(assembly_file) as fh: + + logger.debug("Starting iteration of assembly file: {}".format( + assembly_file)) + for line in fh: + # Skip empty lines + if not line.strip(): + continue + else: + # Remove whitespace surrounding line for further processing + line = line.strip() + + if line.startswith(">"): + # If a sequence has already been populated, save the + # previous contig information + if seq_temp: + # Use join() to convert string list into the full + # contig string. This is generally much more efficient + # than successively concatenating strings. + seq = "".join(seq_temp) + + logger.debug("Populating contig with contig_id '{}', " + "header '{}' and cov '{}'".format( + contig_id, header, cov)) + self._populate_contigs(contig_id, header, cov, seq) + + # Reset temporary sequence storage + seq_temp = [] + contig_id += 1 + + header = line[1:] + cov = self._parse_coverage(line) + + else: + seq_temp.append(line) + + # Populate last contig entry + logger.debug("Populating contig with contig_id '{}', " + "header '{}' and cov '{}'".format( + contig_id, header, cov)) + seq = "".join(seq_temp) + self._populate_contigs(contig_id, header, cov, seq) + + def _populate_contigs(self, contig_id, header, cov, sequence): + """ Inserts data from a single contig into\ + :py:attr:`~Assembly.contigs`. 
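``Assembly.getORFs`` above iterates the FASTA file with ``itertools.groupby``, pairing each header group with the following sequence group and counting sequences at least ``min_size`` long. A self-contained sketch of that iteration over an in-memory FASTA::

    # Sketch of the FASTA iteration used by Assembly.getORFs: group the file
    # into header/sequence blocks with itertools.groupby and count sequences
    # of at least `min_size` bases. Uses an in-memory file for illustration.
    import io
    from itertools import groupby

    fasta = io.StringIO(">contig_1\nATGAAATTT\n>contig_2\nATG\n")
    min_size = 6

    entry = (x[1] for x in groupby(fasta, lambda line: line[0] == ">"))

    n_orfs = 0
    for header in entry:
        # Each header group is followed by the corresponding sequence group
        seq = "".join(s.strip() for s in next(entry))
        if len(seq) >= int(min_size):
            n_orfs += 1

    print(n_orfs)  # 1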
+ + By providing a contig id, the original header, the coverage that + is parsed from the header and the sequence, this method will + populate the :py:attr:`~Assembly.contigs` attribute. + + Parameters + ---------- + contig_id : int + Arbitrary unique contig identifier. + header : str + Original header of the current contig. + cov : float + The contig coverage, parsed from the fasta header + sequence : str + The complete sequence of the contig. + + """ + + # Get AT/GC/N counts and proportions. + # Note that self._get_gc_content returns a dictionary with the + # information on the GC/AT/N counts and proportions. This makes it + # much easier to add to the contigs attribute using the ** notation. + gc_kwargs = self._get_gc_content(sequence, len(sequence)) + logger.debug("Populate GC content with: {}".format(gc_kwargs)) + + self.contigs[contig_id] = { + "header": header, + "sequence": sequence, + "length": len(sequence), + "kmer_cov": cov, + **gc_kwargs + } + + @staticmethod + def _get_gc_content(sequence, length): + """Get GC content and proportions. + + Parameters + ---------- + sequence : str + The complete sequence of the contig. + length : int + The length of the sequence contig. + + Returns + ------- + x : dict + Dictionary with the at/gc/n counts and proportions + + """ + + # Get AT/GC/N counts + at = sum(map(sequence.count, ["A", "T"])) + gc = sum(map(sequence.count, ["G", "C"])) + n = length - (at + gc) + + # Get AT/GC/N proportions + at_prop = at / length + gc_prop = gc / length + n_prop = n / length + + return {"at": at, "gc": gc, "n": n, + "at_prop": at_prop, "gc_prop": gc_prop, "n_prop": n_prop} + + @staticmethod + def _test_truth(x, op, y): + """ Test the truth of a comparisong between x and y using an \ + ``operator``. + + If you want to compare '100 > 200', this method can be called as:: + + self._test_truth(100, ">", 200). + + Parameters + ---------- + x : int + Arbitrary value to compare in the left + op : str + Comparison operator + y : int + Arbitrary value to compare in the rigth + + Returns + ------- + x : bool + The 'truthness' of the test + """ + + ops = { + ">": operator.gt, + "<": operator.lt, + ">=": operator.ge, + "<=": operator.le, + } + + return ops[op](x, y) + + def filter_contigs(self, *comparisons): + """Filters the contigs of the assembly according to user provided\ + comparisons. + + The comparisons must be a list of three elements with the + :py:attr:`~Assembly.contigs` key, operator and test value. For + example, to filter contigs with a minimum length of 250, a comparison + would be:: + + self.filter_contigs(["length", ">=", 250]) + + The filtered contig ids will be stored in the + :py:attr:`~Assembly.filtered_ids` list. + + The result of the test for all contigs will be stored in the + :py:attr:`~Assembly.report` dictionary. + + Parameters + ---------- + comparisons : list + List with contig key, operator and value to test. 
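``Assembly._test_truth`` above resolves a comparison operator given as a string through the ``operator`` module, which is what lets filters be expressed as ``["length", ">=", 250]``. A small sketch of that lookup applied to one contig (``OPS`` and ``test_truth`` are illustrative stand-ins for the class method)::

    # Sketch of the operator lookup used to apply user-defined contig filters.
    import operator

    OPS = {">": operator.gt, "<": operator.lt,
           ">=": operator.ge, "<=": operator.le}

    def test_truth(x, op, y):
        """Evaluate 'x op y' where op is given as a string."""
        return OPS[op](x, y)

    # Invented contig metrics and filters
    contig = {"length": 180, "gc_prop": 0.42}
    filters = [["length", ">=", 250], ["gc_prop", ">=", 0.05]]

    failed = [f for f in filters if not test_truth(contig[f[0]], f[1], f[2])]
    print(failed)  # [['length', '>=', 250]]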
+ + """ + + # Reset list of filtered ids + self.filtered_ids = [] + self.report = {} + + gc_filters = [ + ["gc_prop", ">=", self.min_gc], + ["gc_prop", "<=", 1 - self.min_gc] + ] + + self.filters = list(comparisons) + gc_filters + + logger.debug("Filtering contigs using filters: {}".format( + self.filters)) + + for contig_id, contig in self.contigs.items(): + for key, op, value in list(comparisons) + gc_filters: + if not self._test_truth(contig[key], op, value): + self.filtered_ids.append(contig_id) + self.report[contig_id] = "{}/{}/{}".format(key, + contig[key], + value) + break + else: + self.report[contig_id] = "pass" + + def get_assembly_length(self): + """Returns the length of the assembly, without the filtered contigs. + + Returns + ------- + x : int + Total length of the assembly. + + """ + + return sum( + [vals["length"] for contig_id, vals in self.contigs.items() + if contig_id not in self.filtered_ids]) + + def write_assembly(self, output_file, filtered=True): + """Writes the assembly to a new file. + + The ``filtered`` option controls whether the new assembly will be + filtered or not. + + Parameters + ---------- + output_file : str + Name of the output assembly file. + filtered : bool + If ``True``, does not include filtered ids. + """ + + logger.debug("Writing the filtered assembly into: {}".format( + output_file)) + with open(output_file, "w") as fh: + + for contig_id, contig in self.contigs.items(): + if contig_id not in self.filtered_ids and filtered: + fh.write(">{}_{}\\n{}\\n".format(self.sample, + contig["header"], + contig["sequence"])) + + def write_report(self, output_file): + """Writes a report with the test results for the current assembly + + Parameters + ---------- + output_file : str + Name of the output assembly file. + + """ + + logger.debug("Writing the assembly report into: {}".format( + output_file)) + with open(output_file, "w") as fh: + + for contig_id, vals in self.report.items(): + fh.write("{}, {}\\n".format(contig_id, vals)) + + + +@MainWrapper +def main(sample_id, assembly_file, minsize): + """Main executor of the process_mapping template. + + Parameters + ---------- + sample_id : str + Sample Identification string. + assembly: str + Path to the fatsa file generated by the assembler. + minsize: str + Min contig size to be considered a complete ORF + + """ + + logger.info("Starting assembly file processing") + warnings = [] + fails = "" + + # Parse the spades assembly file and perform the first filtering. + logger.info("Starting assembly parsing") + assembly_obj = Assembly(assembly_file, 0, 0, + sample_id, minsize) + + if 'spades' in assembly_file: + assembler = "SPAdes" + else: + assembler = "MEGAHIT" + + with open(".warnings", "w") as warn_fh: + + t_80 = int(minsize) * 0.8 + t_150 = int(minsize) * 1.5 + # Check if assembly size of the first assembly is lower than 80% of the + # estimated genome size - DENV ORF has min 10k nt. If True, redo the filtering without the + # k-mer coverage filter + assembly_len = assembly_obj.get_assembly_length() + logger.debug("Checking assembly length: {}".format(assembly_len)) + + if assembly_len < t_80: + + logger.warning("Assembly size ({}) smaller than the minimum " + "threshold of 80% of expected genome size. 
" + "Applying contig filters without the k-mer " + "coverage filter".format(assembly_len)) + + assembly_len = assembly_obj.get_assembly_length() + logger.debug("Checking updated assembly length: " + "{}".format(assembly_len)) + if assembly_len < t_80: + + warn_msg = "Assembly size smaller than the minimum" \ + " threshold of 80% of expected genome size: {}".format( + assembly_len) + logger.warning(warn_msg) + warn_fh.write(warn_msg) + fails = warn_msg + + if assembly_len > t_150: + + warn_msg = "Assembly size ({}) larger than the maximum" \ + " threshold of 150% of expected genome size.".format( + assembly_len) + logger.warning(warn_msg) + warn_fh.write(warn_msg) + fails = warn_msg + + + # Write json report + with open(".report.json", "w") as json_report: + json_dic = { + "tableRow": [{ + "sample": sample_id, + "data": [ + {"header": "Contigs ({})".format(assembler), + "value": len(assembly_obj.contigs), + "table": "assembly", + "columnBar": True}, + {"header": "Assembled BP ({})".format(assembler), + "value": assembly_len, + "table": "assembly", + "columnBar": True}, + {"header": "ORFs", + "value": assembly_obj.nORFs, + "table": "assembly", + "columnBar":False} + ] + }], + } + + if warnings: + json_dic["warnings"] = [{ + "sample": sample_id, + "table": "assembly", + "value": warnings + }] + + if fails: + json_dic["fail"] = [{ + "sample": sample_id, + "table": "assembly", + "value": [fails] + }] + + json_report.write(json.dumps(json_dic, separators=(",", ":"))) + + with open(".status", "w") as status_fh: + status_fh.write("pass") + + + +if __name__ == '__main__': + + main(SAMPLE_ID, ASSEMBLY, MINSIZE) + diff --git a/flowcraft/templates/skesa.py b/flowcraft/templates/skesa.py index 44fa48aa..b03b2585 100644 --- a/flowcraft/templates/skesa.py +++ b/flowcraft/templates/skesa.py @@ -24,6 +24,8 @@ - ``${sample_id}_*.assembly.fasta`` : Main output of skesawith the assembly - e.g.: ``sample_1_skesa.fasta`` +- ``clear`` : If 'true', remove the input fastq files at the end of the + component run, IF THE FILES ARE IN THE WORK DIRECTORY Code documentation ------------------ @@ -63,7 +65,7 @@ def __get_version_skesa(): version = "undefined" return { - "program": "SPAdes", + "program": "skesa", "version": version, } diff --git a/flowcraft/templates/spades.py b/flowcraft/templates/spades.py index 9649a0d1..32bd9e68 100644 --- a/flowcraft/templates/spades.py +++ b/flowcraft/templates/spades.py @@ -86,6 +86,7 @@ def __get_version_spades(): KMERS = '$kmers'.strip() CLEAR = '$clear' OPTS = [x.strip() for x in '$opts'.strip("[]").split(",")] + CLEAR = '$clear' logger.debug("Running {} with parameters:".format( os.path.basename(__file__))) logger.debug("SAMPLE_ID: {}".format(SAMPLE_ID)) diff --git a/flowcraft/templates/split_fasta.py b/flowcraft/templates/split_fasta.py index 638af116..04b57f99 100755 --- a/flowcraft/templates/split_fasta.py +++ b/flowcraft/templates/split_fasta.py @@ -75,8 +75,8 @@ def main(sample_id, assembly, min_size): headerStr = header.__next__()[1:].strip() seq = "".join(s.strip() for s in entry.__next__()) if len(seq) >= min_size: - with open(sample_id + '_' + headerStr.replace(" ","_") + '_split.fasta', "w") as output_file: - output_file.write(">" + sample_id + "_" + headerStr + "\\n" + seq + "\\n") + with open(sample_id + '_' + headerStr.replace(" ","_").replace("=","_") + '.fasta', "w") as output_file: + output_file.write(">" + sample_id + "_" + headerStr.replace(" ","_").replace("=","_") + "\\n" + seq + "\\n") success += 1 f_open.close() diff --git 
a/flowcraft/templates/trimmomatic.py b/flowcraft/templates/trimmomatic.py index 6cfb52bf..7264bd1c 100644 --- a/flowcraft/templates/trimmomatic.py +++ b/flowcraft/templates/trimmomatic.py @@ -49,7 +49,7 @@ # TODO: What to do when there is encoding failure __version__ = "1.0.3" -__build__ = "20062018" +__build__ = "29062018" __template__ = "trimmomatic-nf" import os @@ -205,7 +205,7 @@ def write_report(storage_dic, output_file, sample_id): "tableRow": [{ "sample": sample_id, "data": [ - {"header": "trimmed", + {"header": "Trimmed (%)", "value": vals["total_trim_perc"], "table": "qc", "columnBar": True}, diff --git a/flowcraft/tests/test_assemblerflow.py b/flowcraft/tests/test_assemblerflow.py index 401f6c99..ab43dd69 100644 --- a/flowcraft/tests/test_assemblerflow.py +++ b/flowcraft/tests/test_assemblerflow.py @@ -51,7 +51,8 @@ def test_build_file_2(tmp): "{}".format(p), "--pipeline-only"]) af.build(args) - assert sorted(os.listdir(tmp)) == [".treeDag.json", "containers.config", + assert sorted(os.listdir(tmp)) == [".forkTree.json", ".treeDag.json", + "containers.config", "lib", "params.config", "resources.config", "teste.html", "teste.nf", "user.config"] diff --git a/setup.py b/setup.py index 36a555ec..b553b843 100644 --- a/setup.py +++ b/setup.py @@ -20,6 +20,7 @@ "profiles.config", "bin/*", "lib/*", + "resources/*", "generator/templates/*"]}, data_files=[("", ["LICENSE"])], install_requires=[