feat: Add rspr entry (#171)

* feat: Add rspr entry * docs: Add the rSPR entry to the documentation * chore: Add rSPR entry params to schema
beiko-lab · Oct 23, 2023 · 7e494ab · 7e494ab
1 parent 55a170b
commit 7e494ab
Show file tree

Hide file tree

Showing 8 changed files with 94 additions and 2 deletions.
diff --git a/bin/rspr_approx.py b/bin/rspr_approx.py
@@ -36,6 +36,7 @@ def parse_args(args=None):
         "--annotation",
         dest="ANNOTATION",
         help="Annotation table from BAKTA/PROKKA",
+        nargs="?",
     )
     parser.add_argument(
         "-o", "--output", dest="OUTPUT_DIR", default="approx", help="Gene tree list"
@@ -295,7 +296,8 @@ def main(args=None):
     make_heatmap(results, fig_path)
 
     results.reset_index("file_name", inplace=True)
-    results = join_annotation_data(results, args.ANNOTATION)
+    if args.ANNOTATION:
+        results = join_annotation_data(results, args.ANNOTATION)
     res_path = os.path.join(args.OUTPUT_DIR, "output.tsv")
     df_with_groups = make_groups_from_csv(results, args.MIN_RSPR_DISTANCE)
     df_with_groups.to_csv(res_path, sep="\t", index=False)

diff --git a/docs/params.md b/docs/params.md
@@ -93,8 +93,10 @@ Parameters for the recombination subworkflow
 | `run_rspr` | Run rSPR | `boolean` |  |  |  |
 | `min_rspr_distance` | Minimum rSPR distance used to define processing groups | `integer` | 10 |  |  |
 | `min_branch_length` | Minimum rSPR branch length | `integer` | 0 |  |  |
-| `max_support_threshold` | Maximum rSPR support threshold | `integer` | 0 |  |  |
+| `max_support_threshold` | Maximum rSPR support threshold | `number` | 0.7 |  |  |
 | `max_approx_rspr` | Maximum approximate rSPR distance for filtering | `integer` | -1 |  |  |
+| `core_gene_tree` | Core (or reference) genome tree. Used in the rSPR entry. | `string` |  |  |  |
+| `concatenated_annotation` | TSV table of annotations for all genomes. Such as the ones generated by Bakta or Prokka in ARETE. | `string` |  |  |  |
 
 ## Institutional config options
 

diff --git a/docs/usage.md b/docs/usage.md
@@ -149,6 +149,33 @@ To execute phylogenomic and pangenomics analysis on pre-existing assemblies:
 nextflow run beiko-lab/ARETE -entry phylogenomics --input_sample_table samplesheet.csv -profile docker
 ```
 
+### rSPR Entry
+
+To execute the rSPR analysis on pre-existing trees:
+
+```bash
+nextflow run beiko-lab/ARETE \
+  -entry rspr \
+  --input_sample_table samplesheet.csv \
+  --core_gene_tree core_gene_alignment.tre \
+  --concatenated_annotation BAKTA.txt \
+  -profile docker
+```
+
+The parameters are:
+
+- `--core_gene_tree` - The reference tree, coming from a core genome alignment, like the one generated by panaroo in ARETE.
+- `--concatenated_annotation` - The tabular annotation results (TSV) for all genomes, like the ones generated at the end of Prokka or Bakta in ARETE. Although useful, it's not necessary to execute the rSPR entry.
+- `--input_sample_table` - A samplesheet containing all individual gene trees in the following format:
+
+```csv
+gene_tree,path
+CDS_0000,/path/to/CDS_0000.tre
+CDS_0001,/path/to/CDS_0001.tre
+CDS_0002,/path/to/CDS_0002.tre
+CDS_0003,/path/to/CDS_0003.tre
+CDS_0004,/path/to/CDS_0004.tre
+```
+
 ## Updating the pipeline
 
 When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline:

diff --git a/main.nf b/main.nf
@@ -37,6 +37,7 @@ include { ANNOTATION } from './workflows/arete'
 include { PHYLO } from './workflows/arete'
 include { QUALITYCHECK } from './workflows/arete'
 include { POPPUNK } from './workflows/arete'
+include { RUN_RSPR } from './workflows/arete'
 
 
 //
@@ -68,6 +69,9 @@ workflow poppunk {
     POPPUNK()
 }
 
+workflow rspr {
+    RUN_RSPR() // named entry workflow: `-entry rspr` runs only the RUN_RSPR workflow
+}
 /*
 ========================================================================================
     RUN ALL WORKFLOWS

diff --git a/nextflow.config b/nextflow.config
@@ -66,6 +66,10 @@ params {
     max_support_threshold      = 0.7
     max_approx_rspr            = -1
 
+    // rSPR entry
+    core_gene_tree             = null
+    concatenated_annotation    = null
+
     // MultiQC options
     multiqc_config             = null
     multiqc_title              = null

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -279,6 +279,14 @@
                     "type": "integer",
                     "default": -1,
                     "description": "Maximum approximate rSPR distance for filtering"
+                },
+                "core_gene_tree": {
+                    "type": "string",
+                    "description": "Core (or reference) genome tree. Used in the rSPR entry."
+                },
+                "concatenated_annotation": {
+                    "type": "string",
+                    "description": "TSV table of annotations for all genomes. Such as the ones generated by Bakta or Prokka in ARETE."
                 }
             },
             "fa_icon": "fas fa-bezier-curve"

diff --git a/subworkflows/local/rspr_input_check.nf b/subworkflows/local/rspr_input_check.nf
@@ -0,0 +1,27 @@
+workflow RSPR_INPUT_CHECK {
+    take:
+    samplesheet // channel carrying the gene-tree samplesheet (CSV with a header line)
+
+    main:
+    samplesheet
+        .splitCsv(header: true) // header: true -> each row is emitted keyed by column name
+        .map { it -> get_sample_info_rspr(it.path) } // only the `path` column is used
+        .set { trees }
+
+    emit:
+    trees // one single-element list [ tree_file ] per samplesheet row
+}
+
+// Resolve one samplesheet `path` value into a single-element list holding the tree file.
+// Aborts the run with exit status 1 when the file does not exist.
+def get_sample_info_rspr(row) {
+
+    def tree = file(row)
+    if (!tree.exists()) {
+        // `row` is already the path string (the caller passes `it.path`); interpolating
+        // `row.path` would raise MissingPropertyException and hide the real error.
+        exit 1, "ERROR: Please check input samplesheet -> Tree file does not exist!\n${row}"
+    }
+
+    return [ tree ]
+}
diff --git a/workflows/arete.nf b/workflows/arete.nf
@@ -38,6 +38,7 @@ ch_multiqc_logo            = params.multiqc_logo   ? Channel.fromPath( params.mu
 include { INPUT_CHECK } from '../subworkflows/local/input_check'
 include { PHYLO_INPUT_CHECK } from '../subworkflows/local/phylo_input_check'
 include { ANNOTATION_INPUT_CHECK } from '../subworkflows/local/annotation_input_check'
+include { RSPR_INPUT_CHECK } from '../subworkflows/local/rspr_input_check'
 include { ASSEMBLE_SHORTREADS } from '../subworkflows/local/assembly'
 include { ANNOTATE_ASSEMBLIES } from '../subworkflows/local/annotation'
 include { CHECK_ASSEMBLIES } from '../subworkflows/local/assemblyqc'
@@ -591,6 +592,23 @@ workflow POPPUNK {
     )
 }
 
+
+workflow RUN_RSPR {
+    // Required inputs: fail fast on a missing or mistyped path instead of inside a later process.
+    if (params.input_sample_table) { ch_input = Channel.of(file(params.input_sample_table, checkIfExists: true)) } else { exit 1, 'Input samplesheet not specified!' }
+    if (params.core_gene_tree) { ch_core = file(params.core_gene_tree, checkIfExists: true) } else { exit 1, 'Core tree not specified!' }
+    // The annotation table is optional: an empty list is passed on when none is given.
+    ch_annotation_data = params.concatenated_annotation ? file(params.concatenated_annotation, checkIfExists: true) : []
+
+    RSPR_INPUT_CHECK ( ch_input )
+
+    RSPR (
+        ch_core,
+        RSPR_INPUT_CHECK.out.trees,
+        ch_annotation_data
+    )
+}
+
 /*
 ========================================================================================
     COMPLETION EMAIL AND SUMMARY