feat: Add gene order entry (#176)

* feat: Add gene order entry * docs: Add gene order entry * docs: Update diagram
beiko-lab · Nov 18, 2023 · b22988b · b22988b
1 parent 6b7e268
commit b22988b
Show file tree

Hide file tree

Showing 10 changed files with 260 additions and 122 deletions.
diff --git a/assets/arete.diagram.light.png b/assets/arete.diagram.light.png
diff --git a/assets/arete.diagram.png b/assets/arete.diagram.png
diff --git a/docs/assets/arete.diagram.light.png b/docs/assets/arete.diagram.light.png
diff --git a/docs/assets/arete.diagram.png b/docs/assets/arete.diagram.png
diff --git a/docs/assets/arete.drawio.xml b/docs/assets/arete.drawio.xml
diff --git a/docs/usage.md b/docs/usage.md
@@ -220,6 +220,26 @@ nextflow run beiko-lab/ARETE \
   -profile docker
 ```
 
+### Gene Order Entry
+
+To execute the Gene Order analysis on pre-existing assemblies and RGI annotations:
+
+```bash
+nextflow run beiko-lab/ARETE \
+  -entry gene_order \
+  --input_sample_table gene_order_samplesheet.csv \
+  -profile docker
+```
+
+- `--input_sample_table` - A CSV samplesheet listing, for each assembly, a FASTA file (`fna_file_path`), a GenBank file (`gbk`) and an RGI output file (`rgi`). Sample IDs (`sample`) may not contain dots:
+
+```
+sample,fna_file_path,gbk,rgi
+SAMD00052607,SAMD00052607.faa,SAMD00052607.gbk,SAMD00052607_rgi.txt
+SAMEA1466699,SAMEA1466699.faa,SAMEA1466699.gbk,SAMEA1466699_rgi.txt
+SAMEA1486355,SAMEA1486355.faa,SAMEA1486355.gbk,SAMEA1486355_rgi.txt
+```
+
 ## Updating the pipeline
 
 When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline:

diff --git a/main.nf b/main.nf
@@ -40,6 +40,7 @@ include { POPPUNK } from './workflows/arete'
 include { RUN_RSPR } from './workflows/arete'
 include { RUN_EVOLCCM } from './workflows/arete'
 include { RUN_RECOMBINATION } from './workflows/arete'
+include { RUN_GENE_ORDER } from './workflows/arete'
 
 //
 // WORKFLOW: Run main nf-core/arete analysis pipeline
@@ -81,6 +82,10 @@ workflow evolccm {
 workflow recombination {
     RUN_RECOMBINATION()
 }
+
+workflow gene_order {
+    RUN_GENE_ORDER() // Named entry point: selected on the command line with `-entry gene_order`
+}
 /*
 ========================================================================================
     RUN ALL WORKFLOWS

diff --git a/subworkflows/local/geneorder_input_check.nf b/subworkflows/local/geneorder_input_check.nf
@@ -0,0 +1,47 @@
+workflow GENEORDER_INPUT_CHECK {
+    take:
+    samplesheet
+
+    main:
+    // Parse every CSV row (header names become keys) and convert it with get_sample_info_geneorder
+    geneorder_input = samplesheet
+        .splitCsv(header: true)
+        .map { row -> get_sample_info_geneorder(row) }
+
+    // Abort the run when a sample ID (first element's `id`) contains a dot
+    geneorder_input
+        .map { entry -> entry[0].id }
+        .subscribe { sample_id -> if ( "$sample_id".contains(".") ) exit 1, "Please review data input, sampleIDs may not contain dots, but \"$sample_id\" does." }
+
+    emit:
+    geneorder_input
+}
+
+// Turn one samplesheet row into the list [ meta, assembly, gbk, rgi ].
+//   row: map for one CSV line, read via the keys sample, fna_file_path, gbk and rgi
+// Aborts the run (exit 1) when a path is missing from the row or does not exist on disk.
+def get_sample_info_geneorder(row) {
+    def meta = [:]
+    meta.id         = row.sample
+    meta.single_end = true //Bit of a hack; call assemblies "single end" to allow passing to kraken
+
+    // Samplesheet column -> label used in error messages; map order is the output order
+    def columns = [fna_file_path: 'Assembly', gbk: 'GenBank', rgi: 'RGI']
+
+    def files = columns.collect { column, label ->
+        def path = row[column]
+        // An absent column or empty cell yields null/"": report which sample and column are at fault
+        if (!path) {
+            exit 1, "ERROR: Please check input samplesheet -> ${label} file not specified for sample '${row.sample}' (column '${column}')!"
+        }
+
+        def resolved = file(path)
+        if (!resolved.exists()) {
+            exit 1, "ERROR: Please check input samplesheet -> ${label} file does not exist!\n${path}"
+        }
+        return resolved
+    }
+
+    // Metadata first, then the assembly, GenBank and RGI files
+    return [ meta ] + files
+}
diff --git a/test/gene-order/gene_order_samplesheet.csv b/test/gene-order/gene_order_samplesheet.csv
@@ -0,0 +1,4 @@
+sample,fna_file_path,gbk,rgi
+SAMD00052607,test/gene-order/FAA/SAMD00052607.faa,test/gene-order/GBK/SAMD00052607.gbk,test/gene-order/RGI/SAMD00052607_rgi.txt
+SAMEA1466699,test/gene-order/FAA/SAMEA1466699.faa,test/gene-order/GBK/SAMEA1466699.gbk,test/gene-order/RGI/SAMEA1466699_rgi.txt
+SAMEA1486355,test/gene-order/FAA/SAMEA1486355.faa,test/gene-order/GBK/SAMEA1486355.gbk,test/gene-order/RGI/SAMEA1486355_rgi.txt
diff --git a/workflows/arete.nf b/workflows/arete.nf
@@ -35,6 +35,7 @@ include { INPUT_CHECK } from '../subworkflows/local/input_check'
 include { PHYLO_INPUT_CHECK } from '../subworkflows/local/phylo_input_check'
 include { ANNOTATION_INPUT_CHECK } from '../subworkflows/local/annotation_input_check'
 include { RSPR_INPUT_CHECK } from '../subworkflows/local/rspr_input_check'
+include { GENEORDER_INPUT_CHECK } from '../subworkflows/local/geneorder_input_check'
 include { ASSEMBLE_SHORTREADS } from '../subworkflows/local/assembly'
 include { ANNOTATE_ASSEMBLIES } from '../subworkflows/local/annotation'
 include { CHECK_ASSEMBLIES } from '../subworkflows/local/assemblyqc'
@@ -683,6 +684,35 @@ workflow RUN_RECOMBINATION {
     ch_software_versions = ch_software_versions.mix(MULTIQC.out.versions.ifEmpty(null))
 
 }
+
+workflow RUN_GENE_ORDER {
+    // Entry workflow: runs GENE_ORDER on the files listed in params.input_sample_table
+    if (!params.input_sample_table) {
+        exit 1, 'Input samplesheet not specified!'
+    } else {
+        ch_input = Channel.of(file(params.input_sample_table))
+    }
+
+    GENEORDER_INPUT_CHECK (
+        ch_input
+    )
+
+    // Each emitted item is a list: sample metadata first, followed by the
+    // assembly, GenBank and RGI files for that sample
+    all_inputs = GENEORDER_INPUT_CHECK.out.geneorder_input
+
+    // Fan out into one [ meta, file ] channel per file type,
+    // in the order GENE_ORDER takes them below
+    assemblies = all_inputs.map { meta, fasta, gbk, rgi -> [meta, fasta] }
+    gbks       = all_inputs.map { meta, fasta, gbk, rgi -> [meta, gbk] }
+    rgis       = all_inputs.map { meta, fasta, gbk, rgi -> [meta, rgi] }
+
+    GENE_ORDER (
+        assemblies,
+        gbks,
+        rgis
+    )
+}
 /*
 ========================================================================================
     COMPLETION EMAIL AND SUMMARY