Skip to content

Commit

Permalink
feat: Add rspr entry (#171)
Browse files Browse the repository at this point in the history
* feat: Add rspr entry

* docs: Add the rSPR entry to the documentation

* chore: Add rSPR entry params to schema
  • Loading branch information
jvfe committed Oct 23, 2023
1 parent 55a170b commit 7e494ab
Show file tree
Hide file tree
Showing 8 changed files with 94 additions and 2 deletions.
4 changes: 3 additions & 1 deletion bin/rspr_approx.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ def parse_args(args=None):
"--annotation",
dest="ANNOTATION",
help="Annotation table from BAKTA/PROKKA",
nargs="?",
)
parser.add_argument(
"-o", "--output", dest="OUTPUT_DIR", default="approx", help="Gene tree list"
Expand Down Expand Up @@ -295,7 +296,8 @@ def main(args=None):
make_heatmap(results, fig_path)

results.reset_index("file_name", inplace=True)
results = join_annotation_data(results, args.ANNOTATION)
if args.ANNOTATION:
results = join_annotation_data(results, args.ANNOTATION)
res_path = os.path.join(args.OUTPUT_DIR, "output.tsv")
df_with_groups = make_groups_from_csv(results, args.MIN_RSPR_DISTANCE)
df_with_groups.to_csv(res_path, sep="\t", index=False)
Expand Down
4 changes: 3 additions & 1 deletion docs/params.md
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,10 @@ Parameters for the recombination subworkflow
| `run_rspr` | Run rSPR | `boolean` | | | |
| `min_rspr_distance` | Minimum rSPR distance used to define processing groups | `integer` | 10 | | |
| `min_branch_length` | Minimum rSPR branch length | `integer` | 0 | | |
| `max_support_threshold` | Maximum rSPR support threshold | `integer` | 0 | | |
| `max_support_threshold` | Maximum rSPR support threshold | `number` | 0.7 | | |
| `max_approx_rspr` | Maximum approximate rSPR distance for filtering | `integer` | -1 | | |
| `core_gene_tree` | Core (or reference) genome tree. Used in the rSPR entry. | `string` | | | |
| `concatenated_annotation` | TSV table of annotations for all genomes, such as the ones generated by Bakta or Prokka in ARETE. | `string` | | | |

## Institutional config options

Expand Down
27 changes: 27 additions & 0 deletions docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,33 @@ To execute phylogenomic and pangenomics analysis on pre-existing assemblies:
nextflow run beiko-lab/ARETE -entry phylogenomics --input_sample_table samplesheet.csv -profile docker
```

### rSPR Entry

To execute the rSPR analysis on pre-existing trees:

```bash
nextflow run beiko-lab/ARETE \
-entry rspr \
--input_sample_table samplesheet.csv \
--core_gene_tree core_gene_alignment.tre \
--concatenated_annotation BAKTA.txt \
-profile docker
```

The parameters being:

- `--core_gene_tree` - The reference tree, coming from a core genome alignment, like the one generated by Panaroo in ARETE.
- `--concatenated_annotation` - The tabular annotation results (TSV) for all genomes, like the ones generated at the end of Prokka or Bakta in ARETE. This parameter is optional: the rSPR entry runs without it, but the results will not be joined with annotation data.
- `--input_sample_table` - A samplesheet containing all individual gene trees in the following format:

```csv
gene_tree,path
CDS_0000,/path/to/CDS_0000.tre
CDS_0001,/path/to/CDS_0001.tre
CDS_0002,/path/to/CDS_0002.tre
CDS_0003,/path/to/CDS_0003.tre
CDS_0004,/path/to/CDS_0004.tre
```

## Updating the pipeline

When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline:
Expand Down
4 changes: 4 additions & 0 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ include { ANNOTATION } from './workflows/arete'
include { PHYLO } from './workflows/arete'
include { QUALITYCHECK } from './workflows/arete'
include { POPPUNK } from './workflows/arete'
include { RUN_RSPR } from './workflows/arete'


//
Expand Down Expand Up @@ -68,6 +69,9 @@ workflow poppunk {
POPPUNK()
}

// Named entry workflow (selected with `-entry rspr`); it only delegates to
// RUN_RSPR, which is included from './workflows/arete'.
workflow rspr {
RUN_RSPR()
}
/*
========================================================================================
RUN ALL WORKFLOWS
Expand Down
4 changes: 4 additions & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,10 @@ params {
max_support_threshold = 0.7
max_approx_rspr = -1

// rSPR entry
core_gene_tree = null
concatenated_annotation = null

// MultiQC options
multiqc_config = null
multiqc_title = null
Expand Down
8 changes: 8 additions & 0 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,14 @@
"type": "integer",
"default": -1,
"description": "Maximum approximate rSPR distance for filtering"
},
"core_gene_tree": {
"type": "string",
"description": "Core (or reference) genome tree. Used in the rSPR entry."
},
"concatenated_annotation": {
"type": "string",
"description": "TSV table of annotations for all genomes. Such as the ones generated by Bakta or Prokka in ARETE."
}
},
"fa_icon": "fas fa-bezier-curve"
Expand Down
27 changes: 27 additions & 0 deletions subworkflows/local/rspr_input_check.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
// Parse the rSPR samplesheet and emit the gene-tree file referenced by each row.
workflow RSPR_INPUT_CHECK {
    take:
    samplesheet // samplesheet CSV with a header line containing a `path` column

    main:
    // With `header: true` every row is exposed by column name; only the
    // `path` column is forwarded to the validation helper.
    trees = samplesheet
        .splitCsv(header: true)
        .map { record -> get_sample_info_rspr(record.path) }

    emit:
    trees // one single-element list per samplesheet row: [ tree_file ]
}

/**
 * Validate one gene-tree path taken from the rSPR samplesheet.
 *
 * @param row  The value of the samplesheet's `path` column (a path string),
 *             as passed by RSPR_INPUT_CHECK -- not the whole CSV row.
 * @return     A single-element list holding the tree as a file object.
 *
 * Terminates the run with exit status 1 when the path is empty or the file
 * does not exist.
 */
def get_sample_info_rspr(row) {
    // An empty value means the samplesheet row has no usable `path` entry.
    if (!row) {
        exit 1, "ERROR: Please check input samplesheet -> Tree file path is missing!"
    }

    def tree = file(row)
    if (!tree.exists()) {
        // `row` is already the path itself; interpolating `row.path` on a
        // String would raise a missing-property error and hide this message.
        exit 1, "ERROR: Please check input samplesheet -> Tree file does not exist!\n${row}"
    }

    return [ tree ]
}
18 changes: 18 additions & 0 deletions workflows/arete.nf
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ ch_multiqc_logo = params.multiqc_logo ? Channel.fromPath( params.mu
include { INPUT_CHECK } from '../subworkflows/local/input_check'
include { PHYLO_INPUT_CHECK } from '../subworkflows/local/phylo_input_check'
include { ANNOTATION_INPUT_CHECK } from '../subworkflows/local/annotation_input_check'
include { RSPR_INPUT_CHECK } from '../subworkflows/local/rspr_input_check'
include { ASSEMBLE_SHORTREADS } from '../subworkflows/local/assembly'
include { ANNOTATE_ASSEMBLIES } from '../subworkflows/local/annotation'
include { CHECK_ASSEMBLIES } from '../subworkflows/local/assemblyqc'
Expand Down Expand Up @@ -591,6 +592,23 @@ workflow POPPUNK {
)
}


// Standalone rSPR workflow: validates the required parameters, loads the
// gene trees listed in the samplesheet and hands everything to RSPR.
workflow RUN_RSPR {
    // The samplesheet listing the gene trees is mandatory.
    if (params.input_sample_table) {
        ch_input = Channel.of(file(params.input_sample_table))
    }
    else {
        exit 1, 'Input samplesheet not specified!'
    }

    // The core (reference) tree is mandatory as well.
    if (params.core_gene_tree) {
        ch_core = file(params.core_gene_tree)
    }
    else {
        exit 1, 'Core tree not specified!'
    }

    // The annotation table is optional; an empty list stands in when absent.
    if (params.concatenated_annotation) {
        ch_annotation_data = file(params.concatenated_annotation)
    }
    else {
        ch_annotation_data = []
    }

    RSPR_INPUT_CHECK(ch_input)

    RSPR(
        ch_core,
        RSPR_INPUT_CHECK.out.trees,
        ch_annotation_data
    )
}

/*
========================================================================================
COMPLETION EMAIL AND SUMMARY
Expand Down

0 comments on commit 7e494ab

Please sign in to comment.