Skip to content

Commit

Permalink
feat: Add genome subsampling (#63)
Browse files Browse the repository at this point in the history
* feat: Add module to extract distances

* feat: Incorporate subsampling into main workflows

* test: Add test for subsampling subworkflow

* chore: Remove commented out code

* test: Add subsetting to poppunk test

* test: Add subsetting to test profile

* fix: Change way of setting distance output

* refactor: Extract distances should be part of subsetting

* test: Change test to use popdb

* test: Fix poppunk trace size

* refactor: Update default threshold

* feat: Write removed genomes to a file

* test: Update similarity

* feat: Add poppunk entry

* fix: Change variable declaration

* refactor: Change calculus

* test: Update subsampling test
  • Loading branch information
jvfe committed Apr 5, 2023
1 parent 00735d1 commit af41029
Show file tree
Hide file tree
Showing 14 changed files with 265 additions and 26 deletions.
8 changes: 8 additions & 0 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,14 @@ process {
]
}

// Publishing rules for POPPUNK_EXTRACT_DISTANCES: outputs are published to
// <outdir>/poppunk_results/distances/ using the pipeline-wide publish mode.
withName: POPPUNK_EXTRACT_DISTANCES {
publishDir = [
path: { "${params.outdir}/poppunk_results/distances/" },
mode: params.publish_dir_mode,
// versions.yml is mapped to null so it is not published with the results
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}

// pangenomics

withName: PANAROO_RUN {
Expand Down
1 change: 1 addition & 0 deletions conf/test.config
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ params {
db_cache = false
use_full_alignment = false
use_fasttree = true
enable_subsetting = true
use_prokka = true
skip_kraken = true
skip_poppunk = true
Expand Down
6 changes: 6 additions & 0 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ include { ARETE } from './workflows/arete'
include { ASSEMBLY } from './workflows/arete'
include { ANNOTATION } from './workflows/arete'
include { QUALITYCHECK } from './workflows/arete'
include { POPPUNK } from './workflows/arete'


//
Expand All @@ -57,6 +58,11 @@ workflow annotation {
// Named workflow that runs only QUALITYCHECK (included from ./workflows/arete).
workflow assembly_qc {
QUALITYCHECK()
}

// Named workflow that runs only POPPUNK (included from ./workflows/arete).
// NOTE(review): presumably selected with `-entry poppunk` — confirm in the usage docs.
workflow poppunk {
POPPUNK()
}

/*
========================================================================================
RUN ALL WORKFLOWS
Expand Down
32 changes: 32 additions & 0 deletions modules/local/poppunk/extractdistances/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// Export the distances stored in a PopPUNK database directory to a TSV file
// (poppunk_db_distances.tsv) by calling poppunk_extract_distances.py.
process POPPUNK_EXTRACT_DISTANCES {
label 'process_high'

// Same PopPUNK 2.6.0 build for conda, Singularity and Docker
conda (params.enable_conda ? "bioconda::poppunk=2.6.0" : null)
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/poppunk:2.6.0--py39h9b916c0_0':
'quay.io/biocontainers/poppunk:2.6.0--py39h9b916c0_0' }"

input:

// PopPUNK database directory. The script block builds the distance prefix as
// "<dir>/<dir>.dists", so the files inside must be named after the directory.
path poppunk_db

output:

// TSV written by poppunk_extract_distances.py via --output
path "poppunk_db_distances.tsv", emit: poppunk_distances
// Tool version record written by the heredoc at the end of the script
path "versions.yml", emit: versions

script:
// Optional extra command-line arguments from task.ext.args (empty string when unset)
def args = task.ext.args ?: ''

// Run the extraction, then record the reported PopPUNK version in versions.yml
"""
poppunk_extract_distances.py \\
--distances $poppunk_db/${poppunk_db}.dists \\
--output poppunk_db_distances.tsv \\
$args
cat <<-END_VERSIONS > versions.yml
"${task.process}":
poppunk: \$(echo \$(poppunk --version 2>&1) | sed 's/^.*poppunk //;')
END_VERSIONS
"""
}
33 changes: 33 additions & 0 deletions modules/local/poppunk/extractdistances/meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
name: poppunk_extract_distances
description: Extract pairwise distances between genomes
keywords:
- genomes
- poppunk
- database
- distances
tools:
- poppunk:
description: Population partitioning using nucleotide k-mers
homepage: https://poppunk.net/
documentation: https://poppunk.readthedocs.io/en/latest/
tool_dev_url: https://github.com/bacpop/PopPUNK
doi: "doi:10.1101/gr.241455.118"
licence: ["Apache-2.0"]

input:
- poppunk_db:
type: directory
description: Directory containing the PopPUNK database

output:
- poppunk_distances:
type: file
description: TSV file containing pairwise distances
pattern: "poppunk_db_distances.tsv"
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"

authors:
- "@jvfe"
5 changes: 5 additions & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@ params {
poppunk_model = null
run_poppunk_qc = false

// Subsampling parameters
enable_subsetting = false
core_similarity = 99.9
accessory_similarity = 99

// Phylogenomics parameters
use_full_alignment = false
use_fasttree = true
Expand Down
17 changes: 17 additions & 0 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,23 @@
"type": "boolean",
"description": "Whether to run the QC step for PopPunk",
"fa_icon": "fas fa-industry"
},
"enable_subsetting": {
"type": "boolean",
"fa_icon": "fas fa-tasks",
"description": "Enable subsetting workflow based on genome similarity"
},
"core_similarity": {
"type": "number",
"default": 99.99,
"fa_icon": "fas fa-clone",
"description": "Similarity threshold for core genomes"
},
"accessory_similarity": {
"type": "number",
"default": 99,
"fa_icon": "far fa-clone",
"description": "Similarity threshold for accessory genes"
}
}
},
Expand Down
1 change: 1 addition & 0 deletions subworkflows/local/poppunk.nf
Original file line number Diff line number Diff line change
Expand Up @@ -51,4 +51,5 @@ workflow RUN_POPPUNK {
poppunk_version = POPPUNK_FITMODEL.out.versions.ifEmpty(null)
poppunk_results = poppunk_results
poppunk_visualisations = POPPUNK_VISUALISE.out.poppunk_visualizations
poppunk_db = poppunk_db
}
36 changes: 36 additions & 0 deletions subworkflows/local/subsample.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
include { POPPUNK_EXTRACT_DISTANCES } from '../../modules/local/poppunk/extractdistances/main'

//
// Remove near-identical genomes from a set of assemblies.
//
// Distances are extracted from the PopPUNK database; every row whose core AND
// accessory distances are both below the configured thresholds marks its
// `Query` genome for removal. The removed ids are written to
// <outdir>/poppunk_results/removed_genomes.txt and the remaining assemblies
// are emitted as `filtered_genomes`.
//
workflow SUBSET_GENOMES {

    take:
        genome_assemblies // channel: [ meta, assembly ]; meta.id is matched against the distance table
        poppunk_db        // channel: PopPUNK database directory

    main:

        POPPUNK_EXTRACT_DISTANCES(poppunk_db)
        POPPUNK_EXTRACT_DISTANCES.out.poppunk_distances.set{ poppunk_distances }

        // Similarity parameters are percentages; convert them to the maximum
        // distance (also in percent) below which two genomes count as duplicates.
        def core_threshold = 100.0 - params.core_similarity
        def accessory_threshold = 100.0 - params.accessory_similarity

        // Distances in the table are multiplied by 100 to compare them with the
        // percentage thresholds. A genome can be the query of several matching
        // rows, so de-duplicate the ids once here.
        poppunk_distances
            .splitCsv(header: true, sep: '\t')
            .filter { row -> (row.Core.toFloat() * 100) < core_threshold && (row.Accessory.toFloat() * 100) < accessory_threshold }
            .map { row -> row.Query }
            .unique()
            .set { genomes_to_remove }

        // One id per line; nothing is written when no genome is removed.
        genomes_to_remove
            .collectFile(name: 'removed_genomes.txt', newLine: true, storeDir: "${params.outdir}/poppunk_results")

        // toList() (unlike collect()) still emits an empty list when there is
        // nothing to remove. With collect() the right-hand channel would be
        // empty, combine would produce no tuples, and every assembly would be
        // silently dropped.
        genome_assemblies
            .combine (genomes_to_remove.toList().map { [it] })
            .filter { meta, path, to_remove -> !(meta.id in to_remove) }
            .map { it[0, 1] }
            .set { filtered_genomes }

    emit:
        filtered_genomes = filtered_genomes
}
Binary file added test/popdb/popdb.dists.npy
Binary file not shown.
Binary file added test/popdb/popdb.dists.pkl
Binary file not shown.
Binary file added test/popdb/popdb.h5
Binary file not shown.
44 changes: 44 additions & 0 deletions tests/subworkflows/local/subsample.nf.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
// nf-test for the SUBSET_GENOMES subworkflow: runs it on four assemblies plus the
// bundled test/popdb database and checks which assemblies are kept.
nextflow_workflow {

name "Test Workflow SUBSET_GENOMES"
script "subworkflows/local/subsample.nf"
workflow "SUBSET_GENOMES"

test("Subsampling subworkflow runs without failures") {

when {
params {
// Similarity thresholds used for this test (they differ from the pipeline defaults)
outdir = "$outputDir"
core_similarity = 99
accessory_similarity = 95
}
workflow {
"""
// define inputs of the workflow here. Example:
input[0] = Channel.of(
[[id:'SRR14022735'], "$baseDir/test/SRR14022735_T1.scaffolds.fa"],
[[id:'SRR14022737'], "$baseDir/test/SRR14022737_T1.scaffolds.fa"],
[[id:'SRR14022754'], "$baseDir/test/SRR14022754_T1.scaffolds.fa"],
[[id:'SRR14022764'], "$baseDir/test/SRR14022764_T1.scaffolds.fa"],
)
input[1] = Channel.fromPath("$baseDir/test/popdb")
"""
}
}

then {
assert workflow.success
// Of the four inputs, SRR14022737 is expected to be removed; the other
// three must come through unchanged and in input order.
assert workflow.out.filtered_genomes.size() == 3
assert workflow.out.filtered_genomes.toList() == [
[['id':'SRR14022735'], "$baseDir/test/SRR14022735_T1.scaffolds.fa"],
[['id':'SRR14022754'], "$baseDir/test/SRR14022754_T1.scaffolds.fa"],
[['id':'SRR14022764'], "$baseDir/test/SRR14022764_T1.scaffolds.fa"]
]


}

}

}
Loading

0 comments on commit af41029

Please sign in to comment.