Skip to content

Commit

Permalink
Add ability to use db cache
Browse files Browse the repository at this point in the history
  • Loading branch information
alexmanuele committed Jul 19, 2022
1 parent 6e9a1d5 commit 50193b3
Show file tree
Hide file tree
Showing 7 changed files with 219 additions and 32 deletions.
63 changes: 63 additions & 0 deletions modules/local/get_db_cache.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
// Import generic module functions
include { initOptions; saveFiles; getSoftwareName } from './functions'

params.options = [:]
options = initOptions(params.options)

//process for acquiring cached databases
process GET_DB_CACHE {
input:
path(dbcache)

output:
path "VFDB_setB_pro.fas.gz", emit: vfdb
path "CAZyDB.07312020.fa", emit: cazydb
path "BacMet2_predicted_database.fasta.gz", emit: bacmet
path "card.json", emit: card_json
path "card.version.txt", emit: card_version
path("""k2_standard_8gb_20201202"""), emit: minikraken

script:
"""
cp $dbcache/VFDB_setB_pro.fas.gz .
cp $dbcache/CAZyDB.07312020.fa .
cp $dbcache/BacMet2_predicted_database.fasta.gz .
cp $dbcache/card.json .
cp $dbcache/card.version.txt .
cp -r $dbcache/k2_standard_8gb_20201202 .
"""
}

// process GET_ANNOTATION_DB_CACHE {
// input:
// path(dbcache)

// output:
// path "VFDB_setB_pro.fas.gz", emit: vfdb
// path "CAZyDB.07312020.fa", emit: cazydb
// path "BacMet2_predicted_database.fasta.gz", emit: bacmet
// path "card.json", emit: card_json
// path "card.version.txt", emit: card_version

// script:
// """
// cp VFDB_setB_pro.fas.gz .
// cp CAZyDB.07312020.fa .
// cp BacMet2_predicted_database.fasta.gz .
// cp card.json .
// cp card.version.txt .
// """
// }

// process GET_ASSEMBLY_DB_CACHE {
// input:
// path(dbcache)

// output:
// path("""k2_standard_8gb_20201202"""), emit: minikraken

// script:
// """
// cp -r $dbcache/k2_standard_8gb_20201202 .
// """
// }
18 changes: 10 additions & 8 deletions modules/local/get_minikraken.nf
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,21 @@ params.options = [:]
options = initOptions(params.options)

process KRAKEN2_DB {
publishDir 'dbcache/', mode:'copy'
//publishDir 'dbcache/', mode:'copy'
tag "minikraken"
label 'process_high'

//output:
//path("""k2_standard_8gb_20201202"""), emit: minikraken
output:
path("""k2_standard_8gb_20201202"""), emit: minikraken

script:
// """
// curl https://genome-idx.s3.amazonaws.com/kraken/k2_standard_8gb_20201202.tar.gz --output k2_standard_8gb_20201202.tar.gz
// mkdir -p k2_standard_8gb_20201202
// tar xvf k2_standard_8gb_20201202.tar.gz -C k2_standard_8gb_20201202
// """
"""
curl https://genome-idx.s3.amazonaws.com/kraken/k2_standard_8gb_20201202.tar.gz --output k2_standard_8gb_20201202.tar.gz
mkdir -p k2_standard_8gb_20201202
tar xvf k2_standard_8gb_20201202.tar.gz -C k2_standard_8gb_20201202
"""
// stub:
/*
minikraken = file("./dbcache/k2_standard_8gb_20201202")
if (!minikraken.exists()){
println "FARRRRT"
Expand All @@ -33,6 +34,7 @@ process KRAKEN2_DB {
echo "database is cached"
"""
}
*/
// else{
// """
// ln -s ${minikraken} .
Expand Down
1 change: 1 addition & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ try {
System.err.println("WARNING: Could not load nf-core/config profiles: ${params.custom_config_base}/nfcore_custom.config")
}

singularity.cacheDir = "container_cache"

profiles {
debug { process.beforeScript = 'echo $HOSTNAME' }
Expand Down
84 changes: 69 additions & 15 deletions subworkflows/local/annotation.nf
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ include { RGI;
UPDATE_RGI_DB } from '../../modules/local/rgi' addParams( options: [:] )
include { MOB_RECON } from '../../modules/local/mobsuite' addParams( options: [:] )


// Usage pattern from nf-core/rnaseq: Empty dummy file for optional inputs
ch_dummy_input = file("$projectDir/assets/dummy_file.txt", checkIfExists: true)

Expand All @@ -44,6 +43,13 @@ workflow ANNOTATE_ASSEMBLIES {
take:
assemblies
bakta_db
vfdb_cache
cazydb_cache
bacmet_cache
card_json_cache
card_version_cache


main:

//if (params.input_sample_table){ ch_input = file(params.input_sample_table) } else { exit 1, 'Input samplesheet not specified!' }
Expand All @@ -55,14 +61,64 @@ workflow ANNOTATE_ASSEMBLIES {
//ANNOTATION_INPUT_CHECK(ch_input)

/*
* Module: Annotate AMR
* Load in the databases. Check if they were cached, otherwise run the processes that get them
*/
ch_cazy_db = Channel.empty()
ch_bacmet_db = Channel.empty()
ch_vfdb = Channel.empty()
ch_card_version = Channel.empty()
ch_card_json = Channel.empty()
ch_kraken_db = Channel.empty()

// Note: I hate how inelegant this block is. Works for now, but consider looking for a more elegant nextflow pattern
/*
* Load BLAST databases
*/
if (vfdb_cache){
ch_vfdb = ch_vfdb.mix(vfdb_cache)
}
else{
GET_VFDB()
ch_vfdb = ch_vfdb.mix(GET_VFDB.out.vfdb)
}
if(bacmet_cache){
ch_bacmet_db = ch_bacmet_db.mix(bacmet_cache)
}
else{
GET_BACMET()
ch_bacmet_db = ch_bacmet_db.mix(GET_BACMET.out.bacmet)
}
if (cazydb_cache){
ch_cazy_db = ch_cazy_db.mix(ch_cazy_db)
}
else{
GET_CAZYDB()
ch_cazy_db = ch_cazy_db.mix(GET_CAZYDB.out.cazydb)
}
/*
* Load RGI for AMR annotation
*/
if (card_json_cache){
ch_card_json = ch_card_json.mix(card_json_cache)
ch_software_versions = ch_software_versions.mix(card_version_cache)
}
else{
UPDATE_RGI_DB()
ch_card_json = ch_card_json.mix(UPDATE_RGI_DB.out.card_json)
ch_software_versions = ch_software_versions.mix(UPDATE_RGI_DB.out.card_version.ifEmpty(null))
}


/*
* Run RGI
*/
UPDATE_RGI_DB()
ch_software_versions = ch_software_versions.mix(UPDATE_RGI_DB.out.card_version.ifEmpty(null))
RGI(assemblies, UPDATE_RGI_DB.out.card_json)
RGI(assemblies, ch_card_json)
ch_software_versions = ch_software_versions.mix(RGI.out.version.first().ifEmpty(null))

//TODO prokka is in both annotation and assembly right now...

/*
* Run gene finding software (Prokka or Bakta)
*/
ch_ffn_files = Channel.empty()
ch_gff_files = Channel.empty()
if (bakta_db){
Expand All @@ -82,27 +138,25 @@ workflow ANNOTATE_ASSEMBLIES {
ch_gff_files = ch_gff_files.mix(PROKKA.out.gff)
}


/*
* Module: Mob-Suite
* Module: Mob-Suite. Database is included in singularity container
*/
MOB_RECON(assemblies)
ch_software_versions = ch_software_versions.mix(MOB_RECON.out.version.first().ifEmpty(null))



/*
* Module: BLAST vs CAZY, VFDB, Bacmet
* Run DIAMOND blast annotation with databases
*/
GET_CAZYDB()
GET_BACMET()
GET_VFDB()
DIAMOND_MAKE_CAZY(GET_CAZYDB.out.cazydb)
DIAMOND_MAKE_CAZY(ch_cazy_db)
ch_software_versions = ch_software_versions.mix(DIAMOND_MAKE_CAZY.out.versions.ifEmpty(null))
DIAMOND_BLAST_CAZY(ch_ffn_files, DIAMOND_MAKE_CAZY.out.db, "CAZYDB")

DIAMOND_MAKE_VFDB(GET_VFDB.out.vfdb)
DIAMOND_MAKE_VFDB(ch_vfdb)
DIAMOND_BLAST_VFDB(ch_ffn_files, DIAMOND_MAKE_VFDB.out.db, "VFDB")

DIAMOND_MAKE_BACMET(GET_BACMET.out.bacmet)
DIAMOND_MAKE_BACMET(ch_bacmet_db)
DIAMOND_BLAST_BACMET(ch_ffn_files, DIAMOND_MAKE_BACMET.out.db, "BACMET")

ch_multiqc_files = Channel.empty()
Expand Down
14 changes: 11 additions & 3 deletions subworkflows/local/assembly.nf
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ include { CHECKM_LINEAGEWF } from '../../modules/nf-core/modules/checkm/lineagew
include { GET_SOFTWARE_VERSIONS } from '../../modules/local/get_software_versions' addParams( options: [publish_files : ['tsv':'']] )
include { KRAKEN2_DB } from '../../modules/local/get_minikraken' addParams( options: [:] )


// Usage pattern from nf-core/rnaseq: Empty dummy file for optional inputs
ch_dummy_input = file("$projectDir/assets/dummy_file.txt", checkIfExists: true)

Expand All @@ -51,6 +50,7 @@ workflow ASSEMBLE_SHORTREADS{
reads
ch_reference_genome
use_reference_genome
krakendb_cache

main:
/////////////////// Read Processing /////////////////////////////
Expand All @@ -76,8 +76,16 @@ workflow ASSEMBLE_SHORTREADS{
///*
// * MODULE: Run Kraken2
// */
KRAKEN2_DB()
ch_kraken_db = Channel.fromPath("dbcache/k2_standard_8gb_20201202")
ch_kraken_db = Channel.empty()
if (krakendb_cache){
ch_kraken_db = ch_kraken_db.mix(krakendb_cache)
}
else{
KRAKEN2_DB()
ch_kraken_db = ch_kraken_db.mix(KRAKEN2_DB.minikraken)
}


//KRAKEN2_RUN(FASTP.out.reads, KRAKEN2_DB.out.minikraken)
KRAKEN2_RUN(FASTP.out.reads, ch_kraken_db)
ch_software_versions = ch_software_versions.mix(KRAKEN2_RUN.out.versions.first().ifEmpty(null))
Expand Down
2 changes: 1 addition & 1 deletion subworkflows/local/phylo.nf
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ workflow PHYLOGENOMICS{

emit:
phylo_software = ch_software_versions
core_alignment = PANAROO_RUN.out.aln
core_alignment = ch_core_gene_alignment
}

/*
Expand Down
69 changes: 64 additions & 5 deletions workflows/arete.nf
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ include { RGI;
UPDATE_RGI_DB } from '../modules/local/rgi' addParams( options: [:] )
include { MOB_RECON } from '../modules/local/mobsuite' addParams( options: [:] )
include { KRAKEN2_DB } from '../modules/local/get_minikraken' addParams( options: [:] )
include { GET_DB_CACHE } from '../modules/local/get_db_cache' addParams( options: [:] )


// Usage pattern from nf-core/rnaseq: Empty dummy file for optional inputs
Expand Down Expand Up @@ -120,6 +121,19 @@ workflow ARETE {
ch_bakta_db = false
}

//db_cache = params.db_cache ? params.db_cache: false
//ch_db_cache = Channel.empty()
// ch_assembly_db_cache = Channel.empty()
// ch_annotation_db_cache = Channel.empty()
// if (params.db_cache){
// ch_assembly_db_cache = GET_ASSEMBLY_DB_CACHE(file(params.db_cache))
// ch_annotation_db_cache = GET_ANNOTATION_DB_CACHE(file(params.db_cache))
// }
// else{
// ch_assembly_db_cache = false
// ch_annotation_db_cache = false
// }
db_cache = params.db_cache ? params.db_cache : false
use_roary = params.use_roary ? true : false
use_full_alignment = params.use_full_alignment ? true : false
use_fasttree = params.use_fasttree ? true: false
Expand All @@ -136,13 +150,58 @@ workflow ARETE {
*/
INPUT_CHECK(ch_input)

/////////////////// ASSEMBLY ///////////////////////////
ASSEMBLE_SHORTREADS(INPUT_CHECK.out.reads, ch_reference_genome, use_reference_genome)
ch_software_versions = ch_software_versions.mix(ASSEMBLE_SHORTREADS.out.assembly_software)
if(db_cache){
GET_DB_CACHE(db_cache)
/////////////////// ASSEMBLY ///////////////////////////
ASSEMBLE_SHORTREADS(
INPUT_CHECK.out.reads,
ch_reference_genome,
use_reference_genome,
GET_DB_CACHE.out.minikraken
)

/////////////////// ANNOTATION ///////////////////////////
ANNOTATE_ASSEMBLIES(
ASSEMBLE_SHORTREADS.out.scaffolds,
ch_bakta_db,
GET_DB_CACHE.out.vfdb,
GET_DB_CACHE.out.cazydb,
GET_DB_CACHE.out.bacmet,
GET_DB_CACHE.out.card_json,
GET_DB_CACHE.out.card_version
)

/////////////////// ANNOTATION ///////////////////////////
ANNOTATE_ASSEMBLIES(ASSEMBLE_SHORTREADS.out.scaffolds, ch_bakta_db)

}
else{
ASSEMBLE_SHORTREADS(
INPUT_CHECK.out.reads,
ch_reference_genome,
use_reference_genome,
[]
)

/////////////////// ANNOTATION ///////////////////////////
ANNOTATE_ASSEMBLIES(
ASSEMBLE_SHORTREADS.out.scaffolds,
ch_bakta_db,
[],
[],
[],
[],
[]
)
}
ch_software_versions = ch_software_versions.mix(ANNOTATE_ASSEMBLIES.out.annotation_software)
ch_software_versions = ch_software_versions.mix(ASSEMBLE_SHORTREADS.out.assembly_software)

// /////////////////// ASSEMBLY ///////////////////////////
// ASSEMBLE_SHORTREADS(INPUT_CHECK.out.reads, ch_reference_genome, use_reference_genome, db_cache)
// ch_software_versions = ch_software_versions.mix(ASSEMBLE_SHORTREADS.out.assembly_software)

// /////////////////// ANNOTATION ///////////////////////////
// ANNOTATE_ASSEMBLIES(ASSEMBLE_SHORTREADS.out.scaffolds, ch_bakta_db, db_cache)
// ch_software_versions = ch_software_versions.mix(ANNOTATE_ASSEMBLIES.out.annotation_software)

////////////////////////// PANGENOME /////////////////////////////////////
PHYLOGENOMICS(ANNOTATE_ASSEMBLIES.out.gff, use_roary, use_full_alignment, use_fasttree)
Expand Down

0 comments on commit 50193b3

Please sign in to comment.