Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Standardize wdl description #392

Merged
merged 8 commits into from
Apr 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 18 additions & 21 deletions pipelines/ONT/Assembly/ONTAssembleWithCanu.wdl
Original file line number Diff line number Diff line change
@@ -1,11 +1,5 @@
version 1.0

######################################################################################
## A workflow that performs single sample genome assembly on ONT reads from one or
## more flow cells. The workflow merges multiple samples into a single BAM prior to
## genome assembly and variant calling.
######################################################################################

import "../../../tasks/Utility/Utils.wdl" as Utils
import "../../../tasks/Assembly/Canu.wdl" as Canu
import "../../../tasks/Preprocessing/Medaka.wdl" as Medaka
Expand All @@ -14,22 +8,9 @@ import "../../../tasks/QC/Quast.wdl" as Quast
import "../../../tasks/Utility/Finalize.wdl" as FF

workflow ONTAssembleWithCanu {
input {
String gcs_fastq_dir

File ref_map_file

Float correct_error_rate = 0.15
Float trim_error_rate = 0.15
Float assemble_error_rate = 0.15
String medaka_model = "r941_prom_high_g360"

String participant_name
String prefix

String gcs_out_root_dir
meta {
description: "A workflow that performs single sample genome assembly on ONT reads from one or more flow cells. The workflow merges multiple samples into a single BAM prior to genome assembly and variant calling."
}

parameter_meta {
gcs_fastq_dir: "GCS path to unaligned CCS BAM files"

Expand All @@ -46,6 +27,22 @@ workflow ONTAssembleWithCanu {
gcs_out_root_dir: "GCS bucket to store the reads, variants, and metrics files"
}

input {
String gcs_fastq_dir

File ref_map_file

Float correct_error_rate = 0.15
Float trim_error_rate = 0.15
Float assemble_error_rate = 0.15
String medaka_model = "r941_prom_high_g360"

String participant_name
String prefix

String gcs_out_root_dir
}

Map[String, String] ref_map = read_map(ref_map_file)

String outdir = sub(gcs_out_root_dir, "/$", "") + "/ONTAssembleWithCanu/~{prefix}"
Expand Down
34 changes: 16 additions & 18 deletions pipelines/ONT/Assembly/ONTAssembleWithFlye.wdl
Original file line number Diff line number Diff line change
@@ -1,11 +1,5 @@
version 1.0

######################################################################################
## A workflow that performs single sample genome assembly on ONT reads from one or
## more flow cells. The workflow merges multiple samples into a single BAM prior to
## genome assembly and variant calling.
######################################################################################

import "../../../tasks/Utility/Utils.wdl" as Utils
import "../../../tasks/Assembly/Flye.wdl" as Flye
import "../../../tasks/Preprocessing/Medaka.wdl" as Medaka
Expand All @@ -14,19 +8,9 @@ import "../../../tasks/QC/Quast.wdl" as Quast
import "../../../tasks/Utility/Finalize.wdl" as FF

workflow ONTAssembleWithFlye {
input {
String gcs_fastq_dir

File ref_map_file

String medaka_model = "r941_prom_high_g360"

String participant_name
String prefix

String gcs_out_root_dir
meta {
description: "Perform single sample genome assembly on ONT reads from one or more flow cells. The workflow merges multiple samples into a single BAM prior to genome assembly and variant calling."
}

parameter_meta {
gcs_fastq_dir: "GCS path to unaligned CCS BAM files"

Expand All @@ -40,6 +24,20 @@ workflow ONTAssembleWithFlye {
gcs_out_root_dir: "GCS bucket to store the reads, variants, and metrics files"
}

input {
String gcs_fastq_dir

File ref_map_file

String medaka_model = "r941_prom_high_g360"

String participant_name
String prefix

String gcs_out_root_dir
}


Map[String, String] ref_map = read_map(ref_map_file)

String outdir = sub(gcs_out_root_dir, "/$", "") + "/ONTAssembleWithFlye/~{prefix}"
Expand Down
14 changes: 14 additions & 0 deletions pipelines/ONT/Epigenomics/ONTMethylation.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,20 @@ import "../../../tasks/Preprocessing/Guppy.wdl" as Guppy
import "../../../tasks/Utility/Finalize.wdl" as FF

workflow ONTMethylation {

meta {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you!

description: "ONT Methylation pipeline"
}
parameter_meta {
gcs_fast5_dir: "GCS directory containing fast5 files"
ref_map_file: "Reference map file"
variants: "VCF file containing variants"
variants_tbi: "Tabix index for VCF file"
participant_name: "Participant name"
prefix: "Prefix for output files"
gcs_out_root_dir: "GCS directory to write output files"
}

input {
String gcs_fast5_dir

Expand Down
9 changes: 9 additions & 0 deletions pipelines/ONT/MultiAnalysis/ONTPfHrp2Hrp3Status.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,15 @@ import "../../../structs/Structs.wdl"
import "../../../tasks/Utility/Finalize.wdl" as FF

workflow ONTPfHrp2Hrp3Status {

meta {
description: "Determine if HRP2 and HRP3 are deleted in a sample"
}
parameter_meta {
bam: "BAM file"
bai: "BAM index file"
}

input {
File bam
File bai
Expand Down
10 changes: 10 additions & 0 deletions pipelines/ONT/MultiAnalysis/ONTPfTypeDrugResistanceMarkers.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,16 @@ import "../../../structs/Structs.wdl"
import "../../../tasks/Utility/Finalize.wdl" as FF

workflow ONTPfTypeDrugResistanceMarkers {

meta {
description: "Workflow to generate a report of drug resistance markers"
}
parameter_meta {
vcf: "VCF file to process"
dir_prefix: "Prefix for output directory"
gcs_out_root_dir: "GCS output root directory"
}

input {
File vcf

Expand Down
12 changes: 12 additions & 0 deletions pipelines/ONT/Preprocessing/ONTBasecall.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,18 @@ import "../../../tasks/Preprocessing/Guppy.wdl" as Guppy
import "../../../tasks/Utility/Finalize.wdl" as FF

workflow ONTBasecall {

meta {
description: "Basecall ONT reads"
}
parameter_meta {
gcs_fast5_dir: "GCS path to the directory containing fast5 files"
config: "Guppy config file"
barcode_kit: "Guppy barcode kit"
gcs_out_root_dir: "GCS path to the root directory for output"
prefix: "Prefix for output directory"
}

input {
String gcs_fast5_dir
String config = "dna_r10.4.1_e8.2_400bps_sup.cfg"
Expand Down
34 changes: 19 additions & 15 deletions pipelines/ONT/Preprocessing/ONTFlowcell.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -8,23 +8,10 @@ import "../../../tasks/Visualization/NanoPlot.wdl" as NP
import "../../../tasks/Utility/Finalize.wdl" as FF

workflow ONTFlowcell {
input {
File? final_summary
File? sequencing_summary
String? fastq_dir

File ref_map_file

String SM
String ID

Int num_shards = 300
String experiment_type
String dir_prefix

String gcs_out_root_dir
meta {
description: "Align ONT reads to a reference genome"
}

parameter_meta {
final_summary: "GCS path to '*final_summary*.txt*' file for basecalled fastq files"
sequencing_summary: "GCS path to '*sequencing_summary*.txt*' file for basecalled fastq files"
Expand All @@ -42,6 +29,23 @@ workflow ONTFlowcell {
gcs_out_root_dir: "GCS bucket to store the reads, variants, and metrics files"
}

input {
File? final_summary
File? sequencing_summary
String? fastq_dir

File ref_map_file

String SM
String ID

Int num_shards = 300
String experiment_type
String dir_prefix

String gcs_out_root_dir
}

Map[String, String] ref_map = read_map(ref_map_file)
Map[String, String] map_presets = {
'DNA': 'map-ont',
Expand Down
52 changes: 25 additions & 27 deletions pipelines/ONT/VariantCalling/ONTWholeGenome.wdl
Original file line number Diff line number Diff line change
@@ -1,11 +1,5 @@
version 1.0

######################################################################################
## A workflow that performs single sample variant calling on Oxford Nanopore reads
## from one or more flow cells. The workflow merges multiple samples into a single BAM
## prior to variant calling.
######################################################################################

import "../../../tasks/Utility/ONTUtils.wdl" as ONT
import "../../../tasks/Utility/Utils.wdl" as Utils
import "../../../tasks/VariantCalling/CallVariantsONT.wdl" as VAR
Expand All @@ -14,6 +8,31 @@ import "../../../tasks/Utility/Finalize.wdl" as FF
import "../../../tasks/QC/SampleLevelAlignedMetrics.wdl" as COV

workflow ONTWholeGenome {

meta {
description: "A workflow that performs single sample variant calling on Oxford Nanopore reads from one or more flow cells. The workflow merges multiple flowcells into a single BAM prior to variant calling."
}
parameter_meta {
aligned_bams: "GCS path to aligned BAM files"
aligned_bais: "GCS path to aligned BAM file indices"
participant_name: "name of the participant from whom these samples were obtained"

ref_map_file: "table indicating reference sequence and auxiliary file locations"
gcs_out_root_dir: "GCS bucket to store the reads, variants, and metrics files"

call_svs: "whether to call SVs"
fast_less_sensitive_sv: "to trade less sensitive SV calling for faster speed"

call_small_variants: "whether to call small variants"
call_small_vars_on_mitochondria: "if false, will not attempt to call variants on mitochondria; if true, some samples might fail (caller feature) due to lack of signal"
sites_vcf: "for use with Clair"
sites_vcf_tbi: "for use with Clair"

run_dv_pepper_analysis: "to turn on DV-Pepper analysis or not (non-trivial increase in cost and runtime)"
ref_scatter_interval_list_locator: "A file holding paths to interval_list files; needed only when running DV-Pepper"
ref_scatter_interval_list_ids: "A file that gives short IDs to the interval_list files; needed only when running DV-Pepper"
}

input {
Array[File] aligned_bams
Array[File] aligned_bais
Expand Down Expand Up @@ -42,27 +61,6 @@ workflow ONTWholeGenome {
File? ref_scatter_interval_list_ids
}

parameter_meta {
aligned_bams: "GCS path to aligned BAM files"
aligned_bais: "GCS path to aligned BAM file indices"
participant_name: "name of the participant from whom these samples were obtained"

ref_map_file: "table indicating reference sequence and auxiliary file locations"
gcs_out_root_dir: "GCS bucket to store the reads, variants, and metrics files"

call_svs: "whether to call SVs"
fast_less_sensitive_sv: "to trade less sensitive SV calling for faster speed"

call_small_variants: "whether to call small variants"
call_small_vars_on_mitochondria: "if false, will not attempt to call variants on mitochondria; if true, some samples might fail (caller feature) due to lack of signal"
sites_vcf: "for use with Clair"
sites_vcf_tbi: "for use with Clair"

run_dv_pepper_analysis: "to turn on DV-Pepper analysis or not (non-trivial increase in cost and runtime)"
ref_scatter_interval_list_locator: "A file holding paths to interval_list files; needed only when running DV-Pepper"
ref_scatter_interval_list_ids: "A file that gives short IDs to the interval_list files; needed only when running DV-Pepper"
}

Map[String, String] ref_map = read_map(ref_map_file)

String outdir = sub(gcs_out_root_dir, "/$", "") + "/ONTWholeGenome/~{participant_name}"
Expand Down
51 changes: 24 additions & 27 deletions pipelines/PacBio/Alignment/PBFlowcell.wdl
Original file line number Diff line number Diff line change
@@ -1,12 +1,5 @@
version 1.0

##########################################################################################
## A workflow that performs CCS correction on PacBio HiFi reads from a single flow cell.
## The workflow shards the subreads into clusters and performs CCS in parallel on each cluster.
## Ultimately, all the corrected reads (and uncorrected) are gathered into a single BAM.
## Various metrics are produced along the way.
##########################################################################################

import "../../../tasks/Utility/PBUtils.wdl" as PB
import "../../../tasks/Alignment/AlignReads.wdl" as AR
import "../../../tasks/Utility/Utils.wdl" as Utils
Expand All @@ -20,6 +13,30 @@ import "../../../tasks/Transcriptomics/MASSeq.wdl" as MAS
import "../../../tasks/Utility/JupyterNotebooks.wdl" as JUPYTER

workflow PBFlowcell {

meta {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jonn-smith can you please add your description of the MASseq part?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For the genome side, here's my proposed description:

The workflow performs the alignment of an SMRT cell's worth of data to a reference.
For genomic sequencing data, the workflow also optionally performs CCS correction if the data is from a CCS library but did not get corrected on-instrument.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Long term, we should break this workflow apart and update it to match the outputs of Revio, which is expected to be the main working machine down the road.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@SHuang-Broad how's this:

For MAS-seq transcriptome data, this workflow will determine the most likely MAS-seq model, then it will use that model to annotate, segment, and filter the CCS reads. These CCS reads will then be aligned to the reference in transcriptome alignment mode. Note: Currently the MAS-seq workflow separates CLR reads, but does not process them.

description: "The workflow performs the alignment of an SMRT cell's worth of data to a reference. For genomic sequencing data, the workflow also optionally performs CCS correction if the data is from a CCS library but did not get corrected on-instrument. For MAS-seq transcriptome data, this workflow will determine the most likely MAS-seq model, then it will use that model to annotate, segment, and filter the CCS reads. These CCS reads will then be aligned to the reference in transcriptome alignment mode. Note: Currently the MAS-seq workflow separates CLR reads, but does not process them."
}
parameter_meta {
bam: "GCS path to raw subread bam"
ccs_report_txt: "GCS path to CCS report txt, required if on-instrument corrected, otherwise CCS is run in this workflow for CCS libraries"
pbi: "GCS path to pbi index for raw subread bam"
ref_map_file: "table indicating reference sequence and auxiliary file locations"

SM: "the value to place in the BAM read group's SM field"
LB: "the value to place in the BAM read group's LB (library) field"

num_shards: "number of shards into which fastq files should be batched"
experiment_type: "type of experiment run (CLR, CCS, ISOSEQ, MASSEQ)"
dir_prefix: "directory prefix for output files"

mas_seq_model: "Longbow model to use for MAS-seq data."

DEBUG_MODE: "[default valued] enables debugging tasks / subworkflows (default: false)"

gcs_out_root_dir: "GCS bucket to store the reads, variants, and metrics files"
}

input {
File bam
File pbi
Expand All @@ -45,26 +62,6 @@ workflow PBFlowcell {
Boolean DEBUG_MODE = false
}

parameter_meta {
bam: "GCS path to raw subread bam"
ccs_report_txt: "GCS path to CCS report txt, required if on-instrument corrected, otherwise CCS is run in this workflow for CCS libraries"
pbi: "GCS path to pbi index for raw subread bam"
ref_map_file: "table indicating reference sequence and auxiliary file locations"

SM: "the value to place in the BAM read group's SM field"
LB: "the value to place in the BAM read group's LB (library) field"

num_shards: "number of shards into which fastq files should be batched"
experiment_type: "type of experiment run (CLR, CCS, ISOSEQ, MASSEQ)"
dir_prefix: "directory prefix for output files"

mas_seq_model: "Longbow model to use for MAS-seq data."

DEBUG_MODE: "[default valued] enables debugging tasks / subworkflows (default: false)"

gcs_out_root_dir: "GCS bucket to store the reads, variants, and metrics files"
}

# Call our timestamp so we can store outputs without clobbering previous runs:
call Utils.GetCurrentTimestampString as WdlExecutionStartTimestamp { input: }

Expand Down
Loading