diff --git a/.dockstore.yml b/.dockstore.yml index 0c9012485..059762af5 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -169,6 +169,11 @@ workflows: primaryDescriptorPath: /pipes/WDL/workflows/fetch_sra_to_bam.wdl testParameterFiles: - /empty.json + - name: fetch_multiple_sra_to_bams + subclass: WDL + primaryDescriptorPath: /pipes/WDL/workflows/fetch_multiple_sra_to_bams.wdl + testParameterFiles: + - /empty.json - name: filter_classified_bam_to_taxa subclass: WDL primaryDescriptorPath: /pipes/WDL/workflows/filter_classified_bam_to_taxa.wdl diff --git a/pipes/WDL/tasks/tasks_ncbi_tools.wdl b/pipes/WDL/tasks/tasks_ncbi_tools.wdl index 28fcf64bd..3d459ea9e 100644 --- a/pipes/WDL/tasks/tasks_ncbi_tools.wdl +++ b/pipes/WDL/tasks/tasks_ncbi_tools.wdl @@ -9,7 +9,7 @@ task Fetch_SRA_to_BAM { Int? machine_mem_gb String docker = "quay.io/broadinstitute/ncbi-tools:2.10.7.10" } - Int disk_size = 750 + Int disk_size = 6000 meta { description: "This searches NCBI SRA for accessions using the Entrez interface, collects associated metadata, and returns read sets as unaligned BAM files with metadata loaded in. Useful metadata from BioSample is also output from this task directly. This has been tested with both SRA and ENA accessions. This queries the NCBI production database, and as such, the output of this task is non-deterministic given the same input." 
volatile: true @@ -26,7 +26,8 @@ task Fetch_SRA_to_BAM { MODEL=$(jq -r ".EXPERIMENT_PACKAGE_SET.EXPERIMENT_PACKAGE.EXPERIMENT.PLATFORM.$PLATFORM.INSTRUMENT_MODEL" SRA.json) SAMPLE=$(jq -r '.EXPERIMENT_PACKAGE_SET.EXPERIMENT_PACKAGE.SAMPLE.IDENTIFIERS.EXTERNAL_ID|select(.namespace == "BioSample")|.content' SRA.json) LIBRARY=$(jq -r .EXPERIMENT_PACKAGE_SET.EXPERIMENT_PACKAGE.EXPERIMENT.alias SRA.json) - RUNDATE=$(jq -r '.EXPERIMENT_PACKAGE_SET.EXPERIMENT_PACKAGE.RUN_SET.RUN.SRAFiles|if (.SRAFile|type) == "object" then .SRAFile.date else [.SRAFile[]|select(.supertype == "Original")][0].date end' SRA.json | cut -f 1 -d ' ') + # if there are multiple runs, select the one matching the SRA accession specified in the task input + RUNDATE=$(jq -r '(.EXPERIMENT_PACKAGE_SET.EXPERIMENT_PACKAGE.RUN_SET | (if (.RUN|type) == "object" then (.RUN) else (.RUN[] | select(any(.; .accession == "~{SRA_ID}"))) end) | .SRAFiles) | if (.SRAFile|type) == "object" then .SRAFile.date else [.SRAFile[]|select(.supertype == "Original" or .supertype=="Primary ETL")][0].date end' SRA.json | cut -f 1 -d ' ') if [[ -n "~{sample_name}" ]]; then SAMPLE="~{sample_name}" diff --git a/pipes/WDL/workflows/fetch_multiple_sra_to_bams.wdl b/pipes/WDL/workflows/fetch_multiple_sra_to_bams.wdl new file mode 100644 index 000000000..4c253a2e2 --- /dev/null +++ b/pipes/WDL/workflows/fetch_multiple_sra_to_bams.wdl @@ -0,0 +1,103 @@ +version 1.0 + +import "../tasks/tasks_ncbi_tools.wdl" as ncbi_tools +import "../tasks/tasks_utils.wdl" as utils + +workflow fetch_multiple_sra_to_bams { + meta { + description: "Retrieve reads for multiple SRA run IDs from the NCBI Short Read Archive in unaligned BAM format (multiple bam files) with relevant metadata encoded." + author: "Broad Viral Genomics" + email: "viral-ngs@broadinstitute.org" + allowNestedInputs: true + } + + input { + Array[String]+ SRA_IDs + } + + parameter_meta { + SRA_IDs: { + description: "SRA run accessions (ex. 
*RR#######), NOT SRA study or sample accessions." + } + } + + scatter(sra_id in SRA_IDs) { + call ncbi_tools.Fetch_SRA_to_BAM as scattered_fetch_sra_to_bam { + input:SRA_ID = sra_id + } + + Map[String,String] sra_outputs_map = { + "reads_ubam": scattered_fetch_sra_to_bam.reads_ubam, + "sequencing_center": scattered_fetch_sra_to_bam.sequencing_center, + "sequencing_platform": scattered_fetch_sra_to_bam.sequencing_platform, + "sequencing_platform_model": scattered_fetch_sra_to_bam.sequencing_platform_model, + "biosample_accession": scattered_fetch_sra_to_bam.biosample_accession, + "library_id": scattered_fetch_sra_to_bam.library_id, + "run_date": scattered_fetch_sra_to_bam.run_date, + "sample_collection_date": scattered_fetch_sra_to_bam.sample_collection_date, + "sample_collected_by": scattered_fetch_sra_to_bam.sample_collected_by, + "sample_strain": scattered_fetch_sra_to_bam.sample_strain, + "sample_geo_loc": scattered_fetch_sra_to_bam.sample_geo_loc, + "sra_metadata": scattered_fetch_sra_to_bam.sra_metadata + } + + Array[String] metadata_for_accession = [ + sra_id, + scattered_fetch_sra_to_bam.reads_ubam, + scattered_fetch_sra_to_bam.sequencing_center, + scattered_fetch_sra_to_bam.sequencing_platform, + scattered_fetch_sra_to_bam.sequencing_platform_model, + scattered_fetch_sra_to_bam.biosample_accession, + scattered_fetch_sra_to_bam.library_id, + scattered_fetch_sra_to_bam.run_date, + scattered_fetch_sra_to_bam.sample_collection_date, + scattered_fetch_sra_to_bam.sample_collected_by, + scattered_fetch_sra_to_bam.sample_strain, + scattered_fetch_sra_to_bam.sample_geo_loc, + scattered_fetch_sra_to_bam.sra_metadata + ] + + String sra_accession = sra_id # declared inside the scatter so the gathered array of accessions can be zipped with sra_outputs_map below + } + + # for each accession, build a single-entry map from the input SRA_ID to that accession's map of output name -> value + scatter(paired_metadata in zip(sra_accession, sra_outputs_map)){ + Map[String,Map[String,String]] combined_output_map = { + paired_metadata.left: paired_metadata.right + } + } + + Array[String] metadata_header = [ 
"sra_run_accession", + "reads_ubam", + "sequencing_center", + "sequencing_platform", + "sequencing_platform_model", + "biosample_accession", + "library_id", + "run_date", + "sample_collection_date", + "sample_collected_by", + "sample_strain", + "sample_geo_loc", + "sra_metadata" + ] # column order matches the element order of metadata_for_accession + + # WDL >=1.1 alternative that joins all specified IDs: String input_ids_string = sep('_',SRA_IDs) + String input_ids_string = flatten([SRA_IDs])[0] # WDL 1.0: only the first ID is used, to name the combined metadata file + + call utils.concatenate as combined_metadata { + input: + # metadata_for_accession is declared inside the scatter, so here it is the gathered Array[Array[String]] (one row per accession); the header row is written to its own TSV and listed ahead of the data rows + infiles = [write_tsv([metadata_header]), write_tsv(metadata_for_accession)], + output_name = "run_metadata-${input_ids_string}.tsv" + } + + output { + # one reads_ubam file per requested SRA ID, gathered from the scattered fetch + Array[File] read_bams = scattered_fetch_sra_to_bam.reads_ubam + + Array[ Map[ String, Map[String,String] ] ] collected_sra_metadata = combined_output_map + File collected_sra_metadata_tsv = combined_metadata.combined + } +}