From 74388b32c2ec8de0af4c862e0cac47dcce94de21 Mon Sep 17 00:00:00 2001 From: Steve Huang Date: Wed, 27 Dec 2023 14:03:42 -0500 Subject: [PATCH] Two new workflows: handle (ref-indepedent, alignment) a single piece of on-instrument demultiplexed CCS bam --- .dockstore.yml | 6 + .../ProcessOnInstrumentDemuxedChunk.wdl | 181 ++++++++++++++++++ ...ProcessOnInstrumentDemuxedChunkRefFree.wdl | 165 ++++++++++++++++ 3 files changed, 352 insertions(+) create mode 100644 wdl/pipelines/PacBio/Utility/ProcessOnInstrumentDemuxedChunk.wdl create mode 100644 wdl/pipelines/PacBio/Utility/ProcessOnInstrumentDemuxedChunkRefFree.wdl diff --git a/.dockstore.yml b/.dockstore.yml index 828b38b4d..ff5fb5ceb 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -81,6 +81,12 @@ workflows: - name: PBMASIsoSeqDemultiplex subclass: wdl primaryDescriptorPath: /wdl/pipelines/PacBio/Utility/PBMASIsoSeqDemultiplex.wdl +- name: ProcessOnInstrumentDemuxedChunkRefFree + subclass: wdl + primaryDescriptorPath: /wdl/pipelines/PacBio/Utility/ProcessOnInstrumentDemuxedChunkRefFree.wdl +- name: ProcessOnInstrumentDemuxedChunk + subclass: wdl + primaryDescriptorPath: /wdl/pipelines/PacBio/Utility/ProcessOnInstrumentDemuxedChunk.wdl ################################################### # TechAgnostic - *mics data processing diff --git a/wdl/pipelines/PacBio/Utility/ProcessOnInstrumentDemuxedChunk.wdl b/wdl/pipelines/PacBio/Utility/ProcessOnInstrumentDemuxedChunk.wdl new file mode 100644 index 000000000..5a706a095 --- /dev/null +++ b/wdl/pipelines/PacBio/Utility/ProcessOnInstrumentDemuxedChunk.wdl @@ -0,0 +1,181 @@ +version 1.0 + +import "../../../tasks/Utility/GeneralUtils.wdl" as GU + +import "../../../tasks/Alignment/AlignHiFiUBAM.wdl" as ALN + +import "../../TechAgnostic/Utility/AlignedBamQCandMetrics.wdl" as QCMetrics + +import "../../../tasks/Utility/Finalize.wdl" as FF + +workflow ProcessOnInstrumentDemuxedChunk { + + meta { + desciption: "Given an on-instrument demultiplexed hifi_reads.bam, perform alignment and QC check, and collect a few metrics." + } + + parameter_meta { + + ######### + # inputs + readgroup_id: + "Unique ID for a readgroup, for example (D|E)A[0-9]{6}-; no whitespaces allowed" + + bam_SM_field: + "value to place in the SM field of the resulting BAM header's @RG line" + + platform: + "PacBio platform used for generating the data; accepted value: [Sequel, Revio]" + + gcs_out_root_dir: + "output files will be copied over there" + + qc_metrics_config_json: + "A config json to for running the QC and metrics-collection sub-workflow 'AlignedBamQCandMetrics'" + + fingerprint_sample_id: + "For fingerprint verification: the ID of the sample supposedly this BAM belongs to; note that the fingerprint VCF is assumed to be located at {fingerprint_store}/{fingerprint_sample_id}*.vcf(.gz)?" + + expected_sex_type: + "If provided, triggers sex concordance check. Accepted value: [M, F, NA, na]" + + ######### + # outputs + wgs_cov: + "whole genome mean coverage" + + nanoplot_summ: + "Summary on alignment metrics provided by Nanoplot (todo: study the value of this output)" + + sam_flag_stats: + "SAM flag stats" + + fingerprint_check: + "Summary on (human) fingerprint checking results" + + contamination_est: + "cross-(human)individual contamination estimation by VerifyBAMID2" + + inferred_sex_info: + "Inferred sex concordance information if expected sex type is provided" + + methyl_tag_simple_stats: + "Simple stats on the reads with & without SAM methylation tags (MM/ML)." + + aBAM_metrics_files: + "A map where keys are summary-names and values are paths to files generated from the various QC/metrics tasks" + } + + input { + String gcs_out_root_dir + + File uBAM + File? uPBI + + String readgroup_id + String bam_SM_field + + String platform + + # args for optional QC subworkflows + File? qc_metrics_config_json + String? fingerprint_sample_id + String? expected_sex_type + + File ref_map_file + String disk_type = "SSD" + } + + output { + String last_processing_date = today.yyyy_mm_dd + + File aligned_bam = FinalizeAlignedBam.gcs_path + File aligned_bai = FinalizeAlignedBai.gcs_path + File aligned_pbi = FinalizeAlignedPbi.gcs_path + + String movie = AlignHiFiUBAM.movie + + # metrics block (caveat: always has to keep an eye on the QC subworkflow about outputs) + Float wgs_cov = QCandMetrics.wgs_cov + Map[String, Float] nanoplot_summ = QCandMetrics.nanoplot_summ + Map[String, Float] sam_flag_stats = QCandMetrics.sam_flag_stats + + # fingerprint + Map[String, String]? fingerprint_check = QCandMetrics.fingerprint_check + + # contam + Float? contamination_est = QCandMetrics.contamination_est + + # sex concordance + Map[String, String]? inferred_sex_info = QCandMetrics.inferred_sex_info + + # methyl + Map[String, String]? methyl_tag_simple_stats = QCandMetrics.methyl_tag_simple_stats + + # file-based QC/metrics outputs all packed into a finalization map + Map[String, String] aBAM_metrics_files = QCandMetrics.aBAM_metrics_files + } + + ################################################################################### + # prep work + + # where to store final results + String workflow_name = "ProcessOnInstrumentDemuxedChunk" + String outdir = sub(gcs_out_root_dir, "/$", "") + "/" + workflow_name + + # String bc_specific_out = outdir + '/' + readgroup_id + String bc_specific_aln_out = outdir + '/alignments/' + readgroup_id + String bc_specific_metrics_out = outdir + "/metrics/" + readgroup_id + + ################################################################################### + # align + call ALN.AlignHiFiUBAM as AlignHiFiUBAM { input: + uBAM = uBAM, + uPBI = uPBI, + bam_sample_name = bam_SM_field, + ref_map_file = ref_map_file, + application = 'HIFI' + } + + File aBAM = AlignHiFiUBAM.aligned_bam + File aBAI = AlignHiFiUBAM.aligned_bai + File aPBI = AlignHiFiUBAM.aligned_pbi + + # save + call FF.FinalizeToFile as FinalizeAlignedBam { input: outdir = bc_specific_aln_out, file = aBAM, name = readgroup_id + '.bam' } + call FF.FinalizeToFile as FinalizeAlignedBai { input: outdir = bc_specific_aln_out, file = aBAI, name = readgroup_id + '.bai' } + call FF.FinalizeToFile as FinalizeAlignedPbi { input: outdir = bc_specific_aln_out, file = aPBI, name = readgroup_id + '.pbi' } + + ################################################################################### + # QC + AlignedBamQCnMetricsConfig qcm_config = read_json(select_first([qc_metrics_config_json])) + call QCMetrics.Work as QCandMetrics { input: + bam = aBAM, + bai = aBAI, + + tech = platform, + + cov_bed = qcm_config.cov_bed, + cov_bed_descriptor = qcm_config.cov_bed_descriptor, + + fingerprint_vcf_store = qcm_config.fingerprint_vcf_store, + fingerprint_sample_id = fingerprint_sample_id, + + expected_sex_type = expected_sex_type, + + vbid2_config_json = qcm_config.vbid2_config_json, + + methyl_tag_check_bam_descriptor = qcm_config.methyl_tag_check_bam_descriptor, + save_methyl_uncalled_reads = qcm_config.save_methyl_uncalled_reads, + + ref_map_file = ref_map_file, + disk_type = disk_type, + + output_prefix = readgroup_id, # String output_prefix = bam_sample_name + "." + flowcell + gcs_out_root_dir = bc_specific_metrics_out, + } + + ################################################################################### + + call GU.GetTodayDate as today {} +} diff --git a/wdl/pipelines/PacBio/Utility/ProcessOnInstrumentDemuxedChunkRefFree.wdl b/wdl/pipelines/PacBio/Utility/ProcessOnInstrumentDemuxedChunkRefFree.wdl new file mode 100644 index 000000000..6fc022fc9 --- /dev/null +++ b/wdl/pipelines/PacBio/Utility/ProcessOnInstrumentDemuxedChunkRefFree.wdl @@ -0,0 +1,165 @@ +version 1.0 + + +import "../../../tasks/Utility/BAMutils.wdl" as BU +import "../../../tasks/Utility/GeneralUtils.wdl" as GU + +import "../../../tasks/Visualization/NanoPlot.wdl" as QC0 +import "../../TechAgnostic/Utility/CountTheBeans.wdl" as QC1 +import "../../TechAgnostic/Utility/DystPeaker.wdl" as QC2 +import "../../TechAgnostic/Utility/FASTQstats.wdl" as QC3 + +import "../../../tasks/Utility/Finalize.wdl" as FF +import "../../TechAgnostic/Utility/SaveFilesToDestination.wdl" as SAVE + +workflow ProcessOnInstrumentDemuxedChunkRefFree { + + meta { + desciption: "Given an on-instrument demultiplexed hifi_reads.bam, perform ref-independent prep work, and collect a few metrics" + } + + parameter_meta { + readgroup_id: + "Unique ID for a readgroup, for example (D|E)A[0-9]{6}-" + + short_reads_threshold: + "a length threshold below which reads are classified as short" + + bam_descriptor: + "a one-word description of the purpose of the BAM (e.g. 'a_particular_seq_center'; used for saving the reads that don't have any MM/ML tags; doesn't need to be single-file specific)" + + run_nanoplot: + "if true, will kick off Nanoplot to collect metrics on the uBAM; this isn't necessary if you also run the alignment version to process the uBAM as Nanoplot is run there automatically and produces a superset of metrics" + + nanoplot_u_summ: + "A few metrics output by Nanoplot on the uBAM" + seqkit_stats: + "A few metrics output by seqkit stats" + + read_len_summaries: + "A few metrics summarizing the read length distribution" + read_len_peaks: + "Peaks of the read length distribution (heruistic)" + read_len_deciles: + "Deciles of the read length distribution" + + methyl_tag_simple_stats: + "Simple stats on the reads with & without SAM methylation tags (MM/ML)." + + uBAM_metrics_files: + "A map where keys are summary-names and values are paths to files generated from the various QC/metrics tasks" + } + + input { + File uBAM + String readgroup_id + + String bam_descriptor + Int short_reads_threshold + + Boolean run_nanoplot = false + String disk_type = "SSD" + + String gcs_out_root_dir + } + + output { + String movie = movie_name + + File hifi_fq = FinalizeFQ.gcs_path + + String last_processing_date = today.yyyy_mm_dd + + # todo merge these two together + Map[String, Float] seqkit_stats = FASTQstats.stats + Map[String, Float]? nanoplot_u_summ = NanoPlotFromUBam.stats_map + + Map[String, String] read_len_summaries = DystPeaker.read_len_summaries + Array[Int] read_len_peaks = DystPeaker.read_len_peaks + Array[Int] read_len_deciles = DystPeaker.read_len_deciles + + # methylation call rate stats + Map[String, String] methyl_tag_simple_stats = NoMissingBeans.methyl_tag_simple_stats + + # file-based outputs all packed into a finalization map + Map[String, String] uBAM_metrics_files = files_out + } + + ################################################################################### + # prep work + + # where to store final results + String workflow_name = "ProcessOnInstrumentDemuxedChunkRefFree" + String outdir = sub(gcs_out_root_dir, "/$", "") + "/" + workflow_name + + String outdir_ref_free = outdir + '/RefFree' # ideally, this isn't necessary, but it's a relic we'd not touch right now (2023-12-23) + String bc_specific_out = outdir_ref_free + '/' + readgroup_id + String bc_specific_metrics_out = bc_specific_out + "/metrics" + + ################################################################################### + call BU.GetReadGroupInfo as RG {input: bam = uBAM, keys = ['PU']} + String movie_name = RG.read_group_info['PU'] + + ################################################################################### + # convert to FASTQ (most for assembly) + call BU.BamToFastq {input: bam = uBAM, prefix = "does_not_matter"} + call FF.FinalizeToFile as FinalizeFQ { input: + outdir = bc_specific_out, + file = BamToFastq.reads_fq, + name = readgroup_id + '.hifi.fq.gz' + } + + ################################################################################### + # more QCs and metrics + + ########## + if (run_nanoplot) { + call QC0.NanoPlotFromUBam {input: uBAM = uBAM} + FinalizationManifestLine a = object + {files_to_save: flatten([[NanoPlotFromUBam.stats], NanoPlotFromUBam.plots]), + is_singleton_file: false, + destination: bc_specific_metrics_out + "/nanoplot", + output_attribute_name: "nanoplot"} + } + ########## + call QC1.CountTheBeans as NoMissingBeans { input: + bam=uBAM, + bam_descriptor=bam_descriptor, + gcs_out_root_dir=bc_specific_metrics_out, + use_local_ssd=disk_type=='LOCAL' + } + Map[String, String] methyl_out = {"missing_methyl_tag_reads": + NoMissingBeans.methyl_tag_simple_stats['files_holding_reads_without_tags']} + ########## + call QC2.DystPeaker { input: + input_file=uBAM, input_is_bam=true, + id=readgroup_id, short_reads_threshold=short_reads_threshold, + gcs_out_root_dir=bc_specific_metrics_out + } + Map[String, String] read_len_out = {"read_len_hist": DystPeaker.read_len_hist, + "raw_rl_file": DystPeaker.read_len_summaries['raw_rl_file']} + + ########## + call QC3.FASTQstats { input: + reads=BamToFastq.reads_fq, + file_type='FASTQ' + } + + ################################################################################### + if (run_nanoplot) { + call SAVE.SaveFilestoDestination as SaveRest { input: + instructions = select_all([a]), + already_finalized = select_all([methyl_out, + read_len_out]) + } + } + if (!run_nanoplot) { + call GU.MergeMaps { input: + one = methyl_out, + two = read_len_out, + } + } + Map[String, String] files_out = select_first([SaveRest.result, MergeMaps.merged]) + + call GU.GetTodayDate as today {} +}