Skip to content

Commit

Permalink
Two new workflows:
Browse files Browse the repository at this point in the history
    handle (ref-indepedent, alignment) a single piece of on-instrument demultiplexed CCS bam
  • Loading branch information
SHuang-Broad committed Jan 15, 2024
1 parent 4c32ae1 commit 74388b3
Show file tree
Hide file tree
Showing 3 changed files with 352 additions and 0 deletions.
6 changes: 6 additions & 0 deletions .dockstore.yml
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,12 @@ workflows:
- name: PBMASIsoSeqDemultiplex
subclass: wdl
primaryDescriptorPath: /wdl/pipelines/PacBio/Utility/PBMASIsoSeqDemultiplex.wdl
- name: ProcessOnInstrumentDemuxedChunkRefFree
subclass: wdl
primaryDescriptorPath: /wdl/pipelines/PacBio/Utility/ProcessOnInstrumentDemuxedChunkRefFree.wdl
- name: ProcessOnInstrumentDemuxedChunk
subclass: wdl
primaryDescriptorPath: /wdl/pipelines/PacBio/Utility/ProcessOnInstrumentDemuxedChunk.wdl

###################################################
# TechAgnostic - *mics data processing
Expand Down
181 changes: 181 additions & 0 deletions wdl/pipelines/PacBio/Utility/ProcessOnInstrumentDemuxedChunk.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
version 1.0

import "../../../tasks/Utility/GeneralUtils.wdl" as GU

import "../../../tasks/Alignment/AlignHiFiUBAM.wdl" as ALN

import "../../TechAgnostic/Utility/AlignedBamQCandMetrics.wdl" as QCMetrics

import "../../../tasks/Utility/Finalize.wdl" as FF

workflow ProcessOnInstrumentDemuxedChunk {

meta {
desciption: "Given an on-instrument demultiplexed hifi_reads.bam, perform alignment and QC check, and collect a few metrics."
}

parameter_meta {

#########
# inputs
readgroup_id:
"Unique ID for a readgroup, for example (D|E)A[0-9]{6}-<barcode>; no whitespaces allowed"

bam_SM_field:
"value to place in the SM field of the resulting BAM header's @RG line"

platform:
"PacBio platform used for generating the data; accepted value: [Sequel, Revio]"

gcs_out_root_dir:
"output files will be copied over there"

qc_metrics_config_json:
"A config json to for running the QC and metrics-collection sub-workflow 'AlignedBamQCandMetrics'"

fingerprint_sample_id:
"For fingerprint verification: the ID of the sample supposedly this BAM belongs to; note that the fingerprint VCF is assumed to be located at {fingerprint_store}/{fingerprint_sample_id}*.vcf(.gz)?"

expected_sex_type:
"If provided, triggers sex concordance check. Accepted value: [M, F, NA, na]"

#########
# outputs
wgs_cov:
"whole genome mean coverage"

nanoplot_summ:
"Summary on alignment metrics provided by Nanoplot (todo: study the value of this output)"

sam_flag_stats:
"SAM flag stats"

fingerprint_check:
"Summary on (human) fingerprint checking results"

contamination_est:
"cross-(human)individual contamination estimation by VerifyBAMID2"

inferred_sex_info:
"Inferred sex concordance information if expected sex type is provided"

methyl_tag_simple_stats:
"Simple stats on the reads with & without SAM methylation tags (MM/ML)."

aBAM_metrics_files:
"A map where keys are summary-names and values are paths to files generated from the various QC/metrics tasks"
}

input {
String gcs_out_root_dir

File uBAM
File? uPBI

String readgroup_id
String bam_SM_field

String platform

# args for optional QC subworkflows
File? qc_metrics_config_json
String? fingerprint_sample_id
String? expected_sex_type

File ref_map_file
String disk_type = "SSD"
}

output {
String last_processing_date = today.yyyy_mm_dd

File aligned_bam = FinalizeAlignedBam.gcs_path
File aligned_bai = FinalizeAlignedBai.gcs_path
File aligned_pbi = FinalizeAlignedPbi.gcs_path

String movie = AlignHiFiUBAM.movie

# metrics block (caveat: always has to keep an eye on the QC subworkflow about outputs)
Float wgs_cov = QCandMetrics.wgs_cov
Map[String, Float] nanoplot_summ = QCandMetrics.nanoplot_summ
Map[String, Float] sam_flag_stats = QCandMetrics.sam_flag_stats

# fingerprint
Map[String, String]? fingerprint_check = QCandMetrics.fingerprint_check

# contam
Float? contamination_est = QCandMetrics.contamination_est

# sex concordance
Map[String, String]? inferred_sex_info = QCandMetrics.inferred_sex_info

# methyl
Map[String, String]? methyl_tag_simple_stats = QCandMetrics.methyl_tag_simple_stats

# file-based QC/metrics outputs all packed into a finalization map
Map[String, String] aBAM_metrics_files = QCandMetrics.aBAM_metrics_files
}

###################################################################################
# prep work

# where to store final results
String workflow_name = "ProcessOnInstrumentDemuxedChunk"
String outdir = sub(gcs_out_root_dir, "/$", "") + "/" + workflow_name

# String bc_specific_out = outdir + '/' + readgroup_id
String bc_specific_aln_out = outdir + '/alignments/' + readgroup_id
String bc_specific_metrics_out = outdir + "/metrics/" + readgroup_id

###################################################################################
# align
call ALN.AlignHiFiUBAM as AlignHiFiUBAM { input:
uBAM = uBAM,
uPBI = uPBI,
bam_sample_name = bam_SM_field,
ref_map_file = ref_map_file,
application = 'HIFI'
}

File aBAM = AlignHiFiUBAM.aligned_bam
File aBAI = AlignHiFiUBAM.aligned_bai
File aPBI = AlignHiFiUBAM.aligned_pbi

# save
call FF.FinalizeToFile as FinalizeAlignedBam { input: outdir = bc_specific_aln_out, file = aBAM, name = readgroup_id + '.bam' }
call FF.FinalizeToFile as FinalizeAlignedBai { input: outdir = bc_specific_aln_out, file = aBAI, name = readgroup_id + '.bai' }
call FF.FinalizeToFile as FinalizeAlignedPbi { input: outdir = bc_specific_aln_out, file = aPBI, name = readgroup_id + '.pbi' }

###################################################################################
# QC
AlignedBamQCnMetricsConfig qcm_config = read_json(select_first([qc_metrics_config_json]))
call QCMetrics.Work as QCandMetrics { input:
bam = aBAM,
bai = aBAI,

tech = platform,

cov_bed = qcm_config.cov_bed,
cov_bed_descriptor = qcm_config.cov_bed_descriptor,

fingerprint_vcf_store = qcm_config.fingerprint_vcf_store,
fingerprint_sample_id = fingerprint_sample_id,

expected_sex_type = expected_sex_type,

vbid2_config_json = qcm_config.vbid2_config_json,

methyl_tag_check_bam_descriptor = qcm_config.methyl_tag_check_bam_descriptor,
save_methyl_uncalled_reads = qcm_config.save_methyl_uncalled_reads,

ref_map_file = ref_map_file,
disk_type = disk_type,

output_prefix = readgroup_id, # String output_prefix = bam_sample_name + "." + flowcell
gcs_out_root_dir = bc_specific_metrics_out,
}

###################################################################################

call GU.GetTodayDate as today {}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
version 1.0


import "../../../tasks/Utility/BAMutils.wdl" as BU
import "../../../tasks/Utility/GeneralUtils.wdl" as GU

import "../../../tasks/Visualization/NanoPlot.wdl" as QC0
import "../../TechAgnostic/Utility/CountTheBeans.wdl" as QC1
import "../../TechAgnostic/Utility/DystPeaker.wdl" as QC2
import "../../TechAgnostic/Utility/FASTQstats.wdl" as QC3

import "../../../tasks/Utility/Finalize.wdl" as FF
import "../../TechAgnostic/Utility/SaveFilesToDestination.wdl" as SAVE

workflow ProcessOnInstrumentDemuxedChunkRefFree {

meta {
desciption: "Given an on-instrument demultiplexed hifi_reads.bam, perform ref-independent prep work, and collect a few metrics"
}

parameter_meta {
readgroup_id:
"Unique ID for a readgroup, for example (D|E)A[0-9]{6}-<barcode>"

short_reads_threshold:
"a length threshold below which reads are classified as short"

bam_descriptor:
"a one-word description of the purpose of the BAM (e.g. 'a_particular_seq_center'; used for saving the reads that don't have any MM/ML tags; doesn't need to be single-file specific)"

run_nanoplot:
"if true, will kick off Nanoplot to collect metrics on the uBAM; this isn't necessary if you also run the alignment version to process the uBAM as Nanoplot is run there automatically and produces a superset of metrics"

nanoplot_u_summ:
"A few metrics output by Nanoplot on the uBAM"
seqkit_stats:
"A few metrics output by seqkit stats"

read_len_summaries:
"A few metrics summarizing the read length distribution"
read_len_peaks:
"Peaks of the read length distribution (heruistic)"
read_len_deciles:
"Deciles of the read length distribution"

methyl_tag_simple_stats:
"Simple stats on the reads with & without SAM methylation tags (MM/ML)."

uBAM_metrics_files:
"A map where keys are summary-names and values are paths to files generated from the various QC/metrics tasks"
}

input {
File uBAM
String readgroup_id

String bam_descriptor
Int short_reads_threshold

Boolean run_nanoplot = false
String disk_type = "SSD"

String gcs_out_root_dir
}

output {
String movie = movie_name

File hifi_fq = FinalizeFQ.gcs_path

String last_processing_date = today.yyyy_mm_dd

# todo merge these two together
Map[String, Float] seqkit_stats = FASTQstats.stats
Map[String, Float]? nanoplot_u_summ = NanoPlotFromUBam.stats_map

Map[String, String] read_len_summaries = DystPeaker.read_len_summaries
Array[Int] read_len_peaks = DystPeaker.read_len_peaks
Array[Int] read_len_deciles = DystPeaker.read_len_deciles

# methylation call rate stats
Map[String, String] methyl_tag_simple_stats = NoMissingBeans.methyl_tag_simple_stats

# file-based outputs all packed into a finalization map
Map[String, String] uBAM_metrics_files = files_out
}

###################################################################################
# prep work

# where to store final results
String workflow_name = "ProcessOnInstrumentDemuxedChunkRefFree"
String outdir = sub(gcs_out_root_dir, "/$", "") + "/" + workflow_name

String outdir_ref_free = outdir + '/RefFree' # ideally, this isn't necessary, but it's a relic we'd not touch right now (2023-12-23)
String bc_specific_out = outdir_ref_free + '/' + readgroup_id
String bc_specific_metrics_out = bc_specific_out + "/metrics"

###################################################################################
call BU.GetReadGroupInfo as RG {input: bam = uBAM, keys = ['PU']}
String movie_name = RG.read_group_info['PU']

###################################################################################
# convert to FASTQ (most for assembly)
call BU.BamToFastq {input: bam = uBAM, prefix = "does_not_matter"}
call FF.FinalizeToFile as FinalizeFQ { input:
outdir = bc_specific_out,
file = BamToFastq.reads_fq,
name = readgroup_id + '.hifi.fq.gz'
}

###################################################################################
# more QCs and metrics

##########
if (run_nanoplot) {
call QC0.NanoPlotFromUBam {input: uBAM = uBAM}
FinalizationManifestLine a = object
{files_to_save: flatten([[NanoPlotFromUBam.stats], NanoPlotFromUBam.plots]),
is_singleton_file: false,
destination: bc_specific_metrics_out + "/nanoplot",
output_attribute_name: "nanoplot"}
}
##########
call QC1.CountTheBeans as NoMissingBeans { input:
bam=uBAM,
bam_descriptor=bam_descriptor,
gcs_out_root_dir=bc_specific_metrics_out,
use_local_ssd=disk_type=='LOCAL'
}
Map[String, String] methyl_out = {"missing_methyl_tag_reads":
NoMissingBeans.methyl_tag_simple_stats['files_holding_reads_without_tags']}
##########
call QC2.DystPeaker { input:
input_file=uBAM, input_is_bam=true,
id=readgroup_id, short_reads_threshold=short_reads_threshold,
gcs_out_root_dir=bc_specific_metrics_out
}
Map[String, String] read_len_out = {"read_len_hist": DystPeaker.read_len_hist,
"raw_rl_file": DystPeaker.read_len_summaries['raw_rl_file']}

##########
call QC3.FASTQstats { input:
reads=BamToFastq.reads_fq,
file_type='FASTQ'
}

###################################################################################
if (run_nanoplot) {
call SAVE.SaveFilestoDestination as SaveRest { input:
instructions = select_all([a]),
already_finalized = select_all([methyl_out,
read_len_out])
}
}
if (!run_nanoplot) {
call GU.MergeMaps { input:
one = methyl_out,
two = read_len_out,
}
}
Map[String, String] files_out = select_first([SaveRest.result, MergeMaps.merged])

call GU.GetTodayDate as today {}
}

0 comments on commit 74388b3

Please sign in to comment.