Skip to content

Commit

Permalink
New subworkflow to align PacBio uBAM
Browse files Browse the repository at this point in the history
  • Loading branch information
SHuang-Broad committed Jan 12, 2024
1 parent b5dc978 commit 4c32ae1
Showing 1 changed file with 105 additions and 0 deletions.
105 changes: 105 additions & 0 deletions wdl/tasks/Alignment/AlignHiFiUBAM.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
version 1.0

import "../Utility/Utils.wdl"
import "../Utility/GeneralUtils.wdl"
import "../Utility/PBUtils.wdl" as PB
import "../Utility/BAMutils.wdl" as BU

workflow AlignHiFiUBAM {
meta {
desciption:
"Align an CCS/HiFi unaligned BAM from a single readgroup."
}

parameter_meta {
application: "Application of the data; accepted values are: [CCS, HIFI, ISOSEQ, MASSEQ]"
library: "If provided, the output aligned BAM will use this value for the LB field in its readgroup header line (@RG)"
drop_per_base_N_pulse_tags: "If false, the per-base and pulse tags will not be stripped away from the input uBAM, if they are available (please think through the storage implications)"
}

input {
File uBAM
File? uPBI
String bam_sample_name
String? library

String application

Boolean drop_per_base_N_pulse_tags = true
File ref_map_file
}

output {
File aligned_bam = aBAM
File aligned_bai = aBAI
File aligned_pbi = IndexAlignedReads.pbi

String movie = movie_name
}

# todo: verify if this is still necessary
Map[String, String] map_presets = {
# 'CLR': 'SUBREAD', # don't support any more
'CCS': 'CCS',
'HIFI': 'HIFI',
'ISOSEQ': 'ISOSEQ',
'MASSEQ': 'SUBREAD',
}

Map[String, String] ref_map = read_map(ref_map_file)

call BU.GetReadGroupInfo as RG {input: bam = uBAM, keys = ['SM', 'LB', 'PU']}
String LB = select_first([library, RG.read_group_info['LB']])
String movie_name = RG.read_group_info['PU']

###################################################################################
Int shard_threshold = 50

if (ceil(size(uBAM, "GiB")) > shard_threshold) {# shard & align

if (! defined(uPBI) ) {
call PB.PBIndex as PBIndex {input: bam = uBAM}
}

Int num_shards = ceil(size(uBAM, "GiB") / shard_threshold )
call Utils.ComputeAllowedLocalSSD as Guess {input: intended_gb = 3*ceil(size(uBAM, "GiB") + size(uPBI, "GiB"))}
call PB.ShardLongReads {
input:
unaligned_bam = uBAM, unaligned_pbi = select_first([uPBI, PBIndex.pbi]),
num_shards = num_shards,
num_ssds = Guess.numb_of_local_ssd
}

scatter (unaligned_bam in ShardLongReads.unmapped_shards) {
# sometimes we see the sharded bams mising EOF marker, which then fails record counts, use this as a checkpoint
call Utils.CountBamRecords as ValidateShard {input: bam = unaligned_bam}

call PB.Align as AlignReads {
input:
bam = unaligned_bam,
sample_name = bam_sample_name,
library = LB,
ref_fasta = ref_map['fasta'],
map_preset = map_presets['CCS'],
drop_per_base_N_pulse_tags = drop_per_base_N_pulse_tags
}
}

call Utils.MergeBams as MergeAlignedReads { input: bams = AlignReads.aligned_bam, prefix = basename(uBAM, ".bam") }
}
if (! (ceil(size(uBAM, "GiB")) > shard_threshold)) {
call PB.Align as AlignReadsTogether {
input:
bam = uBAM,
sample_name = bam_sample_name,
library = LB,
ref_fasta = ref_map['fasta'],
map_preset = map_presets['CCS'],
drop_per_base_N_pulse_tags = drop_per_base_N_pulse_tags
}
}

File aBAM = select_first([MergeAlignedReads.merged_bam, AlignReadsTogether.aligned_bam])
File aBAI = select_first([MergeAlignedReads.merged_bai, AlignReadsTogether.aligned_bai])
call PB.PBIndex as IndexAlignedReads { input: bam = aBAM }
}

0 comments on commit 4c32ae1

Please sign in to comment.