Skip to content
Browse files

Small changes to how MBA works with bacterial contamination (#701)

* - unmapping reads results in their alignment being thrown away when converting to CRAM. this PR saves the Previous Alignment information in a tag called PA, before removing it from the read and unmapping it.
it also add the comment "Cross-species contamination" to the CO tag which is already reserved for COmments.
- tests were updated
- this commit also limits (to 1000) the number of log warnings regarding bacterial contamination reads

- added the old alignment information to the unmapped reads as an extra tag
- added a warning that no more warnings will be emited (when limit is reached)

* - adding option to move alignment information

* - added more tests as requested by reviewer.
- fix a bug
- even when asking for DO_NOT_CHANGE mapping quality needs to be set to 0 in order to make SamRecord valid

* responding to most reviewer comments. still a few comments remain...

* - removed warning and converted it to an info emitted on progress reports _if_ the number of unmapped reads is non zero.
- added constructors that do not change, with the DO_NOT_CHANGE value for the enum to keep the API from breaking.
  • Loading branch information
yfarjoun committed Jan 7, 2017
1 parent c8d6de5 commit a325e3356483f02dc066db713c6f0b098f6a3b4b
@@ -61,6 +61,7 @@
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;

* Abstract class that coordinates the general task of taking in a set of alignment information,
@@ -88,6 +89,7 @@
public static final int MAX_RECORDS_IN_RAM = 500000;

private static final char[] RESERVED_ATTRIBUTE_STARTS = {'X', 'Y', 'Z'};
private int crossSpeciesReads = 0;

private final Log log = Log.getInstance(AbstractAlignmentMerger.class);
private final ProgressLogger progress = new ProgressLogger(this.log, 1000000, "Merged", "records");
@@ -115,6 +117,8 @@
private boolean keepAlignerProperPairFlags = false;
private boolean addMateCigar = false;
private boolean unmapContaminantReads = false;
private UnmappingReadStrategy unmappingReadsStrategy = UnmappingReadStrategy.DO_NOT_CHANGE;

private final SamRecordFilter alignmentFilter = new SamRecordFilter() {
public boolean filterOut(final SAMRecord record) {
@@ -158,13 +162,73 @@ void close() {

public enum UnmappingReadStrategy {
// Leave on record, and copy to tag
COPY_TO_TAG(false, true),
// Leave on record, but do not create additional tag
DO_NOT_CHANGE(false, false),
// Add tag with information, and remove from standard fields in record
MOVE_TO_TAG(true, true);

private final boolean resetMappingInformation, populatePATag;

UnmappingReadStrategy(final boolean resetMappingInformation, final boolean populatePATag) {
this.resetMappingInformation = resetMappingInformation;
this.populatePATag = populatePATag;

public boolean isResetMappingInformation() {
return resetMappingInformation;

public boolean isPopulatePaTag() {
return populatePATag;

protected abstract CloseableIterator<SAMRecord> getQuerynameSortedAlignedRecords();

protected boolean ignoreAlignment(final SAMRecord sam) { return false; } // default implementation

protected boolean isContaminant(final HitsForInsert hits) { return false; } // default implementation

/** constructor with a default setting for unmappingReadsStrategy.
* see full constructor for parameters
public AbstractAlignmentMerger(final File unmappedBamFile, final File targetBamFile,
final File referenceFasta, final boolean clipAdapters,
final boolean bisulfiteSequence, final boolean alignedReadsOnly,
final SAMProgramRecord programRecord, final List<String> attributesToRetain,
final List<String> attributesToRemove,
final Integer read1BasesTrimmed, final Integer read2BasesTrimmed,
final List<SamPairUtil.PairOrientation> expectedOrientations,
final SAMFileHeader.SortOrder sortOrder,
final PrimaryAlignmentSelectionStrategy primaryAlignmentSelectionStrategy,
final boolean addMateCigar,
final boolean unmapContaminantReads) {

* Constructor
@@ -194,8 +258,10 @@ void close() {
* @param primaryAlignmentSelectionStrategy What to do when there are multiple primary alignments, or multiple
* alignments but none primary, for a read or read pair.
* @param addMateCigar True if we are to add or maintain the mate CIGAR (MC) tag, false if we are to remove or not include.
* @param unmapContaminantReads If true, identify reads having the signature of contamination from a foreign organism (i.e. mostly clipped bases),
* @param unmapContaminantReads If true, identify reads having the signature of cross-species contamination (i.e. mostly clipped bases),
* and mark them as unmapped.
* @param unmappingReadsStrategy An enum describing how to deal with reads whose mapping information are being removed (currently this happens due to cross-species
* contamination). Ignored unless unmapContaminantReads is true.
public AbstractAlignmentMerger(final File unmappedBamFile, final File targetBamFile,
final File referenceFasta, final boolean clipAdapters,
@@ -207,7 +273,8 @@ public AbstractAlignmentMerger(final File unmappedBamFile, final File targetBamF
final SAMFileHeader.SortOrder sortOrder,
final PrimaryAlignmentSelectionStrategy primaryAlignmentSelectionStrategy,
final boolean addMateCigar,
final boolean unmapContaminantReads) {
final boolean unmapContaminantReads,
final UnmappingReadStrategy unmappingReadsStrategy) {
@@ -257,6 +324,7 @@ public AbstractAlignmentMerger(final File unmappedBamFile, final File targetBamF

this.addMateCigar = addMateCigar;
this.unmapContaminantReads = unmapContaminantReads;
this.unmappingReadsStrategy = unmappingReadsStrategy;

/** Allows the caller to override the maximum records in RAM. */
@@ -517,7 +585,9 @@ public static void fixNmMdAndUq(final SAMRecord record, final ReferenceSequenceF
private void addIfNotFiltered(final Sink out, final SAMRecord rec) {
if (includeSecondaryAlignments || !rec.getNotPrimaryAlignmentFlag()) {
if (this.progress.record(rec) && crossSpeciesReads > 0) {"%d Reads have been unmapped due to being suspected of being Cross-species contamination.", crossSpeciesReads));

@@ -539,10 +609,20 @@ private HitsForInsert nextAligned() {
return null;

private int numCrossSpeciesContaminantWarnings = 0;

* Copies alignment info from aligned to unaligned read, clips as appropriate, and sets PG ID.
* May also un-map the resulting read if the alignment is bad (e.g. no unclipped bases).
* Depending on the value of unmappingReadsStrategy this function will potentially
* - reset the mapping information (MOVE_TO_TAG)
* - copy the original mapping information to a tag "PA" (MOVE_TO_TAG, or COPY_TO_TAG)
* a third possibility for unmappingReadsStrategy is to do nothig (DO_NOT_CHANGE)
* This is because the CRAM format will reset mapping information for reads that are flagged as unaligned.
* @param unaligned Original SAMRecord, and object into which values are copied.
* @param aligned Holds alignment info that will be copied into unaligned.
* @param isContaminant Should this read be unmapped due to contamination?
@@ -557,16 +637,48 @@ private void transferAlignmentInfoToFragment(final SAMRecord unaligned, final SA
log.warn("Record mapped off end of reference; making unmapped: " + aligned);
} else if (isContaminant) {
log.warn("Record looks like foreign contamination; making unmapped: " + aligned);
// NB: for reads that look like contamination, just set unmapped flag and zero MQ but keep other flags as-is.
// this maintains the sort order so that downstream analyses can use them for calculating evidence
// of contamination vs other causes (e.g. structural variants)


if (unmappingReadsStrategy.isPopulatePaTag()) {
unaligned.setAttribute("PA", encodeMappingInformation(aligned));

if (unmappingReadsStrategy.isResetMappingInformation()) {

// Unmapped read cannot have non-zero mapping quality and remain valid
unaligned.setAttribute(, "Cross-species contamination");

// if there already is a comment, add second comment with a | separator:
Optional<String> optionalComment = Optional.ofNullable(unaligned.getStringAttribute(;
unaligned.setAttribute( -> s + " | ").orElse("") +, "Cross-species contamination");

* Encodes mapping information from a record into a string according to the format sepcified in the
* Sam-Spec under the SA tag. No protection against missing values (for cigar, and NM tag).
* (Might make sense to move this to htsJDK.)
* @param rec SAMRecord whose alignment information will be encoded
* @return String encoding rec's alignment information according to SA tag in the SAM spec
static private String encodeMappingInformation(SAMRecord rec) {
return String.join(",",
((Integer) rec.getAlignmentStart()).toString(),
((Integer) rec.getMappingQuality()).toString(),

* Copies alignment info from aligned to unaligned read, if there is an alignment, and sets mate information.
@@ -206,6 +206,9 @@
@Option(doc = "If UNMAP_CONTAMINANT_READS is set, require this many unclipped bases or else the read will be marked as contaminant.")
public int MIN_UNCLIPPED_BASES = 32;

@Option(doc = "How to deal with alignment information in reads that are being unmapped (e.g. due to cross-species contamination.) Currently ignored unless UNMAP_CONTAMINANT_READS = true", optional = true)
public AbstractAlignmentMerger.UnmappingReadStrategy UNMAPPED_READ_STRATEGY = AbstractAlignmentMerger.UnmappingReadStrategy.DO_NOT_CHANGE;

private static final Log log = Log.getInstance(MergeBamAlignment.class);

@@ -260,7 +263,7 @@ protected int doWork() {

0 comments on commit a325e33

Please sign in to comment.
You can’t perform that action at this time.