Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix several issues with M2 and HC force-calling mode #5874

Merged
merged 2 commits into from Apr 18, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Expand Up @@ -20,6 +20,8 @@ public abstract class AssemblyBasedCallerArgumentCollection {
public static final String BAM_WRITER_TYPE_LONG_NAME = "bam-writer-type";
public static final String DONT_USE_SOFT_CLIPPED_BASES_LONG_NAME = "dont-use-soft-clipped-bases";
public static final String DO_NOT_RUN_PHYSICAL_PHASING_LONG_NAME = "do-not-run-physical-phasing";
public static final String MAX_MNP_DISTANCE_LONG_NAME = "max-mnp-distance";
public static final String MAX_MNP_DISTANCE_SHORT_NAME = "mnp-dist";

public static final String MIN_BASE_QUALITY_SCORE_LONG_NAME = "min-base-quality-score";
public static final String SMITH_WATERMAN_LONG_NAME = "smith-waterman";
Expand Down Expand Up @@ -111,4 +113,14 @@ public ReadThreadingAssembler createReadThreadingAssembler() {
@Advanced
@Argument(fullName="emit-ref-confidence", shortName="ERC", doc="(BETA feature) Mode for emitting reference confidence scores", optional = true)
public ReferenceConfidenceMode emitReferenceConfidence = ReferenceConfidenceMode.NONE;

protected abstract int getDefaultMaxMnpDistance();

/**
* Two or more phased substitutions separated by this distance or less are merged into MNPs.
*/
@Advanced
@Argument(fullName = MAX_MNP_DISTANCE_LONG_NAME, shortName = MAX_MNP_DISTANCE_SHORT_NAME,
doc = "Two or more phased substitutions separated by this distance or less are merged into MNPs.", optional = true)
public int maxMnpDistance = getDefaultMaxMnpDistance();
}
Expand Up @@ -13,6 +13,7 @@
import org.broadinstitute.hellbender.engine.AlignmentContext;
import org.broadinstitute.hellbender.engine.AssemblyRegion;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.walkers.ReferenceConfidenceVariantContextMerger;
import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.readthreading.ReadThreadingAssembler;
import org.broadinstitute.hellbender.utils.QualityUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
Expand All @@ -29,10 +30,7 @@
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.locusiterator.LocusIteratorByState;
import org.broadinstitute.hellbender.utils.pileup.ReadPileup;
import org.broadinstitute.hellbender.utils.read.AlignmentUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.read.ReadCoordinateComparator;
import org.broadinstitute.hellbender.utils.read.ReadUtils;
import org.broadinstitute.hellbender.utils.read.*;
import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAligner;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils;
Expand All @@ -49,6 +47,7 @@
public final class AssemblyBasedCallerUtils {

static final int REFERENCE_PADDING_FOR_ASSEMBLY = 500;
public static final int NUM_HAPLOTYPES_TO_INJECT_FORCE_CALLING_ALLELES_INTO = 5;
public static final String SUPPORTED_ALLELES_TAG="XA";
public static final String CALLABLE_REGION_TAG = "CR";
public static final String ALIGNMENT_REGION_TAG = "AR";
Expand Down Expand Up @@ -275,7 +274,7 @@ public static AssemblyResultSet assembleReads(final AssemblyRegion region,

final byte[] fullReferenceWithPadding = region.getAssemblyRegionReference(referenceReader, REFERENCE_PADDING_FOR_ASSEMBLY);
final SimpleInterval paddedReferenceLoc = getPaddedReferenceLoc(region, REFERENCE_PADDING_FOR_ASSEMBLY, referenceReader);
final Haplotype referenceHaplotype = createReferenceHaplotype(region, paddedReferenceLoc, referenceReader);
final Haplotype refHaplotype = createReferenceHaplotype(region, paddedReferenceLoc, referenceReader);

final ReadErrorCorrector readErrorCorrector = argumentCollection.assemblerArgs.errorCorrectReads ?
new ReadErrorCorrector(argumentCollection.assemblerArgs.kmerLengthForReadErrorCorrection,
Expand All @@ -286,9 +285,12 @@ public static AssemblyResultSet assembleReads(final AssemblyRegion region,
null;

try {
final AssemblyResultSet assemblyResultSet = assemblyEngine.runLocalAssembly(region, referenceHaplotype, fullReferenceWithPadding,
paddedReferenceLoc, givenAlleles, readErrorCorrector, header,
aligner);
final AssemblyResultSet assemblyResultSet = assemblyEngine.runLocalAssembly(region, refHaplotype, fullReferenceWithPadding,
paddedReferenceLoc, readErrorCorrector, header, aligner);
if (!givenAlleles.isEmpty()) {
addGivenAlleles(region.getExtendedSpan().getStart(), givenAlleles, argumentCollection.maxMnpDistance, aligner, refHaplotype, assemblyResultSet);
}

assemblyResultSet.setDebug(argumentCollection.assemblerArgs.debugAssembly);
assemblyResultSet.debugDump(logger);
return assemblyResultSet;
Expand All @@ -305,6 +307,57 @@ public static AssemblyResultSet assembleReads(final AssemblyRegion region,
}
}

@VisibleForTesting
static void addGivenAlleles(final int assemblyRegionStart, final List<VariantContext> givenAlleles, final int maxMnpDistance,
final SmithWatermanAligner aligner, final Haplotype refHaplotype, final AssemblyResultSet assemblyResultSet) {
final int activeRegionStart = refHaplotype.getAlignmentStartHapwrtRef();
final Map<Integer, VariantContext> assembledVariants = assemblyResultSet.getVariationEvents(maxMnpDistance).stream()
.collect(Collectors.groupingBy(VariantContext::getStart, Collectors.collectingAndThen(Collectors.toList(), AssemblyBasedCallerUtils::makeMergedVariantContext)));

final List<Haplotype> assembledHaplotypes = assemblyResultSet.getHaplotypeList();
for (final VariantContext givenVC : givenAlleles) {
final VariantContext assembledVC = assembledVariants.get(givenVC.getStart());
final int givenVCRefLength = givenVC.getReference().length();
final Allele longerRef = (assembledVC == null || givenVCRefLength > assembledVC.getReference().length()) ? givenVC.getReference() : assembledVC.getReference();
final List<Allele> unassembledGivenAlleles;
if (assembledVC == null) {
unassembledGivenAlleles = givenVC.getAlternateAlleles();
} else {
// map all alleles to the longest common reference
final Set<Allele> assembledAlleleSet = new HashSet<>(longerRef.length() == assembledVC.getReference().length() ? assembledVC.getAlternateAlleles() :
ReferenceConfidenceVariantContextMerger.remapAlleles(assembledVC, longerRef));
final Set<Allele> givenAlleleSet = new HashSet<>(longerRef.length() == givenVCRefLength ? givenVC.getAlternateAlleles() :
ReferenceConfidenceVariantContextMerger.remapAlleles(givenVC, longerRef));
unassembledGivenAlleles = givenAlleleSet.stream().filter(a -> !assembledAlleleSet.contains(a)).collect(Collectors.toList());
}

// choose the highest-scoring haplotypes along with the reference for building force-calling haplotypes
final List<Haplotype> baseHaplotypes = unassembledGivenAlleles.isEmpty() ? Collections.emptyList() : assembledHaplotypes.stream()
.sorted(Comparator.comparingInt((Haplotype hap) -> hap.isReference() ? 1 : 0).thenComparingDouble(hap -> hap.getScore()).reversed())
.limit(NUM_HAPLOTYPES_TO_INJECT_FORCE_CALLING_ALLELES_INTO)
.collect(Collectors.toList());

for (final Allele givenAllele : unassembledGivenAlleles) {
for (final Haplotype baseHaplotype : baseHaplotypes) {
// make sure this allele doesn't collide with a variant on the haplotype
if (baseHaplotype.getEventMap()!= null && baseHaplotype.getEventMap().getVariantContexts().stream().anyMatch(vc -> vc.overlaps(givenVC))) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So if the GGA allele is a SNP that is spanned by a deletion in the discovered variants the it's only added to the reference haplotype, right? And it will still get output in the vcf?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes and yes.

continue;
}

final Haplotype insertedHaplotype = baseHaplotype.insertAllele(longerRef, givenAllele, activeRegionStart + givenVC.getStart() - assemblyRegionStart, givenVC.getStart());
if (insertedHaplotype != null) { // can be null if the requested allele can't be inserted into the haplotype
final Cigar cigar = CigarUtils.calculateCigar(refHaplotype.getBases(), insertedHaplotype.getBases(), aligner);
insertedHaplotype.setCigar(cigar);
insertedHaplotype.setGenomeLocation(refHaplotype.getGenomeLocation());
insertedHaplotype.setAlignmentStartHapwrtRef(activeRegionStart);
assemblyResultSet.add(insertedHaplotype);
}
}
}
}
assemblyResultSet.regenerateVariationEvents(maxMnpDistance);
}

/**
* Annotates reads in ReadLikelihoods with alignment region (the ref region spanned by the haplotype the read is aligned to) and
* callable region (the ref region over which a caller is using these ReadLikelihoods to call variants)
Expand Down Expand Up @@ -817,4 +870,5 @@ private static VariantContext phaseVC(final VariantContext vc, final String ID,
}
return new VariantContextBuilder(vc).genotypes(phasedGenotypes).make();
}

}
Expand Up @@ -43,6 +43,7 @@ public final class AssemblyResultSet {
private boolean wasTrimmed = false;
private final SortedSet<Integer> kmerSizes;
private SortedSet<VariantContext> variationEvents;
private OptionalInt lastMaxMnpDistanceUsed = OptionalInt.empty();
private boolean debug;
private static final Logger logger = LogManager.getLogger(AssemblyResultSet.class);

Expand Down Expand Up @@ -92,6 +93,7 @@ public AssemblyResultSet trimTo(final AssemblyRegion trimmedAssemblyRegion) {
result.setRegionForGenotyping(trimmedAssemblyRegion);
result.setFullReferenceWithPadding(fullReferenceWithPadding);
result.setPaddedReferenceLoc(paddedReferenceLoc);
result.variationPresent = haplotypes.stream().anyMatch(Haplotype::isNonReference);
if (result.refHaplotype == null) {
throw new IllegalStateException("missing reference haplotype in the trimmed set");
}
Expand Down Expand Up @@ -510,14 +512,24 @@ private void updateReferenceHaplotype(final Haplotype newHaplotype) {
*/
public SortedSet<VariantContext> getVariationEvents(final int maxMnpDistance) {
ParamUtils.isPositiveOrZero(maxMnpDistance, "maxMnpDistance may not be negative.");
if (variationEvents == null) {
final List<Haplotype> haplotypeList = getHaplotypeList();
EventMap.buildEventMapsForHaplotypes(haplotypeList, fullReferenceWithPadding, paddedReferenceLoc, debug, maxMnpDistance);
variationEvents = EventMap.getAllVariantContexts(haplotypeList);

final boolean sameMnpDistance = lastMaxMnpDistanceUsed.isPresent() && maxMnpDistance == lastMaxMnpDistanceUsed.getAsInt();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How would the MNP distance change within the same tool execution?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It shouldn't ever change, but I lean toward keeping the logic self-consistent without assuming anything about the tools that invoke it. Of course, some of these classes are so entwined with HC and M2 that it's safe to assume things, but in this case the price of caution is small.

Also, the code for exactly when event maps are cached was quite brittle in that you could write reasonable code and encounter gotchas, so I felt like making it all more explicit as to what had and had not been computed previously.

lastMaxMnpDistanceUsed = OptionalInt.of(maxMnpDistance);

if (variationEvents == null || !sameMnpDistance || haplotypes.stream().anyMatch(hap -> hap.isNonReference() && hap.getEventMap() == null)) {
regenerateVariationEvents(maxMnpDistance);
}
return variationEvents;
}

public void regenerateVariationEvents(int maxMnpDistance) {
final List<Haplotype> haplotypeList = getHaplotypeList();
EventMap.buildEventMapsForHaplotypes(haplotypeList, fullReferenceWithPadding, paddedReferenceLoc, debug, maxMnpDistance);
variationEvents = EventMap.getAllVariantContexts(haplotypeList);
lastMaxMnpDistanceUsed = OptionalInt.of(maxMnpDistance);
variationPresent = haplotypeList.stream().anyMatch(Haplotype::isNonReference);
}

public void setDebug(boolean debug) {
this.debug = debug;
}
Expand Down
Expand Up @@ -21,8 +21,6 @@
public class HaplotypeCallerArgumentCollection extends AssemblyBasedCallerArgumentCollection implements Serializable{
private static final long serialVersionUID = 1L;

public static final String MAX_MNP_DISTANCE_LONG_NAME = "max-mnp-distance";
public static final String MAX_MNP_DISTANCE_SHORT_NAME = "mnp-dist";
public static final String GQ_BAND_LONG_NAME = "gvcf-gq-bands";
public static final String GQ_BAND_SHORT_NAME = "GQB";
public static final String CORRECT_OVERLAPPING_BASE_QUALITIES_LONG_NAME = "correct-overlapping-quality";
Expand All @@ -31,6 +29,9 @@ public class HaplotypeCallerArgumentCollection extends AssemblyBasedCallerArgume
@ArgumentCollection
public StandardCallerArgumentCollection standardArgs = new StandardCallerArgumentCollection();

@Override
protected int getDefaultMaxMnpDistance() { return 0; }

@Override
protected ReadThreadingAssemblerArgumentCollection getReadThreadingAssemblerArgumentCollection() {
return new HaplotypeCallerReadThreadingAssemblerArgumentCollection();
Expand Down Expand Up @@ -133,15 +134,6 @@ protected ReadThreadingAssemblerArgumentCollection getReadThreadingAssemblerArgu
@Argument(fullName = "dont-genotype", doc = "Perform assembly but do not genotype variants", optional = true)
public boolean dontGenotype = false;

/**
* Two or more phased substitutions separated by this distance or less are merged into MNPs.
*/
@Advanced
@Argument(fullName = MAX_MNP_DISTANCE_LONG_NAME, shortName = MAX_MNP_DISTANCE_SHORT_NAME,
doc = "Two or more phased substitutions separated by this distance or less are merged into MNPs. " +
"WARNING: When used in GVCF mode, resulting GVCFs cannot be joint-genotyped.", optional = true)
public int maxMnpDistance = 0;

/**
* As of GATK 3.3, HaplotypeCaller outputs physical (read-based) information (see version 3.3 release notes and documentation for details). This argument disables that behavior.
*/
Expand Down
Expand Up @@ -542,9 +542,6 @@ public List<VariantContext> callRegion(final AssemblyRegion region, final Featur
final AssemblyResultSet untrimmedAssemblyResult = AssemblyBasedCallerUtils.assembleReads(region, givenAlleles, hcArgs, readsHeader, samplesList, logger, referenceReader, assemblyEngine, aligner, !hcArgs.doNotCorrectOverlappingBaseQualities);

final SortedSet<VariantContext> allVariationEvents = untrimmedAssemblyResult.getVariationEvents(hcArgs.maxMnpDistance);
// TODO - line bellow might be unnecessary : it might be that assemblyResult will always have those alleles anyway
// TODO - so check and remove if that is the case:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It feels so good to finally take care of that TODO that's been around for longer than I have.

allVariationEvents.addAll(givenAlleles);

final AssemblyRegionTrimmer.Result trimmingResult = trimmer.trim(region, allVariationEvents);

Expand Down