Merge pull request #938 from broadinstitute/eb_fix_spanning_deletions_in_genotyping

Added a fix for genotyping positions over spanning deletions.
eitanbanks committed May 12, 2015
2 parents f77cee5 + 530e0e5 commit 53a34ce
Showing 8 changed files with 288 additions and 92 deletions.
@@ -88,9 +88,6 @@
*/
public class ReferenceConfidenceModel {

//public final static String INDEL_INFORMATIVE_DEPTH = "CD"; // temporarily taking this extra genotype level information out for now
public final static String ALTERNATE_ALLELE_STRING = "ALT"; // arbitrary alternate allele

private final GenomeLocParser genomeLocParser;

private final SampleList samples;
@@ -138,9 +135,7 @@ public ReferenceConfidenceModel(final GenomeLocParser genomeLocParser,
*/
public Set<VCFHeaderLine> getVCFHeaderLines() {
final Set<VCFHeaderLine> headerLines = new LinkedHashSet<>();
// TODO - do we need a new kind of VCF Header subclass for specifying arbitrary alternate alleles?
headerLines.add(new VCFSimpleHeaderLine(ALTERNATE_ALLELE_STRING, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE_NAME, "Represents any possible alternative allele at this location"));
//headerLines.add(new VCFFormatHeaderLine(INDEL_INFORMATIVE_DEPTH, 1, VCFHeaderLineType.Integer, "Number of reads at locus that are informative about an indel of size <= " + indelInformativeDepthIndelSize));
headerLines.add(new VCFSimpleHeaderLine(GATKVCFConstants.SYMBOLIC_ALLELE_DEFINITION_HEADER_TAG, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE_NAME, "Represents any possible alternative allele at this location"));
return headerLines;
}

@@ -165,6 +165,8 @@ public void initialize() {
// take care of the VCF headers
final Map<String, VCFHeader> vcfRods = GATKVCFUtils.getVCFHeadersFromRods(getToolkit());
final Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), true);
headerLines.add(new VCFSimpleHeaderLine(GATKVCFConstants.SYMBOLIC_ALLELE_DEFINITION_HEADER_TAG, GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE_NAME, "Represents any possible spanning deletion allele at this location"));
headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.DEPTH_KEY)); // needed for gVCFs without DP tags

final Set<String> samples = SampleUtils.getSampleList(vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE);
final VCFHeader vcfHeader = new VCFHeader(headerLines, samples);
@@ -217,10 +217,13 @@ public void initialize() {
final Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), true);
headerLines.addAll(annotationEngine.getVCFAnnotationDescriptions());
headerLines.addAll(genotypingEngine.getAppropriateVCFInfoHeaders());
// add the pool values for each genotype

// add headers for annotations added by this tool
headerLines.add(new VCFSimpleHeaderLine(GATKVCFConstants.SYMBOLIC_ALLELE_DEFINITION_HEADER_TAG, GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE_NAME, "Represents any possible spanning deletion allele at this location"));
headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.MLE_ALLELE_COUNT_KEY));
headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.MLE_ALLELE_FREQUENCY_KEY));
headerLines.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.REFERENCE_GENOTYPE_QUALITY));
headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.DEPTH_KEY)); // needed for gVCFs without DP tags
if ( dbsnp != null && dbsnp.dbsnp.isBound() )
VCFStandardHeaderLines.addStandardInfoLines(headerLines, true, VCFConstants.DBSNP_KEY);

@@ -113,20 +113,27 @@ public static VariantContext merge(final List<VariantContext> VCs, final GenomeL
final Map<String, List<Comparable>> annotationMap = new LinkedHashMap<>();
final GenotypesContext genotypes = GenotypesContext.create();

final int variantContextCount = VCs.size();
// In this list we hold the mapping of each variant context alleles.
final List<Pair<VariantContext,List<Allele>>> vcAndNewAllelePairs = new ArrayList<>(variantContextCount);
final List<Pair<VariantContext,List<Allele>>> vcAndNewAllelePairs = new ArrayList<>(VCs.size());
// Keep track of whether we saw a spanning deletion and a non-spanning event
boolean sawSpanningDeletion = false;
boolean sawNonSpanningEvent = false;

// cycle through and add info from the other VCs
for ( final VariantContext vc : VCs ) {

// if this context doesn't start at the current location then it must be a spanning event (deletion or ref block)
final boolean isSpanningEvent = loc.getStart() != vc.getStart();
// record whether it's also a spanning deletion/event (we know this because the VariantContext type is no
// longer "symbolic" but "mixed" because there are real alleles mixed in with the symbolic non-ref allele)
sawSpanningDeletion |= ( isSpanningEvent && vc.isMixed() ) || vc.getAlternateAlleles().contains(GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE);
sawNonSpanningEvent |= ( !isSpanningEvent && vc.isMixed() );

vcAndNewAllelePairs.add(new Pair<>(vc,isSpanningEvent ? replaceWithNoCalls(vc.getAlleles())
: remapAlleles(vc.getAlleles(), refAllele, finalAlleleSet)));
vcAndNewAllelePairs.add(new Pair<>(vc, isSpanningEvent ? replaceWithNoCallsAndDels(vc) : remapAlleles(vc, refAllele, finalAlleleSet)));
}

// Add <NON_REF> to the end if at all required in the output.
// Add <DEL> and <NON_REF> to the end if at all required in the output.
if ( sawSpanningDeletion && (sawNonSpanningEvent || !removeNonRefSymbolicAllele) ) finalAlleleSet.add(GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE);
if (!removeNonRefSymbolicAllele) finalAlleleSet.add(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);

final List<Allele> allelesList = new ArrayList<>(finalAlleleSet);
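
For orientation, here is a minimal, GATK-independent sketch of the spanning-event bookkeeping introduced above; the SimpleVC type and the "<*:DEL>" spelling are illustrative stand-ins, not the real htsjdk/GATK classes or constants:

import java.util.Arrays;
import java.util.List;

// Simplified stand-in for the htsjdk VariantContext fields used by the logic above.
final class SimpleVC {
    final int start;
    final List<String> altAlleles;   // e.g. ["T", "<NON_REF>"]
    final boolean isMixed;           // true when concrete ALTs are mixed with symbolic ones

    SimpleVC(final int start, final List<String> altAlleles, final boolean isMixed) {
        this.start = start;
        this.altAlleles = altAlleles;
        this.isMixed = isMixed;
    }
}

public class SpanningEventSketch {
    // hypothetical spelling of the spanning-deletion symbolic allele
    static final String SPAN_DEL = "<*:DEL>";

    public static void main(final String[] args) {
        final int mergeStart = 100;
        final List<SimpleVC> vcs = Arrays.asList(
                new SimpleVC(95, Arrays.asList("T", "<NON_REF>"), true),  // starts earlier: spanning event
                new SimpleVC(100, Arrays.asList("<NON_REF>"), false));    // starts at the merge position

        boolean sawSpanningDeletion = false;
        boolean sawNonSpanningEvent = false;
        for (final SimpleVC vc : vcs) {
            // a record that does not start at the merge position must span it (deletion or ref block)
            final boolean isSpanningEvent = vc.start != mergeStart;
            sawSpanningDeletion |= (isSpanningEvent && vc.isMixed) || vc.altAlleles.contains(SPAN_DEL);
            sawNonSpanningEvent |= (!isSpanningEvent && vc.isMixed);
        }
        System.out.println("sawSpanningDeletion=" + sawSpanningDeletion
                + " sawNonSpanningEvent=" + sawNonSpanningEvent);
    }
}
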
@@ -171,13 +178,27 @@ public static VariantContext merge(final List<VariantContext> VCs, final GenomeL

final String ID = rsIDs.isEmpty() ? VCFConstants.EMPTY_ID_FIELD : Utils.join(",", rsIDs);

// note that in order to calculate the end position, we need a list of alleles that doesn't include anything symbolic
final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID).alleles(allelesList)
.chr(loc.getContig()).start(loc.getStart()).computeEndFromAlleles(allelesList, loc.getStart(), loc.getStart())
.chr(loc.getContig()).start(loc.getStart()).computeEndFromAlleles(nonSymbolicAlleles(allelesList), loc.getStart(), loc.getStart())
.genotypes(genotypes).unfiltered().attributes(new TreeMap<>(attributes)).log10PError(CommonInfo.NO_LOG10_PERROR); // we will need to re-genotype later

return builder.make();
}

/**
* @param list the original alleles list
* @return a non-null list of non-symbolic alleles
*/
private static List<Allele> nonSymbolicAlleles(final List<Allele> list) {
final List<Allele> result = new ArrayList<>(list.size());
for ( final Allele allele : list ) {
if ( !allele.isSymbolic() )
result.add(allele);
}
return result;
}

/**
* Determines the ref allele given the provided reference base at this position
*
@@ -246,27 +267,28 @@ private static void addReferenceConfidenceAttributes(final Map<String, Object> m
* collects alternative alleles present in variant context and add them to the {@code finalAlleles} set.
* </li></ul>
*
* @param vcAlleles the variant context allele list.
* @param refAllele final reference allele.
* @param vc the variant context.
* @param refAllele final reference allele.
* @param finalAlleles where to add the final set of non-ref called alleles.
* @return never {@code null}
*/
//TODO as part of a larger refactoring effort {@link #remapAlleles} can be merged with {@link GATKVariantContextUtils#remapAlleles}.
private static List<Allele> remapAlleles(final List<Allele> vcAlleles, final Allele refAllele, final LinkedHashSet<Allele> finalAlleles) {
final Allele vcRef = vcAlleles.get(0);
if (!vcRef.isReference()) throw new IllegalStateException("the first allele of the vc allele list must be reference");
private static List<Allele> remapAlleles(final VariantContext vc, final Allele refAllele, final LinkedHashSet<Allele> finalAlleles) {

final Allele vcRef = vc.getReference();
final byte[] refBases = refAllele.getBases();
final int extraBaseCount = refBases.length - vcRef.getBases().length;
if (extraBaseCount < 0) throw new IllegalStateException("the wrong reference was selected");
final List<Allele> result = new ArrayList<>(vcAlleles.size());

for (final Allele a : vcAlleles) {
if (a.isReference()) {
result.add(refAllele);
} else if (a.isSymbolic()) {
final List<Allele> result = new ArrayList<>(vc.getNAlleles());
result.add(refAllele);

for (final Allele a : vc.getAlternateAlleles()) {
if (a.isSymbolic()) {
result.add(a);
// we always skip <NON_REF> when adding to finalAlleles; this is done outside if it applies.
if (!a.equals(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE))
// we always skip <NON_REF> when adding to finalAlleles; this is done outside if it applies.
// we also skip <*DEL> if there isn't a real alternate allele.
if ( !a.equals(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE) && !vc.isSymbolic() )
finalAlleles.add(a);
} else if (a.isCalled()) {
final Allele newAllele;
@@ -287,17 +309,31 @@ private static List<Allele> remapAlleles(final List<Allele> vcAlleles, final All
}

/**
* Replaces any alleles in the list with NO CALLS, except for the generic ALT allele
* Replaces any alleles in the VariantContext with NO CALLS or the symbolic deletion allele as appropriate, except for the generic ALT allele
*
* @param alleles list of alleles to replace
* @param vc VariantContext with the alleles to replace
* @return non-null list of alleles
*/
private static List<Allele> replaceWithNoCalls(final List<Allele> alleles) {
if ( alleles == null ) throw new IllegalArgumentException("list of alleles cannot be null");
private static List<Allele> replaceWithNoCallsAndDels(final VariantContext vc) {
if ( vc == null ) throw new IllegalArgumentException("VariantContext cannot be null");

final List<Allele> result = new ArrayList<>(vc.getNAlleles());

final List<Allele> result = new ArrayList<>(alleles.size());
for ( final Allele allele : alleles )
result.add(allele.equals(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE) ? allele : Allele.NO_CALL);
// no-call the reference allele
result.add(Allele.NO_CALL);

// handle the alternate alleles
for ( final Allele allele : vc.getAlternateAlleles() ) {
final Allele replacement;
if ( allele.equals(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE) )
replacement = allele;
else if ( allele.length() < vc.getReference().length() )
replacement = GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE;
else
replacement = Allele.NO_CALL;

result.add(replacement);
}
return result;
}
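
A rough, GATK-independent sketch of the replacement rule implemented by replaceWithNoCallsAndDels(): the reference is no-called, <NON_REF> is kept, an ALT shorter than the reference is treated as a spanning deletion, and everything else becomes a no-call. The String alleles, the "." no-call spelling, and the "<*:DEL>" constant are illustrative assumptions:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class ReplaceWithNoCallsAndDelsSketch {
    static final String NON_REF = "<NON_REF>";
    static final String SPAN_DEL = "<*:DEL>";
    static final String NO_CALL = ".";

    static List<String> replace(final String ref, final List<String> alts) {
        final List<String> result = new ArrayList<>(alts.size() + 1);
        result.add(NO_CALL);                        // the reference allele is no-called
        for (final String alt : alts) {
            if (alt.equals(NON_REF))                // keep the generic non-ref allele
                result.add(alt);
            else if (alt.length() < ref.length())   // a real deletion spanning the position
                result.add(SPAN_DEL);
            else                                    // SNPs, insertions, etc. become no-calls
                result.add(NO_CALL);
        }
        return result;
    }

    public static void main(final String[] args) {
        // e.g. ref=GTT with a 2bp deletion and the generic non-ref allele
        System.out.println(replace("GTT", Arrays.asList("G", NON_REF))); // [., <*:DEL>, <NON_REF>]
    }
}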

@@ -387,43 +423,16 @@ protected static int[] getIndexesOfRelevantAlleles(final List<Allele> remappedAl
throw new UserException("The list of input alleles must contain " + GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE + " as an allele but that is not the case at position " + position + "; please use the Haplotype Caller with gVCF output to generate appropriate records");

final int indexOfNonRef = remappedAlleles.indexOf(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);

//if the refs don't match then let the non-ref allele be the most likely of the alts
//TODO: eventually it would be nice to be able to trim alleles for spanning events to see if they really do have the same ref
final boolean refsMatch = targetAlleles.get(0).equals(remappedAlleles.get(0),false);
final int indexOfBestAlt;
if (!refsMatch && g.hasPL()) {
final int[] PLs = g.getPL().clone();
PLs[0] = Integer.MAX_VALUE; //don't pick 0/0
final int indexOfBestAltPL = MathUtils.minElementIndex(PLs);
GenotypeLikelihoods.GenotypeLikelihoodsAllelePair pair = GenotypeLikelihoods.getAllelePair(indexOfBestAltPL);
indexOfBestAlt = pair.alleleIndex2;
}
else
indexOfBestAlt = indexOfNonRef;

final int[] indexMapping = new int[targetAlleles.size()];

// the reference likelihoods should always map to each other (even if the alleles don't)
indexMapping[0] = 0;

// create the index mapping, using the <ALT> allele whenever such a mapping doesn't exist
final int targetNonRef = targetAlleles.indexOf(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
final boolean targetHasNonRef = targetNonRef != -1;
final int lastConcreteAlt = targetHasNonRef ? targetAlleles.size()-2 : targetAlleles.size()-1;
for ( int i = 1; i <= lastConcreteAlt; i++ ) {
// create the index mapping, using the <NON-REF> allele whenever such a mapping doesn't exist
for ( int i = 1; i < targetAlleles.size(); i++ ) {
final int indexOfRemappedAllele = remappedAlleles.indexOf(targetAlleles.get(i));
indexMapping[i] = indexOfRemappedAllele == -1 ? indexOfNonRef : indexOfRemappedAllele;
}
if (targetHasNonRef) {
if (refsMatch) {
final int indexOfRemappedAllele = remappedAlleles.indexOf(targetAlleles.get(targetNonRef));
indexMapping[targetNonRef] = indexOfRemappedAllele == -1 ? indexOfNonRef : indexOfRemappedAllele;
}
else {
indexMapping[targetNonRef] = indexOfBestAlt;
}
}

return indexMapping;
}
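
A simplified sketch of the index-mapping step that remains after this change: every target allele maps to its position among the sample's remapped alleles, and any allele the sample never saw falls back to the <NON_REF> index. String alleles stand in for the real Allele objects:

import java.util.Arrays;
import java.util.List;

public class IndexMappingSketch {
    public static void main(final String[] args) {
        final List<String> remapped = Arrays.asList("A", "AT", "<NON_REF>");      // alleles the sample's PLs refer to
        final List<String> target   = Arrays.asList("A", "AT", "C", "<NON_REF>"); // alleles of the merged record
        final int indexOfNonRef = remapped.indexOf("<NON_REF>");

        final int[] indexMapping = new int[target.size()];
        indexMapping[0] = 0; // the reference likelihoods always map to each other
        for (int i = 1; i < target.size(); i++) {
            final int idx = remapped.indexOf(target.get(i));
            // any target allele the sample never saw is represented by its <NON_REF> likelihood
            indexMapping[i] = (idx == -1) ? indexOfNonRef : idx;
        }
        System.out.println(Arrays.toString(indexMapping)); // [0, 1, 2, 2]
    }
}
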
@@ -100,7 +100,7 @@ public void testTetraploidRun() {
" -V:sample3 " + privateTestDir + "tetraploid-gvcf-3.vcf" +
" -L " + privateTestDir + "tetraploid-gvcfs.intervals",
1,
Arrays.asList("7b3153135e4f8e1d137d3f4beb46f182"));
Arrays.asList("ebe26077809961f53d5244643d24fd45"));
executeTest("combineSingleSamplePipelineGVCF", spec);
}

@@ -112,7 +112,7 @@ public void testMixedPloidyRun() {
" -V:sample3 " + privateTestDir + "diploid-gvcf-3.vcf" +
" -L " + privateTestDir + "tetraploid-gvcfs.intervals",
1,
Arrays.asList("4f546634213ece6f08ec9258620b92bb"));
Arrays.asList("2d36a5f996cad47e5d05fcd78f6e572e"));
executeTest("combineSingleSamplePipelineGVCF", spec);
}

@@ -190,32 +190,43 @@ public void testOneHasDeletionAndTwoHasRefBlock() throws Exception {
@Test
public void testMD5s() throws Exception {
final String cmd = baseTestString(" -L 1:69485-69791");
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("b7c753452ab0c05f9cee538e420b87fa"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("83ea9f4a9aadb1218c21c9d3780e8009"));
spec.disableShadowBCF();
executeTest("testMD5s", spec);
}

@Test
public void testBasepairResolutionOutput() throws Exception {
final String cmd = baseTestString(" -L 1:69485-69791 --convertToBasePairResolution");
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("bb6420ead95da4c72e76ca4bf5860ef0"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("f153cb6e986efc9b50f0b8833fe5d3da"));
spec.disableShadowBCF();
executeTest("testBasepairResolutionOutput", spec);
}

@Test
public void testBreakBlocks() throws Exception {
final String cmd = baseTestString(" -L 1:69485-69791 --breakBandsAtMultiplesOf 5");
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("dd31182124c4b78a8a03edb1e0cf618b"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("6626ff272e7e76fba091f5bde4a1f963"));
spec.disableShadowBCF();
executeTest("testBreakBlocks", spec);
}

@Test
public void testSpanningDeletions() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T CombineGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + privateTestDir + "spanningDel.1.g.vcf -V " + privateTestDir + "spanningDel.2.g.vcf",
1,
Arrays.asList("fba48ce2bf8761366ff2cd0b45d0421f"));
spec.disableShadowBCF();
executeTest("testSpanningDeletions", spec);
}

@Test
public void testWrongReferenceBaseBugFix() throws Exception {
final String cmd = "-T CombineGVCFs -R " + b37KGReference + " -V " + (privateTestDir + "combine-gvcf-wrong-ref-input1.vcf"
+ " -V " + (privateTestDir + "combine-gvcf-wrong-ref-input2.vcf") + " -o %s --no_cmdline_in_header");
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("75d81c247bef5e394b4a2d4126aee2b3"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("331c1a4a6a72ea1617c1697a5d945d56"));
spec.disableShadowBCF();
executeTest("testWrongReferenceBaseBugFix",spec);

@@ -224,7 +235,7 @@ public void testWrongReferenceBaseBugFix() throws Exception {
@Test
public void testBasepairResolutionInput() throws Exception {
final String cmd = "-T CombineGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -V " + privateTestDir + "gvcf.basepairResolution.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("994ff5c219c683635eadc1624cbbda74"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("207e89b5677fbf0ef4d1ff768262cf0c"));
spec.disableShadowBCF();
executeTest("testBasepairResolutionInput", spec);
}
