Skip to content

Commit

Permalink
Merge pull request #810 from broadinstitute/ms_gcWriteMultiRefAlleles
Browse files Browse the repository at this point in the history
A couple of fixes for writing VCFs in GenotypeConcordance
  • Loading branch information
meganshand committed Jun 1, 2017
2 parents 1c54e16 + dd4dc88 commit d4a632a
Show file tree
Hide file tree
Showing 4 changed files with 132 additions and 11 deletions.
23 changes: 12 additions & 11 deletions src/main/java/picard/vcf/GenotypeConcordance.java
Original file line number Diff line number Diff line change
Expand Up @@ -416,6 +416,11 @@ private void writeVcfTuple(final VcfTuple tuple, final VariantContextWriter writ
callContext = tuple.rightVariantContext.get();
}

//Don't write symbolic alleles to output VCF
if (truthContext != null && truthContext.isSymbolic() || callContext != null && callContext.isSymbolic()) {
return;
}

// Get the alleles for each genotype. No alleles will be extracted for a genotype if the genotype is
// mixed, filtered, or missing.
final Alleles alleles = normalizeAlleles(truthContext, TRUTH_SAMPLE, callContext, CALL_SAMPLE);
Expand All @@ -432,14 +437,10 @@ private void writeVcfTuple(final VcfTuple tuple, final VariantContextWriter writ
final List<Allele> truthAlleles = alleles.truthAlleles();
final List<Allele> callAlleles = alleles.callAlleles();

// Get the alleles present at this site for both samples to use for the output variant context.
// Get the alleles present at this site for both samples to use for the output variant context, but remove no calls.
final Set<Allele> siteAlleles = new HashSet<>();
if (truthContext != null) {
siteAlleles.addAll(truthContext.getAlleles());
}
if (callContext != null) {
siteAlleles.addAll(callContext.getAlleles());
}
siteAlleles.addAll(allAlleles);
siteAlleles.remove(Allele.NO_CALL);

// Initialize the variant context builder
final VariantContext initialContext = (callContext == null) ? truthContext : callContext;
Expand Down Expand Up @@ -739,17 +740,17 @@ final protected static Alleles normalizeAlleles(final VariantContext truthContex
// Truth reference is shorter than call reference
final String suffix = getStringSuffix(callRef, truthRef, "Ref alleles mismatch between: " + truthContext + " and " + callContext);
final int insertIdx = truthRef.length();
truthAllele1 = spliceOrAppendString(truthAllele1, suffix, insertIdx);
truthAllele2 = spliceOrAppendString(truthAllele2, suffix, insertIdx);
truthAllele1 = truthAllele1.equals(Allele.NO_CALL_STRING) ? truthAllele1 : spliceOrAppendString(truthAllele1, suffix, insertIdx);
truthAllele2 = truthAllele2.equals(Allele.NO_CALL_STRING) ? truthAllele2 : spliceOrAppendString(truthAllele2, suffix, insertIdx);
truthRef = truthRef + suffix;

}
else if (truthRef.length() > callRef.length()) {
// call reference is shorter than truth:
final String suffix = getStringSuffix(truthRef, callRef, "Ref alleles mismatch between: " + truthContext + " and " + callContext);
final int insertIdx = callRef.length();
callAllele1 = spliceOrAppendString(callAllele1, suffix, insertIdx);
callAllele2 = spliceOrAppendString(callAllele2, suffix, insertIdx);
callAllele1 = callAllele1.equals(Allele.NO_CALL_STRING) ? callAllele1 : spliceOrAppendString(callAllele1, suffix, insertIdx);
callAllele2 = callAllele2.equals(Allele.NO_CALL_STRING) ? callAllele2 : spliceOrAppendString(callAllele2, suffix, insertIdx);
callRef = callRef + suffix;
}
else {
Expand Down
35 changes: 35 additions & 0 deletions src/test/java/picard/vcf/GenotypeConcordanceTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
package picard.vcf;

import htsjdk.samtools.metrics.MetricsFile;
import htsjdk.samtools.metrics.StringHeader;
import htsjdk.samtools.util.BufferedLineReader;
import htsjdk.samtools.util.IOUtil;
import htsjdk.variant.variantcontext.Allele;
Expand Down Expand Up @@ -100,6 +101,8 @@ public class GenotypeConcordanceTest {

private static final String NORMALIZE_ALLELES_TRUTH = "normalize_alleles_truth.vcf";
private static final String NORMALIZE_ALLELES_CALL = "normalize_alleles_call.vcf";
private static final String NORMALIZE_NO_CALLS_TRUTH = "normalize_no_calls_truth.vcf";
private static final String NORMALIZE_NO_CALLS_CALL = "normalize_no_calls_call.vcf";

@AfterClass
public void tearDown() {
Expand Down Expand Up @@ -572,4 +575,36 @@ public void testNoCallVariants() {

Assert.assertEquals(genotypeConcordance.instanceMain(new String[0]), 0);
}

@Test
public void testNormalizeAllelesForWritingVCF() throws FileNotFoundException {
final File truthVcfPath = new File(TEST_DATA_PATH.getAbsolutePath(), NORMALIZE_NO_CALLS_TRUTH);
final File callVcfPath = new File(TEST_DATA_PATH.getAbsolutePath(), NORMALIZE_NO_CALLS_CALL);
final File outputBaseFileName = new File(OUTPUT_DATA_PATH, "MultipleRefAlleles");
final File outputContingencyMetrics = new File(outputBaseFileName.getAbsolutePath() + GenotypeConcordance.CONTINGENCY_METRICS_FILE_EXTENSION);
outputContingencyMetrics.deleteOnExit();

final GenotypeConcordance genotypeConcordance = new GenotypeConcordance();
genotypeConcordance.TRUTH_VCF = truthVcfPath;
genotypeConcordance.TRUTH_SAMPLE = "truth";
genotypeConcordance.CALL_VCF = callVcfPath;
genotypeConcordance.CALL_SAMPLE = "truth";
genotypeConcordance.OUTPUT = new File(OUTPUT_DATA_PATH, "MultipleRefAlleles");
genotypeConcordance.OUTPUT_VCF = true;

Assert.assertEquals(genotypeConcordance.instanceMain(new String[0]), 0);

final MetricsFile<GenotypeConcordanceContingencyMetrics, Comparable<?>> output = new MetricsFile<GenotypeConcordanceContingencyMetrics, Comparable<?>>();
output.read(new FileReader(outputContingencyMetrics));

for (final GenotypeConcordanceContingencyMetrics metrics : output.getMetrics()) {
if(metrics.VARIANT_TYPE == VariantContext.Type.INDEL){
Assert.assertEquals(metrics.TP_COUNT, 3);
Assert.assertEquals(metrics.TN_COUNT, 3);
Assert.assertEquals(metrics.FP_COUNT, 0);
Assert.assertEquals(metrics.FN_COUNT, 0);
Assert.assertEquals(metrics.EMPTY_COUNT, 2);
}
}
}
}
43 changes: 43 additions & 0 deletions testdata/picard/vcf/normalize_no_calls_call.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
##fileformat=VCFv4.1
##FILTER=<ID=Uncertain,Description="Uncertain genotype due to reason in filter INFO field">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Net Genotype quality across all datasets, defined as difference between most likely and next most likely genotype likelihoods">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Net Genotype across all datasets">
##INFO=<ID=set,Number=1,Type=String,Description="Source VCF for the merged record in CombineVariants">
##contig=<ID=1,length=249250621>
##contig=<ID=2,length=243199373>
##contig=<ID=3,length=198022430>
##contig=<ID=4,length=191154276>
##contig=<ID=5,length=180915260>
##contig=<ID=6,length=171115067>
##contig=<ID=7,length=159138663>
##contig=<ID=8,length=146364022>
##contig=<ID=9,length=141213431>
##contig=<ID=10,length=135534747>
##contig=<ID=11,length=135006516>
##contig=<ID=12,length=133851895>
##contig=<ID=13,length=115169878>
##contig=<ID=14,length=107349540>
##contig=<ID=15,length=102531392>
##contig=<ID=16,length=90354753>
##contig=<ID=17,length=81195210>
##contig=<ID=18,length=78077248>
##contig=<ID=19,length=59128983>
##contig=<ID=20,length=63025520>
##contig=<ID=21,length=48129895>
##contig=<ID=22,length=51304566>
##contig=<ID=X,length=155270560>
##contig=<ID=Y,length=59373566>
##contig=<ID=MT,length=16569>
##fileDate=20130719
##phasing=none
##reference=file:///seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta
##source=SelectVariants
##variants_justified=left
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT truth
1 1 . TCAAGGG TGGG 22132 PASS set=Intersection GT:GQ 0/1:99
1 10 . TCAACAA TCAA 22132 PASS set=Intersection GT:GQ 0/1:99
1 100 . TCAACAA TCAACAAGG 10461 PASS set=Intersection GT:GQ 0/1:99
1 1000 . TCAACAA T 10461 PASS set=Intersection GT:GQ ./.
1 1100 . TCAACAA T 10461 PASS set=Intersection GT:GQ 0/1:99
1 1200 . TCAACAA T 10461 PASS set=Intersection GT:GQ ./.
1 1300 . TCAACAA <ID> 10461 PASS set=Intersection GT:GQ 0/1:99
42 changes: 42 additions & 0 deletions testdata/picard/vcf/normalize_no_calls_truth.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
##fileformat=VCFv4.1
##FILTER=<ID=Uncertain,Description="Uncertain genotype due to reason in filter INFO field">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Net Genotype quality across all datasets, defined as difference between most likely and next most likely genotype likelihoods">
##INFO=<ID=set,Number=1,Type=String,Description="Source VCF for the merged record in CombineVariants">
##contig=<ID=1,length=249250621>
##contig=<ID=2,length=243199373>
##contig=<ID=3,length=198022430>
##contig=<ID=4,length=191154276>
##contig=<ID=5,length=180915260>
##contig=<ID=6,length=171115067>
##contig=<ID=7,length=159138663>
##contig=<ID=8,length=146364022>
##contig=<ID=9,length=141213431>
##contig=<ID=10,length=135534747>
##contig=<ID=11,length=135006516>
##contig=<ID=12,length=133851895>
##contig=<ID=13,length=115169878>
##contig=<ID=14,length=107349540>
##contig=<ID=15,length=102531392>
##contig=<ID=16,length=90354753>
##contig=<ID=17,length=81195210>
##contig=<ID=18,length=78077248>
##contig=<ID=19,length=59128983>
##contig=<ID=20,length=63025520>
##contig=<ID=21,length=48129895>
##contig=<ID=22,length=51304566>
##contig=<ID=X,length=155270560>
##contig=<ID=Y,length=59373566>
##contig=<ID=MT,length=16569>
##fileDate=20130719
##phasing=none
##reference=file:///seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta
##source=SelectVariants
##variants_justified=left
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT truth
1 1 . TCAA T 22132 PASS set=Intersection GT:GQ 0/1:99
1 10 . TCAA T 22132 PASS set=Intersection GT:GQ 0/1:99
1 100 . TCAA TCAAGG 10461 PASS set=Intersection GT:GQ 0/1:99
1 1000 . TCAA T 10461 PASS set=Intersection GT:GQ ./.
1 1100 . TCAA T 10461 PASS set=Intersection GT:GQ ./.
1 1200 . TCAA T 10461 PASS set=Intersection GT:GQ 0/1:99
1 1300 . TCAA <ID> 10461 PASS set=Intersection GT:GQ 0/1:99

0 comments on commit d4a632a

Please sign in to comment.