Merge e20c32c into 1302e07

bigdatagenomics · Sep 29, 2017 · ca48ea5 · ca48ea5
2 parents 1302e07 + e20c32c
commit ca48ea5
Show file tree

Hide file tree

Showing 4 changed files with 115 additions and 4 deletions.
diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/converters/VariantContextConverter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/converters/VariantContextConverter.scala
@@ -1110,7 +1110,7 @@ class VariantContextConverter(
         tryAndCatchStringCast(attr, attribute => {
           vcab.setFisherStrandBiasPValue(attribute.asInstanceOf[java.lang.Float])
         }, attribute => {
-          vcab.setFisherStrandBiasPValue(attribute.toFloat)
+          vcab.setFisherStrandBiasPValue(toFloat(attribute))
         })
       }).getOrElse(vcab)
   }
@@ -1124,7 +1124,7 @@ class VariantContextConverter(
         tryAndCatchStringCast(attr, attribute => {
           vcab.setRmsMapQ(attribute.asInstanceOf[java.lang.Float])
         }, attribute => {
-          vcab.setRmsMapQ(attribute.toFloat)
+          vcab.setRmsMapQ(toFloat(attribute))
         })
       }).getOrElse(vcab)
   }
@@ -1226,7 +1226,15 @@ class VariantContextConverter(
   private def toFloat(obj: java.lang.Object): Float = {
     tryAndCatchStringCast(obj, o => {
       o.asInstanceOf[java.lang.Float]
-    }, o => o.toFloat)
+    }, o => {
+      if (o == "+Inf") {
+        Float.PositiveInfinity
+      } else if (o == "-Inf") {
+        Float.NegativeInfinity
+      } else {
+        o.toFloat
+      }
+    })
   }
 
   // don't shadow toString
@@ -1271,7 +1279,7 @@ class VariantContextConverter(
       o.asInstanceOf[Array[java.lang.Float]]
         .map(f => f: Float)
     }, o => {
-      splitAndCheckForEmptyArray(o).map(_.toFloat)
+      splitAndCheckForEmptyArray(o).map(toFloat(_))
     })
   }
 

diff --git a/adam-core/src/test/resources/inf_float_values.vcf b/adam-core/src/test/resources/inf_float_values.vcf
@@ -0,0 +1,75 @@
+##fileformat=VCFv4.2
+##FILTER=<ID=IndelFS,Description="FS > 200.0">
+##FILTER=<ID=IndelQD,Description="QD < 2.0">
+##FILTER=<ID=IndelReadPosRankSum,Description="ReadPosRankSum < -20.0">
+##FILTER=<ID=LowQual,Description="Low quality">
+##FILTER=<ID=VQSRTrancheSNP99.50to99.60,Description="Truth sensitivity tranche level for SNP model at VQS Lod: -0.5377 <= x < -0.1787">
+##FILTER=<ID=VQSRTrancheSNP99.60to99.70,Description="Truth sensitivity tranche level for SNP model at VQS Lod: -1.0634 <= x < -0.5377">
+##FILTER=<ID=VQSRTrancheSNP99.70to99.80,Description="Truth sensitivity tranche level for SNP model at VQS Lod: -1.7119 <= x < -1.0634">
+##FILTER=<ID=VQSRTrancheSNP99.80to99.90,Description="Truth sensitivity tranche level for SNP model at VQS Lod: -2.3301 <= x < -1.7119">
+##FILTER=<ID=VQSRTrancheSNP99.90to99.95,Description="Truth sensitivity tranche level for SNP model at VQS Lod: -2.8169 <= x < -2.3301">
+##FILTER=<ID=VQSRTrancheSNP99.95to100.00+,Description="Truth sensitivity tranche level for SNP model at VQS Lod < -1918.1929">
+##FILTER=<ID=VQSRTrancheSNP99.95to100.00,Description="Truth sensitivity tranche level for SNP model at VQS Lod: -1918.1929 <= x < -2.8169">
+##FILTER=<ID=dp,Description="Insufficient read depth">
+##FILTER=<ID=gq,Description="Insufficient genotype quality">
+##FILTER=<ID=rd,Description="Insufficient supporting reads">
+##FILTER=<ID=sx,Description="Heterozygous sex chromosomes in male sample, or Y chromosome in female sample">
+##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
+##FORMAT=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias">
+##FORMAT=<ID=FT,Number=.,Type=String,Description="Genotype-level filter">
+##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the gVCF block">
+##FORMAT=<ID=MQ,Number=1,Type=Float,Description="Root mean square (RMS) mapping quality">
+##FORMAT=<ID=MQ0,Number=1,Type=Float,Description="Total number of reads with mapping quality=0">
+##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
+##FORMAT=<ID=PQ,Number=1,Type=Float,Description="Read-backed phasing quality">
+##FORMAT=<ID=float,Number=1,Type=Float,Description="Float test key">
+##FORMAT=<ID=PS,Number=1,Type=Integer,Description="Phase set ID">
+##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.">
+##GATKCommandLine=<ID=CombineVariants,Version=2.7-63-gc434461,Date="Mon Oct 14 15:08:05 EDT 2013",Epoch=1381777685067,CommandLineOptions="analysis_type=CombineVariants input_file=[] read_buffer_size=null phone_home=NO_ET gatk_key=/packages/gatk/1.5-21-g979a84a/src/eugene.fluder_mssm.edu.key tag=NA read_filter=[] intervals=[/gs01/projects/ngs/validation/exome/CEPHTrio/2.7/r1-1-1/.queueScatterGather/.qlog/r1-1-1.combined.rawGT.vcf.combine-sg/temp_01_of_20/scatter.intervals] excludeIntervals=null interval_set_rule=UNION interval_merging=ALL interval_padding=0 reference_sequence=/projects/ngs/resources/gatk/2.3/ucsc.hg19.fasta nonDeterministicRandomSeed=false disableDithering=false maxRuntime=-1 maxRuntimeUnits=MINUTES downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=1000 baq=OFF baqGapOpenPenalty=40.0 fix_misencoded_quality_scores=false allow_potentially_misencoded_quality_scores=false useOriginalQualities=false defaultBaseQualities=-1 performanceLog=null BQSR=null quantize_quals=0 disable_indel_quals=false emit_original_quals=false preserve_qscores_less_than=6 globalQScorePrior=-1.0 allow_bqsr_on_reduced_bams_despite_repeated_warnings=false validation_strictness=SILENT remove_program_records=false keep_program_records=false sample_rename_mapping_file=null unsafe=null disable_auto_index_creation_and_locking_when_reading_rods=false num_threads=1 num_cpu_threads_per_data_thread=1 num_io_threads=0 monitorThreadEfficiency=false num_bam_file_handles=null read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false generateShadowBCF=false logging_level=INFO log_to_file=null help=false version=false variant=[(RodBinding name=SNP source=/gs01/projects/ngs/validation/exome/CEPHTrio/2.7/r1-1-1/r1-1-1.recal.SNP.vcf), (RodBinding name=Indel source=/gs01/projects/ngs/validation/exome/CEPHTrio/2.7/r1-1-1/r1-1-1.filt.IND.vcf)] out=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub no_cmdline_in_header=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub sites_only=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub bcf=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub genotypemergeoption=UNSORTED filteredrecordsmergetype=KEEP_IF_ANY_UNFILTERED multipleallelesmergetype=BY_TYPE rod_priority_list=null printComplexMerges=false filteredAreUncalled=false minimalVCF=false setKey=null assumeIdenticalSamples=true minimumN=1 suppressCommandLineHeader=false mergeInfoWithMaxAC=false filter_reads_with_N_cigar=false filter_mismatching_base_and_quals=false filter_bases_not_stored=false">
+##INFO=<ID=1000G,Number=0,Type=Flag,Description="Membership in 1000 Genomes">
+##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral allele">
+##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count">
+##INFO=<ID=AD,Number=R,Type=Integer,Description="Total read depths for each allele">
+##INFO=<ID=ADF,Number=R,Type=Integer,Description="Read depths for each allele on the forward strand">
+##INFO=<ID=ADR,Number=R,Type=Integer,Description="Read depths for each allele on the reverse strand">
+##INFO=<ID=AF,Number=A,Type=Float,Description="Allele frequency for each allele">
+##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
+##INFO=<ID=ANN,Number=.,Type=String,Description="Functional annotations: 'Allele | Annotation | Annotation_Impact | Gene_Name | Gene_ID | Feature_Type | Feature_ID | Transcript_BioType | Rank | HGVS.c | HGVS.p | cDNA.pos / cDNA.length | CDS.pos / CDS.length | AA.pos / AA.length | Distance | ERRORS / WARNINGS / INFO'">
+##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities">
+##INFO=<ID=CIGAR,Number=A,Type=String,Description="Cigar string describing how to align alternate alleles to the reference allele">
+##INFO=<ID=ClippingRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref number of hard clipped bases">
+##INFO=<ID=DB,Number=0,Type=Flag,Description="Membership in dbSNP">
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
+##INFO=<ID=DS,Number=0,Type=Flag,Description="Were any of the samples downsampled?">
+##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
+##INFO=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias">
+##INFO=<ID=H2,Number=0,Type=Flag,Description="Membership in HapMap2">
+##INFO=<ID=H3,Number=0,Type=Flag,Description="Membership in HapMap3">
+##INFO=<ID=HaplotypeScore,Number=1,Type=Float,Description="Consistency of the site with at most two segregating haplotypes">
+##INFO=<ID=InbreedingCoeff,Number=1,Type=Float,Description="Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation">
+##INFO=<ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed">
+##INFO=<ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed">
+##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
+##INFO=<ID=MQ0,Number=1,Type=Integer,Description="Total Mapping Quality Zero Reads">
+##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities">
+##INFO=<ID=NEGATIVE_TRAIN_SITE,Number=0,Type=Flag,Description="This variant was used to build the negative training set of bad variants">
+##INFO=<ID=POSITIVE_TRAIN_SITE,Number=0,Type=Flag,Description="This variant was used to build the positive training set of good variants">
+##INFO=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth">
+##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias">
+##INFO=<ID=SOMATIC,Number=0,Type=Flag,Description="Somatic event">
+##INFO=<ID=VALIDATED,Number=0,Type=Flag,Description="Validated by follow-up experiment">
+##INFO=<ID=VQSLOD,Number=1,Type=Float,Description="Log odds ratio of being a true variant versus being false under the trained gaussian mixture model">
+##INFO=<ID=culprit,Number=1,Type=String,Description="The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out">
+##contig=<ID=1,length=249250621>
+##contig=<ID=2,length=249250621>
+##contig=<ID=13,length=249250621>
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	NA12878	NA12891	NA12892
+1	14397	.	CTGT	C	.	IndelQD	AC=2;AF=+Inf;AN=6;BaseQRankSum=-Inf;ClippingRankSum=0.138;DP=69;FS=7.786;MLEAC=2;MLEAF=0.333;MQ=26.84;MQ0=0;MQRankSum=-1.906;QD=1.55;ReadPosRankSum=0.384	GT:AD:DP:FT:GQ:PL:MQ:float	0/1:16,4:20:rd:99:120,0,827:-Inf:+Inf	0/1:8,2:10:dp;rd:60:60,0,414:-Inf:+Inf	0/0:39,0:39:PASS:99:0,116,2114:-Inf:+Inf
+1	14522	.	G	A	.	VQSRTrancheSNP99.95to100.00	AC=2;AF=0.333;AN=6;BaseQRankSum=2.044;ClippingRankSum=-2.196;DP=48;FS=13.179;MLEAC=2;MLEAF=0.333;MQ=25.89;MQ0=0;MQRankSum=-0.063;QD=8.87;ReadPosRankSum=0.952;VQSLOD=-3.333;culprit=MQ	GT:AD:DP:FT:GQ:PL	0/1:10,5:15:dp:99:99,0,2147483647	0/1:2,5:7:dp;rd:34:128,0,34	0/0:26,0:26:PASS:78:0,78,783
+1	63735	rs201888535	CCTA	C	.	PASS	AC=1;AF=0.167;AN=6;BaseQRankSum=1.138;ClippingRankSum=0.448;DB;DP=176;FS=13.597;MLEAC=1;MLEAF=0.167;MQ=31.06;MQ0=0;MQRankSum=0.636;QD=9.98;ReadPosRankSum=-1.18	GT:AD:DP:FT:GQ:PL	0/0:27,0:27:PASS:79:0,79,1425	0/0:40,0:40:PASS:99:0,117,2120	0/1:23,74:97:rd:99:3034,0,942
+2	19190	.	GC	G	.	PASS	AC=3;AF=0.5;AN=6;BaseQRankSum=4.157;ClippingRankSum=3.666;DP=74;FS=37.037;MLEAC=3;MLEAF=0.5;MQ=22.26;MQ0=0;MQRankSum=0.195;QD=16.04;ReadPosRankSum=-4.072	GT:AD:DP:FT:GQ:PL	0/1:8,14:22:PASS:99:416,0,2147483647	0/1:18,13:31:PASS:99:353,0,503	0/1:5,15:20:rd:99:457,0,107
+13	752721	rs3131972	A	G	.	PASS	AC=6;AF=1.0;AN=6;DB;DP=69;FS=0.0;MLEAC=6;MLEAF=1.0;MQ=60.0;MQ0=0;POSITIVE_TRAIN_SITE;QD=31.67;VQSLOD=18.94;culprit=QD	GT:AD:DP:FT:GQ:PL	1/1:0,27:27:PASS:81:1021,81,0	1/1:0,19:19:dp:57:661,57,0	1/1:0,22:22:PASS:66:831,66,0
+13	752791	.	A	G	.	PASS	AC=6;AF=1.0;AN=6;DB;DP=69;FS=0.0;MLEAC=6;MLEAF=1.0;MQ=60.0;MQ0=0;POSITIVE_TRAIN_SITE;QD=31.67;VQSLOD=18.94;culprit=QD	GT:AD:DP:FT:GQ:PL:SB	1/1:0,27:27:PASS:81:1021,81,0:0,1,2,3	1/1:0,19:19:dp:57:661,57,0:4,5,6,7	1/1:0,22:22:PASS:66:831,66,0:2,3,4,5
diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/converters/VariantContextConverterSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/converters/VariantContextConverterSuite.scala
@@ -1484,6 +1484,20 @@ class VariantContextConverterSuite extends ADAMFunSuite {
     assert(va.getAlleleFrequency > 0.09f && va.getAlleleFrequency < 0.11f)
   }
 
+  test("single allele frequency is +Inf going htsjdk->adam") {
+    val acList: java.util.List[String] = List("+Inf")
+    val va = buildVariantAnnotation(Map(("AF", acList)),
+      converter.formatAlleleFrequency)
+    assert(va.getAlleleFrequency == Float.PositiveInfinity)
+  }
+
+  test("single allele frequency is -Inf going htsjdk->adam") {
+    val acList: java.util.List[String] = List("-Inf")
+    val va = buildVariantAnnotation(Map(("AF", acList)),
+      converter.formatAlleleFrequency)
+    assert(va.getAlleleFrequency == Float.NegativeInfinity)
+  }
+
   test("multiple allele frequencies going htsjdk->adam") {
     val acList: java.util.List[java.lang.Float] = List(
       0.1f, 0.01f, 0.001f).map(i => i: java.lang.Float)

diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantContextRDDSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantContextRDDSuite.scala
@@ -161,6 +161,20 @@ class VariantContextRDDSuite extends ADAMFunSuite {
     assert(netAlleleDepths.forall(adall => (adall == "0,0" || adall == "")))
   }
 
+  sparkTest("support VCFs with +Inf/-Inf float values") {
+    val path = testFile("inf_float_values.vcf")
+    val vcs = sc.loadVcf(path, ValidationStringency.LENIENT)
+    val variant = vcs.toVariantRDD().rdd.filter(_.getStart == 14396L).first()
+    assert(variant.getAnnotation.getAlleleFrequency === Float.PositiveInfinity)
+    // -Inf INFO value --> -Infinity after conversion
+    assert(variant.getAnnotation.getAttributes.get("BaseQRankSum") === "-Infinity")
+
+    val genotype = vcs.toGenotypeRDD().rdd.filter(_.getVariant == variant).first()
+    assert(genotype.getVariantCallingAnnotations.getRmsMapQ === Float.NegativeInfinity)
+    // +Inf FORMAT value --> Infinity after conversion
+    assert(genotype.getVariantCallingAnnotations.getAttributes.get("float") === "Infinity")
+  }
+
   sparkTest("don't lose any variants when piping as VCF") {
     val smallVcf = testFile("small.vcf")
     val rdd: VariantContextRDD = sc.loadVcf(smallVcf)