Skip to content

Commit

Permalink
Merge e20c32c into 1302e07
Browse files Browse the repository at this point in the history
  • Loading branch information
fnothaft committed Sep 29, 2017
2 parents 1302e07 + e20c32c commit ca48ea5
Show file tree
Hide file tree
Showing 4 changed files with 115 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -1110,7 +1110,7 @@ class VariantContextConverter(
tryAndCatchStringCast(attr, attribute => {
vcab.setFisherStrandBiasPValue(attribute.asInstanceOf[java.lang.Float])
}, attribute => {
vcab.setFisherStrandBiasPValue(attribute.toFloat)
vcab.setFisherStrandBiasPValue(toFloat(attribute))
})
}).getOrElse(vcab)
}
Expand All @@ -1124,7 +1124,7 @@ class VariantContextConverter(
tryAndCatchStringCast(attr, attribute => {
vcab.setRmsMapQ(attribute.asInstanceOf[java.lang.Float])
}, attribute => {
vcab.setRmsMapQ(attribute.toFloat)
vcab.setRmsMapQ(toFloat(attribute))
})
}).getOrElse(vcab)
}
Expand Down Expand Up @@ -1226,7 +1226,15 @@ class VariantContextConverter(
private def toFloat(obj: java.lang.Object): Float = {
tryAndCatchStringCast(obj, o => {
o.asInstanceOf[java.lang.Float]
}, o => o.toFloat)
}, o => {
if (o == "+Inf") {
Float.PositiveInfinity
} else if (o == "-Inf") {
Float.NegativeInfinity
} else {
o.toFloat
}
})
}

// don't shadow toString
Expand Down Expand Up @@ -1271,7 +1279,7 @@ class VariantContextConverter(
o.asInstanceOf[Array[java.lang.Float]]
.map(f => f: Float)
}, o => {
splitAndCheckForEmptyArray(o).map(_.toFloat)
splitAndCheckForEmptyArray(o).map(toFloat(_))
})
}

Expand Down
75 changes: 75 additions & 0 deletions adam-core/src/test/resources/inf_float_values.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
##fileformat=VCFv4.2
##FILTER=<ID=IndelFS,Description="FS > 200.0">
##FILTER=<ID=IndelQD,Description="QD < 2.0">
##FILTER=<ID=IndelReadPosRankSum,Description="ReadPosRankSum < -20.0">
##FILTER=<ID=LowQual,Description="Low quality">
##FILTER=<ID=VQSRTrancheSNP99.50to99.60,Description="Truth sensitivity tranche level for SNP model at VQS Lod: -0.5377 <= x < -0.1787">
##FILTER=<ID=VQSRTrancheSNP99.60to99.70,Description="Truth sensitivity tranche level for SNP model at VQS Lod: -1.0634 <= x < -0.5377">
##FILTER=<ID=VQSRTrancheSNP99.70to99.80,Description="Truth sensitivity tranche level for SNP model at VQS Lod: -1.7119 <= x < -1.0634">
##FILTER=<ID=VQSRTrancheSNP99.80to99.90,Description="Truth sensitivity tranche level for SNP model at VQS Lod: -2.3301 <= x < -1.7119">
##FILTER=<ID=VQSRTrancheSNP99.90to99.95,Description="Truth sensitivity tranche level for SNP model at VQS Lod: -2.8169 <= x < -2.3301">
##FILTER=<ID=VQSRTrancheSNP99.95to100.00+,Description="Truth sensitivity tranche level for SNP model at VQS Lod < -1918.1929">
##FILTER=<ID=VQSRTrancheSNP99.95to100.00,Description="Truth sensitivity tranche level for SNP model at VQS Lod: -1918.1929 <= x < -2.8169">
##FILTER=<ID=dp,Description="Insufficient read depth">
##FILTER=<ID=gq,Description="Insufficient genotype quality">
##FILTER=<ID=rd,Description="Insufficient supporting reads">
##FILTER=<ID=sx,Description="Heterozygous sex chromosomes in male sample, or Y chromosome in female sample">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias">
##FORMAT=<ID=FT,Number=.,Type=String,Description="Genotype-level filter">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the gVCF block">
##FORMAT=<ID=MQ,Number=1,Type=Float,Description="Root mean square (RMS) mapping quality">
##FORMAT=<ID=MQ0,Number=1,Type=Float,Description="Total number of reads with mapping quality=0">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##FORMAT=<ID=PQ,Number=1,Type=Float,Description="Read-backed phasing quality">
##FORMAT=<ID=float,Number=1,Type=Float,Description="Float test key">
##FORMAT=<ID=PS,Number=1,Type=Integer,Description="Phase set ID">
##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.">
##GATKCommandLine=<ID=CombineVariants,Version=2.7-63-gc434461,Date="Mon Oct 14 15:08:05 EDT 2013",Epoch=1381777685067,CommandLineOptions="analysis_type=CombineVariants input_file=[] read_buffer_size=null phone_home=NO_ET gatk_key=/packages/gatk/1.5-21-g979a84a/src/eugene.fluder_mssm.edu.key tag=NA read_filter=[] intervals=[/gs01/projects/ngs/validation/exome/CEPHTrio/2.7/r1-1-1/.queueScatterGather/.qlog/r1-1-1.combined.rawGT.vcf.combine-sg/temp_01_of_20/scatter.intervals] excludeIntervals=null interval_set_rule=UNION interval_merging=ALL interval_padding=0 reference_sequence=/projects/ngs/resources/gatk/2.3/ucsc.hg19.fasta nonDeterministicRandomSeed=false disableDithering=false maxRuntime=-1 maxRuntimeUnits=MINUTES downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=1000 baq=OFF baqGapOpenPenalty=40.0 fix_misencoded_quality_scores=false allow_potentially_misencoded_quality_scores=false useOriginalQualities=false defaultBaseQualities=-1 performanceLog=null BQSR=null quantize_quals=0 disable_indel_quals=false emit_original_quals=false preserve_qscores_less_than=6 globalQScorePrior=-1.0 allow_bqsr_on_reduced_bams_despite_repeated_warnings=false validation_strictness=SILENT remove_program_records=false keep_program_records=false sample_rename_mapping_file=null unsafe=null disable_auto_index_creation_and_locking_when_reading_rods=false num_threads=1 num_cpu_threads_per_data_thread=1 num_io_threads=0 monitorThreadEfficiency=false num_bam_file_handles=null read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false generateShadowBCF=false logging_level=INFO log_to_file=null help=false version=false variant=[(RodBinding name=SNP source=/gs01/projects/ngs/validation/exome/CEPHTrio/2.7/r1-1-1/r1-1-1.recal.SNP.vcf), (RodBinding name=Indel source=/gs01/projects/ngs/validation/exome/CEPHTrio/2.7/r1-1-1/r1-1-1.filt.IND.vcf)] out=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub no_cmdline_in_header=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub sites_only=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub bcf=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub genotypemergeoption=UNSORTED filteredrecordsmergetype=KEEP_IF_ANY_UNFILTERED multipleallelesmergetype=BY_TYPE rod_priority_list=null printComplexMerges=false filteredAreUncalled=false minimalVCF=false setKey=null assumeIdenticalSamples=true minimumN=1 suppressCommandLineHeader=false mergeInfoWithMaxAC=false filter_reads_with_N_cigar=false filter_mismatching_base_and_quals=false filter_bases_not_stored=false">
##INFO=<ID=1000G,Number=0,Type=Flag,Description="Membership in 1000 Genomes">
##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral allele">
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count">
##INFO=<ID=AD,Number=R,Type=Integer,Description="Total read depths for each allele">
##INFO=<ID=ADF,Number=R,Type=Integer,Description="Read depths for each allele on the forward strand">
##INFO=<ID=ADR,Number=R,Type=Integer,Description="Read depths for each allele on the reverse strand">
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele frequency for each allele">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
##INFO=<ID=ANN,Number=.,Type=String,Description="Functional annotations: 'Allele | Annotation | Annotation_Impact | Gene_Name | Gene_ID | Feature_Type | Feature_ID | Transcript_BioType | Rank | HGVS.c | HGVS.p | cDNA.pos / cDNA.length | CDS.pos / CDS.length | AA.pos / AA.length | Distance | ERRORS / WARNINGS / INFO'">
##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities">
##INFO=<ID=CIGAR,Number=A,Type=String,Description="Cigar string describing how to align alternate alleles to the reference allele">
##INFO=<ID=ClippingRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref number of hard clipped bases">
##INFO=<ID=DB,Number=0,Type=Flag,Description="Membership in dbSNP">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
##INFO=<ID=DS,Number=0,Type=Flag,Description="Were any of the samples downsampled?">
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
##INFO=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias">
##INFO=<ID=H2,Number=0,Type=Flag,Description="Membership in HapMap2">
##INFO=<ID=H3,Number=0,Type=Flag,Description="Membership in HapMap3">
##INFO=<ID=HaplotypeScore,Number=1,Type=Float,Description="Consistency of the site with at most two segregating haplotypes">
##INFO=<ID=InbreedingCoeff,Number=1,Type=Float,Description="Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation">
##INFO=<ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed">
##INFO=<ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed">
##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
##INFO=<ID=MQ0,Number=1,Type=Integer,Description="Total Mapping Quality Zero Reads">
##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities">
##INFO=<ID=NEGATIVE_TRAIN_SITE,Number=0,Type=Flag,Description="This variant was used to build the negative training set of bad variants">
##INFO=<ID=POSITIVE_TRAIN_SITE,Number=0,Type=Flag,Description="This variant was used to build the positive training set of good variants">
##INFO=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth">
##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias">
##INFO=<ID=SOMATIC,Number=0,Type=Flag,Description="Somatic event">
##INFO=<ID=VALIDATED,Number=0,Type=Flag,Description="Validated by follow-up experiment">
##INFO=<ID=VQSLOD,Number=1,Type=Float,Description="Log odds ratio of being a true variant versus being false under the trained gaussian mixture model">
##INFO=<ID=culprit,Number=1,Type=String,Description="The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out">
##contig=<ID=1,length=249250621>
##contig=<ID=2,length=249250621>
##contig=<ID=13,length=249250621>
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 NA12891 NA12892
1 14397 . CTGT C . IndelQD AC=2;AF=+Inf;AN=6;BaseQRankSum=-Inf;ClippingRankSum=0.138;DP=69;FS=7.786;MLEAC=2;MLEAF=0.333;MQ=26.84;MQ0=0;MQRankSum=-1.906;QD=1.55;ReadPosRankSum=0.384 GT:AD:DP:FT:GQ:PL:MQ:float 0/1:16,4:20:rd:99:120,0,827:-Inf:+Inf 0/1:8,2:10:dp;rd:60:60,0,414:-Inf:+Inf 0/0:39,0:39:PASS:99:0,116,2114:-Inf:+Inf
1 14522 . G A . VQSRTrancheSNP99.95to100.00 AC=2;AF=0.333;AN=6;BaseQRankSum=2.044;ClippingRankSum=-2.196;DP=48;FS=13.179;MLEAC=2;MLEAF=0.333;MQ=25.89;MQ0=0;MQRankSum=-0.063;QD=8.87;ReadPosRankSum=0.952;VQSLOD=-3.333;culprit=MQ GT:AD:DP:FT:GQ:PL 0/1:10,5:15:dp:99:99,0,2147483647 0/1:2,5:7:dp;rd:34:128,0,34 0/0:26,0:26:PASS:78:0,78,783
1 63735 rs201888535 CCTA C . PASS AC=1;AF=0.167;AN=6;BaseQRankSum=1.138;ClippingRankSum=0.448;DB;DP=176;FS=13.597;MLEAC=1;MLEAF=0.167;MQ=31.06;MQ0=0;MQRankSum=0.636;QD=9.98;ReadPosRankSum=-1.18 GT:AD:DP:FT:GQ:PL 0/0:27,0:27:PASS:79:0,79,1425 0/0:40,0:40:PASS:99:0,117,2120 0/1:23,74:97:rd:99:3034,0,942
2 19190 . GC G . PASS AC=3;AF=0.5;AN=6;BaseQRankSum=4.157;ClippingRankSum=3.666;DP=74;FS=37.037;MLEAC=3;MLEAF=0.5;MQ=22.26;MQ0=0;MQRankSum=0.195;QD=16.04;ReadPosRankSum=-4.072 GT:AD:DP:FT:GQ:PL 0/1:8,14:22:PASS:99:416,0,2147483647 0/1:18,13:31:PASS:99:353,0,503 0/1:5,15:20:rd:99:457,0,107
13 752721 rs3131972 A G . PASS AC=6;AF=1.0;AN=6;DB;DP=69;FS=0.0;MLEAC=6;MLEAF=1.0;MQ=60.0;MQ0=0;POSITIVE_TRAIN_SITE;QD=31.67;VQSLOD=18.94;culprit=QD GT:AD:DP:FT:GQ:PL 1/1:0,27:27:PASS:81:1021,81,0 1/1:0,19:19:dp:57:661,57,0 1/1:0,22:22:PASS:66:831,66,0
13 752791 . A G . PASS AC=6;AF=1.0;AN=6;DB;DP=69;FS=0.0;MLEAC=6;MLEAF=1.0;MQ=60.0;MQ0=0;POSITIVE_TRAIN_SITE;QD=31.67;VQSLOD=18.94;culprit=QD GT:AD:DP:FT:GQ:PL:SB 1/1:0,27:27:PASS:81:1021,81,0:0,1,2,3 1/1:0,19:19:dp:57:661,57,0:4,5,6,7 1/1:0,22:22:PASS:66:831,66,0:2,3,4,5
Original file line number Diff line number Diff line change
Expand Up @@ -1484,6 +1484,20 @@ class VariantContextConverterSuite extends ADAMFunSuite {
assert(va.getAlleleFrequency > 0.09f && va.getAlleleFrequency < 0.11f)
}

test("single allele frequency is +Inf going htsjdk->adam") {
val acList: java.util.List[String] = List("+Inf")
val va = buildVariantAnnotation(Map(("AF", acList)),
converter.formatAlleleFrequency)
assert(va.getAlleleFrequency == Float.PositiveInfinity)
}

test("single allele frequency is -Inf going htsjdk->adam") {
val acList: java.util.List[String] = List("-Inf")
val va = buildVariantAnnotation(Map(("AF", acList)),
converter.formatAlleleFrequency)
assert(va.getAlleleFrequency == Float.NegativeInfinity)
}

test("multiple allele frequencies going htsjdk->adam") {
val acList: java.util.List[java.lang.Float] = List(
0.1f, 0.01f, 0.001f).map(i => i: java.lang.Float)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,20 @@ class VariantContextRDDSuite extends ADAMFunSuite {
assert(netAlleleDepths.forall(adall => (adall == "0,0" || adall == "")))
}

sparkTest("support VCFs with +Inf/-Inf float values") {
val path = testFile("inf_float_values.vcf")
val vcs = sc.loadVcf(path, ValidationStringency.LENIENT)
val variant = vcs.toVariantRDD().rdd.filter(_.getStart == 14396L).first()
assert(variant.getAnnotation.getAlleleFrequency === Float.PositiveInfinity)
// -Inf INFO value --> -Infinity after conversion
assert(variant.getAnnotation.getAttributes.get("BaseQRankSum") === "-Infinity")

val genotype = vcs.toGenotypeRDD().rdd.filter(_.getVariant == variant).first()
assert(genotype.getVariantCallingAnnotations.getRmsMapQ === Float.NegativeInfinity)
// +Inf FORMAT value --> Infinity after conversion
assert(genotype.getVariantCallingAnnotations.getAttributes.get("float") === "Infinity")
}

sparkTest("don't lose any variants when piping as VCF") {
val smallVcf = testFile("small.vcf")
val rdd: VariantContextRDD = sc.loadVcf(smallVcf)
Expand Down

0 comments on commit ca48ea5

Please sign in to comment.