bigdatagenomics · jpdna · Oct 9, 2016 · Oct 14, 2016 · fnothaft · Oct 9, 2016
diff --git a/adam-apis/src/test/scala/org/bdgenomics/adam/apis/java/JavaADAMContextSuite.scala b/adam-apis/src/test/scala/org/bdgenomics/adam/apis/java/JavaADAMContextSuite.scala
@@ -69,30 +69,30 @@ class JavaADAMContextSuite extends ADAMFunSuite {
   sparkTest("can read and write a small .vcf as genotypes") {
     val path = copyResource("small.vcf")
     val aRdd = sc.loadGenotypes(path)
-    assert(aRdd.jrdd.count() === 15)
+    assert(aRdd.jrdd.count() === 18)
 
     val newRdd = JavaADAMGenotypeConduit.conduit(aRdd, sc)
 
-    assert(newRdd.jrdd.count() === 15)
+    assert(newRdd.jrdd.count() === 18)
   }
 
   sparkTest("can read and write a small .vcf as variants") {
     val path = copyResource("small.vcf")
     val aRdd = sc.loadVariants(path)
-    assert(aRdd.jrdd.count() === 5)
+    assert(aRdd.jrdd.count() === 6)
 
     val newRdd = JavaADAMVariantConduit.conduit(aRdd, sc)
 
-    assert(newRdd.jrdd.count() === 5)
+    assert(newRdd.jrdd.count() === 6)
   }
 
   ignore("can read and write a small .vcf as annotations") {
     val path = copyResource("small.vcf")
     val aRdd = sc.loadVariantAnnotations(path)
-    assert(aRdd.jrdd.count() === 5)
+    assert(aRdd.jrdd.count() === 6)
 
     val newRdd = JavaADAMAnnotationConduit.conduit(aRdd, sc)
 
-    assert(newRdd.jrdd.count() === 5)
+    assert(newRdd.jrdd.count() === 6)
   }
 }
diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/converters/VariantAnnotationConverter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/converters/VariantAnnotationConverter.scala
@@ -17,15 +17,16 @@
  */
 package org.bdgenomics.adam.converters
 
-import org.apache.avro.Schema
-import org.apache.avro.specific.SpecificRecord
 import htsjdk.variant.variantcontext.VariantContext
 import htsjdk.variant.vcf._
+import org.apache.avro.Schema
+import org.apache.avro.specific.SpecificRecord
 import org.bdgenomics.formats.avro.{
   DatabaseVariantAnnotation,
   Genotype,
   VariantCallingAnnotations
 }
+import scala.collection.JavaConversions._
 
 /**
  * Singleton object for building AttrKey instances.
@@ -123,6 +124,17 @@ private[converters] object VariantAnnotationConverter extends Serializable {
     case a: String            => java.lang.Boolean.valueOf(a)
   }
 
+  /**
+   * Converts a java String of comma delimited integers to a
+   * java.util.List of Integer
+   *
+   * @param attr Attribute to convert.
+   * @return Attribute as a java.util.List[Integer]
+   */
+  private def attrAsIntList(attr: Object): Object = attr match {
+    case a: java.lang.String => seqAsJavaList(a.split(",").map(java.lang.Integer.valueOf _))
+  }
+
   /**
    * Keys corresponding to the COSMIC mutation database.
    */
@@ -187,7 +199,7 @@ private[converters] object VariantAnnotationConverter extends Serializable {
     AttrKey("phaseQuality", attrAsInt _, new VCFFormatHeaderLine(VCFConstants.PHASE_QUALITY_KEY, 1, VCFHeaderLineType.Float, "Read-backed phasing quality")),
     AttrKey("phaseSetId", attrAsInt _, new VCFFormatHeaderLine(VCFConstants.PHASE_SET_KEY, 1, VCFHeaderLineType.Integer, "Phase set")),
     AttrKey("minReadDepth", attrAsInt _, new VCFFormatHeaderLine("MIN_DP", 1, VCFHeaderLineType.Integer, "Minimum DP observed within the GVCF block")),
-    AttrKey("strandBiasComponents", attrAsInt _, new VCFFormatHeaderLine("SB", 4, VCFHeaderLineType.Integer, "Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias."))
+    AttrKey("strandBiasComponents", attrAsIntList _, new VCFFormatHeaderLine("SB", 4, VCFHeaderLineType.Integer, "Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias."))
   )
 
   /**

diff --git a/adam-core/src/test/resources/small.vcf b/adam-core/src/test/resources/small.vcf
@@ -20,6 +20,7 @@
 ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
 ##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
+##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.">
 ##GATKCommandLine=<ID=CombineVariants,Version=2.7-63-gc434461,Date="Mon Oct 14 15:08:05 EDT 2013",Epoch=1381777685067,CommandLineOptions="analysis_type=CombineVariants input_file=[] read_buffer_size=null phone_home=NO_ET gatk_key=/packages/gatk/1.5-21-g979a84a/src/eugene.fluder_mssm.edu.key tag=NA read_filter=[] intervals=[/gs01/projects/ngs/validation/exome/CEPHTrio/2.7/r1-1-1/.queueScatterGather/.qlog/r1-1-1.combined.rawGT.vcf.combine-sg/temp_01_of_20/scatter.intervals] excludeIntervals=null interval_set_rule=UNION interval_merging=ALL interval_padding=0 reference_sequence=/projects/ngs/resources/gatk/2.3/ucsc.hg19.fasta nonDeterministicRandomSeed=false disableDithering=false maxRuntime=-1 maxRuntimeUnits=MINUTES downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=1000 baq=OFF baqGapOpenPenalty=40.0 fix_misencoded_quality_scores=false allow_potentially_misencoded_quality_scores=false useOriginalQualities=false defaultBaseQualities=-1 performanceLog=null BQSR=null quantize_quals=0 disable_indel_quals=false emit_original_quals=false preserve_qscores_less_than=6 globalQScorePrior=-1.0 allow_bqsr_on_reduced_bams_despite_repeated_warnings=false validation_strictness=SILENT remove_program_records=false keep_program_records=false sample_rename_mapping_file=null unsafe=null disable_auto_index_creation_and_locking_when_reading_rods=false num_threads=1 num_cpu_threads_per_data_thread=1 num_io_threads=0 monitorThreadEfficiency=false num_bam_file_handles=null read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false generateShadowBCF=false logging_level=INFO log_to_file=null help=false version=false variant=[(RodBinding name=SNP source=/gs01/projects/ngs/validation/exome/CEPHTrio/2.7/r1-1-1/r1-1-1.recal.SNP.vcf), (RodBinding name=Indel source=/gs01/projects/ngs/validation/exome/CEPHTrio/2.7/r1-1-1/r1-1-1.filt.IND.vcf)] out=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub no_cmdline_in_header=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub sites_only=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub bcf=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub genotypemergeoption=UNSORTED filteredrecordsmergetype=KEEP_IF_ANY_UNFILTERED multipleallelesmergetype=BY_TYPE rod_priority_list=null printComplexMerges=false filteredAreUncalled=false minimalVCF=false setKey=null assumeIdenticalSamples=true minimumN=1 suppressCommandLineHeader=false mergeInfoWithMaxAC=false filter_reads_with_N_cigar=false filter_mismatching_base_and_quals=false filter_bases_not_stored=false">
 ##GATKCommandLine=<ID=GenotypeFiltration,Version=2.7-63-gc434461,Date="Mon Oct 14 15:13:40 EDT 2013",Epoch=1381778020289,CommandLineOptions="analysis_type=GenotypeFiltration input_file=[] read_buffer_size=null phone_home=NO_ET gatk_key=/packages/gatk/1.5-21-g979a84a/src/eugene.fluder_mssm.edu.key tag=NA read_filter=[] intervals=[/gs01/projects/ngs/validation/exome/CEPHTrio/2.7/r1-1-1/.queueScatterGather/.qlog/r1-1-1.combined.vcf.gtfilter-sg/temp_01_of_20/scatter.intervals] excludeIntervals=null interval_set_rule=UNION interval_merging=ALL interval_padding=0 reference_sequence=/projects/ngs/resources/gatk/2.3/ucsc.hg19.fasta nonDeterministicRandomSeed=false disableDithering=false maxRuntime=-1 maxRuntimeUnits=MINUTES downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=1000 baq=OFF baqGapOpenPenalty=40.0 fix_misencoded_quality_scores=false allow_potentially_misencoded_quality_scores=false useOriginalQualities=false defaultBaseQualities=-1 performanceLog=null BQSR=null quantize_quals=0 disable_indel_quals=false emit_original_quals=false preserve_qscores_less_than=6 globalQScorePrior=-1.0 allow_bqsr_on_reduced_bams_despite_repeated_warnings=false validation_strictness=SILENT remove_program_records=false keep_program_records=false sample_rename_mapping_file=null unsafe=null disable_auto_index_creation_and_locking_when_reading_rods=false num_threads=1 num_cpu_threads_per_data_thread=1 num_io_threads=0 monitorThreadEfficiency=false num_bam_file_handles=null read_group_black_list=null pedigree=[/gs01/projects/ngs/validation/exome/CEPHTrio/2.7/r1-1-1/r1-1-1.ped] pedigreeString=[] pedigreeValidationType=SILENT allow_intervals_with_unindexed_bam=false generateShadowBCF=false logging_level=INFO log_to_file=null help=false version=false variant=(RodBinding name=variant source=/gs01/projects/ngs/validation/exome/CEPHTrio/2.7/r1-1-1/r1-1-1.combined.rawGT.vcf) out=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub no_cmdline_in_header=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub sites_only=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub bcf=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub min_GQ=30.0 min_DP=20 min_AF=0.3 max_AF=0.7 maleHaploidIntervals=null femaleNoVarIntervals=null filter_reads_with_N_cigar=false filter_mismatching_base_and_quals=false filter_bases_not_stored=false">
 ##GATKCommandLine=<ID=HaplotypeCaller,Version=2.7-63-gc434461,Date="Mon Oct 14 12:24:16 EDT 2013",Epoch=1381767856130,CommandLineOptions="analysis_type=HaplotypeCaller input_file=[/gs01/projects/ngs/validation/exome/CEPHTrio/2.7/r1-1-1/r1-1-1.cohort.list] read_buffer_size=null phone_home=NO_ET gatk_key=/packages/gatk/1.5-21-g979a84a/src/eugene.fluder_mssm.edu.key tag=NA read_filter=[] intervals=[/gs01/projects/ngs/validation/exome/CEPHTrio/2.7/r1-1-1/.queueScatterGather/.qlog/r1-1-1.both.vcf.haplotypecall-sg/temp_01_of_20/scatter.intervals] excludeIntervals=null interval_set_rule=UNION interval_merging=ALL interval_padding=0 reference_sequence=/projects/ngs/resources/gatk/2.3/ucsc.hg19.fasta nonDeterministicRandomSeed=false disableDithering=false maxRuntime=-1 maxRuntimeUnits=MINUTES downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=250 baq=OFF baqGapOpenPenalty=40.0 fix_misencoded_quality_scores=false allow_potentially_misencoded_quality_scores=false useOriginalQualities=false defaultBaseQualities=-1 performanceLog=null BQSR=null quantize_quals=0 disable_indel_quals=false emit_original_quals=false preserve_qscores_less_than=6 globalQScorePrior=-1.0 allow_bqsr_on_reduced_bams_despite_repeated_warnings=false validation_strictness=SILENT remove_program_records=false keep_program_records=false sample_rename_mapping_file=null unsafe=null disable_auto_index_creation_and_locking_when_reading_rods=false num_threads=1 num_cpu_threads_per_data_thread=16 num_io_threads=0 monitorThreadEfficiency=false num_bam_file_handles=null read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false generateShadowBCF=false logging_level=INFO log_to_file=null help=false version=false out=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub no_cmdline_in_header=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub sites_only=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub bcf=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub graphOutput=null bamOutput=null bam_compression=null disable_bam_indexing=null generate_md5=null simplifyBAM=null bamWriterType=CALLED_HAPLOTYPES dbsnp=(RodBinding name=dbsnp source=/projects/ngs/resources/gatk/2.3/dbsnp_137.hg19.vcf) comp=[] annotation=[ClippingRankSumTest, DepthPerSampleHC] excludeAnnotation=[SpanningDeletions, TandemRepeatAnnotator] heterozygosity=0.001 indel_heterozygosity=1.25E-4 genotyping_mode=DISCOVERY standard_min_confidence_threshold_for_calling=30.0 standard_min_confidence_threshold_for_emitting=20.0 alleles=(RodBinding name= source=UNBOUND) max_alternate_alleles=6 input_prior=[] contamination_fraction_to_filter=0.0 contamination_fraction_per_sample_file=null p_nonref_model=EXACT_INDEPENDENT exactcallslog=null kmerSize=[10, 25] dontIncreaseKmerSizesForCycles=false numPruningSamples=1 dontRecoverDanglingTails=false emitRefConfidence=NONE GVCFGQBands=[1, 10, 20, 30, 40, 50] indelSizeToEliminateInRefModel=10 minPruning=2 gcpHMM=10 includeUmappedReads=false useAllelesTrigger=false useFilteredReadsForAnnotations=false phredScaledGlobalReadMismappingRate=45 maxNumHaplotypesInPopulation=128 mergeVariantsViaLD=false pair_hmm_implementation=LOGLESS_CACHING keepRG=null justDetermineActiveRegions=false dontGenotype=false errorCorrectKmers=false debug=false debugGraphTransformations=false useLowQualityBasesForAssembly=false dontTrimActiveRegions=false dontUseSoftClippedBases=false captureAssemblyFailureBAM=false allowCyclesInKmerGraphToGeneratePaths=false noFpga=false errorCorrectReads=false kmerLengthForReadErrorCorrection=25 minObservationsForKmerToBeSolid=20 maxDiscARExtension=25 maxGGAARExtension=300 paddingAroundIndels=150 paddingAroundSNPs=20 pcr_indel_model=CONSERVATIVE activityProfileOut=null activeRegionOut=null activeRegionIn=null activeRegionExtension=null forceActive=false activeRegionMaxSize=null bandPassSigma=null min_mapping_quality_score=20 filter_reads_with_N_cigar=false filter_mismatching_base_and_quals=false filter_bases_not_stored=false">
@@ -53,3 +54,4 @@
 1	19190	.	GC	G	1186.88	PASS	AC=3;AF=0.500;AN=6;BaseQRankSum=4.157;ClippingRankSum=3.666;DP=74;FS=37.037;MLEAC=3;MLEAF=0.500;MQ=22.26;MQ0=0;MQRankSum=0.195;QD=16.04;ReadPosRankSum=-4.072	GT:AD:DP:FT:GQ:PL	0/1:8,14:22:PASS:99:416,0,201	0/1:18,13:31:PASS:99:353,0,503	0/1:5,15:20:rd:99:457,0,107
 1	63735	rs201888535	CCTA	C	2994.09	PASS	AC=1;AF=0.167;AN=6;BaseQRankSum=1.138;ClippingRankSum=0.448;DB;DP=176;FS=13.597;MLEAC=1;MLEAF=0.167;MQ=31.06;MQ0=0;MQRankSum=0.636;QD=9.98;ReadPosRankSum=-1.180	GT:AD:DP:FT:GQ:PL	0/0:27,0:27:PASS:79:0,79,1425	0/0:40,0:40:PASS:99:0,117,2120	0/1:23,74:97:rd:99:3034,0,942
 1	752721	rs3131972	A	G	2486.90	PASS	AC=6;AF=1.00;AN=6;DB;DP=69;FS=0.000;MLEAC=6;MLEAF=1.00;MQ=60.00;MQ0=0;POSITIVE_TRAIN_SITE;QD=31.67;VQSLOD=18.94;culprit=QD	GT:AD:DP:FT:GQ:PL	1/1:0,27:27:PASS:81:1021,81,0	1/1:0,19:19:dp:57:661,57,0	1/1:0,22:22:PASS:66:831,66,0
+1	752791	.	A	G	2486.90	PASS	AC=6;AF=1.00;AN=6;DB;DP=69;FS=0.000;MLEAC=6;MLEAF=1.00;MQ=60.00;MQ0=0;POSITIVE_TRAIN_SITE;QD=31.67;VQSLOD=18.94;culprit=QD	GT:AD:DP:FT:GQ:PL:SB	1/1:0,27:27:PASS:81:1021,81,0:0,1,2,3	1/1:0,19:19:dp:57:661,57,0:4,5,6,7	1/1:0,22:22:PASS:66:831,66,0:2,3,4,5
diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/converters/VariantAnnotationsSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/converters/VariantAnnotationsSuite.scala
@@ -204,4 +204,22 @@ class VariantAnnotationsSuite extends ADAMFunSuite {
     assert(te.getDistance == 453)
     assert(te.getMessages.isEmpty)
   }
+
+  test("create java.util.List[Int] from SB tag String value") {
+    val sb_tagData = "2,3,4,5"
+    val sb_converter = VariantAnnotationConverter.FORMAT_KEYS
+      .filter(x => x.adamKey == "strandBiasComponents").head.attrConverter
+
+    val sb_parsed = sb_converter(sb_tagData).asInstanceOf[java.util.List[Int]]
+    val sb_component1 = sb_parsed.get(0)
+    val sb_component2 = sb_parsed.get(1)
+    val sb_component3 = sb_parsed.get(2)
+    val sb_component4 = sb_parsed.get(3)
+
+    assert(sb_component1 == 2 &&
+      sb_component2 == 3 &&
+      sb_component3 == 4 &&
+      sb_component4 == 5)
+
+  }
 }
diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala
@@ -149,7 +149,7 @@ class ADAMContextSuite extends ADAMFunSuite {
     val path = testFile("small.vcf")
 
     val vcs = sc.loadGenotypes(path).toVariantContextRDD.rdd.collect.sortBy(_.position)
-    assert(vcs.size === 5)
+    assert(vcs.size === 6)
 
     val vc = vcs.head
     assert(vc.genotypes.size === 3)
@@ -372,7 +372,7 @@ class ADAMContextSuite extends ADAMFunSuite {
     val path = testFile("bqsr1.vcf").replace("bqsr1", "*")
 
     val variants = sc.loadVcf(path).toVariantRDD
-    assert(variants.rdd.count === 691)
+    assert(variants.rdd.count === 692)
   }
 
   sparkTest("load vcf from a directory") {