Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix SB tag parsing #1203

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Expand Up @@ -69,30 +69,30 @@ class JavaADAMContextSuite extends ADAMFunSuite {
sparkTest("can read and write a small .vcf as genotypes") {
val path = copyResource("small.vcf")
val aRdd = sc.loadGenotypes(path)
assert(aRdd.jrdd.count() === 15)
assert(aRdd.jrdd.count() === 18)

val newRdd = JavaADAMGenotypeConduit.conduit(aRdd, sc)

assert(newRdd.jrdd.count() === 15)
assert(newRdd.jrdd.count() === 18)
}

sparkTest("can read and write a small .vcf as variants") {
val path = copyResource("small.vcf")
val aRdd = sc.loadVariants(path)
assert(aRdd.jrdd.count() === 5)
assert(aRdd.jrdd.count() === 6)

val newRdd = JavaADAMVariantConduit.conduit(aRdd, sc)

assert(newRdd.jrdd.count() === 5)
assert(newRdd.jrdd.count() === 6)
}

ignore("can read and write a small .vcf as annotations") {
val path = copyResource("small.vcf")
val aRdd = sc.loadVariantAnnotations(path)
assert(aRdd.jrdd.count() === 5)
assert(aRdd.jrdd.count() === 6)

val newRdd = JavaADAMAnnotationConduit.conduit(aRdd, sc)

assert(newRdd.jrdd.count() === 5)
assert(newRdd.jrdd.count() === 6)
}
}
Expand Up @@ -17,15 +17,16 @@
*/
package org.bdgenomics.adam.converters

import org.apache.avro.Schema
import org.apache.avro.specific.SpecificRecord
import htsjdk.variant.variantcontext.VariantContext
import htsjdk.variant.vcf._
import org.apache.avro.Schema
import org.apache.avro.specific.SpecificRecord
import org.bdgenomics.formats.avro.{
DatabaseVariantAnnotation,
Genotype,
VariantCallingAnnotations
}
import scala.collection.JavaConversions._

/**
* Singleton object for building AttrKey instances.
Expand Down Expand Up @@ -123,6 +124,17 @@ private[converters] object VariantAnnotationConverter extends Serializable {
case a: String => java.lang.Boolean.valueOf(a)
}

/**
* Converts a java String of comma delimited integers to a
* java.util.List of Integer
*
* @param attr Attribute to convert.
* @return Attribute as a java.util.List[Integer]
*/
private def attrAsIntList(attr: Object): Object = attr match {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a test that covers this function?

case a: java.lang.String => seqAsJavaList(a.split(",").map(java.lang.Integer.valueOf _))
}

/**
* Keys corresponding to the COSMIC mutation database.
*/
Expand Down Expand Up @@ -187,7 +199,7 @@ private[converters] object VariantAnnotationConverter extends Serializable {
AttrKey("phaseQuality", attrAsInt _, new VCFFormatHeaderLine(VCFConstants.PHASE_QUALITY_KEY, 1, VCFHeaderLineType.Float, "Read-backed phasing quality")),
AttrKey("phaseSetId", attrAsInt _, new VCFFormatHeaderLine(VCFConstants.PHASE_SET_KEY, 1, VCFHeaderLineType.Integer, "Phase set")),
AttrKey("minReadDepth", attrAsInt _, new VCFFormatHeaderLine("MIN_DP", 1, VCFHeaderLineType.Integer, "Minimum DP observed within the GVCF block")),
AttrKey("strandBiasComponents", attrAsInt _, new VCFFormatHeaderLine("SB", 4, VCFHeaderLineType.Integer, "Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias."))
AttrKey("strandBiasComponents", attrAsIntList _, new VCFFormatHeaderLine("SB", 4, VCFHeaderLineType.Integer, "Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias."))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you have a doc link for the SB tag being used for genotypes (i.e. as a FORMAT key)? What are the 4 different values? Are there always 4 values?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great, thanks!

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

SB components are a 2x2 matrix: forward/rev strand on one axis, ref/alt on the other axis

)

/**
Expand Down
2 changes: 2 additions & 0 deletions adam-core/src/test/resources/small.vcf
Expand Up @@ -20,6 +20,7 @@
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.">
##GATKCommandLine=<ID=CombineVariants,Version=2.7-63-gc434461,Date="Mon Oct 14 15:08:05 EDT 2013",Epoch=1381777685067,CommandLineOptions="analysis_type=CombineVariants input_file=[] read_buffer_size=null phone_home=NO_ET gatk_key=/packages/gatk/1.5-21-g979a84a/src/eugene.fluder_mssm.edu.key tag=NA read_filter=[] intervals=[/gs01/projects/ngs/validation/exome/CEPHTrio/2.7/r1-1-1/.queueScatterGather/.qlog/r1-1-1.combined.rawGT.vcf.combine-sg/temp_01_of_20/scatter.intervals] excludeIntervals=null interval_set_rule=UNION interval_merging=ALL interval_padding=0 reference_sequence=/projects/ngs/resources/gatk/2.3/ucsc.hg19.fasta nonDeterministicRandomSeed=false disableDithering=false maxRuntime=-1 maxRuntimeUnits=MINUTES downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=1000 baq=OFF baqGapOpenPenalty=40.0 fix_misencoded_quality_scores=false allow_potentially_misencoded_quality_scores=false useOriginalQualities=false defaultBaseQualities=-1 performanceLog=null BQSR=null quantize_quals=0 disable_indel_quals=false emit_original_quals=false preserve_qscores_less_than=6 globalQScorePrior=-1.0 allow_bqsr_on_reduced_bams_despite_repeated_warnings=false validation_strictness=SILENT remove_program_records=false keep_program_records=false sample_rename_mapping_file=null unsafe=null disable_auto_index_creation_and_locking_when_reading_rods=false num_threads=1 num_cpu_threads_per_data_thread=1 num_io_threads=0 monitorThreadEfficiency=false num_bam_file_handles=null read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false generateShadowBCF=false logging_level=INFO log_to_file=null help=false version=false variant=[(RodBinding name=SNP source=/gs01/projects/ngs/validation/exome/CEPHTrio/2.7/r1-1-1/r1-1-1.recal.SNP.vcf), (RodBinding name=Indel source=/gs01/projects/ngs/validation/exome/CEPHTrio/2.7/r1-1-1/r1-1-1.filt.IND.vcf)] out=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub no_cmdline_in_header=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub sites_only=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub bcf=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub genotypemergeoption=UNSORTED filteredrecordsmergetype=KEEP_IF_ANY_UNFILTERED multipleallelesmergetype=BY_TYPE rod_priority_list=null printComplexMerges=false filteredAreUncalled=false minimalVCF=false setKey=null assumeIdenticalSamples=true minimumN=1 suppressCommandLineHeader=false mergeInfoWithMaxAC=false filter_reads_with_N_cigar=false filter_mismatching_base_and_quals=false filter_bases_not_stored=false">
##GATKCommandLine=<ID=GenotypeFiltration,Version=2.7-63-gc434461,Date="Mon Oct 14 15:13:40 EDT 2013",Epoch=1381778020289,CommandLineOptions="analysis_type=GenotypeFiltration input_file=[] read_buffer_size=null phone_home=NO_ET gatk_key=/packages/gatk/1.5-21-g979a84a/src/eugene.fluder_mssm.edu.key tag=NA read_filter=[] intervals=[/gs01/projects/ngs/validation/exome/CEPHTrio/2.7/r1-1-1/.queueScatterGather/.qlog/r1-1-1.combined.vcf.gtfilter-sg/temp_01_of_20/scatter.intervals] excludeIntervals=null interval_set_rule=UNION interval_merging=ALL interval_padding=0 reference_sequence=/projects/ngs/resources/gatk/2.3/ucsc.hg19.fasta nonDeterministicRandomSeed=false disableDithering=false maxRuntime=-1 maxRuntimeUnits=MINUTES downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=1000 baq=OFF baqGapOpenPenalty=40.0 fix_misencoded_quality_scores=false allow_potentially_misencoded_quality_scores=false useOriginalQualities=false defaultBaseQualities=-1 performanceLog=null BQSR=null quantize_quals=0 disable_indel_quals=false emit_original_quals=false preserve_qscores_less_than=6 globalQScorePrior=-1.0 allow_bqsr_on_reduced_bams_despite_repeated_warnings=false validation_strictness=SILENT remove_program_records=false keep_program_records=false sample_rename_mapping_file=null unsafe=null disable_auto_index_creation_and_locking_when_reading_rods=false num_threads=1 num_cpu_threads_per_data_thread=1 num_io_threads=0 monitorThreadEfficiency=false num_bam_file_handles=null read_group_black_list=null pedigree=[/gs01/projects/ngs/validation/exome/CEPHTrio/2.7/r1-1-1/r1-1-1.ped] pedigreeString=[] pedigreeValidationType=SILENT allow_intervals_with_unindexed_bam=false generateShadowBCF=false logging_level=INFO log_to_file=null help=false version=false variant=(RodBinding name=variant source=/gs01/projects/ngs/validation/exome/CEPHTrio/2.7/r1-1-1/r1-1-1.combined.rawGT.vcf) out=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub no_cmdline_in_header=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub sites_only=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub bcf=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub min_GQ=30.0 min_DP=20 min_AF=0.3 max_AF=0.7 maleHaploidIntervals=null femaleNoVarIntervals=null filter_reads_with_N_cigar=false filter_mismatching_base_and_quals=false filter_bases_not_stored=false">
##GATKCommandLine=<ID=HaplotypeCaller,Version=2.7-63-gc434461,Date="Mon Oct 14 12:24:16 EDT 2013",Epoch=1381767856130,CommandLineOptions="analysis_type=HaplotypeCaller input_file=[/gs01/projects/ngs/validation/exome/CEPHTrio/2.7/r1-1-1/r1-1-1.cohort.list] read_buffer_size=null phone_home=NO_ET gatk_key=/packages/gatk/1.5-21-g979a84a/src/eugene.fluder_mssm.edu.key tag=NA read_filter=[] intervals=[/gs01/projects/ngs/validation/exome/CEPHTrio/2.7/r1-1-1/.queueScatterGather/.qlog/r1-1-1.both.vcf.haplotypecall-sg/temp_01_of_20/scatter.intervals] excludeIntervals=null interval_set_rule=UNION interval_merging=ALL interval_padding=0 reference_sequence=/projects/ngs/resources/gatk/2.3/ucsc.hg19.fasta nonDeterministicRandomSeed=false disableDithering=false maxRuntime=-1 maxRuntimeUnits=MINUTES downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=250 baq=OFF baqGapOpenPenalty=40.0 fix_misencoded_quality_scores=false allow_potentially_misencoded_quality_scores=false useOriginalQualities=false defaultBaseQualities=-1 performanceLog=null BQSR=null quantize_quals=0 disable_indel_quals=false emit_original_quals=false preserve_qscores_less_than=6 globalQScorePrior=-1.0 allow_bqsr_on_reduced_bams_despite_repeated_warnings=false validation_strictness=SILENT remove_program_records=false keep_program_records=false sample_rename_mapping_file=null unsafe=null disable_auto_index_creation_and_locking_when_reading_rods=false num_threads=1 num_cpu_threads_per_data_thread=16 num_io_threads=0 monitorThreadEfficiency=false num_bam_file_handles=null read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false generateShadowBCF=false logging_level=INFO log_to_file=null help=false version=false out=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub no_cmdline_in_header=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub sites_only=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub bcf=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub graphOutput=null bamOutput=null bam_compression=null disable_bam_indexing=null generate_md5=null simplifyBAM=null bamWriterType=CALLED_HAPLOTYPES dbsnp=(RodBinding name=dbsnp source=/projects/ngs/resources/gatk/2.3/dbsnp_137.hg19.vcf) comp=[] annotation=[ClippingRankSumTest, DepthPerSampleHC] excludeAnnotation=[SpanningDeletions, TandemRepeatAnnotator] heterozygosity=0.001 indel_heterozygosity=1.25E-4 genotyping_mode=DISCOVERY standard_min_confidence_threshold_for_calling=30.0 standard_min_confidence_threshold_for_emitting=20.0 alleles=(RodBinding name= source=UNBOUND) max_alternate_alleles=6 input_prior=[] contamination_fraction_to_filter=0.0 contamination_fraction_per_sample_file=null p_nonref_model=EXACT_INDEPENDENT exactcallslog=null kmerSize=[10, 25] dontIncreaseKmerSizesForCycles=false numPruningSamples=1 dontRecoverDanglingTails=false emitRefConfidence=NONE GVCFGQBands=[1, 10, 20, 30, 40, 50] indelSizeToEliminateInRefModel=10 minPruning=2 gcpHMM=10 includeUmappedReads=false useAllelesTrigger=false useFilteredReadsForAnnotations=false phredScaledGlobalReadMismappingRate=45 maxNumHaplotypesInPopulation=128 mergeVariantsViaLD=false pair_hmm_implementation=LOGLESS_CACHING keepRG=null justDetermineActiveRegions=false dontGenotype=false errorCorrectKmers=false debug=false debugGraphTransformations=false useLowQualityBasesForAssembly=false dontTrimActiveRegions=false dontUseSoftClippedBases=false captureAssemblyFailureBAM=false allowCyclesInKmerGraphToGeneratePaths=false noFpga=false errorCorrectReads=false kmerLengthForReadErrorCorrection=25 minObservationsForKmerToBeSolid=20 maxDiscARExtension=25 maxGGAARExtension=300 paddingAroundIndels=150 paddingAroundSNPs=20 pcr_indel_model=CONSERVATIVE activityProfileOut=null activeRegionOut=null activeRegionIn=null activeRegionExtension=null forceActive=false activeRegionMaxSize=null bandPassSigma=null min_mapping_quality_score=20 filter_reads_with_N_cigar=false filter_mismatching_base_and_quals=false filter_bases_not_stored=false">
Expand Down Expand Up @@ -53,3 +54,4 @@
1 19190 . GC G 1186.88 PASS AC=3;AF=0.500;AN=6;BaseQRankSum=4.157;ClippingRankSum=3.666;DP=74;FS=37.037;MLEAC=3;MLEAF=0.500;MQ=22.26;MQ0=0;MQRankSum=0.195;QD=16.04;ReadPosRankSum=-4.072 GT:AD:DP:FT:GQ:PL 0/1:8,14:22:PASS:99:416,0,201 0/1:18,13:31:PASS:99:353,0,503 0/1:5,15:20:rd:99:457,0,107
1 63735 rs201888535 CCTA C 2994.09 PASS AC=1;AF=0.167;AN=6;BaseQRankSum=1.138;ClippingRankSum=0.448;DB;DP=176;FS=13.597;MLEAC=1;MLEAF=0.167;MQ=31.06;MQ0=0;MQRankSum=0.636;QD=9.98;ReadPosRankSum=-1.180 GT:AD:DP:FT:GQ:PL 0/0:27,0:27:PASS:79:0,79,1425 0/0:40,0:40:PASS:99:0,117,2120 0/1:23,74:97:rd:99:3034,0,942
1 752721 rs3131972 A G 2486.90 PASS AC=6;AF=1.00;AN=6;DB;DP=69;FS=0.000;MLEAC=6;MLEAF=1.00;MQ=60.00;MQ0=0;POSITIVE_TRAIN_SITE;QD=31.67;VQSLOD=18.94;culprit=QD GT:AD:DP:FT:GQ:PL 1/1:0,27:27:PASS:81:1021,81,0 1/1:0,19:19:dp:57:661,57,0 1/1:0,22:22:PASS:66:831,66,0
1 752791 . A G 2486.90 PASS AC=6;AF=1.00;AN=6;DB;DP=69;FS=0.000;MLEAC=6;MLEAF=1.00;MQ=60.00;MQ0=0;POSITIVE_TRAIN_SITE;QD=31.67;VQSLOD=18.94;culprit=QD GT:AD:DP:FT:GQ:PL:SB 1/1:0,27:27:PASS:81:1021,81,0:0,1,2,3 1/1:0,19:19:dp:57:661,57,0:4,5,6,7 1/1:0,22:22:PASS:66:831,66,0:2,3,4,5
Expand Up @@ -204,4 +204,22 @@ class VariantAnnotationsSuite extends ADAMFunSuite {
assert(te.getDistance == 453)
assert(te.getMessages.isEmpty)
}

test("create java.util.List[Int] from SB tag String value") {
val sb_tagData = "2,3,4,5"
val sb_converter = VariantAnnotationConverter.FORMAT_KEYS
.filter(x => x.adamKey == "strandBiasComponents").head.attrConverter

val sb_parsed = sb_converter(sb_tagData).asInstanceOf[java.util.List[Int]]
val sb_component1 = sb_parsed.get(0)
val sb_component2 = sb_parsed.get(1)
val sb_component3 = sb_parsed.get(2)
val sb_component4 = sb_parsed.get(3)

assert(sb_component1 == 2 &&
sb_component2 == 3 &&
sb_component3 == 4 &&
sb_component4 == 5)

}
}
Expand Up @@ -149,7 +149,7 @@ class ADAMContextSuite extends ADAMFunSuite {
val path = testFile("small.vcf")

val vcs = sc.loadGenotypes(path).toVariantContextRDD.rdd.collect.sortBy(_.position)
assert(vcs.size === 5)
assert(vcs.size === 6)

val vc = vcs.head
assert(vc.genotypes.size === 3)
Expand Down Expand Up @@ -372,7 +372,7 @@ class ADAMContextSuite extends ADAMFunSuite {
val path = testFile("bqsr1.vcf").replace("bqsr1", "*")

val variants = sc.loadVcf(path).toVariantRDD
assert(variants.rdd.count === 691)
assert(variants.rdd.count === 692)
}

sparkTest("load vcf from a directory") {
Expand Down