Skip to content

Commit

Permalink
HTSJDK, gimme a break.
Browse files Browse the repository at this point in the history
  • Loading branch information
fnothaft committed Dec 29, 2016
1 parent 5ec9a84 commit 7c69cd4
Show file tree
Hide file tree
Showing 5 changed files with 28 additions and 9 deletions.
Expand Up @@ -132,9 +132,14 @@ private[adam] object SupportedHeaderLines {
1,
VCFHeaderLineType.Integer,
"Phase set ID")

// note: this is not spec compliant! the VCF spec defines PQ as an Integer, but
// we declare it as a Float here to work around a bug in htsjdk -->
// https://github.com/samtools/htsjdk/issues/751
// htsjdk populates its standard header lines no matter which header lines are
// present in the file, and it is not clear how to disable this...
lazy val phaseQuality = new VCFFormatHeaderLine(VCFConstants.PHASE_QUALITY_KEY,
1,
VCFHeaderLineType.Integer,
VCFHeaderLineType.Float,
"Read-backed phasing quality")
lazy val genotypeFilter = VCFStandardHeaderLines.getFormatLine(
VCFConstants.GENOTYPE_FILTER_KEY)
Expand Down
Expand Up @@ -721,12 +721,23 @@ private[adam] class VariantContextConverter extends Serializable with Logging {
obj.asInstanceOf[java.lang.String]
}

/**
 * Splits a comma-separated string and collapses an all-missing array.
 *
 * An element equal to "." is treated as undefined. If every element is
 * undefined, an empty array is returned. Otherwise, every element must be
 * defined.
 *
 * @param s The comma-separated string to split.
 * @return An empty array if all elements are ".", else the split elements.
 *
 * @throws IllegalArgumentException if defined and undefined elements are mixed.
 */
private def splitAndCheckForEmptyArray(s: String): Array[String] = {
  val fields = s.split(",")
  // a single count answers both questions: "all missing?" and "none missing?"
  val missing = fields.count(_ == ".")

  if (missing == fields.length) {
    Array.empty[String]
  } else {
    require(missing == 0,
      "Array must either be fully defined or fully undefined.")
    fields
  }
}

/**
 * Converts an untyped value into an array of ints.
 *
 * Delegates to tryAndCatchStringCast (defined outside this view) with two
 * conversions: one that casts the value to Array[java.lang.Integer] and
 * unboxes each element, and one that parses a comma-separated string.
 *
 * @param obj The value to convert.
 * @return The values as an Array[Int].
 */
private def toIntArray(obj: java.lang.Object): Array[Int] = {
tryAndCatchStringCast(obj, o => {
o.asInstanceOf[Array[java.lang.Integer]]
.map(i => i: Int)
}, o => {
// NOTE(review): this line and the next look like the before/after versions
// of the same statement; as written, this result is discarded -- confirm.
o.split(",").map(_.toInt)
splitAndCheckForEmptyArray(o).map(_.toInt)
})
}
}

Expand All @@ -735,7 +746,7 @@ private[adam] class VariantContextConverter extends Serializable with Logging {
o.asInstanceOf[Array[java.lang.Character]]
.map(c => c: Char)
}, o => {
o.split(",").map(s => {
splitAndCheckForEmptyArray(o).map(s => {
require(s.length == 1, "Expected character to have length 1.")
s(0)
})
Expand All @@ -747,15 +758,15 @@ private[adam] class VariantContextConverter extends Serializable with Logging {
o.asInstanceOf[Array[java.lang.Float]]
.map(f => f: Float)
}, o => {
o.split(",").map(_.toFloat)
splitAndCheckForEmptyArray(o).map(_.toFloat)
})
}

/**
 * Converts an untyped value into an array of strings.
 *
 * Delegates to tryAndCatchStringCast (defined outside this view) with two
 * conversions: one that casts the value to Array[java.lang.String], and one
 * that splits a comma-separated string.
 *
 * @param obj The value to convert.
 * @return The values as an Array[String].
 */
private def toStringArray(obj: java.lang.Object): Array[String] = {
tryAndCatchStringCast(obj, o => {
o.asInstanceOf[Array[java.lang.String]]
.map(s => s: String)
// NOTE(review): the next two lines look like the before/after versions of the
// same closing line -- confirm which one applies.
}, o => o.split(","))
}, o => splitAndCheckForEmptyArray(o))
}

private def filterArray[T](array: Array[T],
Expand All @@ -773,6 +784,7 @@ private[adam] class VariantContextConverter extends Serializable with Logging {
indices: List[Int]): Option[(String, List[String])] = {
Option(g.getExtendedAttribute(id))
.map(toFn)
.filter(_.nonEmpty)
.map(filterArray(_, indices))
.map(v => (id, v))
}
Expand All @@ -783,6 +795,7 @@ private[adam] class VariantContextConverter extends Serializable with Logging {
idx: Int): Option[(String, String)] = {
Option(g.getExtendedAttribute(id))
.map(toFn)
.filter(_.nonEmpty)
.map(array => (id, array(idx)))
}

Expand Down Expand Up @@ -893,9 +906,9 @@ private[adam] class VariantContextConverter extends Serializable with Logging {
.find(_.getID == key)
.isEmpty) {

None
} else {
Some(lineToExtractor(fl))
} else {
None
}
}
case _ => None
Expand Down
2 changes: 1 addition & 1 deletion adam-core/src/test/resources/sorted.lex.vcf
Expand Up @@ -24,7 +24,7 @@
##FORMAT=<ID=MQ,Number=1,Type=Float,Description="Root mean square (RMS) mapping quality">
##FORMAT=<ID=MQ0,Number=1,Type=Float,Description="Total number of reads with mapping quality=0">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##FORMAT=<ID=PQ,Number=1,Type=Integer,Description="Read-backed phasing quality">
##FORMAT=<ID=PQ,Number=1,Type=Float,Description="Read-backed phasing quality">
##FORMAT=<ID=PS,Number=1,Type=Integer,Description="Phase set ID">
##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.">
##GATKCommandLine=<ID=CombineVariants,Version=2.7-63-gc434461,Date="Mon Oct 14 15:08:05 EDT 2013",Epoch=1381777685067,CommandLineOptions="analysis_type=CombineVariants input_file=[] read_buffer_size=null phone_home=NO_ET gatk_key=/packages/gatk/1.5-21-g979a84a/src/eugene.fluder_mssm.edu.key tag=NA read_filter=[] intervals=[/gs01/projects/ngs/validation/exome/CEPHTrio/2.7/r1-1-1/.queueScatterGather/.qlog/r1-1-1.combined.rawGT.vcf.combine-sg/temp_01_of_20/scatter.intervals] excludeIntervals=null interval_set_rule=UNION interval_merging=ALL interval_padding=0 reference_sequence=/projects/ngs/resources/gatk/2.3/ucsc.hg19.fasta nonDeterministicRandomSeed=false disableDithering=false maxRuntime=-1 maxRuntimeUnits=MINUTES downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=1000 baq=OFF baqGapOpenPenalty=40.0 fix_misencoded_quality_scores=false allow_potentially_misencoded_quality_scores=false useOriginalQualities=false defaultBaseQualities=-1 performanceLog=null BQSR=null quantize_quals=0 disable_indel_quals=false emit_original_quals=false preserve_qscores_less_than=6 globalQScorePrior=-1.0 allow_bqsr_on_reduced_bams_despite_repeated_warnings=false validation_strictness=SILENT remove_program_records=false keep_program_records=false sample_rename_mapping_file=null unsafe=null disable_auto_index_creation_and_locking_when_reading_rods=false num_threads=1 num_cpu_threads_per_data_thread=1 num_io_threads=0 monitorThreadEfficiency=false num_bam_file_handles=null read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false generateShadowBCF=false logging_level=INFO log_to_file=null help=false version=false variant=[(RodBinding name=SNP source=/gs01/projects/ngs/validation/exome/CEPHTrio/2.7/r1-1-1/r1-1-1.recal.SNP.vcf), (RodBinding name=Indel source=/gs01/projects/ngs/validation/exome/CEPHTrio/2.7/r1-1-1/r1-1-1.filt.IND.vcf)] 
out=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub no_cmdline_in_header=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub sites_only=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub bcf=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub genotypemergeoption=UNSORTED filteredrecordsmergetype=KEEP_IF_ANY_UNFILTERED multipleallelesmergetype=BY_TYPE rod_priority_list=null printComplexMerges=false filteredAreUncalled=false minimalVCF=false setKey=null assumeIdenticalSamples=true minimumN=1 suppressCommandLineHeader=false mergeInfoWithMaxAC=false filter_reads_with_N_cigar=false filter_mismatching_base_and_quals=false filter_bases_not_stored=false">
Expand Down
2 changes: 1 addition & 1 deletion adam-core/src/test/resources/sorted.vcf
Expand Up @@ -24,7 +24,7 @@
##FORMAT=<ID=MQ,Number=1,Type=Float,Description="Root mean square (RMS) mapping quality">
##FORMAT=<ID=MQ0,Number=1,Type=Float,Description="Total number of reads with mapping quality=0">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##FORMAT=<ID=PQ,Number=1,Type=Integer,Description="Read-backed phasing quality">
##FORMAT=<ID=PQ,Number=1,Type=Float,Description="Read-backed phasing quality">
##FORMAT=<ID=PS,Number=1,Type=Integer,Description="Phase set ID">
##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.">
##GATKCommandLine=<ID=CombineVariants,Version=2.7-63-gc434461,Date="Mon Oct 14 15:08:05 EDT 2013",Epoch=1381777685067,CommandLineOptions="analysis_type=CombineVariants input_file=[] read_buffer_size=null phone_home=NO_ET gatk_key=/packages/gatk/1.5-21-g979a84a/src/eugene.fluder_mssm.edu.key tag=NA read_filter=[] intervals=[/gs01/projects/ngs/validation/exome/CEPHTrio/2.7/r1-1-1/.queueScatterGather/.qlog/r1-1-1.combined.rawGT.vcf.combine-sg/temp_01_of_20/scatter.intervals] excludeIntervals=null interval_set_rule=UNION interval_merging=ALL interval_padding=0 reference_sequence=/projects/ngs/resources/gatk/2.3/ucsc.hg19.fasta nonDeterministicRandomSeed=false disableDithering=false maxRuntime=-1 maxRuntimeUnits=MINUTES downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=1000 baq=OFF baqGapOpenPenalty=40.0 fix_misencoded_quality_scores=false allow_potentially_misencoded_quality_scores=false useOriginalQualities=false defaultBaseQualities=-1 performanceLog=null BQSR=null quantize_quals=0 disable_indel_quals=false emit_original_quals=false preserve_qscores_less_than=6 globalQScorePrior=-1.0 allow_bqsr_on_reduced_bams_despite_repeated_warnings=false validation_strictness=SILENT remove_program_records=false keep_program_records=false sample_rename_mapping_file=null unsafe=null disable_auto_index_creation_and_locking_when_reading_rods=false num_threads=1 num_cpu_threads_per_data_thread=1 num_io_threads=0 monitorThreadEfficiency=false num_bam_file_handles=null read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false generateShadowBCF=false logging_level=INFO log_to_file=null help=false version=false variant=[(RodBinding name=SNP source=/gs01/projects/ngs/validation/exome/CEPHTrio/2.7/r1-1-1/r1-1-1.recal.SNP.vcf), (RodBinding name=Indel source=/gs01/projects/ngs/validation/exome/CEPHTrio/2.7/r1-1-1/r1-1-1.filt.IND.vcf)] 
out=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub no_cmdline_in_header=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub sites_only=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub bcf=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub genotypemergeoption=UNSORTED filteredrecordsmergetype=KEEP_IF_ANY_UNFILTERED multipleallelesmergetype=BY_TYPE rod_priority_list=null printComplexMerges=false filteredAreUncalled=false minimalVCF=false setKey=null assumeIdenticalSamples=true minimumN=1 suppressCommandLineHeader=false mergeInfoWithMaxAC=false filter_reads_with_N_cigar=false filter_mismatching_base_and_quals=false filter_bases_not_stored=false">
Expand Down
Expand Up @@ -90,6 +90,7 @@ class VariantContextRDDSuite extends ADAMFunSuite {
val path = new File(tempDir, "test_single.vcf")
variants.saveAsVcf(path.getAbsolutePath, asSingleFile = true)
assert(path.exists)
println("%s/test_single.vcf".format(tempDir))
val vcRdd = sc.loadVcf("%s/test_single.vcf".format(tempDir))
assert(vcRdd.rdd.count === 1)
assert(vcRdd.sequences.records.size === 1)
Expand Down

0 comments on commit 7c69cd4

Please sign in to comment.