Skip to content

Commit

Permalink
[ADAM-1676] Add more finely grained validation for INFO/FORMAT fields.
Browse files Browse the repository at this point in the history
Resolves #1676. Pushes validation checking down to the single field conversion
level, which keeps a variant/genotype record around even if a bad INFO/FORMAT
field was attached to that record.
  • Loading branch information
fnothaft committed Sep 13, 2017
1 parent 9cb0c76 commit 84b5064
Show file tree
Hide file tree
Showing 3 changed files with 155 additions and 15 deletions.
Expand Up @@ -1582,7 +1582,22 @@ class VariantContextConverter(
.map(fn => {
fn(vc, _: Variant.Builder)
})
val converted = boundFns.foldLeft(variantBuilder)((vb: Variant.Builder, fn) => fn(vb))
val converted = boundFns.foldLeft(variantBuilder)((vb: Variant.Builder, fn) => {
try {
fn(vb)
} catch {
case t: Throwable => {
if (stringency == ValidationStringency.STRICT) {
throw t
} else {
if (stringency == ValidationStringency.LENIENT) {
log.warn("Converting variant field from %s with function %s line %s failed: %s".format(vc, fn, t))
}
vb
}
}
}
})

val variant = variantBuilder.build
val variantAnnotationBuilder = VariantAnnotation.newBuilder
Expand All @@ -1592,7 +1607,22 @@ class VariantContextConverter(
fn(vc, _: VariantAnnotation.Builder, variant, alleleIdx)
})
val convertedAnnotation = boundAnnotationFns.foldLeft(variantAnnotationBuilder)(
(vab: VariantAnnotation.Builder, fn) => fn(vab))
(vab: VariantAnnotation.Builder, fn) => {
try {
fn(vab)
} catch {
case t: Throwable => {
if (stringency == ValidationStringency.STRICT) {
throw t
} else {
if (stringency == ValidationStringency.LENIENT) {
log.warn("Generating annotation tag from line %s with function %sfailed: %s".format(vab, fn, t))
}
vab
}
}
}
})

val indices = Array.empty[Int]

Expand Down Expand Up @@ -1680,7 +1710,23 @@ class VariantContextConverter(
.map(fn => {
fn(g, _: Genotype.Builder, alleleIdx, indices)
})
val convertedCore = boundFns.foldLeft(builder)((gb: Genotype.Builder, fn) => fn(gb))
val convertedCore = boundFns.foldLeft(builder)((gb: Genotype.Builder, fn) => {
try {
fn(gb)
} catch {
case t: Throwable => {
if (stringency == ValidationStringency.STRICT) {
throw t
} else {
if (stringency == ValidationStringency.LENIENT) {
log.warn("Converting genotype field in %s with function %s failed: %s".format(
g, fn, t))
}
gb
}
}
}
})

// if we have a non-ref allele, fold and build
val coreWithOptNonRefs = nonRefIndex.fold(convertedCore)(nonRefAllele => {
Expand All @@ -1699,7 +1745,23 @@ class VariantContextConverter(
fn(g, _: VariantCallingAnnotations.Builder, alleleIdx, indices)
})
val convertedAnnotations = boundAnnotationFns.foldLeft(vcAnns)(
(vcab: VariantCallingAnnotations.Builder, fn) => fn(vcab))
(vcab: VariantCallingAnnotations.Builder, fn) => {
try {
fn(vcab)
} catch {
case t: Throwable => {
if (stringency == ValidationStringency.STRICT) {
throw t
} else {
if (stringency == ValidationStringency.LENIENT) {
log.warn("Converting genotype annotation field in %s with function %s failed: %s".format(
g, fn, t))
}
vcab
}
}
}
})

// pull out the attribute map and process
val attrMap = attributeFns.flatMap(fn => fn(g, alleleIdx, indices))
Expand Down Expand Up @@ -2013,7 +2075,22 @@ class VariantContextConverter(

// bind the conversion functions and fold
val convertedWithVariants = variantExtractFns.foldLeft(builder)(
(vcb: VariantContextBuilder, fn) => fn(v, vcb))
(vcb: VariantContextBuilder, fn) => {
try {
fn(v, vcb)
} catch {
case t: Throwable => {
if (stringency == ValidationStringency.STRICT) {
throw t
} else {
if (stringency == ValidationStringency.LENIENT) {
log.warn("Applying extraction function %s to %s failed with %s.".format(fn, v, t))
}
vcb
}
}
}
})

// extract from annotations, if present
val convertedWithAttrs = Option(v.getAnnotation)
Expand All @@ -2028,7 +2105,20 @@ class VariantContextConverter(
attributeFns.foldLeft(convertedWithAnnotations)((vcb: VariantContextBuilder, fn) => {
val optAttrPair = fn(attributes)
optAttrPair.fold(vcb)(pair => {
vcb.attribute(pair._1, pair._2)
try {
vcb.attribute(pair._1, pair._2)
} catch {
case t: Throwable => {
if (stringency == ValidationStringency.STRICT) {
throw t
} else {
if (stringency == ValidationStringency.LENIENT) {
log.warn("Applying annotation extraction function %s to %s failed with %s.".format(fn, v, t))
}
vcb
}
}
}
})
})
})
Expand Down Expand Up @@ -2084,26 +2174,72 @@ class VariantContextConverter(

// bind the conversion functions and fold
val convertedCore = genotypeExtractFns.foldLeft(builder)(
(gb: GenotypeBuilder, fn) => fn(g, gb))
(gb: GenotypeBuilder, fn) => {
try {
fn(g, gb)
} catch {
case t: Throwable => {
if (stringency == ValidationStringency.STRICT) {
throw t
} else {
if (stringency == ValidationStringency.LENIENT) {
log.warn("Applying annotation extraction function %s to %s failed with %s.".format(fn, g, t))
}

gb
}
}
}
})

// convert the annotations if they exist
val gtWithAnnotations = Option(g.getVariantCallingAnnotations)
.fold(convertedCore)(vca => {

// bind the annotation conversion functions and fold
val convertedAnnotations = genotypeAnnotationExtractFns.foldLeft(convertedCore)(
(gb: GenotypeBuilder, fn) => fn(vca, gb))
(gb: GenotypeBuilder, fn) => {
try {
fn(vca, gb)
} catch {
case t: Throwable => {
if (stringency == ValidationStringency.STRICT) {
throw t
} else {
if (stringency == ValidationStringency.LENIENT) {
log.warn("Applying annotation extraction function %s to %s failed with %s.".format(fn, vca, t))
}

gb
}
}
}
})

// get the attribute map
val attributes: Map[String, String] = vca.getAttributes.toMap

// apply the attribute converters and return
attributeFns.foldLeft(convertedAnnotations)((gb: GenotypeBuilder, fn) => {
val optAttrPair = fn(attributes)
try {
val optAttrPair = fn(attributes)

optAttrPair.fold(gb)(pair => {
gb.attribute(pair._1, pair._2)
})
optAttrPair.fold(gb)(pair => {
gb.attribute(pair._1, pair._2)
})
} catch {
case t: Throwable => {
if (stringency == ValidationStringency.STRICT) {
throw t
} else {
if (stringency == ValidationStringency.LENIENT) {
log.warn("Applying attribute extraction function %s to %s failed with %s.".format(fn, vca, t))
}

gb
}
}
}
})
})

Expand Down
Expand Up @@ -306,7 +306,10 @@ class VariantContextConverterSuite extends ADAMFunSuite {

val optHtsjdkVC = converter.convert(ADAMVariantContext(variant, Seq(genotype)))

assert(optHtsjdkVC.isEmpty)
assert(optHtsjdkVC.isDefined)
optHtsjdkVC.foreach(vc => {
assert(!vc.getGenotype("NA12878").hasAnyAttribute("SB"))
})
}

test("Convert htsjdk multi-allelic sites-only SNVs to ADAM") {
Expand Down
Expand Up @@ -132,13 +132,14 @@ class VariantContextRDDSuite extends ADAMFunSuite {
sparkTest("transform a vcf file with bad header") {
val path = testFile("invalid/truth_small_variants.vcf")
val before = sc.loadVcf(path, ValidationStringency.SILENT)
assert(before.rdd.count == 1)
assert(before.rdd.count === 7)
assert(before.toGenotypeRDD().rdd.filter(_.getPhaseSetId == null).count === 7)

val tempPath = tmpLocation(".adam")
before.toVariantRDD().saveAsParquet(tempPath)

val after = sc.loadVariants(tempPath).toVariantContextRDD()
assert(after.rdd.count == 1)
assert(after.rdd.count === 7)
}

sparkTest("don't lose any variants when piping as VCF") {
Expand Down

0 comments on commit 84b5064

Please sign in to comment.