From 1e5450c943c1f75bf7f3f14c9b8b305effa82ec6 Mon Sep 17 00:00:00 2001
From: Frank Austin Nothaft
Date: Tue, 16 Jan 2018 09:20:23 -0800
Subject: [PATCH] [ADAM-1874] Dedupe samples when loading VCFs.

Resolves #1874. While samples should be unique in a single VCF, we may load data
from multiple VCFs that contain the same samples (e.g., VCFs from a single
sequencing project where the VCFs are split by chromosome). This change dedupes
sample IDs on load.
---
 .../main/scala/org/bdgenomics/adam/rdd/ADAMContext.scala | 4 +++-
 .../scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala | 8 +++++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMContext.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMContext.scala
index 4af7247c28..96e6c19608 100644
--- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMContext.scala
+++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMContext.scala
@@ -1119,9 +1119,11 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log
     val files = getFsAndFilesWithFilter(pathName, new NoPrefixFileFilter("_"))
 
     // load yonder the metadata
-    files.map(p => loadSingleVcfMetadata(p.toString)).reduce((p1, p2) => {
+    val (sequences, samples, headerLines) = files.map(p => loadSingleVcfMetadata(p.toString)).reduce((p1, p2) => {
       (p1._1 ++ p2._1, p1._2 ++ p2._2, p1._3 ++ p2._3)
     })
+
+    (sequences, samples.distinct, headerLines)
   }
 
   /**
diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala
index 91724d64ab..baf5a4c162 100644
--- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala
+++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala
@@ -473,7 +473,13 @@ class ADAMContextSuite extends ADAMFunSuite {
   sparkTest("load vcf with a glob") {
     val path = testFile("bqsr1.vcf").replace("bqsr1", "*")
 
-    val variants = sc.loadVcf(path).toVariants
+    val vcs = sc.loadVcf(path)
+
+    assert(vcs.samples.size === 8)
+    assert(vcs.headerLines.size === 154)
+    assert(vcs.sequences.size === 31)
+
+    val variants = vcs.toVariants
     assert(variants.rdd.count === 782)
   }
 