diff --git a/README.md b/README.md index 3a45fbf270..59b21d1c3f 100644 --- a/README.md +++ b/README.md @@ -72,12 +72,12 @@ ADAM ACTIONS countContigKmers : Counts the k-mers/q-mers from a read dataset. transformAlignments : Convert SAM/BAM to ADAM format and optionally perform read pre-processing transformations transformFeatures : Convert a file with sequence features into corresponding ADAM format and vice versa + transformGenotypes : Convert a file with genotypes into corresponding ADAM format and vice versa + transformVariants : Convert a file with variants into corresponding ADAM format and vice versa mergeShards : Merges the shards of a file reads2coverage : Calculate the coverage from a given ADAM file CONVERSION OPERATIONS - vcf2adam : Convert a VCF file to the corresponding ADAM format - adam2vcf : Convert an ADAM variant to the VCF ADAM format fasta2adam : Converts a text FASTA sequence file into an ADAMNucleotideContig Parquet file which represents assembled sequences. adam2fasta : Convert ADAM nucleotide contig fragments to FASTA files adam2fastq : Convert BAM to FASTQ files diff --git a/adam-apis/src/test/java/org/bdgenomics/adam/apis/java/JavaADAMGenotypeConduit.java b/adam-apis/src/test/java/org/bdgenomics/adam/apis/java/JavaADAMGenotypeConduit.java index 94e470789f..4a8a4b20fa 100644 --- a/adam-apis/src/test/java/org/bdgenomics/adam/apis/java/JavaADAMGenotypeConduit.java +++ b/adam-apis/src/test/java/org/bdgenomics/adam/apis/java/JavaADAMGenotypeConduit.java @@ -34,7 +34,7 @@ public static GenotypeRDD conduit(final GenotypeRDD recordRdd, // make temp directory and save file Path tempDir = Files.createTempDirectory("javaAC"); String fileName = tempDir.toString() + "/testRdd.genotype.adam"; - recordRdd.save(fileName); + recordRdd.saveAsParquet(fileName); // create a new adam context and load the file JavaADAMContext jac = new JavaADAMContext(ac); diff --git a/adam-apis/src/test/java/org/bdgenomics/adam/apis/java/JavaADAMVariantConduit.java b/adam-apis/src/test/java/org/bdgenomics/adam/apis/java/JavaADAMVariantConduit.java index 45c711d23f..58b8bbbdc6 100644 --- a/adam-apis/src/test/java/org/bdgenomics/adam/apis/java/JavaADAMVariantConduit.java +++ b/adam-apis/src/test/java/org/bdgenomics/adam/apis/java/JavaADAMVariantConduit.java @@ -34,7 +34,7 @@ public static VariantRDD conduit(final VariantRDD recordRdd, // make temp directory and save file Path tempDir = Files.createTempDirectory("javaAC"); String fileName = tempDir.toString() + "/testRdd.variant.adam"; - recordRdd.save(fileName); + recordRdd.saveAsParquet(fileName); // create a new adam context and load the file JavaADAMContext jac = new JavaADAMContext(ac); diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAM2Vcf.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAM2Vcf.scala deleted file mode 100644 index af3f20cb27..0000000000 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAM2Vcf.scala +++ /dev/null @@ -1,105 +0,0 @@ -/** - * Licensed to Big Data Genomics (BDG) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The BDG licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.bdgenomics.adam.cli - -import htsjdk.samtools.ValidationStringency -import org.apache.spark.SparkContext -import org.bdgenomics.adam.rdd.ADAMContext._ -import org.bdgenomics.utils.cli._ -import org.bdgenomics.utils.misc.Logging -import org.kohsuke.args4j.{ Option => Args4jOption, Argument } - -object ADAM2Vcf extends BDGCommandCompanion { - - val commandName = "adam2vcf" - val commandDescription = "Convert an ADAM variant to the VCF ADAM format" - - def apply(cmdLine: Array[String]) = { - new ADAM2Vcf(Args4j[ADAM2VcfArgs](cmdLine)) - } -} - -class ADAM2VcfArgs extends Args4jBase with ParquetArgs { - - @Argument(required = true, metaVar = "ADAM", usage = "The ADAM variant files to convert", index = 0) - var adamFile: String = _ - - @Argument(required = true, metaVar = "VCF", usage = "Location to write VCF data", index = 1) - var outputPath: String = null - - @Args4jOption(required = false, name = "-coalesce", usage = "Set the number of partitions written to the ADAM output directory") - var coalesce: Int = -1 - - @Args4jOption(required = false, name = "-sort_on_save", usage = "Sort the VCF output by contig index.") - var sort: Boolean = false - - @Args4jOption(required = false, - name = "-sort_lexicographically_on_save", - usage = "Sort the VCF output by lexicographic order. Conflicts with -sort_on_save.") - var sortLexicographically: Boolean = false - - @Args4jOption(required = false, name = "-single", usage = "Save as a single VCF file.") - var single: Boolean = false - - @Args4jOption(required = false, name = "-disable_fast_concat", - usage = "Disables the parallel file concatenation engine.") - var disableFastConcat: Boolean = false - - @Args4jOption(required = false, name = "-stringency", usage = "Stringency level for various checks; can be SILENT, LENIENT, or STRICT. 
Defaults to STRICT") - var stringency: String = "STRICT" -} - -class ADAM2Vcf(val args: ADAM2VcfArgs) extends BDGSparkCommand[ADAM2VcfArgs] with Logging { - val companion = ADAM2Vcf - val stringency = ValidationStringency.valueOf(args.stringency) - - def run(sc: SparkContext) { - require(!(args.sort && args.sortLexicographically), - "Cannot set both -sort_on_save and -sort_lexicographically_on_save.") - - val adamGTs = sc.loadParquetGenotypes(args.adamFile) - - val coalesce = if (args.coalesce > 0) { - Some(args.coalesce) - } else { - None - } - - // convert to variant contexts and prep for save - val variantContexts = adamGTs.toVariantContextRDD - val maybeCoalescedVcs = if (args.coalesce > 0) { - variantContexts.transform(_.coalesce(args.coalesce)) - } else { - variantContexts - } - - // sort if requested - val maybeSortedVcs = if (args.sort) { - maybeCoalescedVcs.sort() - } else if (args.sortLexicographically) { - maybeCoalescedVcs.sortLexicographically() - } else { - maybeCoalescedVcs - } - - maybeSortedVcs.saveAsVcf(args.outputPath, - asSingleFile = args.single, - stringency, - disableFastConcat = args.disableFastConcat) - } -} diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAMMain.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAMMain.scala index 68ef9b53ea..5d83a078f8 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAMMain.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAMMain.scala @@ -36,6 +36,8 @@ object ADAMMain { CountContigKmers, TransformAlignments, TransformFeatures, + TransformGenotypes, + TransformVariants, MergeShards, Reads2Coverage ) @@ -43,8 +45,6 @@ object ADAMMain { CommandGroup( "CONVERSION OPERATIONS", List( - Vcf2ADAM, - ADAM2Vcf, Fasta2ADAM, ADAM2Fasta, ADAM2Fastq, diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/TransformGenotypes.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/TransformGenotypes.scala new file mode 100644 index 0000000000..31d8c435ba --- /dev/null +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/TransformGenotypes.scala @@ -0,0 +1,132 @@ +/** + * Licensed to Big Data Genomics (BDG) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The BDG licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.bdgenomics.adam.cli + +import htsjdk.samtools.ValidationStringency +import org.apache.spark.SparkContext +import org.bdgenomics.adam.rdd.ADAMContext._ +import org.bdgenomics.adam.rdd.{ ADAMSaveAnyArgs, GenomicRDD } +import org.bdgenomics.utils.cli._ +import org.kohsuke.args4j.{ Argument, Option ⇒ Args4jOption } + +object TransformGenotypes extends BDGCommandCompanion { + val commandName = "transformGenotypes" + val commandDescription = "Convert a file with genotypes into corresponding ADAM format and vice versa" + + def apply(cmdLine: Array[String]) = { + new TransformGenotypes(Args4j[TransformGenotypesArgs](cmdLine)) + } +} + +class TransformGenotypesArgs extends Args4jBase with ADAMSaveAnyArgs with ParquetArgs { + @Argument(required = true, metaVar = "INPUT", usage = "The genotypes file to convert (e.g., .vcf, .vcf.gz, .vcf.bgzf, .vcf.bgz). If extension is not detected, Parquet is assumed.", index = 0) + var inputPath: String = null + + @Argument(required = true, metaVar = "OUTPUT", usage = "Location to write ADAM genotypes data. If extension is not detected, Parquet is assumed.", index = 1) + var outputPath: String = null + + @Args4jOption(required = false, name = "-coalesce", usage = "Number of partitions written to the ADAM output directory.") + var coalesce: Int = -1 + + @Args4jOption(required = false, name = "-force_shuffle_coalesce", usage = "Even if the repartitioned RDD has fewer partitions, force a shuffle.") + var forceShuffle: Boolean = false + + @Args4jOption(required = false, name = "-sort_on_save", usage = "Sort VCF output by contig index.") + var sort: Boolean = false + + @Args4jOption(required = false, name = "-sort_lexicographically_on_save", usage = "Sort VCF output by lexicographic order. Conflicts with -sort_on_save.") + var sortLexicographically: Boolean = false + + @Args4jOption(required = false, name = "-single", usage = "Save as a single VCF file.") + var asSingleFile: Boolean = false + + @Args4jOption(required = false, name = "-defer_merging", usage = "Defers merging single file output.") + var deferMerging: Boolean = false + + @Args4jOption(required = false, name = "-disable_fast_concat", usage = "Disables the parallel file concatenation engine.") + var disableFastConcat: Boolean = false + + @Args4jOption(required = false, name = "-stringency", usage = "Stringency level for various checks; can be SILENT, LENIENT, or STRICT. Defaults to STRICT.") + var stringency: String = "STRICT" + + // must be defined due to ADAMSaveAnyArgs, but unused here + var sortFastqOutput: Boolean = false +} + +/** + * Convert a file with genotypes into corresponding ADAM format and vice versa. + */ +class TransformGenotypes(val args: TransformGenotypesArgs) + extends BDGSparkCommand[TransformGenotypesArgs] { + val companion = TransformGenotypes + val stringency = ValidationStringency.valueOf(args.stringency) + + /** + * Coalesce the specified GenomicRDD if requested. + * + * @param rdd GenomicRDD to coalesce. + * @return The specified GenomicRDD coalesced if requested. + */ + private def maybeCoalesce[U <: GenomicRDD[_, U]](rdd: U): U = { + if (args.coalesce != -1) { + log.info("Coalescing the number of partitions to '%d'".format(args.coalesce)) + if (args.coalesce > rdd.rdd.partitions.length || args.forceShuffle) { + rdd.transform(_.coalesce(args.coalesce, shuffle = true)) + } else { + rdd.transform(_.coalesce(args.coalesce, shuffle = false)) + } + } else { + rdd + } + } + + /** + * Sort the specified GenomicRDD if requested. + * + * @param rdd GenomicRDD to sort. 
+ * @return The specified GenomicRDD sorted if requested.
+ */
+  private def maybeSort[U <: GenomicRDD[_, U]](rdd: U): U = {
+    if (args.sort) {
+      log.info("Sorting before saving")
+      rdd.sort()
+    } else if (args.sortLexicographically) {
+      log.info("Sorting lexicographically before saving")
+      rdd.sortLexicographically()
+    } else {
+      rdd
+    }
+  }
+
+  def run(sc: SparkContext) {
+    require(!(args.sort && args.sortLexicographically),
+      "Cannot set both -sort_on_save and -sort_lexicographically_on_save.")
+
+    val genotypes = sc.loadGenotypes(
+      args.inputPath,
+      optPredicate = None,
+      optProjection = None,
+      stringency = stringency)
+
+    if (args.outputPath.endsWith(".vcf")) {
+      maybeSort(maybeCoalesce(genotypes.toVariantContextRDD)).saveAsVcf(args, stringency)
+    } else {
+      maybeSort(maybeCoalesce(genotypes)).saveAsParquet(args)
+    }
+  }
+}
diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/TransformVariants.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/TransformVariants.scala
new file mode 100644
index 0000000000..65c131f62a
--- /dev/null
+++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/TransformVariants.scala
@@ -0,0 +1,132 @@
+/**
+ * Licensed to Big Data Genomics (BDG) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The BDG licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.bdgenomics.adam.cli
+
+import htsjdk.samtools.ValidationStringency
+import org.apache.spark.SparkContext
+import org.bdgenomics.adam.rdd.ADAMContext._
+import org.bdgenomics.adam.rdd.{ ADAMSaveAnyArgs, GenomicRDD }
+import org.bdgenomics.utils.cli._
+import org.kohsuke.args4j.{ Argument, Option ⇒ Args4jOption }
+
+object TransformVariants extends BDGCommandCompanion {
+  val commandName = "transformVariants"
+  val commandDescription = "Convert a file with variants into corresponding ADAM format and vice versa"
+
+  def apply(cmdLine: Array[String]) = {
+    new TransformVariants(Args4j[TransformVariantsArgs](cmdLine))
+  }
+}
+
+class TransformVariantsArgs extends Args4jBase with ADAMSaveAnyArgs with ParquetArgs {
+  @Argument(required = true, metaVar = "INPUT", usage = "The variants file to convert (e.g., .vcf, .vcf.gz, .vcf.bgzf, .vcf.bgz). If extension is not detected, Parquet is assumed.", index = 0)
+  var inputPath: String = null
+
+  @Argument(required = true, metaVar = "OUTPUT", usage = "Location to write ADAM variants data.
If extension is not detected, Parquet is assumed.", index = 1) + var outputPath: String = null + + @Args4jOption(required = false, name = "-coalesce", usage = "Number of partitions written to the ADAM output directory.") + var coalesce: Int = -1 + + @Args4jOption(required = false, name = "-force_shuffle_coalesce", usage = "Even if the repartitioned RDD has fewer partitions, force a shuffle.") + var forceShuffle: Boolean = false + + @Args4jOption(required = false, name = "-sort_on_save", usage = "Sort VCF output by contig index.") + var sort: Boolean = false + + @Args4jOption(required = false, name = "-sort_lexicographically_on_save", usage = "Sort VCF output by lexicographic order. Conflicts with -sort_on_save.") + var sortLexicographically: Boolean = false + + @Args4jOption(required = false, name = "-single", usage = "Save as a single VCF file.") + var asSingleFile: Boolean = false + + @Args4jOption(required = false, name = "-defer_merging", usage = "Defers merging single file output.") + var deferMerging: Boolean = false + + @Args4jOption(required = false, name = "-disable_fast_concat", usage = "Disables the parallel file concatenation engine.") + var disableFastConcat: Boolean = false + + @Args4jOption(required = false, name = "-stringency", usage = "Stringency level for various checks; can be SILENT, LENIENT, or STRICT. Defaults to STRICT.") + var stringency: String = "STRICT" + + // must be defined due to ADAMSaveAnyArgs, but unused here + var sortFastqOutput: Boolean = false +} + +/** + * Convert a file with variants into corresponding ADAM format and vice versa. + */ +class TransformVariants(val args: TransformVariantsArgs) + extends BDGSparkCommand[TransformVariantsArgs] { + val companion = TransformVariants + val stringency = ValidationStringency.valueOf(args.stringency) + + /** + * Coalesce the specified GenomicRDD if requested. + * + * @param rdd GenomicRDD to coalesce. + * @return The specified GenomicRDD coalesced if requested. + */ + private def maybeCoalesce[U <: GenomicRDD[_, U]](rdd: U): U = { + if (args.coalesce != -1) { + log.info("Coalescing the number of partitions to '%d'".format(args.coalesce)) + if (args.coalesce > rdd.rdd.partitions.length || args.forceShuffle) { + rdd.transform(_.coalesce(args.coalesce, shuffle = true)) + } else { + rdd.transform(_.coalesce(args.coalesce, shuffle = false)) + } + } else { + rdd + } + } + + /** + * Sort the specified GenomicRDD if requested. + * + * @param rdd GenomicRDD to sort. + * @return The specified GenomicRDD sorted if requested. 
+ */ + private def maybeSort[U <: GenomicRDD[_, U]](rdd: U): U = { + if (args.sort) { + log.info("Sorting before saving") + rdd.sort() + } else if (args.sortLexicographically) { + log.info("Sorting lexicographically before saving") + rdd.sortLexicographically() + } else { + rdd + } + } + + def run(sc: SparkContext) { + require(!(args.sort && args.sortLexicographically), + "Cannot set both -sort_on_save and -sort_lexicographically_on_save.") + + val variants = sc.loadVariants( + args.inputPath, + optPredicate = None, + optProjection = None, + stringency = stringency) + + if (args.outputPath.endsWith(".vcf")) { + maybeSort(maybeCoalesce(variants.toVariantContextRDD)).saveAsVcf(args, stringency) + } else { + maybeSort(maybeCoalesce(variants)).saveAsParquet(args) + } + } +} diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Vcf2ADAM.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Vcf2ADAM.scala deleted file mode 100644 index 7a94168726..0000000000 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Vcf2ADAM.scala +++ /dev/null @@ -1,82 +0,0 @@ -/** - * Licensed to Big Data Genomics (BDG) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The BDG licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.bdgenomics.adam.cli - -import htsjdk.samtools.ValidationStringency -import org.apache.spark.SparkContext -import org.bdgenomics.adam.rdd.ADAMContext._ -import org.bdgenomics.utils.cli._ -import org.bdgenomics.utils.misc.Logging -import org.kohsuke.args4j.{ Option => Args4jOption, Argument } - -object Vcf2ADAM extends BDGCommandCompanion { - val commandName = "vcf2adam" - val commandDescription = "Convert a VCF file to the corresponding ADAM format" - - def apply(cmdLine: Array[String]) = { - new Vcf2ADAM(Args4j[Vcf2ADAMArgs](cmdLine)) - } -} - -class Vcf2ADAMArgs extends Args4jBase with ParquetSaveArgs { - - @Argument(required = true, metaVar = "VCF", usage = "The VCF file to convert", index = 0) - var vcfPath: String = _ - - @Argument(required = true, metaVar = "ADAM", usage = "Location to write ADAM Variant data", index = 1) - var outputPath: String = null - - @Args4jOption(required = false, name = "-coalesce", usage = "Set the number of partitions written to the ADAM output directory") - var coalesce: Int = -1 - - @Args4jOption(required = false, name = "-force_shuffle_coalesce", usage = "Even if the repartitioned RDD has fewer partitions, force a shuffle.") - var forceShuffle: Boolean = false - - @Args4jOption(required = false, name = "-only_variants", usage = "Output Variant objects instead of Genotypes") - var onlyVariants: Boolean = false - - @Args4jOption(required = false, name = "-stringency", usage = "Stringency level for various checks; can be SILENT, LENIENT, or STRICT. 
Defaults to STRICT") - var stringency: String = "STRICT" -} - -class Vcf2ADAM(val args: Vcf2ADAMArgs) extends BDGSparkCommand[Vcf2ADAMArgs] with Logging { - val companion = Vcf2ADAM - val stringency = ValidationStringency.valueOf(args.stringency) - - def run(sc: SparkContext) { - - val variantContextRdd = sc.loadVcf(args.vcfPath, stringency) - val variantContextsToSave = if (args.coalesce > 0) { - variantContextRdd.transform( - _.coalesce(args.coalesce, shuffle = args.coalesce > variantContextRdd.rdd.partitions.length || args.forceShuffle) - ) - } else { - variantContextRdd - } - - if (args.onlyVariants) { - variantContextsToSave - .toVariantRDD - .saveAsParquet(args) - } else { - variantContextsToSave - .toGenotypeRDD - .saveAsParquet(args) - } - } -} diff --git a/adam-cli/src/test/scala/org/bdgenomics/adam/cli/TransformGenotypesSuite.scala b/adam-cli/src/test/scala/org/bdgenomics/adam/cli/TransformGenotypesSuite.scala new file mode 100644 index 0000000000..538f0d7cbc --- /dev/null +++ b/adam-cli/src/test/scala/org/bdgenomics/adam/cli/TransformGenotypesSuite.scala @@ -0,0 +1,57 @@ +/** + * Licensed to Big Data Genomics (BDG) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The BDG licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.bdgenomics.adam.cli + +import org.bdgenomics.adam.util.ADAMFunSuite + +class TransformGenotypesSuite extends ADAMFunSuite { + + sparkTest("save a file sorted by contig index") { + val inputPath = copyResource("random.vcf") + val intermediatePath = tmpFile("genotypes.adam") + val actualPath = tmpFile("sorted.vcf") + val expectedPath = copyResource("sorted.vcf") + + TransformGenotypes( + Array(inputPath, intermediatePath) + ).run(sc) + + TransformGenotypes( + Array(intermediatePath, actualPath, "-sort_on_save", "-single") + ).run(sc) + + checkFiles(expectedPath, actualPath) + } + + sparkTest("save a lexicographically sorted file") { + val inputPath = copyResource("random.vcf") + val intermediatePath = tmpFile("genotypes.lex.adam") + val actualPath = tmpFile("sorted.lex.vcf") + val expectedPath = copyResource("sorted.lex.vcf") + + TransformGenotypes( + Array(inputPath, intermediatePath) + ).run(sc) + + TransformGenotypes( + Array(intermediatePath, actualPath, "-sort_lexicographically_on_save", "-single") + ).run(sc) + + checkFiles(expectedPath, actualPath) + } +} diff --git a/adam-cli/src/test/scala/org/bdgenomics/adam/cli/ADAM2VcfSuite.scala b/adam-cli/src/test/scala/org/bdgenomics/adam/cli/TransformVariantsSuite.scala similarity index 55% rename from adam-cli/src/test/scala/org/bdgenomics/adam/cli/ADAM2VcfSuite.scala rename to adam-cli/src/test/scala/org/bdgenomics/adam/cli/TransformVariantsSuite.scala index b2ea74a107..c63ff2a1cd 100644 --- a/adam-cli/src/test/scala/org/bdgenomics/adam/cli/ADAM2VcfSuite.scala +++ b/adam-cli/src/test/scala/org/bdgenomics/adam/cli/TransformVariantsSuite.scala @@ -19,33 +19,39 @@ package org.bdgenomics.adam.cli import org.bdgenomics.adam.util.ADAMFunSuite -class ADAM2VcfSuite extends ADAMFunSuite { +class TransformVariantsSuite extends ADAMFunSuite { - ignore("save a file sorted by contig index") { + sparkTest("save a file sorted by contig index") { val inputPath = copyResource("random.vcf") val intermediatePath = tmpFile("variants.adam") - val outputPath = tmpFile("sorted.vcf") + val actualPath = tmpFile("sorted-variants.vcf") + val expectedPath = copyResource("sorted-variants.vcf") - Vcf2ADAM(Array(inputPath, intermediatePath)).run(sc) - ADAM2Vcf(Array(intermediatePath, - outputPath, - "-sort_on_save", - "-single")).run(sc) + TransformVariants( + Array(inputPath, intermediatePath) + ).run(sc) - checkFiles(outputPath, copyResource("sorted.vcf")) + TransformVariants( + Array(intermediatePath, actualPath, "-sort_on_save", "-single") + ).run(sc) + + checkFiles(expectedPath, actualPath) } - ignore("save a lexicographically sorted file") { + sparkTest("save a lexicographically sorted file") { val inputPath = copyResource("random.vcf") val intermediatePath = tmpFile("variants.lex.adam") - val outputPath = tmpFile("sorted.lex.vcf") + val actualPath = tmpFile("sorted-variants.lex.vcf") + val expectedPath = copyResource("sorted-variants.lex.vcf") + + TransformVariants( + Array(inputPath, intermediatePath) + ).run(sc) - Vcf2ADAM(Array(inputPath, intermediatePath)).run(sc) - ADAM2Vcf(Array(intermediatePath, - outputPath, - "-sort_lexicographically_on_save", - "-single")).run(sc) + TransformVariants( + Array(intermediatePath, actualPath, "-sort_lexicographically_on_save", "-single") + ).run(sc) - checkFiles(outputPath, copyResource("sorted.lex.vcf")) + checkFiles(expectedPath, actualPath) } } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/converters/DefaultHeaderLines.scala 
b/adam-core/src/main/scala/org/bdgenomics/adam/converters/DefaultHeaderLines.scala index 380389173d..8aed777d11 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/converters/DefaultHeaderLines.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/converters/DefaultHeaderLines.scala @@ -150,7 +150,7 @@ object DefaultHeaderLines { VCFHeaderLineType.Float, "Read-backed phasing quality") lazy val genotypeFilter = new VCFFormatHeaderLine("FT", - 1, + VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Genotype-level filter") lazy val fisherStrand = new VCFFormatHeaderLine("FS", diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variant/GenotypeRDD.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variant/GenotypeRDD.scala index 362155906e..89e8a0f2c3 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variant/GenotypeRDD.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variant/GenotypeRDD.scala @@ -17,7 +17,6 @@ */ package org.bdgenomics.adam.rdd.variant -import htsjdk.samtools.ValidationStringency import htsjdk.variant.vcf.VCFHeaderLine import org.apache.spark.rdd.RDD import org.bdgenomics.adam.converters.DefaultHeaderLines @@ -28,15 +27,14 @@ import org.bdgenomics.adam.models.{ SequenceDictionary, VariantContext } -import org.bdgenomics.adam.rdd.{ JavaSaveArgs, MultisampleAvroGenomicRDD } +import org.bdgenomics.adam.rdd.MultisampleAvroGenomicRDD import org.bdgenomics.adam.rich.RichVariant import org.bdgenomics.adam.serialization.AvroSerializer -import org.bdgenomics.utils.cli.SaveArgs import org.bdgenomics.utils.interval.array.{ IntervalArray, IntervalArraySerializer } -import org.bdgenomics.formats.avro.{ Contig, Genotype, Sample } +import org.bdgenomics.formats.avro.{ Genotype, Sample } import scala.reflect.ClassTag private[adam] case class GenotypeArray( @@ -92,16 +90,6 @@ case class GenotypeRDD(rdd: RDD[Genotype], IntervalArray(rdd, GenotypeArray.apply(_, _)) } - /** - * Java-friendly method for saving. - * - * @param filePath Path to save file to. If ends in ".vcf", saves as VCF, else - * saves as Parquet. - */ - def save(filePath: java.lang.String) { - save(new JavaSaveArgs(filePath)) - } - /** * @return Returns this GenotypeRDD squared off as a VariantContextRDD. */ @@ -119,59 +107,6 @@ case class GenotypeRDD(rdd: RDD[Genotype], VariantContextRDD(vcRdd, sequences, samples, headerLines) } - /** - * Automatically detects the extension and saves to either VCF or Parquet. - * - * @param args Arguments configuring how to save the output. - */ - def save(args: SaveArgs): Boolean = { - maybeSaveVcf(args) || { - saveAsParquet(args); true - } - } - - /** - * Explicitly saves to VCF. - * - * @param args Arguments configuring how/where to save the output. - * @param sortOnSave Whether to sort when saving or not. - */ - def saveAsVcf(args: SaveArgs, - sortOnSave: Boolean = false) { - toVariantContextRDD.saveAsVcf(args, sortOnSave) - } - - /** - * If the file has a ".vcf" extension, saves to VCF. - * - * @param args Arguments defining how/where to save. - * @return True if file is successfully saved as VCF. - */ - private def maybeSaveVcf(args: SaveArgs): Boolean = { - if (args.outputPath.endsWith(".vcf")) { - saveAsVcf(args) - true - } else { - false - } - } - - /** - * Explicitly saves to VCF. - * - * @param filePath The filepath to save to. - * @param asSingleFile If true, saves the output as a single file by merging - * the sharded output after completing the write to HDFS. 
If false, the - * output of this call will be written as shards, where each shard has a - * valid VCF header. - * @param stringency The validation stringency to use when writing the VCF. - */ - def saveAsVcf(filePath: String, - asSingleFile: Boolean, - stringency: ValidationStringency) { - toVariantContextRDD.saveAsVcf(filePath, asSingleFile, stringency) - } - /** * @param newRdd An RDD to replace the underlying RDD with. * @return Returns a new GenotypeRDD with the underlying RDD replaced. diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variant/VariantContextRDD.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variant/VariantContextRDD.scala index 57f369478a..1784f4e5a1 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variant/VariantContextRDD.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variant/VariantContextRDD.scala @@ -19,7 +19,6 @@ package org.bdgenomics.adam.rdd.variant import htsjdk.samtools.ValidationStringency import htsjdk.variant.vcf.{ VCFHeader, VCFHeaderLine } -import java.io.OutputStream import org.apache.hadoop.io.LongWritable import org.apache.hadoop.fs.Path import org.apache.spark.rdd.RDD @@ -28,7 +27,6 @@ import org.bdgenomics.adam.converters.{ VariantContextConverter } import org.bdgenomics.adam.models.{ - ReferencePosition, ReferenceRegion, ReferenceRegionSerializer, SequenceDictionary, @@ -36,14 +34,12 @@ import org.bdgenomics.adam.models.{ VariantContextSerializer } import org.bdgenomics.adam.rdd.{ + ADAMSaveAnyArgs, FileMerger, MultisampleGenomicRDD, VCFHeaderUtils } -import org.bdgenomics.adam.rich.RichVariant -import org.bdgenomics.adam.serialization.AvroSerializer import org.bdgenomics.formats.avro.Sample -import org.bdgenomics.utils.cli.SaveArgs import org.bdgenomics.utils.misc.Logging import org.bdgenomics.utils.interval.array.{ IntervalArray, @@ -130,12 +126,18 @@ case class VariantContextRDD(rdd: RDD[VariantContext], * Converts an RDD of ADAM VariantContexts to HTSJDK VariantContexts * and saves to disk as VCF. * - * @param filePath The filepath to save to. - * @param sortOnSave Whether to sort before saving. + * @param args Arguments defining where to save the file. + * @param stringency The validation stringency to use when writing the VCF. + * Defaults to LENIENT. */ - def saveAsVcf(args: SaveArgs, - sortOnSave: Boolean) { - saveAsVcf(args.outputPath, sortOnSave) + def saveAsVcf(args: ADAMSaveAnyArgs, + stringency: ValidationStringency = ValidationStringency.LENIENT): Unit = { + saveAsVcf( + args.outputPath, + asSingleFile = args.asSingleFile, + deferMerging = args.deferMerging, + disableFastConcat = args.disableFastConcat, + stringency = stringency) } /** @@ -147,14 +149,17 @@ case class VariantContextRDD(rdd: RDD[VariantContext], * the sharded output after completing the write to HDFS. If false, the * output of this call will be written as shards, where each shard has a * valid VCF header. Default is false. - * @param stringency The validation stringency to use when writing the VCF. + * @param deferMerging If true and asSingleFile is true, we will save the + * output shards as a headerless file, but we will not merge the shards. * @param disableFastConcat If asSingleFile is true and deferMerging is false, * disables the use of the parallel file merging engine. + * @param stringency The validation stringency to use when writing the VCF. 
*/ def saveAsVcf(filePath: String, - asSingleFile: Boolean = false, - stringency: ValidationStringency = ValidationStringency.LENIENT, - disableFastConcat: Boolean = false) { + asSingleFile: Boolean, + deferMerging: Boolean, + disableFastConcat: Boolean, + stringency: ValidationStringency): Unit = { val vcfFormat = VCFFormat.inferFromFilePath(filePath) assert(vcfFormat == VCFFormat.VCF, "BCF not yet supported") // TODO: Add BCF support @@ -208,13 +213,15 @@ case class VariantContextRDD(rdd: RDD[VariantContext], conf ) - // merge shards - FileMerger.mergeFiles(rdd.context, - fs, - new Path(filePath), - new Path(tailPath), - Some(headPath), - disableFastConcat = disableFastConcat) + // optionally merge + if (!deferMerging) { + FileMerger.mergeFiles(rdd.context, + fs, + new Path(filePath), + new Path(tailPath), + Some(headPath), + disableFastConcat = disableFastConcat) + } } else { // write shards diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variant/VariantRDD.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variant/VariantRDD.scala index 6c9ab7d091..dc010abfe6 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variant/VariantRDD.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variant/VariantRDD.scala @@ -17,7 +17,6 @@ */ package org.bdgenomics.adam.rdd.variant -import htsjdk.samtools.ValidationStringency import htsjdk.variant.vcf.{ VCFHeader, VCFHeaderLine } import org.apache.hadoop.fs.Path import org.apache.spark.rdd.RDD @@ -30,15 +29,10 @@ import org.bdgenomics.adam.models.{ } import org.bdgenomics.adam.rdd.{ AvroGenomicRDD, - JavaSaveArgs, VCFHeaderUtils } import org.bdgenomics.adam.serialization.AvroSerializer -import org.bdgenomics.formats.avro.{ - Contig, - Sample, - Variant -} +import org.bdgenomics.formats.avro.Sample import org.bdgenomics.formats.avro.{ Contig, Variant } import org.bdgenomics.utils.interval.array.{ IntervalArray, @@ -111,31 +105,6 @@ case class VariantRDD(rdd: RDD[Variant], (headerLines ++ iterableRdds.flatMap(_.headerLines)).distinct) } - /** - * Java-friendly method for saving to Parquet. - * - * @param filePath Path to save to. - */ - def save(filePath: java.lang.String) { - saveAsParquet(new JavaSaveArgs(filePath)) - } - - /** - * Explicitly saves to VCF. - * - * @param filePath The filepath to save to. - * @param asSingleFile If true, saves the output as a single file by merging - * the sharded output after completing the write to HDFS. If false, the - * output of this call will be written as shards, where each shard has a - * valid VCF header. - * @param stringency The validation stringency to use when writing the VCF. - */ - def saveAsVcf(filePath: String, - asSingleFile: Boolean, - stringency: ValidationStringency) { - toVariantContextRDD.saveAsVcf(filePath, asSingleFile, stringency) - } - /** * @return Returns this VariantRDD as a VariantContextRDD. 
*/ diff --git a/adam-core/src/test/resources/random.vcf b/adam-core/src/test/resources/random.vcf index a40cdb9646..ed60baae7a 100644 --- a/adam-core/src/test/resources/random.vcf +++ b/adam-core/src/test/resources/random.vcf @@ -16,7 +16,7 @@ ##FILTER= ##FORMAT= ##FORMAT= -##FORMAT= +##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= diff --git a/adam-core/src/test/resources/sorted-variants.lex.vcf b/adam-core/src/test/resources/sorted-variants.lex.vcf new file mode 100644 index 0000000000..8747f34534 --- /dev/null +++ b/adam-core/src/test/resources/sorted-variants.lex.vcf @@ -0,0 +1,74 @@ +##fileformat=VCFv4.2 +##FILTER= 200.0"> +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##GATKCommandLine= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1 14397 . CTGT C . IndelQD AC=2;AF=0.333;AN=6;BaseQRankSum=1.8;ClippingRankSum=0.138;DP=69;FS=7.786;MLEAC=2;MLEAF=0.333;MQ=26.84;MQ0=0;MQRankSum=-1.906;QD=1.55;ReadPosRankSum=0.384 +1 14522 . G A . VQSRTrancheSNP99.95to100.00 AC=2;AF=0.333;AN=6;BaseQRankSum=2.044;ClippingRankSum=-2.196;DP=48;FS=13.179;MLEAC=2;MLEAF=0.333;MQ=25.89;MQ0=0;MQRankSum=-0.063;QD=8.87;ReadPosRankSum=0.952;VQSLOD=-3.333;culprit=MQ +1 63735 rs201888535 CCTA C . PASS AC=1;AF=0.167;AN=6;BaseQRankSum=1.138;ClippingRankSum=0.448;DB;DP=176;FS=13.597;MLEAC=1;MLEAF=0.167;MQ=31.06;MQ0=0;MQRankSum=0.636;QD=9.98;ReadPosRankSum=-1.18 +13 752721 rs3131972 A G . PASS AC=6;AF=1.0;AN=6;DB;DP=69;FS=0.0;MLEAC=6;MLEAF=1.0;MQ=60.0;MQ0=0;POSITIVE_TRAIN_SITE;QD=31.67;VQSLOD=18.94;culprit=QD +13 752791 . A G . PASS AC=6;AF=1.0;AN=6;DB;DP=69;FS=0.0;MLEAC=6;MLEAF=1.0;MQ=60.0;MQ0=0;POSITIVE_TRAIN_SITE;QD=31.67;VQSLOD=18.94;culprit=QD +2 19190 . GC G . PASS AC=3;AF=0.5;AN=6;BaseQRankSum=4.157;ClippingRankSum=3.666;DP=74;FS=37.037;MLEAC=3;MLEAF=0.5;MQ=22.26;MQ0=0;MQRankSum=0.195;QD=16.04;ReadPosRankSum=-4.072 diff --git a/adam-core/src/test/resources/sorted-variants.vcf b/adam-core/src/test/resources/sorted-variants.vcf new file mode 100644 index 0000000000..e57be7cd49 --- /dev/null +++ b/adam-core/src/test/resources/sorted-variants.vcf @@ -0,0 +1,74 @@ +##fileformat=VCFv4.2 +##FILTER= 200.0"> +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##GATKCommandLine= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1 14397 . CTGT C . 
IndelQD AC=2;AF=0.333;AN=6;BaseQRankSum=1.8;ClippingRankSum=0.138;DP=69;FS=7.786;MLEAC=2;MLEAF=0.333;MQ=26.84;MQ0=0;MQRankSum=-1.906;QD=1.55;ReadPosRankSum=0.384 +1 14522 . G A . VQSRTrancheSNP99.95to100.00 AC=2;AF=0.333;AN=6;BaseQRankSum=2.044;ClippingRankSum=-2.196;DP=48;FS=13.179;MLEAC=2;MLEAF=0.333;MQ=25.89;MQ0=0;MQRankSum=-0.063;QD=8.87;ReadPosRankSum=0.952;VQSLOD=-3.333;culprit=MQ +1 63735 rs201888535 CCTA C . PASS AC=1;AF=0.167;AN=6;BaseQRankSum=1.138;ClippingRankSum=0.448;DB;DP=176;FS=13.597;MLEAC=1;MLEAF=0.167;MQ=31.06;MQ0=0;MQRankSum=0.636;QD=9.98;ReadPosRankSum=-1.18 +2 19190 . GC G . PASS AC=3;AF=0.5;AN=6;BaseQRankSum=4.157;ClippingRankSum=3.666;DP=74;FS=37.037;MLEAC=3;MLEAF=0.5;MQ=22.26;MQ0=0;MQRankSum=0.195;QD=16.04;ReadPosRankSum=-4.072 +13 752721 rs3131972 A G . PASS AC=6;AF=1.0;AN=6;DB;DP=69;FS=0.0;MLEAC=6;MLEAF=1.0;MQ=60.0;MQ0=0;POSITIVE_TRAIN_SITE;QD=31.67;VQSLOD=18.94;culprit=QD +13 752791 . A G . PASS AC=6;AF=1.0;AN=6;DB;DP=69;FS=0.0;MLEAC=6;MLEAF=1.0;MQ=60.0;MQ0=0;POSITIVE_TRAIN_SITE;QD=31.67;VQSLOD=18.94;culprit=QD diff --git a/adam-core/src/test/resources/sorted.lex.vcf b/adam-core/src/test/resources/sorted.lex.vcf index af4582b7c9..b66aca8454 100644 --- a/adam-core/src/test/resources/sorted.lex.vcf +++ b/adam-core/src/test/resources/sorted.lex.vcf @@ -17,7 +17,7 @@ ##FORMAT= ##FORMAT= ##FORMAT= -##FORMAT= +##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= diff --git a/adam-core/src/test/resources/sorted.vcf b/adam-core/src/test/resources/sorted.vcf index c050c54a56..480618990c 100644 --- a/adam-core/src/test/resources/sorted.vcf +++ b/adam-core/src/test/resources/sorted.vcf @@ -17,7 +17,7 @@ ##FORMAT= ##FORMAT= ##FORMAT= -##FORMAT= +##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala index f96fc17b19..460105cb49 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala @@ -407,7 +407,7 @@ class ADAMContextSuite extends ADAMFunSuite { val path = testFile("bqsr1.vcf").replace("bqsr1", "*") val variants = sc.loadVcf(path).toVariantRDD - assert(variants.rdd.count === 722) + assert(variants.rdd.count === 734) } sparkTest("load vcf from a directory") { diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantContextRDDSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantContextRDDSuite.scala index 4514a6ce41..57bfc2f1e9 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantContextRDDSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantContextRDDSuite.scala @@ -19,6 +19,7 @@ package org.bdgenomics.adam.rdd.variant import com.google.common.collect.ImmutableList import com.google.common.io.Files +import htsjdk.samtools.ValidationStringency import java.io.File import org.bdgenomics.adam.models.{ SequenceDictionary, @@ -73,7 +74,7 @@ class VariantContextRDDSuite extends ADAMFunSuite { sparkTest("can write, then read in .vcf file") { val path = new File(tempDir, "test.vcf") - variants.saveAsVcf(TestSaveArgs(path.getAbsolutePath), false) + variants.saveAsVcf(TestSaveArgs(path.getAbsolutePath)) assert(path.exists) val vcRdd = sc.loadVcf("%s/test.vcf/part-r-00000".format(tempDir)) @@ -97,7 +98,11 @@ class VariantContextRDDSuite extends ADAMFunSuite { sparkTest("can write as a single file, then read in .vcf file") { val path = new File(tempDir, 
"test_single.vcf") - variants.saveAsVcf(path.getAbsolutePath, asSingleFile = true) + variants.saveAsVcf(path.getAbsolutePath, + asSingleFile = true, + deferMerging = false, + disableFastConcat = false, + ValidationStringency.LENIENT) assert(path.exists) val vcRdd = sc.loadVcf("%s/test_single.vcf".format(tempDir)) assert(vcRdd.rdd.count === 1) @@ -127,7 +132,10 @@ class VariantContextRDDSuite extends ADAMFunSuite { variants.sort() .saveAsVcf(outputPath, - asSingleFile = true) + asSingleFile = true, + deferMerging = false, + disableFastConcat = false, + ValidationStringency.LENIENT) checkFiles(outputPath, testFile("sorted.vcf")) } @@ -139,7 +147,10 @@ class VariantContextRDDSuite extends ADAMFunSuite { variants.sortLexicographically() .saveAsVcf(outputPath, - asSingleFile = true) + asSingleFile = true, + deferMerging = false, + disableFastConcat = false, + ValidationStringency.LENIENT) checkFiles(outputPath, testFile("sorted.lex.vcf")) } diff --git a/docs/source/01_intro.md b/docs/source/01_intro.md index e2913b9476..ff5c4a2fb5 100644 --- a/docs/source/01_intro.md +++ b/docs/source/01_intro.md @@ -108,32 +108,25 @@ Usage: adam-submit [ --] Choose one of the following commands: ADAM ACTIONS - depth : Calculate the depth from a given ADAM file, at each variant in a VCF - count_kmers : Counts the k-mers/q-mers from a read dataset. - count_contig_kmers : Counts the k-mers/q-mers from a read dataset. + countKmers : Counts the k-mers/q-mers from a read dataset. + countContigKmers : Counts the k-mers/q-mers from a read dataset. transform : Convert SAM/BAM to ADAM format and optionally perform read pre-processing transformations - adam2fastq : Convert BAM to FASTQ files - plugin : Executes an ADAMPlugin - flatten : Convert a ADAM format file to a version with a flattened schema, suitable for querying with tools like Impala + transformFeatures : Convert a file with sequence features into corresponding ADAM format and vice versa + transformGenotypes : Convert a file with genotypes into corresponding ADAM format and vice versa + transformVariants : Convert a file with variants into corresponding ADAM format and vice versa + mergeShards : Merges the shards of a file + reads2coverage : Calculate the coverage from a given ADAM file CONVERSION OPERATIONS - vcf2adam : Convert a VCF file to the corresponding ADAM format - anno2adam : Convert a annotation file (in VCF format) to the corresponding ADAM format - adam2vcf : Convert an ADAM variant to the VCF ADAM format fasta2adam : Converts a text FASTA sequence file into an ADAMNucleotideContig Parquet file which represents assembled sequences. adam2fasta : Convert ADAM nucleotide contig fragments to FASTA files - features2adam : Convert a file with sequence features into corresponding ADAM format - wigfix2bed : Locally convert a wigFix file to BED format + adam2fastq : Convert BAM to FASTQ files fragments2reads : Convert alignment records into fragment records. reads2fragments : Convert alignment records into fragment records. PRINT print : Print an ADAM formatted file - print_genes : Load a GTF file containing gene annotations and print the corresponding gene models flagstat : Print statistics on reads in an ADAM file (similar to samtools flagstat) - print_tags : Prints the values and counts of all tags in a set of records - listdict : Print the contents of an ADAM sequence dictionary - allelecount : Calculate Allele frequencies view : View certain reads from an alignment-record file. 
```
diff --git a/docs/source/50_cli.md b/docs/source/50_cli.md
index 9c7d66832b..5b388c3c09 100644
--- a/docs/source/50_cli.md
+++ b/docs/source/50_cli.md
@@ -241,6 +241,96 @@ options]{#legacy-output}, `transformFeatures` has one optional argument:
   Parquet), sets the number of partitions to load. If not provided, this is
   chosen by Spark.
 
+### transformGenotypes
+
+Loads a genotype file into the ADAM `Genotype` schema, and saves it back. The
+input and output formats are autodetected. Takes two required arguments:
+
+1. `INPUT`: The input path. A file containing genotypes in any of the supported
+   ADAM genotype input formats.
+2. `OUTPUT`: The path to save the transformed genotypes to. Supports any of ADAM's
+   genotype output formats.
+
+Beyond the [default options](#default-args) and the [legacy output
+options](#legacy-output), `transformGenotypes` has additional arguments:
+
+* `-coalesce`: Sets the number of partitions to coalesce the output to.
+  If `-force_shuffle_coalesce` is not provided, the Spark engine may ignore
+  the coalesce directive.
+* `-force_shuffle_coalesce`: Forces a shuffle that leads to the output being
+  saved with the number of partitions requested by `-coalesce`. This is
+  necessary if `-coalesce` would increase the number of partitions, or
+  if it would reduce the number of partitions to fewer than the number of
+  Spark executors. This may have a substantial performance cost, and will
+  invalidate any sort order.
+* `-sort_on_save`: Sorts the genotypes when saving, where contigs are ordered
+  by sequence index. Conflicts with `-sort_lexicographically_on_save`.
+* `-sort_lexicographically_on_save`: Sorts the genotypes when saving, where
+  contigs are ordered lexicographically. Conflicts with `-sort_on_save`.
+* `-single`: Saves the VCF file as headerless shards, and then merges the
+  sharded files into a single VCF.
+* `-stringency`: Sets the validation stringency for conversion.
+  Defaults to `STRICT`. See [validation stringency](#validation) for more
+  details.
+
+In this command, the validation stringency is applied to the
+individual genotypes. If a genotype fails validation, the
+individual genotype will be dropped (for lenient or silent validation);
+under strict validation, conversion will fail. Header lines are not validated.
+Due to a constraint imposed by the [htsjdk](https://github.com/samtools/htsjdk)
+library, which we use to parse VCF files, user-provided header lines that do not
+match the header line definitions from the
+[VCF 4.2](https://samtools.github.io/hts-specs/VCFv4.2.pdf) spec will be
+overridden with the line definitions from the specification. Unfortunately, this
+behavior cannot be disabled. If there is a mismatch between a user-provided
+header line and the spec in format/info field count or type, this will likely
+cause validation failures during conversion.
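+
+As a sketch of typical usage, the following hypothetical session converts a
+VCF file to Parquet genotypes and then back to a single VCF sorted by contig
+index (file paths are illustrative):
+
+```
+# VCF to ADAM Parquet genotypes
+adam-submit transformGenotypes genotypes.vcf genotypes.adam
+
+# Parquet genotypes back to a single, sorted VCF
+adam-submit transformGenotypes genotypes.adam genotypes.sorted.vcf \
+    -sort_on_save -single
+```
+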
+### transformVariants
+
+Loads a variant file into the ADAM `Variant` schema, and saves it back. The
+input and output formats are autodetected. Takes two required arguments:
+
+1. `INPUT`: The input path. A file containing variants in any of the supported
+   ADAM variant input formats.
+2. `OUTPUT`: The path to save the transformed variants to. Supports any of ADAM's
+   variant output formats.
+
+Beyond the [default options](#default-args) and the [legacy output
+options](#legacy-output), `transformVariants` has additional arguments:
+
+* `-coalesce`: Sets the number of partitions to coalesce the output to.
+  If `-force_shuffle_coalesce` is not provided, the Spark engine may ignore
+  the coalesce directive.
+* `-force_shuffle_coalesce`: Forces a shuffle that leads to the output being
+  saved with the number of partitions requested by `-coalesce`. This is
+  necessary if `-coalesce` would increase the number of partitions, or
+  if it would reduce the number of partitions to fewer than the number of
+  Spark executors. This may have a substantial performance cost, and will
+  invalidate any sort order.
+* `-sort_on_save`: Sorts the variants when saving, where contigs are ordered
+  by sequence index. Conflicts with `-sort_lexicographically_on_save`.
+* `-sort_lexicographically_on_save`: Sorts the variants when saving, where
+  contigs are ordered lexicographically. Conflicts with `-sort_on_save`.
+* `-single`: Saves the VCF file as headerless shards, and then merges the
+  sharded files into a single VCF.
+* `-stringency`: Sets the validation stringency for conversion.
+  Defaults to `STRICT`. See [validation stringency](#validation) for more
+  details.
+
+In this command, the validation stringency is applied to the
+individual variants. If a variant fails validation, the
+individual variant will be dropped (for lenient or silent validation);
+under strict validation, conversion will fail. Header lines are not validated.
+Due to a constraint imposed by the [htsjdk](https://github.com/samtools/htsjdk)
+library, which we use to parse VCF files, user-provided header lines that do not
+match the header line definitions from the
+[VCF 4.2](https://samtools.github.io/hts-specs/VCFv4.2.pdf) spec will be
+overridden with the line definitions from the specification. Unfortunately, this
+behavior cannot be disabled. If there is a mismatch between a user-provided
+header line and the spec in format/info field count or type, this will likely
+cause validation failures during conversion.
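+
+As a sketch of typical usage, the following hypothetical session converts a
+sites-only VCF file to Parquet variants and then back to a single,
+lexicographically sorted VCF (file paths are illustrative):
+
+```
+# VCF to ADAM Parquet variants
+adam-submit transformVariants variants.vcf variants.adam
+
+# Parquet variants back to a single, lexicographically sorted VCF
+adam-submit transformVariants variants.adam variants.sorted.vcf \
+    -sort_lexicographically_on_save -single
+```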
+
### mergeShards

A CLI tool for merging a [sharded legacy file](#legacy-output) that was written
@@ -292,68 +382,6 @@ following options:
 
These tools convert data between a legacy genomic file format and using ADAM's
schemas to store data in Parquet.
 
-### vcf2adam and adam2vcf
-
-These commands convert between VCF and Parquet using the Genotype and Variant
-schemas.
-
-`vcf2adam` takes two required arguments:
-
-1. `VCF`: The VCF file to convert to Parquet.
-2. `ADAM`: The path to save the converted Parquet data at.
-
-`vcf2adam` supports the full set of [default options](#default-args).
-Additionally, `vcf2adam` takes the following options:
-
-* `-only_variants`: Instead of saving the VCF file as Genotypes, only save the
-  Variants from the VCF. This is useful if loading a sites-only VCF, e.g., for
-  [BQSR](#known-snps) or [Indel realignment](#known-indels).
-* `-coalesce`: Sets the number of partitions to coalesce the output to.
-  If `-force_shuffle_coalesce` is not provided, the Spark engine may ignore
-  the coalesce directive.
-* `-force_shuffle_coalesce`: Forces a shuffle that leads to the output being
-  saved with the number of partitions requested by `-coalesce`. This is
-  necessary if the `-coalesce` would increase the number of partitions, or
-  if it would reduce the number of partitions to fewer than the number of
-  Spark executors. This may have a substantial performance cost, and will
-  invalidate any sort order.
-* `-stringency`: Sets the validation stringency for conversion.
-  Defaults to `LENIENT.` See [validation stringency](#validation) for more
-  details.
-
-`adam2vcf` takes two required arguments:
-
-1. `ADAM`: The Parquet file of Genotypes to convert to VCF.
-2. `VCF`: The path to save the VCF file to.
-
-`adam2vcf` only supports the `-print_metrics` option from the [default
-options](#default-args). Additionally, `adam2vcf` takes the following options:
-
-* `-coalesce`: Sets the number of partitions to coalesce the output to.
-  The Spark engine may ignore the coalesce directive.
-* `-sort_on_save`: Sorts the variants when saving, where contigs are ordered
-  by sequence index. Conflicts with `-sort_lexicographically_on_save`.
-* `-sort_lexicographically_on_save`: Sorts the variants when saving, where
-  contigs are ordered lexicographically. Conflicts with `-sort_on_save`.
-* `-single`: Saves the VCF file as headerless shards, and then merges the
-  sharded files into a single VCF.
-* `-stringency`: Sets the validation stringency for conversion.
-  Defaults to `LENIENT.` See [validation stringency](#validation) for more
-  details.
-
-In these commands, the validation stringency is applied to the
-individual variants and genotypes. If a variant or genotype fails validation, the
-individual variant or genotype will be dropped (for lenient or silent validation,
-under strict validation, conversion will fail). Header lines are not validated.
-Due to a constraint imposed by the [htsjdk](https://github.com/samtools/htsjdk)
-library, which we use to parse VCF files, user provided header lines that do not
-match the header line definitions from the
-[VCF 4.2](https://samtools.github.io/hts-specs/VCFv4.2.pdf) spec will be
-overridden with the line definitions from the specification. Unfortunately, this
-behavior cannot be disabled. If there is a user provided vs. spec mismatch in
-format/info field count or type, this will likely cause validation failures
-during conversion.
-
### fasta2adam and adam2fasta

These commands convert between FASTA and Parquet files storing assemblies using