
Convert to VariantContextRDD for sorting genotypes, update docs.
heuermh committed May 18, 2017
1 parent 3f7b84c commit 1a60973
Showing 5 changed files with 145 additions and 84 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -72,12 +72,12 @@ ADAM ACTIONS
countContigKmers : Counts the k-mers/q-mers from a read dataset.
transform : Convert SAM/BAM to ADAM format and optionally perform read pre-processing transformations
transformFeatures : Convert a file with sequence features into corresponding ADAM format and vice versa
transformGenotypes : Convert a file with genotypes into corresponding ADAM format and vice versa
transformVariants : Convert a file with variants into corresponding ADAM format and vice versa
mergeShards : Merges the shards of a file
reads2coverage : Calculate the coverage from a given ADAM file
CONVERSION OPERATIONS
vcf2adam : Convert a VCF file to the corresponding ADAM format
adam2vcf : Convert an ADAM variant file to the VCF format
fasta2adam : Converts a text FASTA sequence file into an ADAMNucleotideContig Parquet file which represents assembled sequences.
adam2fasta : Convert ADAM nucleotide contig fragments to FASTA files
adam2fastq : Convert BAM to FASTQ files
@@ -69,12 +69,21 @@ class TransformGenotypesArgs extends Args4jBase with ADAMSaveAnyArgs with Parque
var sortFastqOutput: Boolean = false
}

/**
* Convert a file with genotypes into corresponding ADAM format and vice versa.
*/
class TransformGenotypes(val args: TransformGenotypesArgs)
extends BDGSparkCommand[TransformGenotypesArgs] {
val companion = TransformGenotypes
val stringency = ValidationStringency.valueOf(args.stringency)

private def maybeCoalesce(rdd: GenotypeRDD): GenotypeRDD = {
/**
* Coalesce the specified VariantContextRDD if requested.
*
* @param rdd VariantContextRDD to coalesce.
* @return The specified VariantContextRDD coalesced if requested.
*/
private def maybeCoalesce(rdd: VariantContextRDD): VariantContextRDD = {
if (args.coalesce != -1) {
log.info("Coalescing the number of partitions to '%d'".format(args.coalesce))
if (args.coalesce > rdd.rdd.partitions.length || args.forceShuffle) {
@@ -87,7 +96,13 @@ class TransformGenotypes(val args: TransformGenotypesArgs)
}
}

private def maybeSort(rdd: GenotypeRDD): GenotypeRDD = {
/**
* Sort the specified VariantContextRDD if requested.
*
* @param rdd VariantContextRDD to sort.
* @return The specified VariantContextRDD sorted if requested.
*/
private def maybeSort(rdd: VariantContextRDD): VariantContextRDD = {
if (args.sort) {
log.info("Sorting before saving")
rdd.sort()
@@ -109,13 +124,23 @@ class TransformGenotypes(val args: TransformGenotypesArgs)
optProjection = None,
stringency = stringency)

// convert to variant contexts
val variantContexts = genotypes.toVariantContextRDD

// coalesce if requested
val maybeCoalescedGenotypes = maybeCoalesce(genotypes)
val maybeCoalescedVariantContexts = maybeCoalesce(variantContexts)

// sort or sort lexicographically if requested
val maybeSortedGenotypes = maybeSort(maybeCoalescedGenotypes)
val maybeSortedVariantContexts = maybeSort(maybeCoalescedVariantContexts)

// save as VCF or Parquet
maybeSortedGenotypes.save(args, stringency)
if (args.outputPath.endsWith(".vcf")) {
maybeSortedVariantContexts.saveAsVcf(args, stringency)
} else {
// convert back to genotypes
val maybeSortedGenotypes = maybeSortedVariantContexts.toGenotypeRDD
// and save as Parquet
maybeSortedGenotypes.saveAsParquet(args)
}
}
}
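The branch at the end of `run` writes VCF when the output path ends in `.vcf`, and otherwise converts back to genotypes and writes Parquet. That dispatch comes down to a file-extension check, sketched here in Python (a hypothetical helper for illustration, not part of ADAM):

```python
def output_format(output_path):
    """Mirror the dispatch in run(): VCF for *.vcf paths, Parquet otherwise.

    In the Scala code above, only the VCF branch writes the (possibly
    coalesced and sorted) VariantContextRDD directly; the Parquet branch
    first converts it back to a GenotypeRDD.
    """
    return "vcf" if output_path.endswith(".vcf") else "parquet"
```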
@@ -69,11 +69,20 @@ class TransformVariantsArgs extends Args4jBase with ADAMSaveAnyArgs with Parquet
var sortFastqOutput: Boolean = false
}

/**
* Convert a file with variants into corresponding ADAM format and vice versa.
*/
class TransformVariants(val args: TransformVariantsArgs)
extends BDGSparkCommand[TransformVariantsArgs] {
val companion = TransformVariants
val stringency = ValidationStringency.valueOf(args.stringency)

/**
* Coalesce the specified VariantRDD if requested.
*
* @param rdd VariantRDD to coalesce.
* @return The specified VariantRDD coalesced if requested.
*/
private def maybeCoalesce(rdd: VariantRDD): VariantRDD = {
if (args.coalesce != -1) {
log.info("Coalescing the number of partitions to '%d'".format(args.coalesce))
@@ -87,6 +96,12 @@ class TransformVariants(val args: TransformVariantsArgs)
}
}

/**
* Sort the specified VariantRDD if requested.
*
* @param rdd VariantRDD to sort.
* @return The specified VariantRDD sorted if requested.
*/
private def maybeSort(rdd: VariantRDD): VariantRDD = {
if (args.sort) {
log.info("Sorting before saving")
23 changes: 8 additions & 15 deletions docs/source/01_intro.md
@@ -108,32 +108,25 @@ Usage: adam-submit [<spark-args> --] <adam-args>
Choose one of the following commands:
ADAM ACTIONS
depth : Calculate the depth from a given ADAM file, at each variant in a VCF
count_kmers : Counts the k-mers/q-mers from a read dataset.
count_contig_kmers : Counts the k-mers/q-mers from a read dataset.
countKmers : Counts the k-mers/q-mers from a read dataset.
countContigKmers : Counts the k-mers/q-mers from a read dataset.
transform : Convert SAM/BAM to ADAM format and optionally perform read pre-processing transformations
adam2fastq : Convert BAM to FASTQ files
plugin : Executes an ADAMPlugin
flatten : Convert a ADAM format file to a version with a flattened schema, suitable for querying with tools like Impala
transformFeatures : Convert a file with sequence features into corresponding ADAM format and vice versa
transformGenotypes : Convert a file with genotypes into corresponding ADAM format and vice versa
transformVariants : Convert a file with variants into corresponding ADAM format and vice versa
mergeShards : Merges the shards of a file
reads2coverage : Calculate the coverage from a given ADAM file
CONVERSION OPERATIONS
vcf2adam : Convert a VCF file to the corresponding ADAM format
anno2adam : Convert an annotation file (in VCF format) to the corresponding ADAM format
adam2vcf : Convert an ADAM variant file to the VCF format
fasta2adam : Converts a text FASTA sequence file into an ADAMNucleotideContig Parquet file which represents assembled sequences.
adam2fasta : Convert ADAM nucleotide contig fragments to FASTA files
features2adam : Convert a file with sequence features into corresponding ADAM format
wigfix2bed : Locally convert a wigFix file to BED format
adam2fastq : Convert BAM to FASTQ files
fragments2reads : Convert alignment records into fragment records.
reads2fragments : Convert alignment records into fragment records.
PRINT
print : Print an ADAM formatted file
print_genes : Load a GTF file containing gene annotations and print the corresponding gene models
flagstat : Print statistics on reads in an ADAM file (similar to samtools flagstat)
print_tags : Prints the values and counts of all tags in a set of records
listdict : Print the contents of an ADAM sequence dictionary
allelecount : Calculate Allele frequencies
view : View certain reads from an alignment-record file.
```

152 changes: 90 additions & 62 deletions docs/source/50_cli.md
@@ -241,6 +241,96 @@ options](#legacy-output), `transformFeatures` has one optional argument:
Parquet), sets the number of partitions to load. If not provided, this is
chosen by Spark.

### transformGenotypes

Loads a genotype file into the ADAM `Genotype` schema, and saves it back. The
input and output formats are autodetected. Takes two required arguments:

1. `INPUT`: The input path. A file containing genotypes in any of the supported
ADAM genotype input formats.
2. `OUTPUT`: The path to save the transformed genotypes to. Supports any of ADAM's
genotype output formats.

Beyond the [default options](#default-args) and the [legacy output
options](#legacy-output), `transformGenotypes` has additional arguments:

* `-coalesce`: Sets the number of partitions to coalesce the output to.
If `-force_shuffle_coalesce` is not provided, the Spark engine may ignore
the coalesce directive.
* `-force_shuffle_coalesce`: Forces a shuffle that leads to the output being
saved with the number of partitions requested by `-coalesce`. This is
necessary if the `-coalesce` would increase the number of partitions, or
if it would reduce the number of partitions to fewer than the number of
Spark executors. This may have a substantial performance cost, and will
invalidate any sort order.
* `-sort_on_save`: Sorts the genotypes when saving, where contigs are ordered
by sequence index. Conflicts with `-sort_lexicographically_on_save`.
* `-sort_lexicographically_on_save`: Sorts the genotypes when saving, where
contigs are ordered lexicographically. Conflicts with `-sort_on_save`.
* `-single`: Saves the VCF file as headerless shards, and then merges the
sharded files into a single VCF.
* `-stringency`: Sets the validation stringency for conversion.
Defaults to `LENIENT`. See [validation stringency](#validation) for more
details.
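The interaction between `-coalesce` and `-force_shuffle_coalesce` described above can be sketched in Python (a hypothetical function for illustration, mirroring the check in `maybeCoalesce`):

```python
def coalesce_needs_shuffle(target_partitions, current_partitions,
                           force_shuffle=False):
    """A plain coalesce can only reduce the partition count, so increasing
    it (or explicitly requesting -force_shuffle_coalesce) requires a full
    shuffle; otherwise Spark may ignore the coalesce directive."""
    return force_shuffle or target_partitions > current_partitions
```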

In this command, the validation stringency is applied to the individual
genotypes. If a genotype fails validation, the individual genotype is
dropped under lenient or silent validation; under strict validation,
conversion fails. Header lines are not validated.
Due to a constraint imposed by the [htsjdk](https://github.com/samtools/htsjdk)
library, which we use to parse VCF files, user-provided header lines that do
not match the header line definitions from the
[VCF 4.2](https://samtools.github.io/hts-specs/VCFv4.2.pdf) spec will be
overridden with the line definitions from the specification. Unfortunately,
this behavior cannot be disabled. If a user-provided header line disagrees
with the spec in format/info field count or type, this will likely cause
validation failures during conversion.
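The per-genotype stringency behavior can be sketched as follows (Python, with hypothetical names; in ADAM, lenient validation would additionally log a warning for each dropped record):

```python
def filter_by_stringency(records, is_valid, stringency="LENIENT"):
    """Keep valid records; drop invalid ones under LENIENT/SILENT
    validation, and fail the whole conversion under STRICT."""
    kept = []
    for record in records:
        if is_valid(record):
            kept.append(record)
        elif stringency == "STRICT":
            raise ValueError("validation failed for %r" % (record,))
        # LENIENT / SILENT: the invalid record is dropped
    return kept
```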

### transformVariants

Loads a variant file into the ADAM `Variant` schema, and saves it back. The
input and output formats are autodetected. Takes two required arguments:

1. `INPUT`: The input path. A file containing variants in any of the supported
ADAM variant input formats.
2. `OUTPUT`: The path to save the transformed variants to. Supports any of ADAM's
variant output formats.

Beyond the [default options](#default-args) and the [legacy output
options](#legacy-output), `transformVariants` has additional arguments:

* `-coalesce`: Sets the number of partitions to coalesce the output to.
If `-force_shuffle_coalesce` is not provided, the Spark engine may ignore
the coalesce directive.
* `-force_shuffle_coalesce`: Forces a shuffle that leads to the output being
saved with the number of partitions requested by `-coalesce`. This is
necessary if the `-coalesce` would increase the number of partitions, or
if it would reduce the number of partitions to fewer than the number of
Spark executors. This may have a substantial performance cost, and will
invalidate any sort order.
* `-sort_on_save`: Sorts the variants when saving, where contigs are ordered
by sequence index. Conflicts with `-sort_lexicographically_on_save`.
* `-sort_lexicographically_on_save`: Sorts the variants when saving, where
contigs are ordered lexicographically. Conflicts with `-sort_on_save`.
* `-single`: Saves the VCF file as headerless shards, and then merges the
sharded files into a single VCF.
* `-stringency`: Sets the validation stringency for conversion.
Defaults to `LENIENT`. See [validation stringency](#validation) for more
details.
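The two sort flags above are mutually exclusive; a hypothetical Python sketch of how the choice resolves:

```python
def sort_mode(sort_on_save=False, sort_lexicographically_on_save=False):
    """Resolve the two conflicting sort flags into a single sort mode."""
    if sort_on_save and sort_lexicographically_on_save:
        raise ValueError(
            "-sort_on_save conflicts with -sort_lexicographically_on_save")
    if sort_on_save:
        return "sequence_index"   # contigs ordered by sequence index
    if sort_lexicographically_on_save:
        return "lexicographic"    # contigs ordered lexicographically
    return None                   # no sorting requested
```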

In this command, the validation stringency is applied to the individual
variants. If a variant fails validation, the individual variant is
dropped under lenient or silent validation; under strict validation,
conversion fails. Header lines are not validated.
Due to a constraint imposed by the [htsjdk](https://github.com/samtools/htsjdk)
library, which we use to parse VCF files, user-provided header lines that do
not match the header line definitions from the
[VCF 4.2](https://samtools.github.io/hts-specs/VCFv4.2.pdf) spec will be
overridden with the line definitions from the specification. Unfortunately,
this behavior cannot be disabled. If a user-provided header line disagrees
with the spec in format/info field count or type, this will likely cause
validation failures during conversion.

### mergeShards

A CLI tool for merging a [sharded legacy file](#legacy-output) that was written
@@ -292,68 +382,6 @@ following options:
These tools convert data between a legacy genomic file format and using ADAM's
schemas to store data in Parquet.

### vcf2adam and adam2vcf

These commands convert between VCF and Parquet using the Genotype and Variant
schemas.

`vcf2adam` takes two required arguments:

1. `VCF`: The VCF file to convert to Parquet.
2. `ADAM`: The path to save the converted Parquet data at.

`vcf2adam` supports the full set of [default options](#default-args).
Additionally, `vcf2adam` takes the following options:

* `-only_variants`: Instead of saving the VCF file as Genotypes, only save the
Variants from the VCF. This is useful if loading a sites-only VCF, e.g., for
[BQSR](#known-snps) or [Indel realignment](#known-indels).
* `-coalesce`: Sets the number of partitions to coalesce the output to.
If `-force_shuffle_coalesce` is not provided, the Spark engine may ignore
the coalesce directive.
* `-force_shuffle_coalesce`: Forces a shuffle that leads to the output being
saved with the number of partitions requested by `-coalesce`. This is
necessary if the `-coalesce` would increase the number of partitions, or
if it would reduce the number of partitions to fewer than the number of
Spark executors. This may have a substantial performance cost, and will
invalidate any sort order.
* `-stringency`: Sets the validation stringency for conversion.
Defaults to `LENIENT`. See [validation stringency](#validation) for more
details.

`adam2vcf` takes two required arguments:

1. `ADAM`: The Parquet file of Genotypes to convert to VCF.
2. `VCF`: The path to save the VCF file to.

`adam2vcf` only supports the `-print_metrics` option from the [default
options](#default-args). Additionally, `adam2vcf` takes the following options:

* `-coalesce`: Sets the number of partitions to coalesce the output to.
The Spark engine may ignore the coalesce directive.
* `-sort_on_save`: Sorts the variants when saving, where contigs are ordered
by sequence index. Conflicts with `-sort_lexicographically_on_save`.
* `-sort_lexicographically_on_save`: Sorts the variants when saving, where
contigs are ordered lexicographically. Conflicts with `-sort_on_save`.
* `-single`: Saves the VCF file as headerless shards, and then merges the
sharded files into a single VCF.
* `-stringency`: Sets the validation stringency for conversion.
Defaults to `LENIENT`. See [validation stringency](#validation) for more
details.

In these commands, the validation stringency is applied to the individual
variants and genotypes. If a variant or genotype fails validation, it is
dropped under lenient or silent validation; under strict validation,
conversion fails. Header lines are not validated.
Due to a constraint imposed by the [htsjdk](https://github.com/samtools/htsjdk)
library, which we use to parse VCF files, user-provided header lines that do
not match the header line definitions from the
[VCF 4.2](https://samtools.github.io/hts-specs/VCFv4.2.pdf) spec will be
overridden with the line definitions from the specification. Unfortunately,
this behavior cannot be disabled. If a user-provided header line disagrees
with the spec in format/info field count or type, this will likely cause
validation failures during conversion.

### fasta2adam and adam2fasta

These commands convert between FASTA and Parquet files storing assemblies using
