Skip to content

Commit

Permalink
Add single file argument to count kmers.
Browse files Browse the repository at this point in the history
  • Loading branch information
heuermh committed May 6, 2022
1 parent 5070db2 commit a5d0ce6
Show file tree
Hide file tree
Showing 6 changed files with 148 additions and 15 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,9 @@ import grizzled.slf4j.Logging
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.bdgenomics.adam.cli.FileSystemUtils._
import org.bdgenomics.adam.projections.{ AlignmentField, Projection }
import org.bdgenomics.adam.ds.ADAMContext._
import org.bdgenomics.adam.projections.{ AlignmentField, Projection }
import org.bdgenomics.adam.util.TextRddWriter._
import org.bdgenomics.formats.avro.Alignment
import org.bdgenomics.utils.cli._
import org.kohsuke.args4j.{ Argument, Option => Args4jOption }
Expand All @@ -37,16 +38,26 @@ object CountReadKmers extends BDGCommandCompanion {
}

class CountReadKmersArgs extends Args4jBase with ParquetArgs with CramArgs {
@Argument(required = true, metaVar = "INPUT", usage = "The ADAM, BAM or SAM file to count kmers from", index = 0)
@Argument(required = true, metaVar = "INPUT", usage = "The ADAM, BAM or SAM file to count kmers from.", index = 0)
var inputPath: String = null
@Argument(required = true, metaVar = "OUTPUT", usage = "Location for storing k-mer counts", index = 1)

@Argument(required = true, metaVar = "OUTPUT", usage = "Location for storing k-mer counts.", index = 1)
var outputPath: String = null
@Argument(required = true, metaVar = "KMER_LENGTH", usage = "Length of k-mers", index = 2)

@Argument(required = true, metaVar = "KMER_LENGTH", usage = "Length of k-mers.", index = 2)
var kmerLength: Int = 0

@Args4jOption(required = false, name = "-print_histogram", usage = "Prints a histogram of counts.")
var printHistogram: Boolean = false
@Args4jOption(required = false, name = "-repartition", usage = "Set the number of partitions to map data to")

@Args4jOption(required = false, name = "-repartition", usage = "Set the number of partitions to map data to.")
var repartition: Int = -1

@Args4jOption(required = false, name = "-single", usage = "Save as a single file, for text format.")
var asSingleFile: Boolean = false

@Args4jOption(required = false, name = "-disable_fast_concat", usage = "Disables the parallel file concatenation engine.")
var disableFastConcat: Boolean = false
}

class CountReadKmers(protected val args: CountReadKmersArgs) extends BDGSparkCommand[CountReadKmersArgs] with Logging {
Expand All @@ -71,11 +82,11 @@ class CountReadKmers(protected val args: CountReadKmersArgs) extends BDGSparkCom
// count kmers
val countedKmers = adamRecords.countKmers(args.kmerLength)

// cache counted kmers
countedKmers.cache()

// print histogram, if requested
if (args.printHistogram) {
// cache counted kmers
countedKmers.cache()

countedKmers.map(kv => kv._2.toLong)
.countByValue()
.toSeq
Expand All @@ -84,6 +95,9 @@ class CountReadKmers(protected val args: CountReadKmersArgs) extends BDGSparkCom
}

// save as text file
countedKmers.saveAsTextFile(args.outputPath)
writeTextRdd(countedKmers.map(kv => kv._1 + "\t" + kv._2),
args.outputPath,
asSingleFile = args.asSingleFile,
disableFastConcat = args.disableFastConcat)
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import grizzled.slf4j.Logging
import org.apache.spark.SparkContext
import org.bdgenomics.adam.ds.ADAMContext._
import org.bdgenomics.adam.cli.FileSystemUtils._
import org.bdgenomics.adam.util.TextRddWriter._
import org.bdgenomics.utils.cli._
import org.kohsuke.args4j.{ Argument, Option => Args4jOption }

Expand All @@ -34,14 +35,23 @@ object CountSliceKmers extends BDGCommandCompanion {
}

class CountSliceKmersArgs extends Args4jBase with ParquetArgs {
@Argument(required = true, metaVar = "INPUT", usage = "The ADAM or FASTA file to count kmers from", index = 0)
@Argument(required = true, metaVar = "INPUT", usage = "The ADAM or FASTA file to count kmers from.", index = 0)
var inputPath: String = null
@Argument(required = true, metaVar = "OUTPUT", usage = "Location for storing k-mer counts", index = 1)

@Argument(required = true, metaVar = "OUTPUT", usage = "Location for storing k-mer counts.", index = 1)
var outputPath: String = null
@Argument(required = true, metaVar = "KMER_LENGTH", usage = "Length of k-mers", index = 2)

@Argument(required = true, metaVar = "KMER_LENGTH", usage = "Length of k-mers.", index = 2)
var kmerLength: Int = 0

@Args4jOption(required = false, name = "-print_histogram", usage = "Prints a histogram of counts.")
var printHistogram: Boolean = false

@Args4jOption(required = false, name = "-single", usage = "Save as a single file, for text format.")
var asSingleFile: Boolean = false

@Args4jOption(required = false, name = "-disable_fast_concat", usage = "Disables the parallel file concatenation engine.")
var disableFastConcat: Boolean = false
}

class CountSliceKmers(protected val args: CountSliceKmersArgs) extends BDGSparkCommand[CountSliceKmersArgs] with Logging {
Expand All @@ -52,9 +62,10 @@ class CountSliceKmers(protected val args: CountSliceKmersArgs) extends BDGSparkC

// read from disk
val slices = sc.loadSlices(args.inputPath)
val withReferences = if (slices.references.size == 0) slices.createReferences() else slices

// count kmers
val countedKmers = slices.countKmers(args.kmerLength)
val countedKmers = withReferences.countKmers(args.kmerLength)

// print histogram, if requested
if (args.printHistogram) {
Expand All @@ -69,7 +80,9 @@ class CountSliceKmers(protected val args: CountSliceKmersArgs) extends BDGSparkC
}

// save as text file
countedKmers.map(kv => kv._1 + ", " + kv._2)
.saveAsTextFile(args.outputPath)
writeTextRdd(countedKmers.map(kv => kv._1 + "\t" + kv._2),
args.outputPath,
asSingleFile = args.asSingleFile,
disableFastConcat = args.disableFastConcat)
}
}
41 changes: 41 additions & 0 deletions adam-cli/src/test/resources/artificial.counts.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
GGGGGGGGAAAAAAAAAAGGG 1
AAAAAAAAAAAAAAAAAAAAG 1
GGGGAAAAAAAAAAGGGGGGG 1
GGGGGGGAAAAAAAAAAGGGG 1
AGGGGGGGGGGAAAAAAAAAA 2
AAAAAAAAAAAAAGGGGGGGG 1
GGGGGAAAAAAAAAAGGGGGG 1
GAAAAAAAAAAAAAAAAAAAA 1
AAAAAAAAAAAAAAAGGGGGG 1
GGGGGGGGGAAAAAAAAAAAA 1
AAAAAAAAAAAAAAAAAAAGG 1
AAAGGGGGGGGGGAAAAAAAA 2
GGGGGGGGAAAAAAAAAAAAA 1
GGAAAAAAAAAAGGGGGGGGG 1
GGGGGGAAAAAAAAAAGGGGG 1
AAAAGGGGGGGGGGAAAAAAA 2
GGGGGGGAAAAAAAAAAAAAA 1
AAAAAAAAAAAAAAGGGGGGG 1
GGGGGGGGGGAAAAAAAAAAG 1
AAAAAAAAAGGGGGGGGGGAA 2
AAGGGGGGGGGGAAAAAAAAA 2
GGAAAAAAAAAAAAAAAAAAA 1
AAAAAAAAAAAAAAAAGGGGG 1
AAAAAAAAAAAGGGGGGGGGG 1
AAAAAAAAAAAAAAAAAGGGG 1
GGGGGGGGGGAAAAAAAAAAA 1
GGGAAAAAAAAAAAAAAAAAA 1
GGGGGGGGGAAAAAAAAAAGG 1
GAAAAAAAAAAGGGGGGGGGG 1
AAAAAAGGGGGGGGGGAAAAA 2
AAAAAGGGGGGGGGGAAAAAA 2
GGGAAAAAAAAAAGGGGGGGG 1
GGGGGAAAAAAAAAAAAAAAA 1
AAAAAAAAAAAAAAAAAAAAA 1050
AAAAAAAAAAAAAAAAAAGGG 1
GGGGGGAAAAAAAAAAAAAAA 1
AAAAAAAAGGGGGGGGGGAAA 2
GGGGAAAAAAAAAAAAAAAAA 1
AAAAAAAGGGGGGGGGGAAAA 2
AAAAAAAAAAGGGGGGGGGGA 2
AAAAAAAAAAAAGGGGGGGGG 1
3 changes: 3 additions & 0 deletions adam-cli/src/test/resources/sorted.counts.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
ACACACAC 1
ACACACACACAC 1
ACACACACAC 3
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
/**
* Licensed to Big Data Genomics (BDG) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The BDG licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.bdgenomics.adam.cli

import org.bdgenomics.adam.ds.ADAMContext._
import org.bdgenomics.adam.util.ADAMFunSuite

class CountReadKmersSuite extends ADAMFunSuite {
sparkTest("count kmers to single file") {
val inputPath = copyResource("sorted.sam")
val actualPath = tmpFile("sorted.counts.txt")
val expectedPath = copyResource("sorted.counts.txt")
CountReadKmers(Array("-single", inputPath, actualPath, "21")).run(sc)
checkFiles(expectedPath, actualPath)
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
/**
* Licensed to Big Data Genomics (BDG) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The BDG licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.bdgenomics.adam.cli

import org.bdgenomics.adam.ds.ADAMContext._
import org.bdgenomics.adam.util.ADAMFunSuite

class CountSliceKmersSuite extends ADAMFunSuite {
sparkTest("count slice kmers to single file") {
val inputPath = copyResource("artificial.fa")
val actualPath = tmpFile("artificial.counts.txt")
val expectedPath = copyResource("artificial.counts.txt")
CountSliceKmers(Array("-single", inputPath, actualPath, "21")).run(sc)
checkFiles(expectedPath, actualPath)
}
}

0 comments on commit a5d0ce6

Please sign in to comment.