Add sort and maximum length arguments.
heuermh committed May 9, 2022
1 parent 78772a3 commit cc368a5
Showing 2 changed files with 16 additions and 3 deletions.
@@ -50,6 +50,9 @@ class CountReadKmersArgs extends Args4jBase with ParquetArgs with CramArgs {
@Args4jOption(required = false, name = "-print_histogram", usage = "Prints a histogram of counts.")
var printHistogram: Boolean = false

@Args4jOption(required = false, name = "-sort", usage = "Sort kmers before writing.")
var sort: Boolean = false

@Args4jOption(required = false, name = "-repartition", usage = "Set the number of partitions to map data to.")
var repartition: Int = -1

@@ -94,8 +97,10 @@ class CountReadKmers(protected val args: CountReadKmersArgs) extends BDGSparkCom
.foreach(println)
}

+ val maybeSorted = if (args.sort) countedKmers.sortBy(_._1) else countedKmers
+
// save as text file
- writeTextRdd(countedKmers.map(kv => kv._1 + "\t" + kv._2),
+ writeTextRdd(maybeSorted.map(kv => kv._1 + "\t" + kv._2),
args.outputPath,
asSingleFile = args.asSingleFile,
disableFastConcat = args.disableFastConcat)
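For context, here is a minimal standalone sketch of the optional-sort pattern introduced above, outside of ADAM's command machinery. The SparkContext setup and sample (kmer, count) pairs are illustrative stand-ins, not part of this commit:

import org.apache.spark.{SparkConf, SparkContext}

object SortedKmerCountsSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("sorted-kmer-counts-sketch").setMaster("local[*]")
    val sc = new SparkContext(conf)

    // Stand-in for the (kmer, count) pairs produced by the command's k-mer counting step.
    val countedKmers = sc.parallelize(Seq(("GATTAC", 3L), ("ACGTAC", 7L), ("TTTTTT", 1L)))

    // Mirrors the commit: sort lexicographically by k-mer only when -sort is given.
    val sort = true
    val maybeSorted = if (sort) countedKmers.sortBy(_._1) else countedKmers

    // Same tab-separated layout the command writes via writeTextRdd.
    maybeSorted.map(kv => kv._1 + "\t" + kv._2).collect().foreach(println)

    sc.stop()
  }
}

Since sortBy triggers a shuffle, gating it behind a flag that defaults to false keeps the existing unsorted path cost-free.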
@@ -47,6 +47,12 @@ class CountSliceKmersArgs extends Args4jBase with ParquetArgs {
@Args4jOption(required = false, name = "-print_histogram", usage = "Prints a histogram of counts.")
var printHistogram: Boolean = false

@Args4jOption(required = false, name = "-maximum_length", usage = "Maximum slice length. Defaults to 10000L.")
var maximumLength: Long = 10000L

@Args4jOption(required = false, name = "-sort", usage = "Sort kmers before writing.")
var sort: Boolean = false

@Args4jOption(required = false, name = "-single", usage = "Save as a single file, for text format.")
var asSingleFile: Boolean = false

@@ -61,7 +67,7 @@ class CountSliceKmers(protected val args: CountSliceKmersArgs) extends BDGSparkC
checkWriteablePath(args.outputPath, sc.hadoopConfiguration)

// read from disk
- val slices = sc.loadSlices(args.inputPath)
+ val slices = sc.loadSlices(args.inputPath, maximumLength = args.maximumLength)
val withReferences = if (slices.references.size == 0) slices.createReferences() else slices

// count kmers
@@ -79,8 +85,10 @@ class CountSliceKmers(protected val args: CountSliceKmersArgs) extends BDGSparkC
.foreach(println)
}

+ val maybeSorted = if (args.sort) countedKmers.sortBy(_._1) else countedKmers
+
// save as text file
- writeTextRdd(countedKmers.map(kv => kv._1 + "\t" + kv._2),
+ writeTextRdd(maybeSorted.map(kv => kv._1 + "\t" + kv._2),
args.outputPath,
asSingleFile = args.asSingleFile,
disableFastConcat = args.disableFastConcat)
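Similarly, a self-contained sketch of how the two new CountSliceKmers arguments fit together. The slicing and counting below are plain-Spark stand-ins, not ADAM's loadSlices or countKmers; they are only meant to show the roles of -maximum_length and -sort:

import org.apache.spark.{SparkConf, SparkContext}

object SliceKmerCountsSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("slice-kmer-counts-sketch").setMaster("local[*]")
    val sc = new SparkContext(conf)

    // Illustrative stand-ins for the new command-line arguments.
    val maximumLength = 8L // -maximum_length: upper bound on slice length
    val sort = true        // -sort: order k-mers before writing
    val kmerLength = 4

    // Toy sequences standing in for loaded contigs; ADAM's loadSlices caps
    // slice length at maximumLength, here approximated with grouped().
    val sequences = sc.parallelize(Seq("ACGTACGTACGTACG", "TTTTGGGGCCCCAAAA"))
    val slices = sequences.flatMap(_.grouped(maximumLength.toInt))

    // Count k-mers within each slice, then sort only if requested.
    val countedKmers = slices
      .flatMap(_.sliding(kmerLength).filter(_.length == kmerLength))
      .map(kmer => (kmer, 1L))
      .reduceByKey(_ + _)
    val maybeSorted = if (sort) countedKmers.sortBy(_._1) else countedKmers

    maybeSorted.map(kv => kv._1 + "\t" + kv._2).collect().foreach(println)
    sc.stop()
  }
}

Note that in this per-slice sketch, k-mers spanning slice boundaries are not counted, so a smaller maximum length trades boundary coverage for smaller records.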
