Skip to content

Commit

Permalink
Account for duplicate kmers on left flank.
Browse files Browse the repository at this point in the history
  • Loading branch information
heuermh committed May 9, 2022
1 parent a5d0ce6 commit 78772a3
Showing 1 changed file with 12 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -580,10 +580,18 @@ sealed abstract class SliceDataset extends AvroGenomicDataset[Slice, SliceProduc
*/
def countKmers(kmerLength: Int): RDD[(String, Long)] = {
  flankAdjacent(kmerLength).rdd.flatMap(r => {
    val sequence = r.getSequence

    // A slice starting at position 0 has no left flank, so its whole sequence
    // is cut into k-mers. For every other slice the first kmerLength + 1
    // characters are skipped so that k-mers overlapping the left flank are not
    // counted a second time.
    // NOTE(review): assumes flankAdjacent prepends exactly kmerLength bases and
    // that the preceding slice already counted the k-mers reaching into this
    // one -- confirm against flankAdjacent.
    // drop (rather than substring) yields an empty string instead of throwing
    // when the sequence is shorter than kmerLength + 1.
    val uncounted =
      if (r.getStart == 0) sequence
      else sequence.drop(kmerLength + 1)

    // sliding emits a single shorter window when the input is non-empty but
    // shorter than kmerLength; discard it so only full-length k-mers are counted
    uncounted
      .sliding(kmerLength)
      .filter(_.length == kmerLength)
      .map(k => (k, 1L))
  }).reduceByKey((k1: Long, k2: Long) => k1 + k2)
}

Expand Down

0 comments on commit 78772a3

Please sign in to comment.