-
Notifications
You must be signed in to change notification settings - Fork 4
/
ska_dict.rs
333 lines (300 loc) · 10.7 KB
/
ska_dict.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
//! Type for building a split k-mer dictionary from a fasta/fastq file.
//!
//! The dictionary has the concatenated left and right part of the split k-mer
//! (bit-encoded DNA bases) as keys, and ASCII IUPAC bases as values for the
//! middle base.
//!
//! Prefer to use [`crate::merge_ska_dict::build_and_merge`] over this
//! function directly.
//!
//! If you want to use this directly, build with [`SkaDict::new()`].
//!
//! These should then be converted or merged into to [`crate::merge_ska_dict::MergeSkaDict`], which
//! lets you do more useful things with them.
//!
//! Note that you should use an appropriate `sample_idx` when building if you
//! know how many you will be merging, which will let you use [`crate::merge_ska_dict::MergeSkaDict::merge()`].
//! Otherwise you will need to use the slower [`crate::merge_ska_dict::MergeSkaDict::extend()`]
use std::cmp::Ordering;
use hashbrown::HashMap;
extern crate needletail;
use needletail::{parse_fastx_file, parser::Format};
pub mod split_kmer;
use super::QualOpts;
use crate::ska_dict::split_kmer::SplitKmer;
pub mod bit_encoding;
use crate::ska_dict::bit_encoding::{decode_base, UInt, IUPAC};
pub mod bloom_filter;
use crate::ska_dict::bloom_filter::KmerFilter;
pub mod nthash;
/// Holds the split-kmer dictionary, and basic information such as k-mer size.
#[derive(Debug, Clone, Default)]
pub struct SkaDict<IntT> {
/// K-mer size
k: usize,
/// Whether reverse-complement was counted
rc: bool,
/// Sample index, if being used in a merge
sample_idx: usize,
/// Sample name
name: String,
/// Split k-mer dictionary split-k:middle-base
split_kmers: HashMap<IntT, u8>,
/// A bloom filter for counting from fastq files
kmer_filter: KmerFilter,
}
impl<IntT> SkaDict<IntT>
where
IntT: for<'a> UInt<'a>,
{
/// Adds a split-kmer and middle base to dictionary.
fn add_to_dict(&mut self, kmer: IntT, base: u8) {
self.split_kmers
.entry(kmer)
.and_modify(|b| *b = IUPAC[base as usize * 256 + *b as usize])
.or_insert(decode_base(base));
}
/// Adds a split k-mer which is a self-rc to the dict
/// This requires amibguity of middle_base + rc(middle_base) to be added
fn add_palindrome_to_dict(&mut self, kmer: IntT, base: u8) {
self.split_kmers
.entry(kmer)
.and_modify(|b| {
*b = match b {
b'W' => {
if base == 0 || base == 2 {
b'W'
} else {
b'N'
}
}
b'S' => {
if base == 0 || base == 2 {
b'N'
} else {
b'S'
}
}
b'N' => b'N',
_ => panic!("Palindrome middle base not W/S: {}", *b as char),
}
})
.or_insert(match base {
0 | 2 => b'W', // A or T
1 | 3 => b'S', // C or G
_ => panic!("Base encoding error: {}", base as char),
});
}
/// Iterates through all the k-mers from an input fastx file and adds them
/// to the dictionary
fn add_file_kmers(
&mut self,
filename: &str,
is_reads: bool,
qual: &QualOpts,
proportion_reads: Option<f64>,
) {
let mut step: usize = 1;
if proportion_reads.is_some() {
step = (1.0 / proportion_reads.unwrap()).round() as usize;
}
let mut reader =
parse_fastx_file(filename).unwrap_or_else(|_| panic!("Invalid path/file: {filename}"));
let mut iter_reads = 0;
while let Some(record) = reader.next() {
if iter_reads % step != 0 {
iter_reads += 1;
continue;
} else {
iter_reads += 1;
}
let seqrec = record.expect("Invalid FASTA/Q record");
let kmer_opt = SplitKmer::new(
seqrec.seq(),
seqrec.num_bases(),
seqrec.qual(),
self.k,
self.rc,
qual.min_qual,
qual.qual_filter,
is_reads,
);
if let Some(mut kmer_it) = kmer_opt {
if !is_reads
|| (kmer_it.middle_base_qual()
&& Ordering::is_eq(self.kmer_filter.filter(&kmer_it)))
{
let (kmer, base, _rc) = kmer_it.get_curr_kmer();
if kmer_it.self_palindrome() {
self.add_palindrome_to_dict(kmer, base);
} else {
self.add_to_dict(kmer, base);
}
}
while let Some((kmer, base, _rc)) = kmer_it.get_next_kmer() {
if !is_reads
|| (kmer_it.middle_base_qual()
&& Ordering::is_eq(self.kmer_filter.filter(&kmer_it)))
{
if kmer_it.self_palindrome() {
self.add_palindrome_to_dict(kmer, base);
} else {
self.add_to_dict(kmer, base);
}
}
}
}
}
}
/// Build a split-kmer dictionary from input fastx file(s)
///
/// Prefer to use [`crate::merge_ska_dict::build_and_merge()`] over this
/// function directly.
///
/// # Examples
///
/// To build with a FASTA
/// ```
/// use ska::ska_dict::SkaDict;
/// use ska::{QualOpts, QualFilter};
///
/// let k = 31;
/// let sample_idx = 0;
/// let quality = QualOpts {min_count: 1, min_qual: 0, qual_filter: QualFilter::NoFilter};
/// let proportion_reads = None;
/// let ska_dict = SkaDict::<u64>::new(k, sample_idx, (&"tests/test_files_in/test_1.fa", None), "test_1", true, &quality, proportion_reads);
/// ```
///
/// With FASTQ pair, only allowing k-mers with a count over 2, and where all
/// bases have a PHRED score of 20 or more
/// ```
/// use ska::ska_dict::SkaDict;
/// use ska::{QualOpts, QualFilter};
///
/// let quality = QualOpts {min_count: 2, min_qual: 20, qual_filter: QualFilter::Middle};
/// let k = 9;
/// let sample_idx = 0;
/// let proportion_reads = None;
/// let ska_dict = SkaDict::<u64>::new(k, sample_idx,
/// (&"tests/test_files_in/test_1_fwd.fastq.gz",
/// Some(&"tests/test_files_in/test_2_fwd.fastq.gz".to_string())),
/// "sample1", true, &quality, proportion_reads);
/// ```
///
/// # Panics
///
/// Panics if:
/// - k-mer length is invalid (<5, >63 or even)
/// - Input file cannot be read
/// - Input file contains invalid fastx record
/// - Input file contains no valid sequence to find at least on split k-mer
#[allow(clippy::too_many_arguments)]
pub fn new(
k: usize,
sample_idx: usize,
files: (&str, Option<&String>),
name: &str,
rc: bool,
qual: &QualOpts,
proportion_reads: Option<f64>,
) -> Self {
if !(5..=63).contains(&k) || k % 2 == 0 {
panic!("Invalid k-mer length");
}
let mut sk_dict = Self {
k,
rc,
sample_idx,
name: name.to_string(),
split_kmers: HashMap::default(),
kmer_filter: KmerFilter::new(qual.min_count),
};
// Check if we're working with reads, and initalise the CM filter if so
let mut reader_peek =
parse_fastx_file(files.0).unwrap_or_else(|_| panic!("Invalid path/file: {}", files.0));
let seq_peek = reader_peek
.next()
.expect("Invalid FASTA/Q record")
.expect("Invalid FASTA/Q record");
let mut is_reads = false;
if seq_peek.format() == Format::Fastq {
sk_dict.kmer_filter.init();
is_reads = true;
}
// Build the dict
sk_dict.add_file_kmers(files.0, is_reads, qual, proportion_reads);
if let Some(second_filename) = files.1 {
sk_dict.add_file_kmers(second_filename, is_reads, qual, proportion_reads);
}
if sk_dict.ksize() == 0 {
panic!("{} has no valid sequence", files.0);
}
sk_dict
}
/// K-mer length used for split k-mers
pub fn kmer_len(&self) -> usize {
self.k
}
/// Whether reverse-complement was counted
pub fn rc(&self) -> bool {
self.rc
}
/// Total number of split k-mers in the dictionary
pub fn ksize(&self) -> usize {
self.split_kmers.len()
}
/// Sample index for merging
pub fn idx(&self) -> usize {
self.sample_idx
}
/// Split k-mer dictonary
pub fn kmers(&self) -> &HashMap<IntT, u8> {
&self.split_kmers
}
/// Sample name
pub fn name(&self) -> &String {
&self.name
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_add_palindrome_to_dict() {
// Initialize the test object
let mut test_obj = SkaDict::<u64>::default();
// Test case 1: Updating existing entry
test_obj.split_kmers.insert(123, b'W');
test_obj.add_palindrome_to_dict(123, 1);
assert_eq!(test_obj.split_kmers[&123], b'N');
// Test case 2: Adding new entry with base 0
test_obj.split_kmers.clear();
test_obj.add_palindrome_to_dict(456, 0);
assert_eq!(test_obj.split_kmers[&456], b'W');
// Test case 3: Adding new entry with base 3
test_obj.split_kmers.clear();
test_obj.add_palindrome_to_dict(789, 3);
assert_eq!(test_obj.split_kmers[&789], b'S');
// Test case 4: Updating existing twice
test_obj.split_kmers.insert(123, b'W');
test_obj.add_palindrome_to_dict(123, 1);
test_obj.add_palindrome_to_dict(123, 1);
assert_eq!(test_obj.split_kmers[&123], b'N');
}
#[test]
#[should_panic]
fn test_panic_add_palindrome_to_dict() {
// Test case 4: Panicking with invalid base
let mut test_obj_panic = SkaDict::<u64>::default();
test_obj_panic.add_palindrome_to_dict(987, 5);
}
#[test]
#[should_panic]
fn test_panic2_add_palindrome_to_dict() {
// Test case 5: Panicking with invalid middle base
let mut test_obj_panic = SkaDict::<u64>::default();
test_obj_panic.split_kmers.clear();
test_obj_panic.split_kmers.insert(555, b'A');
test_obj_panic.add_palindrome_to_dict(555, 1);
}
}