Skip to content

Commit

Permalink
Merge pull request #19 from bacpop/unbuffered-merge
Browse files Browse the repository at this point in the history
Memory optimisation: don't store whole build array
  • Loading branch information
johnlees authored Feb 18, 2023
2 parents b3896a9 + 203d780 commit ba5b0d2
Show file tree
Hide file tree
Showing 55 changed files with 194 additions and 144 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "ska"
version = "0.2.1"
version = "0.2.2"
authors = [
"John Lees <jlees@ebi.ac.uk>",
"Simon Harris <simon.harris@gatesfoundation.org>",
Expand Down
36 changes: 11 additions & 25 deletions src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use std::fmt;

use clap::{ArgGroup, Parser, Subcommand, ValueEnum};

extern crate num_cpus;
use super::QualFilter;

/// Default split k-mer size
pub const DEFAULT_KMER: usize = 17;
Expand Down Expand Up @@ -45,14 +45,21 @@ fn valid_cpus(s: &str) -> Result<usize, String> {
let threads: usize = s
.parse()
.map_err(|_| format!("`{s}` isn't a valid number of cores"))?;
let max_threads = num_cpus::get();
if threads < 1 || threads > max_threads {
Err("Threads must be between 1 and {max_threads}".to_string())
if threads < 1 {
Err("Threads must be one or higher".to_string())
} else {
Ok(threads)
}
}

/// Prints a warning if more threads than available have been requested
pub fn check_threads(threads: usize) {
let max_threads = num_cpus::get();
if threads > max_threads {
log::warn!("{threads} threads is greater than available cores {max_threads}");
}
}

/// Possible output file types
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum)]
pub enum FileType {
Expand Down Expand Up @@ -84,27 +91,6 @@ impl fmt::Display for FilterType {
}
}

/// Possible quality score filters when building with reads
#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, ValueEnum)]
pub enum QualFilter {
/// Ignore quality scores in reads
NoFilter,
/// Filter middle bases below quality threshold
Middle,
/// Filter entire k-mer when any base below quality threshold
Strict,
}

impl fmt::Display for QualFilter {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match *self {
Self::NoFilter => write!(f, "No quality filtering"),
Self::Middle => write!(f, "Middle base quality filtering"),
Self::Strict => write!(f, "Whole k-mer quality filtering"),
}
}
}

/// Options that apply to all subcommands
#[derive(Parser)]
#[command(author, version, about, long_about = None)]
Expand Down
10 changes: 7 additions & 3 deletions src/io_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ use std::path::Path;

use regex::Regex;

use super::QualOpts;
use crate::merge_ska_array::MergeSkaArray;
use crate::merge_ska_dict::{build_and_merge, InputFastx};
use crate::ska_dict::bit_encoding::UInt;
Expand Down Expand Up @@ -63,13 +64,16 @@ pub fn load_array<IntT: for<'a> UInt<'a>>(
} else {
log::info!("Multiple files as input, running ska build with default settings");
let input_files = read_input_fastas(input);
let default_qual = QualOpts {
min_count: DEFAULT_MINCOUNT,
min_qual: DEFAULT_MINQUAL,
qual_filter: DEFAULT_QUALFILTER,
};
let merged_dict = build_and_merge(
&input_files,
DEFAULT_KMER,
!DEFAULT_STRAND,
DEFAULT_MINCOUNT,
DEFAULT_MINQUAL,
&DEFAULT_QUALFILTER,
&default_qual,
threads,
);
Ok(MergeSkaArray::new(&merged_dict))
Expand Down
100 changes: 77 additions & 23 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,20 @@
//! FASTQ files must be paired end. If you'd like to request more flexibility in
//! this regard please contact us.
//!
//! ### Threads
//!
//! The maximum threads actually used will be a power of two, so if you provided
//! `--threads 6` only four would be used. Additionally, at least ten samples per
//! thread are required so maximums are:
//!
//! | Samples | Maximum threads |
//! |---------|-----------------|
//! | 1-19 | 1 |
//! | 20-39 | 2 |
//! | 40-79 | 4 |
//!
//! and so on. Use `-v` to see a message with the number being used.
//!
//! ## ska align
//!
//! Creates an alignment from a `.skf` file or sequence files. Sites (columns) are
Expand Down Expand Up @@ -222,7 +236,7 @@
//! ```rust
//! use ska::merge_ska_dict::{InputFastx, build_and_merge};
//! use ska::merge_ska_array::MergeSkaArray;
//! use ska::cli::QualFilter;
//! use ska::{QualOpts, QualFilter};
//!
//! // Build, merge
//! let input_files: [InputFastx; 2] = [("test1".to_string(),
Expand All @@ -233,13 +247,11 @@
//! None)];
//! let rc = true;
//! let k = 17;
//! let min_count = 1;
//! let min_qual = 0;
//! let quality = QualOpts {min_count: 1, min_qual: 0, qual_filter: QualFilter::NoFilter};
//! let threads = 2;
//! let qual_filter = QualFilter::NoFilter;
//! // NB u64 for k<=31, u128 for k<=63
//! let merged_dict =
//! build_and_merge::<u64>(&input_files, k, rc, min_count, min_qual, &qual_filter, threads);
//! build_and_merge::<u64>(&input_files, k, rc, &quality, threads);
//!
//! // Save
//! let ska_array = MergeSkaArray::new(&merged_dict);
Expand Down Expand Up @@ -280,8 +292,12 @@
//!

#![warn(missing_docs)]
use std::fmt;
use std::time::Instant;

use clap::ValueEnum;
extern crate num_cpus;

pub mod merge_ska_dict;
pub mod ska_dict;
use crate::merge_ska_dict::build_and_merge;
Expand All @@ -300,11 +316,56 @@ use crate::cli::*;
pub mod io_utils;
use crate::io_utils::*;

/// Possible quality score filters when building with reads
#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, ValueEnum)]
pub enum QualFilter {
/// Ignore quality scores in reads
NoFilter,
/// Filter middle bases below quality threshold
Middle,
/// Filter entire k-mer when any base below quality threshold
Strict,
}

impl fmt::Display for QualFilter {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match *self {
Self::NoFilter => write!(f, "No quality filtering"),
Self::Middle => write!(f, "Middle base quality filtering"),
Self::Strict => write!(f, "Whole k-mer quality filtering"),
}
}
}

/// Quality filtering options for FASTQ files
pub struct QualOpts {
/// Minimum k-mer count across reads to be added
pub min_count: u16,
/// Minimum base quality to be added
pub min_qual: u8,
/// [`QualFilter`]: apply quality across whole k-mer, just middle base, or not at all
pub qual_filter: QualFilter,
}

impl fmt::Display for QualOpts {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(
f,
"minimum quality {} ({}); filter: {}",
self.min_qual,
(self.min_qual + 33) as char,
self.qual_filter
)
}
}

#[doc(hidden)]
pub fn main() {
let args = cli_args();
if args.verbose {
simple_logger::init_with_level(log::Level::Info).unwrap();
} else {
simple_logger::init_with_level(log::Level::Warn).unwrap();
}

eprintln!("SKA: Split K-mer Analysis (the alignment-free aligner)");
Expand All @@ -321,36 +382,27 @@ pub fn main() {
qual_filter,
threads,
} => {
check_threads(*threads);

// Read input
let input_files = get_input_list(file_list, seq_files);
let quality = QualOpts {
min_count: *min_count,
min_qual: *min_qual,
qual_filter: *qual_filter,
};

// Build, merge
let rc = !*single_strand;
if *k <= 31 {
log::info!("k={}: using 64-bit representation", *k);
let merged_dict = build_and_merge::<u64>(
&input_files,
*k,
rc,
*min_count,
*min_qual,
qual_filter,
*threads,
);
let merged_dict = build_and_merge::<u64>(&input_files, *k, rc, &quality, *threads);

// Save
save_skf(&merged_dict, format!("{output}.skf").as_str());
} else {
log::info!("k={}: using 128-bit representation", *k);
let merged_dict = build_and_merge::<u128>(
&input_files,
*k,
rc,
*min_count,
*min_qual,
qual_filter,
*threads,
);
let merged_dict = build_and_merge::<u128>(&input_files, *k, rc, &quality, *threads);

// Save
save_skf(&merged_dict, format!("{output}.skf").as_str());
Expand All @@ -363,6 +415,7 @@ pub fn main() {
filter,
threads,
} => {
check_threads(*threads);
if let Ok(mut ska_array) = load_array::<u64>(input, *threads) {
// In debug mode (cannot be set from CLI, give details)
log::debug!("{ska_array}");
Expand All @@ -382,6 +435,7 @@ pub fn main() {
format,
threads,
} => {
check_threads(*threads);
log::info!("Loading skf as dictionary");
if let Ok(mut ska_array) = load_array::<u64>(input, *threads) {
log::info!(
Expand Down
Loading

0 comments on commit ba5b0d2

Please sign in to comment.