Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Memory optimisation: don't store whole build array #19

Merged
merged 5 commits into from
Feb 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "ska"
version = "0.2.1"
version = "0.2.2"
authors = [
"John Lees <jlees@ebi.ac.uk>",
"Simon Harris <simon.harris@gatesfoundation.org>",
Expand Down
36 changes: 11 additions & 25 deletions src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use std::fmt;

use clap::{ArgGroup, Parser, Subcommand, ValueEnum};

extern crate num_cpus;
use super::QualFilter;

/// Default split k-mer size
pub const DEFAULT_KMER: usize = 17;
Expand Down Expand Up @@ -45,14 +45,21 @@ fn valid_cpus(s: &str) -> Result<usize, String> {
let threads: usize = s
.parse()
.map_err(|_| format!("`{s}` isn't a valid number of cores"))?;
let max_threads = num_cpus::get();
if threads < 1 || threads > max_threads {
Err("Threads must be between 1 and {max_threads}".to_string())
if threads < 1 {
Err("Threads must be one or higher".to_string())
} else {
Ok(threads)
}
}

/// Prints a warning if more threads than available have been requested
pub fn check_threads(threads: usize) {
let max_threads = num_cpus::get();
if threads > max_threads {
log::warn!("{threads} threads is greater than available cores {max_threads}");
}
}

/// Possible output file types
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum)]
pub enum FileType {
Expand Down Expand Up @@ -84,27 +91,6 @@ impl fmt::Display for FilterType {
}
}

/// Possible quality score filters when building with reads
#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, ValueEnum)]
pub enum QualFilter {
/// Ignore quality scores in reads
NoFilter,
/// Filter middle bases below quality threshold
Middle,
/// Filter entire k-mer when any base below quality threshold
Strict,
}

impl fmt::Display for QualFilter {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match *self {
Self::NoFilter => write!(f, "No quality filtering"),
Self::Middle => write!(f, "Middle base quality filtering"),
Self::Strict => write!(f, "Whole k-mer quality filtering"),
}
}
}

/// Options that apply to all subcommands
#[derive(Parser)]
#[command(author, version, about, long_about = None)]
Expand Down
10 changes: 7 additions & 3 deletions src/io_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ use std::path::Path;

use regex::Regex;

use super::QualOpts;
use crate::merge_ska_array::MergeSkaArray;
use crate::merge_ska_dict::{build_and_merge, InputFastx};
use crate::ska_dict::bit_encoding::UInt;
Expand Down Expand Up @@ -63,13 +64,16 @@ pub fn load_array<IntT: for<'a> UInt<'a>>(
} else {
log::info!("Multiple files as input, running ska build with default settings");
let input_files = read_input_fastas(input);
let default_qual = QualOpts {
min_count: DEFAULT_MINCOUNT,
min_qual: DEFAULT_MINQUAL,
qual_filter: DEFAULT_QUALFILTER,
};
let merged_dict = build_and_merge(
&input_files,
DEFAULT_KMER,
!DEFAULT_STRAND,
DEFAULT_MINCOUNT,
DEFAULT_MINQUAL,
&DEFAULT_QUALFILTER,
&default_qual,
threads,
);
Ok(MergeSkaArray::new(&merged_dict))
Expand Down
100 changes: 77 additions & 23 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,20 @@
//! FASTQ files must be paired end. If you'd like to request more flexibility in
//! this regard please contact us.
//!
//! ### Threads
//!
//! The maximum threads actually used will be a power of two, so if you provided
//! `--threads 6` only four would be used. Additionally, at least ten samples per
//! thread are required so maximums are:
//!
//! | Samples | Maximum threads |
//! |---------|-----------------|
//! | 1-19 | 1 |
//! | 20-39 | 2 |
//! | 40-79 | 4 |
//!
//! and so on. Use `-v` to see a message with the number being used.
//!
//! ## ska align
//!
//! Creates an alignment from a `.skf` file or sequence files. Sites (columns) are
Expand Down Expand Up @@ -222,7 +236,7 @@
//! ```rust
//! use ska::merge_ska_dict::{InputFastx, build_and_merge};
//! use ska::merge_ska_array::MergeSkaArray;
//! use ska::cli::QualFilter;
//! use ska::{QualOpts, QualFilter};
//!
//! // Build, merge
//! let input_files: [InputFastx; 2] = [("test1".to_string(),
Expand All @@ -233,13 +247,11 @@
//! None)];
//! let rc = true;
//! let k = 17;
//! let min_count = 1;
//! let min_qual = 0;
//! let quality = QualOpts {min_count: 1, min_qual: 0, qual_filter: QualFilter::NoFilter};
//! let threads = 2;
//! let qual_filter = QualFilter::NoFilter;
//! // NB u64 for k<=31, u128 for k<=63
//! let merged_dict =
//! build_and_merge::<u64>(&input_files, k, rc, min_count, min_qual, &qual_filter, threads);
//! build_and_merge::<u64>(&input_files, k, rc, &quality, threads);
//!
//! // Save
//! let ska_array = MergeSkaArray::new(&merged_dict);
Expand Down Expand Up @@ -280,8 +292,12 @@
//!

#![warn(missing_docs)]
use std::fmt;
use std::time::Instant;

use clap::ValueEnum;
extern crate num_cpus;

pub mod merge_ska_dict;
pub mod ska_dict;
use crate::merge_ska_dict::build_and_merge;
Expand All @@ -300,11 +316,56 @@ use crate::cli::*;
pub mod io_utils;
use crate::io_utils::*;

/// Possible quality score filters when building with reads
#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, ValueEnum)]
pub enum QualFilter {
/// Ignore quality scores in reads
NoFilter,
/// Filter middle bases below quality threshold
Middle,
/// Filter entire k-mer when any base below quality threshold
Strict,
}

impl fmt::Display for QualFilter {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match *self {
Self::NoFilter => write!(f, "No quality filtering"),
Self::Middle => write!(f, "Middle base quality filtering"),
Self::Strict => write!(f, "Whole k-mer quality filtering"),
}
}
}

/// Quality filtering options for FASTQ files
pub struct QualOpts {
/// Minimum k-mer count across reads to be added
pub min_count: u16,
/// Minimum base quality to be added
pub min_qual: u8,
/// [`QualFilter`]: apply quality across whole k-mer, just middle base, or not at all
pub qual_filter: QualFilter,
}

impl fmt::Display for QualOpts {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(
f,
"minimum quality {} ({}); filter: {}",
self.min_qual,
(self.min_qual + 33) as char,
self.qual_filter
)
}
}

#[doc(hidden)]
pub fn main() {
let args = cli_args();
if args.verbose {
simple_logger::init_with_level(log::Level::Info).unwrap();
} else {
simple_logger::init_with_level(log::Level::Warn).unwrap();
}

eprintln!("SKA: Split K-mer Analysis (the alignment-free aligner)");
Expand All @@ -321,36 +382,27 @@ pub fn main() {
qual_filter,
threads,
} => {
check_threads(*threads);

// Read input
let input_files = get_input_list(file_list, seq_files);
let quality = QualOpts {
min_count: *min_count,
min_qual: *min_qual,
qual_filter: *qual_filter,
};

// Build, merge
let rc = !*single_strand;
if *k <= 31 {
log::info!("k={}: using 64-bit representation", *k);
let merged_dict = build_and_merge::<u64>(
&input_files,
*k,
rc,
*min_count,
*min_qual,
qual_filter,
*threads,
);
let merged_dict = build_and_merge::<u64>(&input_files, *k, rc, &quality, *threads);

// Save
save_skf(&merged_dict, format!("{output}.skf").as_str());
} else {
log::info!("k={}: using 128-bit representation", *k);
let merged_dict = build_and_merge::<u128>(
&input_files,
*k,
rc,
*min_count,
*min_qual,
qual_filter,
*threads,
);
let merged_dict = build_and_merge::<u128>(&input_files, *k, rc, &quality, *threads);

// Save
save_skf(&merged_dict, format!("{output}.skf").as_str());
Expand All @@ -363,6 +415,7 @@ pub fn main() {
filter,
threads,
} => {
check_threads(*threads);
if let Ok(mut ska_array) = load_array::<u64>(input, *threads) {
// In debug mode (cannot be set from CLI, give details)
log::debug!("{ska_array}");
Expand All @@ -382,6 +435,7 @@ pub fn main() {
format,
threads,
} => {
check_threads(*threads);
log::info!("Loading skf as dictionary");
if let Ok(mut ska_array) = load_array::<u64>(input, *threads) {
log::info!(
Expand Down
Loading