Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
392 changes: 152 additions & 240 deletions Cargo.lock

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,11 @@ package.rust-version = "1.85.1"
package.repository = "https://github.com/sebschmi/template-switch-aligner"

[workspace.dependencies]
clap = { version = "4.5.16", features = ["derive"] }
serde = "1.0.219"
clap = { version = "4.5.54", features = ["derive"] }
serde = "1.0.228"
compact-genome = "12.5.0"
traitsequence = "8.1.2"
log = "0.4.27"
log = "0.4.29"
num-traits = "0.2.19"
rustc-hash = "2.1.1"
binary-heap-plus = "0.5.0"
Expand Down
2 changes: 1 addition & 1 deletion generic_a_star/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[package]
name = "generic_a_star"
description = "A generic implementation of the A* algorithm"
version = "0.19.1"
version = "0.21.0"
edition.workspace = true
rust-version.workspace = true
license.workspace = true
Expand Down
8 changes: 4 additions & 4 deletions lib_ts_chainalign/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,18 @@
name = "lib_ts_chainalign"
description = "A chaining-based sequence-to-sequence aligner that accounts for template switches"
authors = ["Sebastian Schmidt <sebastian.schmidt@helsinki.fi>"]
version = "0.1.0"
version = "0.3.0"
license.workspace = true
edition.workspace = true
rust-version.workspace = true
repository.workspace = true

[dependencies]
generic_a_star = { version = "0.19.1", path = "../generic_a_star" }
lib_tsalign = { version = "0.19.1", path = "../lib_tsalign" }
generic_a_star = { version = "0.21.0", path = "../generic_a_star" }
lib_tsalign = { version = "0.21.0", path = "../lib_tsalign" }
ndarray = { version = "0.17.1", features = ["serde"] }
num-traits.workspace = true
serde.workspace = true
serde = { workspace = true, features = ["derive"] }
compact-genome.workspace = true
itertools = "0.14.0"
log.workspace = true
Expand Down
61 changes: 14 additions & 47 deletions lib_ts_chainalign/src/chain_align.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
use binary_heap_plus::BinaryHeap;
use clap::ValueEnum;
use compact_genome::{
implementation::vec_sequence::VectorGenome,
interface::{
Expand Down Expand Up @@ -34,69 +33,37 @@ use crate::{
chain_align::{
chainer::{Context, Identifier, Node, closed_list::ChainerClosedList},
evaluation::ChainEvaluator,
performance_parameters::{
AlignmentPerformanceParameters, ChainingClosedList, ChainingOpenList,
},
},
chaining_cost_function::ChainingCostFunction,
costs::AlignmentCosts,
};

mod chainer;
mod evaluation;

pub struct AlignmentParameters<Cost> {
/// The step width for generating successors during chaining.
///
/// At most `max_successors` will be generated at a time, but at least all with minimum chaining cost.
pub max_successors: usize,

/// The cost until which the cost function is initialised exactly.
///
/// Anchor chainings with a higher exact cost are initialised based on the lower bound.
pub max_exact_cost_function_cost: Cost,

/// The closed list type to use for chaining.
pub closed_list: ChainingClosedList,

/// The open list type to use for chaining.
pub open_list: ChainingOpenList,
}

/// The closed list type to use for chaining.
#[derive(Debug, Clone, ValueEnum)]
pub enum ChainingClosedList {
/// Use [`HashMap`](std::collections::HashMap) as closed list with [`FxHasher`](rustc_hash::FxHasher) as hasher.
FxHashMap,
/// Use a special-purpose closed list.
Special,
}

/// The open list type to use for chaining.
#[derive(Debug, Clone, ValueEnum)]
pub enum ChainingOpenList {
/// Use [`BinaryHeap`](std::collections::BinaryHeap) as open list.
StdHeap,
/// Use [`LinearHeap`](generic_a_star::open_lists::linear_heap::LinearHeap) as open list.
LinearHeap,
}
pub mod performance_parameters;

#[expect(clippy::too_many_arguments)]
pub fn align<AlphabetType: Alphabet, Cost: AStarCost>(
sequences: &AlignmentSequences,
start: AlignmentCoordinates,
end: AlignmentCoordinates,
parameters: &AlignmentParameters<Cost>,
performance_parameters: &AlignmentPerformanceParameters<Cost>,
alignment_costs: &AlignmentCosts<Cost>,
rc_fn: &dyn Fn(u8) -> u8,
max_match_run: u32,
anchors: &Anchors,
chaining_cost_function: &mut ChainingCostFunction<Cost>,
) -> AlignmentResult<lib_tsalign::a_star_aligner::template_switch_distance::AlignmentType, Cost> {
match parameters.open_list {
match performance_parameters.open_list {
ChainingOpenList::StdHeap => {
choose_closed_list::<AlphabetType, _, BinaryHeap<Node<Cost>, AStarNodeComparator>>(
sequences,
start,
end,
parameters,
performance_parameters,
alignment_costs,
rc_fn,
max_match_run,
Expand All @@ -108,7 +75,7 @@ pub fn align<AlphabetType: Alphabet, Cost: AStarCost>(
sequences,
start,
end,
parameters,
performance_parameters,
alignment_costs,
rc_fn,
max_match_run,
Expand All @@ -127,20 +94,20 @@ pub fn choose_closed_list<
sequences: &AlignmentSequences,
start: AlignmentCoordinates,
end: AlignmentCoordinates,
parameters: &AlignmentParameters<Cost>,
performance_parameters: &AlignmentPerformanceParameters<Cost>,
alignment_costs: &AlignmentCosts<Cost>,
rc_fn: &dyn Fn(u8) -> u8,
max_match_run: u32,
anchors: &Anchors,
chaining_cost_function: &mut ChainingCostFunction<Cost>,
) -> AlignmentResult<lib_tsalign::a_star_aligner::template_switch_distance::AlignmentType, Cost> {
match parameters.closed_list {
match performance_parameters.closed_list {
ChainingClosedList::FxHashMap => {
actually_align::<AlphabetType, _, FxHashMapSeed<_, _>, OpenList>(
sequences,
start,
end,
parameters,
performance_parameters,
alignment_costs,
rc_fn,
max_match_run,
Expand All @@ -153,7 +120,7 @@ pub fn choose_closed_list<
sequences,
start,
end,
parameters,
performance_parameters,
alignment_costs,
rc_fn,
max_match_run,
Expand All @@ -174,7 +141,7 @@ fn actually_align<
sequences: &AlignmentSequences,
start: AlignmentCoordinates,
end: AlignmentCoordinates,
parameters: &AlignmentParameters<Cost>,
performance_parameters: &AlignmentPerformanceParameters<Cost>,
alignment_costs: &AlignmentCosts<Cost>,
rc_fn: &dyn Fn(u8) -> u8,
max_match_run: u32,
Expand All @@ -194,7 +161,7 @@ fn actually_align<
chaining_cost_function,
&alignment_costs.ts_limits,
k,
parameters.max_successors,
performance_parameters.max_successors,
);
let mut astar = AStar::<_, ClosedList, OpenList>::new(context);

Expand Down
51 changes: 51 additions & 0 deletions lib_ts_chainalign/src/chain_align/performance_parameters.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
use clap::ValueEnum;

/// Performance parameters for chainalign.
///
/// Use the `default()` method for a reasonable choice of parameters that works well in many cases.
pub struct AlignmentPerformanceParameters<Cost> {
/// The step width for generating successors during chaining.
///
/// At most `max_successors` will be generated at a time, but at least all with minimum chaining cost.
pub max_successors: usize,

/// The cost until which the cost function is initialised exactly.
///
/// Anchor chainings with a higher exact cost are initialised based on the lower bound.
pub max_exact_cost_function_cost: Cost,

/// The closed list type to use for chaining.
pub closed_list: ChainingClosedList,

/// The open list type to use for chaining.
pub open_list: ChainingOpenList,
}

/// The closed list type to use for chaining.
#[derive(Debug, Clone, ValueEnum)]
pub enum ChainingClosedList {
/// Use [`HashMap`](std::collections::HashMap) as closed list with [`FxHasher`](rustc_hash::FxHasher) as hasher.
FxHashMap,
/// Use a special-purpose closed list.
Special,
}

/// The open list type to use for chaining.
#[derive(Debug, Clone, ValueEnum)]
pub enum ChainingOpenList {
/// Use [`BinaryHeap`](std::collections::BinaryHeap) as open list.
StdHeap,
/// Use [`LinearHeap`](generic_a_star::open_lists::linear_heap::LinearHeap) as open list.
LinearHeap,
}

impl<Cost: From<u8>> Default for AlignmentPerformanceParameters<Cost> {
fn default() -> Self {
Self {
max_successors: 1,
max_exact_cost_function_cost: 1u8.into(),
closed_list: ChainingClosedList::Special,
open_list: ChainingOpenList::LinearHeap,
}
}
}
16 changes: 16 additions & 0 deletions lib_ts_chainalign/src/costs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,33 @@ pub struct GapAffineCosts<Cost> {

#[derive(Debug, Serialize, Deserialize, Eq, PartialEq)]
pub struct TsLimits {
/// The maximum range of the 12-jump of a template switch.
/// This parameter is ignored for now.
pub jump_12: Range<isize>,
/// The maximum range of the 34-jump of a template switch.
/// This parameter is ignored for now.
pub jump_34: Range<isize>,
/// The range for the length of the 23-alignment of a template switch.
pub length_23: Range<usize>,
/// The range for the ancestor gap of a template switch.
/// This parameter is ignored for now.
pub ancestor_gap: Range<isize>,
}

/// The cost function for alignments.
///
/// For convenience, it implements [`TryFrom<TemplateSwitchConfig<Alphabet, Cost>>`](std::convert::TryFrom).
/// Note that the conversion is very strict and only allows to convert from a [`TemplateSwitchConfig`](lib_tsalign::config::TemplateSwitchConfig) if the conversion loses no information.
#[derive(Debug, Serialize, Deserialize, Eq, PartialEq)]
pub struct AlignmentCosts<Cost> {
/// Costs for primary alignment outside of template switches.
pub primary_costs: GapAffineCosts<Cost>,
/// Costs for secondary alignment, i.e. for the 23-alignment of a template switch.
pub secondary_costs: GapAffineCosts<Cost>,
/// The base cost of a template switch.
/// This is applied whenevera template switch is started.
pub ts_base_cost: Cost,
/// Limits on the geometry of a template switch.
pub ts_limits: TsLimits,
}

Expand Down
35 changes: 30 additions & 5 deletions lib_ts_chainalign/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ use log::{debug, info, trace};
use crate::{
alignment::{coordinates::AlignmentCoordinates, sequences::AlignmentSequences},
anchors::Anchors,
chain_align::AlignmentParameters,
chain_align::performance_parameters::AlignmentPerformanceParameters,
chaining_cost_function::ChainingCostFunction,
chaining_lower_bounds::ChainingLowerBounds,
costs::AlignmentCosts,
Expand All @@ -26,11 +26,22 @@ pub mod costs;
pub mod exact_chaining;
pub mod panic_on_extend;

/// A reverse complement function for DNA alphabets.
pub fn dna_rc_fn(c: u8) -> u8 {
match c {
b'A' => b'T',
b'C' => b'G',
b'G' => b'C',
b'T' => b'A',
c => panic!("Unsupported character: {c}"),
}
}

/// Perform preprocessing for tschainalign.
///
/// * `max_n` is the maximum sequence length that the lower bounds should support.
/// * `max_match_run` is the maximum consecutive sequence of matches that is allowed.
/// Set this to `k-1`, if the anchors are `k`-mers.
/// Set this to `k-1`, if the anchors are supposed to be `k`-mers.
/// * `alignment_costs` is the cost function for the alignment.
pub fn preprocess(
max_n: usize,
Expand All @@ -40,12 +51,26 @@ pub fn preprocess(
ChainingLowerBounds::new(max_n, max_match_run, alignment_costs)
}

/// Align two sequences.
///
/// Note that `reference` and `query` are interchangeable, and the order has no meaning.
///
/// * `AlphabetType` must be a DNA alphabet.
///
/// * `reference` is the reference string in ASCII format. Only characters `A`, `C`, `G` and `T` are allowed.
/// * `query` is the query string in ASCII format. Only characters `A`, `C`, `G` and `T` are allowed.
/// * `range` is the range on which the alignment happens. Note that points 2 and 3 of a template switch may fall outside of this range.
/// * `performance_parameters` is a set of parameters for the aligner that only affect performance.
/// * `rc_fn` is a function that maps a character to its reverse complement.
/// * `reference_name` is the name of the reference string. It is irrelevant for the alignment, but will appear in e.g. the output of `tsalign show`.
/// * `query_name` is the name of the query string. It is irrelevant for the alignment, but will appear in e.g. the output of `tsalign show`.
/// * `chaining_lower_bounds` are the lower bounds computed with the function [`preprocess`].
#[expect(clippy::too_many_arguments)]
pub fn align<AlphabetType: Alphabet>(
reference: Vec<u8>,
query: Vec<u8>,
range: AlignmentRange,
parameters: &AlignmentParameters<U32Cost>,
performance_parameters: &AlignmentPerformanceParameters<U32Cost>,
rc_fn: &dyn Fn(u8) -> u8,
reference_name: &str,
query_name: &str,
Expand Down Expand Up @@ -76,15 +101,15 @@ pub fn align<AlphabetType: Alphabet>(
&sequences,
start,
end,
parameters.max_exact_cost_function_cost,
performance_parameters.max_exact_cost_function_cost,
rc_fn,
);

chain_align::align::<AlphabetType, _>(
&sequences,
start,
end,
parameters,
performance_parameters,
chaining_lower_bounds.alignment_costs(),
rc_fn,
chaining_lower_bounds.max_match_run(),
Expand Down
8 changes: 4 additions & 4 deletions lib_tsalign/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
name = "lib_tsalign"
description = "A sequence-to-sequence aligner that accounts for template switches"
authors = ["Sebastian Schmidt <sebastian.schmidt@helsinki.fi>"]
version = "0.19.1"
version = "0.21.0"
license.workspace = true
edition.workspace = true
rust-version.workspace = true
Expand All @@ -19,16 +19,16 @@ serde = [
[dependencies]
compact-genome.workspace = true
traitsequence.workspace = true
ndarray = "0.16.1"
ndarray = "0.17.1"
binary-heap-plus = "0.5.0"
nom = "8.0.0"
thiserror = "2.0.3"
num-traits.workspace = true
serde = { workspace = true, features = ["derive"], optional = true }
noisy_float = { version = "0.2.0" }
generic_a_star = { version = "0.19.1", path = "../generic_a_star" }
generic_a_star = { version = "0.21.0", path = "../generic_a_star" }
log.workspace = true
rustc-hash = "2.1.1"
seed_chain = { version = "0.19.1", path = "../seed_chain" }
seed_chain = { version = "0.21.0", path = "../seed_chain" }
extend_map = "0.14.3"
template-switch-error-free-inners = "0.1.1"
Loading