From 65289689540ba526647ee588e31a24193d62572e Mon Sep 17 00:00:00 2001 From: Sebastian Schmidt Date: Thu, 18 Dec 2025 12:45:30 +0200 Subject: [PATCH 1/4] Better statistics about amount of gap fillings per chain. --- lib_ts_chainalign/src/chain_align.rs | 45 +++++++++++++++++++ .../src/chain_align/evaluation.rs | 44 ++++++++++++++---- 2 files changed, 80 insertions(+), 9 deletions(-) diff --git a/lib_ts_chainalign/src/chain_align.rs b/lib_ts_chainalign/src/chain_align.rs index eff27dd..b053493 100644 --- a/lib_ts_chainalign/src/chain_align.rs +++ b/lib_ts_chainalign/src/chain_align.rs @@ -15,6 +15,7 @@ use generic_a_star::{ open_lists::{AStarOpenList, linear_heap::LinearHeap}, }; use indicatif::ProgressBar; +use itertools::Itertools; use lib_tsalign::a_star_aligner::{ alignment_result::AlignmentResult, template_switch_distance::{EqualCostRange, TemplateSwitchDirection}, @@ -23,6 +24,7 @@ use log::{debug, trace}; use rustc_hash::FxHashMapSeed; use std::{ fmt::Write, + iter, time::{Duration, Instant}, }; @@ -324,6 +326,25 @@ fn actually_align< }; progress_bar.finish_and_clear(); + let total_gap_fill_alignments: u32 = + chain_evaluator.gap_fill_alignments_per_chain().iter().sum(); + let chains_with_at_most_exp2x_gap_alignments_amount: Vec<_> = iter::once( + chain_evaluator + .gap_fill_alignments_per_chain() + .iter() + .filter(|a| **a == 0) + .count(), + ) + .chain((0..u32::BITS).map(|e| { + chain_evaluator + .gap_fill_alignments_per_chain() + .iter() + .filter(|a| **a <= 1 << e) + .count() + })) + .take_while_inclusive(|amount| *amount < chaining_execution_count) + .collect(); + debug!("Computed {chaining_execution_count} chains"); debug!("Chaining took {:.1}s", chaining_duration.as_secs_f64()); debug!("Evaluation took {:.1}s", evaluation_duration.as_secs_f64()); @@ -335,6 +356,30 @@ fn actually_align< ); debug!("Chaining closed nodes: {total_chaining_closed_nodes}"); debug!("Total chain gaps: {}", chain_evaluator.total_gaps()); + debug!( + "Total chain gap alignments: {} ({:.0}% of total gaps)", + total_gap_fill_alignments, + total_gap_fill_alignments as f64 / chain_evaluator.total_gaps() as f64 * 1e2 + ); + debug!( + "Fraction of chains with [0, 0] gap alignments: {:.0}%", + *chains_with_at_most_exp2x_gap_alignments_amount + .first() + .unwrap() as f64 + / chaining_execution_count as f64 + * 1e2, + ); + for (e, amount) in chains_with_at_most_exp2x_gap_alignments_amount + .windows(2) + .enumerate() + { + debug!( + "Fraction of chains with [{}, {}] gap alignments: {:.0}%", + if e > 0 { (1u32 << (e - 1)) + 1 } else { 1 }, + 1 << e, + (amount[1] - amount[0]) as f64 / chaining_execution_count as f64 * 1e2, + ); + } debug!( "Total chain gap fillings: {} ({:.2}x total gaps, {:.0}% redundant)", chain_evaluator.total_gap_fillings(), diff --git a/lib_ts_chainalign/src/chain_align/evaluation.rs b/lib_ts_chainalign/src/chain_align/evaluation.rs index d7d62d0..14fbaa5 100644 --- a/lib_ts_chainalign/src/chain_align/evaluation.rs +++ b/lib_ts_chainalign/src/chain_align/evaluation.rs @@ -29,6 +29,7 @@ pub struct ChainEvaluator<'sequences, 'alignment_costs, 'rc_fn, Cost: AStarCost> total_gaps: u64, total_gap_fillings: u64, total_redundant_gap_fillings: u64, + gap_fill_alignments_per_chain: Vec, } struct PanicOnExtend; @@ -74,6 +75,7 @@ impl<'sequences, 'alignment_costs, 'rc_fn, Cost: AStarCost> total_gaps: 0, total_gap_fillings: 0, total_redundant_gap_fillings: 0, + gap_fill_alignments_per_chain: Default::default(), } } @@ -93,6 +95,7 @@ impl<'sequences, 'alignment_costs, 'rc_fn, Cost: AStarCost> let mut current_upper_bound = Cost::zero(); let mut alignments = Vec::new(); let mut current_from_index = 0; + let mut gap_fill_alignment_count = 0; loop { let from_anchor = chain[current_from_index]; @@ -129,6 +132,8 @@ impl<'sequences, 'alignment_costs, 'rc_fn, Cost: AStarCost> ); self.total_gap_fillings += u64::try_from(self.additional_primary_targets_buffer.len()).unwrap(); + gap_fill_alignment_count += 1; + trace!("Aligning from start to end costs {}", cost); if !final_evaluation { chaining_cost_function.update_start_to_end(cost, true); @@ -161,6 +166,8 @@ impl<'sequences, 'alignment_costs, 'rc_fn, Cost: AStarCost> ); self.total_gap_fillings += u64::try_from(self.additional_primary_targets_buffer.len()).unwrap(); + gap_fill_alignment_count += 1; + trace!( "Aligning from start to P{index}{} costs {}", anchors.primary(index), @@ -196,6 +203,8 @@ impl<'sequences, 'alignment_costs, 'rc_fn, Cost: AStarCost> ); self.total_gap_fillings += u64::try_from(self.additional_secondary_targets_buffer.len()).unwrap(); + gap_fill_alignment_count += 1; + trace!( "Aligning from start to S{}[{index}]{} costs {}", ts_kind.digits(), @@ -229,6 +238,8 @@ impl<'sequences, 'alignment_costs, 'rc_fn, Cost: AStarCost> ); self.total_gap_fillings += u64::try_from(self.additional_primary_targets_buffer.len()).unwrap(); + gap_fill_alignment_count += 1; + trace!( "Aligning from P{index}{} to end costs {}", anchors.primary(index), @@ -262,6 +273,8 @@ impl<'sequences, 'alignment_costs, 'rc_fn, Cost: AStarCost> ); self.total_gap_fillings += u64::try_from(self.additional_primary_targets_buffer.len()).unwrap(); + gap_fill_alignment_count += 1; + trace!( "Aligning from S{}[{index}]{} to end costs {}", ts_kind.digits(), @@ -318,6 +331,8 @@ impl<'sequences, 'alignment_costs, 'rc_fn, Cost: AStarCost> ); self.total_gap_fillings += u64::try_from(self.additional_primary_targets_buffer.len()).unwrap(); + gap_fill_alignment_count += 1; + trace!( "Aligning from P{from_index}{} to P{to_index}{} (from {start} to {end}) costs {}", anchors.primary(from_index), @@ -378,7 +393,15 @@ impl<'sequences, 'alignment_costs, 'rc_fn, Cost: AStarCost> ); self.total_gap_fillings += u64::try_from(self.additional_secondary_targets_buffer.len()).unwrap(); - self.total_gap_fillings += 1; + gap_fill_alignment_count += 1; + + trace!( + "Aligning from P{from_index}{} to S{}[{to_index}]{} costs {}", + anchors.primary(from_index), + ts_kind.digits(), + anchors.secondary(to_index, ts_kind), + cost + ); if !final_evaluation { chaining_cost_function .update_jump_12(from_index, to_index, ts_kind, cost, true); @@ -390,13 +413,6 @@ impl<'sequences, 'alignment_costs, 'rc_fn, Cost: AStarCost> &mut self.total_redundant_gap_fillings, ); } - trace!( - "Aligning from P{from_index}{} to S{}[{to_index}]{} costs {}", - anchors.primary(from_index), - ts_kind.digits(), - anchors.secondary(to_index, ts_kind), - cost - ); alignments.push(iter::repeat_n(AlignmentType::Match, k).collect()); alignments.push(alignment); } @@ -443,6 +459,8 @@ impl<'sequences, 'alignment_costs, 'rc_fn, Cost: AStarCost> ); self.total_gap_fillings += u64::try_from(self.additional_secondary_targets_buffer.len()).unwrap(); + gap_fill_alignment_count += 1; + trace!( "Aligning from S{}[{from_index}]{} to S{}[{to_index}]{} costs {}", ts_kind.digits(), @@ -495,7 +513,8 @@ impl<'sequences, 'alignment_costs, 'rc_fn, Cost: AStarCost> ); self.total_gap_fillings += u64::try_from(self.additional_primary_targets_buffer.len()).unwrap(); - self.total_gap_fillings += 1; + gap_fill_alignment_count += 1; + trace!( "Aligning from S{}[{from_index}]{} to P{to_index}{} (S{} to P{}) costs {}", ts_kind.digits(), @@ -542,6 +561,9 @@ impl<'sequences, 'alignment_costs, 'rc_fn, Cost: AStarCost> } } + self.gap_fill_alignments_per_chain + .push(gap_fill_alignment_count); + (current_upper_bound, alignments) } @@ -556,6 +578,10 @@ impl<'sequences, 'alignment_costs, 'rc_fn, Cost: AStarCost> pub fn total_redundant_gap_fillings(&self) -> u64 { self.total_redundant_gap_fillings } + + pub fn gap_fill_alignments_per_chain(&self) -> &[u32] { + &self.gap_fill_alignments_per_chain + } } impl Extend for PanicOnExtend { From 719fb2adc8cbe699a3c677c140f8e35b0d059124 Mon Sep 17 00:00:00 2001 From: Sebastian Schmidt Date: Thu, 18 Dec 2025 14:59:58 +0200 Subject: [PATCH 2/4] Precompute exact alignments in cost function up to given cost. --- .../src/alignment/coordinates.rs | 8 +- lib_ts_chainalign/src/alignment/sequences.rs | 24 +- .../src/anchors/reverse_lookup.rs | 126 +++++-- lib_ts_chainalign/src/chain_align.rs | 13 +- .../src/chain_align/evaluation.rs | 9 +- .../src/chaining_cost_function.rs | 330 ++++++++++++++---- .../src/exact_chaining/gap_affine.rs | 59 +++- .../src/exact_chaining/ts_12_jump.rs | 63 +++- .../src/exact_chaining/ts_12_jump/algo.rs | 2 +- .../src/exact_chaining/ts_34_jump.rs | 87 ++++- lib_ts_chainalign/src/lib.rs | 14 +- lib_ts_chainalign/src/panic_on_extend.rs | 7 + tsalign/src/align.rs | 10 +- tsalign/src/align/a_star_chain_ts.rs | 1 + 14 files changed, 613 insertions(+), 140 deletions(-) create mode 100644 lib_ts_chainalign/src/panic_on_extend.rs diff --git a/lib_ts_chainalign/src/alignment/coordinates.rs b/lib_ts_chainalign/src/alignment/coordinates.rs index 4b02100..51e5101 100644 --- a/lib_ts_chainalign/src/alignment/coordinates.rs +++ b/lib_ts_chainalign/src/alignment/coordinates.rs @@ -103,7 +103,9 @@ impl AlignmentCoordinates { // Descendant is a, so it is limited by the descendant ordinate. TsDescendant::Seq1 => *a < end.secondary_ordinate_descendant().unwrap(), // Descendant is b, so a can go until the end of the sequence. - TsDescendant::Seq2 => *a < sequences.end().primary_ordinate_a().unwrap(), + TsDescendant::Seq2 => { + *a < sequences.primary_end().primary_ordinate_a().unwrap() + } } } AlignmentCoordinates::Secondary { ancestor, .. } => 0 < *ancestor, @@ -138,7 +140,9 @@ impl AlignmentCoordinates { AlignmentCoordinates::Primary { b, .. } => { match end.ts_kind().unwrap().descendant { // Descendant is a, so b can go until the end of the sequence. - TsDescendant::Seq1 => *b < sequences.end().primary_ordinate_b().unwrap(), + TsDescendant::Seq1 => { + *b < sequences.primary_end().primary_ordinate_b().unwrap() + } // Descendant is b, so it is limited by the descendant ordinate. TsDescendant::Seq2 => *b < end.secondary_ordinate_descendant().unwrap(), } diff --git a/lib_ts_chainalign/src/alignment/sequences.rs b/lib_ts_chainalign/src/alignment/sequences.rs index 76e4481..8d687c5 100644 --- a/lib_ts_chainalign/src/alignment/sequences.rs +++ b/lib_ts_chainalign/src/alignment/sequences.rs @@ -1,6 +1,6 @@ use crate::alignment::{ coordinates::AlignmentCoordinates, - ts_kind::{TsAncestor, TsDescendant}, + ts_kind::{TsAncestor, TsDescendant, TsKind}, }; pub struct AlignmentSequences { @@ -53,12 +53,28 @@ impl AlignmentSequences { } } - pub fn start(&self) -> AlignmentCoordinates { + pub fn primary_start(&self) -> AlignmentCoordinates { AlignmentCoordinates::new_primary(0, 0) } - pub fn end(&self) -> AlignmentCoordinates { - AlignmentCoordinates::new_primary(self.seq1.len(), self.seq2.len()) + pub fn primary_end(&self) -> AlignmentCoordinates { + self.end(None) + } + + pub fn secondary_end(&self, ts_kind: TsKind) -> AlignmentCoordinates { + self.end(Some(ts_kind)) + } + + pub fn end(&self, ts_kind: Option) -> AlignmentCoordinates { + match ts_kind { + None => AlignmentCoordinates::new_primary(self.seq1.len(), self.seq2.len()), + Some(ts_kind @ (TsKind::TS11 | TsKind::TS21)) => { + AlignmentCoordinates::new_secondary(0, self.seq1.len(), ts_kind) + } + Some(ts_kind @ (TsKind::TS12 | TsKind::TS22)) => { + AlignmentCoordinates::new_secondary(0, self.seq2.len(), ts_kind) + } + } } pub fn seq1(&self) -> &[u8] { diff --git a/lib_ts_chainalign/src/anchors/reverse_lookup.rs b/lib_ts_chainalign/src/anchors/reverse_lookup.rs index 725ad44..1a53a2b 100644 --- a/lib_ts_chainalign/src/anchors/reverse_lookup.rs +++ b/lib_ts_chainalign/src/anchors/reverse_lookup.rs @@ -5,7 +5,11 @@ use crate::{ anchors::{Anchors, index::AnchorIndex, primary::PrimaryAnchor, secondary::SecondaryAnchor}, }; -struct PrimaryAnchorToIndexIter { +struct PrimaryAnchorToIndexIter< + Anchor, + CoordinateIter: Iterator, + AnchorIter: Iterator, +> { coordinate_iter: Peekable, anchor_iter: Peekable, } @@ -15,25 +19,38 @@ struct SecondaryAnchorToIndexIter, } +pub trait PartialIntoAnchorIndex { + type IntoPartSource; + type IntoTarget; + + fn source_part(&self) -> &Self::IntoPartSource; + + fn partial_into(self, target: AnchorIndex) -> Self::IntoTarget; +} + impl< - CoordinateIter: Iterator, + Anchor: PartialIntoAnchorIndex, + CoordinateIter: Iterator, AnchorIter: Iterator, -> Iterator for PrimaryAnchorToIndexIter +> Iterator for PrimaryAnchorToIndexIter { - type Item = Option; + type Item = Anchor::IntoTarget; fn next(&mut self) -> Option { while let (Some(coordinate_anchor), Some((anchor_index, anchor))) = (self.coordinate_iter.peek(), self.anchor_iter.peek()) { - match coordinate_anchor.cmp(anchor) { + match coordinate_anchor.source_part().cmp(anchor) { Ordering::Less => { self.coordinate_iter.next().unwrap(); - return Some(None); } Ordering::Equal => { - let result = Some(Some(*anchor_index)); - self.coordinate_iter.next().unwrap(); + let result = Some( + self.coordinate_iter + .next() + .unwrap() + .partial_into(*anchor_index), + ); self.anchor_iter.next().unwrap(); return result; } @@ -43,29 +60,33 @@ impl< } } - self.coordinate_iter.next().map(|_| None) + None } } impl< - CoordinateIter: Iterator, + Anchor: PartialIntoAnchorIndex, + CoordinateIter: Iterator, AnchorIter: Iterator, > Iterator for SecondaryAnchorToIndexIter { - type Item = Option; + type Item = Anchor::IntoTarget; fn next(&mut self) -> Option { while let (Some(coordinate_anchor), Some((anchor_index, anchor))) = (self.coordinate_iter.peek(), self.anchor_iter.peek()) { - match coordinate_anchor.cmp(anchor) { + match coordinate_anchor.source_part().cmp(anchor) { Ordering::Less => { self.coordinate_iter.next().unwrap(); - return Some(None); } Ordering::Equal => { - let result = Some(Some(*anchor_index)); - self.coordinate_iter.next().unwrap(); + let result = Some( + self.coordinate_iter + .next() + .unwrap() + .partial_into(*anchor_index), + ); self.anchor_iter.next().unwrap(); return result; } @@ -75,7 +96,7 @@ impl< } } - self.coordinate_iter.next().map(|_| None) + None } } @@ -83,26 +104,87 @@ impl Anchors { /// Returns an iterator over the primary anchor indices that correspond to the given primary anchors. /// /// If a primary anchor does not exist, then the iterator returns `Some(None)`. - pub fn primary_anchor_to_index_iter( + pub fn primary_anchor_to_index_iter< + Anchor: PartialIntoAnchorIndex, + >( &self, - iter: impl IntoIterator, - ) -> impl Iterator> { + iter: impl IntoIterator, + ) -> impl Iterator { PrimaryAnchorToIndexIter { coordinate_iter: iter.into_iter().peekable(), anchor_iter: self.enumerate_primaries().peekable(), } } + /// Returns an iterator over the secondary anchor indices that correspond to the given secondary anchors. /// /// If a secondary anchor does not exist, then the iterator returns `Some(None)`. - pub fn secondary_anchor_to_index_iter( + pub fn secondary_anchor_to_index_iter< + Anchor: PartialIntoAnchorIndex, + >( &self, - iter: impl IntoIterator, + iter: impl IntoIterator, ts_kind: TsKind, - ) -> impl Iterator> { + ) -> impl Iterator { SecondaryAnchorToIndexIter { coordinate_iter: iter.into_iter().peekable(), anchor_iter: self.enumerate_secondaries(ts_kind).peekable(), } } } + +impl PartialIntoAnchorIndex for PrimaryAnchor { + type IntoPartSource = PrimaryAnchor; + + type IntoTarget = AnchorIndex; + + fn source_part(&self) -> &Self::IntoPartSource { + self + } + + fn partial_into(self, target: AnchorIndex) -> Self::IntoTarget { + target + } +} + +impl PartialIntoAnchorIndex for SecondaryAnchor { + type IntoPartSource = SecondaryAnchor; + + type IntoTarget = AnchorIndex; + + fn source_part(&self) -> &Self::IntoPartSource { + self + } + + fn partial_into(self, target: AnchorIndex) -> Self::IntoTarget { + target + } +} + +impl PartialIntoAnchorIndex for (PrimaryAnchor, T) { + type IntoPartSource = PrimaryAnchor; + + type IntoTarget = (AnchorIndex, T); + + fn source_part(&self) -> &Self::IntoPartSource { + &self.0 + } + + fn partial_into(self, target: AnchorIndex) -> Self::IntoTarget { + (target, self.1) + } +} + +impl PartialIntoAnchorIndex for (SecondaryAnchor, T) { + type IntoPartSource = SecondaryAnchor; + + type IntoTarget = (AnchorIndex, T); + + fn source_part(&self) -> &Self::IntoPartSource { + &self.0 + } + + fn partial_into(self, target: AnchorIndex) -> Self::IntoTarget { + (target, self.1) + } +} diff --git a/lib_ts_chainalign/src/chain_align.rs b/lib_ts_chainalign/src/chain_align.rs index b053493..0de9c30 100644 --- a/lib_ts_chainalign/src/chain_align.rs +++ b/lib_ts_chainalign/src/chain_align.rs @@ -42,12 +42,17 @@ use crate::{ mod chainer; mod evaluation; -pub struct AlignmentParameters { +pub struct AlignmentParameters { /// The step width for generating successors during chaining. /// /// At most `max_successors` will be generated at a time, but at least all with minimum chaining cost. pub max_successors: usize, + /// The cost until which the cost function is initialised exactly. + /// + /// Anchor chainings with a higher exact cost are initialised based on the lower bound. + pub max_exact_cost_function_cost: Cost, + /// The closed list type to use for chaining. pub closed_list: ChainingClosedList, @@ -78,7 +83,7 @@ pub fn align( sequences: &AlignmentSequences, start: AlignmentCoordinates, end: AlignmentCoordinates, - parameters: &AlignmentParameters, + parameters: &AlignmentParameters, alignment_costs: &AlignmentCosts, rc_fn: &dyn Fn(u8) -> u8, max_match_run: u32, @@ -122,7 +127,7 @@ pub fn choose_closed_list< sequences: &AlignmentSequences, start: AlignmentCoordinates, end: AlignmentCoordinates, - parameters: &AlignmentParameters, + parameters: &AlignmentParameters, alignment_costs: &AlignmentCosts, rc_fn: &dyn Fn(u8) -> u8, max_match_run: u32, @@ -169,7 +174,7 @@ fn actually_align< sequences: &AlignmentSequences, start: AlignmentCoordinates, end: AlignmentCoordinates, - parameters: &AlignmentParameters, + parameters: &AlignmentParameters, alignment_costs: &AlignmentCosts, rc_fn: &dyn Fn(u8) -> u8, max_match_run: u32, diff --git a/lib_ts_chainalign/src/chain_align/evaluation.rs b/lib_ts_chainalign/src/chain_align/evaluation.rs index 14fbaa5..1d8e510 100644 --- a/lib_ts_chainalign/src/chain_align/evaluation.rs +++ b/lib_ts_chainalign/src/chain_align/evaluation.rs @@ -15,6 +15,7 @@ use crate::{ exact_chaining::{ gap_affine::GapAffineAligner, ts_12_jump::Ts12JumpAligner, ts_34_jump::Ts34JumpAligner, }, + panic_on_extend::PanicOnExtend, }; pub struct ChainEvaluator<'sequences, 'alignment_costs, 'rc_fn, Cost: AStarCost> { @@ -32,8 +33,6 @@ pub struct ChainEvaluator<'sequences, 'alignment_costs, 'rc_fn, Cost: AStarCost> gap_fill_alignments_per_chain: Vec, } -struct PanicOnExtend; - impl<'sequences, 'alignment_costs, 'rc_fn, Cost: AStarCost> ChainEvaluator<'sequences, 'alignment_costs, 'rc_fn, Cost> { @@ -583,9 +582,3 @@ impl<'sequences, 'alignment_costs, 'rc_fn, Cost: AStarCost> &self.gap_fill_alignments_per_chain } } - -impl Extend for PanicOnExtend { - fn extend>(&mut self, iter: Iter) { - assert!(iter.into_iter().next().is_none()); - } -} diff --git a/lib_ts_chainalign/src/chaining_cost_function.rs b/lib_ts_chainalign/src/chaining_cost_function.rs index cd981ad..2420eae 100644 --- a/lib_ts_chainalign/src/chaining_cost_function.rs +++ b/lib_ts_chainalign/src/chaining_cost_function.rs @@ -6,10 +6,16 @@ use log::debug; use num_traits::Zero; use crate::{ - alignment::{coordinates::AlignmentCoordinates, ts_kind::TsKind}, + alignment::{ + coordinates::AlignmentCoordinates, sequences::AlignmentSequences, ts_kind::TsKind, + }, anchors::{Anchors, index::AnchorIndex, primary::PrimaryAnchor, secondary::SecondaryAnchor}, chaining_cost_function::cost_array::ChainingCostArray, chaining_lower_bounds::ChainingLowerBounds, + exact_chaining::{ + gap_affine::GapAffineAligner, ts_12_jump::Ts12JumpAligner, ts_34_jump::Ts34JumpAligner, + }, + panic_on_extend::PanicOnExtend, }; mod cost_array; @@ -25,8 +31,11 @@ impl ChainingCostFunction { pub fn new_from_lower_bounds( chaining_lower_bounds: &ChainingLowerBounds, anchors: &Anchors, + sequences: &AlignmentSequences, start: AlignmentCoordinates, end: AlignmentCoordinates, + max_exact_cost_function_cost: Cost, + rc_fn: &dyn Fn(u8) -> u8, ) -> Self { let start_time = Instant::now(); @@ -35,35 +44,140 @@ impl ChainingCostFunction { let primary_start_anchor_index = AnchorIndex::zero(); let primary_end_anchor_index = primary_anchor_amount - 1; + let mut primary_aligner = GapAffineAligner::new( + sequences, + &chaining_lower_bounds.alignment_costs().primary_costs, + rc_fn, + chaining_lower_bounds.max_match_run(), + ); + let mut secondary_aligner = GapAffineAligner::new( + sequences, + &chaining_lower_bounds.alignment_costs().secondary_costs, + rc_fn, + chaining_lower_bounds.max_match_run(), + ); + let mut ts_12_jump_aligner = Ts12JumpAligner::new( + sequences, + chaining_lower_bounds.alignment_costs(), + rc_fn, + chaining_lower_bounds.max_match_run(), + ); + let mut ts_34_jump_aligner = Ts34JumpAligner::new( + sequences, + chaining_lower_bounds.alignment_costs(), + rc_fn, + chaining_lower_bounds.max_match_run(), + ); + let mut additional_primary_targets_output = Vec::new(); + let mut additional_secondary_targets_output = Vec::new(); + + // Initialise primary with infinitiy. let mut primary = ChainingCostArray::new_from_cost( [primary_anchor_amount, primary_anchor_amount], Cost::max_value(), ); + + // Fill primary from start with exact values. + additional_primary_targets_output.clear(); + primary_aligner.align_until_cost_limit( + sequences.primary_start(), + max_exact_cost_function_cost, + &mut additional_primary_targets_output, + &mut PanicOnExtend, + ); + additional_primary_targets_output.sort_unstable(); + for (to_index, cost) in + anchors.primary_anchor_to_index_iter(additional_primary_targets_output.iter().copied()) + { + let to_index = to_index + 1; + primary[[primary_start_anchor_index, to_index]] = cost; + if cost <= max_exact_cost_function_cost { + primary.set_exact(primary_start_anchor_index, to_index); + } + } + if additional_primary_targets_output + .last() + .map(|(anchor, _)| anchor.start() == end) + .unwrap_or(false) + { + let cost = additional_primary_targets_output.last().unwrap().1; + primary[[primary_start_anchor_index, primary_end_anchor_index]] = cost; + if cost <= max_exact_cost_function_cost { + primary.set_exact(primary_start_anchor_index, primary_end_anchor_index); + } + } + + // Fill remaining primary with lower bound. let gap1 = end.primary_ordinate_a().unwrap() - start.primary_ordinate_a().unwrap(); let gap2 = end.primary_ordinate_b().unwrap() - start.primary_ordinate_b().unwrap(); - primary[[primary_start_anchor_index, primary_end_anchor_index]] = - chaining_lower_bounds.primary_lower_bound(gap1, gap2); + primary[[primary_start_anchor_index, primary_end_anchor_index]] = chaining_lower_bounds + .primary_lower_bound(gap1, gap2) + .max(max_exact_cost_function_cost) + .min(primary[[primary_start_anchor_index, primary_end_anchor_index]]); for (from_index, from_anchor) in anchors.enumerate_primaries() { let from_index = from_index + 1; let (gap1, gap2) = from_anchor.chaining_gaps_from_start(start); - primary[[primary_start_anchor_index, from_index]] = - chaining_lower_bounds.primary_lower_bound(gap1, gap2); + primary[[primary_start_anchor_index, from_index]] = chaining_lower_bounds + .primary_lower_bound(gap1, gap2) + .max(max_exact_cost_function_cost) + .min(primary[[primary_start_anchor_index, from_index]]); + + // Fill primary from from_index with exact values. + additional_primary_targets_output.clear(); + primary_aligner.align_until_cost_limit( + anchors.primary(from_index - 1).end(k), + max_exact_cost_function_cost, + &mut additional_primary_targets_output, + &mut PanicOnExtend, + ); + additional_primary_targets_output.sort_unstable(); + for (to_index, cost) in anchors + .primary_anchor_to_index_iter(additional_primary_targets_output.iter().copied()) + { + let to_index = to_index + 1; + primary[[from_index, to_index]] = cost; + if cost <= max_exact_cost_function_cost { + primary.set_exact(from_index, to_index); + } + } + if additional_primary_targets_output + .last() + .map(|(anchor, _)| anchor.start() == end) + .unwrap_or(false) + { + let cost = additional_primary_targets_output.last().unwrap().1; + primary[[from_index, primary_end_anchor_index]] = cost; + if cost <= max_exact_cost_function_cost { + primary.set_exact(from_index, primary_end_anchor_index); + } + } + + // Fill remaining primary with lower bound. let (gap1, gap2) = from_anchor.chaining_gaps_to_end(end, k); - primary[[from_index, primary_end_anchor_index]] = - chaining_lower_bounds.primary_lower_bound(gap1, gap2); + primary[[from_index, primary_end_anchor_index]] = chaining_lower_bounds + .primary_lower_bound(gap1, gap2) + .max(max_exact_cost_function_cost) + .min(primary[[from_index, primary_end_anchor_index]]); for (to_index, to_anchor) in anchors.enumerate_primaries() { let to_index = to_index + 1; if let Some((gap1, gap2)) = from_anchor.chaining_gaps(&to_anchor, k) { - primary[[from_index, to_index]] = - chaining_lower_bounds.primary_lower_bound(gap1, gap2); + primary[[from_index, to_index]] = chaining_lower_bounds + .primary_lower_bound(gap1, gap2) + .max(max_exact_cost_function_cost) + .min(primary[[from_index, to_index]]); } if from_anchor.is_direct_predecessor_of(&to_anchor) { + debug_assert!( + primary[[from_index, to_index]].is_zero() + || primary[[from_index, to_index]] == Cost::max_value() + ); primary[[from_index, to_index]] = Cost::zero(); } } } + // Initialise secondaries with infinity. let mut secondaries = TsKind::iter() .map(|ts_kind| { ChainingCostArray::new_from_cost( @@ -78,18 +192,51 @@ impl ChainingCostFunction { .unwrap(); for (ts_kind, secondary) in TsKind::iter().zip(&mut secondaries) { for (from_index, from_anchor) in anchors.enumerate_secondaries(ts_kind) { + // Fill secondary from from_index with exact values. + additional_secondary_targets_output.clear(); + secondary_aligner.align_until_cost_limit( + anchors.secondary(from_index, ts_kind).end(ts_kind, k), + max_exact_cost_function_cost, + &mut PanicOnExtend, + &mut additional_secondary_targets_output, + ); + additional_secondary_targets_output.sort_unstable(); + for (to_index, cost) in anchors.secondary_anchor_to_index_iter( + additional_secondary_targets_output.iter().copied(), + ts_kind, + ) { + secondary[[from_index, to_index]] = cost; + if cost <= max_exact_cost_function_cost { + secondary.set_exact(from_index, to_index); + } + } + + // Fill remaining secondary with lower bound. for (to_index, to_anchor) in anchors.enumerate_secondaries(ts_kind) { if let Some((gap1, gap2)) = from_anchor.chaining_gaps(&to_anchor, ts_kind, k) { - secondary[[from_index, to_index]] = - chaining_lower_bounds.secondary_lower_bound(gap1, gap2); + secondary[[from_index, to_index]] = chaining_lower_bounds + .secondary_lower_bound(gap1, gap2) + .max(max_exact_cost_function_cost) + .min(secondary[[from_index, to_index]]); } if from_anchor.is_direct_predecessor_of(&to_anchor) { + debug_assert!( + secondary[[from_index, to_index]].is_zero() + || secondary[[from_index, to_index]] == Cost::max_value(), + "Direct predecessor relationship from S{}{} to S{}{} has cost {}", + ts_kind.digits(), + anchors.secondary(from_index, ts_kind), + ts_kind.digits(), + anchors.secondary(to_index, ts_kind), + secondary[[from_index, to_index]], + ); secondary[[from_index, to_index]] = Cost::zero(); } } } } + // Initialise 12-jumps with infinity. let mut jump_12s = TsKind::iter() .map(|ts_kind| { ChainingCostArray::new_from_cost( @@ -102,15 +249,39 @@ impl ChainingCostFunction { for (ts_kind, jump_12) in TsKind::iter().zip(&mut jump_12s) { for (from_index, from_anchor) in anchors.enumerate_primaries() { let from_index = from_index + 1; + + // Fill 12-jumps from from_index with exact values. + additional_secondary_targets_output.clear(); + ts_12_jump_aligner.align_until_cost_limit( + anchors.primary(from_index - 1).end(k), + ts_kind, + max_exact_cost_function_cost, + &mut additional_secondary_targets_output, + ); + additional_secondary_targets_output.sort_unstable(); + for (to_index, cost) in anchors.secondary_anchor_to_index_iter( + additional_secondary_targets_output.iter().copied(), + ts_kind, + ) { + jump_12[[from_index, to_index]] = cost; + if cost <= max_exact_cost_function_cost { + jump_12.set_exact(from_index, to_index); + } + } + + // Fill remaining 12-jumps with lower bound. for (to_index, to_anchor) in anchors.enumerate_secondaries(ts_kind) { if let Some(gap) = from_anchor.chaining_jump_gap(&to_anchor, ts_kind, k) { - jump_12[[from_index, to_index]] = - chaining_lower_bounds.jump_12_lower_bound(gap); + jump_12[[from_index, to_index]] = chaining_lower_bounds + .jump_12_lower_bound(gap) + .max(max_exact_cost_function_cost) + .min(jump_12[[from_index, to_index]]); } } } } + // Initialise 34-jumps with infinity. let mut jump_34s = TsKind::iter() .map(|ts_kind| { ChainingCostArray::new_from_cost( @@ -122,11 +293,43 @@ impl ChainingCostFunction { .unwrap(); for (ts_kind, jump_34) in TsKind::iter().zip(&mut jump_34s) { for (from_index, from_anchor) in anchors.enumerate_secondaries(ts_kind) { + // Fill 34-jumps from from_index with exact values. + additional_primary_targets_output.clear(); + ts_34_jump_aligner.align_until_cost_limit( + anchors.secondary(from_index, ts_kind).end(ts_kind, k), + max_exact_cost_function_cost, + &mut additional_primary_targets_output, + ); + additional_primary_targets_output.sort_unstable(); + for (to_index, cost) in anchors + .primary_anchor_to_index_iter(additional_primary_targets_output.iter().copied()) + { + let to_index = to_index + 1; + jump_34[[from_index, to_index]] = cost; + if cost <= max_exact_cost_function_cost { + jump_34.set_exact(from_index, to_index); + } + } + if additional_primary_targets_output + .last() + .map(|(anchor, _)| anchor.start() == end) + .unwrap_or(false) + { + let cost = additional_primary_targets_output.last().unwrap().1; + jump_34[[from_index, primary_end_anchor_index]] = cost; + if cost <= max_exact_cost_function_cost { + jump_34.set_exact(from_index, primary_end_anchor_index); + } + } + + // Fill remaining 34-jumps with lower bound. for (to_index, to_anchor) in anchors.enumerate_primaries() { let to_index = to_index + 1; if let Some(gap) = from_anchor.chaining_jump_gap(&to_anchor, ts_kind, k) { - jump_34[[from_index, to_index]] = - chaining_lower_bounds.jump_34_lower_bound(gap); + jump_34[[from_index, to_index]] = chaining_lower_bounds + .jump_34_lower_bound(gap) + .max(max_exact_cost_function_cost) + .min(jump_34[[from_index, to_index]]); } } } @@ -136,12 +339,38 @@ impl ChainingCostFunction { TsKind::iter().zip(jump_12s.iter_mut().zip(&mut jump_34s)) { for (index, anchor) in anchors.enumerate_secondaries(ts_kind) { + // Fill 12-jumps from start with exact values. + additional_secondary_targets_output.clear(); + ts_12_jump_aligner.align_until_cost_limit( + start, + ts_kind, + max_exact_cost_function_cost, + &mut additional_secondary_targets_output, + ); + additional_secondary_targets_output.sort_unstable(); + for (to_index, cost) in anchors.secondary_anchor_to_index_iter( + additional_secondary_targets_output.iter().copied(), + ts_kind, + ) { + jump_12[[primary_start_anchor_index, to_index]] = cost; + if cost <= max_exact_cost_function_cost { + jump_12.set_exact(primary_start_anchor_index, to_index); + } + } + + // Fill remaining 12-jumps from start with lower bound. let gap = anchor.chaining_jump_gap_from_start(start, ts_kind); - jump_12[[primary_start_anchor_index, index]] = - chaining_lower_bounds.jump_12_lower_bound(gap); + jump_12[[primary_start_anchor_index, index]] = chaining_lower_bounds + .jump_12_lower_bound(gap) + .max(max_exact_cost_function_cost) + .min(jump_12[[primary_start_anchor_index, index]]); + + // Fill remaining 34-jumps to end with lower bound. let gap = anchor.chaining_jump_gap_to_end(end, ts_kind, k); - jump_34[[index, primary_end_anchor_index]] = - chaining_lower_bounds.jump_34_lower_bound(gap); + jump_34[[index, primary_end_anchor_index]] = chaining_lower_bounds + .jump_34_lower_bound(gap) + .max(max_exact_cost_function_cost) + .min(jump_34[[index, primary_end_anchor_index]]); } } @@ -574,14 +803,9 @@ impl ChainingCostFunction { total_redundant_gap_fillings: &mut u64, ) { additional_targets.sort_unstable(); - for (to_anchor_index, cost) in anchors - .primary_anchor_to_index_iter(additional_targets.iter().map(|(anchor, _)| *anchor)) - .zip(additional_targets.iter().map(|(_, cost)| *cost)) + for (to_primary_index, cost) in + anchors.primary_anchor_to_index_iter(additional_targets.iter().copied()) { - let Some(to_primary_index) = to_anchor_index else { - continue; - }; - if self.is_primary_exact(from_primary_index, to_primary_index) { *total_redundant_gap_fillings += 1; debug_assert_eq!(self.primary(from_primary_index, to_primary_index), cost); @@ -598,14 +822,9 @@ impl ChainingCostFunction { total_redundant_gap_fillings: &mut u64, ) { additional_targets.sort_unstable(); - for (to_anchor_index, cost) in anchors - .primary_anchor_to_index_iter(additional_targets.iter().map(|(anchor, _)| *anchor)) - .zip(additional_targets.iter().map(|(_, cost)| *cost)) + for (to_primary_index, cost) in + anchors.primary_anchor_to_index_iter(additional_targets.iter().copied()) { - let Some(to_primary_index) = to_anchor_index else { - continue; - }; - if self.is_primary_from_start_exact(to_primary_index) { *total_redundant_gap_fillings += 1; debug_assert_eq!(self.primary_from_start(to_primary_index), cost); @@ -624,17 +843,9 @@ impl ChainingCostFunction { total_redundant_gap_fillings: &mut u64, ) { additional_targets.sort_unstable(); - for (to_anchor_index, cost) in anchors - .secondary_anchor_to_index_iter( - additional_targets.iter().map(|(anchor, _)| *anchor), - ts_kind, - ) - .zip(additional_targets.iter().map(|(_, cost)| *cost)) + for (to_secondary_index, cost) in + anchors.secondary_anchor_to_index_iter(additional_targets.iter().copied(), ts_kind) { - let Some(to_secondary_index) = to_anchor_index else { - continue; - }; - if self.is_secondary_exact(from_secondary_index, to_secondary_index, ts_kind) { *total_redundant_gap_fillings += 1; debug_assert_eq!( @@ -662,17 +873,9 @@ impl ChainingCostFunction { total_redundant_gap_fillings: &mut u64, ) { additional_targets.sort_unstable(); - for (to_anchor_index, cost) in anchors - .secondary_anchor_to_index_iter( - additional_targets.iter().map(|(anchor, _)| *anchor), - ts_kind, - ) - .zip(additional_targets.iter().map(|(_, cost)| *cost)) + for (to_secondary_index, cost) in + anchors.secondary_anchor_to_index_iter(additional_targets.iter().copied(), ts_kind) { - let Some(to_secondary_index) = to_anchor_index else { - continue; - }; - if self.is_jump_12_exact(from_primary_index, to_secondary_index, ts_kind) { *total_redundant_gap_fillings += 1; debug_assert_eq!( @@ -696,17 +899,9 @@ impl ChainingCostFunction { total_redundant_gap_fillings: &mut u64, ) { additional_targets.sort_unstable(); - for (to_anchor_index, cost) in anchors - .secondary_anchor_to_index_iter( - additional_targets.iter().map(|(anchor, _)| *anchor), - ts_kind, - ) - .zip(additional_targets.iter().map(|(_, cost)| *cost)) + for (to_secondary_index, cost) in + anchors.secondary_anchor_to_index_iter(additional_targets.iter().copied(), ts_kind) { - let Some(to_secondary_index) = to_anchor_index else { - continue; - }; - if self.is_jump_12_from_start_exact(to_secondary_index, ts_kind) { *total_redundant_gap_fillings += 1; debug_assert_eq!(self.jump_12_from_start(to_secondary_index, ts_kind), cost,); @@ -725,14 +920,9 @@ impl ChainingCostFunction { total_redundant_gap_fillings: &mut u64, ) { additional_targets.sort_unstable(); - for (to_anchor_index, cost) in anchors - .primary_anchor_to_index_iter(additional_targets.iter().map(|(anchor, _)| *anchor)) - .zip(additional_targets.iter().map(|(_, cost)| *cost)) + for (to_primary_index, cost) in + anchors.primary_anchor_to_index_iter(additional_targets.iter().copied()) { - let Some(to_primary_index) = to_anchor_index else { - continue; - }; - if self.is_jump_34_exact(from_secondary_index, to_primary_index, ts_kind) { *total_redundant_gap_fillings += 1; debug_assert_eq!( diff --git a/lib_ts_chainalign/src/exact_chaining/gap_affine.rs b/lib_ts_chainalign/src/exact_chaining/gap_affine.rs index 46a6ba4..f019001 100644 --- a/lib_ts_chainalign/src/exact_chaining/gap_affine.rs +++ b/lib_ts_chainalign/src/exact_chaining/gap_affine.rs @@ -73,8 +73,62 @@ impl<'sequences, 'cost_table, 'rc_fn, Cost: AStarCost> AStarResult::ExceededMemoryLimit { .. } => unreachable!("Cost limit is None"), AStarResult::NoTarget => (Cost::max_value(), Vec::new().into()), }; - a_star.search_until(|_, node| node.cost > cost); + + Self::fill_additional_targets( + &a_star, + start, + additional_primary_targets_output, + additional_secondary_targets_output, + ); + self.a_star_buffers = Some(a_star.into_buffers()); + + (cost, alignment) + } + + /// Align from start until the cost limit is reached. + /// + /// Collect all closed nodes into the given output lists. + pub fn align_until_cost_limit( + &mut self, + start: AlignmentCoordinates, + cost_limit: Cost, + additional_primary_targets_output: &mut impl Extend<(PrimaryAnchor, Cost)>, + additional_secondary_targets_output: &mut impl Extend<(SecondaryAnchor, Cost)>, + ) { + let end = self.sequences.end(start.ts_kind()); + debug_assert!( + start.is_primary() && end.is_primary() || start.is_secondary() && end.is_secondary() + ); + + let context = Context::new( + self.cost_table, + self.sequences, + self.rc_fn, + start, + end, + true, + self.max_match_run, + ); + let mut a_star = AStar::new_with_buffers(context, self.a_star_buffers.take().unwrap()); + a_star.initialise(); + a_star.search_until(|_, node| node.cost > cost_limit); + + Self::fill_additional_targets( + &a_star, + start, + additional_primary_targets_output, + additional_secondary_targets_output, + ); + self.a_star_buffers = Some(a_star.into_buffers()); + } + + fn fill_additional_targets( + a_star: &AStar>, + start: AlignmentCoordinates, + additional_primary_targets_output: &mut impl Extend<(PrimaryAnchor, Cost)>, + additional_secondary_targets_output: &mut impl Extend<(SecondaryAnchor, Cost)>, + ) { additional_primary_targets_output.extend( a_star .iter_closed_nodes() @@ -103,8 +157,5 @@ impl<'sequences, 'cost_table, 'rc_fn, Cost: AStarCost> ) }), ); - self.a_star_buffers = Some(a_star.into_buffers()); - - (cost, alignment) } } diff --git a/lib_ts_chainalign/src/exact_chaining/ts_12_jump.rs b/lib_ts_chainalign/src/exact_chaining/ts_12_jump.rs index cea55dc..01c2908 100644 --- a/lib_ts_chainalign/src/exact_chaining/ts_12_jump.rs +++ b/lib_ts_chainalign/src/exact_chaining/ts_12_jump.rs @@ -2,8 +2,10 @@ use generic_a_star::{AStar, AStarBuffers, AStarResult, cost::AStarCost}; use crate::{ alignment::{ - Alignment, coordinates::AlignmentCoordinates, sequences::AlignmentSequences, - ts_kind::TsDescendant, + Alignment, + coordinates::AlignmentCoordinates, + sequences::AlignmentSequences, + ts_kind::{TsDescendant, TsKind}, }, anchors::secondary::SecondaryAnchor, costs::AlignmentCosts, @@ -79,6 +81,60 @@ impl<'sequences, 'alignment_costs, 'rc_fn, Cost: AStarCost> }; a_star.search_until(|_, node| node.cost > cost); + Self::fill_additional_targets( + &a_star, + descendant_start, + additional_secondary_targets_output, + ); + self.a_star_buffers = Some(a_star.into_buffers()); + + (cost, alignment) + } + + /// Align from start until the cost limit is reached. + /// + /// Collect all closed nodes into the given output lists. + pub fn align_until_cost_limit( + &mut self, + start: AlignmentCoordinates, + ts_kind: TsKind, + cost_limit: Cost, + additional_secondary_targets_output: &mut impl Extend<(SecondaryAnchor, Cost)>, + ) { + assert!(start.is_primary()); + let end = self.sequences.secondary_end(ts_kind); + + let context = Context::new( + self.alignment_costs, + self.sequences, + self.rc_fn, + start, + end, + true, + self.max_match_run, + ); + let mut a_star = AStar::new_with_buffers(context, self.a_star_buffers.take().unwrap()); + a_star.initialise(); + a_star.search_until(|_, node| node.cost > cost_limit); + + let descendant_start = match end.ts_kind().unwrap().descendant { + TsDescendant::Seq1 => start.primary_ordinate_a(), + TsDescendant::Seq2 => start.primary_ordinate_b(), + } + .unwrap(); + Self::fill_additional_targets( + &a_star, + descendant_start, + additional_secondary_targets_output, + ); + self.a_star_buffers = Some(a_star.into_buffers()); + } + + fn fill_additional_targets( + a_star: &AStar>, + descendant_start: usize, + additional_secondary_targets_output: &mut impl Extend<(SecondaryAnchor, Cost)>, + ) { additional_secondary_targets_output.extend( a_star .iter_closed_nodes() @@ -99,8 +155,5 @@ impl<'sequences, 'alignment_costs, 'rc_fn, Cost: AStarCost> ) }), ); - self.a_star_buffers = Some(a_star.into_buffers()); - - (cost, alignment) } } diff --git a/lib_ts_chainalign/src/exact_chaining/ts_12_jump/algo.rs b/lib_ts_chainalign/src/exact_chaining/ts_12_jump/algo.rs index 049c075..8b424ae 100644 --- a/lib_ts_chainalign/src/exact_chaining/ts_12_jump/algo.rs +++ b/lib_ts_chainalign/src/exact_chaining/ts_12_jump/algo.rs @@ -203,7 +203,7 @@ impl AStarContext for Context<'_, '_, '_, Cost> { // This generates too many jumps, most of these are gonna be much too far. output.extend( coordinates - .generate_12_jumps(self.end, self.sequences.end()) + .generate_12_jumps(self.end, self.sequences.primary_end()) .map(|(jump, coordinates)| Node { identifier: Identifier::Jump12 { coordinates, diff --git a/lib_ts_chainalign/src/exact_chaining/ts_34_jump.rs b/lib_ts_chainalign/src/exact_chaining/ts_34_jump.rs index 4e281b1..202d15e 100644 --- a/lib_ts_chainalign/src/exact_chaining/ts_34_jump.rs +++ b/lib_ts_chainalign/src/exact_chaining/ts_34_jump.rs @@ -2,8 +2,10 @@ use generic_a_star::{AStar, AStarBuffers, AStarResult, cost::AStarCost}; use crate::{ alignment::{ - Alignment, coordinates::AlignmentCoordinates, sequences::AlignmentSequences, - ts_kind::TsDescendant, + Alignment, + coordinates::AlignmentCoordinates, + sequences::AlignmentSequences, + ts_kind::{TsDescendant, TsKind}, }, anchors::primary::PrimaryAnchor, costs::AlignmentCosts, @@ -85,6 +87,71 @@ impl<'sequences, 'alignment_costs, 'rc_fn, Cost: AStarCost> }; a_star.search_until(|_, node| node.cost > cost); + Self::fill_additional_targets( + &a_star, + descendant_start, + start.ts_kind().unwrap(), + self.alignment_costs, + additional_primary_targets_output, + ); + self.a_star_buffers = Some(a_star.into_buffers()); + + // The TS base cost is applied at the 12-jump, but we anyways apply it in this algorithm to make it label-setting if the base cost is non-zero. + // But since the 34-jump has zero cost, we subtract it again. + ( + if cost == Cost::max_value() { + Cost::max_value() + } else { + cost - self.alignment_costs.ts_base_cost + }, + alignment, + ) + } + + /// Align from start until the cost limit is reached. + /// + /// Collect all closed nodes into the given output lists. + pub fn align_until_cost_limit( + &mut self, + start: AlignmentCoordinates, + cost_limit: Cost, + additional_primary_targets_output: &mut impl Extend<(PrimaryAnchor, Cost)>, + ) { + assert!(start.is_secondary()); + let end = self.sequences.primary_end(); + + let context = Context::new( + self.alignment_costs, + self.sequences, + self.rc_fn, + start, + end, + true, + self.max_match_run, + ); + let mut a_star = AStar::new_with_buffers(context, self.a_star_buffers.take().unwrap()); + a_star.initialise(); + a_star.search_until(|_, node| node.cost > cost_limit); + + let descendant_start = start.secondary_ordinate_descendant().unwrap(); + let ts_kind = start.ts_kind().unwrap(); + Self::fill_additional_targets( + &a_star, + descendant_start, + ts_kind, + self.alignment_costs, + additional_primary_targets_output, + ); + self.a_star_buffers = Some(a_star.into_buffers()); + } + + fn fill_additional_targets( + a_star: &AStar>, + descendant_start: usize, + ts_kind: TsKind, + alignment_costs: &AlignmentCosts, + additional_primary_targets_output: &mut impl Extend<(PrimaryAnchor, Cost)>, + ) { additional_primary_targets_output.extend( a_star .iter_closed_nodes() @@ -92,7 +159,7 @@ impl<'sequences, 'alignment_costs, 'rc_fn, Cost: AStarCost> node.identifier.coordinates().is_primary() && (node.identifier.has_non_match() != (descendant_start - == match start.ts_kind().unwrap().descendant { + == match ts_kind.descendant { TsDescendant::Seq1 => { node.identifier.coordinates().primary_ordinate_a().unwrap() } @@ -109,22 +176,10 @@ impl<'sequences, 'alignment_costs, 'rc_fn, Cost: AStarCost> if node.cost == Cost::max_value() { Cost::max_value() } else { - node.cost - self.alignment_costs.ts_base_cost + node.cost - alignment_costs.ts_base_cost }, ) }), ); - self.a_star_buffers = Some(a_star.into_buffers()); - - // The TS base cost is applied at the 12-jump, but we anyways apply it in this algorithm to make it label-setting if the base cost is non-zero. - // But since the 34-jump has zero cost, we subtract it again. - ( - if cost == Cost::max_value() { - Cost::max_value() - } else { - cost - self.alignment_costs.ts_base_cost - }, - alignment, - ) } } diff --git a/lib_ts_chainalign/src/lib.rs b/lib_ts_chainalign/src/lib.rs index ad8cc6e..ba06408 100644 --- a/lib_ts_chainalign/src/lib.rs +++ b/lib_ts_chainalign/src/lib.rs @@ -24,6 +24,7 @@ pub mod chaining_cost_function; pub mod chaining_lower_bounds; pub mod costs; pub mod exact_chaining; +pub mod panic_on_extend; /// Perform preprocessing for tschainalign. /// @@ -44,7 +45,7 @@ pub fn align( reference: Vec, query: Vec, range: AlignmentRange, - parameters: &AlignmentParameters, + parameters: &AlignmentParameters, rc_fn: &dyn Fn(u8) -> u8, reference_name: &str, query_name: &str, @@ -69,8 +70,15 @@ pub fn align( trace!("Anchors:\n{anchors}"); let start = AlignmentCoordinates::new_primary(range.reference_offset(), range.query_offset()); let end = AlignmentCoordinates::new_primary(range.reference_limit(), range.query_limit()); - let mut chaining_cost_function = - ChainingCostFunction::new_from_lower_bounds(chaining_lower_bounds, &anchors, start, end); + let mut chaining_cost_function = ChainingCostFunction::new_from_lower_bounds( + chaining_lower_bounds, + &anchors, + &sequences, + start, + end, + parameters.max_exact_cost_function_cost, + rc_fn, + ); chain_align::align::( &sequences, diff --git a/lib_ts_chainalign/src/panic_on_extend.rs b/lib_ts_chainalign/src/panic_on_extend.rs new file mode 100644 index 0000000..1cdc145 --- /dev/null +++ b/lib_ts_chainalign/src/panic_on_extend.rs @@ -0,0 +1,7 @@ +pub struct PanicOnExtend; + +impl Extend for PanicOnExtend { + fn extend>(&mut self, iter: Iter) { + assert!(iter.into_iter().next().is_none()); + } +} diff --git a/tsalign/src/align.rs b/tsalign/src/align.rs index 4991116..bd266f0 100644 --- a/tsalign/src/align.rs +++ b/tsalign/src/align.rs @@ -114,9 +114,17 @@ pub struct Cli { /// Can be tuned to optimise performance. /// /// This applies only to tschainalign. - #[clap(long, default_value = "5")] + #[clap(long, default_value = "1")] max_chaining_successors: usize, + /// The maximum cost until which the cost function is initialised exactly. + /// + /// Setting this to a higher value will increase the time required to initialise the cost function, but decrease the amount of chains computed. + /// + /// This applies only to tschainalign. + #[clap(long, default_value = "10")] + max_exact_cost_function_cost: u32, + /// The closed list type to use for chaining. /// /// This applies only to tschainalign. diff --git a/tsalign/src/align/a_star_chain_ts.rs b/tsalign/src/align/a_star_chain_ts.rs index 497c4f2..9dc1961 100644 --- a/tsalign/src/align/a_star_chain_ts.rs +++ b/tsalign/src/align/a_star_chain_ts.rs @@ -95,6 +95,7 @@ pub fn align_a_star_chain_ts< let query = query.clone_as_vec(); let parameters = AlignmentParameters { max_successors: cli.max_chaining_successors, + max_exact_cost_function_cost: cli.max_exact_cost_function_cost.into(), closed_list: cli.chaining_closed_list, open_list: cli.chaining_open_list, }; From 3fcc34883e81d05d7ba8581439485ab24796b77c Mon Sep 17 00:00:00 2001 From: Sebastian Schmidt Date: Thu, 18 Dec 2025 16:32:18 +0200 Subject: [PATCH 3/4] Fix 34-jump cost limit. --- lib_ts_chainalign/src/chaining_cost_function.rs | 13 +++++++++---- lib_ts_chainalign/src/exact_chaining/ts_34_jump.rs | 4 +++- tsalign/src/align.rs | 2 +- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/lib_ts_chainalign/src/chaining_cost_function.rs b/lib_ts_chainalign/src/chaining_cost_function.rs index 2420eae..4e64f04 100644 --- a/lib_ts_chainalign/src/chaining_cost_function.rs +++ b/lib_ts_chainalign/src/chaining_cost_function.rs @@ -2,7 +2,7 @@ use std::time::Instant; use generic_a_star::cost::AStarCost; use itertools::Itertools; -use log::debug; +use log::{debug, info}; use num_traits::Zero; use crate::{ @@ -37,6 +37,7 @@ impl ChainingCostFunction { max_exact_cost_function_cost: Cost, rc_fn: &dyn Fn(u8) -> u8, ) -> Self { + info!("Initialising chaining cost function..."); let start_time = Instant::now(); let k = usize::try_from(chaining_lower_bounds.max_match_run() + 1).unwrap(); @@ -378,7 +379,7 @@ impl ChainingCostFunction { let duration = end_time - start_time; debug!( "Initialising chaining cost function took {:.0}ms", - duration.as_secs_f64() * 1000.0 + duration.as_secs_f64() * 1e3, ); Self { @@ -690,7 +691,11 @@ impl ChainingCostFunction { } let target = &mut self.jump_34_array_mut(ts_kind)[[from_secondary_index, to_primary_index + 1]]; - assert!(*target <= cost); + assert!( + *target <= cost, + "Reducing 34-jump from S{}-{from_secondary_index} to P-{to_primary_index}: cost from {target} to {cost}", + ts_kind.digits() + ); let result = *target < cost; *target = cost; result @@ -928,7 +933,7 @@ impl ChainingCostFunction { debug_assert_eq!( self.jump_34(from_secondary_index, to_primary_index, ts_kind), cost, - "Jump12: Previous exact cost was {} but additional target cost is {cost}.\nFrom S{}-{from_secondary_index}{} to P-{to_primary_index}{}.", + "Jump34: Previous exact cost was {} but additional target cost is {cost}.\nFrom S{}-{from_secondary_index}{} to P-{to_primary_index}{}.", self.jump_34(from_secondary_index, to_primary_index, ts_kind), ts_kind.digits(), anchors.secondary(from_secondary_index, ts_kind), diff --git a/lib_ts_chainalign/src/exact_chaining/ts_34_jump.rs b/lib_ts_chainalign/src/exact_chaining/ts_34_jump.rs index 202d15e..1b054ee 100644 --- a/lib_ts_chainalign/src/exact_chaining/ts_34_jump.rs +++ b/lib_ts_chainalign/src/exact_chaining/ts_34_jump.rs @@ -131,7 +131,9 @@ impl<'sequences, 'alignment_costs, 'rc_fn, Cost: AStarCost> ); let mut a_star = AStar::new_with_buffers(context, self.a_star_buffers.take().unwrap()); a_star.initialise(); - a_star.search_until(|_, node| node.cost > cost_limit); + // The TS base cost is applied at the 12-jump, but we anyways apply it in this algorithm to make it label-setting if the base cost is non-zero. + // But since the 34-jump has zero cost, we subtract it again. + a_star.search_until(|_, node| node.cost > cost_limit + self.alignment_costs.ts_base_cost); let descendant_start = start.secondary_ordinate_descendant().unwrap(); let ts_kind = start.ts_kind().unwrap(); diff --git a/tsalign/src/align.rs b/tsalign/src/align.rs index bd266f0..6ebe836 100644 --- a/tsalign/src/align.rs +++ b/tsalign/src/align.rs @@ -122,7 +122,7 @@ pub struct Cli { /// Setting this to a higher value will increase the time required to initialise the cost function, but decrease the amount of chains computed. /// /// This applies only to tschainalign. - #[clap(long, default_value = "10")] + #[clap(long, default_value = "1")] max_exact_cost_function_cost: u32, /// The closed list type to use for chaining. From f425fcc51886fe08b4b9854cda7d06e881296ad8 Mon Sep 17 00:00:00 2001 From: Sebastian Schmidt Date: Thu, 18 Dec 2025 17:46:15 +0200 Subject: [PATCH 4/4] Increase cost lower bound for inexact costs. --- .../src/chaining_cost_function.rs | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/lib_ts_chainalign/src/chaining_cost_function.rs b/lib_ts_chainalign/src/chaining_cost_function.rs index 4e64f04..e51dd87 100644 --- a/lib_ts_chainalign/src/chaining_cost_function.rs +++ b/lib_ts_chainalign/src/chaining_cost_function.rs @@ -113,14 +113,14 @@ impl ChainingCostFunction { let gap2 = end.primary_ordinate_b().unwrap() - start.primary_ordinate_b().unwrap(); primary[[primary_start_anchor_index, primary_end_anchor_index]] = chaining_lower_bounds .primary_lower_bound(gap1, gap2) - .max(max_exact_cost_function_cost) + .max(max_exact_cost_function_cost + Cost::from_usize(1)) .min(primary[[primary_start_anchor_index, primary_end_anchor_index]]); for (from_index, from_anchor) in anchors.enumerate_primaries() { let from_index = from_index + 1; let (gap1, gap2) = from_anchor.chaining_gaps_from_start(start); primary[[primary_start_anchor_index, from_index]] = chaining_lower_bounds .primary_lower_bound(gap1, gap2) - .max(max_exact_cost_function_cost) + .max(max_exact_cost_function_cost + Cost::from_usize(1)) .min(primary[[primary_start_anchor_index, from_index]]); // Fill primary from from_index with exact values. @@ -157,7 +157,7 @@ impl ChainingCostFunction { let (gap1, gap2) = from_anchor.chaining_gaps_to_end(end, k); primary[[from_index, primary_end_anchor_index]] = chaining_lower_bounds .primary_lower_bound(gap1, gap2) - .max(max_exact_cost_function_cost) + .max(max_exact_cost_function_cost + Cost::from_usize(1)) .min(primary[[from_index, primary_end_anchor_index]]); for (to_index, to_anchor) in anchors.enumerate_primaries() { @@ -165,7 +165,7 @@ impl ChainingCostFunction { if let Some((gap1, gap2)) = from_anchor.chaining_gaps(&to_anchor, k) { primary[[from_index, to_index]] = chaining_lower_bounds .primary_lower_bound(gap1, gap2) - .max(max_exact_cost_function_cost) + .max(max_exact_cost_function_cost + Cost::from_usize(1)) .min(primary[[from_index, to_index]]); } if from_anchor.is_direct_predecessor_of(&to_anchor) { @@ -217,7 +217,7 @@ impl ChainingCostFunction { if let Some((gap1, gap2)) = from_anchor.chaining_gaps(&to_anchor, ts_kind, k) { secondary[[from_index, to_index]] = chaining_lower_bounds .secondary_lower_bound(gap1, gap2) - .max(max_exact_cost_function_cost) + .max(max_exact_cost_function_cost + Cost::from_usize(1)) .min(secondary[[from_index, to_index]]); } if from_anchor.is_direct_predecessor_of(&to_anchor) { @@ -275,7 +275,7 @@ impl ChainingCostFunction { if let Some(gap) = from_anchor.chaining_jump_gap(&to_anchor, ts_kind, k) { jump_12[[from_index, to_index]] = chaining_lower_bounds .jump_12_lower_bound(gap) - .max(max_exact_cost_function_cost) + .max(max_exact_cost_function_cost + Cost::from_usize(1)) .min(jump_12[[from_index, to_index]]); } } @@ -329,7 +329,7 @@ impl ChainingCostFunction { if let Some(gap) = from_anchor.chaining_jump_gap(&to_anchor, ts_kind, k) { jump_34[[from_index, to_index]] = chaining_lower_bounds .jump_34_lower_bound(gap) - .max(max_exact_cost_function_cost) + .max(max_exact_cost_function_cost + Cost::from_usize(1)) .min(jump_34[[from_index, to_index]]); } } @@ -363,14 +363,14 @@ impl ChainingCostFunction { let gap = anchor.chaining_jump_gap_from_start(start, ts_kind); jump_12[[primary_start_anchor_index, index]] = chaining_lower_bounds .jump_12_lower_bound(gap) - .max(max_exact_cost_function_cost) + .max(max_exact_cost_function_cost + Cost::from_usize(1)) .min(jump_12[[primary_start_anchor_index, index]]); // Fill remaining 34-jumps to end with lower bound. let gap = anchor.chaining_jump_gap_to_end(end, ts_kind, k); jump_34[[index, primary_end_anchor_index]] = chaining_lower_bounds .jump_34_lower_bound(gap) - .max(max_exact_cost_function_cost) + .max(max_exact_cost_function_cost + Cost::from_usize(1)) .min(jump_34[[index, primary_end_anchor_index]]); } }