From bcaa8785553c6c17db560d989f35a24fcb923361 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sat, 23 May 2026 21:28:21 +0100 Subject: [PATCH 1/3] fix: address six bugs found in post-merge code review Bug-hunt review on master (see BUG_REVIEW.md). Fixes: - send_chunks: bench cap no longer zeroes the final partial chunk - Param routing: activation auto-detect works when --instrument is set - TSV: use mzML column layout (no spurious Title column) - GF SpecE: honor is_protein_n_term for Met-cleaved peptides - TSV: write IsotopeError from psm.isotope_offset - CLI: reject inverted charge/isotope error ranges Adds regression tests for bench mode and inverted charge range. Documents five additional open findings for follow-up. Co-authored-by: Cursor --- BUG_REVIEW.md | 49 +++++++++++++++++++++++++++ crates/msgf-rust/src/bin/msgf-rust.rs | 26 ++++++++++---- crates/msgf-rust/tests/cli_smoke.rs | 47 +++++++++++++++++++++++++ crates/output/src/tsv.rs | 4 +-- crates/search/src/match_engine.rs | 8 ++--- 5 files changed, 120 insertions(+), 14 deletions(-) create mode 100644 BUG_REVIEW.md diff --git a/BUG_REVIEW.md b/BUG_REVIEW.md new file mode 100644 index 00000000..c39371b8 --- /dev/null +++ b/BUG_REVIEW.md @@ -0,0 +1,49 @@ +# msgf-rust bug review (2026-05-23) + +Branch: `review/bug-hunt` (from `master` @ 18360a3d) + +Systematic review of the Rust MS-GF+ port: static analysis of critical paths, +full `cargo test --release --workspace`, and targeted code reading. + +## Fixed in this branch + +| ID | Severity | Location | Issue | Fix | +|---|---|---|---|---| +| B1 | **Critical** | `msgf-rust.rs` `send_chunks` | Bench cap (`--max-spectra N`) truncated the final partial chunk to zero when `total == N` (e.g. N=100 with chunk size 5000 → empty output). | Removed erroneous tail `truncate` block; loop already stops at cap. | +| B2 | **High** | `msgf-rust.rs` param routing | Activation auto-detect was gated on `instrument == low-res`, so `--fragmentation auto --instrument QExactive` on mzML skipped peek and resolved to CID params for HCD data. | Gate auto-route on `fragmentation == auto` + mzML extension only. | +| B3 | **High** | `msgf-rust.rs` TSV write | `write_tsv(..., is_mgf=true)` always emitted MGF layout (extra `Title` column) even for mzML inputs. | Pass `!is_mzml`. | +| B4 | **High** | `match_engine.rs` GF | SpecE GF graph used `start_offset == 0` for protein N-term instead of `cand.is_protein_n_term`, breaking Met-cleaved N-termini at offset 1. | Use `cand.is_protein_n_term` / `is_protein_c_term`. | +| B5 | **Medium** | `tsv.rs` | `IsotopeError` column hardcoded to 0 while PIN writes `psm.isotope_offset`. | Thread isotope offset from PSM. | +| B6 | **Medium** | `msgf-rust.rs` CLI | Inverted `--charge-min/--charge-max` or isotope ranges produced empty ranges with no error. | Validate at startup and return clear error. | + +## Open — not fixed (documented for follow-up) + +| ID | Severity | Location | Issue | +|---|---|---|---| +| B7 | **High** | `match_engine.rs` `dedup_pepseq_score` | Dedup key is `(bare sequence, psm.score.round())`; ignores modifications and uses pin score instead of Java-aligned `rank_score`. Can over-merge TMT/mod variants. | +| B8 | **Medium** | `match_engine.rs` | `dedup_pepseq_score` keeps first PSM on collision (HashMap iteration order) → nondeterministic primary candidate for shared peptides. | +| B9 | **Medium** | `sa_walk.rs` | Does not enforce `max_missed_cleavages`; only used in tests today but would inflate search space if wired to production. | +| B10 | **High** | `mzml.rs` `Iterator::next` | First per-spectrum parse error sets `done=true` and aborts the entire file; remaining spectra are silently skipped. | +| B11 | **Low** | `sa_walk.rs` Met pass | Dedupes Met-cleaved peptides on residue bytes only, collapsing distinct C-terminal contexts. | + +## Known test failures (pre-existing, CI-skipped) + +These fail on `master` without the 7 CI skip flags; tracked as parity/min_peaks regressions: + +- `match_engine_smoke::known_peptide_appears_in_top_n` +- `match_engine_smoke::charge_missing_spectrum_uses_per_charge_scored_spec` +- `match_engine_smoke::spectrum_without_charge_tries_charge_range` +- Maven fixture loads, thread-determinism test (see `.github/workflows/ci.yml`) + +## Verification + +```bash +cargo test --release --workspace -- \ + --skip charge_missing_spectrum_uses_per_charge_scored_spec \ + --skip spectrum_without_charge_tries_charge_range \ + --skip known_peptide_appears_in_top_n \ + --skip read_bsa_canno_text_format \ + --skip read_tryp_pig_bov_revcat_csarr_cnlcp \ + --skip tryp_pig_bov_revcat_full_set_loads \ + --skip match_spectra_output_invariant_across_thread_counts +``` diff --git a/crates/msgf-rust/src/bin/msgf-rust.rs b/crates/msgf-rust/src/bin/msgf-rust.rs index ecb5a330..f5203841 100644 --- a/crates/msgf-rust/src/bin/msgf-rust.rs +++ b/crates/msgf-rust/src/bin/msgf-rust.rs @@ -302,10 +302,6 @@ where } } } - if bench_cap < usize::MAX && total + chunk.len() > bench_cap { - let keep = bench_cap.saturating_sub(total); - chunk.truncate(keep); - } if !chunk.is_empty() { let _ = tx.send(chunk); } @@ -396,8 +392,12 @@ fn run(cli: Cli) -> Result<(), Box> { let param_path = match cli.param_file.clone() { Some(p) => p, None => { - let auto_route_eligible = cli.fragmentation == Fragmentation::Auto - && cli.instrument == Instrument::LowRes; + let ext_is_mzml = cli.spectrum + .extension() + .and_then(|e| e.to_str()) + .map(|s| s.eq_ignore_ascii_case("mzml")) + .unwrap_or(false); + let auto_route_eligible = cli.fragmentation == Fragmentation::Auto && ext_is_mzml; if auto_route_eligible { match detect_dominant_activation(&cli.spectrum) { Some(method) => { @@ -442,7 +442,19 @@ fn run(cli: Cli) -> Result<(), Box> { params.precursor_tolerance = PrecursorTolerance::symmetric(Tolerance::Ppm(cli.precursor_tol_ppm)); params.charge_range = cli.charge_min..=cli.charge_max; + if cli.charge_min > cli.charge_max { + return Err(format!( + "invalid charge range: --charge-min {} > --charge-max {}", + cli.charge_min, cli.charge_max + ).into()); + } params.isotope_error_range = cli.isotope_error_min..=cli.isotope_error_max; + if cli.isotope_error_min > cli.isotope_error_max { + return Err(format!( + "invalid isotope error range: --isotope-error-min {} > --isotope-error-max {}", + cli.isotope_error_min, cli.isotope_error_max + ).into()); + } params.top_n_psms_per_spectrum = cli.top_n; params.num_tolerable_termini = match cli.enzyme_specificity { EnzymeSpecificity::Fully => 2, @@ -665,7 +677,7 @@ fn run(cli: Cli) -> Result<(), Box> { .file_name() .map(|n| n.to_string_lossy().into_owned()) .unwrap_or_else(|| cli.spectrum.display().to_string()); - output::write_tsv(tsv_path, &spectra, &queues, &prepared.candidates, ¶ms, &idx, &spec_file_name, true)?; + output::write_tsv(tsv_path, &spectra, &queues, &prepared.candidates, ¶ms, &idx, &spec_file_name, !is_mzml)?; eprintln!("Wrote TSV: {}", tsv_path.display()); } diff --git a/crates/msgf-rust/tests/cli_smoke.rs b/crates/msgf-rust/tests/cli_smoke.rs index f3ce24f1..b3590add 100644 --- a/crates/msgf-rust/tests/cli_smoke.rs +++ b/crates/msgf-rust/tests/cli_smoke.rs @@ -260,6 +260,53 @@ fn cli_runs_end_to_end_on_tiny_mzml() { ); } +#[test] +fn bench_mode_max_spectra_produces_nonempty_pin() { + // Regression for send_chunks bench-cap bug: --max-spectra 100 must not + // drop the entire final partial chunk (which used to truncate to zero). + let dir = tempfile::tempdir().expect("tempdir"); + let pin_path = dir.path().join("bench.pin"); + + let status = base_cmd( + "test-fixtures/test.mgf", + "test-fixtures/BSA.fasta", + &pin_path, + ) + .arg("--max-spectra") + .arg("100") + .status() + .expect("run msgf-rust bench mode"); + + assert!(status.success(), "bench mode should exit 0, got: {status}"); + assert!(pin_path.exists(), "PIN should be written in bench mode"); + + let content = std::fs::read_to_string(&pin_path).unwrap(); + assert!( + content.lines().count() > 1, + "bench mode with --max-spectra 100 should produce header + data rows" + ); +} + +#[test] +fn cli_rejects_inverted_charge_range() { + let dir = tempfile::tempdir().expect("tempdir"); + let pin_path = dir.path().join("out.pin"); + + let status = base_cmd( + "test-fixtures/test.mgf", + "test-fixtures/BSA.fasta", + &pin_path, + ) + .arg("--charge-min") + .arg("4") + .arg("--charge-max") + .arg("2") + .status() + .expect("run msgf-rust with inverted charge range"); + + assert!(!status.success(), "inverted charge range must fail"); +} + /// Regression guard: legacy Java numeric flag values and the new /// Rust-idiomatic named values must resolve to byte-identical PIN output. /// Quantms scripts use the numeric form; new docs recommend the named form. diff --git a/crates/output/src/tsv.rs b/crates/output/src/tsv.rs index ddc36981..d77d5607 100644 --- a/crates/output/src/tsv.rs +++ b/crates/output/src/tsv.rs @@ -176,8 +176,8 @@ fn write_psm_row( // Precursor m/z formatted to 4 decimal places let precursor = format!("{:.4}", spec.precursor_mz); - // IsotopeError: always 0 (winning isotope offset not threaded here yet) - let isotope_error: i32 = 0; + // IsotopeError: winning isotope offset from the search (matches PIN column). + let isotope_error: i32 = psm.isotope_offset as i32; // PrecursorError: mass_error_ppm stored on psm; convert to Da if needed let precursor_error = if ppm_mode { diff --git a/crates/search/src/match_engine.rs b/crates/search/src/match_engine.rs index 9d776026..4a6c8191 100644 --- a/crates/search/src/match_engine.rs +++ b/crates/search/src/match_engine.rs @@ -680,11 +680,9 @@ fn compute_spec_e_values_for_spectrum( let mut any_c = false; for psm in queue.iter_psms() { let cand = &candidates[psm.primary_candidate_idx() as usize]; - if let Some(prot) = search_index.protein_at(cand.protein_index) { - let start = cand.start_offset_in_protein; - let pep_len = cand.peptide.length(); - if start == 0 { any_n = true; } - if start + pep_len >= prot.sequence.len() { any_c = true; } + if search_index.protein_at(cand.protein_index).is_some() { + if cand.is_protein_n_term { any_n = true; } + if cand.is_protein_c_term { any_c = true; } if any_n && any_c { break; } } } From 2aa605f86510bc5994846eef5ef65f38ab75a408 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sun, 24 May 2026 07:27:14 +0100 Subject: [PATCH 2/3] fix: dedup B7/B8, CLI cleanup, and dedup perf pass Correct mod-aware pepSeq dedup to use rank_score with deterministic survivor selection, add isotope-range validation test, and optimize the dedup hot path with Arc-cached integer keys and FxHashMap charge queues. Co-authored-by: Cursor --- BUG_REVIEW.md | 9 +- crates/msgf-rust/src/bin/msgf-rust.rs | 30 ++-- crates/msgf-rust/tests/cli_smoke.rs | 20 +++ crates/output/src/tsv.rs | 3 +- crates/search/src/match_engine.rs | 236 +++++++++++++++++++++----- 5 files changed, 231 insertions(+), 67 deletions(-) diff --git a/BUG_REVIEW.md b/BUG_REVIEW.md index c39371b8..25f5aecf 100644 --- a/BUG_REVIEW.md +++ b/BUG_REVIEW.md @@ -15,13 +15,13 @@ full `cargo test --release --workspace`, and targeted code reading. | B4 | **High** | `match_engine.rs` GF | SpecE GF graph used `start_offset == 0` for protein N-term instead of `cand.is_protein_n_term`, breaking Met-cleaved N-termini at offset 1. | Use `cand.is_protein_n_term` / `is_protein_c_term`. | | B5 | **Medium** | `tsv.rs` | `IsotopeError` column hardcoded to 0 while PIN writes `psm.isotope_offset`. | Thread isotope offset from PSM. | | B6 | **Medium** | `msgf-rust.rs` CLI | Inverted `--charge-min/--charge-max` or isotope ranges produced empty ranges with no error. | Validate at startup and return clear error. | +| B7 | **High** | `match_engine.rs` dedup | Dedup used bare sequence + pin score; merged mod variants incorrectly. | Mod-aware pepSeq key + `rank_score`. | +| B8 | **Medium** | `match_engine.rs` dedup | HashMap survivor order was nondeterministic. | `BTreeMap` + best-`rank_score` survivor rule. | ## Open — not fixed (documented for follow-up) | ID | Severity | Location | Issue | |---|---|---|---| -| B7 | **High** | `match_engine.rs` `dedup_pepseq_score` | Dedup key is `(bare sequence, psm.score.round())`; ignores modifications and uses pin score instead of Java-aligned `rank_score`. Can over-merge TMT/mod variants. | -| B8 | **Medium** | `match_engine.rs` | `dedup_pepseq_score` keeps first PSM on collision (HashMap iteration order) → nondeterministic primary candidate for shared peptides. | | B9 | **Medium** | `sa_walk.rs` | Does not enforce `max_missed_cleavages`; only used in tests today but would inflate search space if wired to production. | | B10 | **High** | `mzml.rs` `Iterator::next` | First per-spectrum parse error sets `done=true` and aborts the entire file; remaining spectra are silently skipped. | | B11 | **Low** | `sa_walk.rs` Met pass | Dedupes Met-cleaved peptides on residue bytes only, collapsing distinct C-terminal contexts. | @@ -47,3 +47,8 @@ cargo test --release --workspace -- \ --skip tryp_pig_bov_revcat_full_set_loads \ --skip match_spectra_output_invariant_across_thread_counts ``` + +## Performance (dedup pass) + +- PepSeq dedup keys use integer mod units + `Arc` cache per candidate (avoids repeated string formatting). +- Per-charge `TopNQueue` map uses `FxHashMap` (typically 1–3 charges per spectrum). diff --git a/crates/msgf-rust/src/bin/msgf-rust.rs b/crates/msgf-rust/src/bin/msgf-rust.rs index f5203841..25fe030f 100644 --- a/crates/msgf-rust/src/bin/msgf-rust.rs +++ b/crates/msgf-rust/src/bin/msgf-rust.rs @@ -381,23 +381,19 @@ fn run(cli: Cli) -> Result<(), Box> { // ── 4. Load Param scoring model ─────────────────────────────────────────── // - // When the user provided `--param-file`, that wins outright. Otherwise: - // * If `--fragmentation`/`--instrument` are set, honour them (existing - // behaviour — preserves the bench harness's explicit-flag path). - // * If none of those are set, peek the input file for its dominant - // activation method and route to the matching bundled .param file. - // This mirrors Java MS-GF+'s ASWRITTEN per-spectrum dispatch at the - // file-wide granularity (good enough when an mzML carries a single - // activation method, which is the common case). + // `--param-file` wins outright. Otherwise, for mzML with `--fragmentation auto`, + // peek the file's dominant activation method and pick the bundled `.param`. + // MGF and explicit fragmentation/instrument flags use `resolve_bundled_param`. + let spectrum_ext = cli.spectrum + .extension() + .and_then(|e| e.to_str()) + .map(|s| s.to_lowercase()); + let is_mzml = matches!(spectrum_ext.as_deref(), Some("mzml")); + let param_path = match cli.param_file.clone() { Some(p) => p, None => { - let ext_is_mzml = cli.spectrum - .extension() - .and_then(|e| e.to_str()) - .map(|s| s.eq_ignore_ascii_case("mzml")) - .unwrap_or(false); - let auto_route_eligible = cli.fragmentation == Fragmentation::Auto && ext_is_mzml; + let auto_route_eligible = cli.fragmentation == Fragmentation::Auto && is_mzml; if auto_route_eligible { match detect_dominant_activation(&cli.spectrum) { Some(method) => { @@ -514,10 +510,7 @@ fn run(cli: Cli) -> Result<(), Box> { prepared.bucket_index.len(), ); - let ext = cli.spectrum - .extension() - .and_then(|e| e.to_str()) - .map(|s| s.to_lowercase()); + let ext = spectrum_ext; let ms_level_u32 = cli.ms_level as u32; let bench_mode = cli.max_spectra > 0; let bench_cap = if bench_mode { cli.max_spectra } else { usize::MAX }; @@ -541,7 +534,6 @@ fn run(cli: Cli) -> Result<(), Box> { // Spawn the parser thread. It owns the reader (paths + flags moved in). // The thread returns ParseStats with the error count + sample messages. let spectrum_path = cli.spectrum.clone(); - let is_mzml = matches!(ext.as_deref(), Some("mzml")); let mzml_warn_ms_level_emitted = if !is_mzml && cli.ms_level != 2 { eprintln!( "WARN: --ms-level={} requested for an MGF input; MGF files \ diff --git a/crates/msgf-rust/tests/cli_smoke.rs b/crates/msgf-rust/tests/cli_smoke.rs index b3590add..ac69ba0a 100644 --- a/crates/msgf-rust/tests/cli_smoke.rs +++ b/crates/msgf-rust/tests/cli_smoke.rs @@ -307,6 +307,26 @@ fn cli_rejects_inverted_charge_range() { assert!(!status.success(), "inverted charge range must fail"); } +#[test] +fn cli_rejects_inverted_isotope_error_range() { + let dir = tempfile::tempdir().expect("tempdir"); + let pin_path = dir.path().join("out.pin"); + + let status = base_cmd( + "test-fixtures/test.mgf", + "test-fixtures/BSA.fasta", + &pin_path, + ) + .arg("--isotope-error-min") + .arg("3") + .arg("--isotope-error-max") + .arg("-1") + .status() + .expect("run msgf-rust with inverted isotope range"); + + assert!(!status.success(), "inverted isotope error range must fail"); +} + /// Regression guard: legacy Java numeric flag values and the new /// Rust-idiomatic named values must resolve to byte-identical PIN output. /// Quantms scripts use the numeric form; new docs recommend the named form. diff --git a/crates/output/src/tsv.rs b/crates/output/src/tsv.rs index d77d5607..a8c258bc 100644 --- a/crates/output/src/tsv.rs +++ b/crates/output/src/tsv.rs @@ -12,8 +12,7 @@ //! //! * **FragMethod**: `ActivationMethod::name()` for the five canonical variants; //! `"UNKNOWN"` for unknown / unset activation. -//! * **IsotopeError**: always `0`; the winning isotope offset is not currently -//! threaded into the TSV writer. +//! * **IsotopeError**: winning isotope offset from the search (`PsmMatch::isotope_offset`). //! * **Decoy filtering**: decoys are emitted; downstream Percolator labels them. //! * **SpecID for non-MGF**: `"scan=N"` (mzML convention). diff --git a/crates/search/src/match_engine.rs b/crates/search/src/match_engine.rs index 4a6c8191..82473f1f 100644 --- a/crates/search/src/match_engine.rs +++ b/crates/search/src/match_engine.rs @@ -3,6 +3,7 @@ use std::collections::{BTreeMap, HashMap}; use std::hash::Hasher; use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::Arc; // GF failure-mode diagnostics (2026-05-19). Module-level atomics // incremented per-bin from compute_spec_e_values_for_spectrum and @@ -22,7 +23,7 @@ static GF_BIN_ATTEMPTS: AtomicU64 = AtomicU64::new(0); static GF_SPECTRA_NO_GROUP: AtomicU64 = AtomicU64::new(0); use rayon::prelude::*; -use rustc_hash::{FxHashSet, FxHasher}; +use rustc_hash::{FxHashMap, FxHashSet, FxHasher}; use smallvec::{smallvec, SmallVec}; use model::aa_set::AminoAcidSet; @@ -318,7 +319,7 @@ impl<'a> PreparedSearch<'a> { // R-2.1: per-charge queue keyed by charge state. Mirrors Java's // per-SpecKey raw-score retention (DBScanner.java:534). - let mut per_charge_queues: HashMap = HashMap::new(); + let mut per_charge_queues: FxHashMap = FxHashMap::default(); for &cand_idx in &window_cand_indices { let cand = &candidates[cand_idx]; @@ -680,11 +681,9 @@ fn compute_spec_e_values_for_spectrum( let mut any_c = false; for psm in queue.iter_psms() { let cand = &candidates[psm.primary_candidate_idx() as usize]; - if search_index.protein_at(cand.protein_index).is_some() { - if cand.is_protein_n_term { any_n = true; } - if cand.is_protein_c_term { any_c = true; } - if any_n && any_c { break; } - } + if cand.is_protein_n_term { any_n = true; } + if cand.is_protein_c_term { any_c = true; } + if any_n && any_c { break; } } (any_n, any_c) }; @@ -1364,51 +1363,200 @@ mod feature_tests { } } -/// Pre-merge dedup pass (R-2.2): collapse PSMs that share the same -/// (peptide_residue, rounded_score) key into a single entry, aggregating -/// their `candidate_idxs` into a unified Vec. Mirrors Java's -/// `DBScanner.java:719-733` `pepSeqMap` dedup. -/// -/// Called by the per-spectrum loop after the per-candidate scoring loop, -/// before per-charge GF compute (so SpecE is computed on the deduped set). -/// -/// Inputs: -/// - `psms`: drained from a per-charge `TopNQueue` via `drain_into_vec` -/// - `candidates`: the search's enumerated candidate slice; used to resolve -/// each PSM's peptide residue sequence for the dedup key -/// -/// Returns: deduped `Vec`. The caller re-pushes these into the -/// per-charge queue via `queue.push()` for each entry. +/// Pre-merge dedup pass (R-2.2): collapse PSMs sharing the same Java +/// `(pepSeq, score)` key before per-charge GF compute. pub(crate) fn dedup_pepseq_score( psms: Vec, candidates: &[Candidate], ) -> Vec { - use std::collections::HashMap; + use std::collections::btree_map::Entry; + use std::collections::BTreeMap; - // Key: (peptide_residue_bytes, rounded_score_i32) - // The residue sequence is the unmodified bare AA string, matching Java's - // `m.getPepSeq()` used as the dedup key (DBScanner.java:721). - let mut groups: HashMap<(Vec, i32), PsmMatch> = HashMap::new(); + let mut pep_key_cache: FxHashMap> = FxHashMap::default(); + let mut groups: BTreeMap = BTreeMap::new(); for psm in psms { - let cand = &candidates[psm.primary_candidate_idx() as usize]; - let pep_residues: Vec = cand.peptide.residues.iter().map(|aa| aa.residue).collect(); - let score_rounded = psm.score.round() as i32; - let key = (pep_residues, score_rounded); - - groups - .entry(key) - .and_modify(|existing| { - // Aggregate this PSM's indices into the surviving entry. - // Avoid duplicates if the same idx somehow appears twice. - for &idx in &psm.candidate_idxs { - if !existing.candidate_idxs.contains(&idx) { - existing.candidate_idxs.push(idx); - } + let primary = psm.primary_candidate_idx(); + let pep_key = pep_key_cache + .entry(primary) + .or_insert_with(|| Arc::new(PepDedupKey::from_peptide(&candidates[primary as usize].peptide))) + .clone(); + let key = DedupMapKey { + pep: pep_key, + score: psm.rank_score.round() as i32, + }; + + match groups.entry(key) { + Entry::Vacant(slot) => { + slot.insert(psm); + } + Entry::Occupied(mut slot) => { + let existing = slot.get_mut(); + merge_unique_candidate_idxs(&mut existing.candidate_idxs, &psm.candidate_idxs); + if psm.rank_score > existing.rank_score { + let merged_idxs = std::mem::take(&mut existing.candidate_idxs); + let mut survivor = psm; + merge_unique_candidate_idxs(&mut survivor.candidate_idxs, &merged_idxs); + *existing = survivor; } - }) - .or_insert(psm); + } + } } - groups.into_values().collect() + let mut out = Vec::with_capacity(groups.len()); + out.extend(groups.into_values()); + out +} + +/// Mod-aware dedup key: bare residues plus per-position mod mass (1e-5 Da units). +/// Matches Java pepSeq semantics without string formatting on the hot path. +#[derive(Clone, PartialEq, Eq, PartialOrd, Ord)] +struct PepDedupKey { + residues: Vec, + mod_units: Vec, +} + +impl PepDedupKey { + fn from_peptide(peptide: &model::Peptide) -> Self { + let len = peptide.residues.len(); + let mut residues = Vec::with_capacity(len); + let mut mod_units = Vec::with_capacity(len); + for aa in &peptide.residues { + residues.push(aa.residue); + mod_units.push( + aa.mod_ + .as_ref() + .map(|m| (m.mass_delta * 100_000.0).round() as i32) + .unwrap_or(0), + ); + } + Self { residues, mod_units } + } +} + +#[derive(Clone, PartialEq, Eq)] +struct DedupMapKey { + pep: Arc, + score: i32, +} + +impl PartialOrd for DedupMapKey { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for DedupMapKey { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + self.pep + .residues + .cmp(&other.pep.residues) + .then_with(|| self.pep.mod_units.cmp(&other.pep.mod_units)) + .then(self.score.cmp(&other.score)) + } +} + +fn merge_unique_candidate_idxs(into: &mut Vec, from: &[u32]) { + for &idx in from { + if !into.contains(&idx) { + into.push(idx); + } + } +} + +#[cfg(test)] +mod dedup_tests { + use super::*; + use std::sync::Arc; + use model::amino_acid::AminoAcid; + use model::modification::{ModLocation, Modification}; + use model::peptide::Peptide; + use model::ResidueSpec; + use crate::psm::PsmMatch; + + fn seq_peptide(bytes: &[u8]) -> Peptide { + let residues: Vec = bytes + .iter() + .filter_map(|&b| AminoAcid::standard(b)) + .collect(); + Peptide::new(residues, b'R', b'K') + } + + fn cand_with_peptide(peptide: Peptide) -> Candidate { + Candidate { + peptide, + protein_index: 0, + start_offset_in_protein: 0, + is_decoy: false, + is_protein_n_term: false, + is_protein_c_term: false, + } + } + + fn psm(primary: u32, rank: f32, pin: f32) -> PsmMatch { + PsmMatch { + spectrum_idx: 0, + candidate_idxs: vec![primary], + charge_used: 2, + mass_error_ppm: 0.0, + score: pin, + rank_score: rank, + edge_score: (rank - pin) as i32, + spec_e_value: 1.0, + de_novo_score: 0, + activation_method: None, + e_value: 1.0, + features: Default::default(), + isotope_offset: 0, + } + } + + #[test] + fn dedup_uses_rank_score_not_pin_score() { + let pep = seq_peptide(b"PEPTK"); + let cands = vec![cand_with_peptide(pep.clone())]; + let psms = vec![ + psm(0, 100.4, 99.6), + psm(0, 120.0, 99.6), + ]; + let out = dedup_pepseq_score(psms, &cands); + assert_eq!(out.len(), 2, "different rank_score keys must not merge"); + } + + #[test] + fn dedup_distinguishes_mod_state() { + let mut ox = seq_peptide(b"PEPMK"); + ox.residues[3].mod_ = Some(Arc::new(Modification { + name: "Ox".into(), + mass_delta: 15.99491, + residue: ResidueSpec::Specific(b'M'), + location: ModLocation::Anywhere, + fixed: false, + accession: None, + })); + let cands = vec![ + cand_with_peptide(seq_peptide(b"PEPMK")), + cand_with_peptide(ox), + ]; + let psms = vec![ + psm(0, 50.0, 50.0), + psm(1, 50.0, 50.0), + ]; + let out = dedup_pepseq_score(psms, &cands); + assert_eq!(out.len(), 2, "mod-aware pepSeq keys must differ"); + } + + #[test] + fn dedup_keeps_highest_rank_score_survivor() { + let pep = seq_peptide(b"PEPTK"); + let cands = vec![cand_with_peptide(pep)]; + // Same rounded score bucket (60) but different float rank_score — merge to best. + let psms = vec![ + psm(0, 59.6, 50.0), + psm(0, 60.4, 50.0), + ]; + let out = dedup_pepseq_score(psms, &cands); + assert_eq!(out.len(), 1); + assert!((out[0].rank_score - 60.4).abs() < f32::EPSILON); + } } From 06dc0d158a050b1af6ffc32cca66544586f28981 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sun, 24 May 2026 07:36:29 +0100 Subject: [PATCH 3/3] docs: fix stale auto-detect, PIN columns, and broken links MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Align README and DOCS with post-B2/B5 behavior, correct PIN column count, document inverted range validation, and replace removed known-divergences.md references with DOCS §8d. Co-authored-by: Cursor --- BUG_REVIEW.md | 20 +++++++++++++++++++- DOCS.md | 24 +++++++++++++----------- README.md | 6 +++--- benchmark/README.md | 5 +++++ benchmark/ci/README.md | 4 +++- crates/search/src/search_index.rs | 2 +- crates/search/tests/gf_java_parity.rs | 8 ++++---- 7 files changed, 48 insertions(+), 21 deletions(-) diff --git a/BUG_REVIEW.md b/BUG_REVIEW.md index 25f5aecf..46a90f2e 100644 --- a/BUG_REVIEW.md +++ b/BUG_REVIEW.md @@ -22,7 +22,7 @@ full `cargo test --release --workspace`, and targeted code reading. | ID | Severity | Location | Issue | |---|---|---|---| -| B9 | **Medium** | `sa_walk.rs` | Does not enforce `max_missed_cleavages`; only used in tests today but would inflate search space if wired to production. | +| B9 | **Low** | `sa_walk.rs` | Test-only SA walk helper does not enforce `max_missed_cleavages`; production search uses `candidate_gen::enumerate_candidates`, which does. | | B10 | **High** | `mzml.rs` `Iterator::next` | First per-spectrum parse error sets `done=true` and aborts the entire file; remaining spectra are silently skipped. | | B11 | **Low** | `sa_walk.rs` Met pass | Dedupes Met-cleaved peptides on residue bytes only, collapsing distinct C-terminal contexts. | @@ -52,3 +52,21 @@ cargo test --release --workspace -- \ - PepSeq dedup keys use integer mod units + `Arc` cache per candidate (avoids repeated string formatting). - Per-charge `TopNQueue` map uses `FxHashMap` (typically 1–3 charges per spectrum). + +## Documentation review (2026-05-24) + +Fixes applied on this branch: + +| Issue | Location | Fix | +|---|---|---| +| PIN column count said "28" | `README.md` | Corrected to 36 (default charge 2–3) + EdgeScore note | +| Auto-detect described "first spectrum" only | `README.md` | First 64 MS2 histogram; `--instrument` does not gate peek | +| Auto-detect required `--instrument low-res` | `DOCS.md` §4 | Matches code: only `--fragmentation auto` + mzML | +| TSV `IsotopeError` documented as always 0 | `DOCS.md` §3b | Updated after B5 fix | +| Broken `known-divergences.md` links | `README.md`, `DOCS.md` §8d | Legacy file removed in iter39; point to §8d / tests | +| Inverted charge/isotope ranges undocumented | `DOCS.md` §1 | Startup validation documented | + +**Still stale (not fixed here):** + +- `benchmark/ci/README.md` — references Java Maven workflow; no Rust benchmark workflow in `.github/workflows/` yet. +- `.claude/CLAUDE.md` — Java-tree context; accurate on `java-legacy` branch only. diff --git a/DOCS.md b/DOCS.md index 63de6a69..a531ec56 100644 --- a/DOCS.md +++ b/DOCS.md @@ -37,15 +37,15 @@ All flags use kebab-case long options (`--flag-name`). Several flags also accept | Flag | Type | Default | Description | Legacy form | |---|---|---|---|---| | `--precursor-tol-ppm` | f64 | `20.0` | Symmetric precursor mass tolerance in parts per million. | Java `-t 20ppm` | -| `--charge-min` | u8 | `2` | Minimum precursor charge to try when the spectrum record does not specify charge. | *(no direct Java flag; set via param file in Java)* | -| `--charge-max` | u8 | `3` | Maximum precursor charge to try when charge is missing from the spectrum. | *(same)* | +| `--charge-min` | u8 | `2` | Minimum precursor charge to try when the spectrum record does not specify charge. Must be ≤ `--charge-max` (inverted ranges are rejected at startup). | *(no direct Java flag; set via param file in Java)* | +| `--charge-max` | u8 | `3` | Maximum precursor charge to try when charge is missing from the spectrum. Must be ≥ `--charge-min`. | *(same)* | | `--enzyme-specificity` | enum | `fully` | Enzymatic cleavage enforcement at peptide termini (Number of Tolerable Termini). `fully`: both termini must be cleavage sites (Java `-ntt 2`). `semi`: at least one terminus (Java `-ntt 1`). `non-specific`: neither required (Java `-ntt 0`). | `--ntt` alias; numeric `0`/`1`/`2` | | `--max-missed-cleavages` | u32 | `1` | Maximum missed enzymatic cleavages allowed per candidate peptide. | Java `-maxMissedCleavages 1` | | `--min-length` | u32 | `6` | Minimum peptide length in residues (excluding flanking context). | Java `-minLength 6` | | `--max-length` | u32 | `40` | Maximum peptide length in residues. | Java `-maxLength 40` | | `--top-n` | u32 | `10` | Maximum PSMs retained per spectrum (ranked by SpecEValue). | Java `-n 10` | -| `--isotope-error-min` | i8 | `-1` | Minimum isotope error offset to evaluate during precursor matching. | Java `-ti -1,2` (first value) | -| `--isotope-error-max` | i8 | `2` | Maximum isotope error offset to evaluate. | Java `-ti -1,2` (second value) | +| `--isotope-error-min` | i8 | `-1` | Minimum isotope error offset to evaluate during precursor matching. Must be ≤ `--isotope-error-max`. | Java `-ti -1,2` (first value) | +| `--isotope-error-max` | i8 | `2` | Maximum isotope error offset to evaluate. Must be ≥ `--isotope-error-min`. | Java `-ti -1,2` (second value) | | `--min-peaks` | u32 | `10` | Minimum number of MS2 peaks required to score a spectrum; spectra below this threshold are skipped. | Java `-minNumPeaks 10` | ### Modifications @@ -165,7 +165,7 @@ msgf-rust writes Percolator `.pin` (always) and optionally `.tsv`. Implementatio ### 3a. PIN columns -Tab-separated, one header row, one row per PSM. Rows are sorted best-first within each spectrum (lowest SpecEValue first). Charge one-hot columns are emitted for every integer charge in `[--charge-min, --charge-max]`; the table below uses the default range 2–3 (`charge2`, `charge3`). +Tab-separated, one header row, one row per PSM. Rows are sorted best-first within each spectrum (lowest SpecEValue first). With default `--charge-min 2 --charge-max 3`, the header has **36 columns**: 35 Java-parity fields plus Rust-only `EdgeScore` (before `Peptide`). Additional charge states add one `chargeN` column each. | Column | Type | Description | |---|---|---| @@ -220,7 +220,7 @@ Tab-separated human-readable report. The `Title` column appears **only for MGF** | `Title` | string | MGF `TITLE=` field. | | `FragMethod` | string | Activation method name (`HCD`, `CID`, …) or `UNKNOWN`. | | `Precursor` | float | Precursor m/z (4 decimal places). | -| `IsotopeError` | int | Always `0` in current release (winning offset not threaded to TSV). | +| `IsotopeError` | int | Winning isotope offset (same value as PIN `isotope_error`). | | `PrecursorError(ppm)` | float | Mass error in ppm when tolerance is ppm mode; column named `PrecursorError(Da)` in Da mode. | | `Charge` | int | Assigned precursor charge. | | `Peptide` | string | Annotated peptide sequence with modifications. | @@ -242,14 +242,16 @@ Use **PIN** when the goal is FDR calibration or rescoring: Percolator, MS²Resco ## 4. Auto-detection -For mzML inputs, when `--fragmentation auto`, `--instrument low-res`, and `--protocol auto` (the CLI defaults), msgf-rust peeks the input file before loading the full dataset: +For **mzML** inputs when `--fragmentation auto` (the default), msgf-rust peeks the input file before loading the full dataset: 1. **Activation method** — histogram of `` cvParams across the first 64 MS2 spectra; dominant method wins. Mixed methods trigger an stderr warning but the dominant method is still used file-wide. 2. **Instrument class** — scans `` / analyzer cvParams via `input::detect_instrument_type`; dominant analyzer among MS2 spectra wins. `None` → `low-res` (Java `LOW_RESOLUTION_LTQ` default). -MGF files carry no activation or instrument metadata → auto-detect returns `None` → bundled default `HCD_QExactive_Tryp.param` unless explicit `--fragmentation` / `--instrument` flags override. +The CLI `--instrument` flag does **not** gate this path — only `--fragmentation auto` + mzML extension does. When peek succeeds, instrument is taken from the file; `--protocol` from the CLI is still used to pick protocol-suffixed `.param` files (e.g. `_TMT`). -Explicit `--fragmentation` (non-auto) or non-default `--instrument` disables the activation peek and uses flag-based resolution directly (§1). +MGF files carry no activation or instrument metadata → auto-detect returns `None` → bundled default `HCD_QExactive_Tryp.param` unless explicit `--fragmentation` / `--instrument` flags override via `resolve_bundled_param`. + +Non-auto `--fragmentation` (e.g. `HCD`, `3`) disables the activation peek and uses flag-based resolution directly (§1), including `--instrument` and `--protocol` from the CLI. ### Activation CV mapping (mzML `` cvParam accession → method) @@ -476,9 +478,9 @@ On PSMs where Java and Rust agree on scan + top-1 peptide (the "agreement bucket | Feature | Divergence | Status | |---|---|---| -| `lnEValue` | ~4 orders of magnitude mean shift (Rust more confident) | Deferred — `num_distinct` semantics differ (`known-divergences.md` #2) | +| `lnEValue` | ~4 orders of magnitude mean shift (Rust more confident) | Deferred — `num_distinct` semantics differ (see item #2 below) | | `MeanRelErrorTop7` / `MeanErrorTop7` / `StdevRelErrorTop7` | >1% relative difference on ~99% of agreement-bucket PSMs | Deferred — error-stat normalization differs | -| BSA charge-3 SpecEValue (BSA.fasta + test.mgf fixture) | 1–4 OOM gap depending on deconvolution iteration | Known — deconvolution implementation divergence (`known-divergences.md` #3); kept as dev-branch smoke gate | +| BSA charge-3 SpecEValue (BSA.fasta + test.mgf fixture) | 1–4 OOM gap depending on deconvolution iteration | Known — deconvolution implementation divergence; kept as dev-branch smoke gate (`gf_java_parity` tests) | Percolator's learned weights absorb these distribution shifts; rescored FDR results remain competitive or better than Java. diff --git a/README.md b/README.md index f57bc6da..f3a1b553 100644 --- a/README.md +++ b/README.md @@ -66,7 +66,7 @@ msgf-rust \ This runs a tryptic search at 20 ppm precursor tolerance with the bundled HCD_QExactive_Tryp scoring model, writes Percolator-format PSMs to `out.pin`, and prints per-phase timings to stderr. Feed `out.pin` directly into Percolator (Docker or native) to compute q-values. -A row in `out.pin` is one peptide–spectrum match with 28 columns: `SpecId`, `Label`, `ScanNr`, charge one-hot encoding, then features like `RawScore`, `lnSpecEValue`, `DeNovoScore`, ion-current ratios, peptide-length stats, etc. Full column reference: `DOCS.md` §3a. +A row in `out.pin` is one peptide–spectrum match. With the default charge range (2–3), each row has **36 tab-separated columns**: 35 Java-parity Percolator features plus Rust-only `EdgeScore` (inserted before `Peptide`). Charge one-hot columns scale with `[--charge-min, --charge-max]`. Full column reference: `DOCS.md` §3a. ## Common workflows @@ -131,11 +131,11 @@ Run `msgf-rust --help` for the auto-generated help with full descriptions. ## Auto-detection -For mzML inputs, msgf-rust reads the activation block of the first MS2 spectrum and selects a bundled `.param` file accordingly. The detection covers HCD/CID/ETD/UVPD activation and LowRes/HighRes/TOF/QExactive instrument classes (via mzML CV params). The bundled model is then resolved from `(fragmentation, instrument, protocol)`. MGF files have no activation metadata, so they go through the CLI defaults (which can be overridden with explicit `--fragmentation` / `--instrument` flags). Full resolution table: `DOCS.md` §4. +For mzML inputs with `--fragmentation auto` (the default), msgf-rust peeks the first 64 MS2 spectra, histograms activation methods and analyzer types, and selects a bundled `.param` file from the dominant values. The `--instrument` CLI flag is **not** required for this path — instrument class is read from the mzML when possible. `--protocol` from the CLI is still applied when resolving the bundled model. MGF files have no activation metadata, so they use flag-based resolution (defaulting to `HCD_QExactive_Tryp.param`). Full resolution table: `DOCS.md` §4. ## Parity vs Java MS-GF+ -PIN output columns are bit-exact with Java MS-GF+ on the agreement bucket (same scan + same top-1 peptide) for most features. Three residual divergences exist as deferred research: `lnEValue` (num_distinct semantics), `MeanRelErrorTop7` (error-stat normalization), and the BSA charge-3 SEV gap from the deconvolution-implementation difference (`known-divergences.md` item #3, kept on the development branch). None gate cutover; aggregate 1% FDR PSM counts beat Java on all three benchmark datasets. Full detail: `DOCS.md` §8d. +PIN output columns are bit-exact with Java MS-GF+ on the agreement bucket (same scan + same top-1 peptide) for most features. Three residual divergences exist as deferred research: `lnEValue` (num_distinct semantics), `MeanRelErrorTop7` (error-stat normalization), and the BSA charge-3 SEV gap from deconvolution-implementation differences. None gate cutover; aggregate 1% FDR PSM counts beat Java on all three benchmark datasets. Full detail: `DOCS.md` §8d. ## Citation diff --git a/benchmark/README.md b/benchmark/README.md index 099fe10d..604387cb 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -4,6 +4,11 @@ Only the **CI benchmark scaffold** is committed under this directory; heavy local-only harnesses and artifacts (`data/`, `results/`, prebuilt JARs, etc.) are intentionally gitignored and not distributed with the fork. +The PXD001819 CI scripts under `ci/` were written for the Java MS-GF+ build +(`mvn`, mzIdentML output). The Rust binary uses PIN/TSV and a separate CI +workflow (`.github/workflows/ci.yml`); Rust benchmark automation is not yet +wired to this scaffold. + ## Datasets | Dataset | PXD | Instrument | Type | Spectra Source | FASTA / SDRF | diff --git a/benchmark/ci/README.md b/benchmark/ci/README.md index aded6d57..be422145 100644 --- a/benchmark/ci/README.md +++ b/benchmark/ci/README.md @@ -1,6 +1,8 @@ # CI benchmark (PXD001819) -- **Workflow:** `.github/workflows/benchmark-pxd001819.yml` (`workflow_dispatch`) +> **Note:** This scaffold targets the Java MS-GF+ tree (`mvn`, mzIdentML metrics). The Rust port (`msgf-rust`) uses `.github/workflows/ci.yml` for tests but does not yet wire this benchmark harness. See [`benchmark/README.md`](../README.md) for scope. + +- **Workflow:** `.github/workflows/benchmark-pxd001819.yml` (`workflow_dispatch`) — Java branch only - **Run locally:** `mvn -B package -DskipTests && bash benchmark/ci/PXD001819/run_ci.sh` - **Compare:** `python3 benchmark/ci/PXD001819/compare_metrics.py benchmark/results/PXD001819/ci/ci_metrics.txt benchmark/ci/PXD001819/baseline.tsv` - **Self-test:** `python3 -m unittest benchmark.ci.PXD001819.test_compare_metrics` diff --git a/crates/search/src/search_index.rs b/crates/search/src/search_index.rs index 0a7b7777..cbc9cde3 100644 --- a/crates/search/src/search_index.rs +++ b/crates/search/src/search_index.rs @@ -63,7 +63,7 @@ impl SearchIndex { /// /// Counts distinct prefixes of length `l` across the entire suffix array /// (target + decoy combined, modulo the still-open mod-context divergence - /// tracked in `docs/parity-analysis/known-divergences.md`). + /// tracked in `DOCS.md` §8d). /// /// Distinct identity is the residue byte sequence with no mods and no /// flanking residues. Two candidates with identical residues but different diff --git a/crates/search/tests/gf_java_parity.rs b/crates/search/tests/gf_java_parity.rs index 8eb3c93d..0ff65363 100644 --- a/crates/search/tests/gf_java_parity.rs +++ b/crates/search/tests/gf_java_parity.rs @@ -11,7 +11,7 @@ //! Java SP values are now captured directly via //! `-Dmsgfplus.gftrace=true` against `target/MSGFPlus.jar` (commit e918376) //! so the test compares SP-vs-SP. The remaining `num_distinct`-level -//! discrepancy is tracked separately as known-divergences item #2 +//! discrepancy is tracked in `DOCS.md` §8d (lnEValue / num_distinct). //! (e_value proxy follow-up). //! //! Reference fixture (for context, not used for the assertion): @@ -91,7 +91,7 @@ const FIVE_TRACED_PSMS: &[(i32, &str, u8, f64)] = &[ /// Best case; Rust and Java agree to within ~18%. /// /// The remaining SP-level drift is small and is tracked under the -/// known-divergences list (RawScore scale + Float.MIN_VALUE underflow +/// known-divergences list (see `DOCS.md` §8d: RawScore scale + Float.MIN_VALUE underflow /// guard). The previously suspected scan-3353-specific score-distribution /// width bug appears to have been an artifact of the SEV-vs-SP comparison. /// @@ -100,7 +100,7 @@ const FIVE_TRACED_PSMS: &[(i32, &str, u8, f64)] = &[ /// `NewScoredSpectrum.java:83-88`). The two charge-3 PSMs in this fixture /// (scan 3416 and 3353) moved from 0.24/0.13 OOM → 1.03/1.20 OOM. The shift /// EXPOSES an underlying deconvolution-implementation divergence between -/// Rust and Java (`known-divergences.md` item #3, still open). The fix is +/// Rust and Java (`DOCS.md` §8d — BSA charge-3 deconvolution, still open). The fix is /// algorithmically correct — Rust now matches Java's prob_peak ordering — /// but the deconvoluted peak list differs from Java's implementation, /// shifting ion_existence_score. Charge-2 PSMs (3 of 5 in this fixture) are @@ -112,7 +112,7 @@ const FIVE_TRACED_PSMS: &[(i32, &str, u8, f64)] = &[ /// validated on Astral (Rust now BEATS Java by +287 PSMs at 1% FDR; /// see project memory `iter32-37-shipped`). It also widens the BSA /// charge-3 SEV gap from 1.03/1.20 OOM → 2.56-3.58 OOM because the -/// deconvolution-implementation divergence (`known-divergences.md` #3) +/// deconvolution-implementation divergence (`DOCS.md` §8d) /// now feeds the corrected score path. Bumping tolerance to 4.0 OOM /// keeps this test as a coarse smoke gate while #3 remains open; a /// regression beyond 4.0 OOM would still signal a new bug.