From bcaa8785553c6c17db560d989f35a24fcb923361 Mon Sep 17 00:00:00 2001
From: Yasset Perez-Riverol <ypriverol@gmail.com>
Date: Sat, 23 May 2026 21:28:21 +0100
Subject: [PATCH 1/3] fix: address six bugs found in post-merge code review

Bug-hunt review on master (see BUG_REVIEW.md). Fixes:

- send_chunks: bench cap no longer zeroes the final partial chunk
- Param routing: activation auto-detect works when --instrument is set
- TSV: use mzML column layout (no spurious Title column)
- GF SpecE: honor is_protein_n_term for Met-cleaved peptides
- TSV: write IsotopeError from psm.isotope_offset
- CLI: reject inverted charge/isotope error ranges

Adds regression tests for bench mode and inverted charge range.
Documents five additional open findings for follow-up.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 BUG_REVIEW.md                         | 49 +++++++++++++++++++++++++++
 crates/msgf-rust/src/bin/msgf-rust.rs | 26 ++++++++++----
 crates/msgf-rust/tests/cli_smoke.rs   | 47 +++++++++++++++++++++++++
 crates/output/src/tsv.rs              |  4 +--
 crates/search/src/match_engine.rs     |  8 ++---
 5 files changed, 120 insertions(+), 14 deletions(-)
 create mode 100644 BUG_REVIEW.md

diff --git a/BUG_REVIEW.md b/BUG_REVIEW.md
new file mode 100644
index 00000000..c39371b8
--- /dev/null
+++ b/BUG_REVIEW.md
@@ -0,0 +1,49 @@
+# msgf-rust bug review (2026-05-23)
+
+Branch: `review/bug-hunt` (from `master` @ 18360a3d)
+
+Systematic review of the Rust MS-GF+ port: static analysis of critical paths,
+full `cargo test --release --workspace`, and targeted code reading.
+
+## Fixed in this branch
+
+| ID | Severity | Location | Issue | Fix |
+|---|---|---|---|---|
+| B1 | **Critical** | `msgf-rust.rs` `send_chunks` | Bench cap (`--max-spectra N`) truncated the final partial chunk to zero when `total == N` (e.g. N=100 with chunk size 5000 → empty output). | Removed erroneous tail `truncate` block; loop already stops at cap. |
+| B2 | **High** | `msgf-rust.rs` param routing | Activation auto-detect was gated on `instrument == low-res`, so `--fragmentation auto --instrument QExactive` on mzML skipped peek and resolved to CID params for HCD data. | Gate auto-route on `fragmentation == auto` + mzML extension only. |
+| B3 | **High** | `msgf-rust.rs` TSV write | `write_tsv(..., is_mgf=true)` always emitted MGF layout (extra `Title` column) even for mzML inputs. | Pass `!is_mzml`. |
+| B4 | **High** | `match_engine.rs` GF | SpecE GF graph used `start_offset == 0` for protein N-term instead of `cand.is_protein_n_term`, breaking Met-cleaved N-termini at offset 1. | Use `cand.is_protein_n_term` / `is_protein_c_term`. |
+| B5 | **Medium** | `tsv.rs` | `IsotopeError` column hardcoded to 0 while PIN writes `psm.isotope_offset`. | Thread isotope offset from PSM. |
+| B6 | **Medium** | `msgf-rust.rs` CLI | Inverted `--charge-min/--charge-max` or isotope ranges produced empty ranges with no error. | Validate at startup and return clear error. |
+
+## Open — not fixed (documented for follow-up)
+
+| ID | Severity | Location | Issue |
+|---|---|---|---|
+| B7 | **High** | `match_engine.rs` `dedup_pepseq_score` | Dedup key is `(bare sequence, psm.score.round())`; ignores modifications and uses pin score instead of Java-aligned `rank_score`. Can over-merge TMT/mod variants. |
+| B8 | **Medium** | `match_engine.rs` | `dedup_pepseq_score` keeps first PSM on collision (HashMap iteration order) → nondeterministic primary candidate for shared peptides. |
+| B9 | **Medium** | `sa_walk.rs` | Does not enforce `max_missed_cleavages`; only used in tests today but would inflate search space if wired to production. |
+| B10 | **High** | `mzml.rs` `Iterator::next` | First per-spectrum parse error sets `done=true` and aborts the entire file; remaining spectra are silently skipped. |
+| B11 | **Low** | `sa_walk.rs` Met pass | Dedupes Met-cleaved peptides on residue bytes only, collapsing distinct C-terminal contexts. |
+
+## Known test failures (pre-existing, CI-skipped)
+
+These fail on `master` without the 7 CI skip flags; tracked as parity/min_peaks regressions:
+
+- `match_engine_smoke::known_peptide_appears_in_top_n`
+- `match_engine_smoke::charge_missing_spectrum_uses_per_charge_scored_spec`
+- `match_engine_smoke::spectrum_without_charge_tries_charge_range`
+- Maven fixture loads, thread-determinism test (see `.github/workflows/ci.yml`)
+
+## Verification
+
+```bash
+cargo test --release --workspace -- \
+  --skip charge_missing_spectrum_uses_per_charge_scored_spec \
+  --skip spectrum_without_charge_tries_charge_range \
+  --skip known_peptide_appears_in_top_n \
+  --skip read_bsa_canno_text_format \
+  --skip read_tryp_pig_bov_revcat_csarr_cnlcp \
+  --skip tryp_pig_bov_revcat_full_set_loads \
+  --skip match_spectra_output_invariant_across_thread_counts
+```
diff --git a/crates/msgf-rust/src/bin/msgf-rust.rs b/crates/msgf-rust/src/bin/msgf-rust.rs
index ecb5a330..f5203841 100644
--- a/crates/msgf-rust/src/bin/msgf-rust.rs
+++ b/crates/msgf-rust/src/bin/msgf-rust.rs
@@ -302,10 +302,6 @@ where
             }
         }
     }
-    if bench_cap < usize::MAX && total + chunk.len() > bench_cap {
-        let keep = bench_cap.saturating_sub(total);
-        chunk.truncate(keep);
-    }
     if !chunk.is_empty() {
         let _ = tx.send(chunk);
     }
@@ -396,8 +392,12 @@ fn run(cli: Cli) -> Result<(), Box<dyn std::error::Error>> {
     let param_path = match cli.param_file.clone() {
         Some(p) => p,
         None    => {
-            let auto_route_eligible = cli.fragmentation == Fragmentation::Auto
-                && cli.instrument == Instrument::LowRes;
+            let ext_is_mzml = cli.spectrum
+                .extension()
+                .and_then(|e| e.to_str())
+                .map(|s| s.eq_ignore_ascii_case("mzml"))
+                .unwrap_or(false);
+            let auto_route_eligible = cli.fragmentation == Fragmentation::Auto && ext_is_mzml;
             if auto_route_eligible {
                 match detect_dominant_activation(&cli.spectrum) {
                     Some(method) => {
@@ -442,7 +442,19 @@ fn run(cli: Cli) -> Result<(), Box<dyn std::error::Error>> {
     params.precursor_tolerance =
         PrecursorTolerance::symmetric(Tolerance::Ppm(cli.precursor_tol_ppm));
     params.charge_range = cli.charge_min..=cli.charge_max;
+    if cli.charge_min > cli.charge_max {
+        return Err(format!(
+            "invalid charge range: --charge-min {} > --charge-max {}",
+            cli.charge_min, cli.charge_max
+        ).into());
+    }
     params.isotope_error_range = cli.isotope_error_min..=cli.isotope_error_max;
+    if cli.isotope_error_min > cli.isotope_error_max {
+        return Err(format!(
+            "invalid isotope error range: --isotope-error-min {} > --isotope-error-max {}",
+            cli.isotope_error_min, cli.isotope_error_max
+        ).into());
+    }
     params.top_n_psms_per_spectrum = cli.top_n;
     params.num_tolerable_termini = match cli.enzyme_specificity {
         EnzymeSpecificity::Fully => 2,
@@ -665,7 +677,7 @@ fn run(cli: Cli) -> Result<(), Box<dyn std::error::Error>> {
             .file_name()
             .map(|n| n.to_string_lossy().into_owned())
             .unwrap_or_else(|| cli.spectrum.display().to_string());
-        output::write_tsv(tsv_path, &spectra, &queues, &prepared.candidates, &params, &idx, &spec_file_name, true)?;
+        output::write_tsv(tsv_path, &spectra, &queues, &prepared.candidates, &params, &idx, &spec_file_name, !is_mzml)?;
         eprintln!("Wrote TSV: {}", tsv_path.display());
     }
 
diff --git a/crates/msgf-rust/tests/cli_smoke.rs b/crates/msgf-rust/tests/cli_smoke.rs
index f3ce24f1..b3590add 100644
--- a/crates/msgf-rust/tests/cli_smoke.rs
+++ b/crates/msgf-rust/tests/cli_smoke.rs
@@ -260,6 +260,53 @@ fn cli_runs_end_to_end_on_tiny_mzml() {
     );
 }
 
+#[test]
+fn bench_mode_max_spectra_produces_nonempty_pin() {
+    // Regression for send_chunks bench-cap bug: --max-spectra 100 must not
+    // drop the entire final partial chunk (which used to truncate to zero).
+    let dir = tempfile::tempdir().expect("tempdir");
+    let pin_path = dir.path().join("bench.pin");
+
+    let status = base_cmd(
+        "test-fixtures/test.mgf",
+        "test-fixtures/BSA.fasta",
+        &pin_path,
+    )
+    .arg("--max-spectra")
+    .arg("100")
+    .status()
+    .expect("run msgf-rust bench mode");
+
+    assert!(status.success(), "bench mode should exit 0, got: {status}");
+    assert!(pin_path.exists(), "PIN should be written in bench mode");
+
+    let content = std::fs::read_to_string(&pin_path).unwrap();
+    assert!(
+        content.lines().count() > 1,
+        "bench mode with --max-spectra 100 should produce header + data rows"
+    );
+}
+
+#[test]
+fn cli_rejects_inverted_charge_range() {
+    let dir = tempfile::tempdir().expect("tempdir");
+    let pin_path = dir.path().join("out.pin");
+
+    let status = base_cmd(
+        "test-fixtures/test.mgf",
+        "test-fixtures/BSA.fasta",
+        &pin_path,
+    )
+    .arg("--charge-min")
+    .arg("4")
+    .arg("--charge-max")
+    .arg("2")
+    .status()
+    .expect("run msgf-rust with inverted charge range");
+
+    assert!(!status.success(), "inverted charge range must fail");
+}
+
 /// Regression guard: legacy Java numeric flag values and the new
 /// Rust-idiomatic named values must resolve to byte-identical PIN output.
 /// Quantms scripts use the numeric form; new docs recommend the named form.
diff --git a/crates/output/src/tsv.rs b/crates/output/src/tsv.rs
index ddc36981..d77d5607 100644
--- a/crates/output/src/tsv.rs
+++ b/crates/output/src/tsv.rs
@@ -176,8 +176,8 @@ fn write_psm_row<W: Write>(
     // Precursor m/z formatted to 4 decimal places
     let precursor = format!("{:.4}", spec.precursor_mz);
 
-    // IsotopeError: always 0 (winning isotope offset not threaded here yet)
-    let isotope_error: i32 = 0;
+    // IsotopeError: winning isotope offset from the search (matches PIN column).
+    let isotope_error: i32 = psm.isotope_offset as i32;
 
     // PrecursorError: mass_error_ppm stored on psm; convert to Da if needed
     let precursor_error = if ppm_mode {
diff --git a/crates/search/src/match_engine.rs b/crates/search/src/match_engine.rs
index 9d776026..4a6c8191 100644
--- a/crates/search/src/match_engine.rs
+++ b/crates/search/src/match_engine.rs
@@ -680,11 +680,9 @@ fn compute_spec_e_values_for_spectrum(
         let mut any_c = false;
         for psm in queue.iter_psms() {
             let cand = &candidates[psm.primary_candidate_idx() as usize];
-            if let Some(prot) = search_index.protein_at(cand.protein_index) {
-                let start = cand.start_offset_in_protein;
-                let pep_len = cand.peptide.length();
-                if start == 0 { any_n = true; }
-                if start + pep_len >= prot.sequence.len() { any_c = true; }
+            if search_index.protein_at(cand.protein_index).is_some() {
+                if cand.is_protein_n_term { any_n = true; }
+                if cand.is_protein_c_term { any_c = true; }
                 if any_n && any_c { break; }
             }
         }

From 2aa605f86510bc5994846eef5ef65f38ab75a408 Mon Sep 17 00:00:00 2001
From: Yasset Perez-Riverol <ypriverol@gmail.com>
Date: Sun, 24 May 2026 07:27:14 +0100
Subject: [PATCH 2/3] fix: dedup B7/B8, CLI cleanup, and dedup perf pass

Correct mod-aware pepSeq dedup to use rank_score with deterministic
survivor selection, add isotope-range validation test, and optimize the
dedup hot path with Arc-cached integer keys and FxHashMap charge queues.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 BUG_REVIEW.md                         |   9 +-
 crates/msgf-rust/src/bin/msgf-rust.rs |  30 ++--
 crates/msgf-rust/tests/cli_smoke.rs   |  20 +++
 crates/output/src/tsv.rs              |   3 +-
 crates/search/src/match_engine.rs     | 236 +++++++++++++++++++++-----
 5 files changed, 231 insertions(+), 67 deletions(-)

diff --git a/BUG_REVIEW.md b/BUG_REVIEW.md
index c39371b8..25f5aecf 100644
--- a/BUG_REVIEW.md
+++ b/BUG_REVIEW.md
@@ -15,13 +15,13 @@ full `cargo test --release --workspace`, and targeted code reading.
 | B4 | **High** | `match_engine.rs` GF | SpecE GF graph used `start_offset == 0` for protein N-term instead of `cand.is_protein_n_term`, breaking Met-cleaved N-termini at offset 1. | Use `cand.is_protein_n_term` / `is_protein_c_term`. |
 | B5 | **Medium** | `tsv.rs` | `IsotopeError` column hardcoded to 0 while PIN writes `psm.isotope_offset`. | Thread isotope offset from PSM. |
 | B6 | **Medium** | `msgf-rust.rs` CLI | Inverted `--charge-min/--charge-max` or isotope ranges produced empty ranges with no error. | Validate at startup and return clear error. |
+| B7 | **High** | `match_engine.rs` dedup | Dedup used bare sequence + pin score; merged mod variants incorrectly. | Mod-aware pepSeq key + `rank_score`. |
+| B8 | **Medium** | `match_engine.rs` dedup | HashMap survivor order was nondeterministic. | `BTreeMap` + best-`rank_score` survivor rule. |
 
 ## Open — not fixed (documented for follow-up)
 
 | ID | Severity | Location | Issue |
 |---|---|---|---|
-| B7 | **High** | `match_engine.rs` `dedup_pepseq_score` | Dedup key is `(bare sequence, psm.score.round())`; ignores modifications and uses pin score instead of Java-aligned `rank_score`. Can over-merge TMT/mod variants. |
-| B8 | **Medium** | `match_engine.rs` | `dedup_pepseq_score` keeps first PSM on collision (HashMap iteration order) → nondeterministic primary candidate for shared peptides. |
 | B9 | **Medium** | `sa_walk.rs` | Does not enforce `max_missed_cleavages`; only used in tests today but would inflate search space if wired to production. |
 | B10 | **High** | `mzml.rs` `Iterator::next` | First per-spectrum parse error sets `done=true` and aborts the entire file; remaining spectra are silently skipped. |
 | B11 | **Low** | `sa_walk.rs` Met pass | Dedupes Met-cleaved peptides on residue bytes only, collapsing distinct C-terminal contexts. |
@@ -47,3 +47,8 @@ cargo test --release --workspace -- \
   --skip tryp_pig_bov_revcat_full_set_loads \
   --skip match_spectra_output_invariant_across_thread_counts
 ```
+
+## Performance (dedup pass)
+
+- PepSeq dedup keys use integer mod units + `Arc` cache per candidate (avoids repeated string formatting).
+- Per-charge `TopNQueue` map uses `FxHashMap<u8, _>` (typically 1–3 charges per spectrum).
diff --git a/crates/msgf-rust/src/bin/msgf-rust.rs b/crates/msgf-rust/src/bin/msgf-rust.rs
index f5203841..25fe030f 100644
--- a/crates/msgf-rust/src/bin/msgf-rust.rs
+++ b/crates/msgf-rust/src/bin/msgf-rust.rs
@@ -381,23 +381,19 @@ fn run(cli: Cli) -> Result<(), Box<dyn std::error::Error>> {
 
     // ── 4. Load Param scoring model ───────────────────────────────────────────
     //
-    // When the user provided `--param-file`, that wins outright. Otherwise:
-    //   * If `--fragmentation`/`--instrument` are set, honour them (existing
-    //     behaviour — preserves the bench harness's explicit-flag path).
-    //   * If none of those are set, peek the input file for its dominant
-    //     activation method and route to the matching bundled .param file.
-    //     This mirrors Java MS-GF+'s ASWRITTEN per-spectrum dispatch at the
-    //     file-wide granularity (good enough when an mzML carries a single
-    //     activation method, which is the common case).
+    // `--param-file` wins outright. Otherwise, for mzML with `--fragmentation auto`,
+    // peek the file's dominant activation method and pick the bundled `.param`.
+    // MGF and explicit fragmentation/instrument flags use `resolve_bundled_param`.
+    let spectrum_ext = cli.spectrum
+        .extension()
+        .and_then(|e| e.to_str())
+        .map(|s| s.to_lowercase());
+    let is_mzml = matches!(spectrum_ext.as_deref(), Some("mzml"));
+
     let param_path = match cli.param_file.clone() {
         Some(p) => p,
         None    => {
-            let ext_is_mzml = cli.spectrum
-                .extension()
-                .and_then(|e| e.to_str())
-                .map(|s| s.eq_ignore_ascii_case("mzml"))
-                .unwrap_or(false);
-            let auto_route_eligible = cli.fragmentation == Fragmentation::Auto && ext_is_mzml;
+            let auto_route_eligible = cli.fragmentation == Fragmentation::Auto && is_mzml;
             if auto_route_eligible {
                 match detect_dominant_activation(&cli.spectrum) {
                     Some(method) => {
@@ -514,10 +510,7 @@ fn run(cli: Cli) -> Result<(), Box<dyn std::error::Error>> {
         prepared.bucket_index.len(),
     );
 
-    let ext = cli.spectrum
-        .extension()
-        .and_then(|e| e.to_str())
-        .map(|s| s.to_lowercase());
+    let ext = spectrum_ext;
     let ms_level_u32 = cli.ms_level as u32;
     let bench_mode = cli.max_spectra > 0;
     let bench_cap = if bench_mode { cli.max_spectra } else { usize::MAX };
@@ -541,7 +534,6 @@ fn run(cli: Cli) -> Result<(), Box<dyn std::error::Error>> {
     // Spawn the parser thread. It owns the reader (paths + flags moved in).
     // The thread returns ParseStats with the error count + sample messages.
     let spectrum_path = cli.spectrum.clone();
-    let is_mzml = matches!(ext.as_deref(), Some("mzml"));
     let mzml_warn_ms_level_emitted = if !is_mzml && cli.ms_level != 2 {
         eprintln!(
             "WARN: --ms-level={} requested for an MGF input; MGF files \
diff --git a/crates/msgf-rust/tests/cli_smoke.rs b/crates/msgf-rust/tests/cli_smoke.rs
index b3590add..ac69ba0a 100644
--- a/crates/msgf-rust/tests/cli_smoke.rs
+++ b/crates/msgf-rust/tests/cli_smoke.rs
@@ -307,6 +307,26 @@ fn cli_rejects_inverted_charge_range() {
     assert!(!status.success(), "inverted charge range must fail");
 }
 
+#[test]
+fn cli_rejects_inverted_isotope_error_range() {
+    let dir = tempfile::tempdir().expect("tempdir");
+    let pin_path = dir.path().join("out.pin");
+
+    let status = base_cmd(
+        "test-fixtures/test.mgf",
+        "test-fixtures/BSA.fasta",
+        &pin_path,
+    )
+    .arg("--isotope-error-min")
+    .arg("3")
+    .arg("--isotope-error-max")
+    .arg("-1")
+    .status()
+    .expect("run msgf-rust with inverted isotope range");
+
+    assert!(!status.success(), "inverted isotope error range must fail");
+}
+
 /// Regression guard: legacy Java numeric flag values and the new
 /// Rust-idiomatic named values must resolve to byte-identical PIN output.
 /// Quantms scripts use the numeric form; new docs recommend the named form.
diff --git a/crates/output/src/tsv.rs b/crates/output/src/tsv.rs
index d77d5607..a8c258bc 100644
--- a/crates/output/src/tsv.rs
+++ b/crates/output/src/tsv.rs
@@ -12,8 +12,7 @@
 //!
 //! * **FragMethod**: `ActivationMethod::name()` for the five canonical variants;
 //!   `"UNKNOWN"` for unknown / unset activation.
-//! * **IsotopeError**: always `0`; the winning isotope offset is not currently
-//!   threaded into the TSV writer.
+//! * **IsotopeError**: winning isotope offset from the search (`PsmMatch::isotope_offset`).
 //! * **Decoy filtering**: decoys are emitted; downstream Percolator labels them.
 //! * **SpecID for non-MGF**: `"scan=N"` (mzML convention).
 
diff --git a/crates/search/src/match_engine.rs b/crates/search/src/match_engine.rs
index 4a6c8191..82473f1f 100644
--- a/crates/search/src/match_engine.rs
+++ b/crates/search/src/match_engine.rs
@@ -3,6 +3,7 @@
 use std::collections::{BTreeMap, HashMap};
 use std::hash::Hasher;
 use std::sync::atomic::{AtomicU64, Ordering};
+use std::sync::Arc;
 
 // GF failure-mode diagnostics (2026-05-19). Module-level atomics
 // incremented per-bin from compute_spec_e_values_for_spectrum and
@@ -22,7 +23,7 @@ static GF_BIN_ATTEMPTS: AtomicU64 = AtomicU64::new(0);
 static GF_SPECTRA_NO_GROUP: AtomicU64 = AtomicU64::new(0);
 
 use rayon::prelude::*;
-use rustc_hash::{FxHashSet, FxHasher};
+use rustc_hash::{FxHashMap, FxHashSet, FxHasher};
 use smallvec::{smallvec, SmallVec};
 
 use model::aa_set::AminoAcidSet;
@@ -318,7 +319,7 @@ impl<'a> PreparedSearch<'a> {
 
             // R-2.1: per-charge queue keyed by charge state. Mirrors Java's
             // per-SpecKey raw-score retention (DBScanner.java:534).
-            let mut per_charge_queues: HashMap<u8, TopNQueue> = HashMap::new();
+            let mut per_charge_queues: FxHashMap<u8, TopNQueue> = FxHashMap::default();
 
             for &cand_idx in &window_cand_indices {
                 let cand = &candidates[cand_idx];
@@ -680,11 +681,9 @@ fn compute_spec_e_values_for_spectrum(
         let mut any_c = false;
         for psm in queue.iter_psms() {
             let cand = &candidates[psm.primary_candidate_idx() as usize];
-            if search_index.protein_at(cand.protein_index).is_some() {
-                if cand.is_protein_n_term { any_n = true; }
-                if cand.is_protein_c_term { any_c = true; }
-                if any_n && any_c { break; }
-            }
+            if cand.is_protein_n_term { any_n = true; }
+            if cand.is_protein_c_term { any_c = true; }
+            if any_n && any_c { break; }
         }
         (any_n, any_c)
     };
@@ -1364,51 +1363,200 @@ mod feature_tests {
     }
 }
 
-/// Pre-merge dedup pass (R-2.2): collapse PSMs that share the same
-/// (peptide_residue, rounded_score) key into a single entry, aggregating
-/// their `candidate_idxs` into a unified Vec. Mirrors Java's
-/// `DBScanner.java:719-733` `pepSeqMap` dedup.
-///
-/// Called by the per-spectrum loop after the per-candidate scoring loop,
-/// before per-charge GF compute (so SpecE is computed on the deduped set).
-///
-/// Inputs:
-/// - `psms`: drained from a per-charge `TopNQueue` via `drain_into_vec`
-/// - `candidates`: the search's enumerated candidate slice; used to resolve
-///   each PSM's peptide residue sequence for the dedup key
-///
-/// Returns: deduped `Vec<PsmMatch>`. The caller re-pushes these into the
-/// per-charge queue via `queue.push()` for each entry.
+/// Pre-merge dedup pass (R-2.2): collapse PSMs sharing the same Java
+/// `(pepSeq, score)` key before per-charge GF compute.
 pub(crate) fn dedup_pepseq_score(
     psms: Vec<PsmMatch>,
     candidates: &[Candidate],
 ) -> Vec<PsmMatch> {
-    use std::collections::HashMap;
+    use std::collections::btree_map::Entry;
+    use std::collections::BTreeMap;
 
-    // Key: (peptide_residue_bytes, rounded_score_i32)
-    // The residue sequence is the unmodified bare AA string, matching Java's
-    // `m.getPepSeq()` used as the dedup key (DBScanner.java:721).
-    let mut groups: HashMap<(Vec<u8>, i32), PsmMatch> = HashMap::new();
+    let mut pep_key_cache: FxHashMap<u32, Arc<PepDedupKey>> = FxHashMap::default();
+    let mut groups: BTreeMap<DedupMapKey, PsmMatch> = BTreeMap::new();
 
     for psm in psms {
-        let cand = &candidates[psm.primary_candidate_idx() as usize];
-        let pep_residues: Vec<u8> = cand.peptide.residues.iter().map(|aa| aa.residue).collect();
-        let score_rounded = psm.score.round() as i32;
-        let key = (pep_residues, score_rounded);
-
-        groups
-            .entry(key)
-            .and_modify(|existing| {
-                // Aggregate this PSM's indices into the surviving entry.
-                // Avoid duplicates if the same idx somehow appears twice.
-                for &idx in &psm.candidate_idxs {
-                    if !existing.candidate_idxs.contains(&idx) {
-                        existing.candidate_idxs.push(idx);
-                    }
+        let primary = psm.primary_candidate_idx();
+        let pep_key = pep_key_cache
+            .entry(primary)
+            .or_insert_with(|| Arc::new(PepDedupKey::from_peptide(&candidates[primary as usize].peptide)))
+            .clone();
+        let key = DedupMapKey {
+            pep: pep_key,
+            score: psm.rank_score.round() as i32,
+        };
+
+        match groups.entry(key) {
+            Entry::Vacant(slot) => {
+                slot.insert(psm);
+            }
+            Entry::Occupied(mut slot) => {
+                let existing = slot.get_mut();
+                merge_unique_candidate_idxs(&mut existing.candidate_idxs, &psm.candidate_idxs);
+                if psm.rank_score > existing.rank_score {
+                    let merged_idxs = std::mem::take(&mut existing.candidate_idxs);
+                    let mut survivor = psm;
+                    merge_unique_candidate_idxs(&mut survivor.candidate_idxs, &merged_idxs);
+                    *existing = survivor;
                 }
-            })
-            .or_insert(psm);
+            }
+        }
     }
 
-    groups.into_values().collect()
+    let mut out = Vec::with_capacity(groups.len());
+    out.extend(groups.into_values());
+    out
+}
+
+/// Mod-aware dedup key: bare residues plus per-position mod mass (1e-5 Da units).
+/// Matches Java pepSeq semantics without string formatting on the hot path.
+#[derive(Clone, PartialEq, Eq, PartialOrd, Ord)]
+struct PepDedupKey {
+    residues: Vec<u8>,
+    mod_units: Vec<i32>,
+}
+
+impl PepDedupKey {
+    fn from_peptide(peptide: &model::Peptide) -> Self {
+        let len = peptide.residues.len();
+        let mut residues = Vec::with_capacity(len);
+        let mut mod_units = Vec::with_capacity(len);
+        for aa in &peptide.residues {
+            residues.push(aa.residue);
+            mod_units.push(
+                aa.mod_
+                    .as_ref()
+                    .map(|m| (m.mass_delta * 100_000.0).round() as i32)
+                    .unwrap_or(0),
+            );
+        }
+        Self { residues, mod_units }
+    }
+}
+
+#[derive(Clone, PartialEq, Eq)]
+struct DedupMapKey {
+    pep: Arc<PepDedupKey>,
+    score: i32,
+}
+
+impl PartialOrd for DedupMapKey {
+    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl Ord for DedupMapKey {
+    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+        self.pep
+            .residues
+            .cmp(&other.pep.residues)
+            .then_with(|| self.pep.mod_units.cmp(&other.pep.mod_units))
+            .then(self.score.cmp(&other.score))
+    }
+}
+
+fn merge_unique_candidate_idxs(into: &mut Vec<u32>, from: &[u32]) {
+    for &idx in from {
+        if !into.contains(&idx) {
+            into.push(idx);
+        }
+    }
+}
+
+#[cfg(test)]
+mod dedup_tests {
+    use super::*;
+    use std::sync::Arc;
+    use model::amino_acid::AminoAcid;
+    use model::modification::{ModLocation, Modification};
+    use model::peptide::Peptide;
+    use model::ResidueSpec;
+    use crate::psm::PsmMatch;
+
+    fn seq_peptide(bytes: &[u8]) -> Peptide {
+        let residues: Vec<AminoAcid> = bytes
+            .iter()
+            .filter_map(|&b| AminoAcid::standard(b))
+            .collect();
+        Peptide::new(residues, b'R', b'K')
+    }
+
+    fn cand_with_peptide(peptide: Peptide) -> Candidate {
+        Candidate {
+            peptide,
+            protein_index: 0,
+            start_offset_in_protein: 0,
+            is_decoy: false,
+            is_protein_n_term: false,
+            is_protein_c_term: false,
+        }
+    }
+
+    fn psm(primary: u32, rank: f32, pin: f32) -> PsmMatch {
+        PsmMatch {
+            spectrum_idx: 0,
+            candidate_idxs: vec![primary],
+            charge_used: 2,
+            mass_error_ppm: 0.0,
+            score: pin,
+            rank_score: rank,
+            edge_score: (rank - pin) as i32,
+            spec_e_value: 1.0,
+            de_novo_score: 0,
+            activation_method: None,
+            e_value: 1.0,
+            features: Default::default(),
+            isotope_offset: 0,
+        }
+    }
+
+    #[test]
+    fn dedup_uses_rank_score_not_pin_score() {
+        let pep = seq_peptide(b"PEPTK");
+        let cands = vec![cand_with_peptide(pep.clone())];
+        let psms = vec![
+            psm(0, 100.4, 99.6),
+            psm(0, 120.0, 99.6),
+        ];
+        let out = dedup_pepseq_score(psms, &cands);
+        assert_eq!(out.len(), 2, "different rank_score keys must not merge");
+    }
+
+    #[test]
+    fn dedup_distinguishes_mod_state() {
+        let mut ox = seq_peptide(b"PEPMK");
+        ox.residues[3].mod_ = Some(Arc::new(Modification {
+            name: "Ox".into(),
+            mass_delta: 15.99491,
+            residue: ResidueSpec::Specific(b'M'),
+            location: ModLocation::Anywhere,
+            fixed: false,
+            accession: None,
+        }));
+        let cands = vec![
+            cand_with_peptide(seq_peptide(b"PEPMK")),
+            cand_with_peptide(ox),
+        ];
+        let psms = vec![
+            psm(0, 50.0, 50.0),
+            psm(1, 50.0, 50.0),
+        ];
+        let out = dedup_pepseq_score(psms, &cands);
+        assert_eq!(out.len(), 2, "mod-aware pepSeq keys must differ");
+    }
+
+    #[test]
+    fn dedup_keeps_highest_rank_score_survivor() {
+        let pep = seq_peptide(b"PEPTK");
+        let cands = vec![cand_with_peptide(pep)];
+        // Same rounded score bucket (60) but different float rank_score — merge to best.
+        let psms = vec![
+            psm(0, 59.6, 50.0),
+            psm(0, 60.4, 50.0),
+        ];
+        let out = dedup_pepseq_score(psms, &cands);
+        assert_eq!(out.len(), 1);
+        assert!((out[0].rank_score - 60.4).abs() < f32::EPSILON);
+    }
 }

From 06dc0d158a050b1af6ffc32cca66544586f28981 Mon Sep 17 00:00:00 2001
From: Yasset Perez-Riverol <ypriverol@gmail.com>
Date: Sun, 24 May 2026 07:36:29 +0100
Subject: [PATCH 3/3] docs: fix stale auto-detect, PIN columns, and broken
 links
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Align README and DOCS with post-B2/B5 behavior, correct PIN column count,
document inverted range validation, and replace removed known-divergences.md
references with DOCS §8d.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 BUG_REVIEW.md                         | 20 +++++++++++++++++++-
 DOCS.md                               | 24 +++++++++++++-----------
 README.md                             |  6 +++---
 benchmark/README.md                   |  5 +++++
 benchmark/ci/README.md                |  4 +++-
 crates/search/src/search_index.rs     |  2 +-
 crates/search/tests/gf_java_parity.rs |  8 ++++----
 7 files changed, 48 insertions(+), 21 deletions(-)

diff --git a/BUG_REVIEW.md b/BUG_REVIEW.md
index 25f5aecf..46a90f2e 100644
--- a/BUG_REVIEW.md
+++ b/BUG_REVIEW.md
@@ -22,7 +22,7 @@ full `cargo test --release --workspace`, and targeted code reading.
 
 | ID | Severity | Location | Issue |
 |---|---|---|---|
-| B9 | **Medium** | `sa_walk.rs` | Does not enforce `max_missed_cleavages`; only used in tests today but would inflate search space if wired to production. |
+| B9 | **Low** | `sa_walk.rs` | Test-only SA walk helper does not enforce `max_missed_cleavages`; production search uses `candidate_gen::enumerate_candidates`, which does. |
 | B10 | **High** | `mzml.rs` `Iterator::next` | First per-spectrum parse error sets `done=true` and aborts the entire file; remaining spectra are silently skipped. |
 | B11 | **Low** | `sa_walk.rs` Met pass | Dedupes Met-cleaved peptides on residue bytes only, collapsing distinct C-terminal contexts. |
 
@@ -52,3 +52,21 @@ cargo test --release --workspace -- \
 
 - PepSeq dedup keys use integer mod units + `Arc` cache per candidate (avoids repeated string formatting).
 - Per-charge `TopNQueue` map uses `FxHashMap<u8, _>` (typically 1–3 charges per spectrum).
+
+## Documentation review (2026-05-24)
+
+Fixes applied on this branch:
+
+| Issue | Location | Fix |
+|---|---|---|
+| PIN column count said "28" | `README.md` | Corrected to 36 (default charge 2–3) + EdgeScore note |
+| Auto-detect described "first spectrum" only | `README.md` | First 64 MS2 histogram; `--instrument` does not gate peek |
+| Auto-detect required `--instrument low-res` | `DOCS.md` §4 | Matches code: only `--fragmentation auto` + mzML |
+| TSV `IsotopeError` documented as always 0 | `DOCS.md` §3b | Updated after B5 fix |
+| Broken `known-divergences.md` links | `README.md`, `DOCS.md` §8d | Legacy file removed in iter39; point to §8d / tests |
+| Inverted charge/isotope ranges undocumented | `DOCS.md` §1 | Startup validation documented |
+
+**Still stale (not fixed here):**
+
+- `benchmark/ci/README.md` — references Java Maven workflow; no Rust benchmark workflow in `.github/workflows/` yet.
+- `.claude/CLAUDE.md` — Java-tree context; accurate on `java-legacy` branch only.
diff --git a/DOCS.md b/DOCS.md
index 63de6a69..a531ec56 100644
--- a/DOCS.md
+++ b/DOCS.md
@@ -37,15 +37,15 @@ All flags use kebab-case long options (`--flag-name`). Several flags also accept
 | Flag | Type | Default | Description | Legacy form |
 |---|---|---|---|---|
 | `--precursor-tol-ppm` | f64 | `20.0` | Symmetric precursor mass tolerance in parts per million. | Java `-t 20ppm` |
-| `--charge-min` | u8 | `2` | Minimum precursor charge to try when the spectrum record does not specify charge. | *(no direct Java flag; set via param file in Java)* |
-| `--charge-max` | u8 | `3` | Maximum precursor charge to try when charge is missing from the spectrum. | *(same)* |
+| `--charge-min` | u8 | `2` | Minimum precursor charge to try when the spectrum record does not specify charge. Must be ≤ `--charge-max` (inverted ranges are rejected at startup). | *(no direct Java flag; set via param file in Java)* |
+| `--charge-max` | u8 | `3` | Maximum precursor charge to try when charge is missing from the spectrum. Must be ≥ `--charge-min`. | *(same)* |
 | `--enzyme-specificity` | enum | `fully` | Enzymatic cleavage enforcement at peptide termini (Number of Tolerable Termini). `fully`: both termini must be cleavage sites (Java `-ntt 2`). `semi`: at least one terminus (Java `-ntt 1`). `non-specific`: neither required (Java `-ntt 0`). | `--ntt` alias; numeric `0`/`1`/`2` |
 | `--max-missed-cleavages` | u32 | `1` | Maximum missed enzymatic cleavages allowed per candidate peptide. | Java `-maxMissedCleavages 1` |
 | `--min-length` | u32 | `6` | Minimum peptide length in residues (excluding flanking context). | Java `-minLength 6` |
 | `--max-length` | u32 | `40` | Maximum peptide length in residues. | Java `-maxLength 40` |
 | `--top-n` | u32 | `10` | Maximum PSMs retained per spectrum (ranked by SpecEValue). | Java `-n 10` |
-| `--isotope-error-min` | i8 | `-1` | Minimum isotope error offset to evaluate during precursor matching. | Java `-ti -1,2` (first value) |
-| `--isotope-error-max` | i8 | `2` | Maximum isotope error offset to evaluate. | Java `-ti -1,2` (second value) |
+| `--isotope-error-min` | i8 | `-1` | Minimum isotope error offset to evaluate during precursor matching. Must be ≤ `--isotope-error-max`. | Java `-ti -1,2` (first value) |
+| `--isotope-error-max` | i8 | `2` | Maximum isotope error offset to evaluate. Must be ≥ `--isotope-error-min`. | Java `-ti -1,2` (second value) |
 | `--min-peaks` | u32 | `10` | Minimum number of MS2 peaks required to score a spectrum; spectra below this threshold are skipped. | Java `-minNumPeaks 10` |
 
 ### Modifications
@@ -165,7 +165,7 @@ msgf-rust writes Percolator `.pin` (always) and optionally `.tsv`. Implementatio
 
 ### 3a. PIN columns
 
-Tab-separated, one header row, one row per PSM. Rows are sorted best-first within each spectrum (lowest SpecEValue first). Charge one-hot columns are emitted for every integer charge in `[--charge-min, --charge-max]`; the table below uses the default range 2–3 (`charge2`, `charge3`).
+Tab-separated, one header row, one row per PSM. Rows are sorted best-first within each spectrum (lowest SpecEValue first). With default `--charge-min 2 --charge-max 3`, the header has **36 columns**: 35 Java-parity fields plus Rust-only `EdgeScore` (before `Peptide`). Additional charge states add one `chargeN` column each.
 
 | Column | Type | Description |
 |---|---|---|
@@ -220,7 +220,7 @@ Tab-separated human-readable report. The `Title` column appears **only for MGF**
 | `Title` | string | MGF `TITLE=` field. |
 | `FragMethod` | string | Activation method name (`HCD`, `CID`, …) or `UNKNOWN`. |
 | `Precursor` | float | Precursor m/z (4 decimal places). |
-| `IsotopeError` | int | Always `0` in current release (winning offset not threaded to TSV). |
+| `IsotopeError` | int | Winning isotope offset (same value as PIN `isotope_error`). |
 | `PrecursorError(ppm)` | float | Mass error in ppm when tolerance is ppm mode; column named `PrecursorError(Da)` in Da mode. |
 | `Charge` | int | Assigned precursor charge. |
 | `Peptide` | string | Annotated peptide sequence with modifications. |
@@ -242,14 +242,16 @@ Use **PIN** when the goal is FDR calibration or rescoring: Percolator, MS²Resco
 
 ## 4. Auto-detection
 
-For mzML inputs, when `--fragmentation auto`, `--instrument low-res`, and `--protocol auto` (the CLI defaults), msgf-rust peeks the input file before loading the full dataset:
+For **mzML** inputs when `--fragmentation auto` (the default), msgf-rust peeks the input file before loading the full dataset:
 
 1. **Activation method** — histogram of `<activation>` cvParams across the first 64 MS2 spectra; dominant method wins. Mixed methods trigger an stderr warning but the dominant method is still used file-wide.
 2. **Instrument class** — scans `<instrumentConfiguration>` / analyzer cvParams via `input::detect_instrument_type`; dominant analyzer among MS2 spectra wins. `None` → `low-res` (Java `LOW_RESOLUTION_LTQ` default).
 
-MGF files carry no activation or instrument metadata → auto-detect returns `None` → bundled default `HCD_QExactive_Tryp.param` unless explicit `--fragmentation` / `--instrument` flags override.
+The CLI `--instrument` flag does **not** gate this path — only `--fragmentation auto` + mzML extension does. When peek succeeds, instrument is taken from the file; `--protocol` from the CLI is still used to pick protocol-suffixed `.param` files (e.g. `_TMT`).
 
-Explicit `--fragmentation` (non-auto) or non-default `--instrument` disables the activation peek and uses flag-based resolution directly (§1).
+MGF files carry no activation or instrument metadata → auto-detect returns `None` → bundled default `HCD_QExactive_Tryp.param` unless explicit `--fragmentation` / `--instrument` flags override via `resolve_bundled_param`.
+
+Non-auto `--fragmentation` (e.g. `HCD`, `3`) disables the activation peek and uses flag-based resolution directly (§1), including `--instrument` and `--protocol` from the CLI.
 
 ### Activation CV mapping (mzML `<activation>` cvParam accession → method)
 
@@ -476,9 +478,9 @@ On PSMs where Java and Rust agree on scan + top-1 peptide (the "agreement bucket
 
 | Feature | Divergence | Status |
 |---|---|---|
-| `lnEValue` | ~4 orders of magnitude mean shift (Rust more confident) | Deferred — `num_distinct` semantics differ (`known-divergences.md` #2) |
+| `lnEValue` | ~4 orders of magnitude mean shift (Rust more confident) | Deferred — `num_distinct` semantics differ (see item #2 below) |
 | `MeanRelErrorTop7` / `MeanErrorTop7` / `StdevRelErrorTop7` | >1% relative difference on ~99% of agreement-bucket PSMs | Deferred — error-stat normalization differs |
-| BSA charge-3 SpecEValue (BSA.fasta + test.mgf fixture) | 1–4 OOM gap depending on deconvolution iteration | Known — deconvolution implementation divergence (`known-divergences.md` #3); kept as dev-branch smoke gate |
+| BSA charge-3 SpecEValue (BSA.fasta + test.mgf fixture) | 1–4 OOM gap depending on deconvolution iteration | Known — deconvolution implementation divergence; kept as dev-branch smoke gate (`gf_java_parity` tests) |
 
 Percolator's learned weights absorb these distribution shifts; rescored FDR results remain competitive or better than Java.
 
diff --git a/README.md b/README.md
index f57bc6da..f3a1b553 100644
--- a/README.md
+++ b/README.md
@@ -66,7 +66,7 @@ msgf-rust \
 
 This runs a tryptic search at 20 ppm precursor tolerance with the bundled HCD_QExactive_Tryp scoring model, writes Percolator-format PSMs to `out.pin`, and prints per-phase timings to stderr. Feed `out.pin` directly into Percolator (Docker or native) to compute q-values.
 
-A row in `out.pin` is one peptide–spectrum match with 28 columns: `SpecId`, `Label`, `ScanNr`, charge one-hot encoding, then features like `RawScore`, `lnSpecEValue`, `DeNovoScore`, ion-current ratios, peptide-length stats, etc. Full column reference: `DOCS.md` §3a.
+A row in `out.pin` is one peptide–spectrum match. With the default charge range (2–3), each row has **36 tab-separated columns**: 35 Java-parity Percolator features plus Rust-only `EdgeScore` (inserted before `Peptide`). Charge one-hot columns scale with `[--charge-min, --charge-max]`. Full column reference: `DOCS.md` §3a.
 
 ## Common workflows
 
@@ -131,11 +131,11 @@ Run `msgf-rust --help` for the auto-generated help with full descriptions.
 
 ## Auto-detection
 
-For mzML inputs, msgf-rust reads the activation block of the first MS2 spectrum and selects a bundled `.param` file accordingly. The detection covers HCD/CID/ETD/UVPD activation and LowRes/HighRes/TOF/QExactive instrument classes (via mzML CV params). The bundled model is then resolved from `(fragmentation, instrument, protocol)`. MGF files have no activation metadata, so they go through the CLI defaults (which can be overridden with explicit `--fragmentation` / `--instrument` flags). Full resolution table: `DOCS.md` §4.
+For mzML inputs with `--fragmentation auto` (the default), msgf-rust peeks the first 64 MS2 spectra, histograms activation methods and analyzer types, and selects a bundled `.param` file from the dominant values. The `--instrument` CLI flag is **not** required for this path — instrument class is read from the mzML when possible. `--protocol` from the CLI is still applied when resolving the bundled model. MGF files have no activation metadata, so they use flag-based resolution (defaulting to `HCD_QExactive_Tryp.param`). Full resolution table: `DOCS.md` §4.
 
 ## Parity vs Java MS-GF+
 
-PIN output columns are bit-exact with Java MS-GF+ on the agreement bucket (same scan + same top-1 peptide) for most features. Three residual divergences exist as deferred research: `lnEValue` (num_distinct semantics), `MeanRelErrorTop7` (error-stat normalization), and the BSA charge-3 SEV gap from the deconvolution-implementation difference (`known-divergences.md` item #3, kept on the development branch). None gate cutover; aggregate 1% FDR PSM counts beat Java on all three benchmark datasets. Full detail: `DOCS.md` §8d.
+PIN output columns are bit-exact with Java MS-GF+ on the agreement bucket (same scan + same top-1 peptide) for most features. Three residual divergences exist as deferred research: `lnEValue` (num_distinct semantics), `MeanRelErrorTop7` (error-stat normalization), and the BSA charge-3 SEV gap from deconvolution-implementation differences. None gate cutover; aggregate 1% FDR PSM counts beat Java on all three benchmark datasets. Full detail: `DOCS.md` §8d.
 
 ## Citation
 
diff --git a/benchmark/README.md b/benchmark/README.md
index 099fe10d..604387cb 100644
--- a/benchmark/README.md
+++ b/benchmark/README.md
@@ -4,6 +4,11 @@ Only the **CI benchmark scaffold** is committed under this directory; heavy
 local-only harnesses and artifacts (`data/`, `results/`, prebuilt JARs, etc.)
 are intentionally gitignored and not distributed with the fork.
 
+The PXD001819 CI scripts under `ci/` were written for the Java MS-GF+ build
+(`mvn`, mzIdentML output). The Rust binary uses PIN/TSV and a separate CI
+workflow (`.github/workflows/ci.yml`); Rust benchmark automation is not yet
+wired to this scaffold.
+
 ## Datasets
 
 | Dataset | PXD | Instrument | Type | Spectra Source | FASTA / SDRF |
diff --git a/benchmark/ci/README.md b/benchmark/ci/README.md
index aded6d57..be422145 100644
--- a/benchmark/ci/README.md
+++ b/benchmark/ci/README.md
@@ -1,6 +1,8 @@
 # CI benchmark (PXD001819)
 
-- **Workflow:** `.github/workflows/benchmark-pxd001819.yml` (`workflow_dispatch`)
+> **Note:** This scaffold targets the Java MS-GF+ tree (`mvn`, mzIdentML metrics). The Rust port (`msgf-rust`) uses `.github/workflows/ci.yml` for tests but does not yet wire this benchmark harness. See [`benchmark/README.md`](../README.md) for scope.
+
+- **Workflow:** `.github/workflows/benchmark-pxd001819.yml` (`workflow_dispatch`) — Java branch only
 - **Run locally:** `mvn -B package -DskipTests && bash benchmark/ci/PXD001819/run_ci.sh`
 - **Compare:** `python3 benchmark/ci/PXD001819/compare_metrics.py benchmark/results/PXD001819/ci/ci_metrics.txt benchmark/ci/PXD001819/baseline.tsv`
 - **Self-test:** `python3 -m unittest benchmark.ci.PXD001819.test_compare_metrics`
diff --git a/crates/search/src/search_index.rs b/crates/search/src/search_index.rs
index 0a7b7777..cbc9cde3 100644
--- a/crates/search/src/search_index.rs
+++ b/crates/search/src/search_index.rs
@@ -63,7 +63,7 @@ impl SearchIndex {
     ///
     /// Counts distinct prefixes of length `l` across the entire suffix array
     /// (target + decoy combined, modulo the still-open mod-context divergence
-    /// tracked in `docs/parity-analysis/known-divergences.md`).
+    /// tracked in `DOCS.md` §8d).
     ///
     /// Distinct identity is the residue byte sequence with no mods and no
     /// flanking residues. Two candidates with identical residues but different
diff --git a/crates/search/tests/gf_java_parity.rs b/crates/search/tests/gf_java_parity.rs
index 8eb3c93d..0ff65363 100644
--- a/crates/search/tests/gf_java_parity.rs
+++ b/crates/search/tests/gf_java_parity.rs
@@ -11,7 +11,7 @@
 //! Java SP values are now captured directly via
 //! `-Dmsgfplus.gftrace=true` against `target/MSGFPlus.jar` (commit e918376)
 //! so the test compares SP-vs-SP. The remaining `num_distinct`-level
-//! discrepancy is tracked separately as known-divergences item #2
+//! discrepancy is tracked in `DOCS.md` §8d (lnEValue / num_distinct).
 //! (e_value proxy follow-up).
 //!
 //! Reference fixture (for context, not used for the assertion):
@@ -91,7 +91,7 @@ const FIVE_TRACED_PSMS: &[(i32, &str, u8, f64)] = &[
 ///     Best case; Rust and Java agree to within ~18%.
 ///
 /// The remaining SP-level drift is small and is tracked under the
-/// known-divergences list (RawScore scale + Float.MIN_VALUE underflow
+/// known-divergences list (see `DOCS.md` §8d: RawScore scale + Float.MIN_VALUE underflow
 /// guard). The previously suspected scan-3353-specific score-distribution
 /// width bug appears to have been an artifact of the SEV-vs-SP comparison.
 ///
@@ -100,7 +100,7 @@ const FIVE_TRACED_PSMS: &[(i32, &str, u8, f64)] = &[
 /// `NewScoredSpectrum.java:83-88`). The two charge-3 PSMs in this fixture
 /// (scan 3416 and 3353) moved from 0.24/0.13 OOM → 1.03/1.20 OOM. The shift
 /// EXPOSES an underlying deconvolution-implementation divergence between
-/// Rust and Java (`known-divergences.md` item #3, still open). The fix is
+/// Rust and Java (`DOCS.md` §8d — BSA charge-3 deconvolution, still open). The fix is
 /// algorithmically correct — Rust now matches Java's prob_peak ordering —
 /// but the deconvoluted peak list differs from Java's implementation,
 /// shifting ion_existence_score. Charge-2 PSMs (3 of 5 in this fixture) are
@@ -112,7 +112,7 @@ const FIVE_TRACED_PSMS: &[(i32, &str, u8, f64)] = &[
 /// validated on Astral (Rust now BEATS Java by +287 PSMs at 1% FDR;
 /// see project memory `iter32-37-shipped`). It also widens the BSA
 /// charge-3 SEV gap from 1.03/1.20 OOM → 2.56-3.58 OOM because the
-/// deconvolution-implementation divergence (`known-divergences.md` #3)
+/// deconvolution-implementation divergence (`DOCS.md` §8d)
 /// now feeds the corrected score path. Bumping tolerance to 4.0 OOM
 /// keeps this test as a coarse smoke gate while #3 remains open; a
 /// regression beyond 4.0 OOM would still signal a new bug.