Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 49 additions & 0 deletions BUG_REVIEW.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# msgf-rust bug review (2026-05-23)

Branch: `review/bug-hunt` (from `master` @ 18360a3d)

Systematic review of the Rust MS-GF+ port: static analysis of critical paths,
full `cargo test --release --workspace`, and targeted code reading.

## Fixed in this branch

| ID | Severity | Location | Issue | Fix |
|---|---|---|---|---|
| B1 | **Critical** | `msgf-rust.rs` `send_chunks` | Bench cap (`--max-spectra N`) truncated the final partial chunk to zero when `total == N` (e.g. N=100 with chunk size 5000 → empty output). | Removed erroneous tail `truncate` block; loop already stops at cap. |
| B2 | **High** | `msgf-rust.rs` param routing | Activation auto-detect was gated on `instrument == low-res`, so `--fragmentation auto --instrument QExactive` on mzML skipped peek and resolved to CID params for HCD data. | Gate auto-route on `fragmentation == auto` + mzML extension only. |
| B3 | **High** | `msgf-rust.rs` TSV write | `write_tsv(..., is_mgf=true)` always emitted MGF layout (extra `Title` column) even for mzML inputs. | Pass `!is_mzml`. |
| B4 | **High** | `match_engine.rs` GF | SpecE GF graph used `start_offset == 0` for protein N-term instead of `cand.is_protein_n_term`, breaking Met-cleaved N-termini at offset 1. | Use `cand.is_protein_n_term` / `is_protein_c_term`. |
| B5 | **Medium** | `tsv.rs` | `IsotopeError` column hardcoded to 0 while PIN writes `psm.isotope_offset`. | Thread isotope offset from PSM. |
| B6 | **Medium** | `msgf-rust.rs` CLI | Inverted `--charge-min/--charge-max` or isotope ranges produced empty ranges with no error. | Validate at startup and return clear error. |

## Open — not fixed (documented for follow-up)

| ID | Severity | Location | Issue |
|---|---|---|---|
| B7 | **High** | `match_engine.rs` `dedup_pepseq_score` | Dedup key is `(bare sequence, psm.score.round())`; ignores modifications and uses pin score instead of Java-aligned `rank_score`. Can over-merge TMT/mod variants. |
| B8 | **Medium** | `match_engine.rs` | `dedup_pepseq_score` keeps first PSM on collision (HashMap iteration order) → nondeterministic primary candidate for shared peptides. |
| B9 | **Medium** | `sa_walk.rs` | Does not enforce `max_missed_cleavages`; only used in tests today but would inflate search space if wired to production. |
| B10 | **High** | `mzml.rs` `Iterator::next` | First per-spectrum parse error sets `done=true` and aborts the entire file; remaining spectra are silently skipped. |
| B11 | **Low** | `sa_walk.rs` Met pass | Dedupes Met-cleaved peptides on residue bytes only, collapsing distinct C-terminal contexts. |

## Known test failures (pre-existing, CI-skipped)

These fail on `master` without the 7 CI skip flags; tracked as parity/min_peaks regressions:

- `match_engine_smoke::known_peptide_appears_in_top_n`
- `match_engine_smoke::charge_missing_spectrum_uses_per_charge_scored_spec`
- `match_engine_smoke::spectrum_without_charge_tries_charge_range`
- Maven fixture loads, thread-determinism test (see `.github/workflows/ci.yml`)

## Verification

```bash
cargo test --release --workspace -- \
--skip charge_missing_spectrum_uses_per_charge_scored_spec \
--skip spectrum_without_charge_tries_charge_range \
--skip known_peptide_appears_in_top_n \
--skip read_bsa_canno_text_format \
--skip read_tryp_pig_bov_revcat_csarr_cnlcp \
--skip tryp_pig_bov_revcat_full_set_loads \
--skip match_spectra_output_invariant_across_thread_counts
```
26 changes: 19 additions & 7 deletions crates/msgf-rust/src/bin/msgf-rust.rs
Original file line number Diff line number Diff line change
Expand Up @@ -302,10 +302,6 @@ where
}
}
}
if bench_cap < usize::MAX && total + chunk.len() > bench_cap {
let keep = bench_cap.saturating_sub(total);
chunk.truncate(keep);
}
if !chunk.is_empty() {
let _ = tx.send(chunk);
}
Expand Down Expand Up @@ -396,8 +392,12 @@ fn run(cli: Cli) -> Result<(), Box<dyn std::error::Error>> {
let param_path = match cli.param_file.clone() {
Some(p) => p,
None => {
let auto_route_eligible = cli.fragmentation == Fragmentation::Auto
&& cli.instrument == Instrument::LowRes;
let ext_is_mzml = cli.spectrum
.extension()
.and_then(|e| e.to_str())
.map(|s| s.eq_ignore_ascii_case("mzml"))
.unwrap_or(false);
let auto_route_eligible = cli.fragmentation == Fragmentation::Auto && ext_is_mzml;
if auto_route_eligible {
match detect_dominant_activation(&cli.spectrum) {
Some(method) => {
Expand Down Expand Up @@ -442,7 +442,19 @@ fn run(cli: Cli) -> Result<(), Box<dyn std::error::Error>> {
params.precursor_tolerance =
PrecursorTolerance::symmetric(Tolerance::Ppm(cli.precursor_tol_ppm));
params.charge_range = cli.charge_min..=cli.charge_max;
if cli.charge_min > cli.charge_max {
return Err(format!(
"invalid charge range: --charge-min {} > --charge-max {}",
cli.charge_min, cli.charge_max
).into());
}
params.isotope_error_range = cli.isotope_error_min..=cli.isotope_error_max;
if cli.isotope_error_min > cli.isotope_error_max {
return Err(format!(
"invalid isotope error range: --isotope-error-min {} > --isotope-error-max {}",
cli.isotope_error_min, cli.isotope_error_max
).into());
}
params.top_n_psms_per_spectrum = cli.top_n;
params.num_tolerable_termini = match cli.enzyme_specificity {
EnzymeSpecificity::Fully => 2,
Expand Down Expand Up @@ -665,7 +677,7 @@ fn run(cli: Cli) -> Result<(), Box<dyn std::error::Error>> {
.file_name()
.map(|n| n.to_string_lossy().into_owned())
.unwrap_or_else(|| cli.spectrum.display().to_string());
output::write_tsv(tsv_path, &spectra, &queues, &prepared.candidates, &params, &idx, &spec_file_name, true)?;
output::write_tsv(tsv_path, &spectra, &queues, &prepared.candidates, &params, &idx, &spec_file_name, !is_mzml)?;
eprintln!("Wrote TSV: {}", tsv_path.display());
}

Expand Down
47 changes: 47 additions & 0 deletions crates/msgf-rust/tests/cli_smoke.rs
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,53 @@ fn cli_runs_end_to_end_on_tiny_mzml() {
);
}

#[test]
fn bench_mode_max_spectra_produces_nonempty_pin() {
// Regression for send_chunks bench-cap bug: --max-spectra 100 must not
// drop the entire final partial chunk (which used to truncate to zero).
let dir = tempfile::tempdir().expect("tempdir");
let pin_path = dir.path().join("bench.pin");

let status = base_cmd(
"test-fixtures/test.mgf",
"test-fixtures/BSA.fasta",
&pin_path,
)
.arg("--max-spectra")
.arg("100")
.status()
.expect("run msgf-rust bench mode");

assert!(status.success(), "bench mode should exit 0, got: {status}");
assert!(pin_path.exists(), "PIN should be written in bench mode");

let content = std::fs::read_to_string(&pin_path).unwrap();
assert!(
content.lines().count() > 1,
"bench mode with --max-spectra 100 should produce header + data rows"
);
}

#[test]
fn cli_rejects_inverted_charge_range() {
let dir = tempfile::tempdir().expect("tempdir");
let pin_path = dir.path().join("out.pin");

let status = base_cmd(
"test-fixtures/test.mgf",
"test-fixtures/BSA.fasta",
&pin_path,
)
.arg("--charge-min")
.arg("4")
.arg("--charge-max")
.arg("2")
.status()
.expect("run msgf-rust with inverted charge range");

assert!(!status.success(), "inverted charge range must fail");
}

/// Regression guard: legacy Java numeric flag values and the new
/// Rust-idiomatic named values must resolve to byte-identical PIN output.
/// Quantms scripts use the numeric form; new docs recommend the named form.
Expand Down
4 changes: 2 additions & 2 deletions crates/output/src/tsv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -176,8 +176,8 @@ fn write_psm_row<W: Write>(
// Precursor m/z formatted to 4 decimal places
let precursor = format!("{:.4}", spec.precursor_mz);

// IsotopeError: always 0 (winning isotope offset not threaded here yet)
let isotope_error: i32 = 0;
// IsotopeError: winning isotope offset from the search (matches PIN column).
let isotope_error: i32 = psm.isotope_offset as i32;

// PrecursorError: mass_error_ppm stored on psm; convert to Da if needed
let precursor_error = if ppm_mode {
Expand Down
8 changes: 3 additions & 5 deletions crates/search/src/match_engine.rs
Original file line number Diff line number Diff line change
Expand Up @@ -680,11 +680,9 @@ fn compute_spec_e_values_for_spectrum(
let mut any_c = false;
for psm in queue.iter_psms() {
let cand = &candidates[psm.primary_candidate_idx() as usize];
if let Some(prot) = search_index.protein_at(cand.protein_index) {
let start = cand.start_offset_in_protein;
let pep_len = cand.peptide.length();
if start == 0 { any_n = true; }
if start + pep_len >= prot.sequence.len() { any_c = true; }
if search_index.protein_at(cand.protein_index).is_some() {
if cand.is_protein_n_term { any_n = true; }
if cand.is_protein_c_term { any_c = true; }
if any_n && any_c { break; }
}
}
Expand Down
Loading