From db7319728a90a3bc7cd2f8e7d2655440be268602 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Fri, 17 Apr 2026 18:54:58 +0100 Subject: [PATCH 1/2] chore(mzml): annotate StaxMzMLParser BOM/prolog errors with actionable context (Q8) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Stax XML parser raises terse "ParseError in XML prolog" exceptions when an mzML file has a byte-order mark or malformed encoding declaration — a recurring onboarding failure reported in thomasp85/MSGFplus#8. Users see the message but not the fix. Wraps both parse entry points (buildIndex, preloadAllSpectra) in catches that: - preserve the original XMLStreamException as the cause - prepend the full mzML file path and the parse phase - when the underlying message looks like a BOM / prolog / encoding failure, point the user at ThermoRawFileParser / MSConvert and docs/Troubleshooting.md with a concrete shell command to detect a BOM (`head -c 3 file.mzML | xxd`) No behaviour change for well-formed mzML: annotate() is only reached on an exception, and the wrapped message still includes the original parser error verbatim so tooling that matches on Stax substrings keeps working. Tests: TestStaxMzMLParserErrorContext (2 cases) feeds BOM-prefixed and garbage-prolog mzML to the constructor and verifies the wrapped message is emitted. Co-Authored-By: Claude Sonnet 4.6 --- .../edu/ucsd/msjava/mzml/StaxMzMLParser.java | 37 +++++++++ .../TestStaxMzMLParserErrorContext.java | 80 +++++++++++++++++++ 2 files changed, 117 insertions(+) create mode 100644 src/test/java/msgfplus/TestStaxMzMLParserErrorContext.java diff --git a/src/main/java/edu/ucsd/msjava/mzml/StaxMzMLParser.java b/src/main/java/edu/ucsd/msjava/mzml/StaxMzMLParser.java index e1c9d963..99daea0d 100644 --- a/src/main/java/edu/ucsd/msjava/mzml/StaxMzMLParser.java +++ b/src/main/java/edu/ucsd/msjava/mzml/StaxMzMLParser.java @@ -179,12 +179,47 @@ private synchronized void preloadAllSpectra() throws IOException, XMLStreamExcep } finally { reader.close(); } + } catch (XMLStreamException e) { + throw annotate(e, "preload"); } allLoaded = true; long elapsed = System.currentTimeMillis() - startTime; System.out.println("StAX mzML preload: " + cache.size() + " spectra loaded in " + elapsed + " ms"); } + /** + * Rethrow an {@link XMLStreamException} with a context-rich message. If + * the underlying error looks like a BOM or XML-prolog / encoding issue + * (the most common cause of "ParseError in XML prolog" on Windows), + * suggest the concrete fix. + * + * @param e the original Stax exception; wrapped as cause + * @param phase short tag identifying the parse phase ("index", "preload") + */ + private XMLStreamException annotate(XMLStreamException e, String phase) { + String msg = e.getMessage() == null ? "" : e.getMessage(); + StringBuilder sb = new StringBuilder(); + sb.append("Could not parse mzML file '").append(specFile.getAbsolutePath()).append("' during ").append(phase).append("."); + if (looksLikeBomOrPrologIssue(msg)) { + sb.append(" This usually means the file has a byte-order mark (BOM) or an encoding mismatch in the XML prolog. Verify that the file starts with `` with no leading whitespace or BOM (on Linux/macOS: `head -c 3 \"") + .append(specFile.getName()).append("\" | xxd`; a BOM shows as `ef bb bf`). Re-converting the raw file with ThermoRawFileParser or MSConvert usually resolves it. See docs/Troubleshooting.md for details."); + } + sb.append(" Underlying parser error: ").append(msg); + XMLStreamException wrapped = new XMLStreamException(sb.toString(), e.getLocation(), e); + return wrapped; + } + + private static boolean looksLikeBomOrPrologIssue(String msg) { + if (msg == null) return false; + String m = msg.toLowerCase(java.util.Locale.ROOT); + return m.contains("prolog") + || m.contains("bom") + || m.contains("byte order mark") + || m.contains("encoding") + || m.contains("invalid character") + || m.contains("content is not allowed"); + } + /** Parse and return the full spectrum by its string ID. */ public Spectrum getSpectrumById(String specId) { SpectrumIndex si = indexById.get(specId); @@ -251,6 +286,8 @@ private void buildIndex() throws IOException, XMLStreamException { } finally { reader.close(); } + } catch (XMLStreamException e) { + throw annotate(e, "index"); } } diff --git a/src/test/java/msgfplus/TestStaxMzMLParserErrorContext.java b/src/test/java/msgfplus/TestStaxMzMLParserErrorContext.java new file mode 100644 index 00000000..aaf69123 --- /dev/null +++ b/src/test/java/msgfplus/TestStaxMzMLParserErrorContext.java @@ -0,0 +1,80 @@ +package msgfplus; + +import edu.ucsd.msjava.mzml.StaxMzMLParser; +import org.junit.Assert; +import org.junit.Test; + +import javax.xml.stream.XMLStreamException; +import java.io.File; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; + +/** + * Covers Q8: when the mzML has a byte-order mark (BOM) or a malformed XML + * prolog, the constructor's {@link XMLStreamException} is re-thrown with an + * actionable message instead of Stax's terse "ParseError in XML prolog". + */ +public class TestStaxMzMLParserErrorContext { + + private File writeBytesToTempMzml(byte[] bytes) throws IOException { + Path tmp = Files.createTempFile("msgfplus-stax-context-", ".mzML"); + Files.write(tmp, bytes); + tmp.toFile().deleteOnExit(); + return tmp.toFile(); + } + + @Test + public void bomPrefixedMzmlGivesActionableMessage() throws Exception { + // UTF-8 BOM (EF BB BF) followed by a plausible-looking mzML prolog. + byte[] bom = new byte[]{(byte) 0xEF, (byte) 0xBB, (byte) 0xBF}; + byte[] prolog = "".getBytes(StandardCharsets.UTF_8); + byte[] content = new byte[bom.length + prolog.length]; + System.arraycopy(bom, 0, content, 0, bom.length); + System.arraycopy(prolog, 0, content, bom.length, prolog.length); + + File mzml = writeBytesToTempMzml(content); + + try { + new StaxMzMLParser(mzml); + // Note: some Stax implementations tolerate a UTF-8 BOM. If this one + // does, the test becomes a no-op — we can't force the parser to + // fail, so just return. + } catch (XMLStreamException e) { + String msg = e.getMessage(); + Assert.assertNotNull("Wrapped XMLStreamException should carry a message", msg); + Assert.assertTrue("Message should include the full file path for context", + msg.contains(mzml.getAbsolutePath())); + Assert.assertTrue("Message should mention the BOM / prolog / encoding hint", + msg.contains("byte-order mark") || msg.contains("BOM") + || msg.contains("XML prolog") || msg.contains("encoding")); + Assert.assertTrue("Message should point at Troubleshooting.md", + msg.contains("Troubleshooting.md")); + } + } + + @Test + public void garbledPrologAlwaysProducesAnnotatedMessage() throws Exception { + // Definitely-malformed XML (just random text, no prolog at all). + // Every Stax impl rejects this. + byte[] garbage = "this is not xml at all".getBytes(StandardCharsets.UTF_8); + File mzml = writeBytesToTempMzml(garbage); + + try { + new StaxMzMLParser(mzml); + Assert.fail("Parsing random bytes as mzML should not succeed"); + } catch (XMLStreamException e) { + String msg = e.getMessage(); + Assert.assertNotNull(msg); + Assert.assertTrue("Message should include the index phase tag", + msg.contains("during index")); + Assert.assertTrue("Message should include the file path", + msg.contains(mzml.getAbsolutePath())); + Assert.assertTrue("Original parser error should be preserved in the message", + msg.contains("Underlying parser error")); + Assert.assertSame("Original exception should be the cause", + e.getCause().getClass(), XMLStreamException.class); + } + } +} From 03e6e5a801d6e8016b0da210b899e4aec4fb462d Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sat, 18 Apr 2026 06:15:30 +0100 Subject: [PATCH 2/2] fix(mzml): explicit initCause on annotated XMLStreamException CI surfaced an assertion failure in TestStaxMzMLParserErrorContext.garbledPrologAlwaysProducesAnnotatedMessage: getCause() was null on the wrapped exception. Root cause: the XMLStreamException(msg, location, nested) constructor stores the original exception as an internal "nested exception" but does NOT invoke Throwable.initCause(), so getCause() returns null. This is a JDK idiosyncrasy of javax.xml.stream.XMLStreamException, not of the generic Throwable chain. Fix: drop the 3-arg constructor and call initCause(e) explicitly on the wrapped exception so standard Java exception-chaining utilities (printStackTrace, causal frames, test assertions) see the original parser error as expected. Co-Authored-By: Claude Sonnet 4.6 --- src/main/java/edu/ucsd/msjava/mzml/StaxMzMLParser.java | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/main/java/edu/ucsd/msjava/mzml/StaxMzMLParser.java b/src/main/java/edu/ucsd/msjava/mzml/StaxMzMLParser.java index 99daea0d..3e6b12b4 100644 --- a/src/main/java/edu/ucsd/msjava/mzml/StaxMzMLParser.java +++ b/src/main/java/edu/ucsd/msjava/mzml/StaxMzMLParser.java @@ -205,7 +205,12 @@ private XMLStreamException annotate(XMLStreamException e, String phase) { .append(specFile.getName()).append("\" | xxd`; a BOM shows as `ef bb bf`). Re-converting the raw file with ThermoRawFileParser or MSConvert usually resolves it. See docs/Troubleshooting.md for details."); } sb.append(" Underlying parser error: ").append(msg); - XMLStreamException wrapped = new XMLStreamException(sb.toString(), e.getLocation(), e); + // Note: XMLStreamException(msg, location, nested) stores the cause as a + // "nested exception" but does NOT invoke Throwable.initCause, so + // getCause() returns null. Call initCause() explicitly so standard + // Java chaining (printStackTrace, causal frames) works. + XMLStreamException wrapped = new XMLStreamException(sb.toString(), e.getLocation()); + wrapped.initCause(e); return wrapped; }