diff --git a/src/main/java/edu/ucsd/msjava/mzml/StaxMzMLParser.java b/src/main/java/edu/ucsd/msjava/mzml/StaxMzMLParser.java
index e1c9d963..3e6b12b4 100644
--- a/src/main/java/edu/ucsd/msjava/mzml/StaxMzMLParser.java
+++ b/src/main/java/edu/ucsd/msjava/mzml/StaxMzMLParser.java
@@ -179,12 +179,66 @@ private synchronized void preloadAllSpectra() throws IOException, XMLStreamExcep
             } finally {
                 reader.close();
             }
+        } catch (XMLStreamException e) {
+            throw annotate(e, "preload");
         }
         allLoaded = true;
         long elapsed = System.currentTimeMillis() - startTime;
         System.out.println("StAX mzML preload: " + cache.size() + " spectra loaded in " + elapsed + " ms");
     }
 
+    /**
+     * Rethrow an {@link XMLStreamException} with a context-rich message. If
+     * the underlying error looks like a BOM or XML-prolog / encoding issue
+     * (the most common cause of "ParseError in XML prolog" on Windows),
+     * suggest the concrete fix.
+     *
+     * @param e the original Stax exception; attached as the cause
+     * @param phase short tag identifying the parse phase ("index", "preload")
+     * @return a new exception carrying the annotated message, the original
+     *         location when one is available, and {@code e} as its cause
+     */
+    private XMLStreamException annotate(XMLStreamException e, String phase) {
+        String msg = e.getMessage() == null ? "" : e.getMessage();
+        StringBuilder sb = new StringBuilder();
+        sb.append("Could not parse mzML file '").append(specFile.getAbsolutePath()).append("' during ").append(phase).append(".");
+        if (looksLikeBomOrPrologIssue(msg)) {
+            sb.append(" This usually means the file has a byte-order mark (BOM) or an encoding mismatch in the XML prolog. Verify that the file starts with `<?xml` with no leading whitespace or BOM (on Linux/macOS: `head -c 3 \"")
+                    .append(specFile.getName()).append("\" | xxd`; a BOM shows as `ef bb bf`). Re-converting the raw file with ThermoRawFileParser or MSConvert usually resolves it. See docs/Troubleshooting.md for details.");
+        }
+        sb.append(" Underlying parser error: ").append(msg);
+        // The location-taking constructors dereference the location to build
+        // their "ParseError at [row,col]" prefix, and getLocation() may be
+        // null, so only use them when a location is actually available.
+        XMLStreamException wrapped = e.getLocation() != null
+                ? new XMLStreamException(sb.toString(), e.getLocation())
+                : new XMLStreamException(sb.toString());
+        // Note: XMLStreamException(msg, location, nested) stores the cause as a
+        // "nested exception" but does NOT invoke Throwable.initCause, so
+        // getCause() returns null. Call initCause() explicitly so standard
+        // Java chaining (printStackTrace, causal frames) works.
+        wrapped.initCause(e);
+        return wrapped;
+    }
+
+    /**
+     * Heuristically decide whether a parser message points at a BOM, XML
+     * prolog or encoding problem, using case-insensitive substring matching.
+     *
+     * @param msg parser error message; may be null
+     * @return true if the message contains one of the known marker phrases
+     */
+    private static boolean looksLikeBomOrPrologIssue(String msg) {
+        if (msg == null) return false;
+        String m = msg.toLowerCase(java.util.Locale.ROOT);
+        return m.contains("prolog")
+                || m.contains("bom")
+                || m.contains("byte order mark")
+                || m.contains("encoding")
+                || m.contains("invalid character")
+                || m.contains("content is not allowed");
+    }
+
     /** Parse and return the full spectrum by its string ID. */
     public Spectrum getSpectrumById(String specId) {
         SpectrumIndex si = indexById.get(specId);
@@ -251,6 +305,8 @@ private void buildIndex() throws IOException, XMLStreamException {
             } finally {
                 reader.close();
             }
+        } catch (XMLStreamException e) {
+            throw annotate(e, "index");
         }
     }
 
diff --git a/src/test/java/msgfplus/TestStaxMzMLParserErrorContext.java b/src/test/java/msgfplus/TestStaxMzMLParserErrorContext.java
new file mode 100644
index 00000000..aaf69123
--- /dev/null
+++ b/src/test/java/msgfplus/TestStaxMzMLParserErrorContext.java
@@ -0,0 +1,80 @@
+package msgfplus;
+
+import edu.ucsd.msjava.mzml.StaxMzMLParser;
+import org.junit.Assert;
+import org.junit.Test;
+
+import javax.xml.stream.XMLStreamException;
+import java.io.File;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+/**
+ * Covers Q8: when the mzML has a byte-order mark (BOM) or a malformed XML
+ * prolog, the constructor's {@link XMLStreamException} is re-thrown with an
+ * actionable message instead of Stax's terse "ParseError in XML prolog".
+ */
+public class TestStaxMzMLParserErrorContext {
+
+    private File writeBytesToTempMzml(byte[] bytes) throws IOException {
+        Path tmp = Files.createTempFile("msgfplus-stax-context-", ".mzML");
+        Files.write(tmp, bytes);
+        tmp.toFile().deleteOnExit();
+        return tmp.toFile();
+    }
+
+    @Test
+    public void bomPrefixedMzmlGivesActionableMessage() throws Exception {
+        // UTF-8 BOM (EF BB BF) followed by a minimal, otherwise well-formed mzML document.
+        byte[] bom = new byte[]{(byte) 0xEF, (byte) 0xBB, (byte) 0xBF};
+        byte[] prolog = "<?xml version=\"1.0\" encoding=\"UTF-8\"?><mzML xmlns=\"http://psi.hupo.org/ms/mzml\"></mzML>".getBytes(StandardCharsets.UTF_8);
+        byte[] content = new byte[bom.length + prolog.length];
+        System.arraycopy(bom, 0, content, 0, bom.length);
+        System.arraycopy(prolog, 0, content, bom.length, prolog.length);
+
+        File mzml = writeBytesToTempMzml(content);
+
+        try {
+            new StaxMzMLParser(mzml);
+            // Note: some Stax implementations tolerate a UTF-8 BOM. If this one
+            // does, the test becomes a no-op — we can't force the parser to
+            // fail, so just return.
+        } catch (XMLStreamException e) {
+            String msg = e.getMessage();
+            Assert.assertNotNull("Wrapped XMLStreamException should carry a message", msg);
+            Assert.assertTrue("Message should include the full file path for context",
+                    msg.contains(mzml.getAbsolutePath()));
+            Assert.assertTrue("Message should mention the BOM / prolog / encoding hint",
+                    msg.contains("byte-order mark") || msg.contains("BOM")
+                            || msg.contains("XML prolog") || msg.contains("encoding"));
+            Assert.assertTrue("Message should point at Troubleshooting.md",
+                    msg.contains("Troubleshooting.md"));
+        }
+    }
+
+    @Test
+    public void garbledPrologAlwaysProducesAnnotatedMessage() throws Exception {
+        // Definitely-malformed XML (just random text, no prolog at all).
+        // Every Stax impl rejects this.
+        byte[] garbage = "this is not xml at all".getBytes(StandardCharsets.UTF_8);
+        File mzml = writeBytesToTempMzml(garbage);
+
+        try {
+            new StaxMzMLParser(mzml);
+            Assert.fail("Parsing random bytes as mzML should not succeed");
+        } catch (XMLStreamException e) {
+            String msg = e.getMessage();
+            Assert.assertNotNull(msg);
+            Assert.assertTrue("Message should include the index phase tag",
+                    msg.contains("during index"));
+            Assert.assertTrue("Message should include the file path",
+                    msg.contains(mzml.getAbsolutePath()));
+            Assert.assertTrue("Original parser error should be preserved in the message",
+                    msg.contains("Underlying parser error"));
+            Assert.assertTrue("Original exception should be the cause",
+                    e.getCause() instanceof XMLStreamException);
+        }
+    }
+}