From c737998fc43b07b11a6899f871e7864ddbd76976 Mon Sep 17 00:00:00 2001
From: Richard Zowalla
Date: Tue, 14 Oct 2025 11:19:40 +0200
Subject: [PATCH 1/2] OPENNLP-1781 - SentenceDetectorME throws StringIndexOutOfBoundsException when sentence starts with an abbreviation

---
 .../tools/sentdetect/SentenceDetectorME.java |  3 ++-
 .../SentenceDetectorMEGermanTest.java        | 18 ++++++++++++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
index ddcc3388c..9b113e120 100644
--- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
@@ -345,7 +345,8 @@ protected boolean isAcceptableBreak(CharSequence s, int fromIndex, int candidate
       if (tokenPosition == -1) {
         continue; // skip fast
       }
-      final char prevChar = s.charAt(tokenPosition - 1);
+
+      final char prevChar = s.charAt(tokenPosition == 0 ? tokenPosition : tokenPosition - 1);
       int tokenLength = token.length();
       if (tokenPosition + tokenLength < candidateIndex || tokenPosition > candidateIndex ||
           /*
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
index d95a1eeca..cf8cb22f5 100644
--- a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
@@ -151,6 +151,24 @@ void testSentDetectWithUseTokenEndFalse() {
         () -> assertEquals(2, probs.length));
   }
 
+  /*
+   * A reproducer and test for OPENNLP-1781.
+   */
+  @Test
+  void testSentDetectWithAbbreviationsAtSentenceStart() {
+    prepareResources(true);
+
+    final String sent1 = "S. Träume sind eine Verbindung von Gedanken.";
+
+    final String[] sents = sentenceDetector.sentDetect(sent1);
+    final double[] probs = sentenceDetector.probs();
+
+    assertAll(
+        () -> assertEquals(1, sents.length),
+        () -> assertEquals(sent1, sents[0]),
+        () -> assertEquals(1, probs.length));
+  }
+
   /*
    * A reproducer and test for OPENNLP-1767.
    * It checks that sentence detection with common abbreviations works correctly,

From fa96f2cef74bda356e8fb2b0f154607ef3a3882b Mon Sep 17 00:00:00 2001
From: Richard Zowalla
Date: Tue, 14 Oct 2025 11:54:23 +0200
Subject: [PATCH 2/2] Adds additional test case for combination with OPENNLP-1767

---
 .../opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java | 1 +
 1 file changed, 1 insertion(+)

diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
index cf8cb22f5..5417fe170 100644
--- a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
@@ -181,6 +181,7 @@ void testSentDetectWithAbbreviationsAtSentenceStart() {
       "Der Auto stand schief. Wer hat es dort geparkt?",
       "Es lag am DBMS. Die Performance muss verbessert werden.",
       "Siehe Buch S. 17f. Dort ist es zu finden.",
+      "S. Buch S. 17f. Dort ist es zu finden.", // OPENNLP-1781
       "Sie trank einen Mocca. Er schmeckte ihr!",
       "Der Anker hängt zu Beginn des Bugs. Es ist vertaut.",
       "Das Verfahren testet auf HIV. Es ist präzise."