updated to latest sentence extraction code

updated code from https://github.com/gregko/TtsPlus-FBReader
benetech · Sep 5, 2014 · 8b72d39 · 8b72d39
1 parent 2c5311b
commit 8b72d39
Showing 1 changed file with 71 additions and 19 deletions.
diff --git a/src/com/hyperionics/fbreader/plugin/tts_plus/TtsSentenceExtractor.java b/src/com/hyperionics/fbreader/plugin/tts_plus/TtsSentenceExtractor.java
@@ -37,7 +37,7 @@ public SentenceIndex(String ss, int ii) {
 
     public static SentenceIndex[] extract(String paragraph, Locale loc) {
         paragraph = paragraph.replace(". . .", "...");
-        paragraph = paragraphReplaceEngAbbreviations(paragraph, loc);
+        paragraph = paragraphReplaceAbbreviations(paragraph, loc);
         final Pattern p = Pattern.compile("[\\.\\!\\?]\\s+", Pattern.MULTILINE);
         String[] sentences = p.split(paragraph);
         int len = 0;
@@ -62,32 +62,62 @@ public static SentenceIndex[] build(List<String> wl, ArrayList<Integer> il, Loca
             return new SentenceIndex[0];
         }
 
+        int breakSentences = 500; // other engines have troubles over 5000 characters...
+        boolean wordsOnly = false;
+
         ArrayList<String> ss = new ArrayList<String>();
         ArrayList<Integer> inds = new ArrayList<Integer>();
         String currSent = "";
         int i, indToAdd = 0;
-
-        for (i = 0; i < wl.size(); i++) {
+        int sz = wl.size();
+        if (il.size() < sz)
+            sz = il.size();
+        for (i = 0; i < sz; i++) {
             String w = wl.get(i);
+            if (w.length() == 0)
+                continue;
             int len = currSent.length();
             if (len == 0)
                 indToAdd = il.get(i);
             if (w.length() == 2 && w.endsWith(".") && Character.isUpperCase(w.charAt(0))) {
                 w = w.substring(0, 1) + " ";
             } else {
-                w = replaceEngAbbreviations(w, loc);
+                w = w.replace('\u00A0', ' '); // dec 160, no-break space
+                w = w.replace("\u200B", " ");  // dec. 8203, 'zero width space' (do not replace with empty, or we may get w empty and crash)
+                w = w.replace('\u2019', '\''); // RIGHT SINGLE QUOTATION MARK (U+2019), mis-pronounced by Google TTS?
+                if (w.charAt(0) == '\u2026')  // dec 8230 ellipses ... remove at start
+                    w = " " + w.substring(1);
+                if (i < wl.size()-2 && wl.get(i+1).equals(".") && !wl.get(i+2).equals(".")) {
+                    w += ".";
+                    wl.set(i+1, "");
+                }
+                w = replaceAbbreviations(w, loc);
             }
-            char lastCh = w.charAt(w.length() - 1);
-            boolean endSentence = lastCh == '.' && (i == wl.size()-1 || !wl.get(i+1).equals(".")) ||
-                          lastCh == '!' || lastCh == '?';
-            if (!endSentence && w.length () > 1 && (lastCh == '"' || lastCh == 0x201D || lastCh == ')')) {
-                lastCh = w.charAt(w.length() - 2);
+            boolean endSentence = wordsOnly;
+            if (!wordsOnly) {
+                char lastCh = w.charAt(w.length() - 1);
                 endSentence = lastCh == '.' && (i == wl.size()-1 || !wl.get(i+1).equals(".")) ||
                         lastCh == '!' || lastCh == '?';
+                endSentence |= lastCh == 0x964; // "Devanagari Danda" sentence delimiter. Consider also STerm Unicode set.
+                if (!endSentence && w.length () > 1 && (lastCh == '"' || lastCh == 0x201D || lastCh == ')')) {
+                    lastCh = w.charAt(w.length() - 2);
+                    endSentence = lastCh == '.' && (i == wl.size()-1 || !wl.get(i+1).equals(".")) ||
+                            lastCh == '!' || lastCh == '?';
+                }
+                // Split long sentences, Nuance TTS does not speak beyond 592 characters length...
+                // at the next comma, hyphen, ( or ), ellipses..., colon :, semicolon ;
+                if (breakSentences > 0 && !endSentence && currSent.length() > breakSentences) {
+                    endSentence = lastCh == ',' || lastCh == '-' || lastCh == '(' || lastCh == ')' ||
+                            lastCh == ':' || lastCh == ';' || currSent.length() > breakSentences + 80;
+
+                }
+                if (!currSent.equals("") && (w.length() > 1 || !endSentence) && currSent.charAt(currSent.length()-1) != '.')
+                    currSent += " ";
+                currSent += w;
+            } else {
+                currSent += w; // + "<break time=\"1ms\"/>";
             }
-            if (!currSent.equals("") && (w.length() > 1 || !endSentence) && currSent.charAt(currSent.length()-1) != '.')
-                currSent += " ";
-            currSent += w;
+
             if (endSentence || i == wl.size()-1) {
                 ss.add(currSent);
                 inds.add(indToAdd);
@@ -108,29 +138,41 @@ public static SentenceIndex[] build(List<String> wl, ArrayList<Integer> il, Loca
      * sentences.
      * Eventually this method should take an external file or resource, that a user could edit
      * to correct pronunciation different words. Should take into account the locale as well.
-     * 
+     *
      * @param inStr - input String
      * @return - String with abbreviations replaced
      */
-    private static String replaceEngAbbreviations(String inStr, Locale loc) {
+    private static String replaceAbbreviations(String inStr, Locale loc) {
         // spelling is not important here, pronunciation by TTS engine is.
         if (loc == null)
             return inStr;
 
         String lang = loc.getLanguage();
+        if (lang.equals("pl")) {
+            inStr = inStr.replace("Prof.", "Profesor");
+            inStr = inStr.replace("prof.", "profesor");
+            return inStr;
+        }
+
         if (!(lang.equals("eng") || lang.equals("en")))
             return inStr;
 
         if (inStr.endsWith(".")) {
             inStr = inStr.replace("Mr.", "Mr ");
+            inStr = inStr.replace("Ms.", "Mys ");
             inStr = inStr.replace("Mrs.", "Mrs ");
             inStr = inStr.replace("Dr.", "Dr "); // we don't know if it's "Doctor" or "Drive"
             inStr = inStr.replace("Prof.", "Prof ");
+            inStr = inStr.replace("\"Mr.", "\"Mr ");
+            inStr = inStr.replace("\"Ms.", "\"Mys ");
+            inStr = inStr.replace("\"Mrs.", "\"Mrs ");
+            inStr = inStr.replace("\"Dr.", "\"Dr "); // we don't know if it's "Doctor" or "Drive"
+            inStr = inStr.replace("\"Prof.", "\"Prof ");
             inStr = inStr.replace("i.e.", "I E ");
-            inStr = inStr.replace("Rev.", "Rev ");
-            inStr = inStr.replace("Gen.", "General ");
+            inStr = inStr.replace("\"Rev.", "\"Rev ");
+            inStr = inStr.replace("\"Gen.", "\"General ");
             inStr = inStr.replace("St.", "S T "); // we don't know if it's "Saint" or "Street"...
-            inStr = inStr.replace("Rep.", "Representative ");
+            inStr = inStr.replace("\"Rep.", "\"Representative ");
             inStr = inStr.replace("Ph.D.", "Ph.D ");
             inStr = inStr.replace("Sr.", "Senior ");
             inStr = inStr.replace("Jr.", "Junior ");
@@ -141,8 +183,13 @@ private static String replaceEngAbbreviations(String inStr, Locale loc) {
             inStr = inStr.replace("H.M.", "H M ");
             inStr = inStr.replace("H.M.S.", "H M S ");
             inStr = inStr.replace("U.S.", "U S ");
+            inStr = inStr.replace("Cpt.", "Capitan ");
             inStr = inStr.replace("No.", "No;"); // Ivona reads it at "number", we want "no", negation, with a pause
             inStr = inStr.replace("no.", "no;");
+            inStr = inStr.replace("R.N.", "R N");
+            inStr = inStr.replace("R.A.F.", "R A F");
+            inStr = inStr.replace("Ltd.", "L T D");
+
         }
         // Greg's private replacemtns... Move into preferences...
         inStr = inStr.replace("antiaging", "anti-aging");
@@ -151,16 +198,21 @@ private static String replaceEngAbbreviations(String inStr, Locale loc) {
     }
 
     // Slower version when reading entire paragraphs
-    private static String paragraphReplaceEngAbbreviations(String inStr, Locale loc) {
+    private static String paragraphReplaceAbbreviations(String inStr, Locale loc) {
         // spelling is not important here, pronunciation by TTS engine is.
         if (loc == null)
             return inStr;
 
         String lang = loc.getLanguage();
+        if (lang.equals("pl")) {
+            inStr = inStr.replace("Prof.", "Profesor");
+            return inStr;
+        }
         if (!(lang.equals("eng") || lang.equals("en")))
             return inStr;
 
         inStr = inStr.replace("Mr.", "Mr ");
+        inStr = inStr.replace("Ms.", "Ms ");
         inStr = inStr.replace("Mrs.", "Mrs ");
         inStr = inStr.replace("Dr.", "Dr "); // we don't know if it's "Doctor" or "Drive"
         inStr = inStr.replace("Prof.", "Prof ");
@@ -182,7 +234,7 @@ private static String paragraphReplaceEngAbbreviations(String inStr, Locale loc)
         inStr = inStr.replace("No.", "No;"); // Ivona reads it at "number", we want "no", negation, with a pause
         inStr = inStr.replace("no.", "no;");
 
-        // Greg's private replacements... Move into preferences...
+        // Greg's private replacemtns... Move into preferences...
         inStr = inStr.replace("antiaging", "anti-aging");
         inStr = inStr.replace("Antiaging", "Anti-aging");
         return inStr;