Skip to content
This repository has been archived by the owner on May 24, 2022. It is now read-only.

Commit

Permalink
updated to latest sentence extraction code
Browse files Browse the repository at this point in the history
  • Loading branch information
roms committed Sep 5, 2014
1 parent 2c5311b commit 8b72d39
Showing 1 changed file with 71 additions and 19 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ public SentenceIndex(String ss, int ii) {

public static SentenceIndex[] extract(String paragraph, Locale loc) {
paragraph = paragraph.replace(". . .", "...");
paragraph = paragraphReplaceEngAbbreviations(paragraph, loc);
paragraph = paragraphReplaceAbbreviations(paragraph, loc);
final Pattern p = Pattern.compile("[\\.\\!\\?]\\s+", Pattern.MULTILINE);
String[] sentences = p.split(paragraph);
int len = 0;
Expand All @@ -62,32 +62,62 @@ public static SentenceIndex[] build(List<String> wl, ArrayList<Integer> il, Loca
return new SentenceIndex[0];
}

int breakSentences = 500; // other engines have troubles over 5000 characters...
boolean wordsOnly = false;

ArrayList<String> ss = new ArrayList<String>();
ArrayList<Integer> inds = new ArrayList<Integer>();
String currSent = "";
int i, indToAdd = 0;

for (i = 0; i < wl.size(); i++) {
int sz = wl.size();
if (il.size() < sz)
sz = il.size();
for (i = 0; i < sz; i++) {
String w = wl.get(i);
if (w.length() == 0)
continue;
int len = currSent.length();
if (len == 0)
indToAdd = il.get(i);
if (w.length() == 2 && w.endsWith(".") && Character.isUpperCase(w.charAt(0))) {
w = w.substring(0, 1) + " ";
} else {
w = replaceEngAbbreviations(w, loc);
w = w.replace('\u00A0', ' '); // dec 160, no-break space
w = w.replace("\u200B", " "); // dec. 8203, 'zero width space' (do not replace with empty, or we may get w empty and crash)
w = w.replace('\u2019', '\''); // RIGHT SINGLE QUOTATION MARK (U+2019), mis-pronounced by Google TTS?
if (w.charAt(0) == '\u2026') // dec 8230 ellipses ... remove at start
w = " " + w.substring(1);
if (i < wl.size()-2 && wl.get(i+1).equals(".") && !wl.get(i+2).equals(".")) {
w += ".";
wl.set(i+1, "");
}
w = replaceAbbreviations(w, loc);
}
char lastCh = w.charAt(w.length() - 1);
boolean endSentence = lastCh == '.' && (i == wl.size()-1 || !wl.get(i+1).equals(".")) ||
lastCh == '!' || lastCh == '?';
if (!endSentence && w.length () > 1 && (lastCh == '"' || lastCh == 0x201D || lastCh == ')')) {
lastCh = w.charAt(w.length() - 2);
boolean endSentence = wordsOnly;
if (!wordsOnly) {
char lastCh = w.charAt(w.length() - 1);
endSentence = lastCh == '.' && (i == wl.size()-1 || !wl.get(i+1).equals(".")) ||
lastCh == '!' || lastCh == '?';
endSentence |= lastCh == 0x964; // "Devanagari Danda" sentence delimiter. Consider also STerm Unicode set.
if (!endSentence && w.length () > 1 && (lastCh == '"' || lastCh == 0x201D || lastCh == ')')) {
lastCh = w.charAt(w.length() - 2);
endSentence = lastCh == '.' && (i == wl.size()-1 || !wl.get(i+1).equals(".")) ||
lastCh == '!' || lastCh == '?';
}
// Split long sentences, Nuance TTS does not speak beyond 592 characters length...
// at the next comma, hyphen, ( or ), ellipses..., colon :, semicolon ;
if (breakSentences > 0 && !endSentence && currSent.length() > breakSentences) {
endSentence = lastCh == ',' || lastCh == '-' || lastCh == '(' || lastCh == ')' ||
lastCh == ':' || lastCh == ';' || currSent.length() > breakSentences + 80;

}
if (!currSent.equals("") && (w.length() > 1 || !endSentence) && currSent.charAt(currSent.length()-1) != '.')
currSent += " ";
currSent += w;
} else {
currSent += w; // + "<break time=\"1ms\"/>";
}
if (!currSent.equals("") && (w.length() > 1 || !endSentence) && currSent.charAt(currSent.length()-1) != '.')
currSent += " ";
currSent += w;

if (endSentence || i == wl.size()-1) {
ss.add(currSent);
inds.add(indToAdd);
Expand All @@ -108,29 +138,41 @@ public static SentenceIndex[] build(List<String> wl, ArrayList<Integer> il, Loca
* sentences.
* Eventually this method should take an external file or resource, that a user could edit
* to correct pronunciation different words. Should take into account the locale as well.
*
*
* @param inStr - input String
* @return - String with abbreviations replaced
*/
private static String replaceEngAbbreviations(String inStr, Locale loc) {
private static String replaceAbbreviations(String inStr, Locale loc) {
// spelling is not important here, pronunciation by TTS engine is.
if (loc == null)
return inStr;

String lang = loc.getLanguage();
if (lang.equals("pl")) {
inStr = inStr.replace("Prof.", "Profesor");
inStr = inStr.replace("prof.", "profesor");
return inStr;
}

if (!(lang.equals("eng") || lang.equals("en")))
return inStr;

if (inStr.endsWith(".")) {
inStr = inStr.replace("Mr.", "Mr ");
inStr = inStr.replace("Ms.", "Mys ");
inStr = inStr.replace("Mrs.", "Mrs ");
inStr = inStr.replace("Dr.", "Dr "); // we don't know if it's "Doctor" or "Drive"
inStr = inStr.replace("Prof.", "Prof ");
inStr = inStr.replace("\"Mr.", "\"Mr ");
inStr = inStr.replace("\"Ms.", "\"Mys ");
inStr = inStr.replace("\"Mrs.", "\"Mrs ");
inStr = inStr.replace("\"Dr.", "\"Dr "); // we don't know if it's "Doctor" or "Drive"
inStr = inStr.replace("\"Prof.", "\"Prof ");
inStr = inStr.replace("i.e.", "I E ");
inStr = inStr.replace("Rev.", "Rev ");
inStr = inStr.replace("Gen.", "General ");
inStr = inStr.replace("\"Rev.", "\"Rev ");
inStr = inStr.replace("\"Gen.", "\"General ");
inStr = inStr.replace("St.", "S T "); // we don't know if it's "Saint" or "Street"...
inStr = inStr.replace("Rep.", "Representative ");
inStr = inStr.replace("\"Rep.", "\"Representative ");
inStr = inStr.replace("Ph.D.", "Ph.D ");
inStr = inStr.replace("Sr.", "Senior ");
inStr = inStr.replace("Jr.", "Junior ");
Expand All @@ -141,8 +183,13 @@ private static String replaceEngAbbreviations(String inStr, Locale loc) {
inStr = inStr.replace("H.M.", "H M ");
inStr = inStr.replace("H.M.S.", "H M S ");
inStr = inStr.replace("U.S.", "U S ");
inStr = inStr.replace("Cpt.", "Capitan ");
inStr = inStr.replace("No.", "No;"); // Ivona reads it at "number", we want "no", negation, with a pause
inStr = inStr.replace("no.", "no;");
inStr = inStr.replace("R.N.", "R N");
inStr = inStr.replace("R.A.F.", "R A F");
inStr = inStr.replace("Ltd.", "L T D");

}
// Greg's private replacemtns... Move into preferences...
inStr = inStr.replace("antiaging", "anti-aging");
Expand All @@ -151,16 +198,21 @@ private static String replaceEngAbbreviations(String inStr, Locale loc) {
}

// Slower version when reading entire paragraphs
private static String paragraphReplaceEngAbbreviations(String inStr, Locale loc) {
private static String paragraphReplaceAbbreviations(String inStr, Locale loc) {
// spelling is not important here, pronunciation by TTS engine is.
if (loc == null)
return inStr;

String lang = loc.getLanguage();
if (lang.equals("pl")) {
inStr = inStr.replace("Prof.", "Profesor");
return inStr;
}
if (!(lang.equals("eng") || lang.equals("en")))
return inStr;

inStr = inStr.replace("Mr.", "Mr ");
inStr = inStr.replace("Ms.", "Ms ");
inStr = inStr.replace("Mrs.", "Mrs ");
inStr = inStr.replace("Dr.", "Dr "); // we don't know if it's "Doctor" or "Drive"
inStr = inStr.replace("Prof.", "Prof ");
Expand All @@ -182,7 +234,7 @@ private static String paragraphReplaceEngAbbreviations(String inStr, Locale loc)
inStr = inStr.replace("No.", "No;"); // Ivona reads it at "number", we want "no", negation, with a pause
inStr = inStr.replace("no.", "no;");

// Greg's private replacements... Move into preferences...
// Greg's private replacemtns... Move into preferences...
inStr = inStr.replace("antiaging", "anti-aging");
inStr = inStr.replace("Antiaging", "Anti-aging");
return inStr;
Expand Down

0 comments on commit 8b72d39

Please sign in to comment.