Add Builder mechanism to TurkishSentenceExtractor. Add optional parameter so that sentences are not extracted from within double quotation marks (#192). Add thread count helpers to ConcurrencyUtil. Add thread count parameter to PreprocessTurkishCorpus.java.
ahmetaa · Dec 19, 2018 · cf3789b · cf3789b
1 parent 27cbc2a
commit cf3789b
Show file tree

Hide file tree

Showing 5 changed files with 158 additions and 13 deletions.
diff --git a/apps/src/main/java/zemberek/apps/corpus/PreprocessTurkishCorpus.java b/apps/src/main/java/zemberek/apps/corpus/PreprocessTurkishCorpus.java
@@ -15,6 +15,7 @@
 import me.tongfei.progressbar.ProgressBarStyle;
 import zemberek.apps.ConsoleApp;
 import zemberek.core.concurrency.BlockingExecutor;
+import zemberek.core.concurrency.ConcurrencyUtil;
 import zemberek.core.logging.Log;
 import zemberek.core.text.BlockTextLoader;
 import zemberek.core.text.TextChunk;
@@ -62,6 +63,10 @@ public class PreprocessTurkishCorpus extends ConsoleApp {
           + "longest lemmas. By default sentence segmentation and tokenization is applied.")
   private Operation operation = Operation.NONE;
 
+  @Parameter(names = {"--threadCount", "-tc"},
+      description = "Thread Count.")
+  int threadCount = ConcurrencyUtil.getHalfCpuCount();
+
   @Override
   public String description() {
     return "Applies Turkish Sentence boundary detection and tokenization to a corpus file or a "
@@ -102,7 +107,7 @@ public void run() throws IOException {
               .collect(Collectors.toList());
 
       for (Path directory : directories) {
-        if (dirList!=null && !dirNamesToProcess.contains(directory.toFile().getName())) {
+        if (dirList != null && !dirNamesToProcess.contains(directory.toFile().getName())) {
           continue;
         }
         paths.addAll(Files.walk(directory, 1)
@@ -128,14 +133,13 @@ public void run() throws IOException {
     }
 
     try (PrintWriter pw = new PrintWriter(output.toFile(), "UTF-8")) {
-      ProgressBar pb = new ProgressBar("Lines", totalLines, ProgressBarStyle.ASCII);
+      ProgressBar progressBar = new ProgressBar("Lines", totalLines, ProgressBarStyle.ASCII);
 
       BlockTextLoader loader = BlockTextLoader.fromPaths(paths, 10_000);
-      BlockingExecutor executor =
-          new BlockingExecutor(Runtime.getRuntime().availableProcessors());
+      BlockingExecutor executor = new BlockingExecutor(threadCount);
 
       for (TextChunk chunk : loader) {
-        executor.submit(()-> {
+        executor.submit(() -> {
           List<String> processed = chunk.getData().stream()
               .filter(s -> !s.startsWith("<")) // ignore meta tag lines.
               .map(TextUtil::normalizeSpacesAndSoftHyphens)
@@ -157,12 +161,12 @@ public void run() throws IOException {
           synchronized (this) {
             sentences.forEach(pw::println);
             sentenceCount.addAndGet(sentences.size());
-            pb.stepBy(chunk.size());
+            progressBar.stepBy(chunk.size());
           }
         });
       }
       executor.shutdown();
-      pb.close();
+      progressBar.close();
     }
 
     Log.info("%d sentences are written in %s", sentenceCount.get(), output);

diff --git a/core/src/main/java/zemberek/core/concurrency/ConcurrencyUtil.java b/core/src/main/java/zemberek/core/concurrency/ConcurrencyUtil.java
@@ -29,4 +29,24 @@ public static int validateCpuThreadCount(int threadCount) {
     }
   }
 
+  /**
+   * Returns half of the processor count reported by the runtime, rounded down, but never less
+   * than one.
+   *
+   * @return {@code availableProcessors / 2}, or 1 when that quotient would be zero.
+   */
+  public static int getHalfCpuCount() {
+    // Integer division yields 0 only for a single processor; clamp that case to 1.
+    return Math.max(1, Runtime.getRuntime().availableProcessors() / 2);
+  }
+
+  /**
+   * Returns the processor count reported by the runtime minus {@code leaveCount}, but never less
+   * than one.
+   *
+   * @param leaveCount number of processors to leave out of the result. Must not be negative.
+   * @return {@code availableProcessors - leaveCount}, or 1 if that difference is not positive.
+   * @throws IllegalArgumentException if {@code leaveCount} is negative.
+   */
+  public static int getMostCpuCount(int leaveCount) {
+    if (leaveCount < 0) {
+      throw new IllegalArgumentException("Remaining count cannot be negative");
+    }
+    int usable = Runtime.getRuntime().availableProcessors() - leaveCount;
+    // Always hand back at least one thread, even if the caller asks to leave every processor.
+    return Math.max(1, usable);
+  }
+
 }
diff --git a/tokenization/src/main/java/zemberek/tokenization/Span.java b/tokenization/src/main/java/zemberek/tokenization/Span.java
@@ -56,4 +56,7 @@ public String getSubstring(String input) {
     return input.substring(start, end);
   }
 
+  /**
+   * Checks whether an index falls inside this span. The start index is inclusive and the end
+   * index is exclusive.
+   *
+   * @param i index to test.
+   * @return true if {@code start <= i < end}.
+   */
+  public boolean inSpan(int i) {
+    if (i < start) {
+      return false;
+    }
+    return i < end;
+  }
 }
diff --git a/tokenization/src/main/java/zemberek/tokenization/TurkishSentenceExtractor.java b/tokenization/src/main/java/zemberek/tokenization/TurkishSentenceExtractor.java
@@ -30,17 +30,19 @@ public class TurkishSentenceExtractor extends PerceptronSegmenter {
    * A singleton instance that is generated from the default internal model.
    */
   public static final TurkishSentenceExtractor DEFAULT = Singleton.Instance.extractor;
+
   static final String BOUNDARY_CHARS = ".!?…";
   private static final Pattern LINE_BREAK_PATTERN = Pattern.compile("[\n\r]+");
+  private boolean doNotSplitInDoubleQuotes = false;
 
   private TurkishSentenceExtractor(FloatValueMap<String> weights) {
     this.weights = weights;
   }
 
-  public static TurkishSentenceExtractor loadFromBinaryFile(Path file) throws IOException {
-    try (DataInputStream dis = IOUtil.getDataInputStream(file)) {
-      return new TurkishSentenceExtractor(load(dis));
-    }
+  private TurkishSentenceExtractor(FloatValueMap<String> weights,
+      boolean doNotSplitInDoubleQuotes) {
+    this.weights = weights;
+    this.doNotSplitInDoubleQuotes = doNotSplitInDoubleQuotes;
   }
 
   private static TurkishSentenceExtractor fromDefaultModel() throws IOException {
@@ -50,6 +52,45 @@ private static TurkishSentenceExtractor fromDefaultModel() throws IOException {
     }
   }
 
+  /**
+   * Creates a new {@link Builder} for configuring a {@link TurkishSentenceExtractor}.
+   */
+  public static Builder builder() {
+    return new Builder();
+  }
+
+  /**
+   * Fluent builder for {@link TurkishSentenceExtractor}. A model must be selected with one of the
+   * {@code useXXX} methods before calling {@link #build()}.
+   *
+   * <p>All builder methods are public so that the builder returned by the public
+   * {@link TurkishSentenceExtractor#builder()} factory can be used from other packages.
+   */
+  public static class Builder {
+
+    boolean _doNotSplitInDoubleQuotes = false;
+    FloatValueMap<String> _model;
+
+    /**
+     * Configures the extractor so that no sentence boundary is produced for boundary characters
+     * that are located between a pair of double quotation marks.
+     *
+     * @return this builder.
+     */
+    public Builder doNotSplitInDoubleQuotes() {
+      this._doNotSplitInDoubleQuotes = true;
+      return this;
+    }
+
+    /**
+     * Loads the model weights from a class path resource.
+     *
+     * @param resource resource name of the binary model.
+     * @return this builder.
+     * @throws IOException if the resource cannot be opened or read.
+     */
+    public Builder useModelFromResource(String resource) throws IOException {
+      try (DataInputStream dis = IOUtil.getDataInputStream(
+          Resources.getResource(resource).openStream())) {
+        this._model = load(dis);
+      }
+      return this;
+    }
+
+    /**
+     * Loads the model weights from a binary model file.
+     *
+     * @param path path of the binary model file.
+     * @return this builder.
+     * @throws IOException if the file cannot be opened or read.
+     */
+    public Builder useModelFromPath(Path path) throws IOException {
+      try (DataInputStream dis = IOUtil.getDataInputStream(path)) {
+        this._model = load(dis);
+      }
+      return this;
+    }
+
+    /**
+     * Loads the model weights from the resource "tokenization/sentence-boundary-model.bin".
+     *
+     * @return this builder.
+     * @throws IOException if the resource cannot be opened or read.
+     */
+    public Builder useDefaultModel() throws IOException {
+      return useModelFromResource("tokenization/sentence-boundary-model.bin");
+    }
+
+    /**
+     * Creates the extractor from the selected model and options.
+     *
+     * @return a new {@link TurkishSentenceExtractor}.
+     * @throws IllegalStateException if no model has been selected.
+     */
+    public TurkishSentenceExtractor build() {
+      // Fail here with a clear message rather than handing out an extractor with null weights.
+      if (_model == null) {
+        throw new IllegalStateException(
+            "No model is set. Call useDefaultModel(), useModelFromResource() or"
+                + " useModelFromPath() before build().");
+      }
+      return new TurkishSentenceExtractor(_model, _doNotSplitInDoubleQuotes);
+    }
+  }
+
   /**
    * Extracts sentences from a list of paragraph strings. This method does not split from line
    * breaks assuming paragraphs do not contain line breaks. <p> If content contains line breaks, use
@@ -78,15 +119,28 @@ int[] boundaryIndexes(String paragraph) {
     return indexes;
   }
 
+  // TODO: doNotSplitInDoubleQuotes may not be suitable for some cases, e.g. a paragraph that is
+  // quoted as a whole such as: "Merhaba. Nasılsın?" (boundaries inside the quotes are skipped).
   private List<Span> extractToSpans(String paragraph) {
     List<Span> spans = new ArrayList<>();
+    List<Span> quoteSpans = null;
+    if (doNotSplitInDoubleQuotes) {
+      quoteSpans = doubleQuoteSpans(paragraph);
+    }
     int begin = 0;
     for (int j = 0; j < paragraph.length(); j++) {
+
       // skip if char cannot be a boundary char.
       char chr = paragraph.charAt(j);
       if (BOUNDARY_CHARS.indexOf(chr) < 0) {
         continue;
       }
+
+      // skip if a break is not allowed because the char is inside double quotes.
+      if (doNotSplitInDoubleQuotes && quoteSpans != null && inSpan(j, quoteSpans)) {
+        continue;
+      }
+
       BoundaryData boundaryData = new BoundaryData(paragraph, j);
       if (boundaryData.nonBoundaryCheck()) {
         continue;
@@ -114,6 +168,44 @@ private List<Span> extractToSpans(String paragraph) {
     return spans;
   }
 
+  private static String doubleQuotes = "\"”“»«";
+
+  /**
+   * Finds double quote spans.
+   */
+  private List<Span> doubleQuoteSpans(String input) {
+    List<Span> spans = new ArrayList<>();
+
+    int start = -1;
+    boolean started = false;
+    for (int j = 0; j < input.length(); j++) {
+      char c = input.charAt(j);
+      if (doubleQuotes.indexOf(c) >= 0) {
+        if (!started) {
+          start = j;
+          started = true;
+        } else {
+          spans.add(new Span(start, j));
+          started = false;
+        }
+      }
+    }
+    return spans;
+  }
+
+  /**
+   * Checks if an index is covered by one of the given spans. Scanning stops at the first span
+   * that starts after the index, so spans are expected in ascending start order.
+   *
+   * @param index index to look for.
+   * @param spans spans to check.
+   * @return true if a span visited before the scan stops contains the index.
+   */
+  private boolean inSpan(int index, List<Span> spans) {
+    for (Span candidate : spans) {
+      if (index < candidate.start) {
+        // This span begins after the index, stop searching.
+        break;
+      }
+      if (candidate.inSpan(index)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+
   /**
    * Extracts sentences from a paragraph string. This method does not split from line breaks
    * assuming paragraphs do not contain line breaks. <p> If content contains line breaks, use {@link

diff --git a/tokenization/src/test/java/zemberek/tokenization/TurkishSentenceExtractorTest.java b/tokenization/src/test/java/zemberek/tokenization/TurkishSentenceExtractorTest.java
@@ -67,17 +67,22 @@ public void extractFromDocument() throws IOException {
 
   }
 
-  private String markBoundariesDocument(String input) throws IOException {
+  private String markBoundariesDocument(String input) {
     List<String> list = TurkishSentenceExtractor.DEFAULT.fromDocument(input);
     return Joiner.on("|").join(list);
   }
 
 
-  private String markBoundariesParagraph(String input) throws IOException {
+  private String markBoundariesParagraph(String input) {
     List<String> list = TurkishSentenceExtractor.DEFAULT.fromParagraph(input);
     return Joiner.on("|").join(list);
   }
 
+  /**
+   * Extracts sentences from the paragraph with the given extractor and joins them with "|".
+   */
+  private String markBoundariesParagraph(TurkishSentenceExtractor extractor, String input) {
+    return Joiner.on("|").join(extractor.fromParagraph(input));
+  }
+
   @Test
   public void testSimpleSentence() throws IOException {
     Assert.assertEquals("Merhaba!|Bugün 2. köprü Fsm.'de trafik vardı.|değil mi?",
@@ -105,4 +110,25 @@ public void shouldReturn0ForEmptyff() {
     List<String> sentences = TurkishSentenceExtractor.DEFAULT.fromParagraph("");
     Assert.assertEquals(0, sentences.size());
   }
+
+  /**
+   * Checks that boundary characters inside double quotes do not split sentences when the
+   * extractor is built with doNotSplitInDoubleQuotes().
+   */
+  @Test
+  public void testDoubleQuotes() throws IOException {
+    TurkishSentenceExtractor.Builder builder = TurkishSentenceExtractor.builder();
+    builder.doNotSplitInDoubleQuotes();
+    builder.useDefaultModel();
+    TurkishSentenceExtractor extractor = builder.build();
+
+    // Quoted part contains three boundary characters, only the ones outside the quotes split.
+    String twoSentences =
+        "\"Merhaba! Bugün hava çok güzel. Ne dersin?\" dedi tavşan. Havucu kemirirken.";
+    String twoSentencesExpected =
+        "\"Merhaba! Bugün hava çok güzel. Ne dersin?\" dedi tavşan.|Havucu kemirirken.";
+    Assert.assertEquals(twoSentencesExpected, markBoundariesParagraph(extractor, twoSentences));
+
+    // A single sentence that starts with a quoted exclamation stays in one piece.
+    String oneSentence = "\"Buna hakkı yok!\" diye öfkeyle konuşmaya başladı Baba Kurt.";
+    String oneSentenceExpected = "\"Buna hakkı yok!\" diye öfkeyle konuşmaya başladı Baba Kurt.";
+    Assert.assertEquals(oneSentenceExpected, markBoundariesParagraph(extractor, oneSentence));
+  }
+
 }