Skip to content

Commit

Permalink
Add Builder mechanism to TurkishSentenceExtractor. Add optional param…
Browse files Browse the repository at this point in the history
…eter so that sentences are not extracted from within double quotation marks. #192

Add thread count helpers to Concurrency util.

Add thread count parameter to PreprocessTurkishCorpus.java
  • Loading branch information
ahmetaa committed Dec 19, 2018
1 parent 27cbc2a commit cf3789b
Show file tree
Hide file tree
Showing 5 changed files with 158 additions and 13 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import me.tongfei.progressbar.ProgressBarStyle;
import zemberek.apps.ConsoleApp;
import zemberek.core.concurrency.BlockingExecutor;
import zemberek.core.concurrency.ConcurrencyUtil;
import zemberek.core.logging.Log;
import zemberek.core.text.BlockTextLoader;
import zemberek.core.text.TextChunk;
Expand Down Expand Up @@ -62,6 +63,10 @@ public class PreprocessTurkishCorpus extends ConsoleApp {
+ "longest lemmas. By default sentence segmentation and tokenization is applied.")
private Operation operation = Operation.NONE;

@Parameter(names = {"--threadCount", "-tc"},
description = "Thread Count.")
int threadCount = ConcurrencyUtil.getHalfCpuCount();

@Override
public String description() {
return "Applies Turkish Sentence boundary detection and tokenization to a corpus file or a "
Expand Down Expand Up @@ -102,7 +107,7 @@ public void run() throws IOException {
.collect(Collectors.toList());

for (Path directory : directories) {
if (dirList!=null && !dirNamesToProcess.contains(directory.toFile().getName())) {
if (dirList != null && !dirNamesToProcess.contains(directory.toFile().getName())) {
continue;
}
paths.addAll(Files.walk(directory, 1)
Expand All @@ -128,14 +133,13 @@ public void run() throws IOException {
}

try (PrintWriter pw = new PrintWriter(output.toFile(), "UTF-8")) {
ProgressBar pb = new ProgressBar("Lines", totalLines, ProgressBarStyle.ASCII);
ProgressBar progressBar = new ProgressBar("Lines", totalLines, ProgressBarStyle.ASCII);

BlockTextLoader loader = BlockTextLoader.fromPaths(paths, 10_000);
BlockingExecutor executor =
new BlockingExecutor(Runtime.getRuntime().availableProcessors());
BlockingExecutor executor = new BlockingExecutor(threadCount);

for (TextChunk chunk : loader) {
executor.submit(()-> {
executor.submit(() -> {
List<String> processed = chunk.getData().stream()
.filter(s -> !s.startsWith("<")) // ignore meta tag lines.
.map(TextUtil::normalizeSpacesAndSoftHyphens)
Expand All @@ -157,12 +161,12 @@ public void run() throws IOException {
synchronized (this) {
sentences.forEach(pw::println);
sentenceCount.addAndGet(sentences.size());
pb.stepBy(chunk.size());
progressBar.stepBy(chunk.size());
}
});
}
executor.shutdown();
pb.close();
progressBar.close();
}

Log.info("%d sentences are written in %s", sentenceCount.get(), output);
Expand Down
20 changes: 20 additions & 0 deletions core/src/main/java/zemberek/core/concurrency/ConcurrencyUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,24 @@ public static int validateCpuThreadCount(int threadCount) {
}
}

/**
 * Returns half of the processors reported by the runtime (integer division), but never less
 * than one.
 *
 * @return {@code availableProcessors / 2}, clamped to a minimum of 1.
 */
public static int getHalfCpuCount() {
  int half = Runtime.getRuntime().availableProcessors() / 2;
  // Integer division yields 0 on a single-processor machine; clamp to one.
  return Math.max(1, half);
}

/**
 * Returns the number of processors reported by the runtime minus {@code leaveCount}, but never
 * less than one.
 *
 * @param leaveCount amount subtracted from the available processor count. Must not be negative.
 * @return the remaining count, or 1 if the subtraction yields zero or a negative number.
 * @throws IllegalArgumentException if {@code leaveCount} is negative.
 */
public static int getMostCpuCount(int leaveCount) {
  if (leaveCount < 0) {
    throw new IllegalArgumentException("Remaining count cannot be negative");
  }
  int remaining = Runtime.getRuntime().availableProcessors() - leaveCount;
  return Math.max(1, remaining);
}

}
3 changes: 3 additions & 0 deletions tokenization/src/main/java/zemberek/tokenization/Span.java
Original file line number Diff line number Diff line change
Expand Up @@ -56,4 +56,7 @@ public String getSubstring(String input) {
return input.substring(start, end);
}

/**
 * Checks whether an index falls inside this span.
 *
 * @param i index to test.
 * @return true if {@code i} is in the half-open range {@code [start, end)}.
 */
public boolean inSpan(int i) {
  boolean atOrAfterStart = start <= i;
  boolean beforeEnd = end > i;
  return atOrAfterStart && beforeEnd;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -30,17 +30,19 @@ public class TurkishSentenceExtractor extends PerceptronSegmenter {
* A singleton instance that is generated from the default internal model.
*/
public static final TurkishSentenceExtractor DEFAULT = Singleton.Instance.extractor;

static final String BOUNDARY_CHARS = ".!?…";
private static final Pattern LINE_BREAK_PATTERN = Pattern.compile("[\n\r]+");
private boolean doNotSplitInDoubleQuotes = false;

/**
 * Creates an extractor that uses the given weights and leaves splitting inside double quotes
 * enabled (the {@code doNotSplitInDoubleQuotes} flag stays {@code false}).
 *
 * @param weights model weights assigned to this extractor.
 */
private TurkishSentenceExtractor(FloatValueMap<String> weights) {
  this(weights, false);
}

public static TurkishSentenceExtractor loadFromBinaryFile(Path file) throws IOException {
try (DataInputStream dis = IOUtil.getDataInputStream(file)) {
return new TurkishSentenceExtractor(load(dis));
}
/**
 * Creates an extractor with the given weights and double quote behavior.
 *
 * @param weights model weights assigned to this extractor.
 * @param doNotSplitInDoubleQuotes if true, boundary characters located inside double quote
 *     spans are not considered as sentence boundaries.
 */
private TurkishSentenceExtractor(
    FloatValueMap<String> weights,
    boolean doNotSplitInDoubleQuotes) {
  this.doNotSplitInDoubleQuotes = doNotSplitInDoubleQuotes;
  this.weights = weights;
}

private static TurkishSentenceExtractor fromDefaultModel() throws IOException {
Expand All @@ -50,6 +52,45 @@ private static TurkishSentenceExtractor fromDefaultModel() throws IOException {
}
}

/**
 * Creates a new {@link Builder} that can be used to configure and construct a
 * {@link TurkishSentenceExtractor}.
 *
 * @return a fresh Builder instance.
 */
public static Builder builder() {
return new Builder();
}

/**
 * Fluent builder for {@link TurkishSentenceExtractor}.
 *
 * <p>A model must be selected with one of the {@code useModel...} / {@link #useDefaultModel()}
 * methods before {@link #build()} is called. All builder methods are public so that the builder
 * returned from the public {@code builder()} factory is usable from other packages.
 */
public static class Builder {

  // Field names are kept as they were so that any existing in-package access keeps working.
  boolean _doNotSplitInDoubleQuotes = false;
  FloatValueMap<String> _model;

  /**
   * Configures the extractor so that boundary characters inside double quotation marks do not
   * split sentences.
   *
   * @return this builder.
   */
  public Builder doNotSplitInDoubleQuotes() {
    this._doNotSplitInDoubleQuotes = true;
    return this;
  }

  /**
   * Loads the model from a class path resource.
   *
   * @param resource resource name of the binary model.
   * @return this builder.
   * @throws IOException if the resource cannot be opened or read.
   */
  public Builder useModelFromResource(String resource) throws IOException {
    try (DataInputStream dis = IOUtil.getDataInputStream(
        Resources.getResource(resource).openStream())) {
      this._model = load(dis);
    }
    return this;
  }

  /**
   * Loads the model from a file.
   *
   * @param path path of the binary model file.
   * @return this builder.
   * @throws IOException if the file cannot be opened or read.
   */
  public Builder useModelFromPath(Path path) throws IOException {
    try (DataInputStream dis = IOUtil.getDataInputStream(path)) {
      this._model = load(dis);
    }
    return this;
  }

  /**
   * Loads the model from the resource {@code tokenization/sentence-boundary-model.bin}.
   *
   * @return this builder.
   * @throws IOException if the resource cannot be opened or read.
   */
  public Builder useDefaultModel() throws IOException {
    return useModelFromResource("tokenization/sentence-boundary-model.bin");
  }

  /**
   * Builds the extractor from the selected model and options.
   *
   * @return a new TurkishSentenceExtractor.
   * @throws IllegalStateException if no model has been selected.
   */
  public TurkishSentenceExtractor build() {
    // Fail here with a clear message instead of handing out an extractor without weights.
    if (_model == null) {
      throw new IllegalStateException(
          "No model is set. Call useDefaultModel(), useModelFromResource() or "
              + "useModelFromPath() before build().");
    }
    return new TurkishSentenceExtractor(_model, _doNotSplitInDoubleQuotes);
  }
}

/**
* Extracts sentences from a list of paragraph strings. This method does not split from line
* breaks assuming paragraphs do not contain line breaks. <p> If content contains line breaks, use
Expand Down Expand Up @@ -78,15 +119,28 @@ int[] boundaryIndexes(String paragraph) {
return indexes;
}

// TODO: doNotSplitInDoubleQuotes may not be suitable for some cases.
// such as for paragraph: "Merhaba. Nasılsın?"
private List<Span> extractToSpans(String paragraph) {
List<Span> spans = new ArrayList<>();
List<Span> quoteSpans = null;
if (doNotSplitInDoubleQuotes) {
quoteSpans = doubleQuoteSpans(paragraph);
}
int begin = 0;
for (int j = 0; j < paragraph.length(); j++) {

// skip if char cannot be a boundary char.
char chr = paragraph.charAt(j);
if (BOUNDARY_CHARS.indexOf(chr) < 0) {
continue;
}

// skip if break is not allowed when in double quotes.
if (doNotSplitInDoubleQuotes && quoteSpans != null && inSpan(j, quoteSpans)) {
continue;
}

BoundaryData boundaryData = new BoundaryData(paragraph, j);
if (boundaryData.nonBoundaryCheck()) {
continue;
Expand Down Expand Up @@ -114,6 +168,44 @@ private List<Span> extractToSpans(String paragraph) {
return spans;
}

// Characters treated as double quotation marks. Declared final so that this shared,
// class-level set cannot be reassigned at runtime.
private static final String doubleQuotes = "\"”“»«";

/**
 * Finds double quote spans.
 *
 * <p>Every character contained in {@code doubleQuotes} toggles the quote state: the first one
 * opens a span and the next one closes it, regardless of which quote character is used. Each
 * resulting span starts at the index of the opening quote and ends at the index of the closing
 * quote. A quote that is never closed produces no span.
 *
 * @param input text to scan.
 * @return quote spans in the order they appear in the input. Empty if there is none.
 */
private List<Span> doubleQuoteSpans(String input) {
  List<Span> spans = new ArrayList<>();

  // Index of the currently open quote, or -1 when no quote is open.
  int start = -1;
  for (int j = 0; j < input.length(); j++) {
    if (doubleQuotes.indexOf(input.charAt(j)) < 0) {
      continue;
    }
    if (start < 0) {
      start = j;
    } else {
      spans.add(new Span(start, j));
      start = -1;
    }
  }
  return spans;
}

/**
 * Checks whether an index falls inside any of the given spans. Scanning stops at the first span
 * that starts after the index, so spans are expected to be ordered by their start position.
 *
 * @param index index to test.
 * @param spans spans to search.
 * @return true if one of the scanned spans contains the index.
 */
private boolean inSpan(int index, List<Span> spans) {
  boolean found = false;
  for (Span candidate : spans) {
    if (candidate.inSpan(index)) {
      found = true;
      break;
    }
    // A span beginning after the index ends the search.
    if (index < candidate.start) {
      break;
    }
  }
  return found;
}


/**
* Extracts sentences from a paragraph string. This method does not split from line breaks
* assuming paragraphs do not contain line breaks. <p> If content contains line breaks, use {@link
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,17 +67,22 @@ public void extractFromDocument() throws IOException {

}

private String markBoundariesDocument(String input) throws IOException {
private String markBoundariesDocument(String input) {
List<String> list = TurkishSentenceExtractor.DEFAULT.fromDocument(input);
return Joiner.on("|").join(list);
}


private String markBoundariesParagraph(String input) throws IOException {
private String markBoundariesParagraph(String input) {
List<String> list = TurkishSentenceExtractor.DEFAULT.fromParagraph(input);
return Joiner.on("|").join(list);
}

/**
 * Extracts sentences from the paragraph with the given extractor and joins them with the
 * {@code |} character.
 */
private String markBoundariesParagraph(TurkishSentenceExtractor extractor, String input) {
  return Joiner.on("|").join(extractor.fromParagraph(input));
}

@Test
public void testSimpleSentence() throws IOException {
Assert.assertEquals("Merhaba!|Bugün 2. köprü Fsm.'de trafik vardı.|değil mi?",
Expand Down Expand Up @@ -105,4 +110,25 @@ public void shouldReturn0ForEmptyff() {
List<String> sentences = TurkishSentenceExtractor.DEFAULT.fromParagraph("");
Assert.assertEquals(0, sentences.size());
}

/**
 * Verifies that an extractor built with doNotSplitInDoubleQuotes() does not split sentences
 * at boundary characters located inside double quotation marks.
 */
@Test
public void testDoubleQuotes() throws IOException {
  TurkishSentenceExtractor quoteAwareExtractor =
      TurkishSentenceExtractor.builder().doNotSplitInDoubleQuotes().useDefaultModel().build();

  // Several boundary characters inside the quotes, followed by a second sentence.
  String multiSentenceQuote =
      "\"Merhaba! Bugün hava çok güzel. Ne dersin?\" dedi tavşan. Havucu kemirirken.";
  String markedMultiSentenceQuote =
      markBoundariesParagraph(quoteAwareExtractor, multiSentenceQuote);
  Assert.assertEquals(
      "\"Merhaba! Bugün hava çok güzel. Ne dersin?\" dedi tavşan.|Havucu kemirirken.",
      markedMultiSentenceQuote);

  // A single quoted exclamation; the whole paragraph stays as one sentence.
  String singleQuote = "\"Buna hakkı yok!\" diye öfkeyle konuşmaya başladı Baba Kurt.";
  String markedSingleQuote = markBoundariesParagraph(quoteAwareExtractor, singleQuote);
  Assert.assertEquals(
      "\"Buna hakkı yok!\" diye öfkeyle konuşmaya başladı Baba Kurt.",
      markedSingleQuote);
}

}

0 comments on commit cf3789b

Please sign in to comment.