From eb6bff1e03eec71764691d51895a6450fdd21c85 Mon Sep 17 00:00:00 2001 From: Jim O'Regan Date: Sun, 30 Apr 2017 21:25:03 +0100 Subject: [PATCH] OPENNLP-1050: [WIP] Add formats support for Irish Sentence Bank --- .../tools/cmdline/StreamFactoryRegistry.java | 5 + .../IrishSentenceBankDocument.java | 271 ++++++++++++++++++ .../IrishSentenceBankSentenceStream.java | 72 +++++ ...rishSentenceBankSentenceStreamFactory.java | 61 ++++ .../IrishSentenceBankTokenSampleStream.java | 52 ++++ ...hSentenceBankTokenSampleStreamFactory.java | 60 ++++ .../IrishSentenceBankDocumentTest.java | 67 +++++ .../irishsentencebank-sample.xml | 25 ++ 8 files changed, 613 insertions(+) create mode 100644 opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocument.java create mode 100644 opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankSentenceStream.java create mode 100644 opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankSentenceStreamFactory.java create mode 100644 opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankTokenSampleStream.java create mode 100644 opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankTokenSampleStreamFactory.java create mode 100644 opennlp-tools/src/test/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocumentTest.java create mode 100644 opennlp-tools/src/test/resources/opennlp/tools/formats/irishsentencebank/irishsentencebank-sample.xml diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java index 99775196e..b2e54df26 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java @@ -52,6 +52,8 @@ import opennlp.tools.formats.convert.ParseToSentenceSampleStreamFactory; import opennlp.tools.formats.convert.ParseToTokenSampleStreamFactory; import opennlp.tools.formats.frenchtreebank.ConstitParseSampleStreamFactory; +import opennlp.tools.formats.irishsentencebank.IrishSentenceBankSentenceStreamFactory; +import opennlp.tools.formats.irishsentencebank.IrishSentenceBankTokenSampleStreamFactory; import opennlp.tools.formats.letsmt.LetsmtSentenceStreamFactory; import opennlp.tools.formats.moses.MosesSentenceSampleStreamFactory; import opennlp.tools.formats.muc.Muc6NameSampleStreamFactory; @@ -115,6 +117,9 @@ public final class StreamFactoryRegistry { ConlluPOSSampleStreamFactory.registerFactory(); ConlluLemmaSampleStreamFactory.registerFactory(); + + IrishSentenceBankSentenceStreamFactory.registerFactory(); + IrishSentenceBankTokenSampleStreamFactory.registerFactory(); } public static final String DEFAULT_FORMAT = "opennlp"; diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocument.java b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocument.java new file mode 100644 index 000000000..bc6f9e15b --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocument.java @@ -0,0 +1,271 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.formats.irishsentencebank; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.lang.StringBuilder; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.parsers.ParserConfigurationException; + +import org.w3c.dom.Document; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; +import org.xml.sax.SAXException; + +import opennlp.tools.tokenize.TokenSample; +import opennlp.tools.util.Span; + +/** + * A structure to hold an Irish Sentence Bank document, which is a collection + * of tokenized sentences. + *

+ * The sentence bank can be downloaded from, and is described + * here + */ +public class IrishSentenceBankDocument { + + public static class IrishSentenceBankFlex { + String surface; + String[] flex; + public String getSurface() { + return surface; + } + public String[] getFlex() { + return flex; + } + public IrishSentenceBankFlex(String sf, String[] fl) { + this.surface = sf; + this.flex = fl; + } + } + + public static class IrishSentenceBankSentence { + private String source; + private String translation; + private String original; + private Span[] tokens; + private IrishSentenceBankFlex[] flex; + public String getSource() { + return source; + } + public String getTranslation() { + return translation; + } + public String getOriginal() { + return original; + } + public Span[] getTokens() { + return tokens; + } + public IrishSentenceBankFlex[] getFlex() { + return flex; + } + public TokenSample getTokenSample() { + return new TokenSample(original, tokens); + } + public IrishSentenceBankSentence(String src, String trans, String orig, + Span[] toks, IrishSentenceBankFlex[] flx) { + this.source = src; + this.translation = trans; + this.original = orig; + this.tokens = toks; + this.flex = flx; + } + } + + private List sentences; + + public IrishSentenceBankDocument() { + sentences = new ArrayList<>(); + } + + public void add(IrishSentenceBankSentence sent) { + this.sentences.add(sent); + } + + public List getSentences() { + return Collections.unmodifiableList(sentences); + } + + /** + * Helper to adjust the span of punctuation tokens: ignores spaces to the left of the string + * @param s the string to check + * @param start the offset of the start of the string + * @return the offset adjusted to ignore spaces to the left + */ + private static int advanceLeft(String s, int start) { + int ret = start; + for (char c : s.toCharArray()) { + if (c == ' ') { + ret++; + } else { + return ret; + } + } + return ret; + } + + /** + * Helper to adjust the span of punctuation tokens: ignores spaces to the right of the string + * @param s the string to check + * @param start the offset of the start of the string + * @return the offset of the end of the string, adjusted to ignore spaces to the right + */ + private static int advanceRight(String s, int start) { + int end = s.length() - 1; + int ret = start + end + 1; + for (int i = end; i > 0; i--) { + if (s.charAt(i) == ' ') { + ret--; + } else { + return ret; + } + } + return ret; + } + + public static IrishSentenceBankDocument parse(InputStream is) throws IOException { + IrishSentenceBankDocument document = new IrishSentenceBankDocument(); + + try { + DocumentBuilderFactory docBuilderFactory = DocumentBuilderFactory.newInstance(); + DocumentBuilder docBuilder = docBuilderFactory.newDocumentBuilder(); + Document doc = docBuilder.parse(is); + + String root = doc.getDocumentElement().getNodeName(); + if (!root.equalsIgnoreCase("sentences")) { + throw new IOException("Expected root node " + root); + } + + NodeList nl = doc.getDocumentElement().getChildNodes(); + for (int i = 0; i < nl.getLength(); i++) { + Node sentnode = nl.item(i); + if (sentnode.getNodeName().equals("sentence")) { + String src = sentnode.getAttributes().getNamedItem("source").getNodeValue(); + String trans = ""; + Map toks = new HashMap<>(); + Map> flx = new HashMap<>(); + List spans = new ArrayList<>(); + NodeList sentnl = sentnode.getChildNodes(); + int flexes = 1; + StringBuilder orig = new StringBuilder(); + + for (int j = 0; j < sentnl.getLength(); j++) { + final String name = sentnl.item(j).getNodeName(); + switch (name) { + case "flex": + String slottmpa = sentnl.item(j).getAttributes().getNamedItem("slot").getNodeValue(); + Integer flexslot = Integer.parseInt(slottmpa); + if (flexslot > flexes) { + flexes = flexslot; + } + + flx.computeIfAbsent(flexslot, k -> new ArrayList<>()); + String tkn = sentnl.item(j).getAttributes().getNamedItem("lemma").getNodeValue(); + flx.get(flexslot).add(tkn); + break; + + case "translation": + trans = sentnl.item(j).getFirstChild().getTextContent(); + break; + + case "original": + int last = 0; + NodeList orignl = sentnl.item(j).getChildNodes(); + for (int k = 0; k < orignl.getLength(); k++) { + switch (orignl.item(k).getNodeName()) { + case "token": + String tmptok = orignl.item(k).getFirstChild().getTextContent(); + spans.add(new Span(last, last + tmptok.length())); + + String slottmpb = orignl.item(k).getAttributes().getNamedItem("slot").getNodeValue(); + Integer tokslot = Integer.parseInt(slottmpb); + if (tokslot > flexes) { + flexes = tokslot; + } + + toks.put(tokslot, tmptok); + orig.append(tmptok); + last += tmptok.length(); + break; + + case "#text": + String tmptxt = orignl.item(k).getTextContent(); + orig.append(tmptxt); + + if (!" ".equals(tmptxt)) { + spans.add(new Span(advanceLeft(tmptxt, last), advanceRight(tmptxt, last))); + } + + last += tmptxt.length(); + break; + + default: + throw new IOException("Unexpected node: " + orignl.item(k).getNodeName()); + } + } + break; + + case "#text": + case "#comment": + break; + + default: + throw new IOException("Unexpected node: " + name); + } + } + IrishSentenceBankFlex[] flexa = new IrishSentenceBankFlex[flexes]; + for (Integer flexidx : toks.keySet()) { + String left = toks.get(flexidx); + int rsize = flx.get(flexidx).size(); + String[] right = new String[rsize]; + right = flx.get(flexidx).toArray(right); + flexa[flexidx - 1] = new IrishSentenceBankFlex(left, right); + } + + Span[] spanout = new Span[spans.size()]; + spanout = spans.toArray(spanout); + document.add(new IrishSentenceBankSentence(src, trans, orig.toString(), spanout, flexa)); + } else if (!sentnode.getNodeName().equals("#text") && !sentnode.getNodeName().equals("#comment")) { + throw new IOException("Unexpected node: " + sentnode.getNodeName()); + } + } + return document; + } catch (ParserConfigurationException e) { + throw new IllegalStateException(e); + } catch (SAXException e) { + throw new IOException("Failed to parse IrishSentenceBank document", e); + } + } + + static IrishSentenceBankDocument parse(File file) throws IOException { + try (InputStream in = new FileInputStream(file)) { + return parse(in); + } + } +} diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankSentenceStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankSentenceStream.java new file mode 100644 index 000000000..e7c06d147 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankSentenceStream.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.formats.irishsentencebank; + +import java.io.IOException; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; + +import opennlp.tools.sentdetect.SentenceSample; +import opennlp.tools.util.ObjectStream; +import opennlp.tools.util.Span; + +class IrishSentenceBankSentenceStream implements ObjectStream { + + private final IrishSentenceBankDocument source; + + private Iterator sentenceIt; + + IrishSentenceBankSentenceStream(IrishSentenceBankDocument source) { + this.source = source; + reset(); + } + + @Override + public SentenceSample read() throws IOException { + + StringBuilder sentencesString = new StringBuilder(); + List sentenceSpans = new LinkedList<>(); + + while (sentenceIt.hasNext()) { + IrishSentenceBankDocument.IrishSentenceBankSentence sentence = sentenceIt.next(); + + int begin = sentencesString.length(); + + if (sentence.getOriginal() != null) { + sentencesString.append(sentence.getOriginal()); + } + + sentenceSpans.add(new Span(begin, sentencesString.length())); + sentencesString.append(' '); + } + + // end of stream is reached, indicate that with null return value + if (sentenceSpans.size() == 0) { + return null; + } + + return new SentenceSample(sentencesString.toString(), + sentenceSpans.toArray(new Span[sentenceSpans.size()])); + } + + @Override + public void reset() { + sentenceIt = source.getSentences().iterator(); + } +} diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankSentenceStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankSentenceStreamFactory.java new file mode 100644 index 000000000..e26dc5611 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankSentenceStreamFactory.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.formats.irishsentencebank; + +import java.io.IOException; + +import opennlp.tools.cmdline.ArgumentParser; +import opennlp.tools.cmdline.CmdLineUtil; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.params.BasicFormatParams; +import opennlp.tools.formats.AbstractSampleStreamFactory; +import opennlp.tools.sentdetect.SentenceSample; +import opennlp.tools.util.ObjectStream; + +public class IrishSentenceBankSentenceStreamFactory extends AbstractSampleStreamFactory { + + interface Parameters extends BasicFormatParams { + } + + public static void registerFactory() { + StreamFactoryRegistry.registerFactory(SentenceSample.class, + "irishsentencebank", new IrishSentenceBankSentenceStreamFactory( + IrishSentenceBankSentenceStreamFactory.Parameters.class)); + } + + protected

IrishSentenceBankSentenceStreamFactory(Class

params) { + super(params); + } + + @Override + public ObjectStream create(String[] args) { + + Parameters params = ArgumentParser.parse(args, Parameters.class); + + CmdLineUtil.checkInputFile("Data", params.getData()); + + IrishSentenceBankDocument isbDoc = null; + try { + isbDoc = IrishSentenceBankDocument.parse(params.getData()); + } catch (IOException ex) { + CmdLineUtil.handleCreateObjectStreamError(ex); + } + + return new IrishSentenceBankSentenceStream(isbDoc); + } +} diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankTokenSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankTokenSampleStream.java new file mode 100644 index 000000000..8cbfac24f --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankTokenSampleStream.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.formats.irishsentencebank; + +import java.io.IOException; +import java.util.Iterator; + +import opennlp.tools.tokenize.TokenSample; +import opennlp.tools.util.ObjectStream; + +class IrishSentenceBankTokenSampleStream implements ObjectStream { + + private final IrishSentenceBankDocument source; + + private Iterator sentenceIt; + + IrishSentenceBankTokenSampleStream(IrishSentenceBankDocument source) { + this.source = source; + reset(); + } + + @Override + public TokenSample read() throws IOException { + + if (sentenceIt.hasNext()) { + IrishSentenceBankDocument.IrishSentenceBankSentence sentence = sentenceIt.next(); + return sentence.getTokenSample(); + } else { + return null; + } + } + + @Override + public void reset() { + sentenceIt = source.getSentences().iterator(); + } +} diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankTokenSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankTokenSampleStreamFactory.java new file mode 100644 index 000000000..86d12256f --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankTokenSampleStreamFactory.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.formats.irishsentencebank; + +import java.io.IOException; + +import opennlp.tools.cmdline.ArgumentParser; +import opennlp.tools.cmdline.CmdLineUtil; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.params.BasicFormatParams; +import opennlp.tools.formats.DetokenizerSampleStreamFactory; +import opennlp.tools.tokenize.TokenSample; +import opennlp.tools.util.ObjectStream; + +public class IrishSentenceBankTokenSampleStreamFactory extends DetokenizerSampleStreamFactory { + + interface Parameters extends BasicFormatParams { + } + + public static void registerFactory() { + StreamFactoryRegistry.registerFactory(TokenSample.class, + "irishsentencebank", new IrishSentenceBankTokenSampleStreamFactory( + IrishSentenceBankTokenSampleStreamFactory.Parameters.class)); + } + + protected

IrishSentenceBankTokenSampleStreamFactory(Class

params) { + super(params); + } + + public ObjectStream create(String[] args) { + + Parameters params = ArgumentParser.parse(args, Parameters.class); + + CmdLineUtil.checkInputFile("Data", params.getData()); + + IrishSentenceBankDocument isbDoc = null; + try { + isbDoc = IrishSentenceBankDocument.parse(params.getData()); + } catch (IOException ex) { + CmdLineUtil.handleCreateObjectStreamError(ex); + } + + return new IrishSentenceBankTokenSampleStream(isbDoc); + } +} diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocumentTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocumentTest.java new file mode 100644 index 000000000..671fea09d --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocumentTest.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.formats.irishsentencebank; + +import java.io.IOException; +import java.io.InputStream; +import java.util.List; + +import org.junit.Assert; +import org.junit.Test; + +import opennlp.tools.tokenize.TokenSample; +import opennlp.tools.util.Span; + +public class IrishSentenceBankDocumentTest { + + @Test + public void testParsingSimpleDoc() throws IOException { + try (InputStream irishSBXmlIn = + IrishSentenceBankDocumentTest.class.getResourceAsStream("irishsentencebank-sample.xml")) { + + IrishSentenceBankDocument doc = IrishSentenceBankDocument.parse(irishSBXmlIn); + + List sents = doc.getSentences(); + + Assert.assertEquals(2, sents.size()); + + IrishSentenceBankDocument.IrishSentenceBankSentence sent1 = sents.get(0); + IrishSentenceBankDocument.IrishSentenceBankSentence sent2 = sents.get(1); + + Assert.assertEquals("A Dhia, tá mé ag iompar clainne!", sent1.getOriginal()); + + IrishSentenceBankDocument.IrishSentenceBankFlex[] flex = sent1.getFlex(); + Assert.assertEquals(7, flex.length); + Assert.assertEquals("A", flex[0].getSurface()); + Assert.assertArrayEquals(new String[]{"a"}, flex[0].getFlex()); + + IrishSentenceBankDocument.IrishSentenceBankFlex[] flex2 = sent2.getFlex(); + Assert.assertEquals("ón", flex2[4].getSurface()); + Assert.assertArrayEquals(new String[]{"ó", "an"}, flex2[4].getFlex()); + + Assert.assertEquals("Excuse me, are you from the stone age?", sent2.getTranslation()); + + TokenSample ts = sent1.getTokenSample(); + Span[] spans = ts.getTokenSpans(); + Assert.assertEquals(9, spans.length); + Assert.assertEquals(24, spans[7].getStart()); + Assert.assertEquals(31, spans[7].getEnd()); + Assert.assertEquals("clainne", ts.getText().substring(spans[7].getStart(), spans[7].getEnd())); + } + } +} diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/irishsentencebank/irishsentencebank-sample.xml b/opennlp-tools/src/test/resources/opennlp/tools/formats/irishsentencebank/irishsentencebank-sample.xml new file mode 100644 index 000000000..91e84c1eb --- /dev/null +++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/irishsentencebank/irishsentencebank-sample.xml @@ -0,0 +1,25 @@ + + + A Dhia, ag iompar clainne! + Oh my God, I'm pregnant! + + + + + + + + + + Gabh mo leithscéal, an ón chlochaois thú? + Excuse me, are you from the stone age? + + + + + + + + + +