Skip to content

Commit

Permalink
OPENNLP-1065: Use ISO-639-3 in test code
Browse files Browse the repository at this point in the history
  • Loading branch information
kottmann committed Jun 8, 2017
1 parent b2a2d2d commit aae0f29
Show file tree
Hide file tree
Showing 29 changed files with 166 additions and 170 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,8 @@
public class Conll02NameSampleStream implements ObjectStream<NameSample> {

public enum LANGUAGE {
NL,
ES
NLD,
SPA
}

public static final int GENERATE_PERSON_ENTITIES = 0x01;
Expand Down Expand Up @@ -120,7 +120,7 @@ public NameSample read() throws IOException {
String line;
while ((line = lineStream.read()) != null && !StringUtil.isEmpty(line)) {

if (LANGUAGE.NL.equals(lang) && line.startsWith(DOCSTART)) {
if (LANGUAGE.NLD.equals(lang) && line.startsWith(DOCSTART)) {
isClearAdaptiveData = true;
continue;
}
Expand All @@ -138,7 +138,7 @@ public NameSample read() throws IOException {
}

// Always clear adaptive data for Spanish
if (LANGUAGE.ES.equals(lang))
if (LANGUAGE.SPA.equals(lang))
isClearAdaptiveData = true;

if (sentence.size() > 0) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
public class Conll02NameSampleStreamFactory extends LanguageSampleStreamFactory<NameSample> {

interface Parameters extends BasicFormatParams {
@ParameterDescription(valueName = "es|nl")
@ParameterDescription(valueName = "spa|nld")
String getLang();

@ParameterDescription(valueName = "per,loc,org,misc")
Expand All @@ -56,12 +56,12 @@ public ObjectStream<NameSample> create(String[] args) {
Parameters params = ArgumentParser.parse(args, Parameters.class);

LANGUAGE lang;
if ("nl".equals(params.getLang())) {
lang = LANGUAGE.NL;
if ("nl".equals(params.getLang()) || "nld".equals(params.getLang())) {
lang = LANGUAGE.NLD;
language = params.getLang();
}
else if ("es".equals(params.getLang())) {
lang = LANGUAGE.ES;
else if ("es".equals(params.getLang()) || "spa".equals(params.getLang())) {
lang = LANGUAGE.SPA;
language = params.getLang();
}
else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ public class Factory {
* @return the alpha numeric pattern for the language or the default pattern.
*/
public Pattern getAlphanumeric(String languageCode) {
if ("pt".equals(languageCode)) {
if ("pt".equals(languageCode) || "por".equals(languageCode)) {
return Pattern.compile("^[0-9a-záãâàéêíóõôúüçA-ZÁÃÂÀÉÊÍÓÕÔÚÜÇ]+$");
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ private static ObjectStream<ChunkSample> createSampleStream()

private static ChunkerModel trainModel(ModelType type, ChunkerFactory factory)
throws IOException {
return ChunkerME.train("en", createSampleStream(),
return ChunkerME.train("eng", createSampleStream(),
TrainingParameters.defaultParams(), factory);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ public void startup() throws IOException {
params.put(TrainingParameters.ITERATIONS_PARAM, 70);
params.put(TrainingParameters.CUTOFF_PARAM, 1);

ChunkerModel chunkerModel = ChunkerME.train("en", sampleStream, params, new ChunkerFactory());
ChunkerModel chunkerModel = ChunkerME.train("eng", sampleStream, params, new ChunkerFactory());

this.chunker = new ChunkerME(chunkerModel);
}
Expand Down Expand Up @@ -143,7 +143,7 @@ public void testInsufficientData() throws IOException {
params.put(TrainingParameters.ITERATIONS_PARAM, 70);
params.put(TrainingParameters.CUTOFF_PARAM, 1);

ChunkerME.train("en", sampleStream, params, new ChunkerFactory());
ChunkerME.train("eng", sampleStream, params, new ChunkerFactory());

}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ private File trainModel() throws IOException {
TokenNameFinderFactory nameFinderFactory = new TokenNameFinderFactory();

try (ObjectStream<NameSample> sampleStream = new NameSampleDataStream(lineStream)) {
model = NameFinderME.train("en", null, sampleStream, params,
model = NameFinderME.train("eng", null, sampleStream, params,
nameFinderFactory);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@
package opennlp.tools.eval;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;

import org.junit.Assert;
import org.junit.Test;
Expand Down Expand Up @@ -66,14 +66,12 @@ public class ArvoresDeitadasEval {
private static final String BOSQUE = "ad/Bosque_CF_8.0.ad.txt";
private static final String FLORESTA_VIRGEM = "ad/FlorestaVirgem_CF_3.0_ad.txt";

private static final String ENCODING = "ISO-8859-1";

private static final String LANG = "pt";
private static final String LANG = "por";

private static ObjectStream<String> getLineSample(String corpus)
throws IOException {
return new PlainTextByLineStream(new MarkableFileInputStreamFactory(
new File(EvalUtil.getOpennlpDataDir(), corpus)), ENCODING);
new File(EvalUtil.getOpennlpDataDir(), corpus)), StandardCharsets.ISO_8859_1);
}

private static void sentenceCrossEval(TrainingParameters params,
Expand All @@ -99,8 +97,7 @@ private static void tokenizerCrossEval(TrainingParameters params,
getLineSample(FLORESTA_VIRGEM), true);

DictionaryDetokenizer detokenizer = new DictionaryDetokenizer(
new DetokenizationDictionary(new FileInputStream(new File(
"lang/pt/tokenizer/pt-detokenizer.xml"))));
new DetokenizationDictionary(new File("lang/pt/tokenizer/pt-detokenizer.xml")));

ObjectStream<TokenSample> samples = new NameToTokenSampleStream(
detokenizer, nameSamples);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ private static ChunkerModel train(File trainFile, TrainingParameters params)
new PlainTextByLineStream(
new MarkableFileInputStreamFactory(trainFile), StandardCharsets.UTF_8));

return ChunkerME.train("en", samples, params, new ChunkerFactory());
return ChunkerME.train("eng", samples, params, new ChunkerFactory());
}

private static void eval(ChunkerModel model, File testData,
Expand Down

0 comments on commit aae0f29

Please sign in to comment.