From c4aee4e2b2910fc9f6040f9c59039f8e3706fcb8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn=20Kottmann?= Date: Sun, 29 Jan 2017 11:06:08 +0100 Subject: [PATCH] OPENNLP-176: Switch language codes to ISO-639-3 --- .../cmdline/namefind/CensusDictionaryCreatorTool.java | 2 +- .../tools/cmdline/parser/ParserTrainerTool.java | 4 ++-- .../tools/formats/AbstractSampleStreamFactory.java | 2 +- .../tools/formats/Conll03NameSampleStreamFactory.java | 6 +++--- .../tools/namefind/TokenNameFinderCrossValidator.java | 2 +- .../java/opennlp/tools/sentdetect/lang/Factory.java | 10 +++++----- .../java/opennlp/tools/tokenize/TokenizerFactory.java | 4 ++-- .../main/java/opennlp/tools/tokenize/lang/Factory.java | 2 +- .../main/java/opennlp/tools/util/model/BaseModel.java | 7 ++++--- 9 files changed, 20 insertions(+), 19 deletions(-) diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/CensusDictionaryCreatorTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/CensusDictionaryCreatorTool.java index 604251045..f9bf5e091 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/CensusDictionaryCreatorTool.java +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/CensusDictionaryCreatorTool.java @@ -50,7 +50,7 @@ public class CensusDictionaryCreatorTool extends BasicCmdLineTool { interface Parameters { @ParameterDescription(valueName = "code") - @OptionalParameter(defaultValue = "en") + @OptionalParameter(defaultValue = "eng") String getLang(); @ParameterDescription(valueName = "charsetName") diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/parser/ParserTrainerTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/parser/ParserTrainerTool.java index 60a4664a4..da5feecf8 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/parser/ParserTrainerTool.java +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/parser/ParserTrainerTool.java @@ -90,10 +90,10 @@ static HeadRules 
creaeHeadRules(TrainerToolParams params) throws IOException { params.getHeadRulesSerializerImpl()); } else { - if ("en".equals(params.getLang())) { + if ("en".equalsIgnoreCase(params.getLang()) || "eng".equalsIgnoreCase(params.getLang())) { headRulesSerializer = new opennlp.tools.parser.lang.en.HeadRules.HeadRulesSerializer(); } - else if ("es".equals(params.getLang())) { + else if ("es".equalsIgnoreCase(params.getLang()) || "spa".equalsIgnoreCase(params.getLang())) { headRulesSerializer = new opennlp.tools.parser.lang.es.AncoraSpanishHeadRules.HeadRulesSerializer(); } else { diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java index e229666a6..2592554fe 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java @@ -34,7 +34,7 @@ protected

AbstractSampleStreamFactory(Class

params) { } public String getLang() { - return "en"; + return "eng"; } @SuppressWarnings({"unchecked"}) diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/Conll03NameSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/Conll03NameSampleStreamFactory.java index 878565fa2..599d48a2a 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/Conll03NameSampleStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/Conll03NameSampleStreamFactory.java @@ -32,7 +32,7 @@ public class Conll03NameSampleStreamFactory extends LanguageSampleStreamFactory { interface Parameters extends BasicFormatParams { - @ParameterDescription(valueName = "en|de") + @ParameterDescription(valueName = "eng|deu") String getLang(); @ParameterDescription(valueName = "per,loc,org,misc") @@ -54,11 +54,11 @@ public ObjectStream create(String[] args) { // TODO: support the other languages with this CoNLL. LANGUAGE lang; - if ("en".equals(params.getLang())) { + if ("eng".equals(params.getLang())) { lang = LANGUAGE.EN; language = params.getLang(); } - else if ("de".equals(params.getLang())) { + else if ("deu".equals(params.getLang())) { lang = LANGUAGE.DE; language = params.getLang(); } diff --git a/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderCrossValidator.java b/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderCrossValidator.java index 6a68b86f0..db8029c1e 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderCrossValidator.java +++ b/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderCrossValidator.java @@ -149,7 +149,7 @@ public NameSample read() throws IOException { * Name finder cross validator * * @param languageCode - * the language of the training data + * the ISO-639-3 language of the training data * @param type * null or an override type for all types in the training data * @param trainParams diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/lang/Factory.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/lang/Factory.java index 28b515b24..a83e52ee8 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/lang/Factory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/lang/Factory.java @@ -49,9 +49,9 @@ public EndOfSentenceScanner createEndOfSentenceScanner( public SDContextGenerator createSentenceContextGenerator(String languageCode, Set abbreviations) { - if ("th".equals(languageCode)) { + if ("th".equalsIgnoreCase(languageCode) || "tha".equalsIgnoreCase(languageCode)) { return new SentenceContextGenerator(); - } else if ("pt".equals(languageCode)) { + } else if ("pt".equalsIgnoreCase(languageCode) || "por".equalsIgnoreCase(languageCode)) { return new DefaultSDContextGenerator(abbreviations, ptEosCharacters); } @@ -68,11 +68,11 @@ public SDContextGenerator createSentenceContextGenerator(String languageCode) { } public char[] getEOSCharacters(String languageCode) { - if ("th".equals(languageCode)) { + if ("th".equalsIgnoreCase(languageCode) || "tha".equalsIgnoreCase(languageCode)) { return thEosCharacters; - } else if ("pt".equals(languageCode)) { + } else if ("pt".equalsIgnoreCase(languageCode) || "por".equalsIgnoreCase(languageCode)) { return ptEosCharacters; - } else if ("jp".equals(languageCode)) { + } else if ("jp".equalsIgnoreCase(languageCode) || "jpn".equalsIgnoreCase(languageCode)) { return jpEosCharacters; } diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerFactory.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerFactory.java index ba3d285f2..e618eef50 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerFactory.java @@ -56,7 +56,7 @@ public TokenizerFactory() { * programmatically create a factory. 
* * @param languageCode - * the language of the natural text + * the ISO-639-3 language of the natural text * @param abbreviationDictionary * an abbreviations dictionary * @param useAlphaNumericOptimization @@ -125,7 +125,7 @@ public Map createManifestEntries() { * Factory method the framework uses create a new {@link TokenizerFactory}. * * @param subclassName the name of the class implementing the {@link TokenizerFactory} - * @param languageCode the language code the tokenizer should use + * @param languageCode the ISO-639-3 language code the tokenizer should use * @param abbreviationDictionary an optional dictionary containing abbreviations, or null if not present * @param useAlphaNumericOptimization indicate if the alpha numeric optimization * should be enabled or disabled diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/lang/Factory.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/lang/Factory.java index ef2a9f80e..26be7e25a 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/lang/Factory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/lang/Factory.java @@ -32,7 +32,7 @@ public class Factory { * locally because this call is expensive. * * @param languageCode - * the language code. If null or unknow the default pattern will be + * the ISO-639-3 language code. If null or unknown the default pattern will be * returned. * @return the alpha numeric pattern for the language or the default pattern.
*/ diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/model/BaseModel.java b/opennlp-tools/src/main/java/opennlp/tools/util/model/BaseModel.java index 062c7879a..e7c1aea7c 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/util/model/BaseModel.java +++ b/opennlp-tools/src/main/java/opennlp/tools/util/model/BaseModel.java @@ -94,7 +94,7 @@ private BaseModel(String componentName, boolean isLoadedFromSerialized) { * @param componentName * the component name * @param languageCode - * the language code + * the ISO-639-3 language code * @param manifestInfoEntries * additional information in the manifest * @param factory @@ -151,7 +151,7 @@ protected BaseModel(String componentName, String languageCode, * @param componentName * the component name * @param languageCode - * the language code + * the ISO-639-3 language code * @param manifestInfoEntries * additional information in the manifest */ @@ -518,7 +518,8 @@ protected final void setManifestProperty(String key, String value) { * was used to train the model or x-unspecified if * non was set. * - * @return the language code of this model + * @return the language code of this model; from 1.8.0 on this is an ISO-639-3 code, and for older models + * it depends on what was written into it, usually an ISO-639-1 code */ public final String getLanguage() { return getManifestProperty(LANGUAGE_PROPERTY);